diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24591 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3507, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00028514399771884804, + "grad_norm": 73.35986033232847, + "learning_rate": 9.433962264150944e-08, + "loss": 0.9276, + "step": 1 + }, + { + "epoch": 0.0005702879954376961, + "grad_norm": 258.36563461213944, + "learning_rate": 1.886792452830189e-07, + "loss": 0.9933, + "step": 2 + }, + { + "epoch": 0.000855431993156544, + "grad_norm": 166.05860991922756, + "learning_rate": 2.8301886792452833e-07, + "loss": 0.8841, + "step": 3 + }, + { + "epoch": 0.0011405759908753922, + "grad_norm": 80.04075689751123, + "learning_rate": 3.773584905660378e-07, + "loss": 0.8703, + "step": 4 + }, + { + "epoch": 0.00142571998859424, + "grad_norm": 94.13324251180957, + "learning_rate": 4.716981132075472e-07, + "loss": 0.7985, + "step": 5 + }, + { + "epoch": 0.001710863986313088, + "grad_norm": 95.32563753359732, + "learning_rate": 5.660377358490567e-07, + "loss": 0.8165, + "step": 6 + }, + { + "epoch": 0.001996007984031936, + "grad_norm": 107.85429854364156, + "learning_rate": 6.603773584905661e-07, + "loss": 0.7712, + "step": 7 + }, + { + "epoch": 0.0022811519817507843, + "grad_norm": 81.4209081472541, + "learning_rate": 7.547169811320755e-07, + "loss": 0.8074, + "step": 8 + }, + { + "epoch": 0.0025662959794696323, + "grad_norm": 102.25957565902327, + "learning_rate": 8.490566037735849e-07, + "loss": 0.71, + "step": 9 + }, + { + "epoch": 0.00285143997718848, + "grad_norm": 93.50952676307716, + "learning_rate": 9.433962264150944e-07, + "loss": 0.5691, + "step": 10 + }, + { + "epoch": 0.003136583974907328, + "grad_norm": 28.59917240226574, + "learning_rate": 1.037735849056604e-06, + "loss": 0.2667, + "step": 11 + }, + { + "epoch": 0.003421727972626176, + "grad_norm": 31.65592294166331, + "learning_rate": 1.1320754716981133e-06, + "loss": 0.1884, + "step": 12 + }, + { + "epoch": 0.0037068719703450244, + "grad_norm": 19.767941232025557, + "learning_rate": 1.2264150943396227e-06, + "loss": 0.1949, + "step": 13 + }, + { + "epoch": 0.003992015968063872, + "grad_norm": 18.679288467520557, + "learning_rate": 1.3207547169811322e-06, + "loss": 0.1665, + "step": 14 + }, + { + "epoch": 0.00427715996578272, + "grad_norm": 11.019999790672726, + "learning_rate": 1.4150943396226415e-06, + "loss": 0.083, + "step": 15 + }, + { + "epoch": 0.004562303963501569, + "grad_norm": 10.59204980956432, + "learning_rate": 1.509433962264151e-06, + "loss": 0.1157, + "step": 16 + }, + { + "epoch": 0.004847447961220416, + "grad_norm": 3.1617167080707995, + "learning_rate": 1.6037735849056604e-06, + "loss": 0.0322, + "step": 17 + }, + { + "epoch": 0.0051325919589392645, + "grad_norm": 4.146835741088157, + "learning_rate": 1.6981132075471698e-06, + "loss": 0.0497, + "step": 18 + }, + { + "epoch": 0.005417735956658112, + "grad_norm": 7.254151558151055, + "learning_rate": 1.7924528301886793e-06, + "loss": 0.0879, + "step": 19 + }, + { + "epoch": 0.00570287995437696, + "grad_norm": 42.70760196636873, + "learning_rate": 1.8867924528301889e-06, + "loss": 0.1596, + "step": 20 + }, + { + "epoch": 0.005988023952095809, + "grad_norm": 9.554484439254637, + "learning_rate": 1.981132075471698e-06, + "loss": 0.1138, + "step": 21 + }, + { + "epoch": 0.006273167949814656, + "grad_norm": 7.238913126784841, + "learning_rate": 2.075471698113208e-06, + "loss": 0.0812, + "step": 22 + }, + { + "epoch": 0.006558311947533505, + "grad_norm": 4.259308808989036, + "learning_rate": 2.1698113207547173e-06, + "loss": 0.0413, + "step": 23 + }, + { + "epoch": 0.006843455945252352, + "grad_norm": 6.566396501150489, + "learning_rate": 2.2641509433962266e-06, + "loss": 0.0549, + "step": 24 + }, + { + "epoch": 0.0071285999429712005, + "grad_norm": 5.066416110741747, + "learning_rate": 2.358490566037736e-06, + "loss": 0.037, + "step": 25 + }, + { + "epoch": 0.007413743940690049, + "grad_norm": 4.723721752248351, + "learning_rate": 2.4528301886792453e-06, + "loss": 0.0585, + "step": 26 + }, + { + "epoch": 0.007698887938408896, + "grad_norm": 2.560432972171629, + "learning_rate": 2.547169811320755e-06, + "loss": 0.0371, + "step": 27 + }, + { + "epoch": 0.007984031936127744, + "grad_norm": 4.2665726201781276, + "learning_rate": 2.6415094339622644e-06, + "loss": 0.0507, + "step": 28 + }, + { + "epoch": 0.008269175933846592, + "grad_norm": 4.63734681578259, + "learning_rate": 2.7358490566037738e-06, + "loss": 0.0511, + "step": 29 + }, + { + "epoch": 0.00855431993156544, + "grad_norm": 2.5768861044123095, + "learning_rate": 2.830188679245283e-06, + "loss": 0.0357, + "step": 30 + }, + { + "epoch": 0.008839463929284289, + "grad_norm": 3.634984231769664, + "learning_rate": 2.9245283018867924e-06, + "loss": 0.0625, + "step": 31 + }, + { + "epoch": 0.009124607927003137, + "grad_norm": 4.840413439326456, + "learning_rate": 3.018867924528302e-06, + "loss": 0.0414, + "step": 32 + }, + { + "epoch": 0.009409751924721984, + "grad_norm": 9.287836956870658, + "learning_rate": 3.1132075471698115e-06, + "loss": 0.0882, + "step": 33 + }, + { + "epoch": 0.009694895922440832, + "grad_norm": 6.973138828619164, + "learning_rate": 3.207547169811321e-06, + "loss": 0.0593, + "step": 34 + }, + { + "epoch": 0.00998003992015968, + "grad_norm": 3.6359531137785113, + "learning_rate": 3.30188679245283e-06, + "loss": 0.0335, + "step": 35 + }, + { + "epoch": 0.010265183917878529, + "grad_norm": 3.4832523392273527, + "learning_rate": 3.3962264150943395e-06, + "loss": 0.0621, + "step": 36 + }, + { + "epoch": 0.010550327915597377, + "grad_norm": 6.152559798979658, + "learning_rate": 3.4905660377358493e-06, + "loss": 0.0768, + "step": 37 + }, + { + "epoch": 0.010835471913316224, + "grad_norm": 3.142333726843115, + "learning_rate": 3.5849056603773586e-06, + "loss": 0.0421, + "step": 38 + }, + { + "epoch": 0.011120615911035072, + "grad_norm": 4.4376556128819615, + "learning_rate": 3.679245283018868e-06, + "loss": 0.0618, + "step": 39 + }, + { + "epoch": 0.01140575990875392, + "grad_norm": 3.119036132535718, + "learning_rate": 3.7735849056603777e-06, + "loss": 0.0395, + "step": 40 + }, + { + "epoch": 0.011690903906472769, + "grad_norm": 2.8819748677454564, + "learning_rate": 3.8679245283018875e-06, + "loss": 0.036, + "step": 41 + }, + { + "epoch": 0.011976047904191617, + "grad_norm": 3.0285102030013147, + "learning_rate": 3.962264150943396e-06, + "loss": 0.0451, + "step": 42 + }, + { + "epoch": 0.012261191901910464, + "grad_norm": 2.4198586652875713, + "learning_rate": 4.056603773584906e-06, + "loss": 0.0359, + "step": 43 + }, + { + "epoch": 0.012546335899629312, + "grad_norm": 4.01556344965596, + "learning_rate": 4.150943396226416e-06, + "loss": 0.0654, + "step": 44 + }, + { + "epoch": 0.01283147989734816, + "grad_norm": 4.152755564375582, + "learning_rate": 4.245283018867925e-06, + "loss": 0.0688, + "step": 45 + }, + { + "epoch": 0.01311662389506701, + "grad_norm": 5.650506818749513, + "learning_rate": 4.339622641509435e-06, + "loss": 0.0621, + "step": 46 + }, + { + "epoch": 0.013401767892785858, + "grad_norm": 4.772858263739585, + "learning_rate": 4.4339622641509435e-06, + "loss": 0.043, + "step": 47 + }, + { + "epoch": 0.013686911890504704, + "grad_norm": 3.281056276083866, + "learning_rate": 4.528301886792453e-06, + "loss": 0.0409, + "step": 48 + }, + { + "epoch": 0.013972055888223553, + "grad_norm": 2.665340695119781, + "learning_rate": 4.622641509433963e-06, + "loss": 0.0261, + "step": 49 + }, + { + "epoch": 0.014257199885942401, + "grad_norm": 2.1051762190060757, + "learning_rate": 4.716981132075472e-06, + "loss": 0.046, + "step": 50 + }, + { + "epoch": 0.01454234388366125, + "grad_norm": 4.140757787048907, + "learning_rate": 4.811320754716982e-06, + "loss": 0.0399, + "step": 51 + }, + { + "epoch": 0.014827487881380098, + "grad_norm": 4.806224421900785, + "learning_rate": 4.905660377358491e-06, + "loss": 0.0733, + "step": 52 + }, + { + "epoch": 0.015112631879098944, + "grad_norm": 5.244037355061446, + "learning_rate": 5e-06, + "loss": 0.0829, + "step": 53 + }, + { + "epoch": 0.015397775876817793, + "grad_norm": 2.529956827555466, + "learning_rate": 5.09433962264151e-06, + "loss": 0.0414, + "step": 54 + }, + { + "epoch": 0.01568291987453664, + "grad_norm": 3.1547732507353037, + "learning_rate": 5.188679245283019e-06, + "loss": 0.0492, + "step": 55 + }, + { + "epoch": 0.015968063872255488, + "grad_norm": 3.6253749901832553, + "learning_rate": 5.283018867924529e-06, + "loss": 0.0409, + "step": 56 + }, + { + "epoch": 0.016253207869974338, + "grad_norm": 8.40170080651301, + "learning_rate": 5.377358490566038e-06, + "loss": 0.0788, + "step": 57 + }, + { + "epoch": 0.016538351867693184, + "grad_norm": 2.2891004276086444, + "learning_rate": 5.4716981132075475e-06, + "loss": 0.0452, + "step": 58 + }, + { + "epoch": 0.016823495865412035, + "grad_norm": 1.968563607614644, + "learning_rate": 5.566037735849057e-06, + "loss": 0.0374, + "step": 59 + }, + { + "epoch": 0.01710863986313088, + "grad_norm": 2.2826667099920424, + "learning_rate": 5.660377358490566e-06, + "loss": 0.0318, + "step": 60 + }, + { + "epoch": 0.017393783860849728, + "grad_norm": 3.075829515844115, + "learning_rate": 5.754716981132076e-06, + "loss": 0.0456, + "step": 61 + }, + { + "epoch": 0.017678927858568578, + "grad_norm": 2.641793301245682, + "learning_rate": 5.849056603773585e-06, + "loss": 0.0382, + "step": 62 + }, + { + "epoch": 0.017964071856287425, + "grad_norm": 3.0357867774553275, + "learning_rate": 5.943396226415095e-06, + "loss": 0.0391, + "step": 63 + }, + { + "epoch": 0.018249215854006275, + "grad_norm": 1.3459497638743567, + "learning_rate": 6.037735849056604e-06, + "loss": 0.0192, + "step": 64 + }, + { + "epoch": 0.01853435985172512, + "grad_norm": 0.8896089027285865, + "learning_rate": 6.132075471698113e-06, + "loss": 0.026, + "step": 65 + }, + { + "epoch": 0.018819503849443968, + "grad_norm": 17.332060998609155, + "learning_rate": 6.226415094339623e-06, + "loss": 0.0781, + "step": 66 + }, + { + "epoch": 0.019104647847162818, + "grad_norm": 2.071064848030776, + "learning_rate": 6.320754716981132e-06, + "loss": 0.0431, + "step": 67 + }, + { + "epoch": 0.019389791844881665, + "grad_norm": 4.062997719292194, + "learning_rate": 6.415094339622642e-06, + "loss": 0.0354, + "step": 68 + }, + { + "epoch": 0.019674935842600515, + "grad_norm": 2.350390848534792, + "learning_rate": 6.5094339622641515e-06, + "loss": 0.0274, + "step": 69 + }, + { + "epoch": 0.01996007984031936, + "grad_norm": 2.566245599377888, + "learning_rate": 6.60377358490566e-06, + "loss": 0.0699, + "step": 70 + }, + { + "epoch": 0.020245223838038208, + "grad_norm": 3.5006687012241122, + "learning_rate": 6.69811320754717e-06, + "loss": 0.0684, + "step": 71 + }, + { + "epoch": 0.020530367835757058, + "grad_norm": 4.243644946589001, + "learning_rate": 6.792452830188679e-06, + "loss": 0.0957, + "step": 72 + }, + { + "epoch": 0.020815511833475905, + "grad_norm": 1.5269476114061467, + "learning_rate": 6.886792452830189e-06, + "loss": 0.0372, + "step": 73 + }, + { + "epoch": 0.021100655831194755, + "grad_norm": 41.74516371814436, + "learning_rate": 6.981132075471699e-06, + "loss": 0.2785, + "step": 74 + }, + { + "epoch": 0.0213857998289136, + "grad_norm": 3.8783540833989076, + "learning_rate": 7.0754716981132075e-06, + "loss": 0.0375, + "step": 75 + }, + { + "epoch": 0.021670943826632448, + "grad_norm": 4.643543581746758, + "learning_rate": 7.169811320754717e-06, + "loss": 0.036, + "step": 76 + }, + { + "epoch": 0.021956087824351298, + "grad_norm": 4.600001137811726, + "learning_rate": 7.264150943396226e-06, + "loss": 0.0636, + "step": 77 + }, + { + "epoch": 0.022241231822070145, + "grad_norm": 3.128559332322622, + "learning_rate": 7.358490566037736e-06, + "loss": 0.0429, + "step": 78 + }, + { + "epoch": 0.022526375819788995, + "grad_norm": 2.8641659687703678, + "learning_rate": 7.452830188679246e-06, + "loss": 0.042, + "step": 79 + }, + { + "epoch": 0.02281151981750784, + "grad_norm": 12.271691655074866, + "learning_rate": 7.5471698113207555e-06, + "loss": 0.0862, + "step": 80 + }, + { + "epoch": 0.023096663815226688, + "grad_norm": 5.5069227290477665, + "learning_rate": 7.641509433962266e-06, + "loss": 0.0403, + "step": 81 + }, + { + "epoch": 0.023381807812945538, + "grad_norm": 1.4595795029247436, + "learning_rate": 7.735849056603775e-06, + "loss": 0.0266, + "step": 82 + }, + { + "epoch": 0.023666951810664385, + "grad_norm": 8.876102538357202, + "learning_rate": 7.830188679245284e-06, + "loss": 0.0521, + "step": 83 + }, + { + "epoch": 0.023952095808383235, + "grad_norm": 1.6782963289907105, + "learning_rate": 7.924528301886793e-06, + "loss": 0.0326, + "step": 84 + }, + { + "epoch": 0.02423723980610208, + "grad_norm": 2.681755314762461, + "learning_rate": 8.018867924528303e-06, + "loss": 0.0839, + "step": 85 + }, + { + "epoch": 0.024522383803820928, + "grad_norm": 5.714032889612906, + "learning_rate": 8.113207547169812e-06, + "loss": 0.0316, + "step": 86 + }, + { + "epoch": 0.02480752780153978, + "grad_norm": 5.028130886749829, + "learning_rate": 8.207547169811321e-06, + "loss": 0.0731, + "step": 87 + }, + { + "epoch": 0.025092671799258625, + "grad_norm": 3.0570511261289917, + "learning_rate": 8.301886792452832e-06, + "loss": 0.0333, + "step": 88 + }, + { + "epoch": 0.025377815796977475, + "grad_norm": 5.146547952949438, + "learning_rate": 8.39622641509434e-06, + "loss": 0.0514, + "step": 89 + }, + { + "epoch": 0.02566295979469632, + "grad_norm": 4.7531766250387255, + "learning_rate": 8.49056603773585e-06, + "loss": 0.0527, + "step": 90 + }, + { + "epoch": 0.02594810379241517, + "grad_norm": 4.335096216573719, + "learning_rate": 8.58490566037736e-06, + "loss": 0.0268, + "step": 91 + }, + { + "epoch": 0.02623324779013402, + "grad_norm": 2.132428089649692, + "learning_rate": 8.67924528301887e-06, + "loss": 0.0168, + "step": 92 + }, + { + "epoch": 0.026518391787852865, + "grad_norm": 6.833117952820334, + "learning_rate": 8.773584905660378e-06, + "loss": 0.0868, + "step": 93 + }, + { + "epoch": 0.026803535785571715, + "grad_norm": 11.160832029176879, + "learning_rate": 8.867924528301887e-06, + "loss": 0.0925, + "step": 94 + }, + { + "epoch": 0.027088679783290562, + "grad_norm": 1.8890461785442043, + "learning_rate": 8.962264150943398e-06, + "loss": 0.0376, + "step": 95 + }, + { + "epoch": 0.02737382378100941, + "grad_norm": 2.0039805311692573, + "learning_rate": 9.056603773584907e-06, + "loss": 0.0534, + "step": 96 + }, + { + "epoch": 0.02765896777872826, + "grad_norm": 3.256805619531125, + "learning_rate": 9.150943396226416e-06, + "loss": 0.0476, + "step": 97 + }, + { + "epoch": 0.027944111776447105, + "grad_norm": 2.3004970717202595, + "learning_rate": 9.245283018867926e-06, + "loss": 0.061, + "step": 98 + }, + { + "epoch": 0.028229255774165955, + "grad_norm": 1.8448990035600177, + "learning_rate": 9.339622641509435e-06, + "loss": 0.0422, + "step": 99 + }, + { + "epoch": 0.028514399771884802, + "grad_norm": 4.002283614106026, + "learning_rate": 9.433962264150944e-06, + "loss": 0.0603, + "step": 100 + }, + { + "epoch": 0.02879954376960365, + "grad_norm": 8.180190475095822, + "learning_rate": 9.528301886792455e-06, + "loss": 0.1227, + "step": 101 + }, + { + "epoch": 0.0290846877673225, + "grad_norm": 6.4834435384859015, + "learning_rate": 9.622641509433963e-06, + "loss": 0.0903, + "step": 102 + }, + { + "epoch": 0.029369831765041345, + "grad_norm": 6.419140173971885, + "learning_rate": 9.716981132075472e-06, + "loss": 0.0973, + "step": 103 + }, + { + "epoch": 0.029654975762760195, + "grad_norm": 5.167609682986552, + "learning_rate": 9.811320754716981e-06, + "loss": 0.0464, + "step": 104 + }, + { + "epoch": 0.029940119760479042, + "grad_norm": 2.6286695618463733, + "learning_rate": 9.905660377358492e-06, + "loss": 0.047, + "step": 105 + }, + { + "epoch": 0.03022526375819789, + "grad_norm": 2.385355633734913, + "learning_rate": 1e-05, + "loss": 0.0366, + "step": 106 + }, + { + "epoch": 0.03051040775591674, + "grad_norm": 5.045824475523923, + "learning_rate": 9.999997866825128e-06, + "loss": 0.0928, + "step": 107 + }, + { + "epoch": 0.030795551753635585, + "grad_norm": 5.882161471784182, + "learning_rate": 9.999991467302332e-06, + "loss": 0.1048, + "step": 108 + }, + { + "epoch": 0.031080695751354435, + "grad_norm": 8.71717621944331, + "learning_rate": 9.99998080143707e-06, + "loss": 0.0885, + "step": 109 + }, + { + "epoch": 0.03136583974907328, + "grad_norm": 2.090876100875096, + "learning_rate": 9.999965869238445e-06, + "loss": 0.0592, + "step": 110 + }, + { + "epoch": 0.03165098374679213, + "grad_norm": 15.890424714577145, + "learning_rate": 9.999946670719197e-06, + "loss": 0.0438, + "step": 111 + }, + { + "epoch": 0.031936127744510975, + "grad_norm": 1.4259811755691145, + "learning_rate": 9.99992320589571e-06, + "loss": 0.0395, + "step": 112 + }, + { + "epoch": 0.03222127174222983, + "grad_norm": 1.9327015405417511, + "learning_rate": 9.999895474788003e-06, + "loss": 0.0371, + "step": 113 + }, + { + "epoch": 0.032506415739948676, + "grad_norm": 1.8632732706869015, + "learning_rate": 9.999863477419739e-06, + "loss": 0.0245, + "step": 114 + }, + { + "epoch": 0.03279155973766752, + "grad_norm": 1.8917211984814235, + "learning_rate": 9.99982721381822e-06, + "loss": 0.0407, + "step": 115 + }, + { + "epoch": 0.03307670373538637, + "grad_norm": 2.2247432119892956, + "learning_rate": 9.999786684014393e-06, + "loss": 0.0334, + "step": 116 + }, + { + "epoch": 0.033361847733105215, + "grad_norm": 5.678799374308072, + "learning_rate": 9.999741888042832e-06, + "loss": 0.0646, + "step": 117 + }, + { + "epoch": 0.03364699173082407, + "grad_norm": 2.2979438290144656, + "learning_rate": 9.99969282594177e-06, + "loss": 0.0352, + "step": 118 + }, + { + "epoch": 0.033932135728542916, + "grad_norm": 3.401247202535555, + "learning_rate": 9.999639497753062e-06, + "loss": 0.0457, + "step": 119 + }, + { + "epoch": 0.03421727972626176, + "grad_norm": 1.3220531908758983, + "learning_rate": 9.999581903522214e-06, + "loss": 0.0148, + "step": 120 + }, + { + "epoch": 0.03450242372398061, + "grad_norm": 1.0582272674397575, + "learning_rate": 9.999520043298374e-06, + "loss": 0.0214, + "step": 121 + }, + { + "epoch": 0.034787567721699456, + "grad_norm": 1.5349811431314222, + "learning_rate": 9.99945391713432e-06, + "loss": 0.0237, + "step": 122 + }, + { + "epoch": 0.03507271171941831, + "grad_norm": 5.061670772936218, + "learning_rate": 9.999383525086478e-06, + "loss": 0.0393, + "step": 123 + }, + { + "epoch": 0.035357855717137156, + "grad_norm": 3.8169991926514313, + "learning_rate": 9.999308867214908e-06, + "loss": 0.0622, + "step": 124 + }, + { + "epoch": 0.035642999714856, + "grad_norm": 33.60130740332581, + "learning_rate": 9.999229943583318e-06, + "loss": 0.0982, + "step": 125 + }, + { + "epoch": 0.03592814371257485, + "grad_norm": 1.7047090809257088, + "learning_rate": 9.999146754259048e-06, + "loss": 0.0321, + "step": 126 + }, + { + "epoch": 0.036213287710293696, + "grad_norm": 2.9744010851446103, + "learning_rate": 9.999059299313082e-06, + "loss": 0.0207, + "step": 127 + }, + { + "epoch": 0.03649843170801255, + "grad_norm": 4.240730548981665, + "learning_rate": 9.998967578820042e-06, + "loss": 0.0289, + "step": 128 + }, + { + "epoch": 0.036783575705731396, + "grad_norm": 3.3812717287163463, + "learning_rate": 9.998871592858193e-06, + "loss": 0.0495, + "step": 129 + }, + { + "epoch": 0.03706871970345024, + "grad_norm": 1.8193189165651151, + "learning_rate": 9.998771341509434e-06, + "loss": 0.0667, + "step": 130 + }, + { + "epoch": 0.03735386370116909, + "grad_norm": 6.060011032912712, + "learning_rate": 9.998666824859307e-06, + "loss": 0.1307, + "step": 131 + }, + { + "epoch": 0.037639007698887936, + "grad_norm": 2.402353091095481, + "learning_rate": 9.998558042996993e-06, + "loss": 0.0736, + "step": 132 + }, + { + "epoch": 0.03792415169660679, + "grad_norm": 5.839543926949511, + "learning_rate": 9.998444996015314e-06, + "loss": 0.0945, + "step": 133 + }, + { + "epoch": 0.038209295694325636, + "grad_norm": 1.4473227758832712, + "learning_rate": 9.998327684010727e-06, + "loss": 0.0432, + "step": 134 + }, + { + "epoch": 0.03849443969204448, + "grad_norm": 2.396507073747093, + "learning_rate": 9.998206107083333e-06, + "loss": 0.0679, + "step": 135 + }, + { + "epoch": 0.03877958368976333, + "grad_norm": 2.6450829119213073, + "learning_rate": 9.998080265336867e-06, + "loss": 0.0206, + "step": 136 + }, + { + "epoch": 0.039064727687482176, + "grad_norm": 8.44539739688482, + "learning_rate": 9.997950158878712e-06, + "loss": 0.0362, + "step": 137 + }, + { + "epoch": 0.03934987168520103, + "grad_norm": 3.940799243397122, + "learning_rate": 9.997815787819876e-06, + "loss": 0.0452, + "step": 138 + }, + { + "epoch": 0.039635015682919876, + "grad_norm": 1.2577768949582078, + "learning_rate": 9.997677152275019e-06, + "loss": 0.029, + "step": 139 + }, + { + "epoch": 0.03992015968063872, + "grad_norm": 2.172531523049211, + "learning_rate": 9.997534252362432e-06, + "loss": 0.0289, + "step": 140 + }, + { + "epoch": 0.04020530367835757, + "grad_norm": 1.7163630652895192, + "learning_rate": 9.99738708820405e-06, + "loss": 0.0374, + "step": 141 + }, + { + "epoch": 0.040490447676076416, + "grad_norm": 1.9849481929945414, + "learning_rate": 9.99723565992544e-06, + "loss": 0.0385, + "step": 142 + }, + { + "epoch": 0.04077559167379527, + "grad_norm": 1.0957767923455073, + "learning_rate": 9.997079967655816e-06, + "loss": 0.0345, + "step": 143 + }, + { + "epoch": 0.041060735671514116, + "grad_norm": 1.845401941137247, + "learning_rate": 9.996920011528022e-06, + "loss": 0.0298, + "step": 144 + }, + { + "epoch": 0.04134587966923296, + "grad_norm": 1.709458777909131, + "learning_rate": 9.996755791678544e-06, + "loss": 0.023, + "step": 145 + }, + { + "epoch": 0.04163102366695181, + "grad_norm": 2.416877485324215, + "learning_rate": 9.996587308247507e-06, + "loss": 0.041, + "step": 146 + }, + { + "epoch": 0.041916167664670656, + "grad_norm": 1.8466041607808619, + "learning_rate": 9.996414561378671e-06, + "loss": 0.0308, + "step": 147 + }, + { + "epoch": 0.04220131166238951, + "grad_norm": 2.7453406578055746, + "learning_rate": 9.996237551219439e-06, + "loss": 0.0805, + "step": 148 + }, + { + "epoch": 0.042486455660108356, + "grad_norm": 1.3012783290944823, + "learning_rate": 9.996056277920845e-06, + "loss": 0.0459, + "step": 149 + }, + { + "epoch": 0.0427715996578272, + "grad_norm": 1.6845595049974698, + "learning_rate": 9.995870741637566e-06, + "loss": 0.0206, + "step": 150 + }, + { + "epoch": 0.04305674365554605, + "grad_norm": 1.9673407080566452, + "learning_rate": 9.995680942527915e-06, + "loss": 0.0277, + "step": 151 + }, + { + "epoch": 0.043341887653264896, + "grad_norm": 1.096649599808759, + "learning_rate": 9.99548688075384e-06, + "loss": 0.0104, + "step": 152 + }, + { + "epoch": 0.04362703165098375, + "grad_norm": 4.340215678141815, + "learning_rate": 9.99528855648093e-06, + "loss": 0.0782, + "step": 153 + }, + { + "epoch": 0.043912175648702596, + "grad_norm": 0.962700831894613, + "learning_rate": 9.995085969878408e-06, + "loss": 0.0084, + "step": 154 + }, + { + "epoch": 0.04419731964642144, + "grad_norm": 1.3431951362099261, + "learning_rate": 9.994879121119134e-06, + "loss": 0.0347, + "step": 155 + }, + { + "epoch": 0.04448246364414029, + "grad_norm": 2.396590839420942, + "learning_rate": 9.99466801037961e-06, + "loss": 0.0518, + "step": 156 + }, + { + "epoch": 0.044767607641859136, + "grad_norm": 1.87825274755581, + "learning_rate": 9.994452637839964e-06, + "loss": 0.0316, + "step": 157 + }, + { + "epoch": 0.04505275163957799, + "grad_norm": 2.6008704806074707, + "learning_rate": 9.994233003683972e-06, + "loss": 0.0338, + "step": 158 + }, + { + "epoch": 0.045337895637296836, + "grad_norm": 3.2329437736534476, + "learning_rate": 9.994009108099038e-06, + "loss": 0.0821, + "step": 159 + }, + { + "epoch": 0.04562303963501568, + "grad_norm": 1.59183218675036, + "learning_rate": 9.99378095127621e-06, + "loss": 0.0245, + "step": 160 + }, + { + "epoch": 0.04590818363273453, + "grad_norm": 1.6404471636929772, + "learning_rate": 9.993548533410162e-06, + "loss": 0.0188, + "step": 161 + }, + { + "epoch": 0.046193327630453376, + "grad_norm": 5.826108372985289, + "learning_rate": 9.993311854699214e-06, + "loss": 0.0963, + "step": 162 + }, + { + "epoch": 0.04647847162817223, + "grad_norm": 3.8052953300535393, + "learning_rate": 9.993070915345313e-06, + "loss": 0.063, + "step": 163 + }, + { + "epoch": 0.046763615625891077, + "grad_norm": 3.166036914377814, + "learning_rate": 9.992825715554047e-06, + "loss": 0.0391, + "step": 164 + }, + { + "epoch": 0.04704875962360992, + "grad_norm": 0.9851882069270612, + "learning_rate": 9.992576255534637e-06, + "loss": 0.0412, + "step": 165 + }, + { + "epoch": 0.04733390362132877, + "grad_norm": 1.9312831934515782, + "learning_rate": 9.99232253549994e-06, + "loss": 0.0201, + "step": 166 + }, + { + "epoch": 0.047619047619047616, + "grad_norm": 0.902848957710233, + "learning_rate": 9.992064555666448e-06, + "loss": 0.0235, + "step": 167 + }, + { + "epoch": 0.04790419161676647, + "grad_norm": 2.4508039029457254, + "learning_rate": 9.991802316254286e-06, + "loss": 0.0236, + "step": 168 + }, + { + "epoch": 0.04818933561448532, + "grad_norm": 2.052929331756409, + "learning_rate": 9.991535817487218e-06, + "loss": 0.038, + "step": 169 + }, + { + "epoch": 0.04847447961220416, + "grad_norm": 1.480706200852706, + "learning_rate": 9.991265059592638e-06, + "loss": 0.0556, + "step": 170 + }, + { + "epoch": 0.04875962360992301, + "grad_norm": 4.557244503271385, + "learning_rate": 9.990990042801573e-06, + "loss": 0.0669, + "step": 171 + }, + { + "epoch": 0.049044767607641856, + "grad_norm": 2.087362034480544, + "learning_rate": 9.990710767348692e-06, + "loss": 0.0531, + "step": 172 + }, + { + "epoch": 0.04932991160536071, + "grad_norm": 1.290096043611282, + "learning_rate": 9.990427233472286e-06, + "loss": 0.0347, + "step": 173 + }, + { + "epoch": 0.04961505560307956, + "grad_norm": 4.300260420077906, + "learning_rate": 9.990139441414291e-06, + "loss": 0.0686, + "step": 174 + }, + { + "epoch": 0.0499001996007984, + "grad_norm": 2.3277629405503455, + "learning_rate": 9.989847391420268e-06, + "loss": 0.0355, + "step": 175 + }, + { + "epoch": 0.05018534359851725, + "grad_norm": 1.0758597876794864, + "learning_rate": 9.989551083739416e-06, + "loss": 0.0379, + "step": 176 + }, + { + "epoch": 0.0504704875962361, + "grad_norm": 2.296501614131635, + "learning_rate": 9.989250518624566e-06, + "loss": 0.0463, + "step": 177 + }, + { + "epoch": 0.05075563159395495, + "grad_norm": 1.3724194734047541, + "learning_rate": 9.98894569633218e-06, + "loss": 0.0314, + "step": 178 + }, + { + "epoch": 0.0510407755916738, + "grad_norm": 1.7362884773572984, + "learning_rate": 9.988636617122354e-06, + "loss": 0.0585, + "step": 179 + }, + { + "epoch": 0.05132591958939264, + "grad_norm": 0.7782751167457802, + "learning_rate": 9.988323281258817e-06, + "loss": 0.0309, + "step": 180 + }, + { + "epoch": 0.05161106358711149, + "grad_norm": 2.6395856764774583, + "learning_rate": 9.988005689008926e-06, + "loss": 0.0502, + "step": 181 + }, + { + "epoch": 0.05189620758483034, + "grad_norm": 2.978324666472321, + "learning_rate": 9.987683840643679e-06, + "loss": 0.0429, + "step": 182 + }, + { + "epoch": 0.05218135158254919, + "grad_norm": 3.4011230134034167, + "learning_rate": 9.987357736437691e-06, + "loss": 0.0637, + "step": 183 + }, + { + "epoch": 0.05246649558026804, + "grad_norm": 1.941822329126047, + "learning_rate": 9.987027376669224e-06, + "loss": 0.0315, + "step": 184 + }, + { + "epoch": 0.052751639577986884, + "grad_norm": 2.8549897117814567, + "learning_rate": 9.986692761620163e-06, + "loss": 0.0382, + "step": 185 + }, + { + "epoch": 0.05303678357570573, + "grad_norm": 2.582295889648147, + "learning_rate": 9.986353891576021e-06, + "loss": 0.0385, + "step": 186 + }, + { + "epoch": 0.05332192757342458, + "grad_norm": 2.471870150657678, + "learning_rate": 9.98601076682595e-06, + "loss": 0.0322, + "step": 187 + }, + { + "epoch": 0.05360707157114343, + "grad_norm": 2.6685644561553006, + "learning_rate": 9.985663387662726e-06, + "loss": 0.0603, + "step": 188 + }, + { + "epoch": 0.05389221556886228, + "grad_norm": 2.4073947494866186, + "learning_rate": 9.985311754382758e-06, + "loss": 0.0362, + "step": 189 + }, + { + "epoch": 0.054177359566581124, + "grad_norm": 2.4836029774438666, + "learning_rate": 9.984955867286083e-06, + "loss": 0.0384, + "step": 190 + }, + { + "epoch": 0.05446250356429997, + "grad_norm": 2.5496037219151018, + "learning_rate": 9.98459572667637e-06, + "loss": 0.055, + "step": 191 + }, + { + "epoch": 0.05474764756201882, + "grad_norm": 2.2290862127290585, + "learning_rate": 9.984231332860914e-06, + "loss": 0.03, + "step": 192 + }, + { + "epoch": 0.05503279155973767, + "grad_norm": 2.0539664937082893, + "learning_rate": 9.983862686150644e-06, + "loss": 0.0486, + "step": 193 + }, + { + "epoch": 0.05531793555745652, + "grad_norm": 1.347135991088387, + "learning_rate": 9.983489786860115e-06, + "loss": 0.0335, + "step": 194 + }, + { + "epoch": 0.055603079555175364, + "grad_norm": 1.152598221757865, + "learning_rate": 9.983112635307508e-06, + "loss": 0.0417, + "step": 195 + }, + { + "epoch": 0.05588822355289421, + "grad_norm": 1.1394323676423384, + "learning_rate": 9.982731231814637e-06, + "loss": 0.04, + "step": 196 + }, + { + "epoch": 0.05617336755061306, + "grad_norm": 0.5751858615186785, + "learning_rate": 9.982345576706942e-06, + "loss": 0.0257, + "step": 197 + }, + { + "epoch": 0.05645851154833191, + "grad_norm": 2.228495954241724, + "learning_rate": 9.981955670313491e-06, + "loss": 0.0244, + "step": 198 + }, + { + "epoch": 0.05674365554605076, + "grad_norm": 2.1167156058410566, + "learning_rate": 9.98156151296698e-06, + "loss": 0.0255, + "step": 199 + }, + { + "epoch": 0.057028799543769604, + "grad_norm": 1.1499351454121258, + "learning_rate": 9.981163105003731e-06, + "loss": 0.0399, + "step": 200 + }, + { + "epoch": 0.05731394354148845, + "grad_norm": 1.0863620605603235, + "learning_rate": 9.980760446763693e-06, + "loss": 0.0336, + "step": 201 + }, + { + "epoch": 0.0575990875392073, + "grad_norm": 1.8021997995153731, + "learning_rate": 9.980353538590441e-06, + "loss": 0.049, + "step": 202 + }, + { + "epoch": 0.05788423153692615, + "grad_norm": 0.8060673496091526, + "learning_rate": 9.97994238083118e-06, + "loss": 0.02, + "step": 203 + }, + { + "epoch": 0.058169375534645, + "grad_norm": 2.8431892970960226, + "learning_rate": 9.97952697383674e-06, + "loss": 0.0459, + "step": 204 + }, + { + "epoch": 0.058454519532363844, + "grad_norm": 2.2266671109715475, + "learning_rate": 9.979107317961572e-06, + "loss": 0.0379, + "step": 205 + }, + { + "epoch": 0.05873966353008269, + "grad_norm": 1.4259804628148744, + "learning_rate": 9.978683413563755e-06, + "loss": 0.0346, + "step": 206 + }, + { + "epoch": 0.05902480752780154, + "grad_norm": 1.9019710115897548, + "learning_rate": 9.978255261004996e-06, + "loss": 0.0321, + "step": 207 + }, + { + "epoch": 0.05930995152552039, + "grad_norm": 3.2763885427824433, + "learning_rate": 9.977822860650626e-06, + "loss": 0.0327, + "step": 208 + }, + { + "epoch": 0.05959509552323924, + "grad_norm": 4.085746329437377, + "learning_rate": 9.977386212869597e-06, + "loss": 0.0785, + "step": 209 + }, + { + "epoch": 0.059880239520958084, + "grad_norm": 1.4461803192099967, + "learning_rate": 9.976945318034487e-06, + "loss": 0.0372, + "step": 210 + }, + { + "epoch": 0.06016538351867693, + "grad_norm": 1.6159101224346348, + "learning_rate": 9.9765001765215e-06, + "loss": 0.0432, + "step": 211 + }, + { + "epoch": 0.06045052751639578, + "grad_norm": 2.109127926712865, + "learning_rate": 9.976050788710462e-06, + "loss": 0.0474, + "step": 212 + }, + { + "epoch": 0.06073567151411463, + "grad_norm": 1.6172712792447292, + "learning_rate": 9.97559715498482e-06, + "loss": 0.0299, + "step": 213 + }, + { + "epoch": 0.06102081551183348, + "grad_norm": 1.1823346555015843, + "learning_rate": 9.975139275731649e-06, + "loss": 0.0224, + "step": 214 + }, + { + "epoch": 0.061305959509552324, + "grad_norm": 1.2000963098670507, + "learning_rate": 9.97467715134164e-06, + "loss": 0.0362, + "step": 215 + }, + { + "epoch": 0.06159110350727117, + "grad_norm": 1.6613781060710882, + "learning_rate": 9.974210782209113e-06, + "loss": 0.0382, + "step": 216 + }, + { + "epoch": 0.06187624750499002, + "grad_norm": 2.2297827161797406, + "learning_rate": 9.973740168732006e-06, + "loss": 0.0407, + "step": 217 + }, + { + "epoch": 0.06216139150270887, + "grad_norm": 3.0544112277949713, + "learning_rate": 9.973265311311877e-06, + "loss": 0.0534, + "step": 218 + }, + { + "epoch": 0.06244653550042772, + "grad_norm": 1.8606402993779663, + "learning_rate": 9.972786210353913e-06, + "loss": 0.031, + "step": 219 + }, + { + "epoch": 0.06273167949814656, + "grad_norm": 1.9835976433808742, + "learning_rate": 9.97230286626691e-06, + "loss": 0.0512, + "step": 220 + }, + { + "epoch": 0.06301682349586542, + "grad_norm": 2.4671374293137474, + "learning_rate": 9.971815279463294e-06, + "loss": 0.0261, + "step": 221 + }, + { + "epoch": 0.06330196749358426, + "grad_norm": 1.9426433678441557, + "learning_rate": 9.971323450359109e-06, + "loss": 0.0354, + "step": 222 + }, + { + "epoch": 0.06358711149130311, + "grad_norm": 2.833304865793606, + "learning_rate": 9.970827379374016e-06, + "loss": 0.0658, + "step": 223 + }, + { + "epoch": 0.06387225548902195, + "grad_norm": 1.364065742400369, + "learning_rate": 9.9703270669313e-06, + "loss": 0.023, + "step": 224 + }, + { + "epoch": 0.0641573994867408, + "grad_norm": 2.0598784617586783, + "learning_rate": 9.96982251345786e-06, + "loss": 0.0303, + "step": 225 + }, + { + "epoch": 0.06444254348445966, + "grad_norm": 1.6800528252103155, + "learning_rate": 9.969313719384217e-06, + "loss": 0.0232, + "step": 226 + }, + { + "epoch": 0.0647276874821785, + "grad_norm": 2.5347983635463067, + "learning_rate": 9.96880068514451e-06, + "loss": 0.051, + "step": 227 + }, + { + "epoch": 0.06501283147989735, + "grad_norm": 4.198478034633089, + "learning_rate": 9.968283411176499e-06, + "loss": 0.0637, + "step": 228 + }, + { + "epoch": 0.06529797547761619, + "grad_norm": 2.4608303176324013, + "learning_rate": 9.967761897921553e-06, + "loss": 0.0356, + "step": 229 + }, + { + "epoch": 0.06558311947533504, + "grad_norm": 1.1830386086249287, + "learning_rate": 9.967236145824666e-06, + "loss": 0.0225, + "step": 230 + }, + { + "epoch": 0.0658682634730539, + "grad_norm": 2.335905650803093, + "learning_rate": 9.966706155334445e-06, + "loss": 0.0484, + "step": 231 + }, + { + "epoch": 0.06615340747077274, + "grad_norm": 0.7365538318490851, + "learning_rate": 9.966171926903116e-06, + "loss": 0.0242, + "step": 232 + }, + { + "epoch": 0.06643855146849159, + "grad_norm": 0.8332673431312235, + "learning_rate": 9.965633460986521e-06, + "loss": 0.0143, + "step": 233 + }, + { + "epoch": 0.06672369546621043, + "grad_norm": 1.1115342044179635, + "learning_rate": 9.965090758044116e-06, + "loss": 0.0315, + "step": 234 + }, + { + "epoch": 0.06700883946392928, + "grad_norm": 2.828412875905856, + "learning_rate": 9.964543818538974e-06, + "loss": 0.0724, + "step": 235 + }, + { + "epoch": 0.06729398346164814, + "grad_norm": 1.0745851388736904, + "learning_rate": 9.963992642937782e-06, + "loss": 0.0155, + "step": 236 + }, + { + "epoch": 0.06757912745936698, + "grad_norm": 1.9642520660921505, + "learning_rate": 9.963437231710838e-06, + "loss": 0.0296, + "step": 237 + }, + { + "epoch": 0.06786427145708583, + "grad_norm": 0.6051981185401907, + "learning_rate": 9.962877585332062e-06, + "loss": 0.0177, + "step": 238 + }, + { + "epoch": 0.06814941545480467, + "grad_norm": 4.122674586469668, + "learning_rate": 9.962313704278981e-06, + "loss": 0.0523, + "step": 239 + }, + { + "epoch": 0.06843455945252352, + "grad_norm": 1.6897654144243972, + "learning_rate": 9.96174558903274e-06, + "loss": 0.0132, + "step": 240 + }, + { + "epoch": 0.06871970345024238, + "grad_norm": 3.1253283435389196, + "learning_rate": 9.961173240078092e-06, + "loss": 0.0321, + "step": 241 + }, + { + "epoch": 0.06900484744796122, + "grad_norm": 1.3965076576538442, + "learning_rate": 9.960596657903407e-06, + "loss": 0.029, + "step": 242 + }, + { + "epoch": 0.06928999144568007, + "grad_norm": 1.9660002863377228, + "learning_rate": 9.960015843000666e-06, + "loss": 0.0522, + "step": 243 + }, + { + "epoch": 0.06957513544339891, + "grad_norm": 1.3233952783403555, + "learning_rate": 9.959430795865457e-06, + "loss": 0.0353, + "step": 244 + }, + { + "epoch": 0.06986027944111776, + "grad_norm": 1.7033322453474304, + "learning_rate": 9.958841516996989e-06, + "loss": 0.0223, + "step": 245 + }, + { + "epoch": 0.07014542343883662, + "grad_norm": 2.5317243818879764, + "learning_rate": 9.95824800689807e-06, + "loss": 0.0598, + "step": 246 + }, + { + "epoch": 0.07043056743655546, + "grad_norm": 2.69732116281974, + "learning_rate": 9.957650266075129e-06, + "loss": 0.0419, + "step": 247 + }, + { + "epoch": 0.07071571143427431, + "grad_norm": 1.4150311093853096, + "learning_rate": 9.957048295038197e-06, + "loss": 0.0472, + "step": 248 + }, + { + "epoch": 0.07100085543199315, + "grad_norm": 1.2224281181933392, + "learning_rate": 9.95644209430092e-06, + "loss": 0.0389, + "step": 249 + }, + { + "epoch": 0.071285999429712, + "grad_norm": 51.194853358315605, + "learning_rate": 9.955831664380548e-06, + "loss": 0.0451, + "step": 250 + }, + { + "epoch": 0.07157114342743086, + "grad_norm": 1.46254711704071, + "learning_rate": 9.955217005797946e-06, + "loss": 0.0276, + "step": 251 + }, + { + "epoch": 0.0718562874251497, + "grad_norm": 3.4636231015416974, + "learning_rate": 9.954598119077583e-06, + "loss": 0.0396, + "step": 252 + }, + { + "epoch": 0.07214143142286855, + "grad_norm": 0.8518861435588755, + "learning_rate": 9.953975004747535e-06, + "loss": 0.0313, + "step": 253 + }, + { + "epoch": 0.07242657542058739, + "grad_norm": 1.4742031582432285, + "learning_rate": 9.953347663339487e-06, + "loss": 0.0397, + "step": 254 + }, + { + "epoch": 0.07271171941830624, + "grad_norm": 0.8505976022407974, + "learning_rate": 9.95271609538873e-06, + "loss": 0.0259, + "step": 255 + }, + { + "epoch": 0.0729968634160251, + "grad_norm": 1.3056007814387238, + "learning_rate": 9.952080301434165e-06, + "loss": 0.0109, + "step": 256 + }, + { + "epoch": 0.07328200741374394, + "grad_norm": 2.734719524535765, + "learning_rate": 9.951440282018294e-06, + "loss": 0.0424, + "step": 257 + }, + { + "epoch": 0.07356715141146279, + "grad_norm": 0.915900286106093, + "learning_rate": 9.950796037687224e-06, + "loss": 0.0279, + "step": 258 + }, + { + "epoch": 0.07385229540918163, + "grad_norm": 1.1576885381186377, + "learning_rate": 9.950147568990672e-06, + "loss": 0.0291, + "step": 259 + }, + { + "epoch": 0.07413743940690048, + "grad_norm": 1.1396907116761987, + "learning_rate": 9.949494876481957e-06, + "loss": 0.0573, + "step": 260 + }, + { + "epoch": 0.07442258340461934, + "grad_norm": 0.8954392948824229, + "learning_rate": 9.948837960718001e-06, + "loss": 0.0109, + "step": 261 + }, + { + "epoch": 0.07470772740233818, + "grad_norm": 2.2187692519372955, + "learning_rate": 9.94817682225933e-06, + "loss": 0.0405, + "step": 262 + }, + { + "epoch": 0.07499287140005703, + "grad_norm": 1.1502737187174694, + "learning_rate": 9.947511461670076e-06, + "loss": 0.0194, + "step": 263 + }, + { + "epoch": 0.07527801539777587, + "grad_norm": 1.061028206812728, + "learning_rate": 9.946841879517968e-06, + "loss": 0.0419, + "step": 264 + }, + { + "epoch": 0.07556315939549473, + "grad_norm": 1.3832069086922913, + "learning_rate": 9.94616807637434e-06, + "loss": 0.015, + "step": 265 + }, + { + "epoch": 0.07584830339321358, + "grad_norm": 0.6459735906834944, + "learning_rate": 9.945490052814133e-06, + "loss": 0.0255, + "step": 266 + }, + { + "epoch": 0.07613344739093242, + "grad_norm": 1.1494098773455543, + "learning_rate": 9.94480780941588e-06, + "loss": 0.0346, + "step": 267 + }, + { + "epoch": 0.07641859138865127, + "grad_norm": 0.9289181620653825, + "learning_rate": 9.944121346761718e-06, + "loss": 0.014, + "step": 268 + }, + { + "epoch": 0.07670373538637011, + "grad_norm": 1.2203907189231082, + "learning_rate": 9.943430665437388e-06, + "loss": 0.0167, + "step": 269 + }, + { + "epoch": 0.07698887938408897, + "grad_norm": 2.620259167299579, + "learning_rate": 9.942735766032228e-06, + "loss": 0.044, + "step": 270 + }, + { + "epoch": 0.07727402338180782, + "grad_norm": 1.7206246319986442, + "learning_rate": 9.94203664913917e-06, + "loss": 0.0235, + "step": 271 + }, + { + "epoch": 0.07755916737952666, + "grad_norm": 1.4642402716762304, + "learning_rate": 9.941333315354755e-06, + "loss": 0.0258, + "step": 272 + }, + { + "epoch": 0.07784431137724551, + "grad_norm": 1.7553631989445817, + "learning_rate": 9.940625765279112e-06, + "loss": 0.0345, + "step": 273 + }, + { + "epoch": 0.07812945537496435, + "grad_norm": 1.6716431070541198, + "learning_rate": 9.939913999515976e-06, + "loss": 0.0454, + "step": 274 + }, + { + "epoch": 0.0784145993726832, + "grad_norm": 1.9526398951956696, + "learning_rate": 9.939198018672671e-06, + "loss": 0.0382, + "step": 275 + }, + { + "epoch": 0.07869974337040206, + "grad_norm": 1.2954788594239341, + "learning_rate": 9.938477823360127e-06, + "loss": 0.0309, + "step": 276 + }, + { + "epoch": 0.0789848873681209, + "grad_norm": 4.4338813137709385, + "learning_rate": 9.937753414192862e-06, + "loss": 0.072, + "step": 277 + }, + { + "epoch": 0.07927003136583975, + "grad_norm": 1.56094475712123, + "learning_rate": 9.937024791788991e-06, + "loss": 0.0517, + "step": 278 + }, + { + "epoch": 0.07955517536355859, + "grad_norm": 1.7375848109236927, + "learning_rate": 9.93629195677023e-06, + "loss": 0.0345, + "step": 279 + }, + { + "epoch": 0.07984031936127745, + "grad_norm": 2.649452584682016, + "learning_rate": 9.935554909761882e-06, + "loss": 0.0594, + "step": 280 + }, + { + "epoch": 0.0801254633589963, + "grad_norm": 1.6499457117477292, + "learning_rate": 9.93481365139285e-06, + "loss": 0.0376, + "step": 281 + }, + { + "epoch": 0.08041060735671514, + "grad_norm": 1.1619489575147839, + "learning_rate": 9.934068182295622e-06, + "loss": 0.04, + "step": 282 + }, + { + "epoch": 0.08069575135443399, + "grad_norm": 1.7247725696145613, + "learning_rate": 9.933318503106291e-06, + "loss": 0.0323, + "step": 283 + }, + { + "epoch": 0.08098089535215283, + "grad_norm": 2.1577985956712666, + "learning_rate": 9.93256461446453e-06, + "loss": 0.0526, + "step": 284 + }, + { + "epoch": 0.08126603934987169, + "grad_norm": 0.8867124373033844, + "learning_rate": 9.931806517013612e-06, + "loss": 0.0144, + "step": 285 + }, + { + "epoch": 0.08155118334759054, + "grad_norm": 0.9443505140091544, + "learning_rate": 9.9310442114004e-06, + "loss": 0.0233, + "step": 286 + }, + { + "epoch": 0.08183632734530938, + "grad_norm": 0.8757124141344964, + "learning_rate": 9.930277698275347e-06, + "loss": 0.0235, + "step": 287 + }, + { + "epoch": 0.08212147134302823, + "grad_norm": 2.2248864075955805, + "learning_rate": 9.92950697829249e-06, + "loss": 0.0495, + "step": 288 + }, + { + "epoch": 0.08240661534074707, + "grad_norm": 1.794320095937349, + "learning_rate": 9.928732052109466e-06, + "loss": 0.0575, + "step": 289 + }, + { + "epoch": 0.08269175933846593, + "grad_norm": 0.5271974530133274, + "learning_rate": 9.927952920387497e-06, + "loss": 0.0174, + "step": 290 + }, + { + "epoch": 0.08297690333618478, + "grad_norm": 2.130600355467149, + "learning_rate": 9.92716958379139e-06, + "loss": 0.0269, + "step": 291 + }, + { + "epoch": 0.08326204733390362, + "grad_norm": 1.6078717603842572, + "learning_rate": 9.926382042989544e-06, + "loss": 0.0381, + "step": 292 + }, + { + "epoch": 0.08354719133162247, + "grad_norm": 1.067101418411665, + "learning_rate": 9.925590298653942e-06, + "loss": 0.0416, + "step": 293 + }, + { + "epoch": 0.08383233532934131, + "grad_norm": 1.8888059772289436, + "learning_rate": 9.924794351460159e-06, + "loss": 0.0431, + "step": 294 + }, + { + "epoch": 0.08411747932706017, + "grad_norm": 1.1366438944954262, + "learning_rate": 9.92399420208735e-06, + "loss": 0.034, + "step": 295 + }, + { + "epoch": 0.08440262332477902, + "grad_norm": 66.83013849628288, + "learning_rate": 9.923189851218259e-06, + "loss": 0.303, + "step": 296 + }, + { + "epoch": 0.08468776732249786, + "grad_norm": 2.2777335388392643, + "learning_rate": 9.922381299539214e-06, + "loss": 0.0426, + "step": 297 + }, + { + "epoch": 0.08497291132021671, + "grad_norm": 1.3777833351571647, + "learning_rate": 9.921568547740131e-06, + "loss": 0.0551, + "step": 298 + }, + { + "epoch": 0.08525805531793555, + "grad_norm": 0.6388835255348255, + "learning_rate": 9.920751596514502e-06, + "loss": 0.0228, + "step": 299 + }, + { + "epoch": 0.0855431993156544, + "grad_norm": 2.278298167977118, + "learning_rate": 9.919930446559412e-06, + "loss": 0.0349, + "step": 300 + }, + { + "epoch": 0.08582834331337326, + "grad_norm": 0.873059407793711, + "learning_rate": 9.91910509857552e-06, + "loss": 0.0212, + "step": 301 + }, + { + "epoch": 0.0861134873110921, + "grad_norm": 2.2132859296620038, + "learning_rate": 9.918275553267069e-06, + "loss": 0.0439, + "step": 302 + }, + { + "epoch": 0.08639863130881095, + "grad_norm": 2.615477056500074, + "learning_rate": 9.917441811341887e-06, + "loss": 0.0299, + "step": 303 + }, + { + "epoch": 0.08668377530652979, + "grad_norm": 1.0557781544658622, + "learning_rate": 9.916603873511386e-06, + "loss": 0.0158, + "step": 304 + }, + { + "epoch": 0.08696891930424865, + "grad_norm": 3.9400306558554754, + "learning_rate": 9.915761740490545e-06, + "loss": 0.0444, + "step": 305 + }, + { + "epoch": 0.0872540633019675, + "grad_norm": 3.1606798637531544, + "learning_rate": 9.914915412997937e-06, + "loss": 0.035, + "step": 306 + }, + { + "epoch": 0.08753920729968634, + "grad_norm": 2.2379796936077754, + "learning_rate": 9.914064891755703e-06, + "loss": 0.0353, + "step": 307 + }, + { + "epoch": 0.08782435129740519, + "grad_norm": 2.0924722936161007, + "learning_rate": 9.91321017748957e-06, + "loss": 0.0466, + "step": 308 + }, + { + "epoch": 0.08810949529512403, + "grad_norm": 1.1933158739643803, + "learning_rate": 9.91235127092884e-06, + "loss": 0.0237, + "step": 309 + }, + { + "epoch": 0.08839463929284289, + "grad_norm": 1.450838486064019, + "learning_rate": 9.911488172806392e-06, + "loss": 0.0296, + "step": 310 + }, + { + "epoch": 0.08867978329056174, + "grad_norm": 2.8659368821250095, + "learning_rate": 9.91062088385868e-06, + "loss": 0.0644, + "step": 311 + }, + { + "epoch": 0.08896492728828058, + "grad_norm": 2.3883067154612823, + "learning_rate": 9.909749404825736e-06, + "loss": 0.0558, + "step": 312 + }, + { + "epoch": 0.08925007128599943, + "grad_norm": 1.338846186227719, + "learning_rate": 9.90887373645117e-06, + "loss": 0.0142, + "step": 313 + }, + { + "epoch": 0.08953521528371827, + "grad_norm": 2.2418774559520105, + "learning_rate": 9.907993879482161e-06, + "loss": 0.0362, + "step": 314 + }, + { + "epoch": 0.08982035928143713, + "grad_norm": 1.5944639604254414, + "learning_rate": 9.907109834669465e-06, + "loss": 0.0277, + "step": 315 + }, + { + "epoch": 0.09010550327915598, + "grad_norm": 1.8710437857203388, + "learning_rate": 9.90622160276741e-06, + "loss": 0.0235, + "step": 316 + }, + { + "epoch": 0.09039064727687482, + "grad_norm": 1.4117522494101777, + "learning_rate": 9.905329184533897e-06, + "loss": 0.0328, + "step": 317 + }, + { + "epoch": 0.09067579127459367, + "grad_norm": 1.2111494335027815, + "learning_rate": 9.904432580730404e-06, + "loss": 0.0445, + "step": 318 + }, + { + "epoch": 0.09096093527231251, + "grad_norm": 1.0923531873837928, + "learning_rate": 9.90353179212197e-06, + "loss": 0.0363, + "step": 319 + }, + { + "epoch": 0.09124607927003137, + "grad_norm": 0.9108263761049934, + "learning_rate": 9.902626819477214e-06, + "loss": 0.0233, + "step": 320 + }, + { + "epoch": 0.09153122326775022, + "grad_norm": 0.9052416100597438, + "learning_rate": 9.901717663568323e-06, + "loss": 0.0221, + "step": 321 + }, + { + "epoch": 0.09181636726546906, + "grad_norm": 1.5417885755673133, + "learning_rate": 9.900804325171052e-06, + "loss": 0.0357, + "step": 322 + }, + { + "epoch": 0.09210151126318791, + "grad_norm": 1.1483931076981324, + "learning_rate": 9.899886805064723e-06, + "loss": 0.042, + "step": 323 + }, + { + "epoch": 0.09238665526090675, + "grad_norm": 0.8264374345543277, + "learning_rate": 9.89896510403223e-06, + "loss": 0.0135, + "step": 324 + }, + { + "epoch": 0.0926717992586256, + "grad_norm": 3.028712934159954, + "learning_rate": 9.898039222860032e-06, + "loss": 0.096, + "step": 325 + }, + { + "epoch": 0.09295694325634446, + "grad_norm": 3.908443382779185, + "learning_rate": 9.897109162338157e-06, + "loss": 0.0678, + "step": 326 + }, + { + "epoch": 0.0932420872540633, + "grad_norm": 1.4504578551621745, + "learning_rate": 9.896174923260198e-06, + "loss": 0.028, + "step": 327 + }, + { + "epoch": 0.09352723125178215, + "grad_norm": 1.0918597452992596, + "learning_rate": 9.89523650642331e-06, + "loss": 0.0352, + "step": 328 + }, + { + "epoch": 0.09381237524950099, + "grad_norm": 1.3624134945746003, + "learning_rate": 9.89429391262822e-06, + "loss": 0.034, + "step": 329 + }, + { + "epoch": 0.09409751924721985, + "grad_norm": 1.9386090064569985, + "learning_rate": 9.893347142679211e-06, + "loss": 0.0213, + "step": 330 + }, + { + "epoch": 0.0943826632449387, + "grad_norm": 2.442430474342043, + "learning_rate": 9.892396197384135e-06, + "loss": 0.0274, + "step": 331 + }, + { + "epoch": 0.09466780724265754, + "grad_norm": 2.7994856234060372, + "learning_rate": 9.891441077554405e-06, + "loss": 0.0496, + "step": 332 + }, + { + "epoch": 0.0949529512403764, + "grad_norm": 1.0457890691940244, + "learning_rate": 9.890481784004998e-06, + "loss": 0.0197, + "step": 333 + }, + { + "epoch": 0.09523809523809523, + "grad_norm": 6.1017012731170865, + "learning_rate": 9.889518317554446e-06, + "loss": 0.047, + "step": 334 + }, + { + "epoch": 0.09552323923581409, + "grad_norm": 2.090726587494363, + "learning_rate": 9.88855067902485e-06, + "loss": 0.0326, + "step": 335 + }, + { + "epoch": 0.09580838323353294, + "grad_norm": 0.699893313587921, + "learning_rate": 9.887578869241866e-06, + "loss": 0.0164, + "step": 336 + }, + { + "epoch": 0.09609352723125178, + "grad_norm": 3.507183876219138, + "learning_rate": 9.886602889034709e-06, + "loss": 0.0416, + "step": 337 + }, + { + "epoch": 0.09637867122897063, + "grad_norm": 0.5771253616906034, + "learning_rate": 9.885622739236154e-06, + "loss": 0.0161, + "step": 338 + }, + { + "epoch": 0.09666381522668947, + "grad_norm": 1.231417111947857, + "learning_rate": 9.884638420682534e-06, + "loss": 0.0269, + "step": 339 + }, + { + "epoch": 0.09694895922440833, + "grad_norm": 1.7586310033147416, + "learning_rate": 9.883649934213738e-06, + "loss": 0.0303, + "step": 340 + }, + { + "epoch": 0.09723410322212718, + "grad_norm": 1.6088417745313677, + "learning_rate": 9.882657280673212e-06, + "loss": 0.0277, + "step": 341 + }, + { + "epoch": 0.09751924721984602, + "grad_norm": 1.8001784084893846, + "learning_rate": 9.881660460907957e-06, + "loss": 0.0232, + "step": 342 + }, + { + "epoch": 0.09780439121756487, + "grad_norm": 3.549553604790123, + "learning_rate": 9.880659475768526e-06, + "loss": 0.1015, + "step": 343 + }, + { + "epoch": 0.09808953521528371, + "grad_norm": 1.0138498172858956, + "learning_rate": 9.879654326109037e-06, + "loss": 0.0219, + "step": 344 + }, + { + "epoch": 0.09837467921300257, + "grad_norm": 0.5962872642185872, + "learning_rate": 9.878645012787149e-06, + "loss": 0.0073, + "step": 345 + }, + { + "epoch": 0.09865982321072142, + "grad_norm": 1.5344142515023333, + "learning_rate": 9.87763153666408e-06, + "loss": 0.0188, + "step": 346 + }, + { + "epoch": 0.09894496720844026, + "grad_norm": 2.6171999253216014, + "learning_rate": 9.8766138986046e-06, + "loss": 0.0448, + "step": 347 + }, + { + "epoch": 0.09923011120615911, + "grad_norm": 2.0090501521581885, + "learning_rate": 9.875592099477025e-06, + "loss": 0.0481, + "step": 348 + }, + { + "epoch": 0.09951525520387795, + "grad_norm": 2.3354612758931386, + "learning_rate": 9.874566140153228e-06, + "loss": 0.0417, + "step": 349 + }, + { + "epoch": 0.0998003992015968, + "grad_norm": 1.5781598957515535, + "learning_rate": 9.87353602150863e-06, + "loss": 0.0188, + "step": 350 + }, + { + "epoch": 0.10008554319931566, + "grad_norm": 1.928184575603502, + "learning_rate": 9.8725017444222e-06, + "loss": 0.0234, + "step": 351 + }, + { + "epoch": 0.1003706871970345, + "grad_norm": 1.0876133323980701, + "learning_rate": 9.871463309776455e-06, + "loss": 0.009, + "step": 352 + }, + { + "epoch": 0.10065583119475335, + "grad_norm": 11.324227273778364, + "learning_rate": 9.870420718457458e-06, + "loss": 0.011, + "step": 353 + }, + { + "epoch": 0.1009409751924722, + "grad_norm": 2.3272703667045516, + "learning_rate": 9.869373971354826e-06, + "loss": 0.0427, + "step": 354 + }, + { + "epoch": 0.10122611919019105, + "grad_norm": 1.2649728160919362, + "learning_rate": 9.868323069361712e-06, + "loss": 0.0259, + "step": 355 + }, + { + "epoch": 0.1015112631879099, + "grad_norm": 2.0664662137667467, + "learning_rate": 9.867268013374822e-06, + "loss": 0.0226, + "step": 356 + }, + { + "epoch": 0.10179640718562874, + "grad_norm": 0.9365874475796875, + "learning_rate": 9.866208804294401e-06, + "loss": 0.0107, + "step": 357 + }, + { + "epoch": 0.1020815511833476, + "grad_norm": 0.5822681312488291, + "learning_rate": 9.865145443024243e-06, + "loss": 0.0073, + "step": 358 + }, + { + "epoch": 0.10236669518106643, + "grad_norm": 0.5773090002383701, + "learning_rate": 9.86407793047168e-06, + "loss": 0.0064, + "step": 359 + }, + { + "epoch": 0.10265183917878529, + "grad_norm": 1.298097020704675, + "learning_rate": 9.863006267547591e-06, + "loss": 0.0293, + "step": 360 + }, + { + "epoch": 0.10293698317650414, + "grad_norm": 2.01105743317194, + "learning_rate": 9.861930455166392e-06, + "loss": 0.0423, + "step": 361 + }, + { + "epoch": 0.10322212717422298, + "grad_norm": 3.854044900737908, + "learning_rate": 9.86085049424604e-06, + "loss": 0.0646, + "step": 362 + }, + { + "epoch": 0.10350727117194183, + "grad_norm": 3.1428521017863393, + "learning_rate": 9.859766385708035e-06, + "loss": 0.0603, + "step": 363 + }, + { + "epoch": 0.10379241516966067, + "grad_norm": 1.1415144666210106, + "learning_rate": 9.858678130477415e-06, + "loss": 0.0089, + "step": 364 + }, + { + "epoch": 0.10407755916737953, + "grad_norm": 1.4442748658078282, + "learning_rate": 9.857585729482753e-06, + "loss": 0.0271, + "step": 365 + }, + { + "epoch": 0.10436270316509838, + "grad_norm": 0.8289767742462578, + "learning_rate": 9.856489183656163e-06, + "loss": 0.0159, + "step": 366 + }, + { + "epoch": 0.10464784716281722, + "grad_norm": 1.7124474237003318, + "learning_rate": 9.855388493933298e-06, + "loss": 0.0527, + "step": 367 + }, + { + "epoch": 0.10493299116053607, + "grad_norm": 2.981703921126902, + "learning_rate": 9.854283661253338e-06, + "loss": 0.0357, + "step": 368 + }, + { + "epoch": 0.10521813515825491, + "grad_norm": 2.1585606753904965, + "learning_rate": 9.853174686559006e-06, + "loss": 0.0393, + "step": 369 + }, + { + "epoch": 0.10550327915597377, + "grad_norm": 1.5348910023395037, + "learning_rate": 9.852061570796557e-06, + "loss": 0.0398, + "step": 370 + }, + { + "epoch": 0.10578842315369262, + "grad_norm": 1.535951948243033, + "learning_rate": 9.85094431491578e-06, + "loss": 0.0208, + "step": 371 + }, + { + "epoch": 0.10607356715141146, + "grad_norm": 1.0394143122475457, + "learning_rate": 9.849822919869993e-06, + "loss": 0.0127, + "step": 372 + }, + { + "epoch": 0.10635871114913031, + "grad_norm": 1.8866287812876101, + "learning_rate": 9.848697386616052e-06, + "loss": 0.0243, + "step": 373 + }, + { + "epoch": 0.10664385514684915, + "grad_norm": 2.152613819596744, + "learning_rate": 9.847567716114339e-06, + "loss": 0.0475, + "step": 374 + }, + { + "epoch": 0.10692899914456801, + "grad_norm": 1.8499798292082466, + "learning_rate": 9.846433909328768e-06, + "loss": 0.0482, + "step": 375 + }, + { + "epoch": 0.10721414314228686, + "grad_norm": 2.1403198834038255, + "learning_rate": 9.845295967226782e-06, + "loss": 0.0359, + "step": 376 + }, + { + "epoch": 0.1074992871400057, + "grad_norm": 0.679326626935762, + "learning_rate": 9.844153890779352e-06, + "loss": 0.014, + "step": 377 + }, + { + "epoch": 0.10778443113772455, + "grad_norm": 2.4149761091081277, + "learning_rate": 9.84300768096098e-06, + "loss": 0.035, + "step": 378 + }, + { + "epoch": 0.1080695751354434, + "grad_norm": 1.1906254157269311, + "learning_rate": 9.841857338749693e-06, + "loss": 0.0321, + "step": 379 + }, + { + "epoch": 0.10835471913316225, + "grad_norm": 0.5918164701274412, + "learning_rate": 9.840702865127039e-06, + "loss": 0.0201, + "step": 380 + }, + { + "epoch": 0.1086398631308811, + "grad_norm": 1.7497628829871825, + "learning_rate": 9.839544261078099e-06, + "loss": 0.0211, + "step": 381 + }, + { + "epoch": 0.10892500712859994, + "grad_norm": 1.3983788980140046, + "learning_rate": 9.838381527591475e-06, + "loss": 0.0236, + "step": 382 + }, + { + "epoch": 0.1092101511263188, + "grad_norm": 3.1851074927752623, + "learning_rate": 9.83721466565929e-06, + "loss": 0.0372, + "step": 383 + }, + { + "epoch": 0.10949529512403763, + "grad_norm": 1.4171014306009728, + "learning_rate": 9.836043676277195e-06, + "loss": 0.0252, + "step": 384 + }, + { + "epoch": 0.10978043912175649, + "grad_norm": 0.9261846959930898, + "learning_rate": 9.83486856044436e-06, + "loss": 0.0191, + "step": 385 + }, + { + "epoch": 0.11006558311947534, + "grad_norm": 1.6296524588712908, + "learning_rate": 9.833689319163473e-06, + "loss": 0.0576, + "step": 386 + }, + { + "epoch": 0.11035072711719418, + "grad_norm": 0.8019311699722914, + "learning_rate": 9.832505953440748e-06, + "loss": 0.0202, + "step": 387 + }, + { + "epoch": 0.11063587111491303, + "grad_norm": 1.5113990773927573, + "learning_rate": 9.831318464285914e-06, + "loss": 0.046, + "step": 388 + }, + { + "epoch": 0.11092101511263187, + "grad_norm": 2.728229672787799, + "learning_rate": 9.83012685271222e-06, + "loss": 0.0537, + "step": 389 + }, + { + "epoch": 0.11120615911035073, + "grad_norm": 0.913884511065414, + "learning_rate": 9.828931119736435e-06, + "loss": 0.0102, + "step": 390 + }, + { + "epoch": 0.11149130310806958, + "grad_norm": 4.01446357656375, + "learning_rate": 9.827731266378839e-06, + "loss": 0.061, + "step": 391 + }, + { + "epoch": 0.11177644710578842, + "grad_norm": 1.3370546467948305, + "learning_rate": 9.82652729366323e-06, + "loss": 0.0371, + "step": 392 + }, + { + "epoch": 0.11206159110350727, + "grad_norm": 1.3728857509837673, + "learning_rate": 9.825319202616926e-06, + "loss": 0.0258, + "step": 393 + }, + { + "epoch": 0.11234673510122611, + "grad_norm": 1.8638416100609516, + "learning_rate": 9.82410699427075e-06, + "loss": 0.0386, + "step": 394 + }, + { + "epoch": 0.11263187909894497, + "grad_norm": 0.7022867943321849, + "learning_rate": 9.822890669659044e-06, + "loss": 0.0144, + "step": 395 + }, + { + "epoch": 0.11291702309666382, + "grad_norm": 1.4752375376718947, + "learning_rate": 9.821670229819663e-06, + "loss": 0.049, + "step": 396 + }, + { + "epoch": 0.11320216709438266, + "grad_norm": 0.7023676759664046, + "learning_rate": 9.820445675793973e-06, + "loss": 0.0241, + "step": 397 + }, + { + "epoch": 0.11348731109210151, + "grad_norm": 0.49880724393845305, + "learning_rate": 9.819217008626847e-06, + "loss": 0.0241, + "step": 398 + }, + { + "epoch": 0.11377245508982035, + "grad_norm": 2.064257083055477, + "learning_rate": 9.817984229366669e-06, + "loss": 0.058, + "step": 399 + }, + { + "epoch": 0.11405759908753921, + "grad_norm": 0.5760537754446529, + "learning_rate": 9.816747339065333e-06, + "loss": 0.0233, + "step": 400 + }, + { + "epoch": 0.11434274308525806, + "grad_norm": 1.2124787291362593, + "learning_rate": 9.81550633877824e-06, + "loss": 0.0614, + "step": 401 + }, + { + "epoch": 0.1146278870829769, + "grad_norm": 1.5895231814003101, + "learning_rate": 9.8142612295643e-06, + "loss": 0.0569, + "step": 402 + }, + { + "epoch": 0.11491303108069575, + "grad_norm": 1.0484784015774489, + "learning_rate": 9.813012012485925e-06, + "loss": 0.0161, + "step": 403 + }, + { + "epoch": 0.1151981750784146, + "grad_norm": 2.0015024924134988, + "learning_rate": 9.811758688609036e-06, + "loss": 0.0288, + "step": 404 + }, + { + "epoch": 0.11548331907613345, + "grad_norm": 1.0183121982064807, + "learning_rate": 9.810501259003058e-06, + "loss": 0.0272, + "step": 405 + }, + { + "epoch": 0.1157684630738523, + "grad_norm": 0.7040323962320703, + "learning_rate": 9.809239724740913e-06, + "loss": 0.0127, + "step": 406 + }, + { + "epoch": 0.11605360707157114, + "grad_norm": 1.2738847744090185, + "learning_rate": 9.807974086899037e-06, + "loss": 0.0326, + "step": 407 + }, + { + "epoch": 0.11633875106929, + "grad_norm": 1.6928730590425576, + "learning_rate": 9.806704346557354e-06, + "loss": 0.0467, + "step": 408 + }, + { + "epoch": 0.11662389506700883, + "grad_norm": 1.5237688876565934, + "learning_rate": 9.8054305047993e-06, + "loss": 0.0428, + "step": 409 + }, + { + "epoch": 0.11690903906472769, + "grad_norm": 2.5601247962488047, + "learning_rate": 9.804152562711804e-06, + "loss": 0.0312, + "step": 410 + }, + { + "epoch": 0.11719418306244654, + "grad_norm": 1.6898452734577185, + "learning_rate": 9.802870521385295e-06, + "loss": 0.0165, + "step": 411 + }, + { + "epoch": 0.11747932706016538, + "grad_norm": 3.202435062260781, + "learning_rate": 9.801584381913702e-06, + "loss": 0.0365, + "step": 412 + }, + { + "epoch": 0.11776447105788423, + "grad_norm": 2.4208311464375267, + "learning_rate": 9.800294145394449e-06, + "loss": 0.0456, + "step": 413 + }, + { + "epoch": 0.11804961505560307, + "grad_norm": 1.8464478511128473, + "learning_rate": 9.798999812928454e-06, + "loss": 0.0275, + "step": 414 + }, + { + "epoch": 0.11833475905332193, + "grad_norm": 1.7466714628297064, + "learning_rate": 9.797701385620135e-06, + "loss": 0.0664, + "step": 415 + }, + { + "epoch": 0.11861990305104078, + "grad_norm": 1.374398177525167, + "learning_rate": 9.796398864577398e-06, + "loss": 0.0249, + "step": 416 + }, + { + "epoch": 0.11890504704875962, + "grad_norm": 0.682379469399641, + "learning_rate": 9.795092250911646e-06, + "loss": 0.0324, + "step": 417 + }, + { + "epoch": 0.11919019104647847, + "grad_norm": 1.345099432009825, + "learning_rate": 9.793781545737775e-06, + "loss": 0.0304, + "step": 418 + }, + { + "epoch": 0.11947533504419731, + "grad_norm": 2.459208618588478, + "learning_rate": 9.79246675017417e-06, + "loss": 0.038, + "step": 419 + }, + { + "epoch": 0.11976047904191617, + "grad_norm": 1.1582497240978238, + "learning_rate": 9.791147865342703e-06, + "loss": 0.0239, + "step": 420 + }, + { + "epoch": 0.12004562303963502, + "grad_norm": 0.6873012288522815, + "learning_rate": 9.789824892368742e-06, + "loss": 0.0219, + "step": 421 + }, + { + "epoch": 0.12033076703735386, + "grad_norm": 1.4503792824156856, + "learning_rate": 9.78849783238114e-06, + "loss": 0.039, + "step": 422 + }, + { + "epoch": 0.12061591103507271, + "grad_norm": 1.4695551960581217, + "learning_rate": 9.787166686512237e-06, + "loss": 0.0225, + "step": 423 + }, + { + "epoch": 0.12090105503279155, + "grad_norm": 1.0755209767496783, + "learning_rate": 9.785831455897859e-06, + "loss": 0.0187, + "step": 424 + }, + { + "epoch": 0.12118619903051041, + "grad_norm": 2.8435317285492223, + "learning_rate": 9.784492141677318e-06, + "loss": 0.0377, + "step": 425 + }, + { + "epoch": 0.12147134302822926, + "grad_norm": 0.6194176889222343, + "learning_rate": 9.783148744993413e-06, + "loss": 0.014, + "step": 426 + }, + { + "epoch": 0.1217564870259481, + "grad_norm": 1.197556570522883, + "learning_rate": 9.781801266992421e-06, + "loss": 0.0144, + "step": 427 + }, + { + "epoch": 0.12204163102366695, + "grad_norm": 1.509780270356931, + "learning_rate": 9.780449708824107e-06, + "loss": 0.0149, + "step": 428 + }, + { + "epoch": 0.1223267750213858, + "grad_norm": 1.0425109169214848, + "learning_rate": 9.779094071641712e-06, + "loss": 0.0287, + "step": 429 + }, + { + "epoch": 0.12261191901910465, + "grad_norm": 2.9496669420120383, + "learning_rate": 9.777734356601964e-06, + "loss": 0.0152, + "step": 430 + }, + { + "epoch": 0.1228970630168235, + "grad_norm": 1.4441615240576289, + "learning_rate": 9.776370564865066e-06, + "loss": 0.0265, + "step": 431 + }, + { + "epoch": 0.12318220701454234, + "grad_norm": 2.8617609288687773, + "learning_rate": 9.775002697594696e-06, + "loss": 0.0259, + "step": 432 + }, + { + "epoch": 0.1234673510122612, + "grad_norm": 2.019269634607327, + "learning_rate": 9.773630755958021e-06, + "loss": 0.0419, + "step": 433 + }, + { + "epoch": 0.12375249500998003, + "grad_norm": 3.762297631517556, + "learning_rate": 9.772254741125672e-06, + "loss": 0.0328, + "step": 434 + }, + { + "epoch": 0.12403763900769889, + "grad_norm": 1.9417548664384192, + "learning_rate": 9.770874654271768e-06, + "loss": 0.0572, + "step": 435 + }, + { + "epoch": 0.12432278300541774, + "grad_norm": 1.7851777881726447, + "learning_rate": 9.769490496573886e-06, + "loss": 0.049, + "step": 436 + }, + { + "epoch": 0.12460792700313658, + "grad_norm": 2.477982800391376, + "learning_rate": 9.768102269213093e-06, + "loss": 0.0705, + "step": 437 + }, + { + "epoch": 0.12489307100085544, + "grad_norm": 0.3032358120092834, + "learning_rate": 9.76670997337392e-06, + "loss": 0.0047, + "step": 438 + }, + { + "epoch": 0.1251782149985743, + "grad_norm": 1.483642571394467, + "learning_rate": 9.765313610244372e-06, + "loss": 0.0151, + "step": 439 + }, + { + "epoch": 0.12546335899629313, + "grad_norm": 1.814367508425431, + "learning_rate": 9.763913181015923e-06, + "loss": 0.0459, + "step": 440 + }, + { + "epoch": 0.12574850299401197, + "grad_norm": 0.7952597090870602, + "learning_rate": 9.762508686883515e-06, + "loss": 0.0201, + "step": 441 + }, + { + "epoch": 0.12603364699173084, + "grad_norm": 3.185384390855697, + "learning_rate": 9.761100129045565e-06, + "loss": 0.0432, + "step": 442 + }, + { + "epoch": 0.12631879098944968, + "grad_norm": 1.0661696789911552, + "learning_rate": 9.759687508703948e-06, + "loss": 0.0139, + "step": 443 + }, + { + "epoch": 0.12660393498716851, + "grad_norm": 3.7725165197415964, + "learning_rate": 9.758270827064016e-06, + "loss": 0.0483, + "step": 444 + }, + { + "epoch": 0.12688907898488735, + "grad_norm": 2.746968288043148, + "learning_rate": 9.756850085334576e-06, + "loss": 0.0574, + "step": 445 + }, + { + "epoch": 0.12717422298260622, + "grad_norm": 0.926770487815625, + "learning_rate": 9.755425284727908e-06, + "loss": 0.018, + "step": 446 + }, + { + "epoch": 0.12745936698032506, + "grad_norm": 0.9082707028939784, + "learning_rate": 9.753996426459748e-06, + "loss": 0.0263, + "step": 447 + }, + { + "epoch": 0.1277445109780439, + "grad_norm": 1.164510189232619, + "learning_rate": 9.752563511749301e-06, + "loss": 0.0337, + "step": 448 + }, + { + "epoch": 0.12802965497576277, + "grad_norm": 2.3418912893557704, + "learning_rate": 9.75112654181923e-06, + "loss": 0.0416, + "step": 449 + }, + { + "epoch": 0.1283147989734816, + "grad_norm": 4.056148524420801, + "learning_rate": 9.749685517895654e-06, + "loss": 0.0539, + "step": 450 + }, + { + "epoch": 0.12859994297120045, + "grad_norm": 1.3521227970804115, + "learning_rate": 9.748240441208158e-06, + "loss": 0.0395, + "step": 451 + }, + { + "epoch": 0.12888508696891932, + "grad_norm": 1.9419704386778984, + "learning_rate": 9.746791312989785e-06, + "loss": 0.0393, + "step": 452 + }, + { + "epoch": 0.12917023096663816, + "grad_norm": 1.0595383076344087, + "learning_rate": 9.745338134477031e-06, + "loss": 0.0214, + "step": 453 + }, + { + "epoch": 0.129455374964357, + "grad_norm": 1.4679175053059728, + "learning_rate": 9.743880906909849e-06, + "loss": 0.0525, + "step": 454 + }, + { + "epoch": 0.12974051896207583, + "grad_norm": 1.4310123455755626, + "learning_rate": 9.742419631531647e-06, + "loss": 0.0158, + "step": 455 + }, + { + "epoch": 0.1300256629597947, + "grad_norm": 0.6588067935353689, + "learning_rate": 9.740954309589288e-06, + "loss": 0.0277, + "step": 456 + }, + { + "epoch": 0.13031080695751354, + "grad_norm": 0.9397591945163838, + "learning_rate": 9.739484942333087e-06, + "loss": 0.0312, + "step": 457 + }, + { + "epoch": 0.13059595095523238, + "grad_norm": 1.1729515696135353, + "learning_rate": 9.738011531016809e-06, + "loss": 0.0369, + "step": 458 + }, + { + "epoch": 0.13088109495295125, + "grad_norm": 2.9106198831897734, + "learning_rate": 9.736534076897676e-06, + "loss": 0.0698, + "step": 459 + }, + { + "epoch": 0.1311662389506701, + "grad_norm": 2.418685278178657, + "learning_rate": 9.735052581236353e-06, + "loss": 0.0529, + "step": 460 + }, + { + "epoch": 0.13145138294838893, + "grad_norm": 1.1991557936515589, + "learning_rate": 9.733567045296955e-06, + "loss": 0.019, + "step": 461 + }, + { + "epoch": 0.1317365269461078, + "grad_norm": 2.168170383166244, + "learning_rate": 9.732077470347043e-06, + "loss": 0.0336, + "step": 462 + }, + { + "epoch": 0.13202167094382664, + "grad_norm": 0.8382241005884289, + "learning_rate": 9.730583857657632e-06, + "loss": 0.0241, + "step": 463 + }, + { + "epoch": 0.13230681494154548, + "grad_norm": 1.1602419211992863, + "learning_rate": 9.729086208503174e-06, + "loss": 0.0285, + "step": 464 + }, + { + "epoch": 0.13259195893926431, + "grad_norm": 2.3952533816912234, + "learning_rate": 9.727584524161568e-06, + "loss": 0.062, + "step": 465 + }, + { + "epoch": 0.13287710293698318, + "grad_norm": 3.1634854012983693, + "learning_rate": 9.726078805914156e-06, + "loss": 0.0585, + "step": 466 + }, + { + "epoch": 0.13316224693470202, + "grad_norm": 2.081821935988785, + "learning_rate": 9.724569055045722e-06, + "loss": 0.0497, + "step": 467 + }, + { + "epoch": 0.13344739093242086, + "grad_norm": 1.7133129071243365, + "learning_rate": 9.723055272844492e-06, + "loss": 0.0231, + "step": 468 + }, + { + "epoch": 0.13373253493013973, + "grad_norm": 1.3379025505204591, + "learning_rate": 9.72153746060213e-06, + "loss": 0.0296, + "step": 469 + }, + { + "epoch": 0.13401767892785857, + "grad_norm": 2.4905984184567176, + "learning_rate": 9.720015619613738e-06, + "loss": 0.0341, + "step": 470 + }, + { + "epoch": 0.1343028229255774, + "grad_norm": 1.7732686088505376, + "learning_rate": 9.718489751177863e-06, + "loss": 0.046, + "step": 471 + }, + { + "epoch": 0.13458796692329628, + "grad_norm": 1.4099272711066784, + "learning_rate": 9.716959856596476e-06, + "loss": 0.0517, + "step": 472 + }, + { + "epoch": 0.13487311092101512, + "grad_norm": 0.5759751962800088, + "learning_rate": 9.715425937174992e-06, + "loss": 0.0205, + "step": 473 + }, + { + "epoch": 0.13515825491873396, + "grad_norm": 1.8213929340301374, + "learning_rate": 9.71388799422226e-06, + "loss": 0.0424, + "step": 474 + }, + { + "epoch": 0.1354433989164528, + "grad_norm": 1.1189923917553266, + "learning_rate": 9.712346029050561e-06, + "loss": 0.0261, + "step": 475 + }, + { + "epoch": 0.13572854291417166, + "grad_norm": 0.7740540942029566, + "learning_rate": 9.710800042975604e-06, + "loss": 0.0229, + "step": 476 + }, + { + "epoch": 0.1360136869118905, + "grad_norm": 0.7017604368884549, + "learning_rate": 9.709250037316535e-06, + "loss": 0.0189, + "step": 477 + }, + { + "epoch": 0.13629883090960934, + "grad_norm": 0.8260830471935292, + "learning_rate": 9.707696013395929e-06, + "loss": 0.0222, + "step": 478 + }, + { + "epoch": 0.1365839749073282, + "grad_norm": 1.2766002745562819, + "learning_rate": 9.706137972539784e-06, + "loss": 0.0264, + "step": 479 + }, + { + "epoch": 0.13686911890504705, + "grad_norm": 1.316995242325395, + "learning_rate": 9.70457591607753e-06, + "loss": 0.029, + "step": 480 + }, + { + "epoch": 0.1371542629027659, + "grad_norm": 1.3031733188507375, + "learning_rate": 9.703009845342027e-06, + "loss": 0.0306, + "step": 481 + }, + { + "epoch": 0.13743940690048476, + "grad_norm": 2.0441243406454515, + "learning_rate": 9.701439761669551e-06, + "loss": 0.0384, + "step": 482 + }, + { + "epoch": 0.1377245508982036, + "grad_norm": 4.4329827478789765, + "learning_rate": 9.69986566639981e-06, + "loss": 0.0669, + "step": 483 + }, + { + "epoch": 0.13800969489592244, + "grad_norm": 0.550863641831382, + "learning_rate": 9.698287560875932e-06, + "loss": 0.011, + "step": 484 + }, + { + "epoch": 0.13829483889364128, + "grad_norm": 1.367528196902721, + "learning_rate": 9.696705446444465e-06, + "loss": 0.0475, + "step": 485 + }, + { + "epoch": 0.13857998289136014, + "grad_norm": 2.9049463748248625, + "learning_rate": 9.695119324455383e-06, + "loss": 0.048, + "step": 486 + }, + { + "epoch": 0.13886512688907898, + "grad_norm": 2.152275954468813, + "learning_rate": 9.693529196262073e-06, + "loss": 0.0426, + "step": 487 + }, + { + "epoch": 0.13915027088679782, + "grad_norm": 1.4509315149948045, + "learning_rate": 9.691935063221347e-06, + "loss": 0.0371, + "step": 488 + }, + { + "epoch": 0.1394354148845167, + "grad_norm": 0.924165717850074, + "learning_rate": 9.690336926693427e-06, + "loss": 0.0165, + "step": 489 + }, + { + "epoch": 0.13972055888223553, + "grad_norm": 1.5645440774065995, + "learning_rate": 9.688734788041958e-06, + "loss": 0.0222, + "step": 490 + }, + { + "epoch": 0.14000570287995437, + "grad_norm": 1.8857089742319397, + "learning_rate": 9.687128648633995e-06, + "loss": 0.0338, + "step": 491 + }, + { + "epoch": 0.14029084687767324, + "grad_norm": 1.2496068613960891, + "learning_rate": 9.685518509840008e-06, + "loss": 0.0393, + "step": 492 + }, + { + "epoch": 0.14057599087539208, + "grad_norm": 2.2609376738497797, + "learning_rate": 9.683904373033884e-06, + "loss": 0.0424, + "step": 493 + }, + { + "epoch": 0.14086113487311092, + "grad_norm": 0.6716431956995677, + "learning_rate": 9.682286239592912e-06, + "loss": 0.0189, + "step": 494 + }, + { + "epoch": 0.14114627887082976, + "grad_norm": 1.8445797069209673, + "learning_rate": 9.6806641108978e-06, + "loss": 0.0467, + "step": 495 + }, + { + "epoch": 0.14143142286854862, + "grad_norm": 1.039428374248318, + "learning_rate": 9.67903798833266e-06, + "loss": 0.0296, + "step": 496 + }, + { + "epoch": 0.14171656686626746, + "grad_norm": 2.241000783231958, + "learning_rate": 9.677407873285016e-06, + "loss": 0.021, + "step": 497 + }, + { + "epoch": 0.1420017108639863, + "grad_norm": 2.1580510158063904, + "learning_rate": 9.675773767145795e-06, + "loss": 0.0404, + "step": 498 + }, + { + "epoch": 0.14228685486170517, + "grad_norm": 2.1848379943693854, + "learning_rate": 9.674135671309329e-06, + "loss": 0.0392, + "step": 499 + }, + { + "epoch": 0.142571998859424, + "grad_norm": 1.2914001435408309, + "learning_rate": 9.672493587173356e-06, + "loss": 0.0193, + "step": 500 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 1.6915143202313678, + "learning_rate": 9.670847516139019e-06, + "loss": 0.0287, + "step": 501 + }, + { + "epoch": 0.14314228685486172, + "grad_norm": 0.6349880464190206, + "learning_rate": 9.66919745961086e-06, + "loss": 0.011, + "step": 502 + }, + { + "epoch": 0.14342743085258056, + "grad_norm": 1.3888680526489037, + "learning_rate": 9.667543418996824e-06, + "loss": 0.0206, + "step": 503 + }, + { + "epoch": 0.1437125748502994, + "grad_norm": 1.9166676274285204, + "learning_rate": 9.665885395708252e-06, + "loss": 0.0321, + "step": 504 + }, + { + "epoch": 0.14399771884801824, + "grad_norm": 0.6986695674839958, + "learning_rate": 9.664223391159885e-06, + "loss": 0.0222, + "step": 505 + }, + { + "epoch": 0.1442828628457371, + "grad_norm": 0.9132750549727724, + "learning_rate": 9.662557406769865e-06, + "loss": 0.026, + "step": 506 + }, + { + "epoch": 0.14456800684345594, + "grad_norm": 1.5540547156511098, + "learning_rate": 9.660887443959726e-06, + "loss": 0.039, + "step": 507 + }, + { + "epoch": 0.14485315084117478, + "grad_norm": 0.9598545046429632, + "learning_rate": 9.659213504154393e-06, + "loss": 0.0211, + "step": 508 + }, + { + "epoch": 0.14513829483889365, + "grad_norm": 2.2737404573757862, + "learning_rate": 9.65753558878219e-06, + "loss": 0.0239, + "step": 509 + }, + { + "epoch": 0.1454234388366125, + "grad_norm": 1.5062825135884002, + "learning_rate": 9.655853699274834e-06, + "loss": 0.0345, + "step": 510 + }, + { + "epoch": 0.14570858283433133, + "grad_norm": 1.6018765019531456, + "learning_rate": 9.65416783706743e-06, + "loss": 0.0209, + "step": 511 + }, + { + "epoch": 0.1459937268320502, + "grad_norm": 1.2442982344382103, + "learning_rate": 9.652478003598471e-06, + "loss": 0.0285, + "step": 512 + }, + { + "epoch": 0.14627887082976904, + "grad_norm": 1.9216622846090483, + "learning_rate": 9.650784200309847e-06, + "loss": 0.0339, + "step": 513 + }, + { + "epoch": 0.14656401482748788, + "grad_norm": 2.3825991592561717, + "learning_rate": 9.64908642864682e-06, + "loss": 0.044, + "step": 514 + }, + { + "epoch": 0.14684915882520672, + "grad_norm": 2.5818323385820916, + "learning_rate": 9.647384690058058e-06, + "loss": 0.0353, + "step": 515 + }, + { + "epoch": 0.14713430282292558, + "grad_norm": 1.5120506419436006, + "learning_rate": 9.645678985995597e-06, + "loss": 0.0184, + "step": 516 + }, + { + "epoch": 0.14741944682064442, + "grad_norm": 3.092529038403767, + "learning_rate": 9.643969317914865e-06, + "loss": 0.0459, + "step": 517 + }, + { + "epoch": 0.14770459081836326, + "grad_norm": 1.6305582095574176, + "learning_rate": 9.642255687274669e-06, + "loss": 0.0745, + "step": 518 + }, + { + "epoch": 0.14798973481608213, + "grad_norm": 1.9745427124068613, + "learning_rate": 9.6405380955372e-06, + "loss": 0.0431, + "step": 519 + }, + { + "epoch": 0.14827487881380097, + "grad_norm": 1.1060350887232473, + "learning_rate": 9.638816544168027e-06, + "loss": 0.0401, + "step": 520 + }, + { + "epoch": 0.1485600228115198, + "grad_norm": 1.48122125532716, + "learning_rate": 9.637091034636097e-06, + "loss": 0.0523, + "step": 521 + }, + { + "epoch": 0.14884516680923868, + "grad_norm": 1.7854176889849915, + "learning_rate": 9.635361568413739e-06, + "loss": 0.0392, + "step": 522 + }, + { + "epoch": 0.14913031080695752, + "grad_norm": 1.5771118563507494, + "learning_rate": 9.633628146976649e-06, + "loss": 0.0291, + "step": 523 + }, + { + "epoch": 0.14941545480467636, + "grad_norm": 1.170813729239735, + "learning_rate": 9.631890771803909e-06, + "loss": 0.0324, + "step": 524 + }, + { + "epoch": 0.1497005988023952, + "grad_norm": 1.1607083768274689, + "learning_rate": 9.630149444377964e-06, + "loss": 0.027, + "step": 525 + }, + { + "epoch": 0.14998574280011406, + "grad_norm": 2.6420375275270946, + "learning_rate": 9.628404166184639e-06, + "loss": 0.0392, + "step": 526 + }, + { + "epoch": 0.1502708867978329, + "grad_norm": 1.5977547719023326, + "learning_rate": 9.626654938713128e-06, + "loss": 0.039, + "step": 527 + }, + { + "epoch": 0.15055603079555174, + "grad_norm": 2.4449410251819055, + "learning_rate": 9.624901763455994e-06, + "loss": 0.1085, + "step": 528 + }, + { + "epoch": 0.1508411747932706, + "grad_norm": 1.66447527220779, + "learning_rate": 9.623144641909167e-06, + "loss": 0.0453, + "step": 529 + }, + { + "epoch": 0.15112631879098945, + "grad_norm": 3.13057218774973, + "learning_rate": 9.621383575571948e-06, + "loss": 0.057, + "step": 530 + }, + { + "epoch": 0.1514114627887083, + "grad_norm": 1.2160935811246358, + "learning_rate": 9.619618565947e-06, + "loss": 0.0424, + "step": 531 + }, + { + "epoch": 0.15169660678642716, + "grad_norm": 1.4641008759404945, + "learning_rate": 9.617849614540356e-06, + "loss": 0.048, + "step": 532 + }, + { + "epoch": 0.151981750784146, + "grad_norm": 1.4923618640634606, + "learning_rate": 9.616076722861406e-06, + "loss": 0.035, + "step": 533 + }, + { + "epoch": 0.15226689478186484, + "grad_norm": 0.7354465216068986, + "learning_rate": 9.614299892422905e-06, + "loss": 0.0328, + "step": 534 + }, + { + "epoch": 0.15255203877958368, + "grad_norm": 1.3156709351195746, + "learning_rate": 9.61251912474097e-06, + "loss": 0.0295, + "step": 535 + }, + { + "epoch": 0.15283718277730254, + "grad_norm": 2.465389798366464, + "learning_rate": 9.610734421335078e-06, + "loss": 0.0355, + "step": 536 + }, + { + "epoch": 0.15312232677502138, + "grad_norm": 2.107752927710834, + "learning_rate": 9.608945783728061e-06, + "loss": 0.0304, + "step": 537 + }, + { + "epoch": 0.15340747077274022, + "grad_norm": 0.8382376238641022, + "learning_rate": 9.60715321344611e-06, + "loss": 0.0292, + "step": 538 + }, + { + "epoch": 0.1536926147704591, + "grad_norm": 1.3467398331487281, + "learning_rate": 9.605356712018773e-06, + "loss": 0.0385, + "step": 539 + }, + { + "epoch": 0.15397775876817793, + "grad_norm": 2.0809206260900552, + "learning_rate": 9.603556280978947e-06, + "loss": 0.0572, + "step": 540 + }, + { + "epoch": 0.15426290276589677, + "grad_norm": 1.2901980818849303, + "learning_rate": 9.60175192186289e-06, + "loss": 0.0193, + "step": 541 + }, + { + "epoch": 0.15454804676361564, + "grad_norm": 0.736806847215193, + "learning_rate": 9.599943636210204e-06, + "loss": 0.0284, + "step": 542 + }, + { + "epoch": 0.15483319076133448, + "grad_norm": 1.5679254655317187, + "learning_rate": 9.598131425563847e-06, + "loss": 0.0317, + "step": 543 + }, + { + "epoch": 0.15511833475905332, + "grad_norm": 0.7132612878107385, + "learning_rate": 9.596315291470122e-06, + "loss": 0.0114, + "step": 544 + }, + { + "epoch": 0.15540347875677216, + "grad_norm": 2.593670926612122, + "learning_rate": 9.594495235478685e-06, + "loss": 0.0463, + "step": 545 + }, + { + "epoch": 0.15568862275449102, + "grad_norm": 1.1955510290910427, + "learning_rate": 9.59267125914253e-06, + "loss": 0.0254, + "step": 546 + }, + { + "epoch": 0.15597376675220986, + "grad_norm": 1.045769334119973, + "learning_rate": 9.590843364018005e-06, + "loss": 0.029, + "step": 547 + }, + { + "epoch": 0.1562589107499287, + "grad_norm": 1.6801520699380361, + "learning_rate": 9.589011551664797e-06, + "loss": 0.0572, + "step": 548 + }, + { + "epoch": 0.15654405474764757, + "grad_norm": 1.5891642039931364, + "learning_rate": 9.587175823645936e-06, + "loss": 0.0296, + "step": 549 + }, + { + "epoch": 0.1568291987453664, + "grad_norm": 1.4409334872176962, + "learning_rate": 9.585336181527795e-06, + "loss": 0.0264, + "step": 550 + }, + { + "epoch": 0.15711434274308525, + "grad_norm": 2.2586423562130116, + "learning_rate": 9.583492626880082e-06, + "loss": 0.058, + "step": 551 + }, + { + "epoch": 0.15739948674080412, + "grad_norm": 1.661999933017668, + "learning_rate": 9.581645161275852e-06, + "loss": 0.0551, + "step": 552 + }, + { + "epoch": 0.15768463073852296, + "grad_norm": 1.7655597655232174, + "learning_rate": 9.579793786291486e-06, + "loss": 0.0278, + "step": 553 + }, + { + "epoch": 0.1579697747362418, + "grad_norm": 3.589911729513461, + "learning_rate": 9.577938503506712e-06, + "loss": 0.0517, + "step": 554 + }, + { + "epoch": 0.15825491873396064, + "grad_norm": 3.001411920745496, + "learning_rate": 9.576079314504584e-06, + "loss": 0.0449, + "step": 555 + }, + { + "epoch": 0.1585400627316795, + "grad_norm": 1.604218093067663, + "learning_rate": 9.574216220871492e-06, + "loss": 0.0246, + "step": 556 + }, + { + "epoch": 0.15882520672939834, + "grad_norm": 0.935156656620672, + "learning_rate": 9.57234922419716e-06, + "loss": 0.0161, + "step": 557 + }, + { + "epoch": 0.15911035072711718, + "grad_norm": 1.1513995231538736, + "learning_rate": 9.570478326074638e-06, + "loss": 0.0283, + "step": 558 + }, + { + "epoch": 0.15939549472483605, + "grad_norm": 1.4012387052969366, + "learning_rate": 9.568603528100306e-06, + "loss": 0.0325, + "step": 559 + }, + { + "epoch": 0.1596806387225549, + "grad_norm": 1.5162153854252265, + "learning_rate": 9.566724831873876e-06, + "loss": 0.0225, + "step": 560 + }, + { + "epoch": 0.15996578272027373, + "grad_norm": 0.846516598739829, + "learning_rate": 9.564842238998381e-06, + "loss": 0.0099, + "step": 561 + }, + { + "epoch": 0.1602509267179926, + "grad_norm": 2.647135598055157, + "learning_rate": 9.562955751080183e-06, + "loss": 0.0532, + "step": 562 + }, + { + "epoch": 0.16053607071571144, + "grad_norm": 1.073278252413763, + "learning_rate": 9.561065369728963e-06, + "loss": 0.0355, + "step": 563 + }, + { + "epoch": 0.16082121471343028, + "grad_norm": 1.170548007164538, + "learning_rate": 9.559171096557728e-06, + "loss": 0.029, + "step": 564 + }, + { + "epoch": 0.16110635871114912, + "grad_norm": 1.9939002606252199, + "learning_rate": 9.557272933182804e-06, + "loss": 0.036, + "step": 565 + }, + { + "epoch": 0.16139150270886798, + "grad_norm": 2.1480044032388466, + "learning_rate": 9.555370881223837e-06, + "loss": 0.0436, + "step": 566 + }, + { + "epoch": 0.16167664670658682, + "grad_norm": 1.325637196128801, + "learning_rate": 9.55346494230379e-06, + "loss": 0.0157, + "step": 567 + }, + { + "epoch": 0.16196179070430566, + "grad_norm": 0.7564851499202597, + "learning_rate": 9.551555118048943e-06, + "loss": 0.0249, + "step": 568 + }, + { + "epoch": 0.16224693470202453, + "grad_norm": 0.9722401904633631, + "learning_rate": 9.549641410088895e-06, + "loss": 0.0231, + "step": 569 + }, + { + "epoch": 0.16253207869974337, + "grad_norm": 0.8478045120928479, + "learning_rate": 9.547723820056552e-06, + "loss": 0.0105, + "step": 570 + }, + { + "epoch": 0.1628172226974622, + "grad_norm": 1.351787113285877, + "learning_rate": 9.545802349588136e-06, + "loss": 0.0311, + "step": 571 + }, + { + "epoch": 0.16310236669518108, + "grad_norm": 0.836648012433114, + "learning_rate": 9.543877000323181e-06, + "loss": 0.023, + "step": 572 + }, + { + "epoch": 0.16338751069289992, + "grad_norm": 0.7901380885568204, + "learning_rate": 9.54194777390453e-06, + "loss": 0.014, + "step": 573 + }, + { + "epoch": 0.16367265469061876, + "grad_norm": 1.2938393184716226, + "learning_rate": 9.540014671978335e-06, + "loss": 0.0632, + "step": 574 + }, + { + "epoch": 0.1639577986883376, + "grad_norm": 1.2836949271527522, + "learning_rate": 9.53807769619405e-06, + "loss": 0.0231, + "step": 575 + }, + { + "epoch": 0.16424294268605646, + "grad_norm": 2.27381818725564, + "learning_rate": 9.536136848204443e-06, + "loss": 0.0418, + "step": 576 + }, + { + "epoch": 0.1645280866837753, + "grad_norm": 2.4569515266903026, + "learning_rate": 9.534192129665578e-06, + "loss": 0.0141, + "step": 577 + }, + { + "epoch": 0.16481323068149414, + "grad_norm": 1.7019979551998075, + "learning_rate": 9.532243542236826e-06, + "loss": 0.0159, + "step": 578 + }, + { + "epoch": 0.165098374679213, + "grad_norm": 0.7491727489330064, + "learning_rate": 9.530291087580857e-06, + "loss": 0.011, + "step": 579 + }, + { + "epoch": 0.16538351867693185, + "grad_norm": 1.0697725490475511, + "learning_rate": 9.528334767363643e-06, + "loss": 0.0316, + "step": 580 + }, + { + "epoch": 0.1656686626746507, + "grad_norm": 1.514917067065579, + "learning_rate": 9.526374583254454e-06, + "loss": 0.0247, + "step": 581 + }, + { + "epoch": 0.16595380667236956, + "grad_norm": 1.6526372073335776, + "learning_rate": 9.524410536925854e-06, + "loss": 0.0242, + "step": 582 + }, + { + "epoch": 0.1662389506700884, + "grad_norm": 1.154206493649726, + "learning_rate": 9.522442630053708e-06, + "loss": 0.0151, + "step": 583 + }, + { + "epoch": 0.16652409466780724, + "grad_norm": 2.220783739275899, + "learning_rate": 9.520470864317169e-06, + "loss": 0.0405, + "step": 584 + }, + { + "epoch": 0.16680923866552608, + "grad_norm": 2.1478234313482005, + "learning_rate": 9.518495241398684e-06, + "loss": 0.0255, + "step": 585 + }, + { + "epoch": 0.16709438266324494, + "grad_norm": 1.888990019043023, + "learning_rate": 9.516515762983996e-06, + "loss": 0.023, + "step": 586 + }, + { + "epoch": 0.16737952666096378, + "grad_norm": 0.568362822177166, + "learning_rate": 9.514532430762133e-06, + "loss": 0.0234, + "step": 587 + }, + { + "epoch": 0.16766467065868262, + "grad_norm": 2.666053376987735, + "learning_rate": 9.512545246425416e-06, + "loss": 0.0334, + "step": 588 + }, + { + "epoch": 0.1679498146564015, + "grad_norm": 0.9889048397741379, + "learning_rate": 9.510554211669443e-06, + "loss": 0.0272, + "step": 589 + }, + { + "epoch": 0.16823495865412033, + "grad_norm": 2.663367144657973, + "learning_rate": 9.50855932819311e-06, + "loss": 0.0863, + "step": 590 + }, + { + "epoch": 0.16852010265183917, + "grad_norm": 3.0842378390540564, + "learning_rate": 9.506560597698588e-06, + "loss": 0.0544, + "step": 591 + }, + { + "epoch": 0.16880524664955804, + "grad_norm": 2.2352938418828945, + "learning_rate": 9.504558021891335e-06, + "loss": 0.0597, + "step": 592 + }, + { + "epoch": 0.16909039064727688, + "grad_norm": 0.9718298379039145, + "learning_rate": 9.502551602480087e-06, + "loss": 0.0243, + "step": 593 + }, + { + "epoch": 0.16937553464499572, + "grad_norm": 0.9679204345252653, + "learning_rate": 9.500541341176865e-06, + "loss": 0.0226, + "step": 594 + }, + { + "epoch": 0.16966067864271456, + "grad_norm": 1.9624333575904165, + "learning_rate": 9.498527239696962e-06, + "loss": 0.0303, + "step": 595 + }, + { + "epoch": 0.16994582264043342, + "grad_norm": 1.0983438075432492, + "learning_rate": 9.496509299758949e-06, + "loss": 0.0263, + "step": 596 + }, + { + "epoch": 0.17023096663815226, + "grad_norm": 1.485540944433959, + "learning_rate": 9.494487523084676e-06, + "loss": 0.018, + "step": 597 + }, + { + "epoch": 0.1705161106358711, + "grad_norm": 0.7546809410329396, + "learning_rate": 9.492461911399265e-06, + "loss": 0.0208, + "step": 598 + }, + { + "epoch": 0.17080125463358997, + "grad_norm": 1.2028050799273944, + "learning_rate": 9.490432466431107e-06, + "loss": 0.0126, + "step": 599 + }, + { + "epoch": 0.1710863986313088, + "grad_norm": 2.3029305869735377, + "learning_rate": 9.488399189911866e-06, + "loss": 0.0396, + "step": 600 + }, + { + "epoch": 0.17137154262902765, + "grad_norm": 1.7854430171080835, + "learning_rate": 9.486362083576479e-06, + "loss": 0.0228, + "step": 601 + }, + { + "epoch": 0.17165668662674652, + "grad_norm": 1.2139930054544301, + "learning_rate": 9.484321149163145e-06, + "loss": 0.053, + "step": 602 + }, + { + "epoch": 0.17194183062446536, + "grad_norm": 1.8652637627111008, + "learning_rate": 9.482276388413331e-06, + "loss": 0.0243, + "step": 603 + }, + { + "epoch": 0.1722269746221842, + "grad_norm": 1.733593841920375, + "learning_rate": 9.480227803071775e-06, + "loss": 0.0305, + "step": 604 + }, + { + "epoch": 0.17251211861990304, + "grad_norm": 2.218846595287222, + "learning_rate": 9.478175394886469e-06, + "loss": 0.0345, + "step": 605 + }, + { + "epoch": 0.1727972626176219, + "grad_norm": 2.4849800012227345, + "learning_rate": 9.47611916560867e-06, + "loss": 0.0359, + "step": 606 + }, + { + "epoch": 0.17308240661534074, + "grad_norm": 3.832248662370049, + "learning_rate": 9.474059116992901e-06, + "loss": 0.0489, + "step": 607 + }, + { + "epoch": 0.17336755061305958, + "grad_norm": 1.2556167721835865, + "learning_rate": 9.471995250796936e-06, + "loss": 0.0224, + "step": 608 + }, + { + "epoch": 0.17365269461077845, + "grad_norm": 1.3725885311717074, + "learning_rate": 9.469927568781814e-06, + "loss": 0.0246, + "step": 609 + }, + { + "epoch": 0.1739378386084973, + "grad_norm": 2.0268680892167312, + "learning_rate": 9.467856072711821e-06, + "loss": 0.0377, + "step": 610 + }, + { + "epoch": 0.17422298260621613, + "grad_norm": 2.13735531583806, + "learning_rate": 9.465780764354505e-06, + "loss": 0.032, + "step": 611 + }, + { + "epoch": 0.174508126603935, + "grad_norm": 1.0077175687597997, + "learning_rate": 9.463701645480665e-06, + "loss": 0.0125, + "step": 612 + }, + { + "epoch": 0.17479327060165384, + "grad_norm": 1.4433624480865979, + "learning_rate": 9.46161871786435e-06, + "loss": 0.0213, + "step": 613 + }, + { + "epoch": 0.17507841459937268, + "grad_norm": 1.7363894985553223, + "learning_rate": 9.459531983282858e-06, + "loss": 0.0312, + "step": 614 + }, + { + "epoch": 0.17536355859709152, + "grad_norm": 1.1480559254413287, + "learning_rate": 9.45744144351674e-06, + "loss": 0.0126, + "step": 615 + }, + { + "epoch": 0.17564870259481039, + "grad_norm": 1.0144730105958946, + "learning_rate": 9.455347100349785e-06, + "loss": 0.0465, + "step": 616 + }, + { + "epoch": 0.17593384659252922, + "grad_norm": 2.3315897375286956, + "learning_rate": 9.453248955569041e-06, + "loss": 0.0375, + "step": 617 + }, + { + "epoch": 0.17621899059024806, + "grad_norm": 1.517116537675244, + "learning_rate": 9.451147010964786e-06, + "loss": 0.0255, + "step": 618 + }, + { + "epoch": 0.17650413458796693, + "grad_norm": 1.3539214290926733, + "learning_rate": 9.449041268330549e-06, + "loss": 0.0175, + "step": 619 + }, + { + "epoch": 0.17678927858568577, + "grad_norm": 0.8250630045444727, + "learning_rate": 9.446931729463093e-06, + "loss": 0.0189, + "step": 620 + }, + { + "epoch": 0.1770744225834046, + "grad_norm": 1.6566428427949393, + "learning_rate": 9.44481839616243e-06, + "loss": 0.0263, + "step": 621 + }, + { + "epoch": 0.17735956658112348, + "grad_norm": 1.296549496531528, + "learning_rate": 9.442701270231799e-06, + "loss": 0.0357, + "step": 622 + }, + { + "epoch": 0.17764471057884232, + "grad_norm": 1.2623177453839831, + "learning_rate": 9.440580353477682e-06, + "loss": 0.0138, + "step": 623 + }, + { + "epoch": 0.17792985457656116, + "grad_norm": 0.9401424954471435, + "learning_rate": 9.438455647709794e-06, + "loss": 0.0344, + "step": 624 + }, + { + "epoch": 0.17821499857428, + "grad_norm": 1.2061707620169295, + "learning_rate": 9.436327154741082e-06, + "loss": 0.0204, + "step": 625 + }, + { + "epoch": 0.17850014257199887, + "grad_norm": 3.5812415668133295, + "learning_rate": 9.434194876387723e-06, + "loss": 0.0348, + "step": 626 + }, + { + "epoch": 0.1787852865697177, + "grad_norm": 2.179594593613911, + "learning_rate": 9.43205881446913e-06, + "loss": 0.0405, + "step": 627 + }, + { + "epoch": 0.17907043056743654, + "grad_norm": 3.133447888160429, + "learning_rate": 9.429918970807939e-06, + "loss": 0.0883, + "step": 628 + }, + { + "epoch": 0.1793555745651554, + "grad_norm": 1.1894173921814282, + "learning_rate": 9.427775347230013e-06, + "loss": 0.0459, + "step": 629 + }, + { + "epoch": 0.17964071856287425, + "grad_norm": 1.220795451723468, + "learning_rate": 9.425627945564442e-06, + "loss": 0.0276, + "step": 630 + }, + { + "epoch": 0.1799258625605931, + "grad_norm": 0.9556676101760306, + "learning_rate": 9.423476767643539e-06, + "loss": 0.0408, + "step": 631 + }, + { + "epoch": 0.18021100655831196, + "grad_norm": 1.3012772103524362, + "learning_rate": 9.42132181530284e-06, + "loss": 0.0147, + "step": 632 + }, + { + "epoch": 0.1804961505560308, + "grad_norm": 0.8734603524960712, + "learning_rate": 9.419163090381102e-06, + "loss": 0.0232, + "step": 633 + }, + { + "epoch": 0.18078129455374964, + "grad_norm": 1.2200670729025649, + "learning_rate": 9.4170005947203e-06, + "loss": 0.0329, + "step": 634 + }, + { + "epoch": 0.18106643855146848, + "grad_norm": 1.1685662362242049, + "learning_rate": 9.414834330165626e-06, + "loss": 0.0372, + "step": 635 + }, + { + "epoch": 0.18135158254918735, + "grad_norm": 0.6949628469219308, + "learning_rate": 9.412664298565486e-06, + "loss": 0.0246, + "step": 636 + }, + { + "epoch": 0.18163672654690619, + "grad_norm": 1.771280848314573, + "learning_rate": 9.410490501771507e-06, + "loss": 0.0306, + "step": 637 + }, + { + "epoch": 0.18192187054462503, + "grad_norm": 1.2498455618438535, + "learning_rate": 9.408312941638522e-06, + "loss": 0.0301, + "step": 638 + }, + { + "epoch": 0.1822070145423439, + "grad_norm": 2.278447179095516, + "learning_rate": 9.406131620024576e-06, + "loss": 0.0466, + "step": 639 + }, + { + "epoch": 0.18249215854006273, + "grad_norm": 1.0355835464758893, + "learning_rate": 9.403946538790931e-06, + "loss": 0.023, + "step": 640 + }, + { + "epoch": 0.18277730253778157, + "grad_norm": 0.683840225577905, + "learning_rate": 9.401757699802046e-06, + "loss": 0.0171, + "step": 641 + }, + { + "epoch": 0.18306244653550044, + "grad_norm": 2.090222100453515, + "learning_rate": 9.399565104925591e-06, + "loss": 0.0425, + "step": 642 + }, + { + "epoch": 0.18334759053321928, + "grad_norm": 2.036391317556621, + "learning_rate": 9.397368756032445e-06, + "loss": 0.0266, + "step": 643 + }, + { + "epoch": 0.18363273453093812, + "grad_norm": 1.738387885682237, + "learning_rate": 9.395168654996685e-06, + "loss": 0.0519, + "step": 644 + }, + { + "epoch": 0.18391787852865696, + "grad_norm": 2.117881545376208, + "learning_rate": 9.392964803695592e-06, + "loss": 0.032, + "step": 645 + }, + { + "epoch": 0.18420302252637583, + "grad_norm": 0.6059383955679531, + "learning_rate": 9.390757204009644e-06, + "loss": 0.0083, + "step": 646 + }, + { + "epoch": 0.18448816652409467, + "grad_norm": 0.5792275998008327, + "learning_rate": 9.38854585782252e-06, + "loss": 0.0127, + "step": 647 + }, + { + "epoch": 0.1847733105218135, + "grad_norm": 0.9221379221878636, + "learning_rate": 9.386330767021098e-06, + "loss": 0.0255, + "step": 648 + }, + { + "epoch": 0.18505845451953237, + "grad_norm": 0.6920928264035611, + "learning_rate": 9.384111933495442e-06, + "loss": 0.0084, + "step": 649 + }, + { + "epoch": 0.1853435985172512, + "grad_norm": 1.3255760857611412, + "learning_rate": 9.381889359138823e-06, + "loss": 0.0191, + "step": 650 + }, + { + "epoch": 0.18562874251497005, + "grad_norm": 1.056441268866933, + "learning_rate": 9.379663045847693e-06, + "loss": 0.0479, + "step": 651 + }, + { + "epoch": 0.18591388651268892, + "grad_norm": 2.0724011021981594, + "learning_rate": 9.377432995521701e-06, + "loss": 0.0248, + "step": 652 + }, + { + "epoch": 0.18619903051040776, + "grad_norm": 1.7914795536241521, + "learning_rate": 9.375199210063676e-06, + "loss": 0.0342, + "step": 653 + }, + { + "epoch": 0.1864841745081266, + "grad_norm": 1.540949023356334, + "learning_rate": 9.372961691379649e-06, + "loss": 0.0599, + "step": 654 + }, + { + "epoch": 0.18676931850584544, + "grad_norm": 0.6650657087107381, + "learning_rate": 9.37072044137882e-06, + "loss": 0.0148, + "step": 655 + }, + { + "epoch": 0.1870544625035643, + "grad_norm": 1.9248212315910453, + "learning_rate": 9.368475461973582e-06, + "loss": 0.0375, + "step": 656 + }, + { + "epoch": 0.18733960650128315, + "grad_norm": 2.808110344533141, + "learning_rate": 9.366226755079513e-06, + "loss": 0.0422, + "step": 657 + }, + { + "epoch": 0.18762475049900199, + "grad_norm": 2.1184357826543194, + "learning_rate": 9.36397432261536e-06, + "loss": 0.0282, + "step": 658 + }, + { + "epoch": 0.18790989449672085, + "grad_norm": 1.5031809588854805, + "learning_rate": 9.361718166503062e-06, + "loss": 0.0151, + "step": 659 + }, + { + "epoch": 0.1881950384944397, + "grad_norm": 0.9601384811433088, + "learning_rate": 9.359458288667725e-06, + "loss": 0.0414, + "step": 660 + }, + { + "epoch": 0.18848018249215853, + "grad_norm": 2.0522251099402053, + "learning_rate": 9.357194691037637e-06, + "loss": 0.0436, + "step": 661 + }, + { + "epoch": 0.1887653264898774, + "grad_norm": 0.8416753384643524, + "learning_rate": 9.354927375544256e-06, + "loss": 0.0124, + "step": 662 + }, + { + "epoch": 0.18905047048759624, + "grad_norm": 1.0668298335424686, + "learning_rate": 9.352656344122216e-06, + "loss": 0.0439, + "step": 663 + }, + { + "epoch": 0.18933561448531508, + "grad_norm": 1.34820663049341, + "learning_rate": 9.350381598709319e-06, + "loss": 0.0305, + "step": 664 + }, + { + "epoch": 0.18962075848303392, + "grad_norm": 1.5705169220785646, + "learning_rate": 9.348103141246538e-06, + "loss": 0.0191, + "step": 665 + }, + { + "epoch": 0.1899059024807528, + "grad_norm": 2.322186069862557, + "learning_rate": 9.345820973678011e-06, + "loss": 0.0284, + "step": 666 + }, + { + "epoch": 0.19019104647847163, + "grad_norm": 1.344929313625686, + "learning_rate": 9.343535097951044e-06, + "loss": 0.0343, + "step": 667 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 1.1700086372233123, + "learning_rate": 9.341245516016105e-06, + "loss": 0.018, + "step": 668 + }, + { + "epoch": 0.19076133447390933, + "grad_norm": 1.0706753371606232, + "learning_rate": 9.338952229826825e-06, + "loss": 0.0273, + "step": 669 + }, + { + "epoch": 0.19104647847162817, + "grad_norm": 1.6254449347628146, + "learning_rate": 9.336655241339999e-06, + "loss": 0.0367, + "step": 670 + }, + { + "epoch": 0.191331622469347, + "grad_norm": 67.18866812915265, + "learning_rate": 9.334354552515576e-06, + "loss": 0.0859, + "step": 671 + }, + { + "epoch": 0.19161676646706588, + "grad_norm": 191.39881973611224, + "learning_rate": 9.332050165316664e-06, + "loss": 0.5247, + "step": 672 + }, + { + "epoch": 0.19190191046478472, + "grad_norm": 33.27059153720094, + "learning_rate": 9.32974208170953e-06, + "loss": 0.0762, + "step": 673 + }, + { + "epoch": 0.19218705446250356, + "grad_norm": 2.1745024710329655, + "learning_rate": 9.327430303663589e-06, + "loss": 0.0331, + "step": 674 + }, + { + "epoch": 0.1924721984602224, + "grad_norm": 0.958496001953392, + "learning_rate": 9.325114833151414e-06, + "loss": 0.0174, + "step": 675 + }, + { + "epoch": 0.19275734245794127, + "grad_norm": 1.5630060617545107, + "learning_rate": 9.322795672148726e-06, + "loss": 0.0434, + "step": 676 + }, + { + "epoch": 0.1930424864556601, + "grad_norm": 4.90486633391519, + "learning_rate": 9.320472822634395e-06, + "loss": 0.0542, + "step": 677 + }, + { + "epoch": 0.19332763045337895, + "grad_norm": 0.5026753644154698, + "learning_rate": 9.31814628659044e-06, + "loss": 0.0108, + "step": 678 + }, + { + "epoch": 0.1936127744510978, + "grad_norm": 2.318017474873092, + "learning_rate": 9.315816066002024e-06, + "loss": 0.0408, + "step": 679 + }, + { + "epoch": 0.19389791844881665, + "grad_norm": 2.903171610551692, + "learning_rate": 9.313482162857452e-06, + "loss": 0.0557, + "step": 680 + }, + { + "epoch": 0.1941830624465355, + "grad_norm": 1.4218762758102965, + "learning_rate": 9.311144579148175e-06, + "loss": 0.0159, + "step": 681 + }, + { + "epoch": 0.19446820644425436, + "grad_norm": 2.009139671030263, + "learning_rate": 9.308803316868783e-06, + "loss": 0.0458, + "step": 682 + }, + { + "epoch": 0.1947533504419732, + "grad_norm": 1.6113043050375866, + "learning_rate": 9.306458378017004e-06, + "loss": 0.0678, + "step": 683 + }, + { + "epoch": 0.19503849443969204, + "grad_norm": 3.4623487395539505, + "learning_rate": 9.304109764593705e-06, + "loss": 0.0398, + "step": 684 + }, + { + "epoch": 0.19532363843741088, + "grad_norm": 2.3381560335801646, + "learning_rate": 9.301757478602886e-06, + "loss": 0.0445, + "step": 685 + }, + { + "epoch": 0.19560878243512975, + "grad_norm": 64.94755869755691, + "learning_rate": 9.299401522051685e-06, + "loss": 1.2332, + "step": 686 + }, + { + "epoch": 0.1958939264328486, + "grad_norm": 18.353718727926776, + "learning_rate": 9.297041896950365e-06, + "loss": 0.1454, + "step": 687 + }, + { + "epoch": 0.19617907043056743, + "grad_norm": 0.6617552332955904, + "learning_rate": 9.294678605312323e-06, + "loss": 0.0201, + "step": 688 + }, + { + "epoch": 0.1964642144282863, + "grad_norm": 1.1973919773785329, + "learning_rate": 9.292311649154088e-06, + "loss": 0.0133, + "step": 689 + }, + { + "epoch": 0.19674935842600513, + "grad_norm": 2.181331281045211, + "learning_rate": 9.289941030495313e-06, + "loss": 0.0279, + "step": 690 + }, + { + "epoch": 0.19703450242372397, + "grad_norm": 11.102336408880397, + "learning_rate": 9.287566751358773e-06, + "loss": 0.0318, + "step": 691 + }, + { + "epoch": 0.19731964642144284, + "grad_norm": 0.9201931408854377, + "learning_rate": 9.285188813770368e-06, + "loss": 0.025, + "step": 692 + }, + { + "epoch": 0.19760479041916168, + "grad_norm": 1.508525890818712, + "learning_rate": 9.282807219759123e-06, + "loss": 0.0414, + "step": 693 + }, + { + "epoch": 0.19788993441688052, + "grad_norm": 2.464653813173033, + "learning_rate": 9.280421971357181e-06, + "loss": 0.0389, + "step": 694 + }, + { + "epoch": 0.19817507841459936, + "grad_norm": 0.8743425835485196, + "learning_rate": 9.2780330705998e-06, + "loss": 0.0165, + "step": 695 + }, + { + "epoch": 0.19846022241231823, + "grad_norm": 1.8205084856824216, + "learning_rate": 9.27564051952536e-06, + "loss": 0.0286, + "step": 696 + }, + { + "epoch": 0.19874536641003707, + "grad_norm": 1.3224132996211422, + "learning_rate": 9.273244320175352e-06, + "loss": 0.0333, + "step": 697 + }, + { + "epoch": 0.1990305104077559, + "grad_norm": 1.0636884242784166, + "learning_rate": 9.270844474594381e-06, + "loss": 0.0165, + "step": 698 + }, + { + "epoch": 0.19931565440547477, + "grad_norm": 1.0517832694626335, + "learning_rate": 9.268440984830163e-06, + "loss": 0.0186, + "step": 699 + }, + { + "epoch": 0.1996007984031936, + "grad_norm": 1.0627875282801584, + "learning_rate": 9.266033852933525e-06, + "loss": 0.0299, + "step": 700 + }, + { + "epoch": 0.19988594240091245, + "grad_norm": 0.7272225161607353, + "learning_rate": 9.263623080958398e-06, + "loss": 0.0062, + "step": 701 + }, + { + "epoch": 0.20017108639863132, + "grad_norm": 1.847720921414192, + "learning_rate": 9.26120867096182e-06, + "loss": 0.0506, + "step": 702 + }, + { + "epoch": 0.20045623039635016, + "grad_norm": 1.6823362190282132, + "learning_rate": 9.258790625003939e-06, + "loss": 0.0653, + "step": 703 + }, + { + "epoch": 0.200741374394069, + "grad_norm": 1.4787235791879039, + "learning_rate": 9.256368945147998e-06, + "loss": 0.0538, + "step": 704 + }, + { + "epoch": 0.20102651839178784, + "grad_norm": 1.2040529131598932, + "learning_rate": 9.253943633460344e-06, + "loss": 0.0175, + "step": 705 + }, + { + "epoch": 0.2013116623895067, + "grad_norm": 1.6910714189347542, + "learning_rate": 9.251514692010423e-06, + "loss": 0.0349, + "step": 706 + }, + { + "epoch": 0.20159680638722555, + "grad_norm": 1.472948498746407, + "learning_rate": 9.249082122870779e-06, + "loss": 0.0369, + "step": 707 + }, + { + "epoch": 0.2018819503849444, + "grad_norm": 0.8762692780359592, + "learning_rate": 9.246645928117047e-06, + "loss": 0.0224, + "step": 708 + }, + { + "epoch": 0.20216709438266325, + "grad_norm": 2.4349769570385162, + "learning_rate": 9.24420610982796e-06, + "loss": 0.0254, + "step": 709 + }, + { + "epoch": 0.2024522383803821, + "grad_norm": 1.9112438219894465, + "learning_rate": 9.241762670085343e-06, + "loss": 0.0455, + "step": 710 + }, + { + "epoch": 0.20273738237810093, + "grad_norm": 2.434206504065968, + "learning_rate": 9.239315610974109e-06, + "loss": 0.0428, + "step": 711 + }, + { + "epoch": 0.2030225263758198, + "grad_norm": 1.6611178690442459, + "learning_rate": 9.236864934582259e-06, + "loss": 0.0311, + "step": 712 + }, + { + "epoch": 0.20330767037353864, + "grad_norm": 2.7886821146690237, + "learning_rate": 9.234410643000884e-06, + "loss": 0.0471, + "step": 713 + }, + { + "epoch": 0.20359281437125748, + "grad_norm": 1.7169627167940775, + "learning_rate": 9.231952738324155e-06, + "loss": 0.0302, + "step": 714 + }, + { + "epoch": 0.20387795836897632, + "grad_norm": 2.5621764097108652, + "learning_rate": 9.229491222649328e-06, + "loss": 0.0485, + "step": 715 + }, + { + "epoch": 0.2041631023666952, + "grad_norm": 1.17459682130259, + "learning_rate": 9.227026098076742e-06, + "loss": 0.0264, + "step": 716 + }, + { + "epoch": 0.20444824636441403, + "grad_norm": 1.3071201080016008, + "learning_rate": 9.224557366709813e-06, + "loss": 0.0255, + "step": 717 + }, + { + "epoch": 0.20473339036213287, + "grad_norm": 2.051560890960995, + "learning_rate": 9.222085030655035e-06, + "loss": 0.0476, + "step": 718 + }, + { + "epoch": 0.20501853435985173, + "grad_norm": 1.019980133767295, + "learning_rate": 9.219609092021976e-06, + "loss": 0.0202, + "step": 719 + }, + { + "epoch": 0.20530367835757057, + "grad_norm": 0.8053908676837997, + "learning_rate": 9.217129552923287e-06, + "loss": 0.0144, + "step": 720 + }, + { + "epoch": 0.2055888223552894, + "grad_norm": 0.44915310802273345, + "learning_rate": 9.214646415474676e-06, + "loss": 0.0188, + "step": 721 + }, + { + "epoch": 0.20587396635300828, + "grad_norm": 2.0651332669928664, + "learning_rate": 9.212159681794935e-06, + "loss": 0.0576, + "step": 722 + }, + { + "epoch": 0.20615911035072712, + "grad_norm": 1.1845182088251505, + "learning_rate": 9.209669354005915e-06, + "loss": 0.0251, + "step": 723 + }, + { + "epoch": 0.20644425434844596, + "grad_norm": 0.8224946140289474, + "learning_rate": 9.20717543423254e-06, + "loss": 0.0397, + "step": 724 + }, + { + "epoch": 0.2067293983461648, + "grad_norm": 0.922555862579926, + "learning_rate": 9.204677924602799e-06, + "loss": 0.0228, + "step": 725 + }, + { + "epoch": 0.20701454234388367, + "grad_norm": 0.7195816655824018, + "learning_rate": 9.202176827247739e-06, + "loss": 0.0234, + "step": 726 + }, + { + "epoch": 0.2072996863416025, + "grad_norm": 1.5777936992378956, + "learning_rate": 9.19967214430147e-06, + "loss": 0.047, + "step": 727 + }, + { + "epoch": 0.20758483033932135, + "grad_norm": 2.115593197700014, + "learning_rate": 9.197163877901167e-06, + "loss": 0.0804, + "step": 728 + }, + { + "epoch": 0.20786997433704021, + "grad_norm": 1.7316346105054923, + "learning_rate": 9.194652030187055e-06, + "loss": 0.0452, + "step": 729 + }, + { + "epoch": 0.20815511833475905, + "grad_norm": 1.732715865988259, + "learning_rate": 9.19213660330242e-06, + "loss": 0.0288, + "step": 730 + }, + { + "epoch": 0.2084402623324779, + "grad_norm": 2.9257895919417085, + "learning_rate": 9.1896175993936e-06, + "loss": 0.0551, + "step": 731 + }, + { + "epoch": 0.20872540633019676, + "grad_norm": 1.2005441423398024, + "learning_rate": 9.187095020609982e-06, + "loss": 0.0591, + "step": 732 + }, + { + "epoch": 0.2090105503279156, + "grad_norm": 1.1442567758861109, + "learning_rate": 9.18456886910401e-06, + "loss": 0.0316, + "step": 733 + }, + { + "epoch": 0.20929569432563444, + "grad_norm": 1.442435487183275, + "learning_rate": 9.182039147031174e-06, + "loss": 0.0357, + "step": 734 + }, + { + "epoch": 0.20958083832335328, + "grad_norm": 0.7389338008585332, + "learning_rate": 9.179505856550006e-06, + "loss": 0.0213, + "step": 735 + }, + { + "epoch": 0.20986598232107215, + "grad_norm": 0.5763570642711087, + "learning_rate": 9.176968999822091e-06, + "loss": 0.0261, + "step": 736 + }, + { + "epoch": 0.210151126318791, + "grad_norm": 2.9913704549741467, + "learning_rate": 9.174428579012051e-06, + "loss": 0.0466, + "step": 737 + }, + { + "epoch": 0.21043627031650983, + "grad_norm": 0.9514972565621266, + "learning_rate": 9.171884596287548e-06, + "loss": 0.0316, + "step": 738 + }, + { + "epoch": 0.2107214143142287, + "grad_norm": 1.1166267794394404, + "learning_rate": 9.16933705381929e-06, + "loss": 0.0425, + "step": 739 + }, + { + "epoch": 0.21100655831194753, + "grad_norm": 2.6475916702209794, + "learning_rate": 9.166785953781017e-06, + "loss": 0.0383, + "step": 740 + }, + { + "epoch": 0.21129170230966637, + "grad_norm": 0.5035517781871393, + "learning_rate": 9.164231298349505e-06, + "loss": 0.0116, + "step": 741 + }, + { + "epoch": 0.21157684630738524, + "grad_norm": 2.735386210598088, + "learning_rate": 9.161673089704565e-06, + "loss": 0.0439, + "step": 742 + }, + { + "epoch": 0.21186199030510408, + "grad_norm": 1.7132023378847019, + "learning_rate": 9.159111330029041e-06, + "loss": 0.0428, + "step": 743 + }, + { + "epoch": 0.21214713430282292, + "grad_norm": 1.2102951850569579, + "learning_rate": 9.156546021508803e-06, + "loss": 0.0181, + "step": 744 + }, + { + "epoch": 0.21243227830054176, + "grad_norm": 1.822168441101455, + "learning_rate": 9.153977166332756e-06, + "loss": 0.038, + "step": 745 + }, + { + "epoch": 0.21271742229826063, + "grad_norm": 3.1595776380649334, + "learning_rate": 9.151404766692822e-06, + "loss": 0.0676, + "step": 746 + }, + { + "epoch": 0.21300256629597947, + "grad_norm": 2.4059297211150947, + "learning_rate": 9.148828824783956e-06, + "loss": 0.0418, + "step": 747 + }, + { + "epoch": 0.2132877102936983, + "grad_norm": 1.1863131598693795, + "learning_rate": 9.146249342804128e-06, + "loss": 0.0179, + "step": 748 + }, + { + "epoch": 0.21357285429141717, + "grad_norm": 1.6629250296893985, + "learning_rate": 9.143666322954336e-06, + "loss": 0.0321, + "step": 749 + }, + { + "epoch": 0.21385799828913601, + "grad_norm": 0.702256216039878, + "learning_rate": 9.141079767438592e-06, + "loss": 0.0272, + "step": 750 + }, + { + "epoch": 0.21414314228685485, + "grad_norm": 1.0421114336526116, + "learning_rate": 9.138489678463927e-06, + "loss": 0.0288, + "step": 751 + }, + { + "epoch": 0.21442828628457372, + "grad_norm": 1.398071589217176, + "learning_rate": 9.135896058240384e-06, + "loss": 0.0164, + "step": 752 + }, + { + "epoch": 0.21471343028229256, + "grad_norm": 1.180627226683535, + "learning_rate": 9.133298908981021e-06, + "loss": 0.0302, + "step": 753 + }, + { + "epoch": 0.2149985742800114, + "grad_norm": 2.12849533813395, + "learning_rate": 9.13069823290191e-06, + "loss": 0.034, + "step": 754 + }, + { + "epoch": 0.21528371827773024, + "grad_norm": 1.1520445091389948, + "learning_rate": 9.12809403222213e-06, + "loss": 0.02, + "step": 755 + }, + { + "epoch": 0.2155688622754491, + "grad_norm": 0.9314697391415853, + "learning_rate": 9.125486309163764e-06, + "loss": 0.022, + "step": 756 + }, + { + "epoch": 0.21585400627316795, + "grad_norm": 0.4807671860197455, + "learning_rate": 9.122875065951907e-06, + "loss": 0.0129, + "step": 757 + }, + { + "epoch": 0.2161391502708868, + "grad_norm": 1.0284661378275857, + "learning_rate": 9.12026030481465e-06, + "loss": 0.0198, + "step": 758 + }, + { + "epoch": 0.21642429426860565, + "grad_norm": 1.3711724506820158, + "learning_rate": 9.117642027983096e-06, + "loss": 0.0275, + "step": 759 + }, + { + "epoch": 0.2167094382663245, + "grad_norm": 1.2056627878364008, + "learning_rate": 9.115020237691336e-06, + "loss": 0.0233, + "step": 760 + }, + { + "epoch": 0.21699458226404333, + "grad_norm": 1.5615068937947674, + "learning_rate": 9.11239493617647e-06, + "loss": 0.0472, + "step": 761 + }, + { + "epoch": 0.2172797262617622, + "grad_norm": 1.053111712057514, + "learning_rate": 9.109766125678585e-06, + "loss": 0.0236, + "step": 762 + }, + { + "epoch": 0.21756487025948104, + "grad_norm": 1.492926709547356, + "learning_rate": 9.107133808440767e-06, + "loss": 0.0278, + "step": 763 + }, + { + "epoch": 0.21785001425719988, + "grad_norm": 0.7321893976881936, + "learning_rate": 9.104497986709096e-06, + "loss": 0.011, + "step": 764 + }, + { + "epoch": 0.21813515825491872, + "grad_norm": 0.972053791654552, + "learning_rate": 9.101858662732635e-06, + "loss": 0.0284, + "step": 765 + }, + { + "epoch": 0.2184203022526376, + "grad_norm": 0.384726545541271, + "learning_rate": 9.099215838763444e-06, + "loss": 0.0091, + "step": 766 + }, + { + "epoch": 0.21870544625035643, + "grad_norm": 1.4964596931892333, + "learning_rate": 9.096569517056562e-06, + "loss": 0.0155, + "step": 767 + }, + { + "epoch": 0.21899059024807527, + "grad_norm": 1.4119719189435824, + "learning_rate": 9.093919699870017e-06, + "loss": 0.0278, + "step": 768 + }, + { + "epoch": 0.21927573424579413, + "grad_norm": 2.196204389526829, + "learning_rate": 9.091266389464818e-06, + "loss": 0.0621, + "step": 769 + }, + { + "epoch": 0.21956087824351297, + "grad_norm": 1.437174200010841, + "learning_rate": 9.088609588104958e-06, + "loss": 0.0207, + "step": 770 + }, + { + "epoch": 0.21984602224123181, + "grad_norm": 2.1986672533499307, + "learning_rate": 9.085949298057402e-06, + "loss": 0.0357, + "step": 771 + }, + { + "epoch": 0.22013116623895068, + "grad_norm": 2.0638899957028642, + "learning_rate": 9.083285521592097e-06, + "loss": 0.0473, + "step": 772 + }, + { + "epoch": 0.22041631023666952, + "grad_norm": 1.1744345395272808, + "learning_rate": 9.080618260981964e-06, + "loss": 0.0155, + "step": 773 + }, + { + "epoch": 0.22070145423438836, + "grad_norm": 2.306169669849833, + "learning_rate": 9.077947518502894e-06, + "loss": 0.033, + "step": 774 + }, + { + "epoch": 0.2209865982321072, + "grad_norm": 1.9124745318350345, + "learning_rate": 9.075273296433753e-06, + "loss": 0.0379, + "step": 775 + }, + { + "epoch": 0.22127174222982607, + "grad_norm": 2.0494348960481785, + "learning_rate": 9.072595597056375e-06, + "loss": 0.0671, + "step": 776 + }, + { + "epoch": 0.2215568862275449, + "grad_norm": 1.6680009681455517, + "learning_rate": 9.069914422655559e-06, + "loss": 0.0301, + "step": 777 + }, + { + "epoch": 0.22184203022526375, + "grad_norm": 1.6771017256370313, + "learning_rate": 9.067229775519071e-06, + "loss": 0.0229, + "step": 778 + }, + { + "epoch": 0.22212717422298262, + "grad_norm": 2.0275590166360806, + "learning_rate": 9.064541657937641e-06, + "loss": 0.0685, + "step": 779 + }, + { + "epoch": 0.22241231822070145, + "grad_norm": 2.360895367226193, + "learning_rate": 9.061850072204958e-06, + "loss": 0.0299, + "step": 780 + }, + { + "epoch": 0.2226974622184203, + "grad_norm": 1.5539467978062291, + "learning_rate": 9.05915502061767e-06, + "loss": 0.0403, + "step": 781 + }, + { + "epoch": 0.22298260621613916, + "grad_norm": 0.7670508938888166, + "learning_rate": 9.056456505475385e-06, + "loss": 0.0105, + "step": 782 + }, + { + "epoch": 0.223267750213858, + "grad_norm": 0.7937880675570071, + "learning_rate": 9.053754529080664e-06, + "loss": 0.0176, + "step": 783 + }, + { + "epoch": 0.22355289421157684, + "grad_norm": 2.157831660066521, + "learning_rate": 9.051049093739023e-06, + "loss": 0.0336, + "step": 784 + }, + { + "epoch": 0.22383803820929568, + "grad_norm": 1.1663071318976364, + "learning_rate": 9.048340201758929e-06, + "loss": 0.0252, + "step": 785 + }, + { + "epoch": 0.22412318220701455, + "grad_norm": 1.292838079011928, + "learning_rate": 9.045627855451797e-06, + "loss": 0.0363, + "step": 786 + }, + { + "epoch": 0.2244083262047334, + "grad_norm": 1.5733550446218494, + "learning_rate": 9.04291205713199e-06, + "loss": 0.0213, + "step": 787 + }, + { + "epoch": 0.22469347020245223, + "grad_norm": 1.1259643693921413, + "learning_rate": 9.04019280911682e-06, + "loss": 0.0149, + "step": 788 + }, + { + "epoch": 0.2249786142001711, + "grad_norm": 1.0083139262164555, + "learning_rate": 9.037470113726537e-06, + "loss": 0.0172, + "step": 789 + }, + { + "epoch": 0.22526375819788994, + "grad_norm": 1.3511474029563015, + "learning_rate": 9.034743973284337e-06, + "loss": 0.0215, + "step": 790 + }, + { + "epoch": 0.22554890219560877, + "grad_norm": 0.8363495828122983, + "learning_rate": 9.032014390116351e-06, + "loss": 0.0266, + "step": 791 + }, + { + "epoch": 0.22583404619332764, + "grad_norm": 3.1833632224433996, + "learning_rate": 9.029281366551654e-06, + "loss": 0.0254, + "step": 792 + }, + { + "epoch": 0.22611919019104648, + "grad_norm": 2.1181522596055435, + "learning_rate": 9.02654490492225e-06, + "loss": 0.034, + "step": 793 + }, + { + "epoch": 0.22640433418876532, + "grad_norm": 1.125558580379027, + "learning_rate": 9.02380500756308e-06, + "loss": 0.0244, + "step": 794 + }, + { + "epoch": 0.22668947818648416, + "grad_norm": 1.2351475059799766, + "learning_rate": 9.021061676812016e-06, + "loss": 0.0175, + "step": 795 + }, + { + "epoch": 0.22697462218420303, + "grad_norm": 2.0823417135384426, + "learning_rate": 9.01831491500986e-06, + "loss": 0.0314, + "step": 796 + }, + { + "epoch": 0.22725976618192187, + "grad_norm": 2.6273804667622955, + "learning_rate": 9.015564724500343e-06, + "loss": 0.0439, + "step": 797 + }, + { + "epoch": 0.2275449101796407, + "grad_norm": 0.5129484219330254, + "learning_rate": 9.012811107630118e-06, + "loss": 0.0101, + "step": 798 + }, + { + "epoch": 0.22783005417735958, + "grad_norm": 1.1938003324425006, + "learning_rate": 9.010054066748764e-06, + "loss": 0.0379, + "step": 799 + }, + { + "epoch": 0.22811519817507842, + "grad_norm": 3.097898875063007, + "learning_rate": 9.00729360420878e-06, + "loss": 0.0781, + "step": 800 + }, + { + "epoch": 0.22840034217279725, + "grad_norm": 1.0145281059549673, + "learning_rate": 9.004529722365585e-06, + "loss": 0.0137, + "step": 801 + }, + { + "epoch": 0.22868548617051612, + "grad_norm": 0.9369644953910446, + "learning_rate": 9.001762423577521e-06, + "loss": 0.0129, + "step": 802 + }, + { + "epoch": 0.22897063016823496, + "grad_norm": 1.6897375760595272, + "learning_rate": 8.998991710205837e-06, + "loss": 0.0358, + "step": 803 + }, + { + "epoch": 0.2292557741659538, + "grad_norm": 1.5083615821843614, + "learning_rate": 8.996217584614702e-06, + "loss": 0.0299, + "step": 804 + }, + { + "epoch": 0.22954091816367264, + "grad_norm": 1.5739940630915592, + "learning_rate": 8.99344004917119e-06, + "loss": 0.0256, + "step": 805 + }, + { + "epoch": 0.2298260621613915, + "grad_norm": 3.8075761182654593, + "learning_rate": 8.990659106245292e-06, + "loss": 0.0503, + "step": 806 + }, + { + "epoch": 0.23011120615911035, + "grad_norm": 0.679718812603675, + "learning_rate": 8.9878747582099e-06, + "loss": 0.0152, + "step": 807 + }, + { + "epoch": 0.2303963501568292, + "grad_norm": 1.6636057899743955, + "learning_rate": 8.98508700744082e-06, + "loss": 0.0236, + "step": 808 + }, + { + "epoch": 0.23068149415454806, + "grad_norm": 1.0031959304118039, + "learning_rate": 8.98229585631675e-06, + "loss": 0.0223, + "step": 809 + }, + { + "epoch": 0.2309666381522669, + "grad_norm": 0.49804281066136624, + "learning_rate": 8.979501307219298e-06, + "loss": 0.0126, + "step": 810 + }, + { + "epoch": 0.23125178214998574, + "grad_norm": 2.390993893561002, + "learning_rate": 8.976703362532971e-06, + "loss": 0.0316, + "step": 811 + }, + { + "epoch": 0.2315369261477046, + "grad_norm": 1.9859352958275294, + "learning_rate": 8.973902024645165e-06, + "loss": 0.0164, + "step": 812 + }, + { + "epoch": 0.23182207014542344, + "grad_norm": 2.609134116114303, + "learning_rate": 8.971097295946183e-06, + "loss": 0.0511, + "step": 813 + }, + { + "epoch": 0.23210721414314228, + "grad_norm": 1.3455604419784448, + "learning_rate": 8.968289178829214e-06, + "loss": 0.0232, + "step": 814 + }, + { + "epoch": 0.23239235814086112, + "grad_norm": 2.843848539977979, + "learning_rate": 8.96547767569034e-06, + "loss": 0.0263, + "step": 815 + }, + { + "epoch": 0.23267750213858, + "grad_norm": 3.7860602043709983, + "learning_rate": 8.962662788928531e-06, + "loss": 0.0428, + "step": 816 + }, + { + "epoch": 0.23296264613629883, + "grad_norm": 2.4050408956018763, + "learning_rate": 8.959844520945646e-06, + "loss": 0.0357, + "step": 817 + }, + { + "epoch": 0.23324779013401767, + "grad_norm": 2.7114207248581113, + "learning_rate": 8.957022874146429e-06, + "loss": 0.0348, + "step": 818 + }, + { + "epoch": 0.23353293413173654, + "grad_norm": 2.5893678936272444, + "learning_rate": 8.954197850938506e-06, + "loss": 0.0257, + "step": 819 + }, + { + "epoch": 0.23381807812945538, + "grad_norm": 1.1802173880782538, + "learning_rate": 8.951369453732386e-06, + "loss": 0.0354, + "step": 820 + }, + { + "epoch": 0.23410322212717422, + "grad_norm": 0.5538609744161046, + "learning_rate": 8.948537684941452e-06, + "loss": 0.0121, + "step": 821 + }, + { + "epoch": 0.23438836612489308, + "grad_norm": 0.7526259552569778, + "learning_rate": 8.94570254698197e-06, + "loss": 0.0348, + "step": 822 + }, + { + "epoch": 0.23467351012261192, + "grad_norm": 0.38116984197805204, + "learning_rate": 8.942864042273075e-06, + "loss": 0.0036, + "step": 823 + }, + { + "epoch": 0.23495865412033076, + "grad_norm": 1.31857014571736, + "learning_rate": 8.94002217323678e-06, + "loss": 0.0163, + "step": 824 + }, + { + "epoch": 0.2352437981180496, + "grad_norm": 2.921327945172875, + "learning_rate": 8.937176942297968e-06, + "loss": 0.0399, + "step": 825 + }, + { + "epoch": 0.23552894211576847, + "grad_norm": 1.5144558599662687, + "learning_rate": 8.934328351884386e-06, + "loss": 0.0382, + "step": 826 + }, + { + "epoch": 0.2358140861134873, + "grad_norm": 1.2344815677516623, + "learning_rate": 8.931476404426653e-06, + "loss": 0.0332, + "step": 827 + }, + { + "epoch": 0.23609923011120615, + "grad_norm": 0.7033882183737051, + "learning_rate": 8.928621102358248e-06, + "loss": 0.0118, + "step": 828 + }, + { + "epoch": 0.23638437410892502, + "grad_norm": 6.45022989283852, + "learning_rate": 8.925762448115516e-06, + "loss": 0.0202, + "step": 829 + }, + { + "epoch": 0.23666951810664386, + "grad_norm": 2.5229595774880087, + "learning_rate": 8.92290044413766e-06, + "loss": 0.0368, + "step": 830 + }, + { + "epoch": 0.2369546621043627, + "grad_norm": 1.258824021540127, + "learning_rate": 8.92003509286674e-06, + "loss": 0.0202, + "step": 831 + }, + { + "epoch": 0.23723980610208156, + "grad_norm": 2.6905988020090956, + "learning_rate": 8.917166396747681e-06, + "loss": 0.0397, + "step": 832 + }, + { + "epoch": 0.2375249500998004, + "grad_norm": 1.4334425139307863, + "learning_rate": 8.914294358228245e-06, + "loss": 0.0233, + "step": 833 + }, + { + "epoch": 0.23781009409751924, + "grad_norm": 3.3639351642610373, + "learning_rate": 8.911418979759066e-06, + "loss": 0.035, + "step": 834 + }, + { + "epoch": 0.23809523809523808, + "grad_norm": 1.0725408705087607, + "learning_rate": 8.908540263793611e-06, + "loss": 0.0357, + "step": 835 + }, + { + "epoch": 0.23838038209295695, + "grad_norm": 0.8357183521879652, + "learning_rate": 8.905658212788207e-06, + "loss": 0.0134, + "step": 836 + }, + { + "epoch": 0.2386655260906758, + "grad_norm": 2.0411720213460347, + "learning_rate": 8.902772829202015e-06, + "loss": 0.0561, + "step": 837 + }, + { + "epoch": 0.23895067008839463, + "grad_norm": 0.4962525894662451, + "learning_rate": 8.899884115497053e-06, + "loss": 0.0089, + "step": 838 + }, + { + "epoch": 0.2392358140861135, + "grad_norm": 5.584077909965726, + "learning_rate": 8.896992074138171e-06, + "loss": 0.0478, + "step": 839 + }, + { + "epoch": 0.23952095808383234, + "grad_norm": 1.8286855246809188, + "learning_rate": 8.89409670759306e-06, + "loss": 0.0356, + "step": 840 + }, + { + "epoch": 0.23980610208155118, + "grad_norm": 0.7242311973923197, + "learning_rate": 8.89119801833225e-06, + "loss": 0.0196, + "step": 841 + }, + { + "epoch": 0.24009124607927004, + "grad_norm": 1.6524840292900185, + "learning_rate": 8.888296008829106e-06, + "loss": 0.0313, + "step": 842 + }, + { + "epoch": 0.24037639007698888, + "grad_norm": 1.6025270226398667, + "learning_rate": 8.885390681559822e-06, + "loss": 0.0589, + "step": 843 + }, + { + "epoch": 0.24066153407470772, + "grad_norm": 0.6828303452986196, + "learning_rate": 8.88248203900343e-06, + "loss": 0.0129, + "step": 844 + }, + { + "epoch": 0.24094667807242656, + "grad_norm": 1.0847428212810786, + "learning_rate": 8.879570083641788e-06, + "loss": 0.0169, + "step": 845 + }, + { + "epoch": 0.24123182207014543, + "grad_norm": 1.3682289401821128, + "learning_rate": 8.87665481795958e-06, + "loss": 0.0253, + "step": 846 + }, + { + "epoch": 0.24151696606786427, + "grad_norm": 3.7461358288629434, + "learning_rate": 8.873736244444311e-06, + "loss": 0.0483, + "step": 847 + }, + { + "epoch": 0.2418021100655831, + "grad_norm": 1.9237644513844145, + "learning_rate": 8.870814365586315e-06, + "loss": 0.0205, + "step": 848 + }, + { + "epoch": 0.24208725406330198, + "grad_norm": 1.2152994567992417, + "learning_rate": 8.867889183878742e-06, + "loss": 0.0268, + "step": 849 + }, + { + "epoch": 0.24237239806102082, + "grad_norm": 1.91329866781686, + "learning_rate": 8.864960701817564e-06, + "loss": 0.0178, + "step": 850 + }, + { + "epoch": 0.24265754205873966, + "grad_norm": 1.9431409917366391, + "learning_rate": 8.862028921901563e-06, + "loss": 0.033, + "step": 851 + }, + { + "epoch": 0.24294268605645852, + "grad_norm": 1.2283113327719135, + "learning_rate": 8.859093846632343e-06, + "loss": 0.0295, + "step": 852 + }, + { + "epoch": 0.24322783005417736, + "grad_norm": 0.9973988056561106, + "learning_rate": 8.856155478514313e-06, + "loss": 0.0212, + "step": 853 + }, + { + "epoch": 0.2435129740518962, + "grad_norm": 2.4421795168152753, + "learning_rate": 8.853213820054693e-06, + "loss": 0.0172, + "step": 854 + }, + { + "epoch": 0.24379811804961504, + "grad_norm": 0.6853186509231574, + "learning_rate": 8.850268873763514e-06, + "loss": 0.0071, + "step": 855 + }, + { + "epoch": 0.2440832620473339, + "grad_norm": 0.20714634708932925, + "learning_rate": 8.84732064215361e-06, + "loss": 0.0081, + "step": 856 + }, + { + "epoch": 0.24436840604505275, + "grad_norm": 1.4038906766225647, + "learning_rate": 8.844369127740617e-06, + "loss": 0.0591, + "step": 857 + }, + { + "epoch": 0.2446535500427716, + "grad_norm": 3.2053223620330944, + "learning_rate": 8.841414333042975e-06, + "loss": 0.0932, + "step": 858 + }, + { + "epoch": 0.24493869404049046, + "grad_norm": 0.7632677420756766, + "learning_rate": 8.83845626058192e-06, + "loss": 0.016, + "step": 859 + }, + { + "epoch": 0.2452238380382093, + "grad_norm": 1.4585990037580068, + "learning_rate": 8.835494912881487e-06, + "loss": 0.0264, + "step": 860 + }, + { + "epoch": 0.24550898203592814, + "grad_norm": 0.9392293367012389, + "learning_rate": 8.832530292468509e-06, + "loss": 0.0176, + "step": 861 + }, + { + "epoch": 0.245794126033647, + "grad_norm": 0.9313249492245359, + "learning_rate": 8.8295624018726e-06, + "loss": 0.0239, + "step": 862 + }, + { + "epoch": 0.24607927003136584, + "grad_norm": 1.8503666333746411, + "learning_rate": 8.826591243626178e-06, + "loss": 0.0468, + "step": 863 + }, + { + "epoch": 0.24636441402908468, + "grad_norm": 1.5599510249156554, + "learning_rate": 8.82361682026444e-06, + "loss": 0.0466, + "step": 864 + }, + { + "epoch": 0.24664955802680352, + "grad_norm": 0.7932008240844486, + "learning_rate": 8.820639134325371e-06, + "loss": 0.0174, + "step": 865 + }, + { + "epoch": 0.2469347020245224, + "grad_norm": 1.7519462558310501, + "learning_rate": 8.817658188349745e-06, + "loss": 0.0308, + "step": 866 + }, + { + "epoch": 0.24721984602224123, + "grad_norm": 0.7374567291630424, + "learning_rate": 8.81467398488111e-06, + "loss": 0.0268, + "step": 867 + }, + { + "epoch": 0.24750499001996007, + "grad_norm": 2.2486150692979185, + "learning_rate": 8.811686526465799e-06, + "loss": 0.0364, + "step": 868 + }, + { + "epoch": 0.24779013401767894, + "grad_norm": 1.743439371955327, + "learning_rate": 8.808695815652922e-06, + "loss": 0.0414, + "step": 869 + }, + { + "epoch": 0.24807527801539778, + "grad_norm": 1.0726966014600394, + "learning_rate": 8.805701854994358e-06, + "loss": 0.0208, + "step": 870 + }, + { + "epoch": 0.24836042201311662, + "grad_norm": 2.634942103615967, + "learning_rate": 8.802704647044766e-06, + "loss": 0.0335, + "step": 871 + }, + { + "epoch": 0.24864556601083548, + "grad_norm": 1.2580046690380893, + "learning_rate": 8.799704194361575e-06, + "loss": 0.0253, + "step": 872 + }, + { + "epoch": 0.24893071000855432, + "grad_norm": 1.096234047219748, + "learning_rate": 8.79670049950498e-06, + "loss": 0.0328, + "step": 873 + }, + { + "epoch": 0.24921585400627316, + "grad_norm": 1.547027261492061, + "learning_rate": 8.793693565037942e-06, + "loss": 0.034, + "step": 874 + }, + { + "epoch": 0.249500998003992, + "grad_norm": 0.5892064156881954, + "learning_rate": 8.790683393526192e-06, + "loss": 0.0164, + "step": 875 + }, + { + "epoch": 0.24978614200171087, + "grad_norm": 1.3901316717407268, + "learning_rate": 8.787669987538214e-06, + "loss": 0.0405, + "step": 876 + }, + { + "epoch": 0.25007128599942974, + "grad_norm": 1.7467544871345946, + "learning_rate": 8.784653349645259e-06, + "loss": 0.0464, + "step": 877 + }, + { + "epoch": 0.2503564299971486, + "grad_norm": 0.8752141247160915, + "learning_rate": 8.781633482421333e-06, + "loss": 0.0186, + "step": 878 + }, + { + "epoch": 0.2506415739948674, + "grad_norm": 2.4276853957481612, + "learning_rate": 8.7786103884432e-06, + "loss": 0.0584, + "step": 879 + }, + { + "epoch": 0.25092671799258626, + "grad_norm": 1.3072797801662137, + "learning_rate": 8.775584070290373e-06, + "loss": 0.0283, + "step": 880 + }, + { + "epoch": 0.2512118619903051, + "grad_norm": 0.676361478133077, + "learning_rate": 8.772554530545118e-06, + "loss": 0.0127, + "step": 881 + }, + { + "epoch": 0.25149700598802394, + "grad_norm": 2.1577548832377618, + "learning_rate": 8.769521771792453e-06, + "loss": 0.0389, + "step": 882 + }, + { + "epoch": 0.2517821499857428, + "grad_norm": 2.3856943630410496, + "learning_rate": 8.766485796620135e-06, + "loss": 0.0634, + "step": 883 + }, + { + "epoch": 0.25206729398346167, + "grad_norm": 1.0305070102089764, + "learning_rate": 8.763446607618675e-06, + "loss": 0.0202, + "step": 884 + }, + { + "epoch": 0.2523524379811805, + "grad_norm": 1.2830717231149895, + "learning_rate": 8.76040420738132e-06, + "loss": 0.0284, + "step": 885 + }, + { + "epoch": 0.25263758197889935, + "grad_norm": 1.6411073612068923, + "learning_rate": 8.75735859850406e-06, + "loss": 0.0219, + "step": 886 + }, + { + "epoch": 0.2529227259766182, + "grad_norm": 1.1756235902210506, + "learning_rate": 8.754309783585619e-06, + "loss": 0.0349, + "step": 887 + }, + { + "epoch": 0.25320786997433703, + "grad_norm": 0.6847813278483827, + "learning_rate": 8.751257765227462e-06, + "loss": 0.0206, + "step": 888 + }, + { + "epoch": 0.25349301397205587, + "grad_norm": 0.5892100476513646, + "learning_rate": 8.748202546033781e-06, + "loss": 0.0202, + "step": 889 + }, + { + "epoch": 0.2537781579697747, + "grad_norm": 0.6949316617452543, + "learning_rate": 8.745144128611506e-06, + "loss": 0.021, + "step": 890 + }, + { + "epoch": 0.2540633019674936, + "grad_norm": 0.9729222569070698, + "learning_rate": 8.742082515570291e-06, + "loss": 0.0194, + "step": 891 + }, + { + "epoch": 0.25434844596521244, + "grad_norm": 0.45568034642641875, + "learning_rate": 8.739017709522519e-06, + "loss": 0.0106, + "step": 892 + }, + { + "epoch": 0.2546335899629313, + "grad_norm": 1.2410486423220612, + "learning_rate": 8.735949713083295e-06, + "loss": 0.0118, + "step": 893 + }, + { + "epoch": 0.2549187339606501, + "grad_norm": 1.9597718656925054, + "learning_rate": 8.732878528870452e-06, + "loss": 0.0271, + "step": 894 + }, + { + "epoch": 0.25520387795836896, + "grad_norm": 1.7196417456122466, + "learning_rate": 8.729804159504537e-06, + "loss": 0.0498, + "step": 895 + }, + { + "epoch": 0.2554890219560878, + "grad_norm": 0.9101465781142756, + "learning_rate": 8.726726607608817e-06, + "loss": 0.009, + "step": 896 + }, + { + "epoch": 0.2557741659538067, + "grad_norm": 1.5268690830009926, + "learning_rate": 8.723645875809274e-06, + "loss": 0.0311, + "step": 897 + }, + { + "epoch": 0.25605930995152554, + "grad_norm": 1.2252517246259709, + "learning_rate": 8.720561966734604e-06, + "loss": 0.0102, + "step": 898 + }, + { + "epoch": 0.2563444539492444, + "grad_norm": 1.2280625351519197, + "learning_rate": 8.717474883016214e-06, + "loss": 0.0085, + "step": 899 + }, + { + "epoch": 0.2566295979469632, + "grad_norm": 3.117132439752605, + "learning_rate": 8.714384627288222e-06, + "loss": 0.0791, + "step": 900 + }, + { + "epoch": 0.25691474194468206, + "grad_norm": 1.4949161090600118, + "learning_rate": 8.711291202187447e-06, + "loss": 0.0322, + "step": 901 + }, + { + "epoch": 0.2571998859424009, + "grad_norm": 1.8302629323751376, + "learning_rate": 8.708194610353418e-06, + "loss": 0.0617, + "step": 902 + }, + { + "epoch": 0.25748502994011974, + "grad_norm": 2.308044393671751, + "learning_rate": 8.705094854428362e-06, + "loss": 0.0443, + "step": 903 + }, + { + "epoch": 0.25777017393783863, + "grad_norm": 1.6208323890738643, + "learning_rate": 8.701991937057211e-06, + "loss": 0.0229, + "step": 904 + }, + { + "epoch": 0.25805531793555747, + "grad_norm": 0.9063595773842802, + "learning_rate": 8.698885860887587e-06, + "loss": 0.0076, + "step": 905 + }, + { + "epoch": 0.2583404619332763, + "grad_norm": 0.7271393808586717, + "learning_rate": 8.695776628569813e-06, + "loss": 0.0305, + "step": 906 + }, + { + "epoch": 0.25862560593099515, + "grad_norm": 1.9625970380708422, + "learning_rate": 8.692664242756902e-06, + "loss": 0.0415, + "step": 907 + }, + { + "epoch": 0.258910749928714, + "grad_norm": 0.5666328525418538, + "learning_rate": 8.689548706104564e-06, + "loss": 0.0159, + "step": 908 + }, + { + "epoch": 0.25919589392643283, + "grad_norm": 2.8145375900053806, + "learning_rate": 8.68643002127119e-06, + "loss": 0.0574, + "step": 909 + }, + { + "epoch": 0.25948103792415167, + "grad_norm": 1.5275888603420062, + "learning_rate": 8.683308190917857e-06, + "loss": 0.0341, + "step": 910 + }, + { + "epoch": 0.25976618192187056, + "grad_norm": 3.1426880240693214, + "learning_rate": 8.680183217708334e-06, + "loss": 0.0549, + "step": 911 + }, + { + "epoch": 0.2600513259195894, + "grad_norm": 0.7773518110968769, + "learning_rate": 8.677055104309062e-06, + "loss": 0.0145, + "step": 912 + }, + { + "epoch": 0.26033646991730824, + "grad_norm": 1.9823459424727548, + "learning_rate": 8.673923853389172e-06, + "loss": 0.0437, + "step": 913 + }, + { + "epoch": 0.2606216139150271, + "grad_norm": 1.1164470316091455, + "learning_rate": 8.670789467620461e-06, + "loss": 0.0231, + "step": 914 + }, + { + "epoch": 0.2609067579127459, + "grad_norm": 0.6832591626715077, + "learning_rate": 8.667651949677409e-06, + "loss": 0.0163, + "step": 915 + }, + { + "epoch": 0.26119190191046476, + "grad_norm": 1.876547933242045, + "learning_rate": 8.664511302237164e-06, + "loss": 0.0279, + "step": 916 + }, + { + "epoch": 0.26147704590818366, + "grad_norm": 1.152597026479379, + "learning_rate": 8.661367527979547e-06, + "loss": 0.0161, + "step": 917 + }, + { + "epoch": 0.2617621899059025, + "grad_norm": 1.9164628134925323, + "learning_rate": 8.658220629587046e-06, + "loss": 0.0457, + "step": 918 + }, + { + "epoch": 0.26204733390362134, + "grad_norm": 1.5725610393465852, + "learning_rate": 8.655070609744816e-06, + "loss": 0.0271, + "step": 919 + }, + { + "epoch": 0.2623324779013402, + "grad_norm": 1.6679600131417485, + "learning_rate": 8.651917471140673e-06, + "loss": 0.0562, + "step": 920 + }, + { + "epoch": 0.262617621899059, + "grad_norm": 1.7758731689450964, + "learning_rate": 8.648761216465096e-06, + "loss": 0.0299, + "step": 921 + }, + { + "epoch": 0.26290276589677786, + "grad_norm": 1.3868053732760177, + "learning_rate": 8.64560184841122e-06, + "loss": 0.0178, + "step": 922 + }, + { + "epoch": 0.2631879098944967, + "grad_norm": 1.9904157646953466, + "learning_rate": 8.642439369674845e-06, + "loss": 0.0239, + "step": 923 + }, + { + "epoch": 0.2634730538922156, + "grad_norm": 1.7623169430202168, + "learning_rate": 8.639273782954412e-06, + "loss": 0.0263, + "step": 924 + }, + { + "epoch": 0.26375819788993443, + "grad_norm": 1.0216670315613425, + "learning_rate": 8.636105090951022e-06, + "loss": 0.0286, + "step": 925 + }, + { + "epoch": 0.26404334188765327, + "grad_norm": 1.0887871495489123, + "learning_rate": 8.63293329636843e-06, + "loss": 0.0395, + "step": 926 + }, + { + "epoch": 0.2643284858853721, + "grad_norm": 1.0890011420632484, + "learning_rate": 8.629758401913027e-06, + "loss": 0.0269, + "step": 927 + }, + { + "epoch": 0.26461362988309095, + "grad_norm": 1.0417579612272687, + "learning_rate": 8.626580410293859e-06, + "loss": 0.0133, + "step": 928 + }, + { + "epoch": 0.2648987738808098, + "grad_norm": 1.236301382418515, + "learning_rate": 8.623399324222608e-06, + "loss": 0.0164, + "step": 929 + }, + { + "epoch": 0.26518391787852863, + "grad_norm": 1.4547427099574102, + "learning_rate": 8.620215146413603e-06, + "loss": 0.0193, + "step": 930 + }, + { + "epoch": 0.2654690618762475, + "grad_norm": 0.9108297216575995, + "learning_rate": 8.617027879583801e-06, + "loss": 0.0211, + "step": 931 + }, + { + "epoch": 0.26575420587396636, + "grad_norm": 1.0697944180054113, + "learning_rate": 8.613837526452806e-06, + "loss": 0.0295, + "step": 932 + }, + { + "epoch": 0.2660393498716852, + "grad_norm": 0.6751528441766895, + "learning_rate": 8.61064408974285e-06, + "loss": 0.0156, + "step": 933 + }, + { + "epoch": 0.26632449386940404, + "grad_norm": 1.8490910731349444, + "learning_rate": 8.607447572178796e-06, + "loss": 0.0642, + "step": 934 + }, + { + "epoch": 0.2666096378671229, + "grad_norm": 0.7855324683250933, + "learning_rate": 8.604247976488137e-06, + "loss": 0.0292, + "step": 935 + }, + { + "epoch": 0.2668947818648417, + "grad_norm": 1.731820383728147, + "learning_rate": 8.601045305400988e-06, + "loss": 0.0373, + "step": 936 + }, + { + "epoch": 0.2671799258625606, + "grad_norm": 2.1895639341457716, + "learning_rate": 8.597839561650096e-06, + "loss": 0.0577, + "step": 937 + }, + { + "epoch": 0.26746506986027946, + "grad_norm": 0.891986333285142, + "learning_rate": 8.594630747970824e-06, + "loss": 0.0315, + "step": 938 + }, + { + "epoch": 0.2677502138579983, + "grad_norm": 0.6734024951645515, + "learning_rate": 8.591418867101158e-06, + "loss": 0.0169, + "step": 939 + }, + { + "epoch": 0.26803535785571714, + "grad_norm": 0.7790384279219437, + "learning_rate": 8.588203921781699e-06, + "loss": 0.024, + "step": 940 + }, + { + "epoch": 0.268320501853436, + "grad_norm": 1.8442659646304564, + "learning_rate": 8.584985914755663e-06, + "loss": 0.0476, + "step": 941 + }, + { + "epoch": 0.2686056458511548, + "grad_norm": 1.1596418180170927, + "learning_rate": 8.581764848768878e-06, + "loss": 0.0444, + "step": 942 + }, + { + "epoch": 0.26889078984887366, + "grad_norm": 1.4812227170866508, + "learning_rate": 8.578540726569782e-06, + "loss": 0.0379, + "step": 943 + }, + { + "epoch": 0.26917593384659255, + "grad_norm": 0.778765028574822, + "learning_rate": 8.575313550909424e-06, + "loss": 0.017, + "step": 944 + }, + { + "epoch": 0.2694610778443114, + "grad_norm": 0.6214418267264455, + "learning_rate": 8.572083324541454e-06, + "loss": 0.0193, + "step": 945 + }, + { + "epoch": 0.26974622184203023, + "grad_norm": 0.5537904546033956, + "learning_rate": 8.568850050222129e-06, + "loss": 0.0156, + "step": 946 + }, + { + "epoch": 0.27003136583974907, + "grad_norm": 1.9665330817864057, + "learning_rate": 8.565613730710303e-06, + "loss": 0.0363, + "step": 947 + }, + { + "epoch": 0.2703165098374679, + "grad_norm": 1.7179405656998434, + "learning_rate": 8.56237436876743e-06, + "loss": 0.0282, + "step": 948 + }, + { + "epoch": 0.27060165383518675, + "grad_norm": 1.591795064619829, + "learning_rate": 8.559131967157561e-06, + "loss": 0.0381, + "step": 949 + }, + { + "epoch": 0.2708867978329056, + "grad_norm": 1.4855162991515705, + "learning_rate": 8.55588652864734e-06, + "loss": 0.0319, + "step": 950 + }, + { + "epoch": 0.2711719418306245, + "grad_norm": 1.23660525088581, + "learning_rate": 8.552638056006004e-06, + "loss": 0.0211, + "step": 951 + }, + { + "epoch": 0.2714570858283433, + "grad_norm": 1.3975270889745315, + "learning_rate": 8.549386552005375e-06, + "loss": 0.0134, + "step": 952 + }, + { + "epoch": 0.27174222982606216, + "grad_norm": 1.9571395698904834, + "learning_rate": 8.546132019419862e-06, + "loss": 0.0248, + "step": 953 + }, + { + "epoch": 0.272027373823781, + "grad_norm": 2.1677184607215527, + "learning_rate": 8.542874461026462e-06, + "loss": 0.0502, + "step": 954 + }, + { + "epoch": 0.27231251782149984, + "grad_norm": 1.699300392038293, + "learning_rate": 8.539613879604751e-06, + "loss": 0.049, + "step": 955 + }, + { + "epoch": 0.2725976618192187, + "grad_norm": 1.5110559904082717, + "learning_rate": 8.536350277936887e-06, + "loss": 0.0361, + "step": 956 + }, + { + "epoch": 0.2728828058169376, + "grad_norm": 1.694634659292682, + "learning_rate": 8.533083658807601e-06, + "loss": 0.0602, + "step": 957 + }, + { + "epoch": 0.2731679498146564, + "grad_norm": 0.6948943012372113, + "learning_rate": 8.529814025004202e-06, + "loss": 0.0208, + "step": 958 + }, + { + "epoch": 0.27345309381237526, + "grad_norm": 1.2290435954640724, + "learning_rate": 8.526541379316569e-06, + "loss": 0.037, + "step": 959 + }, + { + "epoch": 0.2737382378100941, + "grad_norm": 0.9917533429856523, + "learning_rate": 8.523265724537153e-06, + "loss": 0.0326, + "step": 960 + }, + { + "epoch": 0.27402338180781294, + "grad_norm": 0.5603987383374495, + "learning_rate": 8.519987063460973e-06, + "loss": 0.0135, + "step": 961 + }, + { + "epoch": 0.2743085258055318, + "grad_norm": 0.7544090611895435, + "learning_rate": 8.51670539888561e-06, + "loss": 0.0346, + "step": 962 + }, + { + "epoch": 0.2745936698032506, + "grad_norm": 1.2787254933712948, + "learning_rate": 8.513420733611212e-06, + "loss": 0.0456, + "step": 963 + }, + { + "epoch": 0.2748788138009695, + "grad_norm": 1.186340995532642, + "learning_rate": 8.510133070440483e-06, + "loss": 0.023, + "step": 964 + }, + { + "epoch": 0.27516395779868835, + "grad_norm": 1.1520864734843645, + "learning_rate": 8.506842412178688e-06, + "loss": 0.0204, + "step": 965 + }, + { + "epoch": 0.2754491017964072, + "grad_norm": 1.3801482600792678, + "learning_rate": 8.503548761633646e-06, + "loss": 0.0513, + "step": 966 + }, + { + "epoch": 0.27573424579412603, + "grad_norm": 1.4900548822914077, + "learning_rate": 8.500252121615733e-06, + "loss": 0.0357, + "step": 967 + }, + { + "epoch": 0.27601938979184487, + "grad_norm": 1.3136520745656428, + "learning_rate": 8.496952494937869e-06, + "loss": 0.0287, + "step": 968 + }, + { + "epoch": 0.2763045337895637, + "grad_norm": 1.1985944568873772, + "learning_rate": 8.493649884415529e-06, + "loss": 0.0288, + "step": 969 + }, + { + "epoch": 0.27658967778728255, + "grad_norm": 1.3680126216074568, + "learning_rate": 8.490344292866728e-06, + "loss": 0.042, + "step": 970 + }, + { + "epoch": 0.27687482178500145, + "grad_norm": 1.3159870085955017, + "learning_rate": 8.487035723112033e-06, + "loss": 0.0205, + "step": 971 + }, + { + "epoch": 0.2771599657827203, + "grad_norm": 1.0078868077879315, + "learning_rate": 8.483724177974543e-06, + "loss": 0.0172, + "step": 972 + }, + { + "epoch": 0.2774451097804391, + "grad_norm": 1.1647342396721414, + "learning_rate": 8.480409660279903e-06, + "loss": 0.0187, + "step": 973 + }, + { + "epoch": 0.27773025377815796, + "grad_norm": 2.31581097184704, + "learning_rate": 8.477092172856287e-06, + "loss": 0.0386, + "step": 974 + }, + { + "epoch": 0.2780153977758768, + "grad_norm": 2.810835578979634, + "learning_rate": 8.473771718534411e-06, + "loss": 0.0757, + "step": 975 + }, + { + "epoch": 0.27830054177359564, + "grad_norm": 2.79707883978544, + "learning_rate": 8.47044830014752e-06, + "loss": 0.0477, + "step": 976 + }, + { + "epoch": 0.27858568577131454, + "grad_norm": 0.9667581419870983, + "learning_rate": 8.467121920531383e-06, + "loss": 0.0242, + "step": 977 + }, + { + "epoch": 0.2788708297690334, + "grad_norm": 0.6681742594213469, + "learning_rate": 8.463792582524302e-06, + "loss": 0.0096, + "step": 978 + }, + { + "epoch": 0.2791559737667522, + "grad_norm": 1.4545154240977922, + "learning_rate": 8.460460288967101e-06, + "loss": 0.0217, + "step": 979 + }, + { + "epoch": 0.27944111776447106, + "grad_norm": 0.3637082821635198, + "learning_rate": 8.457125042703124e-06, + "loss": 0.0067, + "step": 980 + }, + { + "epoch": 0.2797262617621899, + "grad_norm": 1.7754271211354633, + "learning_rate": 8.45378684657824e-06, + "loss": 0.0622, + "step": 981 + }, + { + "epoch": 0.28001140575990874, + "grad_norm": 0.512206035741963, + "learning_rate": 8.45044570344083e-06, + "loss": 0.0079, + "step": 982 + }, + { + "epoch": 0.2802965497576276, + "grad_norm": 0.287836877286698, + "learning_rate": 8.44710161614179e-06, + "loss": 0.0067, + "step": 983 + }, + { + "epoch": 0.2805816937553465, + "grad_norm": 0.5981293412787716, + "learning_rate": 8.443754587534529e-06, + "loss": 0.0182, + "step": 984 + }, + { + "epoch": 0.2808668377530653, + "grad_norm": 1.5761429513855125, + "learning_rate": 8.440404620474967e-06, + "loss": 0.0237, + "step": 985 + }, + { + "epoch": 0.28115198175078415, + "grad_norm": 1.3666585287674748, + "learning_rate": 8.43705171782153e-06, + "loss": 0.0425, + "step": 986 + }, + { + "epoch": 0.281437125748503, + "grad_norm": 1.697391675178744, + "learning_rate": 8.43369588243515e-06, + "loss": 0.0236, + "step": 987 + }, + { + "epoch": 0.28172226974622183, + "grad_norm": 3.0482492895545796, + "learning_rate": 8.430337117179259e-06, + "loss": 0.0409, + "step": 988 + }, + { + "epoch": 0.28200741374394067, + "grad_norm": 0.7229976500220648, + "learning_rate": 8.426975424919791e-06, + "loss": 0.01, + "step": 989 + }, + { + "epoch": 0.2822925577416595, + "grad_norm": 1.4158032540007655, + "learning_rate": 8.423610808525177e-06, + "loss": 0.0186, + "step": 990 + }, + { + "epoch": 0.2825777017393784, + "grad_norm": 1.6615373983383692, + "learning_rate": 8.420243270866343e-06, + "loss": 0.0652, + "step": 991 + }, + { + "epoch": 0.28286284573709725, + "grad_norm": 2.307370170129627, + "learning_rate": 8.416872814816707e-06, + "loss": 0.047, + "step": 992 + }, + { + "epoch": 0.2831479897348161, + "grad_norm": 1.8461913330119348, + "learning_rate": 8.41349944325218e-06, + "loss": 0.0465, + "step": 993 + }, + { + "epoch": 0.2834331337325349, + "grad_norm": 1.8568994890741377, + "learning_rate": 8.410123159051155e-06, + "loss": 0.0205, + "step": 994 + }, + { + "epoch": 0.28371827773025377, + "grad_norm": 1.2212193684268584, + "learning_rate": 8.40674396509452e-06, + "loss": 0.0218, + "step": 995 + }, + { + "epoch": 0.2840034217279726, + "grad_norm": 1.1766960236291593, + "learning_rate": 8.40336186426563e-06, + "loss": 0.0457, + "step": 996 + }, + { + "epoch": 0.2842885657256915, + "grad_norm": 1.1251722175740966, + "learning_rate": 8.39997685945034e-06, + "loss": 0.0216, + "step": 997 + }, + { + "epoch": 0.28457370972341034, + "grad_norm": 2.3908402551223786, + "learning_rate": 8.396588953536968e-06, + "loss": 0.0403, + "step": 998 + }, + { + "epoch": 0.2848588537211292, + "grad_norm": 2.331842901417062, + "learning_rate": 8.393198149416311e-06, + "loss": 0.0701, + "step": 999 + }, + { + "epoch": 0.285143997718848, + "grad_norm": 1.9563319254857874, + "learning_rate": 8.389804449981645e-06, + "loss": 0.0171, + "step": 1000 + }, + { + "epoch": 0.28542914171656686, + "grad_norm": 1.5049554471669486, + "learning_rate": 8.386407858128707e-06, + "loss": 0.0368, + "step": 1001 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 2.3620183376762647, + "learning_rate": 8.383008376755707e-06, + "loss": 0.0298, + "step": 1002 + }, + { + "epoch": 0.28599942971200454, + "grad_norm": 1.5127708826099804, + "learning_rate": 8.379606008763325e-06, + "loss": 0.0384, + "step": 1003 + }, + { + "epoch": 0.28628457370972343, + "grad_norm": 1.9077680147517553, + "learning_rate": 8.376200757054695e-06, + "loss": 0.0313, + "step": 1004 + }, + { + "epoch": 0.2865697177074423, + "grad_norm": 1.3689906306129866, + "learning_rate": 8.372792624535417e-06, + "loss": 0.0321, + "step": 1005 + }, + { + "epoch": 0.2868548617051611, + "grad_norm": 1.1238958908515264, + "learning_rate": 8.369381614113547e-06, + "loss": 0.014, + "step": 1006 + }, + { + "epoch": 0.28714000570287995, + "grad_norm": 1.2072815121718892, + "learning_rate": 8.365967728699602e-06, + "loss": 0.0179, + "step": 1007 + }, + { + "epoch": 0.2874251497005988, + "grad_norm": 0.4667146551541128, + "learning_rate": 8.362550971206543e-06, + "loss": 0.0146, + "step": 1008 + }, + { + "epoch": 0.28771029369831763, + "grad_norm": 1.6836956764062203, + "learning_rate": 8.359131344549788e-06, + "loss": 0.0299, + "step": 1009 + }, + { + "epoch": 0.28799543769603647, + "grad_norm": 0.8142984273923918, + "learning_rate": 8.355708851647202e-06, + "loss": 0.0226, + "step": 1010 + }, + { + "epoch": 0.28828058169375537, + "grad_norm": 1.6389536841060004, + "learning_rate": 8.352283495419096e-06, + "loss": 0.0308, + "step": 1011 + }, + { + "epoch": 0.2885657256914742, + "grad_norm": 1.5444478808693596, + "learning_rate": 8.348855278788224e-06, + "loss": 0.0589, + "step": 1012 + }, + { + "epoch": 0.28885086968919305, + "grad_norm": 0.7259294284234575, + "learning_rate": 8.345424204679778e-06, + "loss": 0.0191, + "step": 1013 + }, + { + "epoch": 0.2891360136869119, + "grad_norm": 0.6088052137967528, + "learning_rate": 8.34199027602139e-06, + "loss": 0.0158, + "step": 1014 + }, + { + "epoch": 0.2894211576846307, + "grad_norm": 0.6287296236873937, + "learning_rate": 8.338553495743132e-06, + "loss": 0.0218, + "step": 1015 + }, + { + "epoch": 0.28970630168234957, + "grad_norm": 1.7658297184074514, + "learning_rate": 8.335113866777502e-06, + "loss": 0.0441, + "step": 1016 + }, + { + "epoch": 0.28999144568006846, + "grad_norm": 0.5235221980156642, + "learning_rate": 8.331671392059433e-06, + "loss": 0.0166, + "step": 1017 + }, + { + "epoch": 0.2902765896777873, + "grad_norm": 0.8033872378713534, + "learning_rate": 8.328226074526284e-06, + "loss": 0.0292, + "step": 1018 + }, + { + "epoch": 0.29056173367550614, + "grad_norm": 1.568699813214378, + "learning_rate": 8.324777917117843e-06, + "loss": 0.029, + "step": 1019 + }, + { + "epoch": 0.290846877673225, + "grad_norm": 0.8052329509177221, + "learning_rate": 8.321326922776321e-06, + "loss": 0.0231, + "step": 1020 + }, + { + "epoch": 0.2911320216709438, + "grad_norm": 0.5269396898380581, + "learning_rate": 8.31787309444634e-06, + "loss": 0.0174, + "step": 1021 + }, + { + "epoch": 0.29141716566866266, + "grad_norm": 1.6205080390020445, + "learning_rate": 8.314416435074956e-06, + "loss": 0.0197, + "step": 1022 + }, + { + "epoch": 0.2917023096663815, + "grad_norm": 1.666001562463425, + "learning_rate": 8.31095694761163e-06, + "loss": 0.019, + "step": 1023 + }, + { + "epoch": 0.2919874536641004, + "grad_norm": 1.835698903031861, + "learning_rate": 8.307494635008237e-06, + "loss": 0.0214, + "step": 1024 + }, + { + "epoch": 0.29227259766181923, + "grad_norm": 0.8037987767198944, + "learning_rate": 8.304029500219064e-06, + "loss": 0.0168, + "step": 1025 + }, + { + "epoch": 0.2925577416595381, + "grad_norm": 2.370065976232439, + "learning_rate": 8.300561546200812e-06, + "loss": 0.0247, + "step": 1026 + }, + { + "epoch": 0.2928428856572569, + "grad_norm": 1.5504716392290512, + "learning_rate": 8.297090775912574e-06, + "loss": 0.0391, + "step": 1027 + }, + { + "epoch": 0.29312802965497575, + "grad_norm": 1.2554327251719692, + "learning_rate": 8.293617192315859e-06, + "loss": 0.0517, + "step": 1028 + }, + { + "epoch": 0.2934131736526946, + "grad_norm": 0.790459152689314, + "learning_rate": 8.29014079837457e-06, + "loss": 0.0119, + "step": 1029 + }, + { + "epoch": 0.29369831765041343, + "grad_norm": 1.4149648173860672, + "learning_rate": 8.28666159705501e-06, + "loss": 0.0133, + "step": 1030 + }, + { + "epoch": 0.2939834616481323, + "grad_norm": 0.9973359366143715, + "learning_rate": 8.283179591325879e-06, + "loss": 0.013, + "step": 1031 + }, + { + "epoch": 0.29426860564585117, + "grad_norm": 0.36926897766983774, + "learning_rate": 8.279694784158262e-06, + "loss": 0.008, + "step": 1032 + }, + { + "epoch": 0.29455374964357, + "grad_norm": 0.9623976721838673, + "learning_rate": 8.276207178525646e-06, + "loss": 0.0196, + "step": 1033 + }, + { + "epoch": 0.29483889364128885, + "grad_norm": 1.4554871378304686, + "learning_rate": 8.272716777403898e-06, + "loss": 0.041, + "step": 1034 + }, + { + "epoch": 0.2951240376390077, + "grad_norm": 2.6162190558641463, + "learning_rate": 8.26922358377127e-06, + "loss": 0.0745, + "step": 1035 + }, + { + "epoch": 0.2954091816367265, + "grad_norm": 0.8422156448296566, + "learning_rate": 8.265727600608401e-06, + "loss": 0.0194, + "step": 1036 + }, + { + "epoch": 0.2956943256344454, + "grad_norm": 1.0120407410840147, + "learning_rate": 8.262228830898313e-06, + "loss": 0.0387, + "step": 1037 + }, + { + "epoch": 0.29597946963216426, + "grad_norm": 0.7723311949173021, + "learning_rate": 8.258727277626394e-06, + "loss": 0.0102, + "step": 1038 + }, + { + "epoch": 0.2962646136298831, + "grad_norm": 0.9132982937599546, + "learning_rate": 8.255222943780419e-06, + "loss": 0.0236, + "step": 1039 + }, + { + "epoch": 0.29654975762760194, + "grad_norm": 1.4697397386034843, + "learning_rate": 8.251715832350526e-06, + "loss": 0.0406, + "step": 1040 + }, + { + "epoch": 0.2968349016253208, + "grad_norm": 1.3125795625460037, + "learning_rate": 8.248205946329233e-06, + "loss": 0.0249, + "step": 1041 + }, + { + "epoch": 0.2971200456230396, + "grad_norm": 2.0022314201473765, + "learning_rate": 8.244693288711416e-06, + "loss": 0.0389, + "step": 1042 + }, + { + "epoch": 0.29740518962075846, + "grad_norm": 0.9233574909374214, + "learning_rate": 8.241177862494323e-06, + "loss": 0.0177, + "step": 1043 + }, + { + "epoch": 0.29769033361847735, + "grad_norm": 1.118407928249881, + "learning_rate": 8.23765967067756e-06, + "loss": 0.0504, + "step": 1044 + }, + { + "epoch": 0.2979754776161962, + "grad_norm": 2.874005101086032, + "learning_rate": 8.234138716263095e-06, + "loss": 0.0629, + "step": 1045 + }, + { + "epoch": 0.29826062161391503, + "grad_norm": 1.7249213253113764, + "learning_rate": 8.230615002255254e-06, + "loss": 0.039, + "step": 1046 + }, + { + "epoch": 0.2985457656116339, + "grad_norm": 1.282268931205406, + "learning_rate": 8.227088531660712e-06, + "loss": 0.0313, + "step": 1047 + }, + { + "epoch": 0.2988309096093527, + "grad_norm": 1.5416921550475424, + "learning_rate": 8.223559307488506e-06, + "loss": 0.0517, + "step": 1048 + }, + { + "epoch": 0.29911605360707155, + "grad_norm": 0.9912931776780044, + "learning_rate": 8.220027332750012e-06, + "loss": 0.0316, + "step": 1049 + }, + { + "epoch": 0.2994011976047904, + "grad_norm": 0.6255964251801459, + "learning_rate": 8.21649261045896e-06, + "loss": 0.0218, + "step": 1050 + }, + { + "epoch": 0.2996863416025093, + "grad_norm": 1.0299927686497676, + "learning_rate": 8.212955143631425e-06, + "loss": 0.0246, + "step": 1051 + }, + { + "epoch": 0.2999714856002281, + "grad_norm": 1.5809553916222134, + "learning_rate": 8.209414935285816e-06, + "loss": 0.036, + "step": 1052 + }, + { + "epoch": 0.30025662959794697, + "grad_norm": 1.0729865806566057, + "learning_rate": 8.20587198844289e-06, + "loss": 0.0227, + "step": 1053 + }, + { + "epoch": 0.3005417735956658, + "grad_norm": 2.1564117236579454, + "learning_rate": 8.202326306125736e-06, + "loss": 0.0314, + "step": 1054 + }, + { + "epoch": 0.30082691759338465, + "grad_norm": 0.9502094736160007, + "learning_rate": 8.198777891359778e-06, + "loss": 0.0248, + "step": 1055 + }, + { + "epoch": 0.3011120615911035, + "grad_norm": 0.9962396374644391, + "learning_rate": 8.19522674717277e-06, + "loss": 0.041, + "step": 1056 + }, + { + "epoch": 0.3013972055888224, + "grad_norm": 0.6369816183985646, + "learning_rate": 8.1916728765948e-06, + "loss": 0.0237, + "step": 1057 + }, + { + "epoch": 0.3016823495865412, + "grad_norm": 2.5117579204688623, + "learning_rate": 8.188116282658278e-06, + "loss": 0.0359, + "step": 1058 + }, + { + "epoch": 0.30196749358426006, + "grad_norm": 1.6764540415557843, + "learning_rate": 8.184556968397938e-06, + "loss": 0.0259, + "step": 1059 + }, + { + "epoch": 0.3022526375819789, + "grad_norm": 0.9996801308305837, + "learning_rate": 8.180994936850834e-06, + "loss": 0.0242, + "step": 1060 + }, + { + "epoch": 0.30253778157969774, + "grad_norm": 0.7595100913126963, + "learning_rate": 8.177430191056346e-06, + "loss": 0.0173, + "step": 1061 + }, + { + "epoch": 0.3028229255774166, + "grad_norm": 0.9654704679405917, + "learning_rate": 8.173862734056158e-06, + "loss": 0.0277, + "step": 1062 + }, + { + "epoch": 0.3031080695751354, + "grad_norm": 1.5038824626171603, + "learning_rate": 8.170292568894278e-06, + "loss": 0.0237, + "step": 1063 + }, + { + "epoch": 0.3033932135728543, + "grad_norm": 0.7973443754741695, + "learning_rate": 8.16671969861702e-06, + "loss": 0.027, + "step": 1064 + }, + { + "epoch": 0.30367835757057315, + "grad_norm": 1.8187366967593017, + "learning_rate": 8.163144126273004e-06, + "loss": 0.025, + "step": 1065 + }, + { + "epoch": 0.303963501568292, + "grad_norm": 0.9250825610118477, + "learning_rate": 8.159565854913162e-06, + "loss": 0.0175, + "step": 1066 + }, + { + "epoch": 0.30424864556601083, + "grad_norm": 1.506560630883658, + "learning_rate": 8.155984887590724e-06, + "loss": 0.0332, + "step": 1067 + }, + { + "epoch": 0.3045337895637297, + "grad_norm": 1.2512981744236478, + "learning_rate": 8.152401227361224e-06, + "loss": 0.0369, + "step": 1068 + }, + { + "epoch": 0.3048189335614485, + "grad_norm": 1.8833596709136027, + "learning_rate": 8.148814877282487e-06, + "loss": 0.0292, + "step": 1069 + }, + { + "epoch": 0.30510407755916735, + "grad_norm": 0.7881838063455426, + "learning_rate": 8.145225840414641e-06, + "loss": 0.0101, + "step": 1070 + }, + { + "epoch": 0.30538922155688625, + "grad_norm": 1.6796194211057058, + "learning_rate": 8.141634119820101e-06, + "loss": 0.0287, + "step": 1071 + }, + { + "epoch": 0.3056743655546051, + "grad_norm": 1.2891380669074353, + "learning_rate": 8.138039718563578e-06, + "loss": 0.0492, + "step": 1072 + }, + { + "epoch": 0.3059595095523239, + "grad_norm": 2.357993042502323, + "learning_rate": 8.134442639712063e-06, + "loss": 0.0456, + "step": 1073 + }, + { + "epoch": 0.30624465355004277, + "grad_norm": 0.5191000758439441, + "learning_rate": 8.130842886334837e-06, + "loss": 0.0083, + "step": 1074 + }, + { + "epoch": 0.3065297975477616, + "grad_norm": 0.9634677131357932, + "learning_rate": 8.127240461503462e-06, + "loss": 0.0159, + "step": 1075 + }, + { + "epoch": 0.30681494154548045, + "grad_norm": 0.27264204707454004, + "learning_rate": 8.123635368291777e-06, + "loss": 0.0051, + "step": 1076 + }, + { + "epoch": 0.30710008554319934, + "grad_norm": 2.2718130414005446, + "learning_rate": 8.120027609775902e-06, + "loss": 0.04, + "step": 1077 + }, + { + "epoch": 0.3073852295409182, + "grad_norm": 1.2599259798035607, + "learning_rate": 8.116417189034227e-06, + "loss": 0.0234, + "step": 1078 + }, + { + "epoch": 0.307670373538637, + "grad_norm": 1.2003202010441418, + "learning_rate": 8.112804109147416e-06, + "loss": 0.0137, + "step": 1079 + }, + { + "epoch": 0.30795551753635586, + "grad_norm": 1.525563420275153, + "learning_rate": 8.1091883731984e-06, + "loss": 0.0452, + "step": 1080 + }, + { + "epoch": 0.3082406615340747, + "grad_norm": 1.8645400213251933, + "learning_rate": 8.10556998427238e-06, + "loss": 0.0267, + "step": 1081 + }, + { + "epoch": 0.30852580553179354, + "grad_norm": 2.063747391446373, + "learning_rate": 8.10194894545682e-06, + "loss": 0.0357, + "step": 1082 + }, + { + "epoch": 0.3088109495295124, + "grad_norm": 1.2254249014948786, + "learning_rate": 8.09832525984144e-06, + "loss": 0.0226, + "step": 1083 + }, + { + "epoch": 0.3090960935272313, + "grad_norm": 1.4378801385084123, + "learning_rate": 8.094698930518224e-06, + "loss": 0.0207, + "step": 1084 + }, + { + "epoch": 0.3093812375249501, + "grad_norm": 1.6399576082537426, + "learning_rate": 8.091069960581408e-06, + "loss": 0.0208, + "step": 1085 + }, + { + "epoch": 0.30966638152266895, + "grad_norm": 0.4934963525040569, + "learning_rate": 8.087438353127486e-06, + "loss": 0.0065, + "step": 1086 + }, + { + "epoch": 0.3099515255203878, + "grad_norm": 1.009916460402547, + "learning_rate": 8.083804111255197e-06, + "loss": 0.0179, + "step": 1087 + }, + { + "epoch": 0.31023666951810663, + "grad_norm": 0.8948637487646737, + "learning_rate": 8.08016723806553e-06, + "loss": 0.0202, + "step": 1088 + }, + { + "epoch": 0.3105218135158255, + "grad_norm": 1.3076962346304433, + "learning_rate": 8.076527736661724e-06, + "loss": 0.0362, + "step": 1089 + }, + { + "epoch": 0.3108069575135443, + "grad_norm": 2.205028660931532, + "learning_rate": 8.072885610149251e-06, + "loss": 0.0309, + "step": 1090 + }, + { + "epoch": 0.3110921015112632, + "grad_norm": 1.4313613753949053, + "learning_rate": 8.06924086163583e-06, + "loss": 0.021, + "step": 1091 + }, + { + "epoch": 0.31137724550898205, + "grad_norm": 2.054475718794255, + "learning_rate": 8.065593494231418e-06, + "loss": 0.0468, + "step": 1092 + }, + { + "epoch": 0.3116623895067009, + "grad_norm": 1.7740986866927582, + "learning_rate": 8.061943511048199e-06, + "loss": 0.0516, + "step": 1093 + }, + { + "epoch": 0.3119475335044197, + "grad_norm": 46.640258129930466, + "learning_rate": 8.058290915200597e-06, + "loss": 0.1971, + "step": 1094 + }, + { + "epoch": 0.31223267750213857, + "grad_norm": 0.781762262501865, + "learning_rate": 8.054635709805263e-06, + "loss": 0.0248, + "step": 1095 + }, + { + "epoch": 0.3125178214998574, + "grad_norm": 1.082335674544725, + "learning_rate": 8.050977897981071e-06, + "loss": 0.0357, + "step": 1096 + }, + { + "epoch": 0.3128029654975763, + "grad_norm": 1.7052642583096669, + "learning_rate": 8.047317482849124e-06, + "loss": 0.0305, + "step": 1097 + }, + { + "epoch": 0.31308810949529514, + "grad_norm": 1.9320064281915101, + "learning_rate": 8.043654467532744e-06, + "loss": 0.0486, + "step": 1098 + }, + { + "epoch": 0.313373253493014, + "grad_norm": 2.6104512138319564, + "learning_rate": 8.039988855157472e-06, + "loss": 0.0608, + "step": 1099 + }, + { + "epoch": 0.3136583974907328, + "grad_norm": 0.906309465890772, + "learning_rate": 8.036320648851064e-06, + "loss": 0.0188, + "step": 1100 + }, + { + "epoch": 0.31394354148845166, + "grad_norm": 1.2528075290998113, + "learning_rate": 8.032649851743493e-06, + "loss": 0.0458, + "step": 1101 + }, + { + "epoch": 0.3142286854861705, + "grad_norm": 1.7482113067259941, + "learning_rate": 8.028976466966934e-06, + "loss": 0.0369, + "step": 1102 + }, + { + "epoch": 0.31451382948388934, + "grad_norm": 2.9189918710398755, + "learning_rate": 8.025300497655783e-06, + "loss": 0.0377, + "step": 1103 + }, + { + "epoch": 0.31479897348160824, + "grad_norm": 1.0983245681069267, + "learning_rate": 8.021621946946628e-06, + "loss": 0.0242, + "step": 1104 + }, + { + "epoch": 0.3150841174793271, + "grad_norm": 1.2121132358162416, + "learning_rate": 8.01794081797827e-06, + "loss": 0.0335, + "step": 1105 + }, + { + "epoch": 0.3153692614770459, + "grad_norm": 0.8067611328102654, + "learning_rate": 8.014257113891704e-06, + "loss": 0.0301, + "step": 1106 + }, + { + "epoch": 0.31565440547476475, + "grad_norm": 0.8529627858655344, + "learning_rate": 8.010570837830124e-06, + "loss": 0.0367, + "step": 1107 + }, + { + "epoch": 0.3159395494724836, + "grad_norm": 0.8676313074444348, + "learning_rate": 8.00688199293892e-06, + "loss": 0.019, + "step": 1108 + }, + { + "epoch": 0.31622469347020243, + "grad_norm": 0.7427102352300117, + "learning_rate": 8.003190582365669e-06, + "loss": 0.0262, + "step": 1109 + }, + { + "epoch": 0.3165098374679213, + "grad_norm": 0.9736769790238554, + "learning_rate": 7.999496609260144e-06, + "loss": 0.0184, + "step": 1110 + }, + { + "epoch": 0.31679498146564017, + "grad_norm": 1.27619488452561, + "learning_rate": 7.995800076774301e-06, + "loss": 0.0232, + "step": 1111 + }, + { + "epoch": 0.317080125463359, + "grad_norm": 2.0389975152307476, + "learning_rate": 7.992100988062277e-06, + "loss": 0.0393, + "step": 1112 + }, + { + "epoch": 0.31736526946107785, + "grad_norm": 0.7124345990268323, + "learning_rate": 7.988399346280398e-06, + "loss": 0.0157, + "step": 1113 + }, + { + "epoch": 0.3176504134587967, + "grad_norm": 0.7642116938852253, + "learning_rate": 7.98469515458716e-06, + "loss": 0.0239, + "step": 1114 + }, + { + "epoch": 0.3179355574565155, + "grad_norm": 0.8303694050770815, + "learning_rate": 7.980988416143239e-06, + "loss": 0.0188, + "step": 1115 + }, + { + "epoch": 0.31822070145423437, + "grad_norm": 1.7299262380985978, + "learning_rate": 7.977279134111487e-06, + "loss": 0.0384, + "step": 1116 + }, + { + "epoch": 0.31850584545195326, + "grad_norm": 0.821877902826892, + "learning_rate": 7.973567311656917e-06, + "loss": 0.0152, + "step": 1117 + }, + { + "epoch": 0.3187909894496721, + "grad_norm": 0.841185794140611, + "learning_rate": 7.96985295194672e-06, + "loss": 0.0213, + "step": 1118 + }, + { + "epoch": 0.31907613344739094, + "grad_norm": 2.9117336343809694, + "learning_rate": 7.966136058150247e-06, + "loss": 0.0337, + "step": 1119 + }, + { + "epoch": 0.3193612774451098, + "grad_norm": 0.37780063918483164, + "learning_rate": 7.962416633439008e-06, + "loss": 0.0143, + "step": 1120 + }, + { + "epoch": 0.3196464214428286, + "grad_norm": 1.1660782504301215, + "learning_rate": 7.958694680986682e-06, + "loss": 0.0258, + "step": 1121 + }, + { + "epoch": 0.31993156544054746, + "grad_norm": 0.6348660624052973, + "learning_rate": 7.954970203969095e-06, + "loss": 0.0401, + "step": 1122 + }, + { + "epoch": 0.3202167094382663, + "grad_norm": 0.5893537112468747, + "learning_rate": 7.951243205564234e-06, + "loss": 0.0106, + "step": 1123 + }, + { + "epoch": 0.3205018534359852, + "grad_norm": 0.7759687299701817, + "learning_rate": 7.947513688952234e-06, + "loss": 0.0202, + "step": 1124 + }, + { + "epoch": 0.32078699743370404, + "grad_norm": 1.7145098320011822, + "learning_rate": 7.943781657315377e-06, + "loss": 0.0227, + "step": 1125 + }, + { + "epoch": 0.3210721414314229, + "grad_norm": 1.8907573081270583, + "learning_rate": 7.940047113838096e-06, + "loss": 0.0218, + "step": 1126 + }, + { + "epoch": 0.3213572854291417, + "grad_norm": 1.1764345975362007, + "learning_rate": 7.936310061706965e-06, + "loss": 0.0399, + "step": 1127 + }, + { + "epoch": 0.32164242942686055, + "grad_norm": 0.8821811693487115, + "learning_rate": 7.932570504110697e-06, + "loss": 0.0252, + "step": 1128 + }, + { + "epoch": 0.3219275734245794, + "grad_norm": 0.8404345204905768, + "learning_rate": 7.928828444240144e-06, + "loss": 0.0273, + "step": 1129 + }, + { + "epoch": 0.32221271742229823, + "grad_norm": 0.714789071782589, + "learning_rate": 7.925083885288296e-06, + "loss": 0.0165, + "step": 1130 + }, + { + "epoch": 0.32249786142001713, + "grad_norm": 0.9439440999089335, + "learning_rate": 7.921336830450268e-06, + "loss": 0.0174, + "step": 1131 + }, + { + "epoch": 0.32278300541773597, + "grad_norm": 1.9591792214351356, + "learning_rate": 7.917587282923312e-06, + "loss": 0.0492, + "step": 1132 + }, + { + "epoch": 0.3230681494154548, + "grad_norm": 0.9159150616345992, + "learning_rate": 7.913835245906805e-06, + "loss": 0.0174, + "step": 1133 + }, + { + "epoch": 0.32335329341317365, + "grad_norm": 1.6772121272739662, + "learning_rate": 7.910080722602245e-06, + "loss": 0.0219, + "step": 1134 + }, + { + "epoch": 0.3236384374108925, + "grad_norm": 1.1616225108993936, + "learning_rate": 7.906323716213256e-06, + "loss": 0.0263, + "step": 1135 + }, + { + "epoch": 0.3239235814086113, + "grad_norm": 1.151082996421702, + "learning_rate": 7.902564229945577e-06, + "loss": 0.0111, + "step": 1136 + }, + { + "epoch": 0.3242087254063302, + "grad_norm": 1.2986081859094945, + "learning_rate": 7.898802267007067e-06, + "loss": 0.0116, + "step": 1137 + }, + { + "epoch": 0.32449386940404906, + "grad_norm": 1.3947759655941292, + "learning_rate": 7.895037830607692e-06, + "loss": 0.0389, + "step": 1138 + }, + { + "epoch": 0.3247790134017679, + "grad_norm": 1.0484963530466218, + "learning_rate": 7.891270923959537e-06, + "loss": 0.014, + "step": 1139 + }, + { + "epoch": 0.32506415739948674, + "grad_norm": 1.504826560429216, + "learning_rate": 7.887501550276789e-06, + "loss": 0.0423, + "step": 1140 + }, + { + "epoch": 0.3253493013972056, + "grad_norm": 1.5608082869525692, + "learning_rate": 7.88372971277574e-06, + "loss": 0.0684, + "step": 1141 + }, + { + "epoch": 0.3256344453949244, + "grad_norm": 2.7628731807480653, + "learning_rate": 7.879955414674784e-06, + "loss": 0.0681, + "step": 1142 + }, + { + "epoch": 0.32591958939264326, + "grad_norm": 1.613849056092281, + "learning_rate": 7.87617865919442e-06, + "loss": 0.0463, + "step": 1143 + }, + { + "epoch": 0.32620473339036216, + "grad_norm": 2.5686764919053124, + "learning_rate": 7.872399449557238e-06, + "loss": 0.0536, + "step": 1144 + }, + { + "epoch": 0.326489877388081, + "grad_norm": 1.4183432016049962, + "learning_rate": 7.868617788987925e-06, + "loss": 0.0285, + "step": 1145 + }, + { + "epoch": 0.32677502138579984, + "grad_norm": 1.7124522563822497, + "learning_rate": 7.864833680713256e-06, + "loss": 0.0593, + "step": 1146 + }, + { + "epoch": 0.3270601653835187, + "grad_norm": 1.2283635139885853, + "learning_rate": 7.861047127962099e-06, + "loss": 0.0321, + "step": 1147 + }, + { + "epoch": 0.3273453093812375, + "grad_norm": 1.6249903419870981, + "learning_rate": 7.857258133965405e-06, + "loss": 0.0399, + "step": 1148 + }, + { + "epoch": 0.32763045337895635, + "grad_norm": 3.053147088970876, + "learning_rate": 7.853466701956208e-06, + "loss": 0.0531, + "step": 1149 + }, + { + "epoch": 0.3279155973766752, + "grad_norm": 24.31024864362681, + "learning_rate": 7.849672835169625e-06, + "loss": 0.1359, + "step": 1150 + }, + { + "epoch": 0.3282007413743941, + "grad_norm": 1.696805065592631, + "learning_rate": 7.845876536842846e-06, + "loss": 0.0418, + "step": 1151 + }, + { + "epoch": 0.32848588537211293, + "grad_norm": 1.3445704598716866, + "learning_rate": 7.84207781021514e-06, + "loss": 0.0254, + "step": 1152 + }, + { + "epoch": 0.32877102936983177, + "grad_norm": 2.8671303035698616, + "learning_rate": 7.838276658527847e-06, + "loss": 0.0395, + "step": 1153 + }, + { + "epoch": 0.3290561733675506, + "grad_norm": 2.130710041966832, + "learning_rate": 7.834473085024373e-06, + "loss": 0.0316, + "step": 1154 + }, + { + "epoch": 0.32934131736526945, + "grad_norm": 2.6888945875232424, + "learning_rate": 7.830667092950195e-06, + "loss": 0.0586, + "step": 1155 + }, + { + "epoch": 0.3296264613629883, + "grad_norm": 1.9325581252131516, + "learning_rate": 7.826858685552851e-06, + "loss": 0.0405, + "step": 1156 + }, + { + "epoch": 0.3299116053607072, + "grad_norm": 2.1097020635632804, + "learning_rate": 7.82304786608194e-06, + "loss": 0.0237, + "step": 1157 + }, + { + "epoch": 0.330196749358426, + "grad_norm": 1.0799679655070387, + "learning_rate": 7.819234637789122e-06, + "loss": 0.0218, + "step": 1158 + }, + { + "epoch": 0.33048189335614486, + "grad_norm": 1.0940971424110126, + "learning_rate": 7.815419003928107e-06, + "loss": 0.0156, + "step": 1159 + }, + { + "epoch": 0.3307670373538637, + "grad_norm": 0.45652875771745444, + "learning_rate": 7.811600967754661e-06, + "loss": 0.018, + "step": 1160 + }, + { + "epoch": 0.33105218135158254, + "grad_norm": 1.5961724407974645, + "learning_rate": 7.807780532526604e-06, + "loss": 0.0442, + "step": 1161 + }, + { + "epoch": 0.3313373253493014, + "grad_norm": 1.5457552919670423, + "learning_rate": 7.80395770150379e-06, + "loss": 0.035, + "step": 1162 + }, + { + "epoch": 0.3316224693470202, + "grad_norm": 1.5335975806195943, + "learning_rate": 7.800132477948137e-06, + "loss": 0.0428, + "step": 1163 + }, + { + "epoch": 0.3319076133447391, + "grad_norm": 1.791656104690741, + "learning_rate": 7.796304865123583e-06, + "loss": 0.0402, + "step": 1164 + }, + { + "epoch": 0.33219275734245796, + "grad_norm": 0.8626638971656029, + "learning_rate": 7.79247486629612e-06, + "loss": 0.0207, + "step": 1165 + }, + { + "epoch": 0.3324779013401768, + "grad_norm": 1.3786813993640084, + "learning_rate": 7.788642484733773e-06, + "loss": 0.0305, + "step": 1166 + }, + { + "epoch": 0.33276304533789564, + "grad_norm": 0.9962974441495369, + "learning_rate": 7.784807723706593e-06, + "loss": 0.0369, + "step": 1167 + }, + { + "epoch": 0.3330481893356145, + "grad_norm": 0.8448490830062997, + "learning_rate": 7.780970586486668e-06, + "loss": 0.032, + "step": 1168 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 141.46865931940235, + "learning_rate": 7.777131076348115e-06, + "loss": 0.5834, + "step": 1169 + }, + { + "epoch": 0.33361847733105215, + "grad_norm": 0.6240936201732423, + "learning_rate": 7.773289196567066e-06, + "loss": 0.0202, + "step": 1170 + }, + { + "epoch": 0.33390362132877105, + "grad_norm": 1.6855103583591178, + "learning_rate": 7.76944495042169e-06, + "loss": 0.031, + "step": 1171 + }, + { + "epoch": 0.3341887653264899, + "grad_norm": 1.061334418928402, + "learning_rate": 7.76559834119216e-06, + "loss": 0.0393, + "step": 1172 + }, + { + "epoch": 0.33447390932420873, + "grad_norm": 2.3860632467930434, + "learning_rate": 7.761749372160676e-06, + "loss": 0.0546, + "step": 1173 + }, + { + "epoch": 0.33475905332192757, + "grad_norm": 0.7787717819480605, + "learning_rate": 7.757898046611446e-06, + "loss": 0.0196, + "step": 1174 + }, + { + "epoch": 0.3350441973196464, + "grad_norm": 1.1796680979025957, + "learning_rate": 7.754044367830689e-06, + "loss": 0.038, + "step": 1175 + }, + { + "epoch": 0.33532934131736525, + "grad_norm": 1.3148557580390934, + "learning_rate": 7.750188339106635e-06, + "loss": 0.0304, + "step": 1176 + }, + { + "epoch": 0.33561448531508414, + "grad_norm": 1.2054539450580688, + "learning_rate": 7.746329963729517e-06, + "loss": 0.0228, + "step": 1177 + }, + { + "epoch": 0.335899629312803, + "grad_norm": 2.0449048447789386, + "learning_rate": 7.742469244991572e-06, + "loss": 0.0661, + "step": 1178 + }, + { + "epoch": 0.3361847733105218, + "grad_norm": 0.5740583915566436, + "learning_rate": 7.738606186187034e-06, + "loss": 0.0179, + "step": 1179 + }, + { + "epoch": 0.33646991730824066, + "grad_norm": 0.6910488636543247, + "learning_rate": 7.734740790612137e-06, + "loss": 0.0277, + "step": 1180 + }, + { + "epoch": 0.3367550613059595, + "grad_norm": 1.0694033044260076, + "learning_rate": 7.730873061565101e-06, + "loss": 0.0224, + "step": 1181 + }, + { + "epoch": 0.33704020530367834, + "grad_norm": 0.790204261875235, + "learning_rate": 7.72700300234615e-06, + "loss": 0.0248, + "step": 1182 + }, + { + "epoch": 0.3373253493013972, + "grad_norm": 1.33625424302328, + "learning_rate": 7.723130616257485e-06, + "loss": 0.0274, + "step": 1183 + }, + { + "epoch": 0.3376104932991161, + "grad_norm": 1.0567353013473344, + "learning_rate": 7.719255906603298e-06, + "loss": 0.0232, + "step": 1184 + }, + { + "epoch": 0.3378956372968349, + "grad_norm": 0.30339439930941897, + "learning_rate": 7.715378876689763e-06, + "loss": 0.0076, + "step": 1185 + }, + { + "epoch": 0.33818078129455376, + "grad_norm": 0.6213587414411561, + "learning_rate": 7.711499529825032e-06, + "loss": 0.0182, + "step": 1186 + }, + { + "epoch": 0.3384659252922726, + "grad_norm": 1.6039813029461647, + "learning_rate": 7.707617869319235e-06, + "loss": 0.041, + "step": 1187 + }, + { + "epoch": 0.33875106928999144, + "grad_norm": 0.9910996874197118, + "learning_rate": 7.703733898484479e-06, + "loss": 0.0186, + "step": 1188 + }, + { + "epoch": 0.3390362132877103, + "grad_norm": 0.4274479084393529, + "learning_rate": 7.699847620634834e-06, + "loss": 0.0073, + "step": 1189 + }, + { + "epoch": 0.3393213572854291, + "grad_norm": 1.1140118331424291, + "learning_rate": 7.695959039086349e-06, + "loss": 0.0169, + "step": 1190 + }, + { + "epoch": 0.339606501283148, + "grad_norm": 0.7561760159586366, + "learning_rate": 7.692068157157032e-06, + "loss": 0.0165, + "step": 1191 + }, + { + "epoch": 0.33989164528086685, + "grad_norm": 0.5463013328367404, + "learning_rate": 7.688174978166855e-06, + "loss": 0.0118, + "step": 1192 + }, + { + "epoch": 0.3401767892785857, + "grad_norm": 1.1686541927437784, + "learning_rate": 7.684279505437754e-06, + "loss": 0.0183, + "step": 1193 + }, + { + "epoch": 0.34046193327630453, + "grad_norm": 1.5635211370198485, + "learning_rate": 7.680381742293615e-06, + "loss": 0.0548, + "step": 1194 + }, + { + "epoch": 0.34074707727402337, + "grad_norm": 1.3537452233019234, + "learning_rate": 7.676481692060284e-06, + "loss": 0.024, + "step": 1195 + }, + { + "epoch": 0.3410322212717422, + "grad_norm": 0.7266402871499255, + "learning_rate": 7.672579358065554e-06, + "loss": 0.0175, + "step": 1196 + }, + { + "epoch": 0.3413173652694611, + "grad_norm": 0.5773819066167983, + "learning_rate": 7.668674743639173e-06, + "loss": 0.0158, + "step": 1197 + }, + { + "epoch": 0.34160250926717994, + "grad_norm": 1.1945971562545137, + "learning_rate": 7.66476785211283e-06, + "loss": 0.0232, + "step": 1198 + }, + { + "epoch": 0.3418876532648988, + "grad_norm": 1.7947296038208627, + "learning_rate": 7.660858686820157e-06, + "loss": 0.0372, + "step": 1199 + }, + { + "epoch": 0.3421727972626176, + "grad_norm": 1.2339145196734935, + "learning_rate": 7.656947251096729e-06, + "loss": 0.0301, + "step": 1200 + }, + { + "epoch": 0.34245794126033646, + "grad_norm": 1.4239632584351836, + "learning_rate": 7.653033548280056e-06, + "loss": 0.0223, + "step": 1201 + }, + { + "epoch": 0.3427430852580553, + "grad_norm": 3.5661005357954902, + "learning_rate": 7.649117581709581e-06, + "loss": 0.0782, + "step": 1202 + }, + { + "epoch": 0.34302822925577414, + "grad_norm": 0.4606205342130907, + "learning_rate": 7.645199354726681e-06, + "loss": 0.008, + "step": 1203 + }, + { + "epoch": 0.34331337325349304, + "grad_norm": 1.1808740305202428, + "learning_rate": 7.641278870674664e-06, + "loss": 0.0138, + "step": 1204 + }, + { + "epoch": 0.3435985172512119, + "grad_norm": 1.181312784517819, + "learning_rate": 7.637356132898762e-06, + "loss": 0.0113, + "step": 1205 + }, + { + "epoch": 0.3438836612489307, + "grad_norm": 1.9393291910241433, + "learning_rate": 7.633431144746123e-06, + "loss": 0.0366, + "step": 1206 + }, + { + "epoch": 0.34416880524664956, + "grad_norm": 0.5655309202758987, + "learning_rate": 7.629503909565829e-06, + "loss": 0.0084, + "step": 1207 + }, + { + "epoch": 0.3444539492443684, + "grad_norm": 0.9828332776151091, + "learning_rate": 7.625574430708867e-06, + "loss": 0.0436, + "step": 1208 + }, + { + "epoch": 0.34473909324208724, + "grad_norm": 1.9944966409431035, + "learning_rate": 7.621642711528143e-06, + "loss": 0.0303, + "step": 1209 + }, + { + "epoch": 0.3450242372398061, + "grad_norm": 0.7784849227021717, + "learning_rate": 7.617708755378477e-06, + "loss": 0.0121, + "step": 1210 + }, + { + "epoch": 0.34530938123752497, + "grad_norm": 1.4230792460322752, + "learning_rate": 7.613772565616595e-06, + "loss": 0.0213, + "step": 1211 + }, + { + "epoch": 0.3455945252352438, + "grad_norm": 1.1879362008117293, + "learning_rate": 7.609834145601129e-06, + "loss": 0.0331, + "step": 1212 + }, + { + "epoch": 0.34587966923296265, + "grad_norm": 0.9145028403418679, + "learning_rate": 7.605893498692616e-06, + "loss": 0.0103, + "step": 1213 + }, + { + "epoch": 0.3461648132306815, + "grad_norm": 0.8655139809708533, + "learning_rate": 7.601950628253489e-06, + "loss": 0.023, + "step": 1214 + }, + { + "epoch": 0.34644995722840033, + "grad_norm": 1.7832506022566148, + "learning_rate": 7.598005537648082e-06, + "loss": 0.0564, + "step": 1215 + }, + { + "epoch": 0.34673510122611917, + "grad_norm": 2.1239358299514586, + "learning_rate": 7.594058230242623e-06, + "loss": 0.0366, + "step": 1216 + }, + { + "epoch": 0.34702024522383806, + "grad_norm": 0.878389399490451, + "learning_rate": 7.59010870940523e-06, + "loss": 0.0119, + "step": 1217 + }, + { + "epoch": 0.3473053892215569, + "grad_norm": 2.733265706518621, + "learning_rate": 7.58615697850591e-06, + "loss": 0.0543, + "step": 1218 + }, + { + "epoch": 0.34759053321927574, + "grad_norm": 0.7518484980410628, + "learning_rate": 7.582203040916558e-06, + "loss": 0.0129, + "step": 1219 + }, + { + "epoch": 0.3478756772169946, + "grad_norm": 0.9989613237234266, + "learning_rate": 7.578246900010948e-06, + "loss": 0.0128, + "step": 1220 + }, + { + "epoch": 0.3481608212147134, + "grad_norm": 1.0141029301162927, + "learning_rate": 7.57428855916474e-06, + "loss": 0.0225, + "step": 1221 + }, + { + "epoch": 0.34844596521243226, + "grad_norm": 1.820213625141049, + "learning_rate": 7.5703280217554594e-06, + "loss": 0.0539, + "step": 1222 + }, + { + "epoch": 0.3487311092101511, + "grad_norm": 0.4916304901754792, + "learning_rate": 7.566365291162523e-06, + "loss": 0.0087, + "step": 1223 + }, + { + "epoch": 0.34901625320787, + "grad_norm": 1.5289568499969866, + "learning_rate": 7.5624003707672036e-06, + "loss": 0.0262, + "step": 1224 + }, + { + "epoch": 0.34930139720558884, + "grad_norm": 0.6704716345946136, + "learning_rate": 7.558433263952652e-06, + "loss": 0.0263, + "step": 1225 + }, + { + "epoch": 0.3495865412033077, + "grad_norm": 0.8476756802724102, + "learning_rate": 7.554463974103876e-06, + "loss": 0.0091, + "step": 1226 + }, + { + "epoch": 0.3498716852010265, + "grad_norm": 0.5944495719636592, + "learning_rate": 7.5504925046077596e-06, + "loss": 0.0154, + "step": 1227 + }, + { + "epoch": 0.35015682919874536, + "grad_norm": 1.545818848015318, + "learning_rate": 7.546518858853032e-06, + "loss": 0.0341, + "step": 1228 + }, + { + "epoch": 0.3504419731964642, + "grad_norm": 1.9221079880766434, + "learning_rate": 7.542543040230287e-06, + "loss": 0.0332, + "step": 1229 + }, + { + "epoch": 0.35072711719418304, + "grad_norm": 1.9442964587621157, + "learning_rate": 7.538565052131972e-06, + "loss": 0.0424, + "step": 1230 + }, + { + "epoch": 0.35101226119190193, + "grad_norm": 2.1530989721825544, + "learning_rate": 7.534584897952385e-06, + "loss": 0.0644, + "step": 1231 + }, + { + "epoch": 0.35129740518962077, + "grad_norm": 1.0592053494006788, + "learning_rate": 7.530602581087672e-06, + "loss": 0.0418, + "step": 1232 + }, + { + "epoch": 0.3515825491873396, + "grad_norm": 2.701165505211902, + "learning_rate": 7.526618104935824e-06, + "loss": 0.0423, + "step": 1233 + }, + { + "epoch": 0.35186769318505845, + "grad_norm": 1.9681046663599653, + "learning_rate": 7.522631472896671e-06, + "loss": 0.0292, + "step": 1234 + }, + { + "epoch": 0.3521528371827773, + "grad_norm": 1.0834902051848114, + "learning_rate": 7.518642688371893e-06, + "loss": 0.0165, + "step": 1235 + }, + { + "epoch": 0.35243798118049613, + "grad_norm": 0.729111577118215, + "learning_rate": 7.514651754764996e-06, + "loss": 0.0113, + "step": 1236 + }, + { + "epoch": 0.352723125178215, + "grad_norm": 0.9677835962247436, + "learning_rate": 7.510658675481324e-06, + "loss": 0.0089, + "step": 1237 + }, + { + "epoch": 0.35300826917593386, + "grad_norm": 1.7687113644828218, + "learning_rate": 7.5066634539280524e-06, + "loss": 0.0368, + "step": 1238 + }, + { + "epoch": 0.3532934131736527, + "grad_norm": 0.8467747355722813, + "learning_rate": 7.502666093514184e-06, + "loss": 0.0212, + "step": 1239 + }, + { + "epoch": 0.35357855717137154, + "grad_norm": 1.1715828228303156, + "learning_rate": 7.498666597650544e-06, + "loss": 0.0316, + "step": 1240 + }, + { + "epoch": 0.3538637011690904, + "grad_norm": 0.9081021165247576, + "learning_rate": 7.494664969749785e-06, + "loss": 0.0227, + "step": 1241 + }, + { + "epoch": 0.3541488451668092, + "grad_norm": 1.1369108712808864, + "learning_rate": 7.490661213226374e-06, + "loss": 0.0359, + "step": 1242 + }, + { + "epoch": 0.35443398916452806, + "grad_norm": 2.0527453246581557, + "learning_rate": 7.486655331496597e-06, + "loss": 0.0355, + "step": 1243 + }, + { + "epoch": 0.35471913316224696, + "grad_norm": 1.0630257009017088, + "learning_rate": 7.482647327978551e-06, + "loss": 0.022, + "step": 1244 + }, + { + "epoch": 0.3550042771599658, + "grad_norm": 1.1552022033934928, + "learning_rate": 7.478637206092147e-06, + "loss": 0.0244, + "step": 1245 + }, + { + "epoch": 0.35528942115768464, + "grad_norm": 1.2041123400974558, + "learning_rate": 7.474624969259101e-06, + "loss": 0.0311, + "step": 1246 + }, + { + "epoch": 0.3555745651554035, + "grad_norm": 0.6912486293307928, + "learning_rate": 7.470610620902934e-06, + "loss": 0.016, + "step": 1247 + }, + { + "epoch": 0.3558597091531223, + "grad_norm": 0.7069761757438487, + "learning_rate": 7.466594164448967e-06, + "loss": 0.0121, + "step": 1248 + }, + { + "epoch": 0.35614485315084116, + "grad_norm": 2.406861730589254, + "learning_rate": 7.462575603324325e-06, + "loss": 0.0573, + "step": 1249 + }, + { + "epoch": 0.35642999714856, + "grad_norm": 1.073491482084253, + "learning_rate": 7.458554940957922e-06, + "loss": 0.0285, + "step": 1250 + }, + { + "epoch": 0.3567151411462789, + "grad_norm": 1.4809174105565874, + "learning_rate": 7.45453218078047e-06, + "loss": 0.0187, + "step": 1251 + }, + { + "epoch": 0.35700028514399773, + "grad_norm": 2.5263068063454104, + "learning_rate": 7.450507326224469e-06, + "loss": 0.0346, + "step": 1252 + }, + { + "epoch": 0.35728542914171657, + "grad_norm": 1.0743279609219285, + "learning_rate": 7.446480380724208e-06, + "loss": 0.015, + "step": 1253 + }, + { + "epoch": 0.3575705731394354, + "grad_norm": 1.2267919789411827, + "learning_rate": 7.442451347715758e-06, + "loss": 0.041, + "step": 1254 + }, + { + "epoch": 0.35785571713715425, + "grad_norm": 1.0637373721906973, + "learning_rate": 7.438420230636969e-06, + "loss": 0.0161, + "step": 1255 + }, + { + "epoch": 0.3581408611348731, + "grad_norm": 1.8824790740102795, + "learning_rate": 7.434387032927475e-06, + "loss": 0.0358, + "step": 1256 + }, + { + "epoch": 0.358426005132592, + "grad_norm": 1.2092687912850297, + "learning_rate": 7.430351758028682e-06, + "loss": 0.0301, + "step": 1257 + }, + { + "epoch": 0.3587111491303108, + "grad_norm": 0.6835211094012288, + "learning_rate": 7.426314409383768e-06, + "loss": 0.0062, + "step": 1258 + }, + { + "epoch": 0.35899629312802966, + "grad_norm": 1.3255750122183771, + "learning_rate": 7.422274990437682e-06, + "loss": 0.0318, + "step": 1259 + }, + { + "epoch": 0.3592814371257485, + "grad_norm": 0.9156502566543411, + "learning_rate": 7.418233504637138e-06, + "loss": 0.0143, + "step": 1260 + }, + { + "epoch": 0.35956658112346734, + "grad_norm": 1.1727165754738422, + "learning_rate": 7.414189955430615e-06, + "loss": 0.0369, + "step": 1261 + }, + { + "epoch": 0.3598517251211862, + "grad_norm": 1.4468665217354895, + "learning_rate": 7.410144346268351e-06, + "loss": 0.0417, + "step": 1262 + }, + { + "epoch": 0.360136869118905, + "grad_norm": 1.824457712002065, + "learning_rate": 7.4060966806023445e-06, + "loss": 0.0353, + "step": 1263 + }, + { + "epoch": 0.3604220131166239, + "grad_norm": 0.9767535191086671, + "learning_rate": 7.4020469618863455e-06, + "loss": 0.021, + "step": 1264 + }, + { + "epoch": 0.36070715711434276, + "grad_norm": 1.079593807537144, + "learning_rate": 7.3979951935758596e-06, + "loss": 0.0178, + "step": 1265 + }, + { + "epoch": 0.3609923011120616, + "grad_norm": 1.6139308826860321, + "learning_rate": 7.393941379128136e-06, + "loss": 0.0293, + "step": 1266 + }, + { + "epoch": 0.36127744510978044, + "grad_norm": 1.1527677219155397, + "learning_rate": 7.3898855220021734e-06, + "loss": 0.032, + "step": 1267 + }, + { + "epoch": 0.3615625891074993, + "grad_norm": 1.5856659207811066, + "learning_rate": 7.385827625658713e-06, + "loss": 0.0261, + "step": 1268 + }, + { + "epoch": 0.3618477331052181, + "grad_norm": 1.086362153999573, + "learning_rate": 7.3817676935602376e-06, + "loss": 0.0128, + "step": 1269 + }, + { + "epoch": 0.36213287710293696, + "grad_norm": 1.6988841433717032, + "learning_rate": 7.377705729170962e-06, + "loss": 0.0371, + "step": 1270 + }, + { + "epoch": 0.36241802110065585, + "grad_norm": 1.4325229855389487, + "learning_rate": 7.373641735956843e-06, + "loss": 0.0227, + "step": 1271 + }, + { + "epoch": 0.3627031650983747, + "grad_norm": 0.8830703221340672, + "learning_rate": 7.369575717385557e-06, + "loss": 0.019, + "step": 1272 + }, + { + "epoch": 0.36298830909609353, + "grad_norm": 2.209036557846552, + "learning_rate": 7.365507676926523e-06, + "loss": 0.0346, + "step": 1273 + }, + { + "epoch": 0.36327345309381237, + "grad_norm": 1.5334503346203225, + "learning_rate": 7.361437618050873e-06, + "loss": 0.0407, + "step": 1274 + }, + { + "epoch": 0.3635585970915312, + "grad_norm": 0.8070701947902329, + "learning_rate": 7.3573655442314674e-06, + "loss": 0.0324, + "step": 1275 + }, + { + "epoch": 0.36384374108925005, + "grad_norm": 1.430194693309841, + "learning_rate": 7.353291458942884e-06, + "loss": 0.0196, + "step": 1276 + }, + { + "epoch": 0.36412888508696895, + "grad_norm": 0.3562037606493946, + "learning_rate": 7.349215365661417e-06, + "loss": 0.0126, + "step": 1277 + }, + { + "epoch": 0.3644140290846878, + "grad_norm": 1.1955582337051391, + "learning_rate": 7.345137267865075e-06, + "loss": 0.0287, + "step": 1278 + }, + { + "epoch": 0.3646991730824066, + "grad_norm": 1.5568689189039489, + "learning_rate": 7.341057169033576e-06, + "loss": 0.0283, + "step": 1279 + }, + { + "epoch": 0.36498431708012546, + "grad_norm": 1.3478008946354623, + "learning_rate": 7.336975072648346e-06, + "loss": 0.0288, + "step": 1280 + }, + { + "epoch": 0.3652694610778443, + "grad_norm": 1.0580096876065161, + "learning_rate": 7.332890982192514e-06, + "loss": 0.0166, + "step": 1281 + }, + { + "epoch": 0.36555460507556314, + "grad_norm": 0.5686865684902279, + "learning_rate": 7.328804901150914e-06, + "loss": 0.0116, + "step": 1282 + }, + { + "epoch": 0.365839749073282, + "grad_norm": 1.716366052959552, + "learning_rate": 7.324716833010074e-06, + "loss": 0.0467, + "step": 1283 + }, + { + "epoch": 0.3661248930710009, + "grad_norm": 0.7791990463845944, + "learning_rate": 7.32062678125822e-06, + "loss": 0.0111, + "step": 1284 + }, + { + "epoch": 0.3664100370687197, + "grad_norm": 1.3893700452732172, + "learning_rate": 7.316534749385272e-06, + "loss": 0.0194, + "step": 1285 + }, + { + "epoch": 0.36669518106643856, + "grad_norm": 1.5762832721071067, + "learning_rate": 7.312440740882836e-06, + "loss": 0.0238, + "step": 1286 + }, + { + "epoch": 0.3669803250641574, + "grad_norm": 1.0209692180493717, + "learning_rate": 7.308344759244208e-06, + "loss": 0.0154, + "step": 1287 + }, + { + "epoch": 0.36726546906187624, + "grad_norm": 0.3701931947426716, + "learning_rate": 7.304246807964363e-06, + "loss": 0.0061, + "step": 1288 + }, + { + "epoch": 0.3675506130595951, + "grad_norm": 0.9193845325767704, + "learning_rate": 7.300146890539962e-06, + "loss": 0.0277, + "step": 1289 + }, + { + "epoch": 0.3678357570573139, + "grad_norm": 1.8222398498763657, + "learning_rate": 7.2960450104693415e-06, + "loss": 0.04, + "step": 1290 + }, + { + "epoch": 0.3681209010550328, + "grad_norm": 0.643942314383188, + "learning_rate": 7.291941171252512e-06, + "loss": 0.0084, + "step": 1291 + }, + { + "epoch": 0.36840604505275165, + "grad_norm": 2.2276882078911346, + "learning_rate": 7.287835376391157e-06, + "loss": 0.0351, + "step": 1292 + }, + { + "epoch": 0.3686911890504705, + "grad_norm": 1.0734551871869193, + "learning_rate": 7.283727629388628e-06, + "loss": 0.0255, + "step": 1293 + }, + { + "epoch": 0.36897633304818933, + "grad_norm": 0.9919685813925658, + "learning_rate": 7.279617933749937e-06, + "loss": 0.0188, + "step": 1294 + }, + { + "epoch": 0.36926147704590817, + "grad_norm": 1.6546322046981001, + "learning_rate": 7.275506292981771e-06, + "loss": 0.032, + "step": 1295 + }, + { + "epoch": 0.369546621043627, + "grad_norm": 0.6427488874964754, + "learning_rate": 7.271392710592466e-06, + "loss": 0.0183, + "step": 1296 + }, + { + "epoch": 0.3698317650413459, + "grad_norm": 2.1105811666825605, + "learning_rate": 7.2672771900920195e-06, + "loss": 0.0342, + "step": 1297 + }, + { + "epoch": 0.37011690903906475, + "grad_norm": 1.7984565917708282, + "learning_rate": 7.263159734992079e-06, + "loss": 0.0358, + "step": 1298 + }, + { + "epoch": 0.3704020530367836, + "grad_norm": 2.0892827974495973, + "learning_rate": 7.259040348805948e-06, + "loss": 0.0637, + "step": 1299 + }, + { + "epoch": 0.3706871970345024, + "grad_norm": 1.9245223100243691, + "learning_rate": 7.2549190350485734e-06, + "loss": 0.0256, + "step": 1300 + }, + { + "epoch": 0.37097234103222126, + "grad_norm": 1.332108407780507, + "learning_rate": 7.250795797236549e-06, + "loss": 0.0288, + "step": 1301 + }, + { + "epoch": 0.3712574850299401, + "grad_norm": 2.832303589679555, + "learning_rate": 7.246670638888109e-06, + "loss": 0.0746, + "step": 1302 + }, + { + "epoch": 0.37154262902765894, + "grad_norm": 1.105425246209834, + "learning_rate": 7.242543563523128e-06, + "loss": 0.0386, + "step": 1303 + }, + { + "epoch": 0.37182777302537784, + "grad_norm": 1.228230849256297, + "learning_rate": 7.238414574663115e-06, + "loss": 0.0233, + "step": 1304 + }, + { + "epoch": 0.3721129170230967, + "grad_norm": 1.4222277233624963, + "learning_rate": 7.234283675831212e-06, + "loss": 0.0163, + "step": 1305 + }, + { + "epoch": 0.3723980610208155, + "grad_norm": 1.6737164988568887, + "learning_rate": 7.230150870552191e-06, + "loss": 0.0436, + "step": 1306 + }, + { + "epoch": 0.37268320501853436, + "grad_norm": 0.6672570457070205, + "learning_rate": 7.22601616235245e-06, + "loss": 0.0154, + "step": 1307 + }, + { + "epoch": 0.3729683490162532, + "grad_norm": 0.5059774786468529, + "learning_rate": 7.221879554760012e-06, + "loss": 0.0119, + "step": 1308 + }, + { + "epoch": 0.37325349301397204, + "grad_norm": 2.10429975207149, + "learning_rate": 7.217741051304519e-06, + "loss": 0.0672, + "step": 1309 + }, + { + "epoch": 0.3735386370116909, + "grad_norm": 2.0092255665050374, + "learning_rate": 7.213600655517233e-06, + "loss": 0.0542, + "step": 1310 + }, + { + "epoch": 0.3738237810094098, + "grad_norm": 1.352010356984478, + "learning_rate": 7.209458370931029e-06, + "loss": 0.0142, + "step": 1311 + }, + { + "epoch": 0.3741089250071286, + "grad_norm": 1.3811098769189896, + "learning_rate": 7.205314201080392e-06, + "loss": 0.0243, + "step": 1312 + }, + { + "epoch": 0.37439406900484745, + "grad_norm": 1.1209070881289154, + "learning_rate": 7.201168149501421e-06, + "loss": 0.0389, + "step": 1313 + }, + { + "epoch": 0.3746792130025663, + "grad_norm": 2.065215176648648, + "learning_rate": 7.197020219731814e-06, + "loss": 0.0266, + "step": 1314 + }, + { + "epoch": 0.37496435700028513, + "grad_norm": 2.5480094718136703, + "learning_rate": 7.192870415310877e-06, + "loss": 0.0328, + "step": 1315 + }, + { + "epoch": 0.37524950099800397, + "grad_norm": 0.7872176078118113, + "learning_rate": 7.188718739779511e-06, + "loss": 0.0139, + "step": 1316 + }, + { + "epoch": 0.37553464499572287, + "grad_norm": 0.6808650456171224, + "learning_rate": 7.1845651966802184e-06, + "loss": 0.0234, + "step": 1317 + }, + { + "epoch": 0.3758197889934417, + "grad_norm": 0.49101205287280486, + "learning_rate": 7.1804097895570924e-06, + "loss": 0.0178, + "step": 1318 + }, + { + "epoch": 0.37610493299116055, + "grad_norm": 1.0821122693778222, + "learning_rate": 7.176252521955817e-06, + "loss": 0.025, + "step": 1319 + }, + { + "epoch": 0.3763900769888794, + "grad_norm": 1.6304437741369961, + "learning_rate": 7.172093397423663e-06, + "loss": 0.0422, + "step": 1320 + }, + { + "epoch": 0.3766752209865982, + "grad_norm": 0.7386475855241287, + "learning_rate": 7.167932419509485e-06, + "loss": 0.0249, + "step": 1321 + }, + { + "epoch": 0.37696036498431706, + "grad_norm": 1.2169154892764618, + "learning_rate": 7.163769591763723e-06, + "loss": 0.045, + "step": 1322 + }, + { + "epoch": 0.3772455089820359, + "grad_norm": 1.780773790899091, + "learning_rate": 7.159604917738392e-06, + "loss": 0.0457, + "step": 1323 + }, + { + "epoch": 0.3775306529797548, + "grad_norm": 1.3352312459837943, + "learning_rate": 7.155438400987083e-06, + "loss": 0.0429, + "step": 1324 + }, + { + "epoch": 0.37781579697747364, + "grad_norm": 1.1305902394114138, + "learning_rate": 7.151270045064958e-06, + "loss": 0.0341, + "step": 1325 + }, + { + "epoch": 0.3781009409751925, + "grad_norm": 1.126670135618066, + "learning_rate": 7.147099853528753e-06, + "loss": 0.0199, + "step": 1326 + }, + { + "epoch": 0.3783860849729113, + "grad_norm": 1.137621576993765, + "learning_rate": 7.142927829936766e-06, + "loss": 0.0518, + "step": 1327 + }, + { + "epoch": 0.37867122897063016, + "grad_norm": 1.1773282692234626, + "learning_rate": 7.138753977848858e-06, + "loss": 0.0279, + "step": 1328 + }, + { + "epoch": 0.378956372968349, + "grad_norm": 0.5840674438803083, + "learning_rate": 7.134578300826452e-06, + "loss": 0.0224, + "step": 1329 + }, + { + "epoch": 0.37924151696606784, + "grad_norm": 1.1129186349385658, + "learning_rate": 7.130400802432529e-06, + "loss": 0.0322, + "step": 1330 + }, + { + "epoch": 0.37952666096378673, + "grad_norm": 1.8462980707500372, + "learning_rate": 7.12622148623162e-06, + "loss": 0.0342, + "step": 1331 + }, + { + "epoch": 0.3798118049615056, + "grad_norm": 2.107804657803562, + "learning_rate": 7.122040355789815e-06, + "loss": 0.0427, + "step": 1332 + }, + { + "epoch": 0.3800969489592244, + "grad_norm": 0.7450612627575915, + "learning_rate": 7.117857414674741e-06, + "loss": 0.0197, + "step": 1333 + }, + { + "epoch": 0.38038209295694325, + "grad_norm": 1.1691762047073135, + "learning_rate": 7.11367266645558e-06, + "loss": 0.0276, + "step": 1334 + }, + { + "epoch": 0.3806672369546621, + "grad_norm": 1.0971833274442095, + "learning_rate": 7.1094861147030514e-06, + "loss": 0.0326, + "step": 1335 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 1.5810742872190795, + "learning_rate": 7.105297762989413e-06, + "loss": 0.038, + "step": 1336 + }, + { + "epoch": 0.3812375249500998, + "grad_norm": 1.801470439156759, + "learning_rate": 7.101107614888458e-06, + "loss": 0.0214, + "step": 1337 + }, + { + "epoch": 0.38152266894781867, + "grad_norm": 0.7796639176297265, + "learning_rate": 7.096915673975517e-06, + "loss": 0.0145, + "step": 1338 + }, + { + "epoch": 0.3818078129455375, + "grad_norm": 0.3858558390410885, + "learning_rate": 7.092721943827446e-06, + "loss": 0.0105, + "step": 1339 + }, + { + "epoch": 0.38209295694325635, + "grad_norm": 1.033014186947543, + "learning_rate": 7.088526428022628e-06, + "loss": 0.0281, + "step": 1340 + }, + { + "epoch": 0.3823781009409752, + "grad_norm": 1.9224999850785538, + "learning_rate": 7.084329130140972e-06, + "loss": 0.0465, + "step": 1341 + }, + { + "epoch": 0.382663244938694, + "grad_norm": 1.6708579732466118, + "learning_rate": 7.080130053763906e-06, + "loss": 0.0379, + "step": 1342 + }, + { + "epoch": 0.38294838893641286, + "grad_norm": 0.8987241866315161, + "learning_rate": 7.075929202474374e-06, + "loss": 0.0251, + "step": 1343 + }, + { + "epoch": 0.38323353293413176, + "grad_norm": 1.339937928755385, + "learning_rate": 7.071726579856838e-06, + "loss": 0.0561, + "step": 1344 + }, + { + "epoch": 0.3835186769318506, + "grad_norm": 0.958679031321206, + "learning_rate": 7.067522189497269e-06, + "loss": 0.041, + "step": 1345 + }, + { + "epoch": 0.38380382092956944, + "grad_norm": 1.3336443045217854, + "learning_rate": 7.063316034983146e-06, + "loss": 0.0176, + "step": 1346 + }, + { + "epoch": 0.3840889649272883, + "grad_norm": 0.5339713884516041, + "learning_rate": 7.059108119903455e-06, + "loss": 0.0127, + "step": 1347 + }, + { + "epoch": 0.3843741089250071, + "grad_norm": 0.4258851843158557, + "learning_rate": 7.054898447848684e-06, + "loss": 0.0169, + "step": 1348 + }, + { + "epoch": 0.38465925292272596, + "grad_norm": 1.1816276456155221, + "learning_rate": 7.050687022410819e-06, + "loss": 0.0324, + "step": 1349 + }, + { + "epoch": 0.3849443969204448, + "grad_norm": 0.8946255755547217, + "learning_rate": 7.0464738471833436e-06, + "loss": 0.0326, + "step": 1350 + }, + { + "epoch": 0.3852295409181637, + "grad_norm": 1.1202911450635167, + "learning_rate": 7.042258925761233e-06, + "loss": 0.0228, + "step": 1351 + }, + { + "epoch": 0.38551468491588253, + "grad_norm": 1.6902455592606156, + "learning_rate": 7.038042261740952e-06, + "loss": 0.0473, + "step": 1352 + }, + { + "epoch": 0.3857998289136014, + "grad_norm": 0.5948513631594184, + "learning_rate": 7.033823858720454e-06, + "loss": 0.02, + "step": 1353 + }, + { + "epoch": 0.3860849729113202, + "grad_norm": 1.7951572919018244, + "learning_rate": 7.029603720299178e-06, + "loss": 0.0324, + "step": 1354 + }, + { + "epoch": 0.38637011690903905, + "grad_norm": 2.0429468456038555, + "learning_rate": 7.025381850078037e-06, + "loss": 0.0345, + "step": 1355 + }, + { + "epoch": 0.3866552609067579, + "grad_norm": 0.6318621960506157, + "learning_rate": 7.021158251659429e-06, + "loss": 0.0163, + "step": 1356 + }, + { + "epoch": 0.3869404049044768, + "grad_norm": 1.4457221608558466, + "learning_rate": 7.0169329286472235e-06, + "loss": 0.0377, + "step": 1357 + }, + { + "epoch": 0.3872255489021956, + "grad_norm": 1.01032061892398, + "learning_rate": 7.01270588464676e-06, + "loss": 0.0219, + "step": 1358 + }, + { + "epoch": 0.38751069289991447, + "grad_norm": 0.748496429252588, + "learning_rate": 7.008477123264849e-06, + "loss": 0.0324, + "step": 1359 + }, + { + "epoch": 0.3877958368976333, + "grad_norm": 1.1526332961541026, + "learning_rate": 7.004246648109765e-06, + "loss": 0.0254, + "step": 1360 + }, + { + "epoch": 0.38808098089535215, + "grad_norm": 1.4294101465113094, + "learning_rate": 7.000014462791245e-06, + "loss": 0.024, + "step": 1361 + }, + { + "epoch": 0.388366124893071, + "grad_norm": 0.5519193648295497, + "learning_rate": 6.995780570920488e-06, + "loss": 0.0244, + "step": 1362 + }, + { + "epoch": 0.3886512688907898, + "grad_norm": 1.054024957106729, + "learning_rate": 6.991544976110144e-06, + "loss": 0.0187, + "step": 1363 + }, + { + "epoch": 0.3889364128885087, + "grad_norm": 1.2014758092356506, + "learning_rate": 6.98730768197432e-06, + "loss": 0.0396, + "step": 1364 + }, + { + "epoch": 0.38922155688622756, + "grad_norm": 0.7448263297295125, + "learning_rate": 6.9830686921285724e-06, + "loss": 0.0155, + "step": 1365 + }, + { + "epoch": 0.3895067008839464, + "grad_norm": 0.844577177735389, + "learning_rate": 6.978828010189903e-06, + "loss": 0.0129, + "step": 1366 + }, + { + "epoch": 0.38979184488166524, + "grad_norm": 0.818598787369236, + "learning_rate": 6.974585639776757e-06, + "loss": 0.0155, + "step": 1367 + }, + { + "epoch": 0.3900769888793841, + "grad_norm": 0.6332227156256431, + "learning_rate": 6.970341584509025e-06, + "loss": 0.0143, + "step": 1368 + }, + { + "epoch": 0.3903621328771029, + "grad_norm": 1.3864888311347223, + "learning_rate": 6.966095848008028e-06, + "loss": 0.0232, + "step": 1369 + }, + { + "epoch": 0.39064727687482176, + "grad_norm": 0.807302899981098, + "learning_rate": 6.9618484338965274e-06, + "loss": 0.0136, + "step": 1370 + }, + { + "epoch": 0.39093242087254065, + "grad_norm": 0.7532233628664813, + "learning_rate": 6.957599345798714e-06, + "loss": 0.0121, + "step": 1371 + }, + { + "epoch": 0.3912175648702595, + "grad_norm": 0.8057671659634474, + "learning_rate": 6.953348587340205e-06, + "loss": 0.0078, + "step": 1372 + }, + { + "epoch": 0.39150270886797833, + "grad_norm": 1.6056173303746508, + "learning_rate": 6.949096162148048e-06, + "loss": 0.0255, + "step": 1373 + }, + { + "epoch": 0.3917878528656972, + "grad_norm": 0.921091363738595, + "learning_rate": 6.944842073850709e-06, + "loss": 0.0411, + "step": 1374 + }, + { + "epoch": 0.392072996863416, + "grad_norm": 1.0604866928198136, + "learning_rate": 6.94058632607807e-06, + "loss": 0.0129, + "step": 1375 + }, + { + "epoch": 0.39235814086113485, + "grad_norm": 1.0589721417569484, + "learning_rate": 6.9363289224614395e-06, + "loss": 0.0358, + "step": 1376 + }, + { + "epoch": 0.39264328485885375, + "grad_norm": 0.7464112469258565, + "learning_rate": 6.932069866633524e-06, + "loss": 0.015, + "step": 1377 + }, + { + "epoch": 0.3929284288565726, + "grad_norm": 0.911581766103546, + "learning_rate": 6.927809162228456e-06, + "loss": 0.0067, + "step": 1378 + }, + { + "epoch": 0.3932135728542914, + "grad_norm": 1.9277059427524286, + "learning_rate": 6.923546812881759e-06, + "loss": 0.0528, + "step": 1379 + }, + { + "epoch": 0.39349871685201027, + "grad_norm": 1.1879086450728118, + "learning_rate": 6.919282822230372e-06, + "loss": 0.0475, + "step": 1380 + }, + { + "epoch": 0.3937838608497291, + "grad_norm": 1.0922499341160856, + "learning_rate": 6.91501719391263e-06, + "loss": 0.0235, + "step": 1381 + }, + { + "epoch": 0.39406900484744795, + "grad_norm": 1.0937840797322815, + "learning_rate": 6.910749931568265e-06, + "loss": 0.0235, + "step": 1382 + }, + { + "epoch": 0.3943541488451668, + "grad_norm": 0.7304053983961637, + "learning_rate": 6.906481038838401e-06, + "loss": 0.0258, + "step": 1383 + }, + { + "epoch": 0.3946392928428857, + "grad_norm": 0.937712719820358, + "learning_rate": 6.902210519365561e-06, + "loss": 0.0246, + "step": 1384 + }, + { + "epoch": 0.3949244368406045, + "grad_norm": 1.4809251423169405, + "learning_rate": 6.897938376793646e-06, + "loss": 0.0474, + "step": 1385 + }, + { + "epoch": 0.39520958083832336, + "grad_norm": 0.6276884351320375, + "learning_rate": 6.89366461476795e-06, + "loss": 0.0167, + "step": 1386 + }, + { + "epoch": 0.3954947248360422, + "grad_norm": 1.567668460942998, + "learning_rate": 6.889389236935145e-06, + "loss": 0.0301, + "step": 1387 + }, + { + "epoch": 0.39577986883376104, + "grad_norm": 1.7706303043509528, + "learning_rate": 6.885112246943282e-06, + "loss": 0.0306, + "step": 1388 + }, + { + "epoch": 0.3960650128314799, + "grad_norm": 1.4185752918637853, + "learning_rate": 6.880833648441788e-06, + "loss": 0.0288, + "step": 1389 + }, + { + "epoch": 0.3963501568291987, + "grad_norm": 1.8432413219321235, + "learning_rate": 6.876553445081463e-06, + "loss": 0.0333, + "step": 1390 + }, + { + "epoch": 0.3966353008269176, + "grad_norm": 0.7805065400041529, + "learning_rate": 6.872271640514475e-06, + "loss": 0.0211, + "step": 1391 + }, + { + "epoch": 0.39692044482463645, + "grad_norm": 1.4299986011501835, + "learning_rate": 6.867988238394361e-06, + "loss": 0.018, + "step": 1392 + }, + { + "epoch": 0.3972055888223553, + "grad_norm": 0.8738977526847553, + "learning_rate": 6.863703242376016e-06, + "loss": 0.0213, + "step": 1393 + }, + { + "epoch": 0.39749073282007413, + "grad_norm": 1.883407055792336, + "learning_rate": 6.859416656115702e-06, + "loss": 0.0263, + "step": 1394 + }, + { + "epoch": 0.397775876817793, + "grad_norm": 0.42498552122724303, + "learning_rate": 6.855128483271033e-06, + "loss": 0.0159, + "step": 1395 + }, + { + "epoch": 0.3980610208155118, + "grad_norm": 0.6640145903003858, + "learning_rate": 6.850838727500977e-06, + "loss": 0.0241, + "step": 1396 + }, + { + "epoch": 0.3983461648132307, + "grad_norm": 1.7915114587711756, + "learning_rate": 6.846547392465854e-06, + "loss": 0.0256, + "step": 1397 + }, + { + "epoch": 0.39863130881094955, + "grad_norm": 2.294360606894688, + "learning_rate": 6.8422544818273336e-06, + "loss": 0.0479, + "step": 1398 + }, + { + "epoch": 0.3989164528086684, + "grad_norm": 0.8371789914473499, + "learning_rate": 6.837959999248423e-06, + "loss": 0.019, + "step": 1399 + }, + { + "epoch": 0.3992015968063872, + "grad_norm": 0.7596077582888351, + "learning_rate": 6.833663948393479e-06, + "loss": 0.0183, + "step": 1400 + }, + { + "epoch": 0.39948674080410607, + "grad_norm": 1.3500494545169412, + "learning_rate": 6.829366332928191e-06, + "loss": 0.0322, + "step": 1401 + }, + { + "epoch": 0.3997718848018249, + "grad_norm": 0.5127260480269076, + "learning_rate": 6.825067156519584e-06, + "loss": 0.0158, + "step": 1402 + }, + { + "epoch": 0.40005702879954375, + "grad_norm": 0.6087593590841202, + "learning_rate": 6.82076642283602e-06, + "loss": 0.01, + "step": 1403 + }, + { + "epoch": 0.40034217279726264, + "grad_norm": 1.174996634641959, + "learning_rate": 6.816464135547183e-06, + "loss": 0.0217, + "step": 1404 + }, + { + "epoch": 0.4006273167949815, + "grad_norm": 0.8541388840281969, + "learning_rate": 6.812160298324083e-06, + "loss": 0.0153, + "step": 1405 + }, + { + "epoch": 0.4009124607927003, + "grad_norm": 1.0410711202292575, + "learning_rate": 6.807854914839061e-06, + "loss": 0.0274, + "step": 1406 + }, + { + "epoch": 0.40119760479041916, + "grad_norm": 1.2608003422751877, + "learning_rate": 6.803547988765765e-06, + "loss": 0.0348, + "step": 1407 + }, + { + "epoch": 0.401482748788138, + "grad_norm": 0.8195812427271472, + "learning_rate": 6.799239523779171e-06, + "loss": 0.0203, + "step": 1408 + }, + { + "epoch": 0.40176789278585684, + "grad_norm": 0.6649371018762259, + "learning_rate": 6.794929523555559e-06, + "loss": 0.0141, + "step": 1409 + }, + { + "epoch": 0.4020530367835757, + "grad_norm": 0.38933100823711925, + "learning_rate": 6.790617991772525e-06, + "loss": 0.0063, + "step": 1410 + }, + { + "epoch": 0.4023381807812946, + "grad_norm": 1.8663796814203395, + "learning_rate": 6.786304932108967e-06, + "loss": 0.0551, + "step": 1411 + }, + { + "epoch": 0.4026233247790134, + "grad_norm": 2.198862423064739, + "learning_rate": 6.781990348245092e-06, + "loss": 0.0528, + "step": 1412 + }, + { + "epoch": 0.40290846877673225, + "grad_norm": 1.5778666491396456, + "learning_rate": 6.7776742438624026e-06, + "loss": 0.0272, + "step": 1413 + }, + { + "epoch": 0.4031936127744511, + "grad_norm": 1.082839151667033, + "learning_rate": 6.773356622643703e-06, + "loss": 0.0173, + "step": 1414 + }, + { + "epoch": 0.40347875677216993, + "grad_norm": 1.2816980567733227, + "learning_rate": 6.769037488273087e-06, + "loss": 0.0382, + "step": 1415 + }, + { + "epoch": 0.4037639007698888, + "grad_norm": 1.6873641138117723, + "learning_rate": 6.764716844435944e-06, + "loss": 0.0343, + "step": 1416 + }, + { + "epoch": 0.40404904476760767, + "grad_norm": 1.5688848234236163, + "learning_rate": 6.760394694818949e-06, + "loss": 0.0323, + "step": 1417 + }, + { + "epoch": 0.4043341887653265, + "grad_norm": 0.613312664131555, + "learning_rate": 6.756071043110064e-06, + "loss": 0.0163, + "step": 1418 + }, + { + "epoch": 0.40461933276304535, + "grad_norm": 1.0497026541574568, + "learning_rate": 6.751745892998527e-06, + "loss": 0.0308, + "step": 1419 + }, + { + "epoch": 0.4049044767607642, + "grad_norm": 1.6397322996436774, + "learning_rate": 6.747419248174864e-06, + "loss": 0.0263, + "step": 1420 + }, + { + "epoch": 0.405189620758483, + "grad_norm": 0.718022101787763, + "learning_rate": 6.743091112330866e-06, + "loss": 0.0115, + "step": 1421 + }, + { + "epoch": 0.40547476475620187, + "grad_norm": 0.8207958542231343, + "learning_rate": 6.738761489159604e-06, + "loss": 0.0319, + "step": 1422 + }, + { + "epoch": 0.4057599087539207, + "grad_norm": 1.574000171294008, + "learning_rate": 6.734430382355417e-06, + "loss": 0.0293, + "step": 1423 + }, + { + "epoch": 0.4060450527516396, + "grad_norm": 1.019984530943042, + "learning_rate": 6.730097795613903e-06, + "loss": 0.0124, + "step": 1424 + }, + { + "epoch": 0.40633019674935844, + "grad_norm": 2.417307028940788, + "learning_rate": 6.725763732631933e-06, + "loss": 0.0384, + "step": 1425 + }, + { + "epoch": 0.4066153407470773, + "grad_norm": 0.7247645218622035, + "learning_rate": 6.721428197107631e-06, + "loss": 0.0126, + "step": 1426 + }, + { + "epoch": 0.4069004847447961, + "grad_norm": 1.48458332799653, + "learning_rate": 6.717091192740378e-06, + "loss": 0.0422, + "step": 1427 + }, + { + "epoch": 0.40718562874251496, + "grad_norm": 2.9745577201399938, + "learning_rate": 6.712752723230812e-06, + "loss": 0.0758, + "step": 1428 + }, + { + "epoch": 0.4074707727402338, + "grad_norm": 1.2408092239041282, + "learning_rate": 6.708412792280816e-06, + "loss": 0.0236, + "step": 1429 + }, + { + "epoch": 0.40775591673795264, + "grad_norm": 0.505712627957945, + "learning_rate": 6.704071403593524e-06, + "loss": 0.0075, + "step": 1430 + }, + { + "epoch": 0.40804106073567153, + "grad_norm": 0.8007241478639011, + "learning_rate": 6.699728560873313e-06, + "loss": 0.012, + "step": 1431 + }, + { + "epoch": 0.4083262047333904, + "grad_norm": 1.22383156743963, + "learning_rate": 6.695384267825799e-06, + "loss": 0.022, + "step": 1432 + }, + { + "epoch": 0.4086113487311092, + "grad_norm": 1.643834967167695, + "learning_rate": 6.691038528157836e-06, + "loss": 0.0297, + "step": 1433 + }, + { + "epoch": 0.40889649272882805, + "grad_norm": 0.981130245185286, + "learning_rate": 6.686691345577517e-06, + "loss": 0.0311, + "step": 1434 + }, + { + "epoch": 0.4091816367265469, + "grad_norm": 0.7535527214250041, + "learning_rate": 6.682342723794157e-06, + "loss": 0.0079, + "step": 1435 + }, + { + "epoch": 0.40946678072426573, + "grad_norm": 1.4979534384423328, + "learning_rate": 6.67799266651831e-06, + "loss": 0.0286, + "step": 1436 + }, + { + "epoch": 0.40975192472198463, + "grad_norm": 2.7428696820621137, + "learning_rate": 6.673641177461743e-06, + "loss": 0.0526, + "step": 1437 + }, + { + "epoch": 0.41003706871970347, + "grad_norm": 3.078770079875373, + "learning_rate": 6.669288260337455e-06, + "loss": 0.0906, + "step": 1438 + }, + { + "epoch": 0.4103222127174223, + "grad_norm": 1.6189746261305773, + "learning_rate": 6.66493391885966e-06, + "loss": 0.0318, + "step": 1439 + }, + { + "epoch": 0.41060735671514115, + "grad_norm": 1.2290982178336693, + "learning_rate": 6.660578156743782e-06, + "loss": 0.0205, + "step": 1440 + }, + { + "epoch": 0.41089250071286, + "grad_norm": 0.9641872975294026, + "learning_rate": 6.656220977706465e-06, + "loss": 0.0189, + "step": 1441 + }, + { + "epoch": 0.4111776447105788, + "grad_norm": 1.1312515292408134, + "learning_rate": 6.6518623854655615e-06, + "loss": 0.0288, + "step": 1442 + }, + { + "epoch": 0.41146278870829767, + "grad_norm": 0.8164142521434059, + "learning_rate": 6.647502383740123e-06, + "loss": 0.019, + "step": 1443 + }, + { + "epoch": 0.41174793270601656, + "grad_norm": 2.1749643294622114, + "learning_rate": 6.64314097625041e-06, + "loss": 0.0326, + "step": 1444 + }, + { + "epoch": 0.4120330767037354, + "grad_norm": 0.6688149744527794, + "learning_rate": 6.638778166717879e-06, + "loss": 0.0117, + "step": 1445 + }, + { + "epoch": 0.41231822070145424, + "grad_norm": 0.9144749242956802, + "learning_rate": 6.634413958865187e-06, + "loss": 0.0292, + "step": 1446 + }, + { + "epoch": 0.4126033646991731, + "grad_norm": 2.0650177025758447, + "learning_rate": 6.6300483564161775e-06, + "loss": 0.0408, + "step": 1447 + }, + { + "epoch": 0.4128885086968919, + "grad_norm": 1.5032365198506417, + "learning_rate": 6.625681363095892e-06, + "loss": 0.0305, + "step": 1448 + }, + { + "epoch": 0.41317365269461076, + "grad_norm": 1.2503228614033306, + "learning_rate": 6.621312982630551e-06, + "loss": 0.0214, + "step": 1449 + }, + { + "epoch": 0.4134587966923296, + "grad_norm": 1.5579491899758215, + "learning_rate": 6.616943218747566e-06, + "loss": 0.0208, + "step": 1450 + }, + { + "epoch": 0.4137439406900485, + "grad_norm": 0.5046986151483627, + "learning_rate": 6.612572075175521e-06, + "loss": 0.0077, + "step": 1451 + }, + { + "epoch": 0.41402908468776733, + "grad_norm": 1.3996170926428428, + "learning_rate": 6.608199555644186e-06, + "loss": 0.0335, + "step": 1452 + }, + { + "epoch": 0.4143142286854862, + "grad_norm": 1.8024425394477386, + "learning_rate": 6.603825663884497e-06, + "loss": 0.0391, + "step": 1453 + }, + { + "epoch": 0.414599372683205, + "grad_norm": 1.4905431440579864, + "learning_rate": 6.599450403628564e-06, + "loss": 0.0417, + "step": 1454 + }, + { + "epoch": 0.41488451668092385, + "grad_norm": 1.874227242654859, + "learning_rate": 6.59507377860967e-06, + "loss": 0.06, + "step": 1455 + }, + { + "epoch": 0.4151696606786427, + "grad_norm": 0.6546908200537064, + "learning_rate": 6.590695792562251e-06, + "loss": 0.0129, + "step": 1456 + }, + { + "epoch": 0.4154548046763616, + "grad_norm": 1.2440886365948998, + "learning_rate": 6.5863164492219165e-06, + "loss": 0.0125, + "step": 1457 + }, + { + "epoch": 0.41573994867408043, + "grad_norm": 0.9649143457410724, + "learning_rate": 6.5819357523254255e-06, + "loss": 0.0215, + "step": 1458 + }, + { + "epoch": 0.41602509267179927, + "grad_norm": 0.9042777652710828, + "learning_rate": 6.577553705610695e-06, + "loss": 0.0156, + "step": 1459 + }, + { + "epoch": 0.4163102366695181, + "grad_norm": 1.5328371548614659, + "learning_rate": 6.573170312816797e-06, + "loss": 0.0389, + "step": 1460 + }, + { + "epoch": 0.41659538066723695, + "grad_norm": 1.7516837901407996, + "learning_rate": 6.568785577683945e-06, + "loss": 0.0282, + "step": 1461 + }, + { + "epoch": 0.4168805246649558, + "grad_norm": 0.7279340731763488, + "learning_rate": 6.564399503953502e-06, + "loss": 0.0082, + "step": 1462 + }, + { + "epoch": 0.4171656686626746, + "grad_norm": 1.0423109263190555, + "learning_rate": 6.560012095367976e-06, + "loss": 0.0256, + "step": 1463 + }, + { + "epoch": 0.4174508126603935, + "grad_norm": 0.8846001166727955, + "learning_rate": 6.555623355671008e-06, + "loss": 0.0184, + "step": 1464 + }, + { + "epoch": 0.41773595665811236, + "grad_norm": 0.8616631964720626, + "learning_rate": 6.551233288607378e-06, + "loss": 0.0269, + "step": 1465 + }, + { + "epoch": 0.4180211006558312, + "grad_norm": 0.6610304293410116, + "learning_rate": 6.5468418979229995e-06, + "loss": 0.011, + "step": 1466 + }, + { + "epoch": 0.41830624465355004, + "grad_norm": 2.0231953278181263, + "learning_rate": 6.542449187364913e-06, + "loss": 0.0481, + "step": 1467 + }, + { + "epoch": 0.4185913886512689, + "grad_norm": 0.7272833402001642, + "learning_rate": 6.538055160681288e-06, + "loss": 0.0129, + "step": 1468 + }, + { + "epoch": 0.4188765326489877, + "grad_norm": 1.4865399155591759, + "learning_rate": 6.533659821621414e-06, + "loss": 0.0188, + "step": 1469 + }, + { + "epoch": 0.41916167664670656, + "grad_norm": 0.9858783676809323, + "learning_rate": 6.529263173935703e-06, + "loss": 0.0218, + "step": 1470 + }, + { + "epoch": 0.41944682064442546, + "grad_norm": 0.9797679454187829, + "learning_rate": 6.524865221375681e-06, + "loss": 0.0236, + "step": 1471 + }, + { + "epoch": 0.4197319646421443, + "grad_norm": 0.6172684716843654, + "learning_rate": 6.52046596769399e-06, + "loss": 0.009, + "step": 1472 + }, + { + "epoch": 0.42001710863986313, + "grad_norm": 1.7614776588873011, + "learning_rate": 6.51606541664438e-06, + "loss": 0.0498, + "step": 1473 + }, + { + "epoch": 0.420302252637582, + "grad_norm": 0.5970946705806073, + "learning_rate": 6.511663571981708e-06, + "loss": 0.0085, + "step": 1474 + }, + { + "epoch": 0.4205873966353008, + "grad_norm": 0.8599355447218311, + "learning_rate": 6.507260437461939e-06, + "loss": 0.0214, + "step": 1475 + }, + { + "epoch": 0.42087254063301965, + "grad_norm": 1.0987421015708239, + "learning_rate": 6.502856016842132e-06, + "loss": 0.0185, + "step": 1476 + }, + { + "epoch": 0.42115768463073855, + "grad_norm": 0.44275052835685647, + "learning_rate": 6.498450313880449e-06, + "loss": 0.0062, + "step": 1477 + }, + { + "epoch": 0.4214428286284574, + "grad_norm": 1.813940453890275, + "learning_rate": 6.4940433323361425e-06, + "loss": 0.0287, + "step": 1478 + }, + { + "epoch": 0.42172797262617623, + "grad_norm": 1.140392945104892, + "learning_rate": 6.489635075969558e-06, + "loss": 0.0305, + "step": 1479 + }, + { + "epoch": 0.42201311662389507, + "grad_norm": 1.8157436853386422, + "learning_rate": 6.485225548542129e-06, + "loss": 0.0561, + "step": 1480 + }, + { + "epoch": 0.4222982606216139, + "grad_norm": 2.1226592153844464, + "learning_rate": 6.4808147538163715e-06, + "loss": 0.0298, + "step": 1481 + }, + { + "epoch": 0.42258340461933275, + "grad_norm": 0.20001428446574743, + "learning_rate": 6.476402695555884e-06, + "loss": 0.005, + "step": 1482 + }, + { + "epoch": 0.4228685486170516, + "grad_norm": 2.5663295718601873, + "learning_rate": 6.471989377525344e-06, + "loss": 0.0587, + "step": 1483 + }, + { + "epoch": 0.4231536926147705, + "grad_norm": 1.0554595923300076, + "learning_rate": 6.467574803490504e-06, + "loss": 0.0356, + "step": 1484 + }, + { + "epoch": 0.4234388366124893, + "grad_norm": 1.0690511762758468, + "learning_rate": 6.463158977218185e-06, + "loss": 0.0199, + "step": 1485 + }, + { + "epoch": 0.42372398061020816, + "grad_norm": 1.1410017307638216, + "learning_rate": 6.458741902476281e-06, + "loss": 0.0301, + "step": 1486 + }, + { + "epoch": 0.424009124607927, + "grad_norm": 2.2153038532557616, + "learning_rate": 6.454323583033748e-06, + "loss": 0.036, + "step": 1487 + }, + { + "epoch": 0.42429426860564584, + "grad_norm": 1.1095267953641579, + "learning_rate": 6.449904022660604e-06, + "loss": 0.0212, + "step": 1488 + }, + { + "epoch": 0.4245794126033647, + "grad_norm": 0.8268000886152754, + "learning_rate": 6.445483225127931e-06, + "loss": 0.029, + "step": 1489 + }, + { + "epoch": 0.4248645566010835, + "grad_norm": 0.5393177997173784, + "learning_rate": 6.441061194207858e-06, + "loss": 0.0102, + "step": 1490 + }, + { + "epoch": 0.4251497005988024, + "grad_norm": 1.8543252777041668, + "learning_rate": 6.436637933673575e-06, + "loss": 0.0242, + "step": 1491 + }, + { + "epoch": 0.42543484459652126, + "grad_norm": 1.63129603581456, + "learning_rate": 6.4322134472993145e-06, + "loss": 0.0428, + "step": 1492 + }, + { + "epoch": 0.4257199885942401, + "grad_norm": 1.6335340263445486, + "learning_rate": 6.42778773886036e-06, + "loss": 0.0353, + "step": 1493 + }, + { + "epoch": 0.42600513259195893, + "grad_norm": 1.5007279654260655, + "learning_rate": 6.423360812133034e-06, + "loss": 0.0449, + "step": 1494 + }, + { + "epoch": 0.4262902765896778, + "grad_norm": 0.34770534993641233, + "learning_rate": 6.4189326708946995e-06, + "loss": 0.0123, + "step": 1495 + }, + { + "epoch": 0.4265754205873966, + "grad_norm": 0.5863596956438394, + "learning_rate": 6.414503318923757e-06, + "loss": 0.0157, + "step": 1496 + }, + { + "epoch": 0.4268605645851155, + "grad_norm": 0.476093826497124, + "learning_rate": 6.410072759999643e-06, + "loss": 0.0125, + "step": 1497 + }, + { + "epoch": 0.42714570858283435, + "grad_norm": 1.7901802911684266, + "learning_rate": 6.405640997902813e-06, + "loss": 0.0329, + "step": 1498 + }, + { + "epoch": 0.4274308525805532, + "grad_norm": 0.5017338212552557, + "learning_rate": 6.401208036414762e-06, + "loss": 0.0056, + "step": 1499 + }, + { + "epoch": 0.42771599657827203, + "grad_norm": 2.0718273104929588, + "learning_rate": 6.396773879318001e-06, + "loss": 0.0372, + "step": 1500 + }, + { + "epoch": 0.42800114057599087, + "grad_norm": 1.55254709417258, + "learning_rate": 6.392338530396065e-06, + "loss": 0.0098, + "step": 1501 + }, + { + "epoch": 0.4282862845737097, + "grad_norm": 0.8803879294604957, + "learning_rate": 6.387901993433501e-06, + "loss": 0.0137, + "step": 1502 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.9428880424761917, + "learning_rate": 6.383464272215874e-06, + "loss": 0.0186, + "step": 1503 + }, + { + "epoch": 0.42885657256914744, + "grad_norm": 1.2850260719004754, + "learning_rate": 6.379025370529755e-06, + "loss": 0.0149, + "step": 1504 + }, + { + "epoch": 0.4291417165668663, + "grad_norm": 0.4867048785280101, + "learning_rate": 6.374585292162732e-06, + "loss": 0.0168, + "step": 1505 + }, + { + "epoch": 0.4294268605645851, + "grad_norm": 0.6247035768828255, + "learning_rate": 6.370144040903385e-06, + "loss": 0.0145, + "step": 1506 + }, + { + "epoch": 0.42971200456230396, + "grad_norm": 0.3953757300162384, + "learning_rate": 6.365701620541304e-06, + "loss": 0.0056, + "step": 1507 + }, + { + "epoch": 0.4299971485600228, + "grad_norm": 0.6458629274018028, + "learning_rate": 6.361258034867071e-06, + "loss": 0.0204, + "step": 1508 + }, + { + "epoch": 0.43028229255774164, + "grad_norm": 1.7431227583997453, + "learning_rate": 6.356813287672262e-06, + "loss": 0.0472, + "step": 1509 + }, + { + "epoch": 0.4305674365554605, + "grad_norm": 1.3899155935216172, + "learning_rate": 6.352367382749448e-06, + "loss": 0.0242, + "step": 1510 + }, + { + "epoch": 0.4308525805531794, + "grad_norm": 1.2294226877678534, + "learning_rate": 6.347920323892189e-06, + "loss": 0.0272, + "step": 1511 + }, + { + "epoch": 0.4311377245508982, + "grad_norm": 1.1560872748885018, + "learning_rate": 6.343472114895022e-06, + "loss": 0.0314, + "step": 1512 + }, + { + "epoch": 0.43142286854861706, + "grad_norm": 0.2978850832967228, + "learning_rate": 6.339022759553474e-06, + "loss": 0.0098, + "step": 1513 + }, + { + "epoch": 0.4317080125463359, + "grad_norm": 1.1481971058772948, + "learning_rate": 6.334572261664041e-06, + "loss": 0.0289, + "step": 1514 + }, + { + "epoch": 0.43199315654405473, + "grad_norm": 2.2882732185225914, + "learning_rate": 6.330120625024204e-06, + "loss": 0.0508, + "step": 1515 + }, + { + "epoch": 0.4322783005417736, + "grad_norm": 0.80064981962937, + "learning_rate": 6.32566785343241e-06, + "loss": 0.0124, + "step": 1516 + }, + { + "epoch": 0.43256344453949247, + "grad_norm": 1.5961426275986392, + "learning_rate": 6.321213950688073e-06, + "loss": 0.0304, + "step": 1517 + }, + { + "epoch": 0.4328485885372113, + "grad_norm": 1.2598701107379529, + "learning_rate": 6.316758920591576e-06, + "loss": 0.0249, + "step": 1518 + }, + { + "epoch": 0.43313373253493015, + "grad_norm": 1.8114852964203785, + "learning_rate": 6.312302766944263e-06, + "loss": 0.0433, + "step": 1519 + }, + { + "epoch": 0.433418876532649, + "grad_norm": 2.1634382274709756, + "learning_rate": 6.307845493548433e-06, + "loss": 0.0384, + "step": 1520 + }, + { + "epoch": 0.43370402053036783, + "grad_norm": 1.6679111444495003, + "learning_rate": 6.303387104207347e-06, + "loss": 0.0361, + "step": 1521 + }, + { + "epoch": 0.43398916452808667, + "grad_norm": 1.0012504241438478, + "learning_rate": 6.2989276027252134e-06, + "loss": 0.0218, + "step": 1522 + }, + { + "epoch": 0.4342743085258055, + "grad_norm": 1.2371276572473315, + "learning_rate": 6.2944669929071885e-06, + "loss": 0.0268, + "step": 1523 + }, + { + "epoch": 0.4345594525235244, + "grad_norm": 0.5710678689888669, + "learning_rate": 6.29000527855938e-06, + "loss": 0.0115, + "step": 1524 + }, + { + "epoch": 0.43484459652124324, + "grad_norm": 1.0062812882252696, + "learning_rate": 6.285542463488834e-06, + "loss": 0.0141, + "step": 1525 + }, + { + "epoch": 0.4351297405189621, + "grad_norm": 1.2851888569327077, + "learning_rate": 6.2810785515035345e-06, + "loss": 0.027, + "step": 1526 + }, + { + "epoch": 0.4354148845166809, + "grad_norm": 1.4671361717954792, + "learning_rate": 6.276613546412406e-06, + "loss": 0.0416, + "step": 1527 + }, + { + "epoch": 0.43570002851439976, + "grad_norm": 1.224206846786883, + "learning_rate": 6.2721474520253e-06, + "loss": 0.0195, + "step": 1528 + }, + { + "epoch": 0.4359851725121186, + "grad_norm": 1.9344425177803088, + "learning_rate": 6.2676802721530035e-06, + "loss": 0.0291, + "step": 1529 + }, + { + "epoch": 0.43627031650983744, + "grad_norm": 0.34256835211154624, + "learning_rate": 6.263212010607226e-06, + "loss": 0.0074, + "step": 1530 + }, + { + "epoch": 0.43655546050755634, + "grad_norm": 1.4291677082822674, + "learning_rate": 6.2587426712006005e-06, + "loss": 0.0313, + "step": 1531 + }, + { + "epoch": 0.4368406045052752, + "grad_norm": 1.1108722988310464, + "learning_rate": 6.254272257746678e-06, + "loss": 0.0235, + "step": 1532 + }, + { + "epoch": 0.437125748502994, + "grad_norm": 0.876472008383419, + "learning_rate": 6.249800774059934e-06, + "loss": 0.0121, + "step": 1533 + }, + { + "epoch": 0.43741089250071286, + "grad_norm": 1.5645424818809175, + "learning_rate": 6.245328223955744e-06, + "loss": 0.0377, + "step": 1534 + }, + { + "epoch": 0.4376960364984317, + "grad_norm": 2.1968349193676375, + "learning_rate": 6.240854611250406e-06, + "loss": 0.0447, + "step": 1535 + }, + { + "epoch": 0.43798118049615054, + "grad_norm": 1.6968143052217195, + "learning_rate": 6.236379939761117e-06, + "loss": 0.0398, + "step": 1536 + }, + { + "epoch": 0.43826632449386943, + "grad_norm": 1.7686078679002104, + "learning_rate": 6.231904213305979e-06, + "loss": 0.0277, + "step": 1537 + }, + { + "epoch": 0.43855146849158827, + "grad_norm": 2.1556753958526764, + "learning_rate": 6.227427435703997e-06, + "loss": 0.0453, + "step": 1538 + }, + { + "epoch": 0.4388366124893071, + "grad_norm": 1.5287737335788831, + "learning_rate": 6.2229496107750685e-06, + "loss": 0.0547, + "step": 1539 + }, + { + "epoch": 0.43912175648702595, + "grad_norm": 1.9309123246484545, + "learning_rate": 6.218470742339987e-06, + "loss": 0.0245, + "step": 1540 + }, + { + "epoch": 0.4394069004847448, + "grad_norm": 1.1195972239606753, + "learning_rate": 6.21399083422044e-06, + "loss": 0.0373, + "step": 1541 + }, + { + "epoch": 0.43969204448246363, + "grad_norm": 0.8200635359851965, + "learning_rate": 6.2095098902389926e-06, + "loss": 0.0108, + "step": 1542 + }, + { + "epoch": 0.43997718848018247, + "grad_norm": 1.633335602846522, + "learning_rate": 6.205027914219105e-06, + "loss": 0.0248, + "step": 1543 + }, + { + "epoch": 0.44026233247790136, + "grad_norm": 0.9579796630214975, + "learning_rate": 6.20054490998511e-06, + "loss": 0.0135, + "step": 1544 + }, + { + "epoch": 0.4405474764756202, + "grad_norm": 0.9207521721967611, + "learning_rate": 6.1960608813622215e-06, + "loss": 0.0276, + "step": 1545 + }, + { + "epoch": 0.44083262047333904, + "grad_norm": 1.5832680552524554, + "learning_rate": 6.191575832176524e-06, + "loss": 0.0256, + "step": 1546 + }, + { + "epoch": 0.4411177644710579, + "grad_norm": 1.3157435085679845, + "learning_rate": 6.187089766254979e-06, + "loss": 0.0224, + "step": 1547 + }, + { + "epoch": 0.4414029084687767, + "grad_norm": 1.026787695779842, + "learning_rate": 6.182602687425407e-06, + "loss": 0.0372, + "step": 1548 + }, + { + "epoch": 0.44168805246649556, + "grad_norm": 0.948727292811553, + "learning_rate": 6.178114599516504e-06, + "loss": 0.0287, + "step": 1549 + }, + { + "epoch": 0.4419731964642144, + "grad_norm": 0.6305066109325781, + "learning_rate": 6.173625506357814e-06, + "loss": 0.0223, + "step": 1550 + }, + { + "epoch": 0.4422583404619333, + "grad_norm": 0.7164994011017372, + "learning_rate": 6.169135411779749e-06, + "loss": 0.0119, + "step": 1551 + }, + { + "epoch": 0.44254348445965214, + "grad_norm": 0.8216759106611643, + "learning_rate": 6.164644319613571e-06, + "loss": 0.0271, + "step": 1552 + }, + { + "epoch": 0.442828628457371, + "grad_norm": 0.8327268854217976, + "learning_rate": 6.160152233691393e-06, + "loss": 0.022, + "step": 1553 + }, + { + "epoch": 0.4431137724550898, + "grad_norm": 1.1286241666524117, + "learning_rate": 6.155659157846178e-06, + "loss": 0.0303, + "step": 1554 + }, + { + "epoch": 0.44339891645280866, + "grad_norm": 1.5466332840928882, + "learning_rate": 6.151165095911733e-06, + "loss": 0.0355, + "step": 1555 + }, + { + "epoch": 0.4436840604505275, + "grad_norm": 1.164291430707095, + "learning_rate": 6.1466700517227044e-06, + "loss": 0.0292, + "step": 1556 + }, + { + "epoch": 0.4439692044482464, + "grad_norm": 0.7237425608379466, + "learning_rate": 6.142174029114579e-06, + "loss": 0.0204, + "step": 1557 + }, + { + "epoch": 0.44425434844596523, + "grad_norm": 1.525607471024849, + "learning_rate": 6.137677031923679e-06, + "loss": 0.0319, + "step": 1558 + }, + { + "epoch": 0.44453949244368407, + "grad_norm": 1.5076148523093191, + "learning_rate": 6.133179063987156e-06, + "loss": 0.0336, + "step": 1559 + }, + { + "epoch": 0.4448246364414029, + "grad_norm": 2.470324848338266, + "learning_rate": 6.128680129142991e-06, + "loss": 0.0173, + "step": 1560 + }, + { + "epoch": 0.44510978043912175, + "grad_norm": 3.040160807782214, + "learning_rate": 6.1241802312299895e-06, + "loss": 0.0709, + "step": 1561 + }, + { + "epoch": 0.4453949244368406, + "grad_norm": 1.0375626585244984, + "learning_rate": 6.119679374087778e-06, + "loss": 0.027, + "step": 1562 + }, + { + "epoch": 0.44568006843455943, + "grad_norm": 1.3814735291614966, + "learning_rate": 6.115177561556806e-06, + "loss": 0.0252, + "step": 1563 + }, + { + "epoch": 0.4459652124322783, + "grad_norm": 1.2628722116063182, + "learning_rate": 6.110674797478332e-06, + "loss": 0.0215, + "step": 1564 + }, + { + "epoch": 0.44625035642999716, + "grad_norm": 1.0677934713873056, + "learning_rate": 6.10617108569443e-06, + "loss": 0.0293, + "step": 1565 + }, + { + "epoch": 0.446535500427716, + "grad_norm": 1.3607904858833422, + "learning_rate": 6.1016664300479835e-06, + "loss": 0.0382, + "step": 1566 + }, + { + "epoch": 0.44682064442543484, + "grad_norm": 2.4973158350192355, + "learning_rate": 6.097160834382678e-06, + "loss": 0.0577, + "step": 1567 + }, + { + "epoch": 0.4471057884231537, + "grad_norm": 1.4543142327687228, + "learning_rate": 6.092654302543002e-06, + "loss": 0.0569, + "step": 1568 + }, + { + "epoch": 0.4473909324208725, + "grad_norm": 1.547257628130523, + "learning_rate": 6.088146838374247e-06, + "loss": 0.0202, + "step": 1569 + }, + { + "epoch": 0.44767607641859136, + "grad_norm": 1.1158703535313215, + "learning_rate": 6.083638445722493e-06, + "loss": 0.0192, + "step": 1570 + }, + { + "epoch": 0.44796122041631026, + "grad_norm": 0.9709682394236485, + "learning_rate": 6.079129128434619e-06, + "loss": 0.031, + "step": 1571 + }, + { + "epoch": 0.4482463644140291, + "grad_norm": 1.6318763828200638, + "learning_rate": 6.074618890358287e-06, + "loss": 0.0184, + "step": 1572 + }, + { + "epoch": 0.44853150841174794, + "grad_norm": 1.8459498966727417, + "learning_rate": 6.07010773534195e-06, + "loss": 0.039, + "step": 1573 + }, + { + "epoch": 0.4488166524094668, + "grad_norm": 1.2404446802074922, + "learning_rate": 6.06559566723484e-06, + "loss": 0.0182, + "step": 1574 + }, + { + "epoch": 0.4491017964071856, + "grad_norm": 1.4442916601321651, + "learning_rate": 6.06108268988697e-06, + "loss": 0.0401, + "step": 1575 + }, + { + "epoch": 0.44938694040490446, + "grad_norm": 0.9320753790810795, + "learning_rate": 6.056568807149127e-06, + "loss": 0.0239, + "step": 1576 + }, + { + "epoch": 0.44967208440262335, + "grad_norm": 1.4017667599463124, + "learning_rate": 6.052054022872873e-06, + "loss": 0.023, + "step": 1577 + }, + { + "epoch": 0.4499572284003422, + "grad_norm": 0.6833624466019657, + "learning_rate": 6.047538340910534e-06, + "loss": 0.0144, + "step": 1578 + }, + { + "epoch": 0.45024237239806103, + "grad_norm": 0.6630610636763746, + "learning_rate": 6.0430217651152105e-06, + "loss": 0.02, + "step": 1579 + }, + { + "epoch": 0.45052751639577987, + "grad_norm": 0.6627483075350021, + "learning_rate": 6.0385042993407574e-06, + "loss": 0.0152, + "step": 1580 + }, + { + "epoch": 0.4508126603934987, + "grad_norm": 0.7801825576978031, + "learning_rate": 6.033985947441795e-06, + "loss": 0.0186, + "step": 1581 + }, + { + "epoch": 0.45109780439121755, + "grad_norm": 0.7431161777834322, + "learning_rate": 6.029466713273695e-06, + "loss": 0.0115, + "step": 1582 + }, + { + "epoch": 0.4513829483889364, + "grad_norm": 0.4049070036061823, + "learning_rate": 6.024946600692585e-06, + "loss": 0.0074, + "step": 1583 + }, + { + "epoch": 0.4516680923866553, + "grad_norm": 1.0478163608067583, + "learning_rate": 6.020425613555341e-06, + "loss": 0.0199, + "step": 1584 + }, + { + "epoch": 0.4519532363843741, + "grad_norm": 1.5885560292025787, + "learning_rate": 6.015903755719588e-06, + "loss": 0.0454, + "step": 1585 + }, + { + "epoch": 0.45223838038209296, + "grad_norm": 1.0854421779536836, + "learning_rate": 6.011381031043686e-06, + "loss": 0.0294, + "step": 1586 + }, + { + "epoch": 0.4525235243798118, + "grad_norm": 0.4134891652172671, + "learning_rate": 6.006857443386746e-06, + "loss": 0.012, + "step": 1587 + }, + { + "epoch": 0.45280866837753064, + "grad_norm": 1.7921075489007599, + "learning_rate": 6.002332996608605e-06, + "loss": 0.0277, + "step": 1588 + }, + { + "epoch": 0.4530938123752495, + "grad_norm": 0.9673035410671176, + "learning_rate": 5.9978076945698395e-06, + "loss": 0.017, + "step": 1589 + }, + { + "epoch": 0.4533789563729683, + "grad_norm": 0.9724566217783264, + "learning_rate": 5.993281541131753e-06, + "loss": 0.0319, + "step": 1590 + }, + { + "epoch": 0.4536641003706872, + "grad_norm": 0.7175181508761327, + "learning_rate": 5.9887545401563775e-06, + "loss": 0.017, + "step": 1591 + }, + { + "epoch": 0.45394924436840606, + "grad_norm": 1.1816527100772103, + "learning_rate": 5.984226695506464e-06, + "loss": 0.0279, + "step": 1592 + }, + { + "epoch": 0.4542343883661249, + "grad_norm": 1.9390267938168535, + "learning_rate": 5.979698011045492e-06, + "loss": 0.038, + "step": 1593 + }, + { + "epoch": 0.45451953236384374, + "grad_norm": 0.9155009626135624, + "learning_rate": 5.975168490637644e-06, + "loss": 0.052, + "step": 1594 + }, + { + "epoch": 0.4548046763615626, + "grad_norm": 0.7977284818854188, + "learning_rate": 5.970638138147829e-06, + "loss": 0.0107, + "step": 1595 + }, + { + "epoch": 0.4550898203592814, + "grad_norm": 0.9196711480479474, + "learning_rate": 5.966106957441661e-06, + "loss": 0.0176, + "step": 1596 + }, + { + "epoch": 0.4553749643570003, + "grad_norm": 4.36651610023969, + "learning_rate": 5.961574952385457e-06, + "loss": 0.0302, + "step": 1597 + }, + { + "epoch": 0.45566010835471915, + "grad_norm": 2.9956145689115043, + "learning_rate": 5.957042126846243e-06, + "loss": 0.0496, + "step": 1598 + }, + { + "epoch": 0.455945252352438, + "grad_norm": 1.318241867584188, + "learning_rate": 5.952508484691742e-06, + "loss": 0.0248, + "step": 1599 + }, + { + "epoch": 0.45623039635015683, + "grad_norm": 1.4632248370825405, + "learning_rate": 5.947974029790375e-06, + "loss": 0.0288, + "step": 1600 + }, + { + "epoch": 0.45651554034787567, + "grad_norm": 1.5614361952777818, + "learning_rate": 5.943438766011256e-06, + "loss": 0.0505, + "step": 1601 + }, + { + "epoch": 0.4568006843455945, + "grad_norm": 0.5404823438930301, + "learning_rate": 5.938902697224189e-06, + "loss": 0.0065, + "step": 1602 + }, + { + "epoch": 0.45708582834331335, + "grad_norm": 1.0138237952237483, + "learning_rate": 5.934365827299666e-06, + "loss": 0.0248, + "step": 1603 + }, + { + "epoch": 0.45737097234103224, + "grad_norm": 1.3912521886461697, + "learning_rate": 5.9298281601088616e-06, + "loss": 0.0226, + "step": 1604 + }, + { + "epoch": 0.4576561163387511, + "grad_norm": 1.346109081340996, + "learning_rate": 5.92528969952363e-06, + "loss": 0.0431, + "step": 1605 + }, + { + "epoch": 0.4579412603364699, + "grad_norm": 0.6722377723333736, + "learning_rate": 5.9207504494165025e-06, + "loss": 0.0057, + "step": 1606 + }, + { + "epoch": 0.45822640433418876, + "grad_norm": 1.5756505347726604, + "learning_rate": 5.916210413660687e-06, + "loss": 0.0323, + "step": 1607 + }, + { + "epoch": 0.4585115483319076, + "grad_norm": 1.372238468851667, + "learning_rate": 5.9116695961300584e-06, + "loss": 0.0216, + "step": 1608 + }, + { + "epoch": 0.45879669232962644, + "grad_norm": 0.7560326833267913, + "learning_rate": 5.907128000699159e-06, + "loss": 0.0141, + "step": 1609 + }, + { + "epoch": 0.4590818363273453, + "grad_norm": 1.8272989751745645, + "learning_rate": 5.9025856312431985e-06, + "loss": 0.031, + "step": 1610 + }, + { + "epoch": 0.4593669803250642, + "grad_norm": 1.515596989644, + "learning_rate": 5.898042491638042e-06, + "loss": 0.0302, + "step": 1611 + }, + { + "epoch": 0.459652124322783, + "grad_norm": 1.3089273048621382, + "learning_rate": 5.8934985857602144e-06, + "loss": 0.0314, + "step": 1612 + }, + { + "epoch": 0.45993726832050186, + "grad_norm": 1.0921868001336128, + "learning_rate": 5.8889539174868935e-06, + "loss": 0.0199, + "step": 1613 + }, + { + "epoch": 0.4602224123182207, + "grad_norm": 0.9944417014298922, + "learning_rate": 5.88440849069591e-06, + "loss": 0.0139, + "step": 1614 + }, + { + "epoch": 0.46050755631593954, + "grad_norm": 0.9965377028835146, + "learning_rate": 5.879862309265738e-06, + "loss": 0.0235, + "step": 1615 + }, + { + "epoch": 0.4607927003136584, + "grad_norm": 1.4517072630287036, + "learning_rate": 5.875315377075497e-06, + "loss": 0.0417, + "step": 1616 + }, + { + "epoch": 0.46107784431137727, + "grad_norm": 0.4827625546781854, + "learning_rate": 5.87076769800495e-06, + "loss": 0.0184, + "step": 1617 + }, + { + "epoch": 0.4613629883090961, + "grad_norm": 1.9595291045089964, + "learning_rate": 5.866219275934494e-06, + "loss": 0.0388, + "step": 1618 + }, + { + "epoch": 0.46164813230681495, + "grad_norm": 0.9604209914453493, + "learning_rate": 5.86167011474516e-06, + "loss": 0.0438, + "step": 1619 + }, + { + "epoch": 0.4619332763045338, + "grad_norm": 1.5066448543730424, + "learning_rate": 5.857120218318612e-06, + "loss": 0.0418, + "step": 1620 + }, + { + "epoch": 0.46221842030225263, + "grad_norm": 1.5026872619543066, + "learning_rate": 5.852569590537138e-06, + "loss": 0.0284, + "step": 1621 + }, + { + "epoch": 0.46250356429997147, + "grad_norm": 1.0384555882506465, + "learning_rate": 5.848018235283654e-06, + "loss": 0.0217, + "step": 1622 + }, + { + "epoch": 0.4627887082976903, + "grad_norm": 1.2663294731314405, + "learning_rate": 5.843466156441693e-06, + "loss": 0.0215, + "step": 1623 + }, + { + "epoch": 0.4630738522954092, + "grad_norm": 1.4544791021792642, + "learning_rate": 5.838913357895408e-06, + "loss": 0.043, + "step": 1624 + }, + { + "epoch": 0.46335899629312804, + "grad_norm": 1.9146030561749325, + "learning_rate": 5.834359843529565e-06, + "loss": 0.0429, + "step": 1625 + }, + { + "epoch": 0.4636441402908469, + "grad_norm": 0.5614062355847471, + "learning_rate": 5.82980561722954e-06, + "loss": 0.0084, + "step": 1626 + }, + { + "epoch": 0.4639292842885657, + "grad_norm": 1.0668880452470082, + "learning_rate": 5.8252506828813195e-06, + "loss": 0.0259, + "step": 1627 + }, + { + "epoch": 0.46421442828628456, + "grad_norm": 0.8362296876846605, + "learning_rate": 5.82069504437149e-06, + "loss": 0.0219, + "step": 1628 + }, + { + "epoch": 0.4644995722840034, + "grad_norm": 1.865874275154151, + "learning_rate": 5.816138705587242e-06, + "loss": 0.0339, + "step": 1629 + }, + { + "epoch": 0.46478471628172224, + "grad_norm": 0.7831355484928149, + "learning_rate": 5.811581670416363e-06, + "loss": 0.011, + "step": 1630 + }, + { + "epoch": 0.46506986027944114, + "grad_norm": 0.9660875848485134, + "learning_rate": 5.807023942747232e-06, + "loss": 0.0145, + "step": 1631 + }, + { + "epoch": 0.46535500427716, + "grad_norm": 0.6626505988518036, + "learning_rate": 5.802465526468825e-06, + "loss": 0.0171, + "step": 1632 + }, + { + "epoch": 0.4656401482748788, + "grad_norm": 1.8732921332420605, + "learning_rate": 5.797906425470696e-06, + "loss": 0.0412, + "step": 1633 + }, + { + "epoch": 0.46592529227259766, + "grad_norm": 0.9110735893383872, + "learning_rate": 5.7933466436429945e-06, + "loss": 0.0196, + "step": 1634 + }, + { + "epoch": 0.4662104362703165, + "grad_norm": 0.9187090445005635, + "learning_rate": 5.788786184876443e-06, + "loss": 0.0086, + "step": 1635 + }, + { + "epoch": 0.46649558026803534, + "grad_norm": 0.4577963271928218, + "learning_rate": 5.784225053062342e-06, + "loss": 0.0172, + "step": 1636 + }, + { + "epoch": 0.46678072426575423, + "grad_norm": 0.6514140579973696, + "learning_rate": 5.77966325209257e-06, + "loss": 0.0185, + "step": 1637 + }, + { + "epoch": 0.46706586826347307, + "grad_norm": 1.5558388744790788, + "learning_rate": 5.775100785859576e-06, + "loss": 0.0379, + "step": 1638 + }, + { + "epoch": 0.4673510122611919, + "grad_norm": 1.3399688016409907, + "learning_rate": 5.770537658256373e-06, + "loss": 0.0327, + "step": 1639 + }, + { + "epoch": 0.46763615625891075, + "grad_norm": 1.1869080276664048, + "learning_rate": 5.765973873176544e-06, + "loss": 0.0194, + "step": 1640 + }, + { + "epoch": 0.4679213002566296, + "grad_norm": 1.2920295998953273, + "learning_rate": 5.761409434514223e-06, + "loss": 0.0158, + "step": 1641 + }, + { + "epoch": 0.46820644425434843, + "grad_norm": 0.5899684738930414, + "learning_rate": 5.756844346164115e-06, + "loss": 0.0119, + "step": 1642 + }, + { + "epoch": 0.46849158825206727, + "grad_norm": 0.7664070156217964, + "learning_rate": 5.752278612021471e-06, + "loss": 0.0178, + "step": 1643 + }, + { + "epoch": 0.46877673224978617, + "grad_norm": 1.0535022309389133, + "learning_rate": 5.747712235982094e-06, + "loss": 0.0217, + "step": 1644 + }, + { + "epoch": 0.469061876247505, + "grad_norm": 0.5765641658694458, + "learning_rate": 5.743145221942333e-06, + "loss": 0.0088, + "step": 1645 + }, + { + "epoch": 0.46934702024522384, + "grad_norm": 1.220409106870033, + "learning_rate": 5.73857757379909e-06, + "loss": 0.0312, + "step": 1646 + }, + { + "epoch": 0.4696321642429427, + "grad_norm": 1.1369091506463587, + "learning_rate": 5.7340092954497965e-06, + "loss": 0.0241, + "step": 1647 + }, + { + "epoch": 0.4699173082406615, + "grad_norm": 1.0456981783807946, + "learning_rate": 5.729440390792429e-06, + "loss": 0.0241, + "step": 1648 + }, + { + "epoch": 0.47020245223838036, + "grad_norm": 2.3070988825991146, + "learning_rate": 5.724870863725497e-06, + "loss": 0.0403, + "step": 1649 + }, + { + "epoch": 0.4704875962360992, + "grad_norm": 1.3376794149494875, + "learning_rate": 5.72030071814804e-06, + "loss": 0.0313, + "step": 1650 + }, + { + "epoch": 0.4707727402338181, + "grad_norm": 0.9046821280854266, + "learning_rate": 5.715729957959625e-06, + "loss": 0.0101, + "step": 1651 + }, + { + "epoch": 0.47105788423153694, + "grad_norm": 1.0675955765788645, + "learning_rate": 5.7111585870603455e-06, + "loss": 0.0191, + "step": 1652 + }, + { + "epoch": 0.4713430282292558, + "grad_norm": 0.5649793938230241, + "learning_rate": 5.7065866093508145e-06, + "loss": 0.0112, + "step": 1653 + }, + { + "epoch": 0.4716281722269746, + "grad_norm": 1.3714092185991984, + "learning_rate": 5.7020140287321645e-06, + "loss": 0.0291, + "step": 1654 + }, + { + "epoch": 0.47191331622469346, + "grad_norm": 0.5776238056475193, + "learning_rate": 5.697440849106039e-06, + "loss": 0.0129, + "step": 1655 + }, + { + "epoch": 0.4721984602224123, + "grad_norm": 0.9710900939411276, + "learning_rate": 5.692867074374596e-06, + "loss": 0.0429, + "step": 1656 + }, + { + "epoch": 0.4724836042201312, + "grad_norm": 0.6235588357191956, + "learning_rate": 5.688292708440501e-06, + "loss": 0.0092, + "step": 1657 + }, + { + "epoch": 0.47276874821785003, + "grad_norm": 1.3140976352998177, + "learning_rate": 5.6837177552069204e-06, + "loss": 0.0238, + "step": 1658 + }, + { + "epoch": 0.47305389221556887, + "grad_norm": 1.4703932405577713, + "learning_rate": 5.679142218577525e-06, + "loss": 0.0233, + "step": 1659 + }, + { + "epoch": 0.4733390362132877, + "grad_norm": 3.059192393508655, + "learning_rate": 5.6745661024564854e-06, + "loss": 0.0941, + "step": 1660 + }, + { + "epoch": 0.47362418021100655, + "grad_norm": 1.2480858491288185, + "learning_rate": 5.66998941074846e-06, + "loss": 0.0254, + "step": 1661 + }, + { + "epoch": 0.4739093242087254, + "grad_norm": 1.3391566425560755, + "learning_rate": 5.665412147358607e-06, + "loss": 0.0192, + "step": 1662 + }, + { + "epoch": 0.47419446820644423, + "grad_norm": 1.9530592193510963, + "learning_rate": 5.6608343161925616e-06, + "loss": 0.0348, + "step": 1663 + }, + { + "epoch": 0.4744796122041631, + "grad_norm": 0.5882117331069833, + "learning_rate": 5.656255921156455e-06, + "loss": 0.0078, + "step": 1664 + }, + { + "epoch": 0.47476475620188197, + "grad_norm": 0.4806565752566173, + "learning_rate": 5.651676966156892e-06, + "loss": 0.0166, + "step": 1665 + }, + { + "epoch": 0.4750499001996008, + "grad_norm": 2.5545952290218805, + "learning_rate": 5.6470974551009565e-06, + "loss": 0.0401, + "step": 1666 + }, + { + "epoch": 0.47533504419731964, + "grad_norm": 3.454081035186067, + "learning_rate": 5.642517391896208e-06, + "loss": 0.0461, + "step": 1667 + }, + { + "epoch": 0.4756201881950385, + "grad_norm": 1.2109065735867222, + "learning_rate": 5.637936780450679e-06, + "loss": 0.0287, + "step": 1668 + }, + { + "epoch": 0.4759053321927573, + "grad_norm": 1.1002493916885174, + "learning_rate": 5.633355624672865e-06, + "loss": 0.0123, + "step": 1669 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 0.7138212776082841, + "learning_rate": 5.628773928471729e-06, + "loss": 0.008, + "step": 1670 + }, + { + "epoch": 0.47647562018819506, + "grad_norm": 1.273915884803688, + "learning_rate": 5.6241916957566955e-06, + "loss": 0.014, + "step": 1671 + }, + { + "epoch": 0.4767607641859139, + "grad_norm": 1.0112889834715977, + "learning_rate": 5.619608930437647e-06, + "loss": 0.0296, + "step": 1672 + }, + { + "epoch": 0.47704590818363274, + "grad_norm": 2.625237325555971, + "learning_rate": 5.6150256364249165e-06, + "loss": 0.0315, + "step": 1673 + }, + { + "epoch": 0.4773310521813516, + "grad_norm": 0.9222000915216457, + "learning_rate": 5.610441817629293e-06, + "loss": 0.0214, + "step": 1674 + }, + { + "epoch": 0.4776161961790704, + "grad_norm": 1.2738057180224518, + "learning_rate": 5.605857477962009e-06, + "loss": 0.06, + "step": 1675 + }, + { + "epoch": 0.47790134017678926, + "grad_norm": 1.7750846497598578, + "learning_rate": 5.601272621334749e-06, + "loss": 0.0754, + "step": 1676 + }, + { + "epoch": 0.47818648417450815, + "grad_norm": 1.7959815648598039, + "learning_rate": 5.5966872516596275e-06, + "loss": 0.0617, + "step": 1677 + }, + { + "epoch": 0.478471628172227, + "grad_norm": 1.1371874135314433, + "learning_rate": 5.592101372849207e-06, + "loss": 0.0196, + "step": 1678 + }, + { + "epoch": 0.47875677216994583, + "grad_norm": 0.9732582128190234, + "learning_rate": 5.587514988816477e-06, + "loss": 0.0151, + "step": 1679 + }, + { + "epoch": 0.47904191616766467, + "grad_norm": 1.3674954779188182, + "learning_rate": 5.582928103474863e-06, + "loss": 0.0298, + "step": 1680 + }, + { + "epoch": 0.4793270601653835, + "grad_norm": 0.9010991025266389, + "learning_rate": 5.5783407207382146e-06, + "loss": 0.0208, + "step": 1681 + }, + { + "epoch": 0.47961220416310235, + "grad_norm": 0.7798121219399466, + "learning_rate": 5.573752844520812e-06, + "loss": 0.0186, + "step": 1682 + }, + { + "epoch": 0.4798973481608212, + "grad_norm": 0.8532491665099611, + "learning_rate": 5.569164478737346e-06, + "loss": 0.0285, + "step": 1683 + }, + { + "epoch": 0.4801824921585401, + "grad_norm": 1.2652300108229633, + "learning_rate": 5.564575627302936e-06, + "loss": 0.0146, + "step": 1684 + }, + { + "epoch": 0.4804676361562589, + "grad_norm": 1.0842547917810184, + "learning_rate": 5.5599862941331084e-06, + "loss": 0.0236, + "step": 1685 + }, + { + "epoch": 0.48075278015397777, + "grad_norm": 0.7692465444203684, + "learning_rate": 5.5553964831438044e-06, + "loss": 0.0098, + "step": 1686 + }, + { + "epoch": 0.4810379241516966, + "grad_norm": 0.6516988532185854, + "learning_rate": 5.5508061982513715e-06, + "loss": 0.0095, + "step": 1687 + }, + { + "epoch": 0.48132306814941545, + "grad_norm": 0.86894709268537, + "learning_rate": 5.546215443372562e-06, + "loss": 0.0207, + "step": 1688 + }, + { + "epoch": 0.4816082121471343, + "grad_norm": 1.1589813428803937, + "learning_rate": 5.541624222424529e-06, + "loss": 0.0182, + "step": 1689 + }, + { + "epoch": 0.4818933561448531, + "grad_norm": 0.4753465975864277, + "learning_rate": 5.537032539324824e-06, + "loss": 0.0143, + "step": 1690 + }, + { + "epoch": 0.482178500142572, + "grad_norm": 0.3429031884155126, + "learning_rate": 5.53244039799139e-06, + "loss": 0.0089, + "step": 1691 + }, + { + "epoch": 0.48246364414029086, + "grad_norm": 1.9674672537859532, + "learning_rate": 5.527847802342567e-06, + "loss": 0.054, + "step": 1692 + }, + { + "epoch": 0.4827487881380097, + "grad_norm": 0.9183324853750284, + "learning_rate": 5.523254756297074e-06, + "loss": 0.0332, + "step": 1693 + }, + { + "epoch": 0.48303393213572854, + "grad_norm": 1.5405624693680082, + "learning_rate": 5.518661263774023e-06, + "loss": 0.0625, + "step": 1694 + }, + { + "epoch": 0.4833190761334474, + "grad_norm": 1.4478394054980839, + "learning_rate": 5.514067328692902e-06, + "loss": 0.0234, + "step": 1695 + }, + { + "epoch": 0.4836042201311662, + "grad_norm": 1.198470695313575, + "learning_rate": 5.509472954973578e-06, + "loss": 0.0339, + "step": 1696 + }, + { + "epoch": 0.4838893641288851, + "grad_norm": 1.4872008148949296, + "learning_rate": 5.504878146536291e-06, + "loss": 0.0152, + "step": 1697 + }, + { + "epoch": 0.48417450812660395, + "grad_norm": 0.9089989321086841, + "learning_rate": 5.500282907301655e-06, + "loss": 0.0227, + "step": 1698 + }, + { + "epoch": 0.4844596521243228, + "grad_norm": 0.9108428863142347, + "learning_rate": 5.495687241190646e-06, + "loss": 0.0081, + "step": 1699 + }, + { + "epoch": 0.48474479612204163, + "grad_norm": 1.7700228667742393, + "learning_rate": 5.491091152124611e-06, + "loss": 0.0397, + "step": 1700 + }, + { + "epoch": 0.48502994011976047, + "grad_norm": 1.086368323032613, + "learning_rate": 5.4864946440252555e-06, + "loss": 0.0295, + "step": 1701 + }, + { + "epoch": 0.4853150841174793, + "grad_norm": 0.8881192538969089, + "learning_rate": 5.4818977208146375e-06, + "loss": 0.0315, + "step": 1702 + }, + { + "epoch": 0.48560022811519815, + "grad_norm": 0.8299716634196358, + "learning_rate": 5.477300386415176e-06, + "loss": 0.0125, + "step": 1703 + }, + { + "epoch": 0.48588537211291705, + "grad_norm": 0.8778183424306557, + "learning_rate": 5.47270264474964e-06, + "loss": 0.0225, + "step": 1704 + }, + { + "epoch": 0.4861705161106359, + "grad_norm": 0.3759935217970214, + "learning_rate": 5.46810449974114e-06, + "loss": 0.0067, + "step": 1705 + }, + { + "epoch": 0.4864556601083547, + "grad_norm": 1.012504942599993, + "learning_rate": 5.46350595531314e-06, + "loss": 0.0342, + "step": 1706 + }, + { + "epoch": 0.48674080410607357, + "grad_norm": 0.3155753567046455, + "learning_rate": 5.458907015389435e-06, + "loss": 0.0068, + "step": 1707 + }, + { + "epoch": 0.4870259481037924, + "grad_norm": 0.8772082266297694, + "learning_rate": 5.454307683894166e-06, + "loss": 0.0137, + "step": 1708 + }, + { + "epoch": 0.48731109210151125, + "grad_norm": 0.8540371209025869, + "learning_rate": 5.449707964751801e-06, + "loss": 0.011, + "step": 1709 + }, + { + "epoch": 0.4875962360992301, + "grad_norm": 0.9996853289237507, + "learning_rate": 5.445107861887144e-06, + "loss": 0.0156, + "step": 1710 + }, + { + "epoch": 0.487881380096949, + "grad_norm": 1.4433308389528134, + "learning_rate": 5.440507379225325e-06, + "loss": 0.0347, + "step": 1711 + }, + { + "epoch": 0.4881665240946678, + "grad_norm": 0.6514104791176355, + "learning_rate": 5.435906520691798e-06, + "loss": 0.0279, + "step": 1712 + }, + { + "epoch": 0.48845166809238666, + "grad_norm": 1.1622561862286143, + "learning_rate": 5.431305290212335e-06, + "loss": 0.0531, + "step": 1713 + }, + { + "epoch": 0.4887368120901055, + "grad_norm": 1.0205243505665194, + "learning_rate": 5.426703691713029e-06, + "loss": 0.034, + "step": 1714 + }, + { + "epoch": 0.48902195608782434, + "grad_norm": 1.5178878114736611, + "learning_rate": 5.422101729120287e-06, + "loss": 0.0326, + "step": 1715 + }, + { + "epoch": 0.4893071000855432, + "grad_norm": 1.5210315919144914, + "learning_rate": 5.417499406360823e-06, + "loss": 0.0244, + "step": 1716 + }, + { + "epoch": 0.4895922440832621, + "grad_norm": 0.7365109722429185, + "learning_rate": 5.412896727361663e-06, + "loss": 0.0109, + "step": 1717 + }, + { + "epoch": 0.4898773880809809, + "grad_norm": 1.5587515861114862, + "learning_rate": 5.408293696050132e-06, + "loss": 0.0252, + "step": 1718 + }, + { + "epoch": 0.49016253207869975, + "grad_norm": 0.8960330096591171, + "learning_rate": 5.40369031635386e-06, + "loss": 0.0364, + "step": 1719 + }, + { + "epoch": 0.4904476760764186, + "grad_norm": 0.6398440868605254, + "learning_rate": 5.399086592200774e-06, + "loss": 0.0134, + "step": 1720 + }, + { + "epoch": 0.49073282007413743, + "grad_norm": 1.2962219418993994, + "learning_rate": 5.394482527519089e-06, + "loss": 0.0417, + "step": 1721 + }, + { + "epoch": 0.49101796407185627, + "grad_norm": 1.1342913204571548, + "learning_rate": 5.38987812623732e-06, + "loss": 0.0349, + "step": 1722 + }, + { + "epoch": 0.4913031080695751, + "grad_norm": 1.415562479211778, + "learning_rate": 5.385273392284262e-06, + "loss": 0.0341, + "step": 1723 + }, + { + "epoch": 0.491588252067294, + "grad_norm": 0.570721980159015, + "learning_rate": 5.380668329588996e-06, + "loss": 0.0092, + "step": 1724 + }, + { + "epoch": 0.49187339606501285, + "grad_norm": 1.2011080438952686, + "learning_rate": 5.376062942080883e-06, + "loss": 0.0199, + "step": 1725 + }, + { + "epoch": 0.4921585400627317, + "grad_norm": 0.8641679648671838, + "learning_rate": 5.371457233689563e-06, + "loss": 0.0344, + "step": 1726 + }, + { + "epoch": 0.4924436840604505, + "grad_norm": 1.3972786914451918, + "learning_rate": 5.366851208344948e-06, + "loss": 0.0267, + "step": 1727 + }, + { + "epoch": 0.49272882805816937, + "grad_norm": 0.9723899245443296, + "learning_rate": 5.362244869977224e-06, + "loss": 0.0241, + "step": 1728 + }, + { + "epoch": 0.4930139720558882, + "grad_norm": 1.2779418929535273, + "learning_rate": 5.357638222516833e-06, + "loss": 0.0228, + "step": 1729 + }, + { + "epoch": 0.49329911605360705, + "grad_norm": 1.596318162921975, + "learning_rate": 5.353031269894497e-06, + "loss": 0.0324, + "step": 1730 + }, + { + "epoch": 0.49358426005132594, + "grad_norm": 1.066749627601742, + "learning_rate": 5.348424016041188e-06, + "loss": 0.02, + "step": 1731 + }, + { + "epoch": 0.4938694040490448, + "grad_norm": 2.035347543339062, + "learning_rate": 5.343816464888136e-06, + "loss": 0.0404, + "step": 1732 + }, + { + "epoch": 0.4941545480467636, + "grad_norm": 3.3356286353565414, + "learning_rate": 5.339208620366827e-06, + "loss": 0.0574, + "step": 1733 + }, + { + "epoch": 0.49443969204448246, + "grad_norm": 1.5158864916198709, + "learning_rate": 5.334600486408995e-06, + "loss": 0.0407, + "step": 1734 + }, + { + "epoch": 0.4947248360422013, + "grad_norm": 1.0504079623189557, + "learning_rate": 5.3299920669466224e-06, + "loss": 0.0199, + "step": 1735 + }, + { + "epoch": 0.49500998003992014, + "grad_norm": 1.2772606321263538, + "learning_rate": 5.325383365911937e-06, + "loss": 0.0203, + "step": 1736 + }, + { + "epoch": 0.49529512403763903, + "grad_norm": 0.7819050740242356, + "learning_rate": 5.320774387237403e-06, + "loss": 0.0141, + "step": 1737 + }, + { + "epoch": 0.4955802680353579, + "grad_norm": 1.0278612287001418, + "learning_rate": 5.316165134855724e-06, + "loss": 0.0157, + "step": 1738 + }, + { + "epoch": 0.4958654120330767, + "grad_norm": 1.4731255811320176, + "learning_rate": 5.311555612699837e-06, + "loss": 0.0289, + "step": 1739 + }, + { + "epoch": 0.49615055603079555, + "grad_norm": 0.7990479306692042, + "learning_rate": 5.306945824702908e-06, + "loss": 0.0091, + "step": 1740 + }, + { + "epoch": 0.4964357000285144, + "grad_norm": 1.3645150027377915, + "learning_rate": 5.302335774798331e-06, + "loss": 0.0417, + "step": 1741 + }, + { + "epoch": 0.49672084402623323, + "grad_norm": 0.5097086067391874, + "learning_rate": 5.297725466919722e-06, + "loss": 0.0067, + "step": 1742 + }, + { + "epoch": 0.49700598802395207, + "grad_norm": 1.220765771700666, + "learning_rate": 5.29311490500092e-06, + "loss": 0.0422, + "step": 1743 + }, + { + "epoch": 0.49729113202167097, + "grad_norm": 0.6971180492227302, + "learning_rate": 5.288504092975975e-06, + "loss": 0.0206, + "step": 1744 + }, + { + "epoch": 0.4975762760193898, + "grad_norm": 1.0855891533617872, + "learning_rate": 5.2838930347791605e-06, + "loss": 0.0377, + "step": 1745 + }, + { + "epoch": 0.49786142001710865, + "grad_norm": 0.5153649468444501, + "learning_rate": 5.279281734344951e-06, + "loss": 0.0115, + "step": 1746 + }, + { + "epoch": 0.4981465640148275, + "grad_norm": 0.8506337273843959, + "learning_rate": 5.274670195608029e-06, + "loss": 0.0127, + "step": 1747 + }, + { + "epoch": 0.4984317080125463, + "grad_norm": 1.1073160831408677, + "learning_rate": 5.270058422503284e-06, + "loss": 0.0191, + "step": 1748 + }, + { + "epoch": 0.49871685201026517, + "grad_norm": 1.7959372472589628, + "learning_rate": 5.265446418965801e-06, + "loss": 0.0349, + "step": 1749 + }, + { + "epoch": 0.499001996007984, + "grad_norm": 1.2999018982531467, + "learning_rate": 5.260834188930867e-06, + "loss": 0.0144, + "step": 1750 + }, + { + "epoch": 0.4992871400057029, + "grad_norm": 1.1152004670396407, + "learning_rate": 5.256221736333959e-06, + "loss": 0.0389, + "step": 1751 + }, + { + "epoch": 0.49957228400342174, + "grad_norm": 1.194164124119914, + "learning_rate": 5.25160906511074e-06, + "loss": 0.0264, + "step": 1752 + }, + { + "epoch": 0.4998574280011406, + "grad_norm": 1.2213677453155716, + "learning_rate": 5.246996179197068e-06, + "loss": 0.0245, + "step": 1753 + }, + { + "epoch": 0.5001425719988595, + "grad_norm": 0.5547879059243418, + "learning_rate": 5.24238308252898e-06, + "loss": 0.017, + "step": 1754 + }, + { + "epoch": 0.5004277159965783, + "grad_norm": 0.7060694139102462, + "learning_rate": 5.23776977904269e-06, + "loss": 0.0196, + "step": 1755 + }, + { + "epoch": 0.5007128599942972, + "grad_norm": 0.8256575425352183, + "learning_rate": 5.233156272674595e-06, + "loss": 0.0109, + "step": 1756 + }, + { + "epoch": 0.500998003992016, + "grad_norm": 1.2395878805797904, + "learning_rate": 5.228542567361258e-06, + "loss": 0.0264, + "step": 1757 + }, + { + "epoch": 0.5012831479897348, + "grad_norm": 0.5321859041552769, + "learning_rate": 5.223928667039414e-06, + "loss": 0.0097, + "step": 1758 + }, + { + "epoch": 0.5015682919874537, + "grad_norm": 0.6929216929421017, + "learning_rate": 5.219314575645972e-06, + "loss": 0.03, + "step": 1759 + }, + { + "epoch": 0.5018534359851725, + "grad_norm": 1.3885854817205727, + "learning_rate": 5.214700297117989e-06, + "loss": 0.0214, + "step": 1760 + }, + { + "epoch": 0.5021385799828914, + "grad_norm": 1.0482514572374837, + "learning_rate": 5.210085835392696e-06, + "loss": 0.0229, + "step": 1761 + }, + { + "epoch": 0.5024237239806102, + "grad_norm": 1.180111096741413, + "learning_rate": 5.205471194407474e-06, + "loss": 0.0476, + "step": 1762 + }, + { + "epoch": 0.502708867978329, + "grad_norm": 1.8163067516658773, + "learning_rate": 5.2008563780998554e-06, + "loss": 0.0289, + "step": 1763 + }, + { + "epoch": 0.5029940119760479, + "grad_norm": 0.7027677059471549, + "learning_rate": 5.196241390407526e-06, + "loss": 0.023, + "step": 1764 + }, + { + "epoch": 0.5032791559737667, + "grad_norm": 1.8922631201234337, + "learning_rate": 5.1916262352683154e-06, + "loss": 0.0283, + "step": 1765 + }, + { + "epoch": 0.5035642999714856, + "grad_norm": 1.5571212020941443, + "learning_rate": 5.187010916620196e-06, + "loss": 0.0361, + "step": 1766 + }, + { + "epoch": 0.5038494439692045, + "grad_norm": 0.8174284451105421, + "learning_rate": 5.182395438401282e-06, + "loss": 0.0224, + "step": 1767 + }, + { + "epoch": 0.5041345879669233, + "grad_norm": 1.3080120864075888, + "learning_rate": 5.177779804549821e-06, + "loss": 0.0142, + "step": 1768 + }, + { + "epoch": 0.5044197319646422, + "grad_norm": 1.8386928220755963, + "learning_rate": 5.173164019004195e-06, + "loss": 0.0305, + "step": 1769 + }, + { + "epoch": 0.504704875962361, + "grad_norm": 1.1936198989153632, + "learning_rate": 5.168548085702916e-06, + "loss": 0.0387, + "step": 1770 + }, + { + "epoch": 0.5049900199600799, + "grad_norm": 1.6312386336058349, + "learning_rate": 5.16393200858462e-06, + "loss": 0.0306, + "step": 1771 + }, + { + "epoch": 0.5052751639577987, + "grad_norm": 0.9145988600574612, + "learning_rate": 5.159315791588068e-06, + "loss": 0.011, + "step": 1772 + }, + { + "epoch": 0.5055603079555175, + "grad_norm": 0.902930547135958, + "learning_rate": 5.154699438652139e-06, + "loss": 0.0148, + "step": 1773 + }, + { + "epoch": 0.5058454519532364, + "grad_norm": 0.8416064963586531, + "learning_rate": 5.150082953715826e-06, + "loss": 0.0142, + "step": 1774 + }, + { + "epoch": 0.5061305959509552, + "grad_norm": 0.8900714074477286, + "learning_rate": 5.145466340718241e-06, + "loss": 0.0219, + "step": 1775 + }, + { + "epoch": 0.5064157399486741, + "grad_norm": 1.66826695692096, + "learning_rate": 5.140849603598598e-06, + "loss": 0.042, + "step": 1776 + }, + { + "epoch": 0.5067008839463929, + "grad_norm": 2.656687757346725, + "learning_rate": 5.13623274629622e-06, + "loss": 0.0651, + "step": 1777 + }, + { + "epoch": 0.5069860279441117, + "grad_norm": 0.5318371444639051, + "learning_rate": 5.131615772750534e-06, + "loss": 0.0124, + "step": 1778 + }, + { + "epoch": 0.5072711719418306, + "grad_norm": 1.870474093394001, + "learning_rate": 5.126998686901066e-06, + "loss": 0.0444, + "step": 1779 + }, + { + "epoch": 0.5075563159395494, + "grad_norm": 0.27375968758014385, + "learning_rate": 5.122381492687431e-06, + "loss": 0.0082, + "step": 1780 + }, + { + "epoch": 0.5078414599372684, + "grad_norm": 0.5572083483107301, + "learning_rate": 5.1177641940493505e-06, + "loss": 0.0198, + "step": 1781 + }, + { + "epoch": 0.5081266039349872, + "grad_norm": 1.8211201480961516, + "learning_rate": 5.113146794926619e-06, + "loss": 0.0605, + "step": 1782 + }, + { + "epoch": 0.508411747932706, + "grad_norm": 1.629287587730149, + "learning_rate": 5.108529299259129e-06, + "loss": 0.0485, + "step": 1783 + }, + { + "epoch": 0.5086968919304249, + "grad_norm": 1.2077436875656857, + "learning_rate": 5.10391171098685e-06, + "loss": 0.0262, + "step": 1784 + }, + { + "epoch": 0.5089820359281437, + "grad_norm": 1.016814636419437, + "learning_rate": 5.099294034049829e-06, + "loss": 0.0424, + "step": 1785 + }, + { + "epoch": 0.5092671799258626, + "grad_norm": 0.7224054605047718, + "learning_rate": 5.0946762723881935e-06, + "loss": 0.0101, + "step": 1786 + }, + { + "epoch": 0.5095523239235814, + "grad_norm": 1.081931745901931, + "learning_rate": 5.09005842994214e-06, + "loss": 0.0277, + "step": 1787 + }, + { + "epoch": 0.5098374679213002, + "grad_norm": 0.5980162131093523, + "learning_rate": 5.0854405106519336e-06, + "loss": 0.0144, + "step": 1788 + }, + { + "epoch": 0.5101226119190191, + "grad_norm": 0.32617215799420385, + "learning_rate": 5.080822518457909e-06, + "loss": 0.0091, + "step": 1789 + }, + { + "epoch": 0.5104077559167379, + "grad_norm": 1.637195558047833, + "learning_rate": 5.0762044573004555e-06, + "loss": 0.0346, + "step": 1790 + }, + { + "epoch": 0.5106928999144568, + "grad_norm": 1.9376813290083361, + "learning_rate": 5.071586331120031e-06, + "loss": 0.0268, + "step": 1791 + }, + { + "epoch": 0.5109780439121756, + "grad_norm": 0.8777109804793124, + "learning_rate": 5.066968143857141e-06, + "loss": 0.0099, + "step": 1792 + }, + { + "epoch": 0.5112631879098944, + "grad_norm": 1.1639274694760067, + "learning_rate": 5.062349899452345e-06, + "loss": 0.0325, + "step": 1793 + }, + { + "epoch": 0.5115483319076134, + "grad_norm": 0.9469853264768066, + "learning_rate": 5.057731601846253e-06, + "loss": 0.0388, + "step": 1794 + }, + { + "epoch": 0.5118334759053322, + "grad_norm": 1.55965099823004, + "learning_rate": 5.053113254979522e-06, + "loss": 0.0317, + "step": 1795 + }, + { + "epoch": 0.5121186199030511, + "grad_norm": 1.0572192459413616, + "learning_rate": 5.048494862792845e-06, + "loss": 0.0279, + "step": 1796 + }, + { + "epoch": 0.5124037639007699, + "grad_norm": 0.6540128291643972, + "learning_rate": 5.043876429226962e-06, + "loss": 0.0082, + "step": 1797 + }, + { + "epoch": 0.5126889078984888, + "grad_norm": 1.4677331238705937, + "learning_rate": 5.039257958222638e-06, + "loss": 0.052, + "step": 1798 + }, + { + "epoch": 0.5129740518962076, + "grad_norm": 0.6886544253965207, + "learning_rate": 5.0346394537206776e-06, + "loss": 0.0224, + "step": 1799 + }, + { + "epoch": 0.5132591958939264, + "grad_norm": 1.674371036871377, + "learning_rate": 5.030020919661913e-06, + "loss": 0.0487, + "step": 1800 + }, + { + "epoch": 0.5135443398916453, + "grad_norm": 1.374316064323489, + "learning_rate": 5.025402359987201e-06, + "loss": 0.0288, + "step": 1801 + }, + { + "epoch": 0.5138294838893641, + "grad_norm": 0.8573307882459512, + "learning_rate": 5.020783778637415e-06, + "loss": 0.0275, + "step": 1802 + }, + { + "epoch": 0.514114627887083, + "grad_norm": 1.2301193990464756, + "learning_rate": 5.016165179553459e-06, + "loss": 0.0263, + "step": 1803 + }, + { + "epoch": 0.5143997718848018, + "grad_norm": 1.2182295130590761, + "learning_rate": 5.011546566676239e-06, + "loss": 0.0359, + "step": 1804 + }, + { + "epoch": 0.5146849158825206, + "grad_norm": 1.133488244714276, + "learning_rate": 5.006927943946681e-06, + "loss": 0.0167, + "step": 1805 + }, + { + "epoch": 0.5149700598802395, + "grad_norm": 0.9176237886469092, + "learning_rate": 5.0023093153057174e-06, + "loss": 0.02, + "step": 1806 + }, + { + "epoch": 0.5152552038779584, + "grad_norm": 1.0261123312743583, + "learning_rate": 4.997690684694285e-06, + "loss": 0.0333, + "step": 1807 + }, + { + "epoch": 0.5155403478756773, + "grad_norm": 1.5110718674130426, + "learning_rate": 4.99307205605332e-06, + "loss": 0.0294, + "step": 1808 + }, + { + "epoch": 0.5158254918733961, + "grad_norm": 0.6864276343112098, + "learning_rate": 4.988453433323763e-06, + "loss": 0.0347, + "step": 1809 + }, + { + "epoch": 0.5161106358711149, + "grad_norm": 1.1248317448171876, + "learning_rate": 4.983834820446542e-06, + "loss": 0.0347, + "step": 1810 + }, + { + "epoch": 0.5163957798688338, + "grad_norm": 0.9943083426147765, + "learning_rate": 4.979216221362585e-06, + "loss": 0.0255, + "step": 1811 + }, + { + "epoch": 0.5166809238665526, + "grad_norm": 0.9053821929583953, + "learning_rate": 4.9745976400128e-06, + "loss": 0.0165, + "step": 1812 + }, + { + "epoch": 0.5169660678642715, + "grad_norm": 1.3710164561870049, + "learning_rate": 4.96997908033809e-06, + "loss": 0.0246, + "step": 1813 + }, + { + "epoch": 0.5172512118619903, + "grad_norm": 0.879598522359968, + "learning_rate": 4.965360546279324e-06, + "loss": 0.0239, + "step": 1814 + }, + { + "epoch": 0.5175363558597091, + "grad_norm": 0.6133796107685641, + "learning_rate": 4.960742041777365e-06, + "loss": 0.016, + "step": 1815 + }, + { + "epoch": 0.517821499857428, + "grad_norm": 0.7668338592272866, + "learning_rate": 4.95612357077304e-06, + "loss": 0.0233, + "step": 1816 + }, + { + "epoch": 0.5181066438551468, + "grad_norm": 1.8711063892336894, + "learning_rate": 4.951505137207155e-06, + "loss": 0.0433, + "step": 1817 + }, + { + "epoch": 0.5183917878528657, + "grad_norm": 1.4693009707813043, + "learning_rate": 4.94688674502048e-06, + "loss": 0.0192, + "step": 1818 + }, + { + "epoch": 0.5186769318505845, + "grad_norm": 1.0043129018742638, + "learning_rate": 4.942268398153748e-06, + "loss": 0.032, + "step": 1819 + }, + { + "epoch": 0.5189620758483033, + "grad_norm": 0.9700760923720372, + "learning_rate": 4.937650100547656e-06, + "loss": 0.03, + "step": 1820 + }, + { + "epoch": 0.5192472198460223, + "grad_norm": 1.6874603824297643, + "learning_rate": 4.933031856142862e-06, + "loss": 0.0434, + "step": 1821 + }, + { + "epoch": 0.5195323638437411, + "grad_norm": 1.6134747702626453, + "learning_rate": 4.92841366887997e-06, + "loss": 0.0249, + "step": 1822 + }, + { + "epoch": 0.51981750784146, + "grad_norm": 0.7329551510874709, + "learning_rate": 4.9237955426995445e-06, + "loss": 0.0282, + "step": 1823 + }, + { + "epoch": 0.5201026518391788, + "grad_norm": 1.502060656484669, + "learning_rate": 4.919177481542094e-06, + "loss": 0.0248, + "step": 1824 + }, + { + "epoch": 0.5203877958368976, + "grad_norm": 1.0356299700200353, + "learning_rate": 4.914559489348068e-06, + "loss": 0.0133, + "step": 1825 + }, + { + "epoch": 0.5206729398346165, + "grad_norm": 1.3304571261943245, + "learning_rate": 4.909941570057861e-06, + "loss": 0.0259, + "step": 1826 + }, + { + "epoch": 0.5209580838323353, + "grad_norm": 0.8800083351648127, + "learning_rate": 4.905323727611807e-06, + "loss": 0.0253, + "step": 1827 + }, + { + "epoch": 0.5212432278300542, + "grad_norm": 0.3567933218332538, + "learning_rate": 4.900705965950172e-06, + "loss": 0.012, + "step": 1828 + }, + { + "epoch": 0.521528371827773, + "grad_norm": 0.8401451215832865, + "learning_rate": 4.896088289013153e-06, + "loss": 0.0191, + "step": 1829 + }, + { + "epoch": 0.5218135158254918, + "grad_norm": 1.1818507943017555, + "learning_rate": 4.891470700740872e-06, + "loss": 0.0253, + "step": 1830 + }, + { + "epoch": 0.5220986598232107, + "grad_norm": 0.9347321602247102, + "learning_rate": 4.886853205073382e-06, + "loss": 0.0204, + "step": 1831 + }, + { + "epoch": 0.5223838038209295, + "grad_norm": 1.2285692447409555, + "learning_rate": 4.882235805950651e-06, + "loss": 0.0204, + "step": 1832 + }, + { + "epoch": 0.5226689478186484, + "grad_norm": 0.43955725006957547, + "learning_rate": 4.877618507312568e-06, + "loss": 0.0174, + "step": 1833 + }, + { + "epoch": 0.5229540918163673, + "grad_norm": 0.6565153634012186, + "learning_rate": 4.873001313098937e-06, + "loss": 0.0135, + "step": 1834 + }, + { + "epoch": 0.5232392358140862, + "grad_norm": 1.113652633722711, + "learning_rate": 4.868384227249468e-06, + "loss": 0.0239, + "step": 1835 + }, + { + "epoch": 0.523524379811805, + "grad_norm": 1.2861091962055604, + "learning_rate": 4.8637672537037815e-06, + "loss": 0.0519, + "step": 1836 + }, + { + "epoch": 0.5238095238095238, + "grad_norm": 0.8292555437480343, + "learning_rate": 4.859150396401404e-06, + "loss": 0.0145, + "step": 1837 + }, + { + "epoch": 0.5240946678072427, + "grad_norm": 2.2543232111596896, + "learning_rate": 4.85453365928176e-06, + "loss": 0.0325, + "step": 1838 + }, + { + "epoch": 0.5243798118049615, + "grad_norm": 0.7480667232985087, + "learning_rate": 4.849917046284175e-06, + "loss": 0.0139, + "step": 1839 + }, + { + "epoch": 0.5246649558026804, + "grad_norm": 0.4608883072551659, + "learning_rate": 4.8453005613478635e-06, + "loss": 0.0082, + "step": 1840 + }, + { + "epoch": 0.5249500998003992, + "grad_norm": 0.9653852346298967, + "learning_rate": 4.8406842084119345e-06, + "loss": 0.0378, + "step": 1841 + }, + { + "epoch": 0.525235243798118, + "grad_norm": 0.6538953169532171, + "learning_rate": 4.8360679914153805e-06, + "loss": 0.0236, + "step": 1842 + }, + { + "epoch": 0.5255203877958369, + "grad_norm": 2.7130229909581063, + "learning_rate": 4.831451914297086e-06, + "loss": 0.0351, + "step": 1843 + }, + { + "epoch": 0.5258055317935557, + "grad_norm": 0.9988544260022847, + "learning_rate": 4.826835980995806e-06, + "loss": 0.0313, + "step": 1844 + }, + { + "epoch": 0.5260906757912746, + "grad_norm": 0.4820387580248128, + "learning_rate": 4.82222019545018e-06, + "loss": 0.0094, + "step": 1845 + }, + { + "epoch": 0.5263758197889934, + "grad_norm": 0.6020909214467295, + "learning_rate": 4.81760456159872e-06, + "loss": 0.0207, + "step": 1846 + }, + { + "epoch": 0.5266609637867123, + "grad_norm": 0.34297314496197695, + "learning_rate": 4.812989083379806e-06, + "loss": 0.0091, + "step": 1847 + }, + { + "epoch": 0.5269461077844312, + "grad_norm": 0.9821597455371975, + "learning_rate": 4.808373764731686e-06, + "loss": 0.026, + "step": 1848 + }, + { + "epoch": 0.52723125178215, + "grad_norm": 1.427547834808575, + "learning_rate": 4.8037586095924756e-06, + "loss": 0.04, + "step": 1849 + }, + { + "epoch": 0.5275163957798689, + "grad_norm": 0.9056240346340622, + "learning_rate": 4.799143621900146e-06, + "loss": 0.0355, + "step": 1850 + }, + { + "epoch": 0.5278015397775877, + "grad_norm": 1.603722643513683, + "learning_rate": 4.794528805592529e-06, + "loss": 0.0342, + "step": 1851 + }, + { + "epoch": 0.5280866837753065, + "grad_norm": 0.5955654585757876, + "learning_rate": 4.789914164607305e-06, + "loss": 0.0158, + "step": 1852 + }, + { + "epoch": 0.5283718277730254, + "grad_norm": 1.1955850884312524, + "learning_rate": 4.785299702882012e-06, + "loss": 0.0153, + "step": 1853 + }, + { + "epoch": 0.5286569717707442, + "grad_norm": 0.8638149177647332, + "learning_rate": 4.7806854243540304e-06, + "loss": 0.0283, + "step": 1854 + }, + { + "epoch": 0.5289421157684631, + "grad_norm": 0.9683679448303014, + "learning_rate": 4.776071332960586e-06, + "loss": 0.0224, + "step": 1855 + }, + { + "epoch": 0.5292272597661819, + "grad_norm": 1.0544906834955496, + "learning_rate": 4.771457432638744e-06, + "loss": 0.018, + "step": 1856 + }, + { + "epoch": 0.5295124037639007, + "grad_norm": 0.5133813970021956, + "learning_rate": 4.766843727325408e-06, + "loss": 0.0095, + "step": 1857 + }, + { + "epoch": 0.5297975477616196, + "grad_norm": 1.0106462976613688, + "learning_rate": 4.7622302209573105e-06, + "loss": 0.0304, + "step": 1858 + }, + { + "epoch": 0.5300826917593384, + "grad_norm": 1.4067320553021943, + "learning_rate": 4.757616917471021e-06, + "loss": 0.0326, + "step": 1859 + }, + { + "epoch": 0.5303678357570573, + "grad_norm": 0.9000475309193106, + "learning_rate": 4.753003820802933e-06, + "loss": 0.0154, + "step": 1860 + }, + { + "epoch": 0.5306529797547762, + "grad_norm": 0.7559939464533812, + "learning_rate": 4.748390934889261e-06, + "loss": 0.0252, + "step": 1861 + }, + { + "epoch": 0.530938123752495, + "grad_norm": 0.7567111911478112, + "learning_rate": 4.743778263666045e-06, + "loss": 0.0098, + "step": 1862 + }, + { + "epoch": 0.5312232677502139, + "grad_norm": 2.6084997004648582, + "learning_rate": 4.739165811069135e-06, + "loss": 0.0385, + "step": 1863 + }, + { + "epoch": 0.5315084117479327, + "grad_norm": 0.7239400307328033, + "learning_rate": 4.734553581034199e-06, + "loss": 0.0275, + "step": 1864 + }, + { + "epoch": 0.5317935557456516, + "grad_norm": 0.7103481052891866, + "learning_rate": 4.729941577496718e-06, + "loss": 0.0091, + "step": 1865 + }, + { + "epoch": 0.5320786997433704, + "grad_norm": 1.4097950329346804, + "learning_rate": 4.7253298043919725e-06, + "loss": 0.0234, + "step": 1866 + }, + { + "epoch": 0.5323638437410892, + "grad_norm": 1.7077434751311544, + "learning_rate": 4.72071826565505e-06, + "loss": 0.0292, + "step": 1867 + }, + { + "epoch": 0.5326489877388081, + "grad_norm": 0.5457004148411975, + "learning_rate": 4.71610696522084e-06, + "loss": 0.009, + "step": 1868 + }, + { + "epoch": 0.5329341317365269, + "grad_norm": 0.5406920476646435, + "learning_rate": 4.7114959070240254e-06, + "loss": 0.0079, + "step": 1869 + }, + { + "epoch": 0.5332192757342458, + "grad_norm": 1.8334680965862749, + "learning_rate": 4.706885094999082e-06, + "loss": 0.0344, + "step": 1870 + }, + { + "epoch": 0.5335044197319646, + "grad_norm": 1.011692063206279, + "learning_rate": 4.70227453308028e-06, + "loss": 0.0244, + "step": 1871 + }, + { + "epoch": 0.5337895637296834, + "grad_norm": 0.8936841628052329, + "learning_rate": 4.697664225201671e-06, + "loss": 0.0127, + "step": 1872 + }, + { + "epoch": 0.5340747077274023, + "grad_norm": 0.8113649868154692, + "learning_rate": 4.693054175297095e-06, + "loss": 0.0192, + "step": 1873 + }, + { + "epoch": 0.5343598517251212, + "grad_norm": 1.1137611632560478, + "learning_rate": 4.688444387300165e-06, + "loss": 0.0284, + "step": 1874 + }, + { + "epoch": 0.5346449957228401, + "grad_norm": 0.9679934453395643, + "learning_rate": 4.683834865144277e-06, + "loss": 0.0222, + "step": 1875 + }, + { + "epoch": 0.5349301397205589, + "grad_norm": 2.406256084642815, + "learning_rate": 4.679225612762598e-06, + "loss": 0.0469, + "step": 1876 + }, + { + "epoch": 0.5352152837182778, + "grad_norm": 1.9285676628551522, + "learning_rate": 4.6746166340880636e-06, + "loss": 0.0238, + "step": 1877 + }, + { + "epoch": 0.5355004277159966, + "grad_norm": 1.5021513223725813, + "learning_rate": 4.6700079330533775e-06, + "loss": 0.0436, + "step": 1878 + }, + { + "epoch": 0.5357855717137154, + "grad_norm": 1.0941741453384195, + "learning_rate": 4.665399513591008e-06, + "loss": 0.0329, + "step": 1879 + }, + { + "epoch": 0.5360707157114343, + "grad_norm": 0.39730754365810067, + "learning_rate": 4.660791379633175e-06, + "loss": 0.01, + "step": 1880 + }, + { + "epoch": 0.5363558597091531, + "grad_norm": 1.7227973412048632, + "learning_rate": 4.656183535111865e-06, + "loss": 0.0294, + "step": 1881 + }, + { + "epoch": 0.536641003706872, + "grad_norm": 0.4622647741414944, + "learning_rate": 4.651575983958813e-06, + "loss": 0.0105, + "step": 1882 + }, + { + "epoch": 0.5369261477045908, + "grad_norm": 0.4489009206942475, + "learning_rate": 4.646968730105503e-06, + "loss": 0.0092, + "step": 1883 + }, + { + "epoch": 0.5372112917023096, + "grad_norm": 0.6693708589940758, + "learning_rate": 4.6423617774831675e-06, + "loss": 0.0131, + "step": 1884 + }, + { + "epoch": 0.5374964357000285, + "grad_norm": 0.8512303652055226, + "learning_rate": 4.63775513002278e-06, + "loss": 0.0085, + "step": 1885 + }, + { + "epoch": 0.5377815796977473, + "grad_norm": 1.6322548013574654, + "learning_rate": 4.633148791655053e-06, + "loss": 0.0318, + "step": 1886 + }, + { + "epoch": 0.5380667236954663, + "grad_norm": 1.2405260613430487, + "learning_rate": 4.628542766310438e-06, + "loss": 0.0147, + "step": 1887 + }, + { + "epoch": 0.5383518676931851, + "grad_norm": 0.4807074519508227, + "learning_rate": 4.623937057919118e-06, + "loss": 0.0111, + "step": 1888 + }, + { + "epoch": 0.5386370116909039, + "grad_norm": 1.3954446874800244, + "learning_rate": 4.619331670411005e-06, + "loss": 0.0262, + "step": 1889 + }, + { + "epoch": 0.5389221556886228, + "grad_norm": 1.116115723360862, + "learning_rate": 4.61472660771574e-06, + "loss": 0.0225, + "step": 1890 + }, + { + "epoch": 0.5392072996863416, + "grad_norm": 0.5958119317926713, + "learning_rate": 4.610121873762681e-06, + "loss": 0.0177, + "step": 1891 + }, + { + "epoch": 0.5394924436840605, + "grad_norm": 1.299507428802239, + "learning_rate": 4.605517472480912e-06, + "loss": 0.0259, + "step": 1892 + }, + { + "epoch": 0.5397775876817793, + "grad_norm": 0.8638980441903176, + "learning_rate": 4.6009134077992276e-06, + "loss": 0.0165, + "step": 1893 + }, + { + "epoch": 0.5400627316794981, + "grad_norm": 1.549781320683181, + "learning_rate": 4.5963096836461405e-06, + "loss": 0.0241, + "step": 1894 + }, + { + "epoch": 0.540347875677217, + "grad_norm": 0.8766341228007742, + "learning_rate": 4.5917063039498705e-06, + "loss": 0.0163, + "step": 1895 + }, + { + "epoch": 0.5406330196749358, + "grad_norm": 0.7287071167783781, + "learning_rate": 4.587103272638339e-06, + "loss": 0.0062, + "step": 1896 + }, + { + "epoch": 0.5409181636726547, + "grad_norm": 0.8942627426261667, + "learning_rate": 4.582500593639178e-06, + "loss": 0.0124, + "step": 1897 + }, + { + "epoch": 0.5412033076703735, + "grad_norm": 0.9701042144339818, + "learning_rate": 4.5778982708797146e-06, + "loss": 0.0094, + "step": 1898 + }, + { + "epoch": 0.5414884516680923, + "grad_norm": 0.755399450943285, + "learning_rate": 4.573296308286971e-06, + "loss": 0.0141, + "step": 1899 + }, + { + "epoch": 0.5417735956658112, + "grad_norm": 0.7506799238469901, + "learning_rate": 4.568694709787665e-06, + "loss": 0.0257, + "step": 1900 + }, + { + "epoch": 0.5420587396635301, + "grad_norm": 1.2250538112210514, + "learning_rate": 4.564093479308205e-06, + "loss": 0.0179, + "step": 1901 + }, + { + "epoch": 0.542343883661249, + "grad_norm": 1.6755192524244813, + "learning_rate": 4.559492620774676e-06, + "loss": 0.0249, + "step": 1902 + }, + { + "epoch": 0.5426290276589678, + "grad_norm": 0.8966395601407366, + "learning_rate": 4.554892138112856e-06, + "loss": 0.0127, + "step": 1903 + }, + { + "epoch": 0.5429141716566867, + "grad_norm": 1.5035988745836084, + "learning_rate": 4.5502920352482004e-06, + "loss": 0.0439, + "step": 1904 + }, + { + "epoch": 0.5431993156544055, + "grad_norm": 1.7840819508041883, + "learning_rate": 4.545692316105835e-06, + "loss": 0.0239, + "step": 1905 + }, + { + "epoch": 0.5434844596521243, + "grad_norm": 0.31678794815398437, + "learning_rate": 4.541092984610566e-06, + "loss": 0.0033, + "step": 1906 + }, + { + "epoch": 0.5437696036498432, + "grad_norm": 1.0022488627050703, + "learning_rate": 4.536494044686863e-06, + "loss": 0.0187, + "step": 1907 + }, + { + "epoch": 0.544054747647562, + "grad_norm": 2.297509688971944, + "learning_rate": 4.531895500258861e-06, + "loss": 0.0299, + "step": 1908 + }, + { + "epoch": 0.5443398916452808, + "grad_norm": 1.5802445456531595, + "learning_rate": 4.5272973552503614e-06, + "loss": 0.0303, + "step": 1909 + }, + { + "epoch": 0.5446250356429997, + "grad_norm": 0.7996338326856399, + "learning_rate": 4.522699613584825e-06, + "loss": 0.0123, + "step": 1910 + }, + { + "epoch": 0.5449101796407185, + "grad_norm": 1.5678553036220793, + "learning_rate": 4.5181022791853625e-06, + "loss": 0.0278, + "step": 1911 + }, + { + "epoch": 0.5451953236384374, + "grad_norm": 1.2576223644296152, + "learning_rate": 4.513505355974747e-06, + "loss": 0.0215, + "step": 1912 + }, + { + "epoch": 0.5454804676361562, + "grad_norm": 2.7271023649179162, + "learning_rate": 4.50890884787539e-06, + "loss": 0.0357, + "step": 1913 + }, + { + "epoch": 0.5457656116338752, + "grad_norm": 0.5599793478699585, + "learning_rate": 4.5043127588093545e-06, + "loss": 0.0136, + "step": 1914 + }, + { + "epoch": 0.546050755631594, + "grad_norm": 1.134277118458878, + "learning_rate": 4.4997170926983465e-06, + "loss": 0.0126, + "step": 1915 + }, + { + "epoch": 0.5463358996293128, + "grad_norm": 1.7917881712046206, + "learning_rate": 4.495121853463709e-06, + "loss": 0.0273, + "step": 1916 + }, + { + "epoch": 0.5466210436270317, + "grad_norm": 2.0607418747445347, + "learning_rate": 4.490527045026423e-06, + "loss": 0.0351, + "step": 1917 + }, + { + "epoch": 0.5469061876247505, + "grad_norm": 0.4922356489606496, + "learning_rate": 4.4859326713070996e-06, + "loss": 0.008, + "step": 1918 + }, + { + "epoch": 0.5471913316224694, + "grad_norm": 0.9315432349848304, + "learning_rate": 4.481338736225978e-06, + "loss": 0.0252, + "step": 1919 + }, + { + "epoch": 0.5474764756201882, + "grad_norm": 1.2682141717774775, + "learning_rate": 4.476745243702927e-06, + "loss": 0.0212, + "step": 1920 + }, + { + "epoch": 0.547761619617907, + "grad_norm": 0.850819395547047, + "learning_rate": 4.472152197657434e-06, + "loss": 0.0206, + "step": 1921 + }, + { + "epoch": 0.5480467636156259, + "grad_norm": 0.8083997376498508, + "learning_rate": 4.46755960200861e-06, + "loss": 0.0197, + "step": 1922 + }, + { + "epoch": 0.5483319076133447, + "grad_norm": 1.4339268897454358, + "learning_rate": 4.462967460675178e-06, + "loss": 0.0295, + "step": 1923 + }, + { + "epoch": 0.5486170516110636, + "grad_norm": 2.1314141503209276, + "learning_rate": 4.458375777575473e-06, + "loss": 0.056, + "step": 1924 + }, + { + "epoch": 0.5489021956087824, + "grad_norm": 1.7031785767526983, + "learning_rate": 4.453784556627439e-06, + "loss": 0.0476, + "step": 1925 + }, + { + "epoch": 0.5491873396065012, + "grad_norm": 1.2263205466527023, + "learning_rate": 4.449193801748629e-06, + "loss": 0.016, + "step": 1926 + }, + { + "epoch": 0.5494724836042202, + "grad_norm": 0.7947498299549632, + "learning_rate": 4.4446035168561955e-06, + "loss": 0.0155, + "step": 1927 + }, + { + "epoch": 0.549757627601939, + "grad_norm": 0.4966588434774226, + "learning_rate": 4.440013705866895e-06, + "loss": 0.0085, + "step": 1928 + }, + { + "epoch": 0.5500427715996579, + "grad_norm": 1.6419794017402862, + "learning_rate": 4.435424372697066e-06, + "loss": 0.0344, + "step": 1929 + }, + { + "epoch": 0.5503279155973767, + "grad_norm": 0.8164655165099096, + "learning_rate": 4.430835521262656e-06, + "loss": 0.0304, + "step": 1930 + }, + { + "epoch": 0.5506130595950955, + "grad_norm": 1.6944102182051353, + "learning_rate": 4.4262471554791895e-06, + "loss": 0.0395, + "step": 1931 + }, + { + "epoch": 0.5508982035928144, + "grad_norm": 1.7309887958840326, + "learning_rate": 4.421659279261785e-06, + "loss": 0.0425, + "step": 1932 + }, + { + "epoch": 0.5511833475905332, + "grad_norm": 2.5273881172214496, + "learning_rate": 4.417071896525138e-06, + "loss": 0.0521, + "step": 1933 + }, + { + "epoch": 0.5514684915882521, + "grad_norm": 3.4173362033274346, + "learning_rate": 4.412485011183525e-06, + "loss": 0.0638, + "step": 1934 + }, + { + "epoch": 0.5517536355859709, + "grad_norm": 0.9743219257548228, + "learning_rate": 4.407898627150795e-06, + "loss": 0.0186, + "step": 1935 + }, + { + "epoch": 0.5520387795836897, + "grad_norm": 2.349422676591217, + "learning_rate": 4.403312748340373e-06, + "loss": 0.0569, + "step": 1936 + }, + { + "epoch": 0.5523239235814086, + "grad_norm": 0.53449757299917, + "learning_rate": 4.398727378665252e-06, + "loss": 0.0097, + "step": 1937 + }, + { + "epoch": 0.5526090675791274, + "grad_norm": 1.0831620617986117, + "learning_rate": 4.394142522037991e-06, + "loss": 0.0263, + "step": 1938 + }, + { + "epoch": 0.5528942115768463, + "grad_norm": 1.383221056170318, + "learning_rate": 4.38955818237071e-06, + "loss": 0.0337, + "step": 1939 + }, + { + "epoch": 0.5531793555745651, + "grad_norm": 1.6813526071732885, + "learning_rate": 4.384974363575086e-06, + "loss": 0.0291, + "step": 1940 + }, + { + "epoch": 0.553464499572284, + "grad_norm": 0.41544282220199685, + "learning_rate": 4.3803910695623555e-06, + "loss": 0.017, + "step": 1941 + }, + { + "epoch": 0.5537496435700029, + "grad_norm": 2.098566976792298, + "learning_rate": 4.375808304243305e-06, + "loss": 0.0207, + "step": 1942 + }, + { + "epoch": 0.5540347875677217, + "grad_norm": 0.5784708780835224, + "learning_rate": 4.3712260715282716e-06, + "loss": 0.007, + "step": 1943 + }, + { + "epoch": 0.5543199315654406, + "grad_norm": 1.2225541332263947, + "learning_rate": 4.3666443753271355e-06, + "loss": 0.0319, + "step": 1944 + }, + { + "epoch": 0.5546050755631594, + "grad_norm": 1.08491250220113, + "learning_rate": 4.362063219549323e-06, + "loss": 0.0229, + "step": 1945 + }, + { + "epoch": 0.5548902195608783, + "grad_norm": 0.6817244520114021, + "learning_rate": 4.3574826081037935e-06, + "loss": 0.0214, + "step": 1946 + }, + { + "epoch": 0.5551753635585971, + "grad_norm": 1.1664569501875723, + "learning_rate": 4.352902544899044e-06, + "loss": 0.0229, + "step": 1947 + }, + { + "epoch": 0.5554605075563159, + "grad_norm": 1.293836452879822, + "learning_rate": 4.34832303384311e-06, + "loss": 0.0156, + "step": 1948 + }, + { + "epoch": 0.5557456515540348, + "grad_norm": 0.8022586547523411, + "learning_rate": 4.343744078843545e-06, + "loss": 0.0338, + "step": 1949 + }, + { + "epoch": 0.5560307955517536, + "grad_norm": 0.6083879507772137, + "learning_rate": 4.33916568380744e-06, + "loss": 0.0184, + "step": 1950 + }, + { + "epoch": 0.5563159395494724, + "grad_norm": 0.7357572185407785, + "learning_rate": 4.334587852641396e-06, + "loss": 0.0127, + "step": 1951 + }, + { + "epoch": 0.5566010835471913, + "grad_norm": 0.7103951655209286, + "learning_rate": 4.330010589251541e-06, + "loss": 0.0137, + "step": 1952 + }, + { + "epoch": 0.5568862275449101, + "grad_norm": 2.0436097540893847, + "learning_rate": 4.325433897543516e-06, + "loss": 0.0439, + "step": 1953 + }, + { + "epoch": 0.5571713715426291, + "grad_norm": 1.0347939995255582, + "learning_rate": 4.320857781422476e-06, + "loss": 0.0144, + "step": 1954 + }, + { + "epoch": 0.5574565155403479, + "grad_norm": 1.2205245382671843, + "learning_rate": 4.31628224479308e-06, + "loss": 0.0343, + "step": 1955 + }, + { + "epoch": 0.5577416595380668, + "grad_norm": 0.48096511602458747, + "learning_rate": 4.311707291559502e-06, + "loss": 0.0151, + "step": 1956 + }, + { + "epoch": 0.5580268035357856, + "grad_norm": 1.858218874925223, + "learning_rate": 4.307132925625405e-06, + "loss": 0.024, + "step": 1957 + }, + { + "epoch": 0.5583119475335044, + "grad_norm": 0.41558860469055026, + "learning_rate": 4.302559150893962e-06, + "loss": 0.0081, + "step": 1958 + }, + { + "epoch": 0.5585970915312233, + "grad_norm": 0.6185726461872646, + "learning_rate": 4.297985971267836e-06, + "loss": 0.0148, + "step": 1959 + }, + { + "epoch": 0.5588822355289421, + "grad_norm": 1.629041987647595, + "learning_rate": 4.293413390649186e-06, + "loss": 0.0382, + "step": 1960 + }, + { + "epoch": 0.559167379526661, + "grad_norm": 1.1013738682804133, + "learning_rate": 4.288841412939656e-06, + "loss": 0.0123, + "step": 1961 + }, + { + "epoch": 0.5594525235243798, + "grad_norm": 0.7207246603680382, + "learning_rate": 4.284270042040377e-06, + "loss": 0.0092, + "step": 1962 + }, + { + "epoch": 0.5597376675220986, + "grad_norm": 1.3146322345300236, + "learning_rate": 4.279699281851962e-06, + "loss": 0.0497, + "step": 1963 + }, + { + "epoch": 0.5600228115198175, + "grad_norm": 1.3720103682911193, + "learning_rate": 4.275129136274505e-06, + "loss": 0.02, + "step": 1964 + }, + { + "epoch": 0.5603079555175363, + "grad_norm": 0.4413728823645133, + "learning_rate": 4.270559609207572e-06, + "loss": 0.0068, + "step": 1965 + }, + { + "epoch": 0.5605930995152552, + "grad_norm": 1.042243710858581, + "learning_rate": 4.265990704550204e-06, + "loss": 0.0202, + "step": 1966 + }, + { + "epoch": 0.5608782435129741, + "grad_norm": 1.6738885287470393, + "learning_rate": 4.261422426200912e-06, + "loss": 0.05, + "step": 1967 + }, + { + "epoch": 0.561163387510693, + "grad_norm": 0.6925766021391758, + "learning_rate": 4.256854778057668e-06, + "loss": 0.0084, + "step": 1968 + }, + { + "epoch": 0.5614485315084118, + "grad_norm": 1.3643708719882088, + "learning_rate": 4.252287764017908e-06, + "loss": 0.0386, + "step": 1969 + }, + { + "epoch": 0.5617336755061306, + "grad_norm": 0.5506155217176552, + "learning_rate": 4.24772138797853e-06, + "loss": 0.0084, + "step": 1970 + }, + { + "epoch": 0.5620188195038495, + "grad_norm": 0.555170139364492, + "learning_rate": 4.243155653835886e-06, + "loss": 0.0118, + "step": 1971 + }, + { + "epoch": 0.5623039635015683, + "grad_norm": 0.9763249846495575, + "learning_rate": 4.238590565485779e-06, + "loss": 0.0139, + "step": 1972 + }, + { + "epoch": 0.5625891074992871, + "grad_norm": 0.7772741042704194, + "learning_rate": 4.23402612682346e-06, + "loss": 0.0125, + "step": 1973 + }, + { + "epoch": 0.562874251497006, + "grad_norm": 0.40993352316801507, + "learning_rate": 4.2294623417436284e-06, + "loss": 0.0057, + "step": 1974 + }, + { + "epoch": 0.5631593954947248, + "grad_norm": 2.595191318759739, + "learning_rate": 4.224899214140425e-06, + "loss": 0.0483, + "step": 1975 + }, + { + "epoch": 0.5634445394924437, + "grad_norm": 1.0047435454745783, + "learning_rate": 4.220336747907431e-06, + "loss": 0.0387, + "step": 1976 + }, + { + "epoch": 0.5637296834901625, + "grad_norm": 0.8252660492184049, + "learning_rate": 4.2157749469376594e-06, + "loss": 0.0179, + "step": 1977 + }, + { + "epoch": 0.5640148274878813, + "grad_norm": 1.7631265123768307, + "learning_rate": 4.21121381512356e-06, + "loss": 0.0259, + "step": 1978 + }, + { + "epoch": 0.5642999714856002, + "grad_norm": 1.1505145111500783, + "learning_rate": 4.206653356357007e-06, + "loss": 0.0294, + "step": 1979 + }, + { + "epoch": 0.564585115483319, + "grad_norm": 1.4455244317990026, + "learning_rate": 4.202093574529305e-06, + "loss": 0.0312, + "step": 1980 + }, + { + "epoch": 0.564870259481038, + "grad_norm": 0.8626915090427404, + "learning_rate": 4.197534473531177e-06, + "loss": 0.0276, + "step": 1981 + }, + { + "epoch": 0.5651554034787568, + "grad_norm": 1.5858817877250049, + "learning_rate": 4.192976057252768e-06, + "loss": 0.0192, + "step": 1982 + }, + { + "epoch": 0.5654405474764757, + "grad_norm": 1.410131591822338, + "learning_rate": 4.18841832958364e-06, + "loss": 0.0247, + "step": 1983 + }, + { + "epoch": 0.5657256914741945, + "grad_norm": 0.7445170622774078, + "learning_rate": 4.18386129441276e-06, + "loss": 0.0121, + "step": 1984 + }, + { + "epoch": 0.5660108354719133, + "grad_norm": 0.8809924184189645, + "learning_rate": 4.179304955628511e-06, + "loss": 0.019, + "step": 1985 + }, + { + "epoch": 0.5662959794696322, + "grad_norm": 1.5083664563446835, + "learning_rate": 4.174749317118683e-06, + "loss": 0.0261, + "step": 1986 + }, + { + "epoch": 0.566581123467351, + "grad_norm": 2.2901675873569656, + "learning_rate": 4.170194382770462e-06, + "loss": 0.0279, + "step": 1987 + }, + { + "epoch": 0.5668662674650699, + "grad_norm": 1.5826055506344388, + "learning_rate": 4.165640156470436e-06, + "loss": 0.0337, + "step": 1988 + }, + { + "epoch": 0.5671514114627887, + "grad_norm": 1.486333132761605, + "learning_rate": 4.161086642104594e-06, + "loss": 0.0147, + "step": 1989 + }, + { + "epoch": 0.5674365554605075, + "grad_norm": 1.2557151112171123, + "learning_rate": 4.156533843558309e-06, + "loss": 0.0224, + "step": 1990 + }, + { + "epoch": 0.5677216994582264, + "grad_norm": 0.682005313725667, + "learning_rate": 4.151981764716347e-06, + "loss": 0.0079, + "step": 1991 + }, + { + "epoch": 0.5680068434559452, + "grad_norm": 2.0989444982377665, + "learning_rate": 4.147430409462863e-06, + "loss": 0.0654, + "step": 1992 + }, + { + "epoch": 0.568291987453664, + "grad_norm": 1.3172125697333474, + "learning_rate": 4.142879781681389e-06, + "loss": 0.0382, + "step": 1993 + }, + { + "epoch": 0.568577131451383, + "grad_norm": 1.0200075411284906, + "learning_rate": 4.138329885254842e-06, + "loss": 0.0155, + "step": 1994 + }, + { + "epoch": 0.5688622754491018, + "grad_norm": 1.6810817501742972, + "learning_rate": 4.133780724065508e-06, + "loss": 0.0284, + "step": 1995 + }, + { + "epoch": 0.5691474194468207, + "grad_norm": 0.3058438071030809, + "learning_rate": 4.1292323019950515e-06, + "loss": 0.0057, + "step": 1996 + }, + { + "epoch": 0.5694325634445395, + "grad_norm": 3.2345016887299174, + "learning_rate": 4.124684622924503e-06, + "loss": 0.0603, + "step": 1997 + }, + { + "epoch": 0.5697177074422584, + "grad_norm": 0.25562993510263476, + "learning_rate": 4.120137690734264e-06, + "loss": 0.0058, + "step": 1998 + }, + { + "epoch": 0.5700028514399772, + "grad_norm": 0.9607045079360945, + "learning_rate": 4.115591509304092e-06, + "loss": 0.019, + "step": 1999 + }, + { + "epoch": 0.570287995437696, + "grad_norm": 0.6007173797861988, + "learning_rate": 4.111046082513109e-06, + "loss": 0.0082, + "step": 2000 + }, + { + "epoch": 0.5705731394354149, + "grad_norm": 0.8863429154698693, + "learning_rate": 4.106501414239787e-06, + "loss": 0.0118, + "step": 2001 + }, + { + "epoch": 0.5708582834331337, + "grad_norm": 1.5083003179196242, + "learning_rate": 4.1019575083619595e-06, + "loss": 0.0158, + "step": 2002 + }, + { + "epoch": 0.5711434274308526, + "grad_norm": 0.7391504005003044, + "learning_rate": 4.097414368756803e-06, + "loss": 0.0088, + "step": 2003 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.6832287705746387, + "learning_rate": 4.092871999300841e-06, + "loss": 0.0094, + "step": 2004 + }, + { + "epoch": 0.5717137154262902, + "grad_norm": 1.6772859676317076, + "learning_rate": 4.088330403869943e-06, + "loss": 0.0478, + "step": 2005 + }, + { + "epoch": 0.5719988594240091, + "grad_norm": 1.0306291075921608, + "learning_rate": 4.083789586339315e-06, + "loss": 0.0237, + "step": 2006 + }, + { + "epoch": 0.572284003421728, + "grad_norm": 1.184619272802862, + "learning_rate": 4.079249550583498e-06, + "loss": 0.0283, + "step": 2007 + }, + { + "epoch": 0.5725691474194469, + "grad_norm": 1.4096930747734462, + "learning_rate": 4.074710300476372e-06, + "loss": 0.014, + "step": 2008 + }, + { + "epoch": 0.5728542914171657, + "grad_norm": 0.783881045789122, + "learning_rate": 4.07017183989114e-06, + "loss": 0.0115, + "step": 2009 + }, + { + "epoch": 0.5731394354148845, + "grad_norm": 0.30172775611238883, + "learning_rate": 4.065634172700335e-06, + "loss": 0.0045, + "step": 2010 + }, + { + "epoch": 0.5734245794126034, + "grad_norm": 0.6976220498401472, + "learning_rate": 4.0610973027758124e-06, + "loss": 0.0272, + "step": 2011 + }, + { + "epoch": 0.5737097234103222, + "grad_norm": 1.1052303028320305, + "learning_rate": 4.0565612339887455e-06, + "loss": 0.017, + "step": 2012 + }, + { + "epoch": 0.5739948674080411, + "grad_norm": 1.7404973231076455, + "learning_rate": 4.052025970209626e-06, + "loss": 0.0392, + "step": 2013 + }, + { + "epoch": 0.5742800114057599, + "grad_norm": 0.674052952485932, + "learning_rate": 4.047491515308259e-06, + "loss": 0.0082, + "step": 2014 + }, + { + "epoch": 0.5745651554034787, + "grad_norm": 0.7117202921993683, + "learning_rate": 4.042957873153758e-06, + "loss": 0.0178, + "step": 2015 + }, + { + "epoch": 0.5748502994011976, + "grad_norm": 1.1146848695110856, + "learning_rate": 4.038425047614545e-06, + "loss": 0.0295, + "step": 2016 + }, + { + "epoch": 0.5751354433989164, + "grad_norm": 0.4209229851025855, + "learning_rate": 4.033893042558341e-06, + "loss": 0.0121, + "step": 2017 + }, + { + "epoch": 0.5754205873966353, + "grad_norm": 0.48277471651577564, + "learning_rate": 4.029361861852171e-06, + "loss": 0.0069, + "step": 2018 + }, + { + "epoch": 0.5757057313943541, + "grad_norm": 1.7222798865166078, + "learning_rate": 4.024831509362357e-06, + "loss": 0.0298, + "step": 2019 + }, + { + "epoch": 0.5759908753920729, + "grad_norm": 1.0337635489366164, + "learning_rate": 4.020301988954511e-06, + "loss": 0.0159, + "step": 2020 + }, + { + "epoch": 0.5762760193897919, + "grad_norm": 1.3775357400508321, + "learning_rate": 4.015773304493536e-06, + "loss": 0.0158, + "step": 2021 + }, + { + "epoch": 0.5765611633875107, + "grad_norm": 0.5083277988878048, + "learning_rate": 4.011245459843625e-06, + "loss": 0.0067, + "step": 2022 + }, + { + "epoch": 0.5768463073852296, + "grad_norm": 0.5286132139270061, + "learning_rate": 4.006718458868248e-06, + "loss": 0.0062, + "step": 2023 + }, + { + "epoch": 0.5771314513829484, + "grad_norm": 1.1038055454886817, + "learning_rate": 4.002192305430162e-06, + "loss": 0.0129, + "step": 2024 + }, + { + "epoch": 0.5774165953806673, + "grad_norm": 1.1714514779633876, + "learning_rate": 3.997667003391397e-06, + "loss": 0.031, + "step": 2025 + }, + { + "epoch": 0.5777017393783861, + "grad_norm": 1.7218432058203235, + "learning_rate": 3.993142556613255e-06, + "loss": 0.0407, + "step": 2026 + }, + { + "epoch": 0.5779868833761049, + "grad_norm": 1.3615661214766888, + "learning_rate": 3.988618968956315e-06, + "loss": 0.0442, + "step": 2027 + }, + { + "epoch": 0.5782720273738238, + "grad_norm": 1.9624857942185958, + "learning_rate": 3.9840962442804145e-06, + "loss": 0.0478, + "step": 2028 + }, + { + "epoch": 0.5785571713715426, + "grad_norm": 0.4209016778937654, + "learning_rate": 3.97957438644466e-06, + "loss": 0.0074, + "step": 2029 + }, + { + "epoch": 0.5788423153692615, + "grad_norm": 2.0516788767101795, + "learning_rate": 3.975053399307416e-06, + "loss": 0.0409, + "step": 2030 + }, + { + "epoch": 0.5791274593669803, + "grad_norm": 0.8532747901837053, + "learning_rate": 3.970533286726306e-06, + "loss": 0.0157, + "step": 2031 + }, + { + "epoch": 0.5794126033646991, + "grad_norm": 0.8877673247289528, + "learning_rate": 3.966014052558206e-06, + "loss": 0.0167, + "step": 2032 + }, + { + "epoch": 0.579697747362418, + "grad_norm": 1.0452094981847408, + "learning_rate": 3.961495700659243e-06, + "loss": 0.0177, + "step": 2033 + }, + { + "epoch": 0.5799828913601369, + "grad_norm": 2.336128746094747, + "learning_rate": 3.956978234884791e-06, + "loss": 0.0511, + "step": 2034 + }, + { + "epoch": 0.5802680353578558, + "grad_norm": 1.3293832741262577, + "learning_rate": 3.952461659089467e-06, + "loss": 0.0206, + "step": 2035 + }, + { + "epoch": 0.5805531793555746, + "grad_norm": 3.2429992646007246, + "learning_rate": 3.947945977127129e-06, + "loss": 0.0435, + "step": 2036 + }, + { + "epoch": 0.5808383233532934, + "grad_norm": 1.4889692622010537, + "learning_rate": 3.943431192850874e-06, + "loss": 0.0168, + "step": 2037 + }, + { + "epoch": 0.5811234673510123, + "grad_norm": 0.7779287362969187, + "learning_rate": 3.93891731011303e-06, + "loss": 0.0081, + "step": 2038 + }, + { + "epoch": 0.5814086113487311, + "grad_norm": 1.1967880354641325, + "learning_rate": 3.934404332765161e-06, + "loss": 0.0154, + "step": 2039 + }, + { + "epoch": 0.58169375534645, + "grad_norm": 1.1772392533629603, + "learning_rate": 3.929892264658052e-06, + "loss": 0.0269, + "step": 2040 + }, + { + "epoch": 0.5819788993441688, + "grad_norm": 1.8826797992290683, + "learning_rate": 3.925381109641715e-06, + "loss": 0.0459, + "step": 2041 + }, + { + "epoch": 0.5822640433418876, + "grad_norm": 0.8638860916601324, + "learning_rate": 3.9208708715653825e-06, + "loss": 0.0174, + "step": 2042 + }, + { + "epoch": 0.5825491873396065, + "grad_norm": 1.0986393666105592, + "learning_rate": 3.916361554277508e-06, + "loss": 0.019, + "step": 2043 + }, + { + "epoch": 0.5828343313373253, + "grad_norm": 0.9183125572551752, + "learning_rate": 3.911853161625756e-06, + "loss": 0.0183, + "step": 2044 + }, + { + "epoch": 0.5831194753350442, + "grad_norm": 1.9954417238979274, + "learning_rate": 3.9073456974570004e-06, + "loss": 0.0218, + "step": 2045 + }, + { + "epoch": 0.583404619332763, + "grad_norm": 0.6191794329765098, + "learning_rate": 3.902839165617323e-06, + "loss": 0.014, + "step": 2046 + }, + { + "epoch": 0.583689763330482, + "grad_norm": 1.4231403686313335, + "learning_rate": 3.898333569952018e-06, + "loss": 0.0296, + "step": 2047 + }, + { + "epoch": 0.5839749073282008, + "grad_norm": 0.8776537238404536, + "learning_rate": 3.89382891430557e-06, + "loss": 0.0083, + "step": 2048 + }, + { + "epoch": 0.5842600513259196, + "grad_norm": 0.8708883202320266, + "learning_rate": 3.889325202521668e-06, + "loss": 0.0202, + "step": 2049 + }, + { + "epoch": 0.5845451953236385, + "grad_norm": 0.8831775072235944, + "learning_rate": 3.884822438443196e-06, + "loss": 0.024, + "step": 2050 + }, + { + "epoch": 0.5848303393213573, + "grad_norm": 2.2216278743868196, + "learning_rate": 3.880320625912224e-06, + "loss": 0.0326, + "step": 2051 + }, + { + "epoch": 0.5851154833190761, + "grad_norm": 0.6026223557249291, + "learning_rate": 3.875819768770012e-06, + "loss": 0.0197, + "step": 2052 + }, + { + "epoch": 0.585400627316795, + "grad_norm": 1.4057182511451267, + "learning_rate": 3.87131987085701e-06, + "loss": 0.028, + "step": 2053 + }, + { + "epoch": 0.5856857713145138, + "grad_norm": 1.487605532299638, + "learning_rate": 3.866820936012844e-06, + "loss": 0.0113, + "step": 2054 + }, + { + "epoch": 0.5859709153122327, + "grad_norm": 1.4068652241281197, + "learning_rate": 3.862322968076322e-06, + "loss": 0.0553, + "step": 2055 + }, + { + "epoch": 0.5862560593099515, + "grad_norm": 0.6717247126069004, + "learning_rate": 3.857825970885422e-06, + "loss": 0.0162, + "step": 2056 + }, + { + "epoch": 0.5865412033076703, + "grad_norm": 0.2160215600154683, + "learning_rate": 3.853329948277297e-06, + "loss": 0.0048, + "step": 2057 + }, + { + "epoch": 0.5868263473053892, + "grad_norm": 0.7554944197765426, + "learning_rate": 3.848834904088268e-06, + "loss": 0.0157, + "step": 2058 + }, + { + "epoch": 0.587111491303108, + "grad_norm": 0.893784839410386, + "learning_rate": 3.844340842153823e-06, + "loss": 0.0186, + "step": 2059 + }, + { + "epoch": 0.5873966353008269, + "grad_norm": 0.742558705028012, + "learning_rate": 3.839847766308607e-06, + "loss": 0.0158, + "step": 2060 + }, + { + "epoch": 0.5876817792985458, + "grad_norm": 1.8275272220434577, + "learning_rate": 3.8353556803864324e-06, + "loss": 0.0421, + "step": 2061 + }, + { + "epoch": 0.5879669232962647, + "grad_norm": 0.4661961963077854, + "learning_rate": 3.830864588220253e-06, + "loss": 0.0086, + "step": 2062 + }, + { + "epoch": 0.5882520672939835, + "grad_norm": 0.4600167908756378, + "learning_rate": 3.826374493642187e-06, + "loss": 0.0143, + "step": 2063 + }, + { + "epoch": 0.5885372112917023, + "grad_norm": 0.9455308154957766, + "learning_rate": 3.821885400483497e-06, + "loss": 0.021, + "step": 2064 + }, + { + "epoch": 0.5888223552894212, + "grad_norm": 0.7579095990471869, + "learning_rate": 3.817397312574592e-06, + "loss": 0.0143, + "step": 2065 + }, + { + "epoch": 0.58910749928714, + "grad_norm": 1.4968058315352246, + "learning_rate": 3.8129102337450236e-06, + "loss": 0.0333, + "step": 2066 + }, + { + "epoch": 0.5893926432848589, + "grad_norm": 1.1681439442987638, + "learning_rate": 3.8084241678234777e-06, + "loss": 0.0183, + "step": 2067 + }, + { + "epoch": 0.5896777872825777, + "grad_norm": 1.0976297564753572, + "learning_rate": 3.8039391186377806e-06, + "loss": 0.0132, + "step": 2068 + }, + { + "epoch": 0.5899629312802965, + "grad_norm": 1.8400386955643093, + "learning_rate": 3.7994550900148914e-06, + "loss": 0.0233, + "step": 2069 + }, + { + "epoch": 0.5902480752780154, + "grad_norm": 0.8935945500146714, + "learning_rate": 3.7949720857808953e-06, + "loss": 0.0119, + "step": 2070 + }, + { + "epoch": 0.5905332192757342, + "grad_norm": 1.0682714756646516, + "learning_rate": 3.7904901097610074e-06, + "loss": 0.0108, + "step": 2071 + }, + { + "epoch": 0.590818363273453, + "grad_norm": 0.9744085122592351, + "learning_rate": 3.786009165779563e-06, + "loss": 0.0099, + "step": 2072 + }, + { + "epoch": 0.5911035072711719, + "grad_norm": 1.3356235518474884, + "learning_rate": 3.7815292576600143e-06, + "loss": 0.0374, + "step": 2073 + }, + { + "epoch": 0.5913886512688908, + "grad_norm": 2.052387547798954, + "learning_rate": 3.777050389224933e-06, + "loss": 0.0582, + "step": 2074 + }, + { + "epoch": 0.5916737952666097, + "grad_norm": 1.021714477749963, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.0175, + "step": 2075 + }, + { + "epoch": 0.5919589392643285, + "grad_norm": 0.6765650754941559, + "learning_rate": 3.768095786694021e-06, + "loss": 0.0131, + "step": 2076 + }, + { + "epoch": 0.5922440832620474, + "grad_norm": 0.5186344994748767, + "learning_rate": 3.7636200602388855e-06, + "loss": 0.0061, + "step": 2077 + }, + { + "epoch": 0.5925292272597662, + "grad_norm": 0.6496702410485596, + "learning_rate": 3.759145388749595e-06, + "loss": 0.008, + "step": 2078 + }, + { + "epoch": 0.592814371257485, + "grad_norm": 1.003400177949479, + "learning_rate": 3.7546717760442565e-06, + "loss": 0.0215, + "step": 2079 + }, + { + "epoch": 0.5930995152552039, + "grad_norm": 1.4201821861215709, + "learning_rate": 3.7501992259400665e-06, + "loss": 0.0193, + "step": 2080 + }, + { + "epoch": 0.5933846592529227, + "grad_norm": 1.7584435403151972, + "learning_rate": 3.7457277422533213e-06, + "loss": 0.03, + "step": 2081 + }, + { + "epoch": 0.5936698032506416, + "grad_norm": 1.1121101189301186, + "learning_rate": 3.7412573287994e-06, + "loss": 0.0168, + "step": 2082 + }, + { + "epoch": 0.5939549472483604, + "grad_norm": 0.9801459726894103, + "learning_rate": 3.7367879893927763e-06, + "loss": 0.0236, + "step": 2083 + }, + { + "epoch": 0.5942400912460792, + "grad_norm": 0.9674593323408074, + "learning_rate": 3.732319727846998e-06, + "loss": 0.0179, + "step": 2084 + }, + { + "epoch": 0.5945252352437981, + "grad_norm": 1.222026333353283, + "learning_rate": 3.7278525479747014e-06, + "loss": 0.0365, + "step": 2085 + }, + { + "epoch": 0.5948103792415169, + "grad_norm": 2.025476959542898, + "learning_rate": 3.7233864535875953e-06, + "loss": 0.0373, + "step": 2086 + }, + { + "epoch": 0.5950955232392359, + "grad_norm": 1.0776536635630782, + "learning_rate": 3.7189214484964663e-06, + "loss": 0.0215, + "step": 2087 + }, + { + "epoch": 0.5953806672369547, + "grad_norm": 1.1705278143136957, + "learning_rate": 3.7144575365111677e-06, + "loss": 0.0114, + "step": 2088 + }, + { + "epoch": 0.5956658112346735, + "grad_norm": 2.401216532684097, + "learning_rate": 3.7099947214406217e-06, + "loss": 0.032, + "step": 2089 + }, + { + "epoch": 0.5959509552323924, + "grad_norm": 1.6998711218014348, + "learning_rate": 3.705533007092812e-06, + "loss": 0.0462, + "step": 2090 + }, + { + "epoch": 0.5962360992301112, + "grad_norm": 1.5985776456990062, + "learning_rate": 3.7010723972747887e-06, + "loss": 0.045, + "step": 2091 + }, + { + "epoch": 0.5965212432278301, + "grad_norm": 0.22163354454740036, + "learning_rate": 3.6966128957926528e-06, + "loss": 0.0035, + "step": 2092 + }, + { + "epoch": 0.5968063872255489, + "grad_norm": 0.860485640046081, + "learning_rate": 3.6921545064515667e-06, + "loss": 0.0232, + "step": 2093 + }, + { + "epoch": 0.5970915312232677, + "grad_norm": 0.19848014052903284, + "learning_rate": 3.6876972330557393e-06, + "loss": 0.0023, + "step": 2094 + }, + { + "epoch": 0.5973766752209866, + "grad_norm": 1.194915635682168, + "learning_rate": 3.6832410794084255e-06, + "loss": 0.0232, + "step": 2095 + }, + { + "epoch": 0.5976618192187054, + "grad_norm": 1.6969899400898916, + "learning_rate": 3.6787860493119274e-06, + "loss": 0.0479, + "step": 2096 + }, + { + "epoch": 0.5979469632164243, + "grad_norm": 0.6961043811511708, + "learning_rate": 3.6743321465675918e-06, + "loss": 0.0059, + "step": 2097 + }, + { + "epoch": 0.5982321072141431, + "grad_norm": 1.5504097165767623, + "learning_rate": 3.669879374975796e-06, + "loss": 0.0533, + "step": 2098 + }, + { + "epoch": 0.598517251211862, + "grad_norm": 0.6225399533823082, + "learning_rate": 3.6654277383359613e-06, + "loss": 0.0241, + "step": 2099 + }, + { + "epoch": 0.5988023952095808, + "grad_norm": 0.8199738615224776, + "learning_rate": 3.6609772404465293e-06, + "loss": 0.0153, + "step": 2100 + }, + { + "epoch": 0.5990875392072997, + "grad_norm": 2.6367146702980135, + "learning_rate": 3.6565278851049803e-06, + "loss": 0.0232, + "step": 2101 + }, + { + "epoch": 0.5993726832050186, + "grad_norm": 2.218376029082567, + "learning_rate": 3.6520796761078126e-06, + "loss": 0.0264, + "step": 2102 + }, + { + "epoch": 0.5996578272027374, + "grad_norm": 0.42063773687844663, + "learning_rate": 3.6476326172505516e-06, + "loss": 0.0077, + "step": 2103 + }, + { + "epoch": 0.5999429712004563, + "grad_norm": 1.4846079891375457, + "learning_rate": 3.6431867123277393e-06, + "loss": 0.0274, + "step": 2104 + }, + { + "epoch": 0.6002281151981751, + "grad_norm": 2.0672737709470628, + "learning_rate": 3.6387419651329326e-06, + "loss": 0.0252, + "step": 2105 + }, + { + "epoch": 0.6005132591958939, + "grad_norm": 1.3500653562887743, + "learning_rate": 3.6342983794586974e-06, + "loss": 0.0152, + "step": 2106 + }, + { + "epoch": 0.6007984031936128, + "grad_norm": 1.7611157268739988, + "learning_rate": 3.6298559590966153e-06, + "loss": 0.0286, + "step": 2107 + }, + { + "epoch": 0.6010835471913316, + "grad_norm": 1.2728775668088093, + "learning_rate": 3.625414707837268e-06, + "loss": 0.0179, + "step": 2108 + }, + { + "epoch": 0.6013686911890505, + "grad_norm": 0.9960972880229151, + "learning_rate": 3.6209746294702442e-06, + "loss": 0.0232, + "step": 2109 + }, + { + "epoch": 0.6016538351867693, + "grad_norm": 0.6157173200508199, + "learning_rate": 3.6165357277841294e-06, + "loss": 0.0134, + "step": 2110 + }, + { + "epoch": 0.6019389791844881, + "grad_norm": 0.8400067311463636, + "learning_rate": 3.6120980065665023e-06, + "loss": 0.0112, + "step": 2111 + }, + { + "epoch": 0.602224123182207, + "grad_norm": 1.2339712983273081, + "learning_rate": 3.607661469603937e-06, + "loss": 0.0239, + "step": 2112 + }, + { + "epoch": 0.6025092671799258, + "grad_norm": 1.5580285145203188, + "learning_rate": 3.6032261206819995e-06, + "loss": 0.0412, + "step": 2113 + }, + { + "epoch": 0.6027944111776448, + "grad_norm": 1.1722662736059852, + "learning_rate": 3.598791963585239e-06, + "loss": 0.0152, + "step": 2114 + }, + { + "epoch": 0.6030795551753636, + "grad_norm": 1.2912381233776336, + "learning_rate": 3.5943590020971873e-06, + "loss": 0.0272, + "step": 2115 + }, + { + "epoch": 0.6033646991730824, + "grad_norm": 2.1137858949289448, + "learning_rate": 3.5899272400003603e-06, + "loss": 0.0288, + "step": 2116 + }, + { + "epoch": 0.6036498431708013, + "grad_norm": 0.40725160937380805, + "learning_rate": 3.585496681076244e-06, + "loss": 0.005, + "step": 2117 + }, + { + "epoch": 0.6039349871685201, + "grad_norm": 1.069151927238274, + "learning_rate": 3.581067329105301e-06, + "loss": 0.0286, + "step": 2118 + }, + { + "epoch": 0.604220131166239, + "grad_norm": 1.6223267876797507, + "learning_rate": 3.5766391878669676e-06, + "loss": 0.028, + "step": 2119 + }, + { + "epoch": 0.6045052751639578, + "grad_norm": 0.9274658583181319, + "learning_rate": 3.5722122611396416e-06, + "loss": 0.0124, + "step": 2120 + }, + { + "epoch": 0.6047904191616766, + "grad_norm": 1.280180345583819, + "learning_rate": 3.5677865527006876e-06, + "loss": 0.0377, + "step": 2121 + }, + { + "epoch": 0.6050755631593955, + "grad_norm": 1.4904942621790036, + "learning_rate": 3.563362066326427e-06, + "loss": 0.0263, + "step": 2122 + }, + { + "epoch": 0.6053607071571143, + "grad_norm": 1.2524362229042842, + "learning_rate": 3.5589388057921435e-06, + "loss": 0.0267, + "step": 2123 + }, + { + "epoch": 0.6056458511548332, + "grad_norm": 4.061187413865505, + "learning_rate": 3.5545167748720705e-06, + "loss": 0.0713, + "step": 2124 + }, + { + "epoch": 0.605930995152552, + "grad_norm": 0.5362589492288503, + "learning_rate": 3.550095977339396e-06, + "loss": 0.0094, + "step": 2125 + }, + { + "epoch": 0.6062161391502708, + "grad_norm": 0.8432265034751789, + "learning_rate": 3.545676416966254e-06, + "loss": 0.0184, + "step": 2126 + }, + { + "epoch": 0.6065012831479898, + "grad_norm": 1.3167325005559367, + "learning_rate": 3.541258097523722e-06, + "loss": 0.0289, + "step": 2127 + }, + { + "epoch": 0.6067864271457086, + "grad_norm": 1.0810373994278544, + "learning_rate": 3.536841022781816e-06, + "loss": 0.019, + "step": 2128 + }, + { + "epoch": 0.6070715711434275, + "grad_norm": 1.2837062613842711, + "learning_rate": 3.532425196509498e-06, + "loss": 0.0268, + "step": 2129 + }, + { + "epoch": 0.6073567151411463, + "grad_norm": 1.1490746860147354, + "learning_rate": 3.5280106224746575e-06, + "loss": 0.0251, + "step": 2130 + }, + { + "epoch": 0.6076418591388651, + "grad_norm": 1.0489665093288734, + "learning_rate": 3.5235973044441163e-06, + "loss": 0.018, + "step": 2131 + }, + { + "epoch": 0.607927003136584, + "grad_norm": 1.726137132954225, + "learning_rate": 3.5191852461836306e-06, + "loss": 0.0241, + "step": 2132 + }, + { + "epoch": 0.6082121471343028, + "grad_norm": 0.7090919389506171, + "learning_rate": 3.514774451457873e-06, + "loss": 0.0186, + "step": 2133 + }, + { + "epoch": 0.6084972911320217, + "grad_norm": 0.4590722376042221, + "learning_rate": 3.510364924030443e-06, + "loss": 0.0097, + "step": 2134 + }, + { + "epoch": 0.6087824351297405, + "grad_norm": 1.1018085553098087, + "learning_rate": 3.505956667663859e-06, + "loss": 0.015, + "step": 2135 + }, + { + "epoch": 0.6090675791274593, + "grad_norm": 0.6254529574769264, + "learning_rate": 3.5015496861195526e-06, + "loss": 0.0135, + "step": 2136 + }, + { + "epoch": 0.6093527231251782, + "grad_norm": 1.6428276895681884, + "learning_rate": 3.497143983157868e-06, + "loss": 0.0266, + "step": 2137 + }, + { + "epoch": 0.609637867122897, + "grad_norm": 1.1906175686284322, + "learning_rate": 3.4927395625380626e-06, + "loss": 0.0403, + "step": 2138 + }, + { + "epoch": 0.6099230111206159, + "grad_norm": 0.7327641862389733, + "learning_rate": 3.488336428018293e-06, + "loss": 0.0166, + "step": 2139 + }, + { + "epoch": 0.6102081551183347, + "grad_norm": 1.8597660390527846, + "learning_rate": 3.4839345833556217e-06, + "loss": 0.0324, + "step": 2140 + }, + { + "epoch": 0.6104932991160537, + "grad_norm": 3.452549046142695, + "learning_rate": 3.479534032306011e-06, + "loss": 0.0538, + "step": 2141 + }, + { + "epoch": 0.6107784431137725, + "grad_norm": 1.2114221044219349, + "learning_rate": 3.4751347786243193e-06, + "loss": 0.0152, + "step": 2142 + }, + { + "epoch": 0.6110635871114913, + "grad_norm": 1.2721385499066669, + "learning_rate": 3.470736826064299e-06, + "loss": 0.0175, + "step": 2143 + }, + { + "epoch": 0.6113487311092102, + "grad_norm": 1.6293019383768632, + "learning_rate": 3.4663401783785865e-06, + "loss": 0.0483, + "step": 2144 + }, + { + "epoch": 0.611633875106929, + "grad_norm": 1.445639177748986, + "learning_rate": 3.4619448393187126e-06, + "loss": 0.0155, + "step": 2145 + }, + { + "epoch": 0.6119190191046479, + "grad_norm": 1.0315630583680009, + "learning_rate": 3.4575508126350875e-06, + "loss": 0.0126, + "step": 2146 + }, + { + "epoch": 0.6122041631023667, + "grad_norm": 0.7398009312525474, + "learning_rate": 3.453158102077001e-06, + "loss": 0.0229, + "step": 2147 + }, + { + "epoch": 0.6124893071000855, + "grad_norm": 0.3557371653456857, + "learning_rate": 3.4487667113926226e-06, + "loss": 0.0115, + "step": 2148 + }, + { + "epoch": 0.6127744510978044, + "grad_norm": 1.2762208256712575, + "learning_rate": 3.4443766443289948e-06, + "loss": 0.0246, + "step": 2149 + }, + { + "epoch": 0.6130595950955232, + "grad_norm": 1.3646703557027133, + "learning_rate": 3.439987904632026e-06, + "loss": 0.0311, + "step": 2150 + }, + { + "epoch": 0.613344739093242, + "grad_norm": 1.2229542617252571, + "learning_rate": 3.4356004960464994e-06, + "loss": 0.0227, + "step": 2151 + }, + { + "epoch": 0.6136298830909609, + "grad_norm": 2.3224264173489786, + "learning_rate": 3.431214422316057e-06, + "loss": 0.0362, + "step": 2152 + }, + { + "epoch": 0.6139150270886797, + "grad_norm": 0.709410518708107, + "learning_rate": 3.426829687183204e-06, + "loss": 0.0114, + "step": 2153 + }, + { + "epoch": 0.6142001710863987, + "grad_norm": 1.0413949214912819, + "learning_rate": 3.4224462943893057e-06, + "loss": 0.0236, + "step": 2154 + }, + { + "epoch": 0.6144853150841175, + "grad_norm": 0.7296747342861363, + "learning_rate": 3.418064247674576e-06, + "loss": 0.0112, + "step": 2155 + }, + { + "epoch": 0.6147704590818364, + "grad_norm": 1.2435332019082928, + "learning_rate": 3.413683550778084e-06, + "loss": 0.0158, + "step": 2156 + }, + { + "epoch": 0.6150556030795552, + "grad_norm": 1.0875315186112393, + "learning_rate": 3.409304207437749e-06, + "loss": 0.0194, + "step": 2157 + }, + { + "epoch": 0.615340747077274, + "grad_norm": 1.5282693808083203, + "learning_rate": 3.404926221390332e-06, + "loss": 0.0218, + "step": 2158 + }, + { + "epoch": 0.6156258910749929, + "grad_norm": 0.474632274589884, + "learning_rate": 3.400549596371435e-06, + "loss": 0.0117, + "step": 2159 + }, + { + "epoch": 0.6159110350727117, + "grad_norm": 0.7581340345773641, + "learning_rate": 3.3961743361155055e-06, + "loss": 0.0197, + "step": 2160 + }, + { + "epoch": 0.6161961790704306, + "grad_norm": 1.3727372919022707, + "learning_rate": 3.3918004443558163e-06, + "loss": 0.0563, + "step": 2161 + }, + { + "epoch": 0.6164813230681494, + "grad_norm": 2.2648471754423496, + "learning_rate": 3.3874279248244803e-06, + "loss": 0.0327, + "step": 2162 + }, + { + "epoch": 0.6167664670658682, + "grad_norm": 0.7505272987405108, + "learning_rate": 3.383056781252435e-06, + "loss": 0.013, + "step": 2163 + }, + { + "epoch": 0.6170516110635871, + "grad_norm": 1.1079848809501045, + "learning_rate": 3.3786870173694497e-06, + "loss": 0.0127, + "step": 2164 + }, + { + "epoch": 0.6173367550613059, + "grad_norm": 2.0002129754495295, + "learning_rate": 3.37431863690411e-06, + "loss": 0.0347, + "step": 2165 + }, + { + "epoch": 0.6176218990590248, + "grad_norm": 1.5327965290014722, + "learning_rate": 3.369951643583823e-06, + "loss": 0.0307, + "step": 2166 + }, + { + "epoch": 0.6179070430567437, + "grad_norm": 2.027574254434053, + "learning_rate": 3.365586041134815e-06, + "loss": 0.0292, + "step": 2167 + }, + { + "epoch": 0.6181921870544625, + "grad_norm": 1.1355125852210322, + "learning_rate": 3.361221833282122e-06, + "loss": 0.0182, + "step": 2168 + }, + { + "epoch": 0.6184773310521814, + "grad_norm": 1.0205244715021804, + "learning_rate": 3.3568590237495912e-06, + "loss": 0.0339, + "step": 2169 + }, + { + "epoch": 0.6187624750499002, + "grad_norm": 0.9286220091766371, + "learning_rate": 3.3524976162598777e-06, + "loss": 0.0223, + "step": 2170 + }, + { + "epoch": 0.6190476190476191, + "grad_norm": 0.6108615934396132, + "learning_rate": 3.34813761453444e-06, + "loss": 0.0103, + "step": 2171 + }, + { + "epoch": 0.6193327630453379, + "grad_norm": 1.2988343838308507, + "learning_rate": 3.343779022293536e-06, + "loss": 0.0301, + "step": 2172 + }, + { + "epoch": 0.6196179070430567, + "grad_norm": 0.3341719225290523, + "learning_rate": 3.3394218432562185e-06, + "loss": 0.0082, + "step": 2173 + }, + { + "epoch": 0.6199030510407756, + "grad_norm": 1.6943619739632687, + "learning_rate": 3.3350660811403425e-06, + "loss": 0.0294, + "step": 2174 + }, + { + "epoch": 0.6201881950384944, + "grad_norm": 1.1528385894622213, + "learning_rate": 3.330711739662545e-06, + "loss": 0.0187, + "step": 2175 + }, + { + "epoch": 0.6204733390362133, + "grad_norm": 1.4611973629167543, + "learning_rate": 3.326358822538258e-06, + "loss": 0.0402, + "step": 2176 + }, + { + "epoch": 0.6207584830339321, + "grad_norm": 1.7876364426505764, + "learning_rate": 3.3220073334816928e-06, + "loss": 0.0326, + "step": 2177 + }, + { + "epoch": 0.621043627031651, + "grad_norm": 0.7956538149927447, + "learning_rate": 3.3176572762058434e-06, + "loss": 0.0146, + "step": 2178 + }, + { + "epoch": 0.6213287710293698, + "grad_norm": 1.3226414882028097, + "learning_rate": 3.313308654422484e-06, + "loss": 0.0285, + "step": 2179 + }, + { + "epoch": 0.6216139150270886, + "grad_norm": 2.1614175148958616, + "learning_rate": 3.3089614718421635e-06, + "loss": 0.0377, + "step": 2180 + }, + { + "epoch": 0.6218990590248076, + "grad_norm": 0.9471207974194648, + "learning_rate": 3.304615732174201e-06, + "loss": 0.0145, + "step": 2181 + }, + { + "epoch": 0.6221842030225264, + "grad_norm": 1.3292435098511863, + "learning_rate": 3.300271439126689e-06, + "loss": 0.0234, + "step": 2182 + }, + { + "epoch": 0.6224693470202453, + "grad_norm": 0.706611462258107, + "learning_rate": 3.2959285964064776e-06, + "loss": 0.0147, + "step": 2183 + }, + { + "epoch": 0.6227544910179641, + "grad_norm": 3.000133668041504, + "learning_rate": 3.291587207719186e-06, + "loss": 0.0514, + "step": 2184 + }, + { + "epoch": 0.6230396350156829, + "grad_norm": 2.2778332807998405, + "learning_rate": 3.2872472767691894e-06, + "loss": 0.0488, + "step": 2185 + }, + { + "epoch": 0.6233247790134018, + "grad_norm": 1.1901143222547017, + "learning_rate": 3.2829088072596226e-06, + "loss": 0.0263, + "step": 2186 + }, + { + "epoch": 0.6236099230111206, + "grad_norm": 0.8397072051957538, + "learning_rate": 3.2785718028923715e-06, + "loss": 0.0264, + "step": 2187 + }, + { + "epoch": 0.6238950670088395, + "grad_norm": 0.38604583860305863, + "learning_rate": 3.2742362673680687e-06, + "loss": 0.0074, + "step": 2188 + }, + { + "epoch": 0.6241802110065583, + "grad_norm": 1.947982342419954, + "learning_rate": 3.2699022043860973e-06, + "loss": 0.0267, + "step": 2189 + }, + { + "epoch": 0.6244653550042771, + "grad_norm": 1.1626686035488216, + "learning_rate": 3.2655696176445852e-06, + "loss": 0.0229, + "step": 2190 + }, + { + "epoch": 0.624750499001996, + "grad_norm": 1.6043946457619445, + "learning_rate": 3.2612385108403955e-06, + "loss": 0.0382, + "step": 2191 + }, + { + "epoch": 0.6250356429997148, + "grad_norm": 1.8055044074507538, + "learning_rate": 3.256908887669134e-06, + "loss": 0.0276, + "step": 2192 + }, + { + "epoch": 0.6253207869974337, + "grad_norm": 1.6754046006759364, + "learning_rate": 3.2525807518251386e-06, + "loss": 0.0408, + "step": 2193 + }, + { + "epoch": 0.6256059309951526, + "grad_norm": 0.9463034307019634, + "learning_rate": 3.248254107001474e-06, + "loss": 0.0175, + "step": 2194 + }, + { + "epoch": 0.6258910749928714, + "grad_norm": 2.175470555795851, + "learning_rate": 3.243928956889938e-06, + "loss": 0.0278, + "step": 2195 + }, + { + "epoch": 0.6261762189905903, + "grad_norm": 0.6729073385440583, + "learning_rate": 3.2396053051810515e-06, + "loss": 0.0285, + "step": 2196 + }, + { + "epoch": 0.6264613629883091, + "grad_norm": 1.571784137465903, + "learning_rate": 3.2352831555640563e-06, + "loss": 0.0309, + "step": 2197 + }, + { + "epoch": 0.626746506986028, + "grad_norm": 1.1661769802553186, + "learning_rate": 3.230962511726915e-06, + "loss": 0.025, + "step": 2198 + }, + { + "epoch": 0.6270316509837468, + "grad_norm": 0.42123213663257564, + "learning_rate": 3.2266433773563e-06, + "loss": 0.0115, + "step": 2199 + }, + { + "epoch": 0.6273167949814656, + "grad_norm": 1.8338109807568292, + "learning_rate": 3.222325756137599e-06, + "loss": 0.0367, + "step": 2200 + }, + { + "epoch": 0.6276019389791845, + "grad_norm": 1.1277887251821224, + "learning_rate": 3.21800965175491e-06, + "loss": 0.0311, + "step": 2201 + }, + { + "epoch": 0.6278870829769033, + "grad_norm": 1.201306830776855, + "learning_rate": 3.213695067891034e-06, + "loss": 0.022, + "step": 2202 + }, + { + "epoch": 0.6281722269746222, + "grad_norm": 1.2570922828383633, + "learning_rate": 3.2093820082274763e-06, + "loss": 0.0193, + "step": 2203 + }, + { + "epoch": 0.628457370972341, + "grad_norm": 1.5186675511953927, + "learning_rate": 3.2050704764444433e-06, + "loss": 0.0441, + "step": 2204 + }, + { + "epoch": 0.6287425149700598, + "grad_norm": 0.6524174178550722, + "learning_rate": 3.200760476220831e-06, + "loss": 0.0268, + "step": 2205 + }, + { + "epoch": 0.6290276589677787, + "grad_norm": 0.8724476437379903, + "learning_rate": 3.1964520112342363e-06, + "loss": 0.0223, + "step": 2206 + }, + { + "epoch": 0.6293128029654976, + "grad_norm": 1.4186863998600603, + "learning_rate": 3.1921450851609404e-06, + "loss": 0.0392, + "step": 2207 + }, + { + "epoch": 0.6295979469632165, + "grad_norm": 1.336564669853662, + "learning_rate": 3.187839701675917e-06, + "loss": 0.0222, + "step": 2208 + }, + { + "epoch": 0.6298830909609353, + "grad_norm": 1.6468050869178028, + "learning_rate": 3.18353586445282e-06, + "loss": 0.037, + "step": 2209 + }, + { + "epoch": 0.6301682349586541, + "grad_norm": 1.841124429536788, + "learning_rate": 3.1792335771639827e-06, + "loss": 0.0394, + "step": 2210 + }, + { + "epoch": 0.630453378956373, + "grad_norm": 1.5844988548472232, + "learning_rate": 3.174932843480416e-06, + "loss": 0.0397, + "step": 2211 + }, + { + "epoch": 0.6307385229540918, + "grad_norm": 2.0069101566372085, + "learning_rate": 3.1706336670718106e-06, + "loss": 0.0501, + "step": 2212 + }, + { + "epoch": 0.6310236669518107, + "grad_norm": 0.9427410695473213, + "learning_rate": 3.166336051606521e-06, + "loss": 0.0231, + "step": 2213 + }, + { + "epoch": 0.6313088109495295, + "grad_norm": 0.9212663531499328, + "learning_rate": 3.1620400007515772e-06, + "loss": 0.0114, + "step": 2214 + }, + { + "epoch": 0.6315939549472483, + "grad_norm": 0.8808245598066381, + "learning_rate": 3.157745518172669e-06, + "loss": 0.0078, + "step": 2215 + }, + { + "epoch": 0.6318790989449672, + "grad_norm": 1.0702460227921857, + "learning_rate": 3.153452607534147e-06, + "loss": 0.02, + "step": 2216 + }, + { + "epoch": 0.632164242942686, + "grad_norm": 1.2534625169103417, + "learning_rate": 3.149161272499024e-06, + "loss": 0.0374, + "step": 2217 + }, + { + "epoch": 0.6324493869404049, + "grad_norm": 1.5261888629053173, + "learning_rate": 3.1448715167289677e-06, + "loss": 0.0435, + "step": 2218 + }, + { + "epoch": 0.6327345309381237, + "grad_norm": 1.2002510268158786, + "learning_rate": 3.140583343884298e-06, + "loss": 0.0183, + "step": 2219 + }, + { + "epoch": 0.6330196749358425, + "grad_norm": 1.0985692409600272, + "learning_rate": 3.1362967576239854e-06, + "loss": 0.0262, + "step": 2220 + }, + { + "epoch": 0.6333048189335615, + "grad_norm": 1.2072207242963418, + "learning_rate": 3.1320117616056413e-06, + "loss": 0.0314, + "step": 2221 + }, + { + "epoch": 0.6335899629312803, + "grad_norm": 0.467015398806509, + "learning_rate": 3.1277283594855267e-06, + "loss": 0.0106, + "step": 2222 + }, + { + "epoch": 0.6338751069289992, + "grad_norm": 0.8807024802610426, + "learning_rate": 3.123446554918538e-06, + "loss": 0.011, + "step": 2223 + }, + { + "epoch": 0.634160250926718, + "grad_norm": 0.6140299365612738, + "learning_rate": 3.1191663515582127e-06, + "loss": 0.0248, + "step": 2224 + }, + { + "epoch": 0.6344453949244369, + "grad_norm": 1.0559654324827494, + "learning_rate": 3.1148877530567177e-06, + "loss": 0.0183, + "step": 2225 + }, + { + "epoch": 0.6347305389221557, + "grad_norm": 1.0540351749712773, + "learning_rate": 3.1106107630648574e-06, + "loss": 0.0261, + "step": 2226 + }, + { + "epoch": 0.6350156829198745, + "grad_norm": 0.9313773695645087, + "learning_rate": 3.106335385232051e-06, + "loss": 0.0292, + "step": 2227 + }, + { + "epoch": 0.6353008269175934, + "grad_norm": 0.3747808238118342, + "learning_rate": 3.102061623206355e-06, + "loss": 0.0094, + "step": 2228 + }, + { + "epoch": 0.6355859709153122, + "grad_norm": 2.3312916926506504, + "learning_rate": 3.0977894806344406e-06, + "loss": 0.0527, + "step": 2229 + }, + { + "epoch": 0.635871114913031, + "grad_norm": 0.6324565323611943, + "learning_rate": 3.093518961161599e-06, + "loss": 0.0095, + "step": 2230 + }, + { + "epoch": 0.6361562589107499, + "grad_norm": 1.590518034839, + "learning_rate": 3.0892500684317386e-06, + "loss": 0.0244, + "step": 2231 + }, + { + "epoch": 0.6364414029084687, + "grad_norm": 0.7308543148733039, + "learning_rate": 3.084982806087372e-06, + "loss": 0.024, + "step": 2232 + }, + { + "epoch": 0.6367265469061876, + "grad_norm": 1.248369277983139, + "learning_rate": 3.080717177769629e-06, + "loss": 0.0231, + "step": 2233 + }, + { + "epoch": 0.6370116909039065, + "grad_norm": 0.9686441748388779, + "learning_rate": 3.0764531871182422e-06, + "loss": 0.016, + "step": 2234 + }, + { + "epoch": 0.6372968349016254, + "grad_norm": 1.0082773282212865, + "learning_rate": 3.072190837771546e-06, + "loss": 0.0175, + "step": 2235 + }, + { + "epoch": 0.6375819788993442, + "grad_norm": 1.4311529725243861, + "learning_rate": 3.067930133366476e-06, + "loss": 0.0319, + "step": 2236 + }, + { + "epoch": 0.637867122897063, + "grad_norm": 0.8406777760925859, + "learning_rate": 3.0636710775385635e-06, + "loss": 0.0108, + "step": 2237 + }, + { + "epoch": 0.6381522668947819, + "grad_norm": 0.878282487364186, + "learning_rate": 3.059413673921931e-06, + "loss": 0.0193, + "step": 2238 + }, + { + "epoch": 0.6384374108925007, + "grad_norm": 0.9502594240301524, + "learning_rate": 3.055157926149293e-06, + "loss": 0.0203, + "step": 2239 + }, + { + "epoch": 0.6387225548902196, + "grad_norm": 1.308186958668311, + "learning_rate": 3.050903837851953e-06, + "loss": 0.023, + "step": 2240 + }, + { + "epoch": 0.6390076988879384, + "grad_norm": 0.44428606044631475, + "learning_rate": 3.0466514126597945e-06, + "loss": 0.014, + "step": 2241 + }, + { + "epoch": 0.6392928428856572, + "grad_norm": 1.209699851720728, + "learning_rate": 3.0424006542012897e-06, + "loss": 0.0103, + "step": 2242 + }, + { + "epoch": 0.6395779868833761, + "grad_norm": 0.9514393878467423, + "learning_rate": 3.038151566103475e-06, + "loss": 0.0147, + "step": 2243 + }, + { + "epoch": 0.6398631308810949, + "grad_norm": 1.9454407250073225, + "learning_rate": 3.0339041519919745e-06, + "loss": 0.042, + "step": 2244 + }, + { + "epoch": 0.6401482748788138, + "grad_norm": 0.728850597077105, + "learning_rate": 3.029658415490977e-06, + "loss": 0.0124, + "step": 2245 + }, + { + "epoch": 0.6404334188765326, + "grad_norm": 1.1288659293589929, + "learning_rate": 3.0254143602232434e-06, + "loss": 0.0197, + "step": 2246 + }, + { + "epoch": 0.6407185628742516, + "grad_norm": 1.1475249422099956, + "learning_rate": 3.021171989810099e-06, + "loss": 0.0211, + "step": 2247 + }, + { + "epoch": 0.6410037068719704, + "grad_norm": 1.2132830355724133, + "learning_rate": 3.0169313078714296e-06, + "loss": 0.0293, + "step": 2248 + }, + { + "epoch": 0.6412888508696892, + "grad_norm": 1.9363856482086057, + "learning_rate": 3.0126923180256806e-06, + "loss": 0.0243, + "step": 2249 + }, + { + "epoch": 0.6415739948674081, + "grad_norm": 0.7703844388099039, + "learning_rate": 3.008455023889857e-06, + "loss": 0.0121, + "step": 2250 + }, + { + "epoch": 0.6418591388651269, + "grad_norm": 1.3134506657893854, + "learning_rate": 3.0042194290795123e-06, + "loss": 0.0318, + "step": 2251 + }, + { + "epoch": 0.6421442828628457, + "grad_norm": 1.4692545047359329, + "learning_rate": 2.999985537208755e-06, + "loss": 0.0176, + "step": 2252 + }, + { + "epoch": 0.6424294268605646, + "grad_norm": 0.9828572979175608, + "learning_rate": 2.9957533518902376e-06, + "loss": 0.0196, + "step": 2253 + }, + { + "epoch": 0.6427145708582834, + "grad_norm": 0.7049658553143684, + "learning_rate": 2.991522876735154e-06, + "loss": 0.0229, + "step": 2254 + }, + { + "epoch": 0.6429997148560023, + "grad_norm": 0.842198069759342, + "learning_rate": 2.987294115353242e-06, + "loss": 0.0128, + "step": 2255 + }, + { + "epoch": 0.6432848588537211, + "grad_norm": 1.7227804347359337, + "learning_rate": 2.9830670713527786e-06, + "loss": 0.0335, + "step": 2256 + }, + { + "epoch": 0.64357000285144, + "grad_norm": 0.3681229418460075, + "learning_rate": 2.9788417483405716e-06, + "loss": 0.0083, + "step": 2257 + }, + { + "epoch": 0.6438551468491588, + "grad_norm": 1.1818461840946088, + "learning_rate": 2.9746181499219627e-06, + "loss": 0.0154, + "step": 2258 + }, + { + "epoch": 0.6441402908468776, + "grad_norm": 1.411740982709478, + "learning_rate": 2.970396279700824e-06, + "loss": 0.0184, + "step": 2259 + }, + { + "epoch": 0.6444254348445965, + "grad_norm": 2.5844309546840716, + "learning_rate": 2.9661761412795465e-06, + "loss": 0.0378, + "step": 2260 + }, + { + "epoch": 0.6447105788423154, + "grad_norm": 1.5612725743924631, + "learning_rate": 2.9619577382590485e-06, + "loss": 0.0203, + "step": 2261 + }, + { + "epoch": 0.6449957228400343, + "grad_norm": 1.244411728256585, + "learning_rate": 2.9577410742387686e-06, + "loss": 0.0179, + "step": 2262 + }, + { + "epoch": 0.6452808668377531, + "grad_norm": 1.920069290541119, + "learning_rate": 2.9535261528166577e-06, + "loss": 0.0264, + "step": 2263 + }, + { + "epoch": 0.6455660108354719, + "grad_norm": 0.9562289133021995, + "learning_rate": 2.949312977589181e-06, + "loss": 0.0177, + "step": 2264 + }, + { + "epoch": 0.6458511548331908, + "grad_norm": 2.534580020676373, + "learning_rate": 2.945101552151317e-06, + "loss": 0.0305, + "step": 2265 + }, + { + "epoch": 0.6461362988309096, + "grad_norm": 1.3341099226810462, + "learning_rate": 2.9408918800965464e-06, + "loss": 0.0158, + "step": 2266 + }, + { + "epoch": 0.6464214428286285, + "grad_norm": 1.9949800798700283, + "learning_rate": 2.936683965016855e-06, + "loss": 0.0403, + "step": 2267 + }, + { + "epoch": 0.6467065868263473, + "grad_norm": 0.9844526919586044, + "learning_rate": 2.9324778105027323e-06, + "loss": 0.0137, + "step": 2268 + }, + { + "epoch": 0.6469917308240661, + "grad_norm": 1.0381005934303287, + "learning_rate": 2.9282734201431627e-06, + "loss": 0.0285, + "step": 2269 + }, + { + "epoch": 0.647276874821785, + "grad_norm": 1.2860946237813613, + "learning_rate": 2.924070797525628e-06, + "loss": 0.025, + "step": 2270 + }, + { + "epoch": 0.6475620188195038, + "grad_norm": 1.3340813057644438, + "learning_rate": 2.919869946236096e-06, + "loss": 0.0278, + "step": 2271 + }, + { + "epoch": 0.6478471628172227, + "grad_norm": 1.6573747957672138, + "learning_rate": 2.9156708698590273e-06, + "loss": 0.0301, + "step": 2272 + }, + { + "epoch": 0.6481323068149415, + "grad_norm": 0.868842113331832, + "learning_rate": 2.9114735719773718e-06, + "loss": 0.0206, + "step": 2273 + }, + { + "epoch": 0.6484174508126604, + "grad_norm": 0.7243820586100654, + "learning_rate": 2.9072780561725543e-06, + "loss": 0.0167, + "step": 2274 + }, + { + "epoch": 0.6487025948103793, + "grad_norm": 1.185198168923413, + "learning_rate": 2.9030843260244834e-06, + "loss": 0.023, + "step": 2275 + }, + { + "epoch": 0.6489877388080981, + "grad_norm": 0.6293907385655024, + "learning_rate": 2.8988923851115425e-06, + "loss": 0.0099, + "step": 2276 + }, + { + "epoch": 0.649272882805817, + "grad_norm": 2.0445266570057714, + "learning_rate": 2.894702237010589e-06, + "loss": 0.0186, + "step": 2277 + }, + { + "epoch": 0.6495580268035358, + "grad_norm": 2.047384698677883, + "learning_rate": 2.8905138852969507e-06, + "loss": 0.0268, + "step": 2278 + }, + { + "epoch": 0.6498431708012546, + "grad_norm": 1.0653517095398133, + "learning_rate": 2.886327333544421e-06, + "loss": 0.0161, + "step": 2279 + }, + { + "epoch": 0.6501283147989735, + "grad_norm": 1.5006183561890554, + "learning_rate": 2.8821425853252603e-06, + "loss": 0.035, + "step": 2280 + }, + { + "epoch": 0.6504134587966923, + "grad_norm": 0.937416176228414, + "learning_rate": 2.8779596442101878e-06, + "loss": 0.0229, + "step": 2281 + }, + { + "epoch": 0.6506986027944112, + "grad_norm": 1.4517724372844274, + "learning_rate": 2.8737785137683815e-06, + "loss": 0.027, + "step": 2282 + }, + { + "epoch": 0.65098374679213, + "grad_norm": 1.7278224774601127, + "learning_rate": 2.8695991975674735e-06, + "loss": 0.0237, + "step": 2283 + }, + { + "epoch": 0.6512688907898488, + "grad_norm": 1.5785313959183065, + "learning_rate": 2.8654216991735504e-06, + "loss": 0.0296, + "step": 2284 + }, + { + "epoch": 0.6515540347875677, + "grad_norm": 1.0919831115191292, + "learning_rate": 2.861246022151143e-06, + "loss": 0.02, + "step": 2285 + }, + { + "epoch": 0.6518391787852865, + "grad_norm": 0.7275158367500263, + "learning_rate": 2.8570721700632354e-06, + "loss": 0.0116, + "step": 2286 + }, + { + "epoch": 0.6521243227830055, + "grad_norm": 1.3129785911100587, + "learning_rate": 2.852900146471249e-06, + "loss": 0.0154, + "step": 2287 + }, + { + "epoch": 0.6524094667807243, + "grad_norm": 1.6226425506929618, + "learning_rate": 2.848729954935042e-06, + "loss": 0.019, + "step": 2288 + }, + { + "epoch": 0.6526946107784432, + "grad_norm": 1.7305587988312543, + "learning_rate": 2.844561599012918e-06, + "loss": 0.0267, + "step": 2289 + }, + { + "epoch": 0.652979754776162, + "grad_norm": 0.8143957160470554, + "learning_rate": 2.8403950822616088e-06, + "loss": 0.0085, + "step": 2290 + }, + { + "epoch": 0.6532648987738808, + "grad_norm": 1.1122531151983972, + "learning_rate": 2.836230408236278e-06, + "loss": 0.0215, + "step": 2291 + }, + { + "epoch": 0.6535500427715997, + "grad_norm": 0.4083282299767834, + "learning_rate": 2.832067580490516e-06, + "loss": 0.0063, + "step": 2292 + }, + { + "epoch": 0.6538351867693185, + "grad_norm": 0.5640507716536349, + "learning_rate": 2.827906602576339e-06, + "loss": 0.0087, + "step": 2293 + }, + { + "epoch": 0.6541203307670374, + "grad_norm": 1.3067961879293044, + "learning_rate": 2.823747478044185e-06, + "loss": 0.0138, + "step": 2294 + }, + { + "epoch": 0.6544054747647562, + "grad_norm": 1.0110478159071878, + "learning_rate": 2.8195902104429084e-06, + "loss": 0.0298, + "step": 2295 + }, + { + "epoch": 0.654690618762475, + "grad_norm": 0.5904570142107487, + "learning_rate": 2.815434803319783e-06, + "loss": 0.0111, + "step": 2296 + }, + { + "epoch": 0.6549757627601939, + "grad_norm": 0.245921457034098, + "learning_rate": 2.8112812602204885e-06, + "loss": 0.0026, + "step": 2297 + }, + { + "epoch": 0.6552609067579127, + "grad_norm": 0.7085877553818561, + "learning_rate": 2.8071295846891256e-06, + "loss": 0.0093, + "step": 2298 + }, + { + "epoch": 0.6555460507556315, + "grad_norm": 1.6452998075190501, + "learning_rate": 2.802979780268188e-06, + "loss": 0.0244, + "step": 2299 + }, + { + "epoch": 0.6558311947533504, + "grad_norm": 0.6904886542892218, + "learning_rate": 2.7988318504985817e-06, + "loss": 0.0174, + "step": 2300 + }, + { + "epoch": 0.6561163387510693, + "grad_norm": 0.5850968737884185, + "learning_rate": 2.7946857989196076e-06, + "loss": 0.0069, + "step": 2301 + }, + { + "epoch": 0.6564014827487882, + "grad_norm": 1.8185293469917627, + "learning_rate": 2.7905416290689717e-06, + "loss": 0.0412, + "step": 2302 + }, + { + "epoch": 0.656686626746507, + "grad_norm": 0.8965190611007764, + "learning_rate": 2.7863993444827697e-06, + "loss": 0.0226, + "step": 2303 + }, + { + "epoch": 0.6569717707442259, + "grad_norm": 1.3182053225055796, + "learning_rate": 2.782258948695481e-06, + "loss": 0.0175, + "step": 2304 + }, + { + "epoch": 0.6572569147419447, + "grad_norm": 0.9339402915101415, + "learning_rate": 2.778120445239989e-06, + "loss": 0.0122, + "step": 2305 + }, + { + "epoch": 0.6575420587396635, + "grad_norm": 1.6156977930324048, + "learning_rate": 2.773983837647551e-06, + "loss": 0.0389, + "step": 2306 + }, + { + "epoch": 0.6578272027373824, + "grad_norm": 0.6651074350777881, + "learning_rate": 2.76984912944781e-06, + "loss": 0.0146, + "step": 2307 + }, + { + "epoch": 0.6581123467351012, + "grad_norm": 1.8452127520754087, + "learning_rate": 2.765716324168789e-06, + "loss": 0.0227, + "step": 2308 + }, + { + "epoch": 0.6583974907328201, + "grad_norm": 0.9652739173619617, + "learning_rate": 2.761585425336886e-06, + "loss": 0.0267, + "step": 2309 + }, + { + "epoch": 0.6586826347305389, + "grad_norm": 1.2330026270001035, + "learning_rate": 2.757456436476873e-06, + "loss": 0.023, + "step": 2310 + }, + { + "epoch": 0.6589677787282577, + "grad_norm": 0.9565278016472557, + "learning_rate": 2.7533293611118923e-06, + "loss": 0.0131, + "step": 2311 + }, + { + "epoch": 0.6592529227259766, + "grad_norm": 0.5029893671434901, + "learning_rate": 2.7492042027634525e-06, + "loss": 0.0074, + "step": 2312 + }, + { + "epoch": 0.6595380667236954, + "grad_norm": 1.6048936560011449, + "learning_rate": 2.7450809649514265e-06, + "loss": 0.048, + "step": 2313 + }, + { + "epoch": 0.6598232107214144, + "grad_norm": 2.4275543233286503, + "learning_rate": 2.740959651194054e-06, + "loss": 0.0222, + "step": 2314 + }, + { + "epoch": 0.6601083547191332, + "grad_norm": 2.8531321009807247, + "learning_rate": 2.7368402650079228e-06, + "loss": 0.0597, + "step": 2315 + }, + { + "epoch": 0.660393498716852, + "grad_norm": 1.4630738141417063, + "learning_rate": 2.7327228099079826e-06, + "loss": 0.0229, + "step": 2316 + }, + { + "epoch": 0.6606786427145709, + "grad_norm": 0.6779278399548759, + "learning_rate": 2.728607289407534e-06, + "loss": 0.0069, + "step": 2317 + }, + { + "epoch": 0.6609637867122897, + "grad_norm": 2.398616876487329, + "learning_rate": 2.7244937070182286e-06, + "loss": 0.0297, + "step": 2318 + }, + { + "epoch": 0.6612489307100086, + "grad_norm": 0.7855959677784145, + "learning_rate": 2.7203820662500625e-06, + "loss": 0.0356, + "step": 2319 + }, + { + "epoch": 0.6615340747077274, + "grad_norm": 0.24261892957888342, + "learning_rate": 2.716272370611375e-06, + "loss": 0.004, + "step": 2320 + }, + { + "epoch": 0.6618192187054462, + "grad_norm": 1.5723455114774483, + "learning_rate": 2.712164623608844e-06, + "loss": 0.0285, + "step": 2321 + }, + { + "epoch": 0.6621043627031651, + "grad_norm": 0.24075125669855854, + "learning_rate": 2.7080588287474885e-06, + "loss": 0.0038, + "step": 2322 + }, + { + "epoch": 0.6623895067008839, + "grad_norm": 0.6828692169124314, + "learning_rate": 2.7039549895306593e-06, + "loss": 0.0154, + "step": 2323 + }, + { + "epoch": 0.6626746506986028, + "grad_norm": 0.6957630191781368, + "learning_rate": 2.699853109460039e-06, + "loss": 0.0097, + "step": 2324 + }, + { + "epoch": 0.6629597946963216, + "grad_norm": 2.9059174955559723, + "learning_rate": 2.695753192035639e-06, + "loss": 0.0604, + "step": 2325 + }, + { + "epoch": 0.6632449386940404, + "grad_norm": 1.1734601436082144, + "learning_rate": 2.691655240755795e-06, + "loss": 0.0181, + "step": 2326 + }, + { + "epoch": 0.6635300826917594, + "grad_norm": 1.3146469546781792, + "learning_rate": 2.6875592591171663e-06, + "loss": 0.0249, + "step": 2327 + }, + { + "epoch": 0.6638152266894782, + "grad_norm": 0.6224899882604887, + "learning_rate": 2.6834652506147297e-06, + "loss": 0.0094, + "step": 2328 + }, + { + "epoch": 0.6641003706871971, + "grad_norm": 0.8606657609714811, + "learning_rate": 2.67937321874178e-06, + "loss": 0.0117, + "step": 2329 + }, + { + "epoch": 0.6643855146849159, + "grad_norm": 1.237561338520917, + "learning_rate": 2.675283166989926e-06, + "loss": 0.0251, + "step": 2330 + }, + { + "epoch": 0.6646706586826348, + "grad_norm": 1.3971850707966957, + "learning_rate": 2.671195098849089e-06, + "loss": 0.0557, + "step": 2331 + }, + { + "epoch": 0.6649558026803536, + "grad_norm": 0.8052143234506958, + "learning_rate": 2.6671090178074878e-06, + "loss": 0.0147, + "step": 2332 + }, + { + "epoch": 0.6652409466780724, + "grad_norm": 1.6731504755642395, + "learning_rate": 2.663024927351655e-06, + "loss": 0.0187, + "step": 2333 + }, + { + "epoch": 0.6655260906757913, + "grad_norm": 3.2684569195727837, + "learning_rate": 2.658942830966425e-06, + "loss": 0.0474, + "step": 2334 + }, + { + "epoch": 0.6658112346735101, + "grad_norm": 1.6686161366860728, + "learning_rate": 2.654862732134926e-06, + "loss": 0.0957, + "step": 2335 + }, + { + "epoch": 0.666096378671229, + "grad_norm": 1.8905217359113058, + "learning_rate": 2.6507846343385862e-06, + "loss": 0.0332, + "step": 2336 + }, + { + "epoch": 0.6663815226689478, + "grad_norm": 0.8396385938218617, + "learning_rate": 2.6467085410571175e-06, + "loss": 0.0125, + "step": 2337 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.6943892112720279, + "learning_rate": 2.6426344557685342e-06, + "loss": 0.0151, + "step": 2338 + }, + { + "epoch": 0.6669518106643855, + "grad_norm": 0.8584969710658708, + "learning_rate": 2.6385623819491278e-06, + "loss": 0.0174, + "step": 2339 + }, + { + "epoch": 0.6672369546621043, + "grad_norm": 1.1323015265299097, + "learning_rate": 2.6344923230734786e-06, + "loss": 0.0344, + "step": 2340 + }, + { + "epoch": 0.6675220986598233, + "grad_norm": 1.2498573670424775, + "learning_rate": 2.630424282614441e-06, + "loss": 0.0258, + "step": 2341 + }, + { + "epoch": 0.6678072426575421, + "grad_norm": 1.6270286307278543, + "learning_rate": 2.6263582640431595e-06, + "loss": 0.0707, + "step": 2342 + }, + { + "epoch": 0.6680923866552609, + "grad_norm": 0.9402570992440751, + "learning_rate": 2.622294270829039e-06, + "loss": 0.0289, + "step": 2343 + }, + { + "epoch": 0.6683775306529798, + "grad_norm": 1.1740285977390892, + "learning_rate": 2.6182323064397645e-06, + "loss": 0.0236, + "step": 2344 + }, + { + "epoch": 0.6686626746506986, + "grad_norm": 2.7212202569279995, + "learning_rate": 2.614172374341287e-06, + "loss": 0.0568, + "step": 2345 + }, + { + "epoch": 0.6689478186484175, + "grad_norm": 0.6205323905446688, + "learning_rate": 2.610114477997827e-06, + "loss": 0.0109, + "step": 2346 + }, + { + "epoch": 0.6692329626461363, + "grad_norm": 2.0357402784016387, + "learning_rate": 2.6060586208718673e-06, + "loss": 0.0345, + "step": 2347 + }, + { + "epoch": 0.6695181066438551, + "grad_norm": 1.3978184198875205, + "learning_rate": 2.602004806424144e-06, + "loss": 0.0201, + "step": 2348 + }, + { + "epoch": 0.669803250641574, + "grad_norm": 0.8378057178472931, + "learning_rate": 2.597953038113655e-06, + "loss": 0.013, + "step": 2349 + }, + { + "epoch": 0.6700883946392928, + "grad_norm": 1.8429505668991712, + "learning_rate": 2.5939033193976567e-06, + "loss": 0.034, + "step": 2350 + }, + { + "epoch": 0.6703735386370117, + "grad_norm": 0.6951733957823362, + "learning_rate": 2.58985565373165e-06, + "loss": 0.0149, + "step": 2351 + }, + { + "epoch": 0.6706586826347305, + "grad_norm": 1.254638572408408, + "learning_rate": 2.585810044569387e-06, + "loss": 0.0198, + "step": 2352 + }, + { + "epoch": 0.6709438266324493, + "grad_norm": 1.2282829118276561, + "learning_rate": 2.581766495362864e-06, + "loss": 0.0153, + "step": 2353 + }, + { + "epoch": 0.6712289706301683, + "grad_norm": 1.8682120897475893, + "learning_rate": 2.57772500956232e-06, + "loss": 0.0286, + "step": 2354 + }, + { + "epoch": 0.6715141146278871, + "grad_norm": 0.9128992862378721, + "learning_rate": 2.573685590616234e-06, + "loss": 0.0178, + "step": 2355 + }, + { + "epoch": 0.671799258625606, + "grad_norm": 1.556527433042271, + "learning_rate": 2.5696482419713197e-06, + "loss": 0.0441, + "step": 2356 + }, + { + "epoch": 0.6720844026233248, + "grad_norm": 0.2516116743394582, + "learning_rate": 2.565612967072525e-06, + "loss": 0.0058, + "step": 2357 + }, + { + "epoch": 0.6723695466210436, + "grad_norm": 1.191372773291596, + "learning_rate": 2.5615797693630328e-06, + "loss": 0.0368, + "step": 2358 + }, + { + "epoch": 0.6726546906187625, + "grad_norm": 1.091184520957333, + "learning_rate": 2.557548652284245e-06, + "loss": 0.0259, + "step": 2359 + }, + { + "epoch": 0.6729398346164813, + "grad_norm": 1.0730756075730936, + "learning_rate": 2.553519619275794e-06, + "loss": 0.0185, + "step": 2360 + }, + { + "epoch": 0.6732249786142002, + "grad_norm": 1.8119984458090785, + "learning_rate": 2.5494926737755306e-06, + "loss": 0.0379, + "step": 2361 + }, + { + "epoch": 0.673510122611919, + "grad_norm": 0.5991375226390366, + "learning_rate": 2.5454678192195305e-06, + "loss": 0.0154, + "step": 2362 + }, + { + "epoch": 0.6737952666096378, + "grad_norm": 0.8446181744229467, + "learning_rate": 2.5414450590420793e-06, + "loss": 0.016, + "step": 2363 + }, + { + "epoch": 0.6740804106073567, + "grad_norm": 1.2506916116716331, + "learning_rate": 2.5374243966756782e-06, + "loss": 0.0179, + "step": 2364 + }, + { + "epoch": 0.6743655546050755, + "grad_norm": 0.7844351247089717, + "learning_rate": 2.5334058355510337e-06, + "loss": 0.0142, + "step": 2365 + }, + { + "epoch": 0.6746506986027944, + "grad_norm": 1.2753595775969064, + "learning_rate": 2.529389379097067e-06, + "loss": 0.0584, + "step": 2366 + }, + { + "epoch": 0.6749358426005133, + "grad_norm": 2.130096709691072, + "learning_rate": 2.5253750307408996e-06, + "loss": 0.0433, + "step": 2367 + }, + { + "epoch": 0.6752209865982322, + "grad_norm": 2.4889057844910223, + "learning_rate": 2.5213627939078534e-06, + "loss": 0.0529, + "step": 2368 + }, + { + "epoch": 0.675506130595951, + "grad_norm": 2.0733191308785397, + "learning_rate": 2.517352672021449e-06, + "loss": 0.0735, + "step": 2369 + }, + { + "epoch": 0.6757912745936698, + "grad_norm": 0.8721844668658577, + "learning_rate": 2.5133446685034048e-06, + "loss": 0.02, + "step": 2370 + }, + { + "epoch": 0.6760764185913887, + "grad_norm": 0.5662408732865061, + "learning_rate": 2.5093387867736275e-06, + "loss": 0.0185, + "step": 2371 + }, + { + "epoch": 0.6763615625891075, + "grad_norm": 0.7771568621772995, + "learning_rate": 2.5053350302502167e-06, + "loss": 0.016, + "step": 2372 + }, + { + "epoch": 0.6766467065868264, + "grad_norm": 1.699224004876367, + "learning_rate": 2.5013334023494556e-06, + "loss": 0.0308, + "step": 2373 + }, + { + "epoch": 0.6769318505845452, + "grad_norm": 0.6987391064530609, + "learning_rate": 2.4973339064858163e-06, + "loss": 0.0159, + "step": 2374 + }, + { + "epoch": 0.677216994582264, + "grad_norm": 1.6646679512778157, + "learning_rate": 2.49333654607195e-06, + "loss": 0.0236, + "step": 2375 + }, + { + "epoch": 0.6775021385799829, + "grad_norm": 1.6455502915346387, + "learning_rate": 2.489341324518678e-06, + "loss": 0.0621, + "step": 2376 + }, + { + "epoch": 0.6777872825777017, + "grad_norm": 0.7215796317053166, + "learning_rate": 2.4853482452350048e-06, + "loss": 0.0194, + "step": 2377 + }, + { + "epoch": 0.6780724265754206, + "grad_norm": 0.9265039963157378, + "learning_rate": 2.4813573116281083e-06, + "loss": 0.0287, + "step": 2378 + }, + { + "epoch": 0.6783575705731394, + "grad_norm": 1.386647458592848, + "learning_rate": 2.477368527103329e-06, + "loss": 0.0276, + "step": 2379 + }, + { + "epoch": 0.6786427145708582, + "grad_norm": 1.0194721860507407, + "learning_rate": 2.473381895064181e-06, + "loss": 0.0569, + "step": 2380 + }, + { + "epoch": 0.6789278585685772, + "grad_norm": 1.5781901137615435, + "learning_rate": 2.46939741891233e-06, + "loss": 0.0219, + "step": 2381 + }, + { + "epoch": 0.679213002566296, + "grad_norm": 0.9271661128511948, + "learning_rate": 2.4654151020476165e-06, + "loss": 0.0154, + "step": 2382 + }, + { + "epoch": 0.6794981465640149, + "grad_norm": 1.2929840875659686, + "learning_rate": 2.461434947868029e-06, + "loss": 0.0336, + "step": 2383 + }, + { + "epoch": 0.6797832905617337, + "grad_norm": 0.8356221756560887, + "learning_rate": 2.4574569597697145e-06, + "loss": 0.0202, + "step": 2384 + }, + { + "epoch": 0.6800684345594525, + "grad_norm": 0.664795804036026, + "learning_rate": 2.4534811411469704e-06, + "loss": 0.0104, + "step": 2385 + }, + { + "epoch": 0.6803535785571714, + "grad_norm": 1.9929916630870672, + "learning_rate": 2.4495074953922425e-06, + "loss": 0.0277, + "step": 2386 + }, + { + "epoch": 0.6806387225548902, + "grad_norm": 1.5262298229248654, + "learning_rate": 2.4455360258961247e-06, + "loss": 0.0325, + "step": 2387 + }, + { + "epoch": 0.6809238665526091, + "grad_norm": 0.7087902063036329, + "learning_rate": 2.4415667360473518e-06, + "loss": 0.0165, + "step": 2388 + }, + { + "epoch": 0.6812090105503279, + "grad_norm": 0.7561419087572873, + "learning_rate": 2.437599629232797e-06, + "loss": 0.011, + "step": 2389 + }, + { + "epoch": 0.6814941545480467, + "grad_norm": 1.045806215953938, + "learning_rate": 2.433634708837478e-06, + "loss": 0.0236, + "step": 2390 + }, + { + "epoch": 0.6817792985457656, + "grad_norm": 1.3375850382967915, + "learning_rate": 2.4296719782445422e-06, + "loss": 0.0322, + "step": 2391 + }, + { + "epoch": 0.6820644425434844, + "grad_norm": 1.442188021651849, + "learning_rate": 2.4257114408352646e-06, + "loss": 0.0382, + "step": 2392 + }, + { + "epoch": 0.6823495865412033, + "grad_norm": 1.4811177425124444, + "learning_rate": 2.421753099989052e-06, + "loss": 0.0399, + "step": 2393 + }, + { + "epoch": 0.6826347305389222, + "grad_norm": 1.2151409515623008, + "learning_rate": 2.4177969590834425e-06, + "loss": 0.0344, + "step": 2394 + }, + { + "epoch": 0.682919874536641, + "grad_norm": 2.7056727418479958, + "learning_rate": 2.4138430214940906e-06, + "loss": 0.0551, + "step": 2395 + }, + { + "epoch": 0.6832050185343599, + "grad_norm": 0.5969770040702388, + "learning_rate": 2.4098912905947712e-06, + "loss": 0.015, + "step": 2396 + }, + { + "epoch": 0.6834901625320787, + "grad_norm": 1.4726176220224632, + "learning_rate": 2.4059417697573782e-06, + "loss": 0.0191, + "step": 2397 + }, + { + "epoch": 0.6837753065297976, + "grad_norm": 0.6985703906149163, + "learning_rate": 2.4019944623519194e-06, + "loss": 0.0217, + "step": 2398 + }, + { + "epoch": 0.6840604505275164, + "grad_norm": 1.3556092377104045, + "learning_rate": 2.3980493717465124e-06, + "loss": 0.0314, + "step": 2399 + }, + { + "epoch": 0.6843455945252352, + "grad_norm": 1.6320428555492388, + "learning_rate": 2.394106501307386e-06, + "loss": 0.0406, + "step": 2400 + }, + { + "epoch": 0.6846307385229541, + "grad_norm": 0.3152755510356312, + "learning_rate": 2.390165854398872e-06, + "loss": 0.0076, + "step": 2401 + }, + { + "epoch": 0.6849158825206729, + "grad_norm": 0.5496493802305953, + "learning_rate": 2.386227434383407e-06, + "loss": 0.0095, + "step": 2402 + }, + { + "epoch": 0.6852010265183918, + "grad_norm": 0.6520718654651182, + "learning_rate": 2.3822912446215244e-06, + "loss": 0.0179, + "step": 2403 + }, + { + "epoch": 0.6854861705161106, + "grad_norm": 1.2473935565493588, + "learning_rate": 2.3783572884718592e-06, + "loss": 0.0319, + "step": 2404 + }, + { + "epoch": 0.6857713145138294, + "grad_norm": 1.0535903429708244, + "learning_rate": 2.3744255692911345e-06, + "loss": 0.015, + "step": 2405 + }, + { + "epoch": 0.6860564585115483, + "grad_norm": 1.153321160939757, + "learning_rate": 2.370496090434172e-06, + "loss": 0.0243, + "step": 2406 + }, + { + "epoch": 0.6863416025092672, + "grad_norm": 0.8326855476595857, + "learning_rate": 2.3665688552538767e-06, + "loss": 0.016, + "step": 2407 + }, + { + "epoch": 0.6866267465069861, + "grad_norm": 1.4628607016553867, + "learning_rate": 2.3626438671012412e-06, + "loss": 0.0411, + "step": 2408 + }, + { + "epoch": 0.6869118905047049, + "grad_norm": 0.5985342385584901, + "learning_rate": 2.358721129325336e-06, + "loss": 0.0153, + "step": 2409 + }, + { + "epoch": 0.6871970345024238, + "grad_norm": 0.8315001817874339, + "learning_rate": 2.354800645273319e-06, + "loss": 0.0314, + "step": 2410 + }, + { + "epoch": 0.6874821785001426, + "grad_norm": 0.6885528098745334, + "learning_rate": 2.3508824182904207e-06, + "loss": 0.0124, + "step": 2411 + }, + { + "epoch": 0.6877673224978614, + "grad_norm": 1.9141392122493273, + "learning_rate": 2.3469664517199463e-06, + "loss": 0.0354, + "step": 2412 + }, + { + "epoch": 0.6880524664955803, + "grad_norm": 1.1665319631123647, + "learning_rate": 2.3430527489032723e-06, + "loss": 0.0215, + "step": 2413 + }, + { + "epoch": 0.6883376104932991, + "grad_norm": 1.0810004667670476, + "learning_rate": 2.339141313179844e-06, + "loss": 0.0321, + "step": 2414 + }, + { + "epoch": 0.688622754491018, + "grad_norm": 0.8222383468643456, + "learning_rate": 2.3352321478871717e-06, + "loss": 0.023, + "step": 2415 + }, + { + "epoch": 0.6889078984887368, + "grad_norm": 1.1477329288258749, + "learning_rate": 2.331325256360828e-06, + "loss": 0.0147, + "step": 2416 + }, + { + "epoch": 0.6891930424864556, + "grad_norm": 0.4781898959973014, + "learning_rate": 2.327420641934447e-06, + "loss": 0.0059, + "step": 2417 + }, + { + "epoch": 0.6894781864841745, + "grad_norm": 1.1660744412330555, + "learning_rate": 2.323518307939717e-06, + "loss": 0.0294, + "step": 2418 + }, + { + "epoch": 0.6897633304818933, + "grad_norm": 0.5872242883588114, + "learning_rate": 2.3196182577063868e-06, + "loss": 0.0121, + "step": 2419 + }, + { + "epoch": 0.6900484744796122, + "grad_norm": 2.212762462397327, + "learning_rate": 2.315720494562248e-06, + "loss": 0.0324, + "step": 2420 + }, + { + "epoch": 0.6903336184773311, + "grad_norm": 1.6373711381327187, + "learning_rate": 2.3118250218331463e-06, + "loss": 0.0185, + "step": 2421 + }, + { + "epoch": 0.6906187624750499, + "grad_norm": 0.7770141753292534, + "learning_rate": 2.307931842842968e-06, + "loss": 0.0299, + "step": 2422 + }, + { + "epoch": 0.6909039064727688, + "grad_norm": 1.518201141107165, + "learning_rate": 2.3040409609136515e-06, + "loss": 0.0463, + "step": 2423 + }, + { + "epoch": 0.6911890504704876, + "grad_norm": 0.43241598666358966, + "learning_rate": 2.3001523793651688e-06, + "loss": 0.0124, + "step": 2424 + }, + { + "epoch": 0.6914741944682065, + "grad_norm": 1.0771130938316928, + "learning_rate": 2.2962661015155234e-06, + "loss": 0.0561, + "step": 2425 + }, + { + "epoch": 0.6917593384659253, + "grad_norm": 0.9154394384341286, + "learning_rate": 2.292382130680766e-06, + "loss": 0.0182, + "step": 2426 + }, + { + "epoch": 0.6920444824636441, + "grad_norm": 0.9492377155776165, + "learning_rate": 2.2885004701749695e-06, + "loss": 0.0181, + "step": 2427 + }, + { + "epoch": 0.692329626461363, + "grad_norm": 1.1013460236131432, + "learning_rate": 2.2846211233102387e-06, + "loss": 0.03, + "step": 2428 + }, + { + "epoch": 0.6926147704590818, + "grad_norm": 1.1127112319248678, + "learning_rate": 2.2807440933967034e-06, + "loss": 0.021, + "step": 2429 + }, + { + "epoch": 0.6928999144568007, + "grad_norm": 0.9531578556692747, + "learning_rate": 2.276869383742517e-06, + "loss": 0.0149, + "step": 2430 + }, + { + "epoch": 0.6931850584545195, + "grad_norm": 1.0679032804811803, + "learning_rate": 2.2729969976538524e-06, + "loss": 0.0187, + "step": 2431 + }, + { + "epoch": 0.6934702024522383, + "grad_norm": 2.528890187837627, + "learning_rate": 2.2691269384349007e-06, + "loss": 0.0397, + "step": 2432 + }, + { + "epoch": 0.6937553464499572, + "grad_norm": 1.366923157548722, + "learning_rate": 2.265259209387867e-06, + "loss": 0.0273, + "step": 2433 + }, + { + "epoch": 0.6940404904476761, + "grad_norm": 0.6850783877005101, + "learning_rate": 2.261393813812966e-06, + "loss": 0.015, + "step": 2434 + }, + { + "epoch": 0.694325634445395, + "grad_norm": 1.0072621570466294, + "learning_rate": 2.2575307550084295e-06, + "loss": 0.0234, + "step": 2435 + }, + { + "epoch": 0.6946107784431138, + "grad_norm": 1.3465168819525803, + "learning_rate": 2.2536700362704846e-06, + "loss": 0.0213, + "step": 2436 + }, + { + "epoch": 0.6948959224408326, + "grad_norm": 1.0391955609835548, + "learning_rate": 2.2498116608933673e-06, + "loss": 0.0157, + "step": 2437 + }, + { + "epoch": 0.6951810664385515, + "grad_norm": 2.814771203840875, + "learning_rate": 2.2459556321693123e-06, + "loss": 0.0552, + "step": 2438 + }, + { + "epoch": 0.6954662104362703, + "grad_norm": 0.6121746444624311, + "learning_rate": 2.242101953388556e-06, + "loss": 0.0112, + "step": 2439 + }, + { + "epoch": 0.6957513544339892, + "grad_norm": 1.3291384876570984, + "learning_rate": 2.238250627839325e-06, + "loss": 0.0131, + "step": 2440 + }, + { + "epoch": 0.696036498431708, + "grad_norm": 1.1148763168142317, + "learning_rate": 2.2344016588078403e-06, + "loss": 0.0354, + "step": 2441 + }, + { + "epoch": 0.6963216424294268, + "grad_norm": 1.36958013088009, + "learning_rate": 2.230555049578312e-06, + "loss": 0.029, + "step": 2442 + }, + { + "epoch": 0.6966067864271457, + "grad_norm": 0.7790771298504943, + "learning_rate": 2.2267108034329343e-06, + "loss": 0.0127, + "step": 2443 + }, + { + "epoch": 0.6968919304248645, + "grad_norm": 1.1333344045819018, + "learning_rate": 2.222868923651888e-06, + "loss": 0.0225, + "step": 2444 + }, + { + "epoch": 0.6971770744225834, + "grad_norm": 1.2892580769517954, + "learning_rate": 2.2190294135133334e-06, + "loss": 0.016, + "step": 2445 + }, + { + "epoch": 0.6974622184203022, + "grad_norm": 2.2596154885188304, + "learning_rate": 2.2151922762934096e-06, + "loss": 0.0359, + "step": 2446 + }, + { + "epoch": 0.6977473624180212, + "grad_norm": 1.5204543539753177, + "learning_rate": 2.2113575152662304e-06, + "loss": 0.0225, + "step": 2447 + }, + { + "epoch": 0.69803250641574, + "grad_norm": 2.129816314096946, + "learning_rate": 2.207525133703881e-06, + "loss": 0.0312, + "step": 2448 + }, + { + "epoch": 0.6983176504134588, + "grad_norm": 0.8726329016607753, + "learning_rate": 2.203695134876419e-06, + "loss": 0.0129, + "step": 2449 + }, + { + "epoch": 0.6986027944111777, + "grad_norm": 0.981509542326578, + "learning_rate": 2.199867522051865e-06, + "loss": 0.0306, + "step": 2450 + }, + { + "epoch": 0.6988879384088965, + "grad_norm": 1.2233864788734838, + "learning_rate": 2.1960422984962094e-06, + "loss": 0.0457, + "step": 2451 + }, + { + "epoch": 0.6991730824066154, + "grad_norm": 1.121663408796296, + "learning_rate": 2.1922194674734003e-06, + "loss": 0.0278, + "step": 2452 + }, + { + "epoch": 0.6994582264043342, + "grad_norm": 1.612742447635186, + "learning_rate": 2.1883990322453414e-06, + "loss": 0.0537, + "step": 2453 + }, + { + "epoch": 0.699743370402053, + "grad_norm": 0.9032206069159022, + "learning_rate": 2.184580996071895e-06, + "loss": 0.0382, + "step": 2454 + }, + { + "epoch": 0.7000285143997719, + "grad_norm": 1.3020251741485003, + "learning_rate": 2.1807653622108797e-06, + "loss": 0.0511, + "step": 2455 + }, + { + "epoch": 0.7003136583974907, + "grad_norm": 0.927814184904209, + "learning_rate": 2.1769521339180604e-06, + "loss": 0.0291, + "step": 2456 + }, + { + "epoch": 0.7005988023952096, + "grad_norm": 1.4080121788852615, + "learning_rate": 2.17314131444715e-06, + "loss": 0.0168, + "step": 2457 + }, + { + "epoch": 0.7008839463929284, + "grad_norm": 1.9115355677255668, + "learning_rate": 2.1693329070498057e-06, + "loss": 0.0243, + "step": 2458 + }, + { + "epoch": 0.7011690903906472, + "grad_norm": 0.6119517264756272, + "learning_rate": 2.165526914975628e-06, + "loss": 0.0128, + "step": 2459 + }, + { + "epoch": 0.7014542343883661, + "grad_norm": 1.2212817821331179, + "learning_rate": 2.1617233414721546e-06, + "loss": 0.0124, + "step": 2460 + }, + { + "epoch": 0.701739378386085, + "grad_norm": 1.0456467000073637, + "learning_rate": 2.1579221897848608e-06, + "loss": 0.0225, + "step": 2461 + }, + { + "epoch": 0.7020245223838039, + "grad_norm": 1.9001029978437518, + "learning_rate": 2.1541234631571533e-06, + "loss": 0.0347, + "step": 2462 + }, + { + "epoch": 0.7023096663815227, + "grad_norm": 1.1170803439317123, + "learning_rate": 2.1503271648303776e-06, + "loss": 0.0158, + "step": 2463 + }, + { + "epoch": 0.7025948103792415, + "grad_norm": 1.2513944489653663, + "learning_rate": 2.1465332980437937e-06, + "loss": 0.0185, + "step": 2464 + }, + { + "epoch": 0.7028799543769604, + "grad_norm": 0.9513327072225457, + "learning_rate": 2.1427418660345978e-06, + "loss": 0.0162, + "step": 2465 + }, + { + "epoch": 0.7031650983746792, + "grad_norm": 1.0188208991697878, + "learning_rate": 2.138952872037902e-06, + "loss": 0.044, + "step": 2466 + }, + { + "epoch": 0.7034502423723981, + "grad_norm": 1.5772768139954325, + "learning_rate": 2.135166319286745e-06, + "loss": 0.05, + "step": 2467 + }, + { + "epoch": 0.7037353863701169, + "grad_norm": 1.8088612980621237, + "learning_rate": 2.1313822110120787e-06, + "loss": 0.0434, + "step": 2468 + }, + { + "epoch": 0.7040205303678357, + "grad_norm": 0.589663362629804, + "learning_rate": 2.1276005504427643e-06, + "loss": 0.0089, + "step": 2469 + }, + { + "epoch": 0.7043056743655546, + "grad_norm": 1.0455626616060543, + "learning_rate": 2.1238213408055806e-06, + "loss": 0.0252, + "step": 2470 + }, + { + "epoch": 0.7045908183632734, + "grad_norm": 1.2419091857004885, + "learning_rate": 2.1200445853252165e-06, + "loss": 0.0313, + "step": 2471 + }, + { + "epoch": 0.7048759623609923, + "grad_norm": 1.309158885132024, + "learning_rate": 2.116270287224262e-06, + "loss": 0.016, + "step": 2472 + }, + { + "epoch": 0.7051611063587111, + "grad_norm": 2.46200008191099, + "learning_rate": 2.1124984497232127e-06, + "loss": 0.0427, + "step": 2473 + }, + { + "epoch": 0.70544625035643, + "grad_norm": 1.1686060450283622, + "learning_rate": 2.1087290760404634e-06, + "loss": 0.0226, + "step": 2474 + }, + { + "epoch": 0.7057313943541489, + "grad_norm": 1.4653606459109394, + "learning_rate": 2.1049621693923084e-06, + "loss": 0.0413, + "step": 2475 + }, + { + "epoch": 0.7060165383518677, + "grad_norm": 0.7719168267016131, + "learning_rate": 2.101197732992935e-06, + "loss": 0.0205, + "step": 2476 + }, + { + "epoch": 0.7063016823495866, + "grad_norm": 0.4502961937417831, + "learning_rate": 2.0974357700544244e-06, + "loss": 0.0117, + "step": 2477 + }, + { + "epoch": 0.7065868263473054, + "grad_norm": 1.385954489989293, + "learning_rate": 2.0936762837867445e-06, + "loss": 0.0242, + "step": 2478 + }, + { + "epoch": 0.7068719703450242, + "grad_norm": 2.1799652784336216, + "learning_rate": 2.0899192773977574e-06, + "loss": 0.0569, + "step": 2479 + }, + { + "epoch": 0.7071571143427431, + "grad_norm": 1.1110636882709009, + "learning_rate": 2.086164754093198e-06, + "loss": 0.0316, + "step": 2480 + }, + { + "epoch": 0.7074422583404619, + "grad_norm": 1.4941307442971459, + "learning_rate": 2.0824127170766904e-06, + "loss": 0.0239, + "step": 2481 + }, + { + "epoch": 0.7077274023381808, + "grad_norm": 0.6801030858403582, + "learning_rate": 2.0786631695497335e-06, + "loss": 0.0135, + "step": 2482 + }, + { + "epoch": 0.7080125463358996, + "grad_norm": 1.1364604361529025, + "learning_rate": 2.074916114711706e-06, + "loss": 0.0166, + "step": 2483 + }, + { + "epoch": 0.7082976903336184, + "grad_norm": 1.2388624592631503, + "learning_rate": 2.071171555759856e-06, + "loss": 0.02, + "step": 2484 + }, + { + "epoch": 0.7085828343313373, + "grad_norm": 0.9231133048338788, + "learning_rate": 2.0674294958893052e-06, + "loss": 0.0132, + "step": 2485 + }, + { + "epoch": 0.7088679783290561, + "grad_norm": 1.0122357410820453, + "learning_rate": 2.0636899382930357e-06, + "loss": 0.0195, + "step": 2486 + }, + { + "epoch": 0.7091531223267751, + "grad_norm": 1.0915144391953777, + "learning_rate": 2.0599528861619046e-06, + "loss": 0.0218, + "step": 2487 + }, + { + "epoch": 0.7094382663244939, + "grad_norm": 0.5781606335754661, + "learning_rate": 2.056218342684624e-06, + "loss": 0.0138, + "step": 2488 + }, + { + "epoch": 0.7097234103222128, + "grad_norm": 1.2034483165455416, + "learning_rate": 2.0524863110477683e-06, + "loss": 0.0208, + "step": 2489 + }, + { + "epoch": 0.7100085543199316, + "grad_norm": 1.0223652929411267, + "learning_rate": 2.0487567944357658e-06, + "loss": 0.0177, + "step": 2490 + }, + { + "epoch": 0.7102936983176504, + "grad_norm": 0.7589363858158285, + "learning_rate": 2.0450297960309057e-06, + "loss": 0.0191, + "step": 2491 + }, + { + "epoch": 0.7105788423153693, + "grad_norm": 1.7624288774750982, + "learning_rate": 2.0413053190133198e-06, + "loss": 0.0462, + "step": 2492 + }, + { + "epoch": 0.7108639863130881, + "grad_norm": 0.9123674547573875, + "learning_rate": 2.0375833665609927e-06, + "loss": 0.0349, + "step": 2493 + }, + { + "epoch": 0.711149130310807, + "grad_norm": 1.1539174639109313, + "learning_rate": 2.033863941849754e-06, + "loss": 0.0383, + "step": 2494 + }, + { + "epoch": 0.7114342743085258, + "grad_norm": 0.9661273666986842, + "learning_rate": 2.0301470480532803e-06, + "loss": 0.0296, + "step": 2495 + }, + { + "epoch": 0.7117194183062446, + "grad_norm": 1.101711554212656, + "learning_rate": 2.026432688343085e-06, + "loss": 0.0322, + "step": 2496 + }, + { + "epoch": 0.7120045623039635, + "grad_norm": 1.700057021557032, + "learning_rate": 2.0227208658885167e-06, + "loss": 0.0282, + "step": 2497 + }, + { + "epoch": 0.7122897063016823, + "grad_norm": 0.6125316137983147, + "learning_rate": 2.019011583856761e-06, + "loss": 0.009, + "step": 2498 + }, + { + "epoch": 0.7125748502994012, + "grad_norm": 2.1882827226129273, + "learning_rate": 2.015304845412841e-06, + "loss": 0.0422, + "step": 2499 + }, + { + "epoch": 0.71285999429712, + "grad_norm": 0.9211104223469543, + "learning_rate": 2.0116006537196033e-06, + "loss": 0.0327, + "step": 2500 + }, + { + "epoch": 0.7131451382948389, + "grad_norm": 1.0694540209104813, + "learning_rate": 2.0078990119377233e-06, + "loss": 0.0279, + "step": 2501 + }, + { + "epoch": 0.7134302822925578, + "grad_norm": 1.388175054161624, + "learning_rate": 2.004199923225701e-06, + "loss": 0.0259, + "step": 2502 + }, + { + "epoch": 0.7137154262902766, + "grad_norm": 1.0988981162449094, + "learning_rate": 2.0005033907398574e-06, + "loss": 0.0212, + "step": 2503 + }, + { + "epoch": 0.7140005702879955, + "grad_norm": 1.1482399987583405, + "learning_rate": 1.9968094176343322e-06, + "loss": 0.0281, + "step": 2504 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.5501580823692963, + "learning_rate": 1.9931180070610823e-06, + "loss": 0.012, + "step": 2505 + }, + { + "epoch": 0.7145708582834331, + "grad_norm": 1.2633694375174702, + "learning_rate": 1.989429162169876e-06, + "loss": 0.0304, + "step": 2506 + }, + { + "epoch": 0.714856002281152, + "grad_norm": 0.6924873154457065, + "learning_rate": 1.9857428861082976e-06, + "loss": 0.0141, + "step": 2507 + }, + { + "epoch": 0.7151411462788708, + "grad_norm": 0.6065820485902311, + "learning_rate": 1.9820591820217315e-06, + "loss": 0.0207, + "step": 2508 + }, + { + "epoch": 0.7154262902765897, + "grad_norm": 0.7765390907982586, + "learning_rate": 1.978378053053373e-06, + "loss": 0.0139, + "step": 2509 + }, + { + "epoch": 0.7157114342743085, + "grad_norm": 2.0988006081981316, + "learning_rate": 1.9746995023442177e-06, + "loss": 0.0231, + "step": 2510 + }, + { + "epoch": 0.7159965782720273, + "grad_norm": 1.0519010093126555, + "learning_rate": 1.9710235330330656e-06, + "loss": 0.0181, + "step": 2511 + }, + { + "epoch": 0.7162817222697462, + "grad_norm": 1.1614853715616795, + "learning_rate": 1.9673501482565083e-06, + "loss": 0.0341, + "step": 2512 + }, + { + "epoch": 0.716566866267465, + "grad_norm": 0.8821473934969634, + "learning_rate": 1.9636793511489377e-06, + "loss": 0.0229, + "step": 2513 + }, + { + "epoch": 0.716852010265184, + "grad_norm": 0.5250158672929854, + "learning_rate": 1.9600111448425285e-06, + "loss": 0.0159, + "step": 2514 + }, + { + "epoch": 0.7171371542629028, + "grad_norm": 1.9676827735955886, + "learning_rate": 1.9563455324672566e-06, + "loss": 0.033, + "step": 2515 + }, + { + "epoch": 0.7174222982606216, + "grad_norm": 0.648474303996466, + "learning_rate": 1.952682517150877e-06, + "loss": 0.0115, + "step": 2516 + }, + { + "epoch": 0.7177074422583405, + "grad_norm": 1.3772601549572416, + "learning_rate": 1.9490221020189306e-06, + "loss": 0.0301, + "step": 2517 + }, + { + "epoch": 0.7179925862560593, + "grad_norm": 1.526408416109426, + "learning_rate": 1.945364290194739e-06, + "loss": 0.0301, + "step": 2518 + }, + { + "epoch": 0.7182777302537782, + "grad_norm": 1.00941806936837, + "learning_rate": 1.941709084799404e-06, + "loss": 0.0403, + "step": 2519 + }, + { + "epoch": 0.718562874251497, + "grad_norm": 0.7841790764743731, + "learning_rate": 1.9380564889518027e-06, + "loss": 0.0206, + "step": 2520 + }, + { + "epoch": 0.7188480182492158, + "grad_norm": 0.3196980040678001, + "learning_rate": 1.9344065057685844e-06, + "loss": 0.0117, + "step": 2521 + }, + { + "epoch": 0.7191331622469347, + "grad_norm": 1.2494691029010219, + "learning_rate": 1.9307591383641704e-06, + "loss": 0.0271, + "step": 2522 + }, + { + "epoch": 0.7194183062446535, + "grad_norm": 3.35964679264394, + "learning_rate": 1.927114389850749e-06, + "loss": 0.0591, + "step": 2523 + }, + { + "epoch": 0.7197034502423724, + "grad_norm": 1.6156104346637272, + "learning_rate": 1.923472263338278e-06, + "loss": 0.0249, + "step": 2524 + }, + { + "epoch": 0.7199885942400912, + "grad_norm": 1.1690876897119913, + "learning_rate": 1.919832761934471e-06, + "loss": 0.0336, + "step": 2525 + }, + { + "epoch": 0.72027373823781, + "grad_norm": 0.4901043573896946, + "learning_rate": 1.9161958887448036e-06, + "loss": 0.0087, + "step": 2526 + }, + { + "epoch": 0.720558882235529, + "grad_norm": 0.5766787110046747, + "learning_rate": 1.912561646872515e-06, + "loss": 0.0194, + "step": 2527 + }, + { + "epoch": 0.7208440262332478, + "grad_norm": 0.4067719479340745, + "learning_rate": 1.908930039418593e-06, + "loss": 0.0074, + "step": 2528 + }, + { + "epoch": 0.7211291702309667, + "grad_norm": 0.8190616283681241, + "learning_rate": 1.9053010694817792e-06, + "loss": 0.0151, + "step": 2529 + }, + { + "epoch": 0.7214143142286855, + "grad_norm": 1.4745099441171658, + "learning_rate": 1.9016747401585612e-06, + "loss": 0.0192, + "step": 2530 + }, + { + "epoch": 0.7216994582264044, + "grad_norm": 1.1626077125676921, + "learning_rate": 1.8980510545431813e-06, + "loss": 0.0336, + "step": 2531 + }, + { + "epoch": 0.7219846022241232, + "grad_norm": 2.3232659525233785, + "learning_rate": 1.89443001572762e-06, + "loss": 0.0488, + "step": 2532 + }, + { + "epoch": 0.722269746221842, + "grad_norm": 0.4293645535795284, + "learning_rate": 1.8908116268016009e-06, + "loss": 0.0186, + "step": 2533 + }, + { + "epoch": 0.7225548902195609, + "grad_norm": 0.7057630192922806, + "learning_rate": 1.8871958908525861e-06, + "loss": 0.0165, + "step": 2534 + }, + { + "epoch": 0.7228400342172797, + "grad_norm": 0.7801979051938523, + "learning_rate": 1.883582810965775e-06, + "loss": 0.0173, + "step": 2535 + }, + { + "epoch": 0.7231251782149986, + "grad_norm": 0.6054884481515708, + "learning_rate": 1.8799723902240995e-06, + "loss": 0.012, + "step": 2536 + }, + { + "epoch": 0.7234103222127174, + "grad_norm": 0.7295030558864235, + "learning_rate": 1.8763646317082234e-06, + "loss": 0.0143, + "step": 2537 + }, + { + "epoch": 0.7236954662104362, + "grad_norm": 0.2348493963859001, + "learning_rate": 1.872759538496539e-06, + "loss": 0.0063, + "step": 2538 + }, + { + "epoch": 0.7239806102081551, + "grad_norm": 1.5317613085812878, + "learning_rate": 1.869157113665162e-06, + "loss": 0.0177, + "step": 2539 + }, + { + "epoch": 0.7242657542058739, + "grad_norm": 1.1185611844047032, + "learning_rate": 1.8655573602879384e-06, + "loss": 0.0406, + "step": 2540 + }, + { + "epoch": 0.7245508982035929, + "grad_norm": 0.7553842956065492, + "learning_rate": 1.8619602814364241e-06, + "loss": 0.0142, + "step": 2541 + }, + { + "epoch": 0.7248360422013117, + "grad_norm": 1.613711705936186, + "learning_rate": 1.8583658801798988e-06, + "loss": 0.021, + "step": 2542 + }, + { + "epoch": 0.7251211861990305, + "grad_norm": 0.6639471861383226, + "learning_rate": 1.8547741595853603e-06, + "loss": 0.0152, + "step": 2543 + }, + { + "epoch": 0.7254063301967494, + "grad_norm": 1.043444007239291, + "learning_rate": 1.8511851227175142e-06, + "loss": 0.0163, + "step": 2544 + }, + { + "epoch": 0.7256914741944682, + "grad_norm": 1.3859476624660125, + "learning_rate": 1.8475987726387783e-06, + "loss": 0.028, + "step": 2545 + }, + { + "epoch": 0.7259766181921871, + "grad_norm": 1.2707053562031854, + "learning_rate": 1.8440151124092764e-06, + "loss": 0.0203, + "step": 2546 + }, + { + "epoch": 0.7262617621899059, + "grad_norm": 0.7743398165083601, + "learning_rate": 1.8404341450868385e-06, + "loss": 0.0225, + "step": 2547 + }, + { + "epoch": 0.7265469061876247, + "grad_norm": 1.4948566545401907, + "learning_rate": 1.836855873726997e-06, + "loss": 0.0289, + "step": 2548 + }, + { + "epoch": 0.7268320501853436, + "grad_norm": 1.4926542884907947, + "learning_rate": 1.8332803013829824e-06, + "loss": 0.0314, + "step": 2549 + }, + { + "epoch": 0.7271171941830624, + "grad_norm": 1.7158591542352042, + "learning_rate": 1.8297074311057233e-06, + "loss": 0.0264, + "step": 2550 + }, + { + "epoch": 0.7274023381807813, + "grad_norm": 0.7051197790810769, + "learning_rate": 1.826137265943843e-06, + "loss": 0.0123, + "step": 2551 + }, + { + "epoch": 0.7276874821785001, + "grad_norm": 1.2332019677453634, + "learning_rate": 1.822569808943656e-06, + "loss": 0.0255, + "step": 2552 + }, + { + "epoch": 0.7279726261762189, + "grad_norm": 1.4227488885844022, + "learning_rate": 1.8190050631491662e-06, + "loss": 0.0186, + "step": 2553 + }, + { + "epoch": 0.7282577701739379, + "grad_norm": 0.888470253708542, + "learning_rate": 1.8154430316020638e-06, + "loss": 0.0091, + "step": 2554 + }, + { + "epoch": 0.7285429141716567, + "grad_norm": 0.6362280906290803, + "learning_rate": 1.811883717341722e-06, + "loss": 0.0227, + "step": 2555 + }, + { + "epoch": 0.7288280581693756, + "grad_norm": 2.0627256070084683, + "learning_rate": 1.8083271234051991e-06, + "loss": 0.0383, + "step": 2556 + }, + { + "epoch": 0.7291132021670944, + "grad_norm": 1.1067413056479523, + "learning_rate": 1.804773252827231e-06, + "loss": 0.0333, + "step": 2557 + }, + { + "epoch": 0.7293983461648132, + "grad_norm": 2.0036690116211333, + "learning_rate": 1.8012221086402226e-06, + "loss": 0.0302, + "step": 2558 + }, + { + "epoch": 0.7296834901625321, + "grad_norm": 1.7330904351751883, + "learning_rate": 1.7976736938742646e-06, + "loss": 0.0202, + "step": 2559 + }, + { + "epoch": 0.7299686341602509, + "grad_norm": 1.9066727681418387, + "learning_rate": 1.7941280115571103e-06, + "loss": 0.0521, + "step": 2560 + }, + { + "epoch": 0.7302537781579698, + "grad_norm": 0.8350093333223596, + "learning_rate": 1.7905850647141842e-06, + "loss": 0.0174, + "step": 2561 + }, + { + "epoch": 0.7305389221556886, + "grad_norm": 0.8935901878058463, + "learning_rate": 1.787044856368576e-06, + "loss": 0.0124, + "step": 2562 + }, + { + "epoch": 0.7308240661534074, + "grad_norm": 0.2609415835600623, + "learning_rate": 1.7835073895410393e-06, + "loss": 0.0034, + "step": 2563 + }, + { + "epoch": 0.7311092101511263, + "grad_norm": 0.7383110088545776, + "learning_rate": 1.779972667249989e-06, + "loss": 0.0132, + "step": 2564 + }, + { + "epoch": 0.7313943541488451, + "grad_norm": 0.82098631234755, + "learning_rate": 1.7764406925114957e-06, + "loss": 0.0181, + "step": 2565 + }, + { + "epoch": 0.731679498146564, + "grad_norm": 1.2122779163003397, + "learning_rate": 1.7729114683392889e-06, + "loss": 0.0186, + "step": 2566 + }, + { + "epoch": 0.7319646421442829, + "grad_norm": 1.0409332876987545, + "learning_rate": 1.769384997744747e-06, + "loss": 0.0218, + "step": 2567 + }, + { + "epoch": 0.7322497861420018, + "grad_norm": 1.0657193928152229, + "learning_rate": 1.7658612837369065e-06, + "loss": 0.0255, + "step": 2568 + }, + { + "epoch": 0.7325349301397206, + "grad_norm": 2.009539824029055, + "learning_rate": 1.7623403293224423e-06, + "loss": 0.0335, + "step": 2569 + }, + { + "epoch": 0.7328200741374394, + "grad_norm": 0.47135702796339735, + "learning_rate": 1.7588221375056797e-06, + "loss": 0.0124, + "step": 2570 + }, + { + "epoch": 0.7331052181351583, + "grad_norm": 0.5183616314191448, + "learning_rate": 1.7553067112885846e-06, + "loss": 0.0047, + "step": 2571 + }, + { + "epoch": 0.7333903621328771, + "grad_norm": 0.9381090221129461, + "learning_rate": 1.751794053670769e-06, + "loss": 0.0172, + "step": 2572 + }, + { + "epoch": 0.733675506130596, + "grad_norm": 1.5789937408733317, + "learning_rate": 1.7482841676494766e-06, + "loss": 0.0205, + "step": 2573 + }, + { + "epoch": 0.7339606501283148, + "grad_norm": 0.5928552002577685, + "learning_rate": 1.7447770562195831e-06, + "loss": 0.0137, + "step": 2574 + }, + { + "epoch": 0.7342457941260336, + "grad_norm": 1.1734406658202, + "learning_rate": 1.741272722373607e-06, + "loss": 0.0276, + "step": 2575 + }, + { + "epoch": 0.7345309381237525, + "grad_norm": 1.3469595275728594, + "learning_rate": 1.7377711691016885e-06, + "loss": 0.0291, + "step": 2576 + }, + { + "epoch": 0.7348160821214713, + "grad_norm": 1.0553587040388435, + "learning_rate": 1.7342723993915984e-06, + "loss": 0.0337, + "step": 2577 + }, + { + "epoch": 0.7351012261191902, + "grad_norm": 0.6104053843204919, + "learning_rate": 1.730776416228731e-06, + "loss": 0.0106, + "step": 2578 + }, + { + "epoch": 0.735386370116909, + "grad_norm": 1.180139523029748, + "learning_rate": 1.727283222596105e-06, + "loss": 0.0286, + "step": 2579 + }, + { + "epoch": 0.7356715141146278, + "grad_norm": 0.8943007056784072, + "learning_rate": 1.723792821474356e-06, + "loss": 0.0327, + "step": 2580 + }, + { + "epoch": 0.7359566581123468, + "grad_norm": 1.407472268943547, + "learning_rate": 1.7203052158417395e-06, + "loss": 0.039, + "step": 2581 + }, + { + "epoch": 0.7362418021100656, + "grad_norm": 1.2414400485072907, + "learning_rate": 1.7168204086741242e-06, + "loss": 0.0365, + "step": 2582 + }, + { + "epoch": 0.7365269461077845, + "grad_norm": 1.045965446497316, + "learning_rate": 1.7133384029449895e-06, + "loss": 0.0134, + "step": 2583 + }, + { + "epoch": 0.7368120901055033, + "grad_norm": 1.3401194939322665, + "learning_rate": 1.7098592016254318e-06, + "loss": 0.0192, + "step": 2584 + }, + { + "epoch": 0.7370972341032221, + "grad_norm": 0.4330957022031905, + "learning_rate": 1.7063828076841433e-06, + "loss": 0.0042, + "step": 2585 + }, + { + "epoch": 0.737382378100941, + "grad_norm": 0.6130445246080449, + "learning_rate": 1.7029092240874284e-06, + "loss": 0.0089, + "step": 2586 + }, + { + "epoch": 0.7376675220986598, + "grad_norm": 0.8820092232571117, + "learning_rate": 1.6994384537991898e-06, + "loss": 0.011, + "step": 2587 + }, + { + "epoch": 0.7379526660963787, + "grad_norm": 1.0509254864322741, + "learning_rate": 1.6959704997809355e-06, + "loss": 0.0291, + "step": 2588 + }, + { + "epoch": 0.7382378100940975, + "grad_norm": 1.3113452555468992, + "learning_rate": 1.6925053649917645e-06, + "loss": 0.0255, + "step": 2589 + }, + { + "epoch": 0.7385229540918163, + "grad_norm": 1.1114160968697633, + "learning_rate": 1.6890430523883715e-06, + "loss": 0.0176, + "step": 2590 + }, + { + "epoch": 0.7388080980895352, + "grad_norm": 1.4047944529889076, + "learning_rate": 1.6855835649250446e-06, + "loss": 0.0161, + "step": 2591 + }, + { + "epoch": 0.739093242087254, + "grad_norm": 1.268771098085473, + "learning_rate": 1.6821269055536604e-06, + "loss": 0.0359, + "step": 2592 + }, + { + "epoch": 0.7393783860849729, + "grad_norm": 1.0412207129886348, + "learning_rate": 1.678673077223682e-06, + "loss": 0.0111, + "step": 2593 + }, + { + "epoch": 0.7396635300826918, + "grad_norm": 1.9488662026753016, + "learning_rate": 1.6752220828821574e-06, + "loss": 0.0355, + "step": 2594 + }, + { + "epoch": 0.7399486740804107, + "grad_norm": 0.6388958326044936, + "learning_rate": 1.671773925473717e-06, + "loss": 0.0161, + "step": 2595 + }, + { + "epoch": 0.7402338180781295, + "grad_norm": 1.3676324169071015, + "learning_rate": 1.6683286079405692e-06, + "loss": 0.0446, + "step": 2596 + }, + { + "epoch": 0.7405189620758483, + "grad_norm": 0.7729955227718246, + "learning_rate": 1.6648861332225002e-06, + "loss": 0.0168, + "step": 2597 + }, + { + "epoch": 0.7408041060735672, + "grad_norm": 0.6524911607576815, + "learning_rate": 1.66144650425687e-06, + "loss": 0.0067, + "step": 2598 + }, + { + "epoch": 0.741089250071286, + "grad_norm": 0.9737064422050019, + "learning_rate": 1.6580097239786096e-06, + "loss": 0.0215, + "step": 2599 + }, + { + "epoch": 0.7413743940690048, + "grad_norm": 2.1748723955431237, + "learning_rate": 1.654575795320223e-06, + "loss": 0.0546, + "step": 2600 + }, + { + "epoch": 0.7416595380667237, + "grad_norm": 1.4285912646142453, + "learning_rate": 1.6511447212117786e-06, + "loss": 0.0346, + "step": 2601 + }, + { + "epoch": 0.7419446820644425, + "grad_norm": 2.7288639089504745, + "learning_rate": 1.6477165045809052e-06, + "loss": 0.0358, + "step": 2602 + }, + { + "epoch": 0.7422298260621614, + "grad_norm": 2.3650702688759018, + "learning_rate": 1.6442911483527978e-06, + "loss": 0.0669, + "step": 2603 + }, + { + "epoch": 0.7425149700598802, + "grad_norm": 1.2690767225620623, + "learning_rate": 1.6408686554502124e-06, + "loss": 0.0169, + "step": 2604 + }, + { + "epoch": 0.742800114057599, + "grad_norm": 1.4869660462533452, + "learning_rate": 1.637449028793458e-06, + "loss": 0.0294, + "step": 2605 + }, + { + "epoch": 0.7430852580553179, + "grad_norm": 1.3737598171297696, + "learning_rate": 1.6340322713003992e-06, + "loss": 0.026, + "step": 2606 + }, + { + "epoch": 0.7433704020530368, + "grad_norm": 0.5276542234582096, + "learning_rate": 1.6306183858864528e-06, + "loss": 0.0108, + "step": 2607 + }, + { + "epoch": 0.7436555460507557, + "grad_norm": 1.208035745863832, + "learning_rate": 1.6272073754645845e-06, + "loss": 0.0434, + "step": 2608 + }, + { + "epoch": 0.7439406900484745, + "grad_norm": 1.2060462983723554, + "learning_rate": 1.623799242945307e-06, + "loss": 0.0599, + "step": 2609 + }, + { + "epoch": 0.7442258340461934, + "grad_norm": 1.613815953946869, + "learning_rate": 1.6203939912366768e-06, + "loss": 0.0241, + "step": 2610 + }, + { + "epoch": 0.7445109780439122, + "grad_norm": 1.460570451296447, + "learning_rate": 1.6169916232442923e-06, + "loss": 0.0306, + "step": 2611 + }, + { + "epoch": 0.744796122041631, + "grad_norm": 1.0008169723964753, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.0215, + "step": 2612 + }, + { + "epoch": 0.7450812660393499, + "grad_norm": 2.086617218943197, + "learning_rate": 1.610195550018358e-06, + "loss": 0.0274, + "step": 2613 + }, + { + "epoch": 0.7453664100370687, + "grad_norm": 1.6041573788510077, + "learning_rate": 1.6068018505836901e-06, + "loss": 0.019, + "step": 2614 + }, + { + "epoch": 0.7456515540347876, + "grad_norm": 0.7713053016927894, + "learning_rate": 1.6034110464630325e-06, + "loss": 0.0232, + "step": 2615 + }, + { + "epoch": 0.7459366980325064, + "grad_norm": 0.5511320080746058, + "learning_rate": 1.6000231405496602e-06, + "loss": 0.0191, + "step": 2616 + }, + { + "epoch": 0.7462218420302252, + "grad_norm": 1.0634637982976405, + "learning_rate": 1.5966381357343708e-06, + "loss": 0.0175, + "step": 2617 + }, + { + "epoch": 0.7465069860279441, + "grad_norm": 1.0374330207474594, + "learning_rate": 1.5932560349054838e-06, + "loss": 0.0245, + "step": 2618 + }, + { + "epoch": 0.7467921300256629, + "grad_norm": 1.0108319276571467, + "learning_rate": 1.5898768409488447e-06, + "loss": 0.0136, + "step": 2619 + }, + { + "epoch": 0.7470772740233818, + "grad_norm": 1.6396915594949968, + "learning_rate": 1.5865005567478215e-06, + "loss": 0.0251, + "step": 2620 + }, + { + "epoch": 0.7473624180211007, + "grad_norm": 0.9627999719461374, + "learning_rate": 1.5831271851832937e-06, + "loss": 0.0391, + "step": 2621 + }, + { + "epoch": 0.7476475620188195, + "grad_norm": 0.8253787374724513, + "learning_rate": 1.5797567291336586e-06, + "loss": 0.0122, + "step": 2622 + }, + { + "epoch": 0.7479327060165384, + "grad_norm": 0.6661195439268087, + "learning_rate": 1.5763891914748241e-06, + "loss": 0.0221, + "step": 2623 + }, + { + "epoch": 0.7482178500142572, + "grad_norm": 0.8779901773734643, + "learning_rate": 1.5730245750802103e-06, + "loss": 0.0195, + "step": 2624 + }, + { + "epoch": 0.7485029940119761, + "grad_norm": 1.6634272557568168, + "learning_rate": 1.5696628828207421e-06, + "loss": 0.037, + "step": 2625 + }, + { + "epoch": 0.7487881380096949, + "grad_norm": 1.4516278438233137, + "learning_rate": 1.5663041175648513e-06, + "loss": 0.0219, + "step": 2626 + }, + { + "epoch": 0.7490732820074137, + "grad_norm": 1.907475947775475, + "learning_rate": 1.5629482821784691e-06, + "loss": 0.0314, + "step": 2627 + }, + { + "epoch": 0.7493584260051326, + "grad_norm": 1.2003200972703238, + "learning_rate": 1.5595953795250346e-06, + "loss": 0.0242, + "step": 2628 + }, + { + "epoch": 0.7496435700028514, + "grad_norm": 1.1021426342768983, + "learning_rate": 1.556245412465473e-06, + "loss": 0.034, + "step": 2629 + }, + { + "epoch": 0.7499287140005703, + "grad_norm": 1.0302550777919475, + "learning_rate": 1.5528983838582129e-06, + "loss": 0.0285, + "step": 2630 + }, + { + "epoch": 0.7502138579982891, + "grad_norm": 0.8308789044386139, + "learning_rate": 1.5495542965591709e-06, + "loss": 0.0114, + "step": 2631 + }, + { + "epoch": 0.7504990019960079, + "grad_norm": 0.8819505544759334, + "learning_rate": 1.5462131534217607e-06, + "loss": 0.0159, + "step": 2632 + }, + { + "epoch": 0.7507841459937268, + "grad_norm": 0.9835443580092531, + "learning_rate": 1.542874957296876e-06, + "loss": 0.0154, + "step": 2633 + }, + { + "epoch": 0.7510692899914457, + "grad_norm": 1.0665665377487932, + "learning_rate": 1.5395397110329024e-06, + "loss": 0.0353, + "step": 2634 + }, + { + "epoch": 0.7513544339891646, + "grad_norm": 0.8164632784798294, + "learning_rate": 1.5362074174756998e-06, + "loss": 0.0134, + "step": 2635 + }, + { + "epoch": 0.7516395779868834, + "grad_norm": 1.646300986945349, + "learning_rate": 1.5328780794686188e-06, + "loss": 0.0394, + "step": 2636 + }, + { + "epoch": 0.7519247219846023, + "grad_norm": 1.1301313594828286, + "learning_rate": 1.5295516998524823e-06, + "loss": 0.0142, + "step": 2637 + }, + { + "epoch": 0.7522098659823211, + "grad_norm": 0.5764109215471761, + "learning_rate": 1.5262282814655893e-06, + "loss": 0.0142, + "step": 2638 + }, + { + "epoch": 0.7524950099800399, + "grad_norm": 1.3610524683318708, + "learning_rate": 1.5229078271437141e-06, + "loss": 0.034, + "step": 2639 + }, + { + "epoch": 0.7527801539777588, + "grad_norm": 0.477649205446183, + "learning_rate": 1.5195903397200996e-06, + "loss": 0.0109, + "step": 2640 + }, + { + "epoch": 0.7530652979754776, + "grad_norm": 1.515573238930559, + "learning_rate": 1.5162758220254586e-06, + "loss": 0.0166, + "step": 2641 + }, + { + "epoch": 0.7533504419731964, + "grad_norm": 0.7091310562536813, + "learning_rate": 1.5129642768879687e-06, + "loss": 0.0135, + "step": 2642 + }, + { + "epoch": 0.7536355859709153, + "grad_norm": 1.5074902375844876, + "learning_rate": 1.5096557071332712e-06, + "loss": 0.0253, + "step": 2643 + }, + { + "epoch": 0.7539207299686341, + "grad_norm": 1.0767879630125734, + "learning_rate": 1.5063501155844723e-06, + "loss": 0.0167, + "step": 2644 + }, + { + "epoch": 0.754205873966353, + "grad_norm": 3.1348957847232226, + "learning_rate": 1.5030475050621336e-06, + "loss": 0.0288, + "step": 2645 + }, + { + "epoch": 0.7544910179640718, + "grad_norm": 0.6684133082529641, + "learning_rate": 1.49974787838427e-06, + "loss": 0.0084, + "step": 2646 + }, + { + "epoch": 0.7547761619617908, + "grad_norm": 0.7727753084015943, + "learning_rate": 1.4964512383663544e-06, + "loss": 0.0219, + "step": 2647 + }, + { + "epoch": 0.7550613059595096, + "grad_norm": 2.0188024106365887, + "learning_rate": 1.4931575878213127e-06, + "loss": 0.0395, + "step": 2648 + }, + { + "epoch": 0.7553464499572284, + "grad_norm": 0.4906606761025358, + "learning_rate": 1.4898669295595181e-06, + "loss": 0.0069, + "step": 2649 + }, + { + "epoch": 0.7556315939549473, + "grad_norm": 1.260687367143981, + "learning_rate": 1.4865792663887907e-06, + "loss": 0.0319, + "step": 2650 + }, + { + "epoch": 0.7559167379526661, + "grad_norm": 1.0883273196471233, + "learning_rate": 1.4832946011143906e-06, + "loss": 0.0264, + "step": 2651 + }, + { + "epoch": 0.756201881950385, + "grad_norm": 0.35999385019644614, + "learning_rate": 1.4800129365390282e-06, + "loss": 0.0055, + "step": 2652 + }, + { + "epoch": 0.7564870259481038, + "grad_norm": 0.4561544926370098, + "learning_rate": 1.4767342754628477e-06, + "loss": 0.0095, + "step": 2653 + }, + { + "epoch": 0.7567721699458226, + "grad_norm": 0.8901887574735574, + "learning_rate": 1.4734586206834323e-06, + "loss": 0.0198, + "step": 2654 + }, + { + "epoch": 0.7570573139435415, + "grad_norm": 0.7083720617572299, + "learning_rate": 1.4701859749958004e-06, + "loss": 0.0152, + "step": 2655 + }, + { + "epoch": 0.7573424579412603, + "grad_norm": 1.1369840531128947, + "learning_rate": 1.466916341192401e-06, + "loss": 0.0141, + "step": 2656 + }, + { + "epoch": 0.7576276019389792, + "grad_norm": 1.2957588766875028, + "learning_rate": 1.4636497220631145e-06, + "loss": 0.0336, + "step": 2657 + }, + { + "epoch": 0.757912745936698, + "grad_norm": 2.0713263046299173, + "learning_rate": 1.4603861203952502e-06, + "loss": 0.0302, + "step": 2658 + }, + { + "epoch": 0.7581978899344168, + "grad_norm": 1.1733273590818454, + "learning_rate": 1.4571255389735385e-06, + "loss": 0.0234, + "step": 2659 + }, + { + "epoch": 0.7584830339321357, + "grad_norm": 2.1819052963632717, + "learning_rate": 1.4538679805801386e-06, + "loss": 0.0329, + "step": 2660 + }, + { + "epoch": 0.7587681779298546, + "grad_norm": 0.3495163345720255, + "learning_rate": 1.4506134479946281e-06, + "loss": 0.0098, + "step": 2661 + }, + { + "epoch": 0.7590533219275735, + "grad_norm": 0.5490129444641568, + "learning_rate": 1.4473619439939985e-06, + "loss": 0.0065, + "step": 2662 + }, + { + "epoch": 0.7593384659252923, + "grad_norm": 1.581308447857722, + "learning_rate": 1.4441134713526595e-06, + "loss": 0.057, + "step": 2663 + }, + { + "epoch": 0.7596236099230111, + "grad_norm": 1.0260023381149048, + "learning_rate": 1.440868032842439e-06, + "loss": 0.0185, + "step": 2664 + }, + { + "epoch": 0.75990875392073, + "grad_norm": 0.5787379526406348, + "learning_rate": 1.437625631232571e-06, + "loss": 0.0151, + "step": 2665 + }, + { + "epoch": 0.7601938979184488, + "grad_norm": 1.5032898745586536, + "learning_rate": 1.4343862692896986e-06, + "loss": 0.0191, + "step": 2666 + }, + { + "epoch": 0.7604790419161677, + "grad_norm": 0.9364587983512026, + "learning_rate": 1.431149949777873e-06, + "loss": 0.0138, + "step": 2667 + }, + { + "epoch": 0.7607641859138865, + "grad_norm": 0.7579871237730098, + "learning_rate": 1.4279166754585472e-06, + "loss": 0.0236, + "step": 2668 + }, + { + "epoch": 0.7610493299116053, + "grad_norm": 1.2909526522887504, + "learning_rate": 1.4246864490905776e-06, + "loss": 0.0228, + "step": 2669 + }, + { + "epoch": 0.7613344739093242, + "grad_norm": 1.313113386385982, + "learning_rate": 1.421459273430219e-06, + "loss": 0.0234, + "step": 2670 + }, + { + "epoch": 0.761619617907043, + "grad_norm": 0.8158285471748641, + "learning_rate": 1.4182351512311237e-06, + "loss": 0.027, + "step": 2671 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 1.1047514162563927, + "learning_rate": 1.4150140852443389e-06, + "loss": 0.016, + "step": 2672 + }, + { + "epoch": 0.7621899059024807, + "grad_norm": 0.7487315401677382, + "learning_rate": 1.4117960782183021e-06, + "loss": 0.0075, + "step": 2673 + }, + { + "epoch": 0.7624750499001997, + "grad_norm": 1.5926773015253632, + "learning_rate": 1.4085811328988425e-06, + "loss": 0.0245, + "step": 2674 + }, + { + "epoch": 0.7627601938979185, + "grad_norm": 1.0230513489809057, + "learning_rate": 1.405369252029175e-06, + "loss": 0.0108, + "step": 2675 + }, + { + "epoch": 0.7630453378956373, + "grad_norm": 0.6835988253710413, + "learning_rate": 1.4021604383499044e-06, + "loss": 0.0141, + "step": 2676 + }, + { + "epoch": 0.7633304818933562, + "grad_norm": 1.2256591335523719, + "learning_rate": 1.3989546945990129e-06, + "loss": 0.0158, + "step": 2677 + }, + { + "epoch": 0.763615625891075, + "grad_norm": 1.2930161491847667, + "learning_rate": 1.395752023511867e-06, + "loss": 0.0195, + "step": 2678 + }, + { + "epoch": 0.7639007698887939, + "grad_norm": 0.8994233014350279, + "learning_rate": 1.392552427821205e-06, + "loss": 0.0087, + "step": 2679 + }, + { + "epoch": 0.7641859138865127, + "grad_norm": 0.6770559844138675, + "learning_rate": 1.3893559102571502e-06, + "loss": 0.0141, + "step": 2680 + }, + { + "epoch": 0.7644710578842315, + "grad_norm": 1.4930856086234503, + "learning_rate": 1.386162473547194e-06, + "loss": 0.0343, + "step": 2681 + }, + { + "epoch": 0.7647562018819504, + "grad_norm": 1.682448635821019, + "learning_rate": 1.3829721204162e-06, + "loss": 0.0297, + "step": 2682 + }, + { + "epoch": 0.7650413458796692, + "grad_norm": 1.5491061182481676, + "learning_rate": 1.3797848535863995e-06, + "loss": 0.0326, + "step": 2683 + }, + { + "epoch": 0.765326489877388, + "grad_norm": 1.175627444484784, + "learning_rate": 1.3766006757773932e-06, + "loss": 0.0218, + "step": 2684 + }, + { + "epoch": 0.7656116338751069, + "grad_norm": 0.9840348835612843, + "learning_rate": 1.373419589706143e-06, + "loss": 0.0211, + "step": 2685 + }, + { + "epoch": 0.7658967778728257, + "grad_norm": 1.0057768113881969, + "learning_rate": 1.3702415980869743e-06, + "loss": 0.0464, + "step": 2686 + }, + { + "epoch": 0.7661819218705447, + "grad_norm": 1.7830889645336818, + "learning_rate": 1.3670667036315728e-06, + "loss": 0.0185, + "step": 2687 + }, + { + "epoch": 0.7664670658682635, + "grad_norm": 0.4224535629956841, + "learning_rate": 1.3638949090489772e-06, + "loss": 0.0081, + "step": 2688 + }, + { + "epoch": 0.7667522098659824, + "grad_norm": 0.6377033854455558, + "learning_rate": 1.360726217045591e-06, + "loss": 0.01, + "step": 2689 + }, + { + "epoch": 0.7670373538637012, + "grad_norm": 1.6543881718541624, + "learning_rate": 1.3575606303251582e-06, + "loss": 0.0274, + "step": 2690 + }, + { + "epoch": 0.76732249786142, + "grad_norm": 0.705004441035942, + "learning_rate": 1.3543981515887788e-06, + "loss": 0.0168, + "step": 2691 + }, + { + "epoch": 0.7676076418591389, + "grad_norm": 1.5935922533656512, + "learning_rate": 1.3512387835349045e-06, + "loss": 0.0248, + "step": 2692 + }, + { + "epoch": 0.7678927858568577, + "grad_norm": 0.7488281251727223, + "learning_rate": 1.3480825288593274e-06, + "loss": 0.0139, + "step": 2693 + }, + { + "epoch": 0.7681779298545766, + "grad_norm": 0.8165161964838176, + "learning_rate": 1.3449293902551857e-06, + "loss": 0.0199, + "step": 2694 + }, + { + "epoch": 0.7684630738522954, + "grad_norm": 1.5726186357766658, + "learning_rate": 1.341779370412954e-06, + "loss": 0.0364, + "step": 2695 + }, + { + "epoch": 0.7687482178500142, + "grad_norm": 0.8244759122464347, + "learning_rate": 1.3386324720204542e-06, + "loss": 0.0119, + "step": 2696 + }, + { + "epoch": 0.7690333618477331, + "grad_norm": 1.5946273315050323, + "learning_rate": 1.3354886977628374e-06, + "loss": 0.0267, + "step": 2697 + }, + { + "epoch": 0.7693185058454519, + "grad_norm": 0.8033194869459227, + "learning_rate": 1.3323480503225933e-06, + "loss": 0.0086, + "step": 2698 + }, + { + "epoch": 0.7696036498431708, + "grad_norm": 1.1393108615672012, + "learning_rate": 1.3292105323795406e-06, + "loss": 0.0277, + "step": 2699 + }, + { + "epoch": 0.7698887938408896, + "grad_norm": 1.9860907344454382, + "learning_rate": 1.3260761466108302e-06, + "loss": 0.0211, + "step": 2700 + }, + { + "epoch": 0.7701739378386085, + "grad_norm": 1.5082456222402305, + "learning_rate": 1.3229448956909385e-06, + "loss": 0.022, + "step": 2701 + }, + { + "epoch": 0.7704590818363274, + "grad_norm": 1.5344921371462583, + "learning_rate": 1.3198167822916685e-06, + "loss": 0.0245, + "step": 2702 + }, + { + "epoch": 0.7707442258340462, + "grad_norm": 0.7564484297441108, + "learning_rate": 1.3166918090821445e-06, + "loss": 0.0141, + "step": 2703 + }, + { + "epoch": 0.7710293698317651, + "grad_norm": 0.43057532343881677, + "learning_rate": 1.3135699787288114e-06, + "loss": 0.0059, + "step": 2704 + }, + { + "epoch": 0.7713145138294839, + "grad_norm": 1.0858462972323473, + "learning_rate": 1.3104512938954373e-06, + "loss": 0.0327, + "step": 2705 + }, + { + "epoch": 0.7715996578272027, + "grad_norm": 1.1709018893965801, + "learning_rate": 1.3073357572430984e-06, + "loss": 0.032, + "step": 2706 + }, + { + "epoch": 0.7718848018249216, + "grad_norm": 0.28314832345320107, + "learning_rate": 1.3042233714301893e-06, + "loss": 0.0063, + "step": 2707 + }, + { + "epoch": 0.7721699458226404, + "grad_norm": 1.0670337951747642, + "learning_rate": 1.3011141391124138e-06, + "loss": 0.017, + "step": 2708 + }, + { + "epoch": 0.7724550898203593, + "grad_norm": 1.5924118733503132, + "learning_rate": 1.2980080629427904e-06, + "loss": 0.0261, + "step": 2709 + }, + { + "epoch": 0.7727402338180781, + "grad_norm": 0.8829070105724145, + "learning_rate": 1.2949051455716378e-06, + "loss": 0.0107, + "step": 2710 + }, + { + "epoch": 0.7730253778157969, + "grad_norm": 0.4216528848394497, + "learning_rate": 1.2918053896465826e-06, + "loss": 0.012, + "step": 2711 + }, + { + "epoch": 0.7733105218135158, + "grad_norm": 0.7050612966316768, + "learning_rate": 1.2887087978125535e-06, + "loss": 0.0093, + "step": 2712 + }, + { + "epoch": 0.7735956658112346, + "grad_norm": 1.1605581061863133, + "learning_rate": 1.2856153727117792e-06, + "loss": 0.0212, + "step": 2713 + }, + { + "epoch": 0.7738808098089536, + "grad_norm": 1.3212374854178839, + "learning_rate": 1.2825251169837865e-06, + "loss": 0.0224, + "step": 2714 + }, + { + "epoch": 0.7741659538066724, + "grad_norm": 1.1885252640350221, + "learning_rate": 1.2794380332653976e-06, + "loss": 0.0208, + "step": 2715 + }, + { + "epoch": 0.7744510978043913, + "grad_norm": 1.7760519011776683, + "learning_rate": 1.2763541241907268e-06, + "loss": 0.0569, + "step": 2716 + }, + { + "epoch": 0.7747362418021101, + "grad_norm": 0.9620142840295433, + "learning_rate": 1.2732733923911854e-06, + "loss": 0.0161, + "step": 2717 + }, + { + "epoch": 0.7750213857998289, + "grad_norm": 1.1933906064161974, + "learning_rate": 1.270195840495465e-06, + "loss": 0.0181, + "step": 2718 + }, + { + "epoch": 0.7753065297975478, + "grad_norm": 1.0984270689396793, + "learning_rate": 1.2671214711295492e-06, + "loss": 0.0144, + "step": 2719 + }, + { + "epoch": 0.7755916737952666, + "grad_norm": 0.7603521204159235, + "learning_rate": 1.2640502869167043e-06, + "loss": 0.014, + "step": 2720 + }, + { + "epoch": 0.7758768177929855, + "grad_norm": 0.42403568104580863, + "learning_rate": 1.260982290477482e-06, + "loss": 0.0083, + "step": 2721 + }, + { + "epoch": 0.7761619617907043, + "grad_norm": 0.865345796891974, + "learning_rate": 1.2579174844297114e-06, + "loss": 0.0112, + "step": 2722 + }, + { + "epoch": 0.7764471057884231, + "grad_norm": 1.3649714875018568, + "learning_rate": 1.2548558713884963e-06, + "loss": 0.0109, + "step": 2723 + }, + { + "epoch": 0.776732249786142, + "grad_norm": 0.8805998540651545, + "learning_rate": 1.2517974539662198e-06, + "loss": 0.0181, + "step": 2724 + }, + { + "epoch": 0.7770173937838608, + "grad_norm": 1.133487480597252, + "learning_rate": 1.2487422347725397e-06, + "loss": 0.0439, + "step": 2725 + }, + { + "epoch": 0.7773025377815796, + "grad_norm": 1.077854176177537, + "learning_rate": 1.2456902164143813e-06, + "loss": 0.0107, + "step": 2726 + }, + { + "epoch": 0.7775876817792986, + "grad_norm": 1.4419228139613494, + "learning_rate": 1.2426414014959409e-06, + "loss": 0.0252, + "step": 2727 + }, + { + "epoch": 0.7778728257770174, + "grad_norm": 0.40646062715513975, + "learning_rate": 1.2395957926186802e-06, + "loss": 0.0048, + "step": 2728 + }, + { + "epoch": 0.7781579697747363, + "grad_norm": 0.8225502003586711, + "learning_rate": 1.2365533923813256e-06, + "loss": 0.0199, + "step": 2729 + }, + { + "epoch": 0.7784431137724551, + "grad_norm": 1.59562236103222, + "learning_rate": 1.2335142033798658e-06, + "loss": 0.0508, + "step": 2730 + }, + { + "epoch": 0.778728257770174, + "grad_norm": 1.233387689768871, + "learning_rate": 1.23047822820755e-06, + "loss": 0.0287, + "step": 2731 + }, + { + "epoch": 0.7790134017678928, + "grad_norm": 0.5500387337491803, + "learning_rate": 1.227445469454882e-06, + "loss": 0.0065, + "step": 2732 + }, + { + "epoch": 0.7792985457656116, + "grad_norm": 2.0299603071873533, + "learning_rate": 1.2244159297096291e-06, + "loss": 0.0415, + "step": 2733 + }, + { + "epoch": 0.7795836897633305, + "grad_norm": 1.353228704932132, + "learning_rate": 1.2213896115568024e-06, + "loss": 0.0387, + "step": 2734 + }, + { + "epoch": 0.7798688337610493, + "grad_norm": 1.6415096320435094, + "learning_rate": 1.2183665175786686e-06, + "loss": 0.0424, + "step": 2735 + }, + { + "epoch": 0.7801539777587682, + "grad_norm": 2.374020647253202, + "learning_rate": 1.2153466503547417e-06, + "loss": 0.0519, + "step": 2736 + }, + { + "epoch": 0.780439121756487, + "grad_norm": 1.8977748402119863, + "learning_rate": 1.2123300124617876e-06, + "loss": 0.0722, + "step": 2737 + }, + { + "epoch": 0.7807242657542058, + "grad_norm": 0.7295480552079158, + "learning_rate": 1.2093166064738098e-06, + "loss": 0.0135, + "step": 2738 + }, + { + "epoch": 0.7810094097519247, + "grad_norm": 0.8229549593104721, + "learning_rate": 1.2063064349620596e-06, + "loss": 0.0259, + "step": 2739 + }, + { + "epoch": 0.7812945537496435, + "grad_norm": 0.6925997286050343, + "learning_rate": 1.2032995004950216e-06, + "loss": 0.0146, + "step": 2740 + }, + { + "epoch": 0.7815796977473625, + "grad_norm": 0.9014027542875058, + "learning_rate": 1.2002958056384262e-06, + "loss": 0.0234, + "step": 2741 + }, + { + "epoch": 0.7818648417450813, + "grad_norm": 0.706354278112295, + "learning_rate": 1.197295352955235e-06, + "loss": 0.0086, + "step": 2742 + }, + { + "epoch": 0.7821499857428001, + "grad_norm": 0.8645190729336512, + "learning_rate": 1.194298145005644e-06, + "loss": 0.017, + "step": 2743 + }, + { + "epoch": 0.782435129740519, + "grad_norm": 0.37228866967958824, + "learning_rate": 1.1913041843470807e-06, + "loss": 0.0096, + "step": 2744 + }, + { + "epoch": 0.7827202737382378, + "grad_norm": 0.6034253285490447, + "learning_rate": 1.1883134735342023e-06, + "loss": 0.0153, + "step": 2745 + }, + { + "epoch": 0.7830054177359567, + "grad_norm": 1.1035155409154056, + "learning_rate": 1.1853260151188912e-06, + "loss": 0.0318, + "step": 2746 + }, + { + "epoch": 0.7832905617336755, + "grad_norm": 0.7012984171334737, + "learning_rate": 1.1823418116502566e-06, + "loss": 0.0133, + "step": 2747 + }, + { + "epoch": 0.7835757057313943, + "grad_norm": 2.2145145784676674, + "learning_rate": 1.1793608656746286e-06, + "loss": 0.0276, + "step": 2748 + }, + { + "epoch": 0.7838608497291132, + "grad_norm": 1.4050671677912367, + "learning_rate": 1.1763831797355612e-06, + "loss": 0.0201, + "step": 2749 + }, + { + "epoch": 0.784145993726832, + "grad_norm": 0.8314849154599873, + "learning_rate": 1.1734087563738245e-06, + "loss": 0.0146, + "step": 2750 + }, + { + "epoch": 0.7844311377245509, + "grad_norm": 1.6532863006133396, + "learning_rate": 1.1704375981274019e-06, + "loss": 0.0234, + "step": 2751 + }, + { + "epoch": 0.7847162817222697, + "grad_norm": 0.6670573448259365, + "learning_rate": 1.1674697075314928e-06, + "loss": 0.0135, + "step": 2752 + }, + { + "epoch": 0.7850014257199885, + "grad_norm": 0.6240914012825134, + "learning_rate": 1.1645050871185121e-06, + "loss": 0.0116, + "step": 2753 + }, + { + "epoch": 0.7852865697177075, + "grad_norm": 0.9870483425644136, + "learning_rate": 1.1615437394180806e-06, + "loss": 0.0107, + "step": 2754 + }, + { + "epoch": 0.7855717137154263, + "grad_norm": 1.9210648543710227, + "learning_rate": 1.1585856669570277e-06, + "loss": 0.0399, + "step": 2755 + }, + { + "epoch": 0.7858568577131452, + "grad_norm": 1.7634186455826524, + "learning_rate": 1.1556308722593845e-06, + "loss": 0.0589, + "step": 2756 + }, + { + "epoch": 0.786142001710864, + "grad_norm": 1.2268952626774527, + "learning_rate": 1.152679357846392e-06, + "loss": 0.0232, + "step": 2757 + }, + { + "epoch": 0.7864271457085829, + "grad_norm": 1.456806678520593, + "learning_rate": 1.1497311262364874e-06, + "loss": 0.0243, + "step": 2758 + }, + { + "epoch": 0.7867122897063017, + "grad_norm": 0.4950354070457815, + "learning_rate": 1.1467861799453084e-06, + "loss": 0.0072, + "step": 2759 + }, + { + "epoch": 0.7869974337040205, + "grad_norm": 0.8080613169821007, + "learning_rate": 1.143844521485688e-06, + "loss": 0.016, + "step": 2760 + }, + { + "epoch": 0.7872825777017394, + "grad_norm": 1.600494453613366, + "learning_rate": 1.1409061533676586e-06, + "loss": 0.0267, + "step": 2761 + }, + { + "epoch": 0.7875677216994582, + "grad_norm": 1.9230161162665296, + "learning_rate": 1.1379710780984376e-06, + "loss": 0.0278, + "step": 2762 + }, + { + "epoch": 0.787852865697177, + "grad_norm": 1.04041200038653, + "learning_rate": 1.1350392981824375e-06, + "loss": 0.0226, + "step": 2763 + }, + { + "epoch": 0.7881380096948959, + "grad_norm": 0.20832185978045045, + "learning_rate": 1.1321108161212574e-06, + "loss": 0.0049, + "step": 2764 + }, + { + "epoch": 0.7884231536926147, + "grad_norm": 0.6598830632065913, + "learning_rate": 1.1291856344136853e-06, + "loss": 0.0072, + "step": 2765 + }, + { + "epoch": 0.7887082976903336, + "grad_norm": 1.6941306324384335, + "learning_rate": 1.1262637555556905e-06, + "loss": 0.0444, + "step": 2766 + }, + { + "epoch": 0.7889934416880525, + "grad_norm": 2.3563085514868853, + "learning_rate": 1.1233451820404222e-06, + "loss": 0.0641, + "step": 2767 + }, + { + "epoch": 0.7892785856857714, + "grad_norm": 0.5794506056340978, + "learning_rate": 1.1204299163582117e-06, + "loss": 0.0099, + "step": 2768 + }, + { + "epoch": 0.7895637296834902, + "grad_norm": 0.5242308434486851, + "learning_rate": 1.1175179609965697e-06, + "loss": 0.0082, + "step": 2769 + }, + { + "epoch": 0.789848873681209, + "grad_norm": 1.1466102878255973, + "learning_rate": 1.1146093184401791e-06, + "loss": 0.0196, + "step": 2770 + }, + { + "epoch": 0.7901340176789279, + "grad_norm": 1.8361430845752722, + "learning_rate": 1.1117039911708966e-06, + "loss": 0.039, + "step": 2771 + }, + { + "epoch": 0.7904191616766467, + "grad_norm": 2.1710130177501252, + "learning_rate": 1.108801981667752e-06, + "loss": 0.0378, + "step": 2772 + }, + { + "epoch": 0.7907043056743656, + "grad_norm": 1.0607481844235185, + "learning_rate": 1.1059032924069419e-06, + "loss": 0.0198, + "step": 2773 + }, + { + "epoch": 0.7909894496720844, + "grad_norm": 1.1534272328002837, + "learning_rate": 1.1030079258618303e-06, + "loss": 0.0181, + "step": 2774 + }, + { + "epoch": 0.7912745936698032, + "grad_norm": 1.4451323963964195, + "learning_rate": 1.1001158845029475e-06, + "loss": 0.028, + "step": 2775 + }, + { + "epoch": 0.7915597376675221, + "grad_norm": 1.0170979316582653, + "learning_rate": 1.0972271707979837e-06, + "loss": 0.0151, + "step": 2776 + }, + { + "epoch": 0.7918448816652409, + "grad_norm": 0.4474136512465243, + "learning_rate": 1.0943417872117956e-06, + "loss": 0.0113, + "step": 2777 + }, + { + "epoch": 0.7921300256629598, + "grad_norm": 1.0050859141496182, + "learning_rate": 1.09145973620639e-06, + "loss": 0.0114, + "step": 2778 + }, + { + "epoch": 0.7924151696606786, + "grad_norm": 1.7556272164199616, + "learning_rate": 1.0885810202409358e-06, + "loss": 0.0284, + "step": 2779 + }, + { + "epoch": 0.7927003136583974, + "grad_norm": 0.8890931457178195, + "learning_rate": 1.0857056417717538e-06, + "loss": 0.0171, + "step": 2780 + }, + { + "epoch": 0.7929854576561164, + "grad_norm": 0.9710326204173798, + "learning_rate": 1.0828336032523206e-06, + "loss": 0.0303, + "step": 2781 + }, + { + "epoch": 0.7932706016538352, + "grad_norm": 1.23379631524315, + "learning_rate": 1.0799649071332585e-06, + "loss": 0.0203, + "step": 2782 + }, + { + "epoch": 0.7935557456515541, + "grad_norm": 1.0048302038142245, + "learning_rate": 1.077099555862342e-06, + "loss": 0.0184, + "step": 2783 + }, + { + "epoch": 0.7938408896492729, + "grad_norm": 0.5781905231147105, + "learning_rate": 1.0742375518844845e-06, + "loss": 0.0065, + "step": 2784 + }, + { + "epoch": 0.7941260336469917, + "grad_norm": 0.6193101566716163, + "learning_rate": 1.0713788976417522e-06, + "loss": 0.0142, + "step": 2785 + }, + { + "epoch": 0.7944111776447106, + "grad_norm": 0.4952614177724453, + "learning_rate": 1.068523595573348e-06, + "loss": 0.0058, + "step": 2786 + }, + { + "epoch": 0.7946963216424294, + "grad_norm": 1.2992089082042755, + "learning_rate": 1.0656716481156144e-06, + "loss": 0.0278, + "step": 2787 + }, + { + "epoch": 0.7949814656401483, + "grad_norm": 1.0260825841582102, + "learning_rate": 1.0628230577020327e-06, + "loss": 0.0125, + "step": 2788 + }, + { + "epoch": 0.7952666096378671, + "grad_norm": 1.650422933470823, + "learning_rate": 1.05997782676322e-06, + "loss": 0.0284, + "step": 2789 + }, + { + "epoch": 0.795551753635586, + "grad_norm": 1.6922243882463914, + "learning_rate": 1.0571359577269263e-06, + "loss": 0.034, + "step": 2790 + }, + { + "epoch": 0.7958368976333048, + "grad_norm": 2.064635080746134, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.0331, + "step": 2791 + }, + { + "epoch": 0.7961220416310236, + "grad_norm": 0.6979973531032883, + "learning_rate": 1.0514623150585484e-06, + "loss": 0.015, + "step": 2792 + }, + { + "epoch": 0.7964071856287425, + "grad_norm": 1.9320077547472618, + "learning_rate": 1.048630546267615e-06, + "loss": 0.048, + "step": 2793 + }, + { + "epoch": 0.7966923296264614, + "grad_norm": 0.9851340654285308, + "learning_rate": 1.045802149061495e-06, + "loss": 0.0095, + "step": 2794 + }, + { + "epoch": 0.7969774736241803, + "grad_norm": 0.33088819901168887, + "learning_rate": 1.0429771258535726e-06, + "loss": 0.007, + "step": 2795 + }, + { + "epoch": 0.7972626176218991, + "grad_norm": 1.4719706011002405, + "learning_rate": 1.0401554790543545e-06, + "loss": 0.0243, + "step": 2796 + }, + { + "epoch": 0.7975477616196179, + "grad_norm": 2.20284051649092, + "learning_rate": 1.0373372110714697e-06, + "loss": 0.0381, + "step": 2797 + }, + { + "epoch": 0.7978329056173368, + "grad_norm": 0.9042831305808354, + "learning_rate": 1.0345223243096614e-06, + "loss": 0.0133, + "step": 2798 + }, + { + "epoch": 0.7981180496150556, + "grad_norm": 0.4357457490944433, + "learning_rate": 1.0317108211707883e-06, + "loss": 0.0059, + "step": 2799 + }, + { + "epoch": 0.7984031936127745, + "grad_norm": 1.4454010907254553, + "learning_rate": 1.0289027040538174e-06, + "loss": 0.0271, + "step": 2800 + }, + { + "epoch": 0.7986883376104933, + "grad_norm": 0.8325294383532024, + "learning_rate": 1.0260979753548356e-06, + "loss": 0.0242, + "step": 2801 + }, + { + "epoch": 0.7989734816082121, + "grad_norm": 0.6799031003016142, + "learning_rate": 1.023296637467031e-06, + "loss": 0.0115, + "step": 2802 + }, + { + "epoch": 0.799258625605931, + "grad_norm": 0.8000424760149701, + "learning_rate": 1.0204986927807026e-06, + "loss": 0.0139, + "step": 2803 + }, + { + "epoch": 0.7995437696036498, + "grad_norm": 1.0437386516730733, + "learning_rate": 1.0177041436832508e-06, + "loss": 0.0234, + "step": 2804 + }, + { + "epoch": 0.7998289136013687, + "grad_norm": 0.9328110888666781, + "learning_rate": 1.0149129925591816e-06, + "loss": 0.0411, + "step": 2805 + }, + { + "epoch": 0.8001140575990875, + "grad_norm": 0.8919536940706596, + "learning_rate": 1.0121252417901e-06, + "loss": 0.0243, + "step": 2806 + }, + { + "epoch": 0.8003992015968064, + "grad_norm": 1.7879385125082785, + "learning_rate": 1.00934089375471e-06, + "loss": 0.0299, + "step": 2807 + }, + { + "epoch": 0.8006843455945253, + "grad_norm": 1.0080187097759719, + "learning_rate": 1.006559950828812e-06, + "loss": 0.0275, + "step": 2808 + }, + { + "epoch": 0.8009694895922441, + "grad_norm": 2.297194259534309, + "learning_rate": 1.0037824153852993e-06, + "loss": 0.0589, + "step": 2809 + }, + { + "epoch": 0.801254633589963, + "grad_norm": 1.3722790217531777, + "learning_rate": 1.0010082897941642e-06, + "loss": 0.0468, + "step": 2810 + }, + { + "epoch": 0.8015397775876818, + "grad_norm": 2.880885415554408, + "learning_rate": 9.982375764224805e-07, + "loss": 0.0544, + "step": 2811 + }, + { + "epoch": 0.8018249215854006, + "grad_norm": 1.3779641066305213, + "learning_rate": 9.954702776344144e-07, + "loss": 0.0498, + "step": 2812 + }, + { + "epoch": 0.8021100655831195, + "grad_norm": 1.578974235716462, + "learning_rate": 9.927063957912214e-07, + "loss": 0.0309, + "step": 2813 + }, + { + "epoch": 0.8023952095808383, + "grad_norm": 0.5250722169025638, + "learning_rate": 9.89945933251238e-07, + "loss": 0.0073, + "step": 2814 + }, + { + "epoch": 0.8026803535785572, + "grad_norm": 1.214875683960437, + "learning_rate": 9.871888923698836e-07, + "loss": 0.0313, + "step": 2815 + }, + { + "epoch": 0.802965497576276, + "grad_norm": 1.899479207342997, + "learning_rate": 9.844352754996578e-07, + "loss": 0.0394, + "step": 2816 + }, + { + "epoch": 0.8032506415739948, + "grad_norm": 1.7956615761315469, + "learning_rate": 9.816850849901404e-07, + "loss": 0.037, + "step": 2817 + }, + { + "epoch": 0.8035357855717137, + "grad_norm": 0.987617818563438, + "learning_rate": 9.78938323187985e-07, + "loss": 0.0249, + "step": 2818 + }, + { + "epoch": 0.8038209295694325, + "grad_norm": 2.2873969718098177, + "learning_rate": 9.761949924369217e-07, + "loss": 0.0508, + "step": 2819 + }, + { + "epoch": 0.8041060735671514, + "grad_norm": 0.6727560710264749, + "learning_rate": 9.734550950777523e-07, + "loss": 0.0135, + "step": 2820 + }, + { + "epoch": 0.8043912175648703, + "grad_norm": 1.1083108655277412, + "learning_rate": 9.707186334483477e-07, + "loss": 0.0301, + "step": 2821 + }, + { + "epoch": 0.8046763615625891, + "grad_norm": 1.4347478292304672, + "learning_rate": 9.679856098836498e-07, + "loss": 0.018, + "step": 2822 + }, + { + "epoch": 0.804961505560308, + "grad_norm": 0.7012755730521804, + "learning_rate": 9.652560267156647e-07, + "loss": 0.0096, + "step": 2823 + }, + { + "epoch": 0.8052466495580268, + "grad_norm": 0.76616377946827, + "learning_rate": 9.625298862734645e-07, + "loss": 0.0136, + "step": 2824 + }, + { + "epoch": 0.8055317935557457, + "grad_norm": 1.510172264857123, + "learning_rate": 9.598071908831802e-07, + "loss": 0.0371, + "step": 2825 + }, + { + "epoch": 0.8058169375534645, + "grad_norm": 0.9247247630615653, + "learning_rate": 9.570879428680097e-07, + "loss": 0.0162, + "step": 2826 + }, + { + "epoch": 0.8061020815511833, + "grad_norm": 1.4471204790194672, + "learning_rate": 9.543721445482051e-07, + "loss": 0.0414, + "step": 2827 + }, + { + "epoch": 0.8063872255489022, + "grad_norm": 1.1261570935477987, + "learning_rate": 9.516597982410719e-07, + "loss": 0.0222, + "step": 2828 + }, + { + "epoch": 0.806672369546621, + "grad_norm": 1.213228339092222, + "learning_rate": 9.489509062609776e-07, + "loss": 0.019, + "step": 2829 + }, + { + "epoch": 0.8069575135443399, + "grad_norm": 1.0568593386045195, + "learning_rate": 9.46245470919337e-07, + "loss": 0.0162, + "step": 2830 + }, + { + "epoch": 0.8072426575420587, + "grad_norm": 1.0598129850467366, + "learning_rate": 9.435434945246164e-07, + "loss": 0.0115, + "step": 2831 + }, + { + "epoch": 0.8075278015397775, + "grad_norm": 1.148087127959207, + "learning_rate": 9.408449793823316e-07, + "loss": 0.0186, + "step": 2832 + }, + { + "epoch": 0.8078129455374964, + "grad_norm": 0.7838943287296171, + "learning_rate": 9.38149927795044e-07, + "loss": 0.0145, + "step": 2833 + }, + { + "epoch": 0.8080980895352153, + "grad_norm": 0.6450557189362891, + "learning_rate": 9.354583420623603e-07, + "loss": 0.0178, + "step": 2834 + }, + { + "epoch": 0.8083832335329342, + "grad_norm": 1.2580295549839888, + "learning_rate": 9.327702244809295e-07, + "loss": 0.0334, + "step": 2835 + }, + { + "epoch": 0.808668377530653, + "grad_norm": 0.5841231963670347, + "learning_rate": 9.300855773444422e-07, + "loss": 0.0171, + "step": 2836 + }, + { + "epoch": 0.8089535215283719, + "grad_norm": 1.0744692555164113, + "learning_rate": 9.274044029436252e-07, + "loss": 0.0231, + "step": 2837 + }, + { + "epoch": 0.8092386655260907, + "grad_norm": 0.5347627958333485, + "learning_rate": 9.247267035662483e-07, + "loss": 0.0106, + "step": 2838 + }, + { + "epoch": 0.8095238095238095, + "grad_norm": 0.8639389678174983, + "learning_rate": 9.220524814971082e-07, + "loss": 0.0098, + "step": 2839 + }, + { + "epoch": 0.8098089535215284, + "grad_norm": 1.4394556398547473, + "learning_rate": 9.193817390180387e-07, + "loss": 0.0158, + "step": 2840 + }, + { + "epoch": 0.8100940975192472, + "grad_norm": 1.7003245750972533, + "learning_rate": 9.167144784079035e-07, + "loss": 0.0287, + "step": 2841 + }, + { + "epoch": 0.810379241516966, + "grad_norm": 1.3577296521728226, + "learning_rate": 9.140507019425981e-07, + "loss": 0.031, + "step": 2842 + }, + { + "epoch": 0.8106643855146849, + "grad_norm": 1.3152852591455102, + "learning_rate": 9.11390411895044e-07, + "loss": 0.0271, + "step": 2843 + }, + { + "epoch": 0.8109495295124037, + "grad_norm": 0.8196188314690228, + "learning_rate": 9.087336105351813e-07, + "loss": 0.0207, + "step": 2844 + }, + { + "epoch": 0.8112346735101226, + "grad_norm": 0.7446020625868897, + "learning_rate": 9.060803001299833e-07, + "loss": 0.023, + "step": 2845 + }, + { + "epoch": 0.8115198175078414, + "grad_norm": 1.6959863019895527, + "learning_rate": 9.034304829434387e-07, + "loss": 0.0425, + "step": 2846 + }, + { + "epoch": 0.8118049615055604, + "grad_norm": 1.5565435765306315, + "learning_rate": 9.007841612365575e-07, + "loss": 0.0324, + "step": 2847 + }, + { + "epoch": 0.8120901055032792, + "grad_norm": 1.4501860535298694, + "learning_rate": 8.981413372673659e-07, + "loss": 0.0276, + "step": 2848 + }, + { + "epoch": 0.812375249500998, + "grad_norm": 0.8662824114962682, + "learning_rate": 8.955020132909059e-07, + "loss": 0.0222, + "step": 2849 + }, + { + "epoch": 0.8126603934987169, + "grad_norm": 0.6872747440077209, + "learning_rate": 8.928661915592335e-07, + "loss": 0.0064, + "step": 2850 + }, + { + "epoch": 0.8129455374964357, + "grad_norm": 0.8899013455152293, + "learning_rate": 8.902338743214167e-07, + "loss": 0.0133, + "step": 2851 + }, + { + "epoch": 0.8132306814941546, + "grad_norm": 0.75181554380744, + "learning_rate": 8.876050638235323e-07, + "loss": 0.0204, + "step": 2852 + }, + { + "epoch": 0.8135158254918734, + "grad_norm": 0.830293688967669, + "learning_rate": 8.849797623086643e-07, + "loss": 0.011, + "step": 2853 + }, + { + "epoch": 0.8138009694895922, + "grad_norm": 1.0573364540507733, + "learning_rate": 8.823579720169068e-07, + "loss": 0.038, + "step": 2854 + }, + { + "epoch": 0.8140861134873111, + "grad_norm": 1.2831202813735518, + "learning_rate": 8.797396951853515e-07, + "loss": 0.0322, + "step": 2855 + }, + { + "epoch": 0.8143712574850299, + "grad_norm": 0.5543201708828293, + "learning_rate": 8.771249340480959e-07, + "loss": 0.0094, + "step": 2856 + }, + { + "epoch": 0.8146564014827488, + "grad_norm": 0.9366333141939963, + "learning_rate": 8.745136908362367e-07, + "loss": 0.0124, + "step": 2857 + }, + { + "epoch": 0.8149415454804676, + "grad_norm": 1.1448966352215733, + "learning_rate": 8.719059677778712e-07, + "loss": 0.0348, + "step": 2858 + }, + { + "epoch": 0.8152266894781864, + "grad_norm": 0.6801824833631003, + "learning_rate": 8.693017670980903e-07, + "loss": 0.0153, + "step": 2859 + }, + { + "epoch": 0.8155118334759053, + "grad_norm": 1.490041583362656, + "learning_rate": 8.667010910189794e-07, + "loss": 0.039, + "step": 2860 + }, + { + "epoch": 0.8157969774736242, + "grad_norm": 1.3198201360592343, + "learning_rate": 8.641039417596181e-07, + "loss": 0.0233, + "step": 2861 + }, + { + "epoch": 0.8160821214713431, + "grad_norm": 0.8793603147206949, + "learning_rate": 8.615103215360754e-07, + "loss": 0.0206, + "step": 2862 + }, + { + "epoch": 0.8163672654690619, + "grad_norm": 0.9461948476174028, + "learning_rate": 8.589202325614094e-07, + "loss": 0.0248, + "step": 2863 + }, + { + "epoch": 0.8166524094667807, + "grad_norm": 1.016498277100784, + "learning_rate": 8.563336770456654e-07, + "loss": 0.012, + "step": 2864 + }, + { + "epoch": 0.8169375534644996, + "grad_norm": 2.102929530837856, + "learning_rate": 8.537506571958736e-07, + "loss": 0.0296, + "step": 2865 + }, + { + "epoch": 0.8172226974622184, + "grad_norm": 1.5361923525365948, + "learning_rate": 8.511711752160467e-07, + "loss": 0.0197, + "step": 2866 + }, + { + "epoch": 0.8175078414599373, + "grad_norm": 1.2400610374990475, + "learning_rate": 8.485952333071801e-07, + "loss": 0.0212, + "step": 2867 + }, + { + "epoch": 0.8177929854576561, + "grad_norm": 0.9973748714157373, + "learning_rate": 8.460228336672466e-07, + "loss": 0.0389, + "step": 2868 + }, + { + "epoch": 0.818078129455375, + "grad_norm": 2.49728872513892, + "learning_rate": 8.434539784911966e-07, + "loss": 0.0497, + "step": 2869 + }, + { + "epoch": 0.8183632734530938, + "grad_norm": 1.3237326648218986, + "learning_rate": 8.408886699709601e-07, + "loss": 0.0351, + "step": 2870 + }, + { + "epoch": 0.8186484174508126, + "grad_norm": 0.8497672015331845, + "learning_rate": 8.383269102954367e-07, + "loss": 0.0264, + "step": 2871 + }, + { + "epoch": 0.8189335614485315, + "grad_norm": 1.065455590155432, + "learning_rate": 8.357687016504972e-07, + "loss": 0.0256, + "step": 2872 + }, + { + "epoch": 0.8192187054462503, + "grad_norm": 1.5358434738505333, + "learning_rate": 8.332140462189841e-07, + "loss": 0.0439, + "step": 2873 + }, + { + "epoch": 0.8195038494439693, + "grad_norm": 1.4294989218591867, + "learning_rate": 8.306629461807109e-07, + "loss": 0.0286, + "step": 2874 + }, + { + "epoch": 0.8197889934416881, + "grad_norm": 1.9130676592818923, + "learning_rate": 8.281154037124523e-07, + "loss": 0.0196, + "step": 2875 + }, + { + "epoch": 0.8200741374394069, + "grad_norm": 1.406613009250302, + "learning_rate": 8.255714209879506e-07, + "loss": 0.0262, + "step": 2876 + }, + { + "epoch": 0.8203592814371258, + "grad_norm": 0.6268363665660219, + "learning_rate": 8.230310001779096e-07, + "loss": 0.0121, + "step": 2877 + }, + { + "epoch": 0.8206444254348446, + "grad_norm": 0.7357882911831951, + "learning_rate": 8.204941434499941e-07, + "loss": 0.0099, + "step": 2878 + }, + { + "epoch": 0.8209295694325635, + "grad_norm": 1.8175431785227907, + "learning_rate": 8.179608529688276e-07, + "loss": 0.0249, + "step": 2879 + }, + { + "epoch": 0.8212147134302823, + "grad_norm": 0.2394768774671226, + "learning_rate": 8.154311308959911e-07, + "loss": 0.0064, + "step": 2880 + }, + { + "epoch": 0.8214998574280011, + "grad_norm": 1.0082037597372302, + "learning_rate": 8.129049793900185e-07, + "loss": 0.0152, + "step": 2881 + }, + { + "epoch": 0.82178500142572, + "grad_norm": 0.9206006167023392, + "learning_rate": 8.103824006064032e-07, + "loss": 0.0267, + "step": 2882 + }, + { + "epoch": 0.8220701454234388, + "grad_norm": 1.184257876582493, + "learning_rate": 8.078633966975818e-07, + "loss": 0.0344, + "step": 2883 + }, + { + "epoch": 0.8223552894211577, + "grad_norm": 1.119928137630302, + "learning_rate": 8.053479698129463e-07, + "loss": 0.021, + "step": 2884 + }, + { + "epoch": 0.8226404334188765, + "grad_norm": 0.7752796412552893, + "learning_rate": 8.028361220988334e-07, + "loss": 0.0217, + "step": 2885 + }, + { + "epoch": 0.8229255774165953, + "grad_norm": 1.0723505832855476, + "learning_rate": 8.003278556985295e-07, + "loss": 0.0404, + "step": 2886 + }, + { + "epoch": 0.8232107214143143, + "grad_norm": 0.370869296241954, + "learning_rate": 7.978231727522634e-07, + "loss": 0.0091, + "step": 2887 + }, + { + "epoch": 0.8234958654120331, + "grad_norm": 1.6501205644493508, + "learning_rate": 7.953220753972029e-07, + "loss": 0.0302, + "step": 2888 + }, + { + "epoch": 0.823781009409752, + "grad_norm": 3.387858235076237, + "learning_rate": 7.928245657674599e-07, + "loss": 0.0449, + "step": 2889 + }, + { + "epoch": 0.8240661534074708, + "grad_norm": 0.7939624399299788, + "learning_rate": 7.903306459940863e-07, + "loss": 0.0149, + "step": 2890 + }, + { + "epoch": 0.8243512974051896, + "grad_norm": 0.8919761379481936, + "learning_rate": 7.87840318205067e-07, + "loss": 0.0121, + "step": 2891 + }, + { + "epoch": 0.8246364414029085, + "grad_norm": 0.8945707327162529, + "learning_rate": 7.853535845253252e-07, + "loss": 0.0167, + "step": 2892 + }, + { + "epoch": 0.8249215854006273, + "grad_norm": 1.4960085975377995, + "learning_rate": 7.828704470767151e-07, + "loss": 0.0239, + "step": 2893 + }, + { + "epoch": 0.8252067293983462, + "grad_norm": 1.7361926971910235, + "learning_rate": 7.803909079780237e-07, + "loss": 0.0196, + "step": 2894 + }, + { + "epoch": 0.825491873396065, + "grad_norm": 2.1069882551766312, + "learning_rate": 7.779149693449666e-07, + "loss": 0.0331, + "step": 2895 + }, + { + "epoch": 0.8257770173937838, + "grad_norm": 1.640025015859863, + "learning_rate": 7.754426332901888e-07, + "loss": 0.0393, + "step": 2896 + }, + { + "epoch": 0.8260621613915027, + "grad_norm": 0.6030323002263968, + "learning_rate": 7.729739019232579e-07, + "loss": 0.0108, + "step": 2897 + }, + { + "epoch": 0.8263473053892215, + "grad_norm": 0.526741493347755, + "learning_rate": 7.705087773506731e-07, + "loss": 0.016, + "step": 2898 + }, + { + "epoch": 0.8266324493869404, + "grad_norm": 1.5742017652828122, + "learning_rate": 7.680472616758467e-07, + "loss": 0.0298, + "step": 2899 + }, + { + "epoch": 0.8269175933846592, + "grad_norm": 0.934331197910886, + "learning_rate": 7.655893569991175e-07, + "loss": 0.0105, + "step": 2900 + }, + { + "epoch": 0.8272027373823782, + "grad_norm": 0.7000170882506863, + "learning_rate": 7.631350654177405e-07, + "loss": 0.0158, + "step": 2901 + }, + { + "epoch": 0.827487881380097, + "grad_norm": 0.9680259614112673, + "learning_rate": 7.606843890258914e-07, + "loss": 0.0239, + "step": 2902 + }, + { + "epoch": 0.8277730253778158, + "grad_norm": 1.494423198330321, + "learning_rate": 7.582373299146578e-07, + "loss": 0.0392, + "step": 2903 + }, + { + "epoch": 0.8280581693755347, + "grad_norm": 1.207768362788276, + "learning_rate": 7.557938901720418e-07, + "loss": 0.0151, + "step": 2904 + }, + { + "epoch": 0.8283433133732535, + "grad_norm": 1.1032689271047176, + "learning_rate": 7.533540718829547e-07, + "loss": 0.0482, + "step": 2905 + }, + { + "epoch": 0.8286284573709723, + "grad_norm": 1.376001495335339, + "learning_rate": 7.509178771292231e-07, + "loss": 0.0188, + "step": 2906 + }, + { + "epoch": 0.8289136013686912, + "grad_norm": 1.0040653742355359, + "learning_rate": 7.484853079895782e-07, + "loss": 0.014, + "step": 2907 + }, + { + "epoch": 0.82919874536641, + "grad_norm": 0.853257302586088, + "learning_rate": 7.460563665396569e-07, + "loss": 0.019, + "step": 2908 + }, + { + "epoch": 0.8294838893641289, + "grad_norm": 0.41199696502392325, + "learning_rate": 7.436310548520037e-07, + "loss": 0.0078, + "step": 2909 + }, + { + "epoch": 0.8297690333618477, + "grad_norm": 1.344739178123292, + "learning_rate": 7.412093749960625e-07, + "loss": 0.0381, + "step": 2910 + }, + { + "epoch": 0.8300541773595665, + "grad_norm": 1.4175396586193671, + "learning_rate": 7.38791329038181e-07, + "loss": 0.0152, + "step": 2911 + }, + { + "epoch": 0.8303393213572854, + "grad_norm": 1.8019194230806457, + "learning_rate": 7.363769190416048e-07, + "loss": 0.0206, + "step": 2912 + }, + { + "epoch": 0.8306244653550042, + "grad_norm": 0.9576926416107516, + "learning_rate": 7.339661470664761e-07, + "loss": 0.0179, + "step": 2913 + }, + { + "epoch": 0.8309096093527232, + "grad_norm": 1.1324523780149793, + "learning_rate": 7.315590151698371e-07, + "loss": 0.0238, + "step": 2914 + }, + { + "epoch": 0.831194753350442, + "grad_norm": 1.4722177708236213, + "learning_rate": 7.291555254056198e-07, + "loss": 0.0186, + "step": 2915 + }, + { + "epoch": 0.8314798973481609, + "grad_norm": 0.9504375580728752, + "learning_rate": 7.267556798246489e-07, + "loss": 0.0433, + "step": 2916 + }, + { + "epoch": 0.8317650413458797, + "grad_norm": 1.3361157091698659, + "learning_rate": 7.243594804746401e-07, + "loss": 0.0271, + "step": 2917 + }, + { + "epoch": 0.8320501853435985, + "grad_norm": 1.5034432617659543, + "learning_rate": 7.219669294002002e-07, + "loss": 0.0161, + "step": 2918 + }, + { + "epoch": 0.8323353293413174, + "grad_norm": 0.8175025379771647, + "learning_rate": 7.195780286428206e-07, + "loss": 0.0126, + "step": 2919 + }, + { + "epoch": 0.8326204733390362, + "grad_norm": 0.8373585191851456, + "learning_rate": 7.17192780240879e-07, + "loss": 0.0164, + "step": 2920 + }, + { + "epoch": 0.832905617336755, + "grad_norm": 0.4003185300249988, + "learning_rate": 7.148111862296331e-07, + "loss": 0.0117, + "step": 2921 + }, + { + "epoch": 0.8331907613344739, + "grad_norm": 1.5025437383078868, + "learning_rate": 7.124332486412289e-07, + "loss": 0.0204, + "step": 2922 + }, + { + "epoch": 0.8334759053321927, + "grad_norm": 1.3047499112241643, + "learning_rate": 7.100589695046883e-07, + "loss": 0.022, + "step": 2923 + }, + { + "epoch": 0.8337610493299116, + "grad_norm": 0.568459371870428, + "learning_rate": 7.076883508459115e-07, + "loss": 0.0078, + "step": 2924 + }, + { + "epoch": 0.8340461933276304, + "grad_norm": 2.1528288915735145, + "learning_rate": 7.053213946876769e-07, + "loss": 0.0592, + "step": 2925 + }, + { + "epoch": 0.8343313373253493, + "grad_norm": 0.7861846035271016, + "learning_rate": 7.029581030496368e-07, + "loss": 0.025, + "step": 2926 + }, + { + "epoch": 0.8346164813230682, + "grad_norm": 2.3660338087684987, + "learning_rate": 7.005984779483166e-07, + "loss": 0.0348, + "step": 2927 + }, + { + "epoch": 0.834901625320787, + "grad_norm": 1.2003912939811823, + "learning_rate": 6.982425213971145e-07, + "loss": 0.0263, + "step": 2928 + }, + { + "epoch": 0.8351867693185059, + "grad_norm": 0.80495647903881, + "learning_rate": 6.958902354062952e-07, + "loss": 0.0153, + "step": 2929 + }, + { + "epoch": 0.8354719133162247, + "grad_norm": 0.9565190013742579, + "learning_rate": 6.93541621982996e-07, + "loss": 0.0163, + "step": 2930 + }, + { + "epoch": 0.8357570573139436, + "grad_norm": 1.5292130072943486, + "learning_rate": 6.911966831312189e-07, + "loss": 0.0256, + "step": 2931 + }, + { + "epoch": 0.8360422013116624, + "grad_norm": 1.3532017392811009, + "learning_rate": 6.888554208518272e-07, + "loss": 0.019, + "step": 2932 + }, + { + "epoch": 0.8363273453093812, + "grad_norm": 0.9094015042109889, + "learning_rate": 6.86517837142549e-07, + "loss": 0.015, + "step": 2933 + }, + { + "epoch": 0.8366124893071001, + "grad_norm": 0.7082302676502451, + "learning_rate": 6.841839339979778e-07, + "loss": 0.0187, + "step": 2934 + }, + { + "epoch": 0.8368976333048189, + "grad_norm": 0.9879617552325342, + "learning_rate": 6.818537134095604e-07, + "loss": 0.0201, + "step": 2935 + }, + { + "epoch": 0.8371827773025378, + "grad_norm": 0.564486638835549, + "learning_rate": 6.795271773656054e-07, + "loss": 0.0208, + "step": 2936 + }, + { + "epoch": 0.8374679213002566, + "grad_norm": 0.2814308779842693, + "learning_rate": 6.772043278512747e-07, + "loss": 0.0068, + "step": 2937 + }, + { + "epoch": 0.8377530652979754, + "grad_norm": 0.7178043054217914, + "learning_rate": 6.748851668485873e-07, + "loss": 0.0098, + "step": 2938 + }, + { + "epoch": 0.8380382092956943, + "grad_norm": 0.6805116484205672, + "learning_rate": 6.725696963364126e-07, + "loss": 0.0081, + "step": 2939 + }, + { + "epoch": 0.8383233532934131, + "grad_norm": 1.0022569621899031, + "learning_rate": 6.702579182904723e-07, + "loss": 0.0264, + "step": 2940 + }, + { + "epoch": 0.8386084972911321, + "grad_norm": 1.904193877543303, + "learning_rate": 6.679498346833374e-07, + "loss": 0.042, + "step": 2941 + }, + { + "epoch": 0.8388936412888509, + "grad_norm": 0.29653976115050246, + "learning_rate": 6.656454474844248e-07, + "loss": 0.0059, + "step": 2942 + }, + { + "epoch": 0.8391787852865698, + "grad_norm": 0.6705392516340656, + "learning_rate": 6.633447586600028e-07, + "loss": 0.0118, + "step": 2943 + }, + { + "epoch": 0.8394639292842886, + "grad_norm": 0.4684828232836064, + "learning_rate": 6.61047770173176e-07, + "loss": 0.0083, + "step": 2944 + }, + { + "epoch": 0.8397490732820074, + "grad_norm": 0.6894065679425363, + "learning_rate": 6.587544839838961e-07, + "loss": 0.0139, + "step": 2945 + }, + { + "epoch": 0.8400342172797263, + "grad_norm": 1.1770716615384926, + "learning_rate": 6.564649020489566e-07, + "loss": 0.0188, + "step": 2946 + }, + { + "epoch": 0.8403193612774451, + "grad_norm": 0.8594451557210869, + "learning_rate": 6.541790263219894e-07, + "loss": 0.0171, + "step": 2947 + }, + { + "epoch": 0.840604505275164, + "grad_norm": 0.9031875486057582, + "learning_rate": 6.518968587534635e-07, + "loss": 0.025, + "step": 2948 + }, + { + "epoch": 0.8408896492728828, + "grad_norm": 1.9432753386854618, + "learning_rate": 6.496184012906814e-07, + "loss": 0.0232, + "step": 2949 + }, + { + "epoch": 0.8411747932706016, + "grad_norm": 0.7020884836813761, + "learning_rate": 6.473436558777846e-07, + "loss": 0.0086, + "step": 2950 + }, + { + "epoch": 0.8414599372683205, + "grad_norm": 0.5466929368926949, + "learning_rate": 6.450726244557448e-07, + "loss": 0.012, + "step": 2951 + }, + { + "epoch": 0.8417450812660393, + "grad_norm": 1.7507269669821028, + "learning_rate": 6.428053089623648e-07, + "loss": 0.0178, + "step": 2952 + }, + { + "epoch": 0.8420302252637581, + "grad_norm": 1.4983504947040904, + "learning_rate": 6.405417113322765e-07, + "loss": 0.0259, + "step": 2953 + }, + { + "epoch": 0.8423153692614771, + "grad_norm": 0.6649231360890827, + "learning_rate": 6.382818334969399e-07, + "loss": 0.0152, + "step": 2954 + }, + { + "epoch": 0.8426005132591959, + "grad_norm": 0.5291504974490259, + "learning_rate": 6.360256773846402e-07, + "loss": 0.0093, + "step": 2955 + }, + { + "epoch": 0.8428856572569148, + "grad_norm": 1.0922105163879545, + "learning_rate": 6.337732449204886e-07, + "loss": 0.0149, + "step": 2956 + }, + { + "epoch": 0.8431708012546336, + "grad_norm": 1.0162775087631601, + "learning_rate": 6.315245380264179e-07, + "loss": 0.0238, + "step": 2957 + }, + { + "epoch": 0.8434559452523525, + "grad_norm": 1.035067751129879, + "learning_rate": 6.292795586211803e-07, + "loss": 0.0184, + "step": 2958 + }, + { + "epoch": 0.8437410892500713, + "grad_norm": 0.4314581076678392, + "learning_rate": 6.27038308620353e-07, + "loss": 0.0069, + "step": 2959 + }, + { + "epoch": 0.8440262332477901, + "grad_norm": 0.9172564461045714, + "learning_rate": 6.24800789936324e-07, + "loss": 0.013, + "step": 2960 + }, + { + "epoch": 0.844311377245509, + "grad_norm": 1.8064356661478254, + "learning_rate": 6.225670044783011e-07, + "loss": 0.0285, + "step": 2961 + }, + { + "epoch": 0.8445965212432278, + "grad_norm": 0.9846608581771469, + "learning_rate": 6.203369541523075e-07, + "loss": 0.0181, + "step": 2962 + }, + { + "epoch": 0.8448816652409467, + "grad_norm": 0.5341057881948537, + "learning_rate": 6.181106408611781e-07, + "loss": 0.0092, + "step": 2963 + }, + { + "epoch": 0.8451668092386655, + "grad_norm": 0.5404376217996557, + "learning_rate": 6.158880665045586e-07, + "loss": 0.0104, + "step": 2964 + }, + { + "epoch": 0.8454519532363843, + "grad_norm": 0.5547122276300936, + "learning_rate": 6.136692329789046e-07, + "loss": 0.0138, + "step": 2965 + }, + { + "epoch": 0.8457370972341032, + "grad_norm": 0.870237694120721, + "learning_rate": 6.114541421774811e-07, + "loss": 0.0194, + "step": 2966 + }, + { + "epoch": 0.8460222412318221, + "grad_norm": 1.0347448612702808, + "learning_rate": 6.092427959903574e-07, + "loss": 0.0283, + "step": 2967 + }, + { + "epoch": 0.846307385229541, + "grad_norm": 1.4250265517501355, + "learning_rate": 6.070351963044091e-07, + "loss": 0.0508, + "step": 2968 + }, + { + "epoch": 0.8465925292272598, + "grad_norm": 0.34595863279716216, + "learning_rate": 6.04831345003315e-07, + "loss": 0.0062, + "step": 2969 + }, + { + "epoch": 0.8468776732249786, + "grad_norm": 1.2934086285223172, + "learning_rate": 6.026312439675553e-07, + "loss": 0.0239, + "step": 2970 + }, + { + "epoch": 0.8471628172226975, + "grad_norm": 1.1738506805733755, + "learning_rate": 6.004348950744094e-07, + "loss": 0.0134, + "step": 2971 + }, + { + "epoch": 0.8474479612204163, + "grad_norm": 0.9407356078115766, + "learning_rate": 5.982423001979559e-07, + "loss": 0.0395, + "step": 2972 + }, + { + "epoch": 0.8477331052181352, + "grad_norm": 1.899663507801717, + "learning_rate": 5.960534612090707e-07, + "loss": 0.029, + "step": 2973 + }, + { + "epoch": 0.848018249215854, + "grad_norm": 1.635086087665949, + "learning_rate": 5.93868379975423e-07, + "loss": 0.0204, + "step": 2974 + }, + { + "epoch": 0.8483033932135728, + "grad_norm": 2.7131462813485427, + "learning_rate": 5.916870583614792e-07, + "loss": 0.0511, + "step": 2975 + }, + { + "epoch": 0.8485885372112917, + "grad_norm": 0.6000044831511491, + "learning_rate": 5.895094982284949e-07, + "loss": 0.0151, + "step": 2976 + }, + { + "epoch": 0.8488736812090105, + "grad_norm": 0.8428158736884612, + "learning_rate": 5.873357014345143e-07, + "loss": 0.0275, + "step": 2977 + }, + { + "epoch": 0.8491588252067294, + "grad_norm": 1.1200799726027786, + "learning_rate": 5.851656698343761e-07, + "loss": 0.0311, + "step": 2978 + }, + { + "epoch": 0.8494439692044482, + "grad_norm": 0.7513879350557346, + "learning_rate": 5.829994052797011e-07, + "loss": 0.0189, + "step": 2979 + }, + { + "epoch": 0.849729113202167, + "grad_norm": 0.5097265708096947, + "learning_rate": 5.808369096188981e-07, + "loss": 0.0101, + "step": 2980 + }, + { + "epoch": 0.850014257199886, + "grad_norm": 1.5460511983229916, + "learning_rate": 5.786781846971601e-07, + "loss": 0.0304, + "step": 2981 + }, + { + "epoch": 0.8502994011976048, + "grad_norm": 1.677115356095477, + "learning_rate": 5.765232323564617e-07, + "loss": 0.0635, + "step": 2982 + }, + { + "epoch": 0.8505845451953237, + "grad_norm": 0.880229931432272, + "learning_rate": 5.743720544355597e-07, + "loss": 0.0363, + "step": 2983 + }, + { + "epoch": 0.8508696891930425, + "grad_norm": 0.5325984514944866, + "learning_rate": 5.722246527699887e-07, + "loss": 0.0049, + "step": 2984 + }, + { + "epoch": 0.8511548331907614, + "grad_norm": 1.031023684922083, + "learning_rate": 5.700810291920628e-07, + "loss": 0.0192, + "step": 2985 + }, + { + "epoch": 0.8514399771884802, + "grad_norm": 0.8501593340127255, + "learning_rate": 5.679411855308697e-07, + "loss": 0.0173, + "step": 2986 + }, + { + "epoch": 0.851725121186199, + "grad_norm": 1.2342740400840233, + "learning_rate": 5.658051236122774e-07, + "loss": 0.023, + "step": 2987 + }, + { + "epoch": 0.8520102651839179, + "grad_norm": 1.060959958106904, + "learning_rate": 5.636728452589196e-07, + "loss": 0.0154, + "step": 2988 + }, + { + "epoch": 0.8522954091816367, + "grad_norm": 1.2602365805850364, + "learning_rate": 5.615443522902076e-07, + "loss": 0.0231, + "step": 2989 + }, + { + "epoch": 0.8525805531793555, + "grad_norm": 0.764129932845986, + "learning_rate": 5.594196465223184e-07, + "loss": 0.0094, + "step": 2990 + }, + { + "epoch": 0.8528656971770744, + "grad_norm": 1.5099273606359904, + "learning_rate": 5.57298729768202e-07, + "loss": 0.0173, + "step": 2991 + }, + { + "epoch": 0.8531508411747932, + "grad_norm": 0.9820827550126608, + "learning_rate": 5.551816038375729e-07, + "loss": 0.0134, + "step": 2992 + }, + { + "epoch": 0.8534359851725121, + "grad_norm": 1.1727880595649633, + "learning_rate": 5.530682705369084e-07, + "loss": 0.0258, + "step": 2993 + }, + { + "epoch": 0.853721129170231, + "grad_norm": 1.0065318180810632, + "learning_rate": 5.509587316694537e-07, + "loss": 0.0207, + "step": 2994 + }, + { + "epoch": 0.8540062731679499, + "grad_norm": 0.7848127018539507, + "learning_rate": 5.488529890352157e-07, + "loss": 0.0083, + "step": 2995 + }, + { + "epoch": 0.8542914171656687, + "grad_norm": 1.0156083213820275, + "learning_rate": 5.467510444309609e-07, + "loss": 0.0119, + "step": 2996 + }, + { + "epoch": 0.8545765611633875, + "grad_norm": 1.0197812302503872, + "learning_rate": 5.446528996502149e-07, + "loss": 0.0128, + "step": 2997 + }, + { + "epoch": 0.8548617051611064, + "grad_norm": 0.8285221051674888, + "learning_rate": 5.425585564832625e-07, + "loss": 0.0093, + "step": 2998 + }, + { + "epoch": 0.8551468491588252, + "grad_norm": 0.8350102395268476, + "learning_rate": 5.404680167171427e-07, + "loss": 0.0209, + "step": 2999 + }, + { + "epoch": 0.8554319931565441, + "grad_norm": 0.8296267711722471, + "learning_rate": 5.38381282135651e-07, + "loss": 0.0103, + "step": 3000 + }, + { + "epoch": 0.8557171371542629, + "grad_norm": 1.1354258345442707, + "learning_rate": 5.362983545193351e-07, + "loss": 0.0132, + "step": 3001 + }, + { + "epoch": 0.8560022811519817, + "grad_norm": 1.3807885834714135, + "learning_rate": 5.34219235645494e-07, + "loss": 0.0184, + "step": 3002 + }, + { + "epoch": 0.8562874251497006, + "grad_norm": 0.4618296825629643, + "learning_rate": 5.321439272881795e-07, + "loss": 0.0101, + "step": 3003 + }, + { + "epoch": 0.8565725691474194, + "grad_norm": 0.7977559295975786, + "learning_rate": 5.300724312181876e-07, + "loss": 0.0149, + "step": 3004 + }, + { + "epoch": 0.8568577131451383, + "grad_norm": 0.5816427329433224, + "learning_rate": 5.280047492030638e-07, + "loss": 0.0177, + "step": 3005 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.47258730273410254, + "learning_rate": 5.259408830070989e-07, + "loss": 0.0071, + "step": 3006 + }, + { + "epoch": 0.857428001140576, + "grad_norm": 0.7070970759261074, + "learning_rate": 5.238808343913299e-07, + "loss": 0.0247, + "step": 3007 + }, + { + "epoch": 0.8577131451382949, + "grad_norm": 0.6869352056980774, + "learning_rate": 5.218246051135323e-07, + "loss": 0.0137, + "step": 3008 + }, + { + "epoch": 0.8579982891360137, + "grad_norm": 1.0038883944832988, + "learning_rate": 5.197721969282271e-07, + "loss": 0.0257, + "step": 3009 + }, + { + "epoch": 0.8582834331337326, + "grad_norm": 1.2068882204622162, + "learning_rate": 5.177236115866685e-07, + "loss": 0.0209, + "step": 3010 + }, + { + "epoch": 0.8585685771314514, + "grad_norm": 0.9311944031546174, + "learning_rate": 5.156788508368565e-07, + "loss": 0.0231, + "step": 3011 + }, + { + "epoch": 0.8588537211291702, + "grad_norm": 1.5397233224377365, + "learning_rate": 5.136379164235222e-07, + "loss": 0.0241, + "step": 3012 + }, + { + "epoch": 0.8591388651268891, + "grad_norm": 1.6349055154960528, + "learning_rate": 5.116008100881348e-07, + "loss": 0.0272, + "step": 3013 + }, + { + "epoch": 0.8594240091246079, + "grad_norm": 1.4003109679024432, + "learning_rate": 5.095675335688949e-07, + "loss": 0.0391, + "step": 3014 + }, + { + "epoch": 0.8597091531223268, + "grad_norm": 0.6121106764703516, + "learning_rate": 5.075380886007369e-07, + "loss": 0.0158, + "step": 3015 + }, + { + "epoch": 0.8599942971200456, + "grad_norm": 1.343535205043131, + "learning_rate": 5.055124769153247e-07, + "loss": 0.0401, + "step": 3016 + }, + { + "epoch": 0.8602794411177644, + "grad_norm": 0.3419700286934388, + "learning_rate": 5.034907002410517e-07, + "loss": 0.0048, + "step": 3017 + }, + { + "epoch": 0.8605645851154833, + "grad_norm": 0.9895620726753768, + "learning_rate": 5.014727603030389e-07, + "loss": 0.0221, + "step": 3018 + }, + { + "epoch": 0.8608497291132021, + "grad_norm": 1.3614488033863745, + "learning_rate": 4.994586588231354e-07, + "loss": 0.0183, + "step": 3019 + }, + { + "epoch": 0.861134873110921, + "grad_norm": 1.7974814733359572, + "learning_rate": 4.974483975199135e-07, + "loss": 0.053, + "step": 3020 + }, + { + "epoch": 0.8614200171086399, + "grad_norm": 0.6222206952249488, + "learning_rate": 4.954419781086672e-07, + "loss": 0.0126, + "step": 3021 + }, + { + "epoch": 0.8617051611063588, + "grad_norm": 0.5890050784706143, + "learning_rate": 4.934394023014133e-07, + "loss": 0.0097, + "step": 3022 + }, + { + "epoch": 0.8619903051040776, + "grad_norm": 0.2498934201504258, + "learning_rate": 4.914406718068914e-07, + "loss": 0.0037, + "step": 3023 + }, + { + "epoch": 0.8622754491017964, + "grad_norm": 1.370346135075879, + "learning_rate": 4.894457883305576e-07, + "loss": 0.0189, + "step": 3024 + }, + { + "epoch": 0.8625605930995153, + "grad_norm": 0.35639843396867693, + "learning_rate": 4.874547535745872e-07, + "loss": 0.0055, + "step": 3025 + }, + { + "epoch": 0.8628457370972341, + "grad_norm": 1.6416261884833374, + "learning_rate": 4.854675692378669e-07, + "loss": 0.0296, + "step": 3026 + }, + { + "epoch": 0.863130881094953, + "grad_norm": 1.1136701347202882, + "learning_rate": 4.834842370160048e-07, + "loss": 0.0164, + "step": 3027 + }, + { + "epoch": 0.8634160250926718, + "grad_norm": 1.368275106996081, + "learning_rate": 4.815047586013172e-07, + "loss": 0.0336, + "step": 3028 + }, + { + "epoch": 0.8637011690903906, + "grad_norm": 1.2575862109268776, + "learning_rate": 4.795291356828335e-07, + "loss": 0.0228, + "step": 3029 + }, + { + "epoch": 0.8639863130881095, + "grad_norm": 0.6138345409799086, + "learning_rate": 4.775573699462926e-07, + "loss": 0.0098, + "step": 3030 + }, + { + "epoch": 0.8642714570858283, + "grad_norm": 2.4032353134017397, + "learning_rate": 4.7558946307414645e-07, + "loss": 0.0509, + "step": 3031 + }, + { + "epoch": 0.8645566010835471, + "grad_norm": 1.3870212191516793, + "learning_rate": 4.736254167455473e-07, + "loss": 0.0358, + "step": 3032 + }, + { + "epoch": 0.864841745081266, + "grad_norm": 1.4321688335485183, + "learning_rate": 4.7166523263635743e-07, + "loss": 0.0353, + "step": 3033 + }, + { + "epoch": 0.8651268890789849, + "grad_norm": 0.452148177107619, + "learning_rate": 4.697089124191434e-07, + "loss": 0.0071, + "step": 3034 + }, + { + "epoch": 0.8654120330767038, + "grad_norm": 2.022511254722525, + "learning_rate": 4.677564577631749e-07, + "loss": 0.0269, + "step": 3035 + }, + { + "epoch": 0.8656971770744226, + "grad_norm": 1.1264957765805845, + "learning_rate": 4.6580787033442376e-07, + "loss": 0.0252, + "step": 3036 + }, + { + "epoch": 0.8659823210721415, + "grad_norm": 0.9989388112218847, + "learning_rate": 4.6386315179555883e-07, + "loss": 0.0241, + "step": 3037 + }, + { + "epoch": 0.8662674650698603, + "grad_norm": 1.9978868244980388, + "learning_rate": 4.6192230380595004e-07, + "loss": 0.0216, + "step": 3038 + }, + { + "epoch": 0.8665526090675791, + "grad_norm": 0.4630351383686262, + "learning_rate": 4.599853280216665e-07, + "loss": 0.0103, + "step": 3039 + }, + { + "epoch": 0.866837753065298, + "grad_norm": 0.7910478687505361, + "learning_rate": 4.580522260954706e-07, + "loss": 0.0133, + "step": 3040 + }, + { + "epoch": 0.8671228970630168, + "grad_norm": 0.905830864917694, + "learning_rate": 4.561229996768196e-07, + "loss": 0.0294, + "step": 3041 + }, + { + "epoch": 0.8674080410607357, + "grad_norm": 0.8213970821241439, + "learning_rate": 4.5419765041186556e-07, + "loss": 0.0219, + "step": 3042 + }, + { + "epoch": 0.8676931850584545, + "grad_norm": 1.1748421184539621, + "learning_rate": 4.5227617994345053e-07, + "loss": 0.0163, + "step": 3043 + }, + { + "epoch": 0.8679783290561733, + "grad_norm": 0.43672133470671437, + "learning_rate": 4.503585899111068e-07, + "loss": 0.007, + "step": 3044 + }, + { + "epoch": 0.8682634730538922, + "grad_norm": 2.0611265738885183, + "learning_rate": 4.4844488195105784e-07, + "loss": 0.048, + "step": 3045 + }, + { + "epoch": 0.868548617051611, + "grad_norm": 1.3462808320153008, + "learning_rate": 4.4653505769621073e-07, + "loss": 0.0322, + "step": 3046 + }, + { + "epoch": 0.86883376104933, + "grad_norm": 1.1019872968537088, + "learning_rate": 4.446291187761648e-07, + "loss": 0.0257, + "step": 3047 + }, + { + "epoch": 0.8691189050470488, + "grad_norm": 0.4768401486598245, + "learning_rate": 4.4272706681719737e-07, + "loss": 0.0138, + "step": 3048 + }, + { + "epoch": 0.8694040490447676, + "grad_norm": 1.2680969131683553, + "learning_rate": 4.408289034422736e-07, + "loss": 0.035, + "step": 3049 + }, + { + "epoch": 0.8696891930424865, + "grad_norm": 1.0552008198715335, + "learning_rate": 4.3893463027103735e-07, + "loss": 0.0191, + "step": 3050 + }, + { + "epoch": 0.8699743370402053, + "grad_norm": 0.7865993113772903, + "learning_rate": 4.370442489198179e-07, + "loss": 0.017, + "step": 3051 + }, + { + "epoch": 0.8702594810379242, + "grad_norm": 0.7490127620354204, + "learning_rate": 4.351577610016189e-07, + "loss": 0.0132, + "step": 3052 + }, + { + "epoch": 0.870544625035643, + "grad_norm": 0.5028891048262714, + "learning_rate": 4.3327516812612545e-07, + "loss": 0.0068, + "step": 3053 + }, + { + "epoch": 0.8708297690333618, + "grad_norm": 0.7876344772007207, + "learning_rate": 4.313964718996949e-07, + "loss": 0.0195, + "step": 3054 + }, + { + "epoch": 0.8711149130310807, + "grad_norm": 0.46302658918296347, + "learning_rate": 4.2952167392536436e-07, + "loss": 0.0081, + "step": 3055 + }, + { + "epoch": 0.8714000570287995, + "grad_norm": 0.5289145118332526, + "learning_rate": 4.2765077580284197e-07, + "loss": 0.0065, + "step": 3056 + }, + { + "epoch": 0.8716852010265184, + "grad_norm": 1.5234422334108315, + "learning_rate": 4.257837791285091e-07, + "loss": 0.016, + "step": 3057 + }, + { + "epoch": 0.8719703450242372, + "grad_norm": 1.6244487998081059, + "learning_rate": 4.2392068549541755e-07, + "loss": 0.0346, + "step": 3058 + }, + { + "epoch": 0.872255489021956, + "grad_norm": 1.141957719991174, + "learning_rate": 4.22061496493289e-07, + "loss": 0.0178, + "step": 3059 + }, + { + "epoch": 0.8725406330196749, + "grad_norm": 1.6278771393473568, + "learning_rate": 4.202062137085139e-07, + "loss": 0.0276, + "step": 3060 + }, + { + "epoch": 0.8728257770173938, + "grad_norm": 0.514051654602654, + "learning_rate": 4.183548387241498e-07, + "loss": 0.0086, + "step": 3061 + }, + { + "epoch": 0.8731109210151127, + "grad_norm": 0.8895694324937317, + "learning_rate": 4.1650737311991765e-07, + "loss": 0.0163, + "step": 3062 + }, + { + "epoch": 0.8733960650128315, + "grad_norm": 1.7832475192370916, + "learning_rate": 4.146638184722057e-07, + "loss": 0.0443, + "step": 3063 + }, + { + "epoch": 0.8736812090105504, + "grad_norm": 1.1145459448952413, + "learning_rate": 4.1282417635406525e-07, + "loss": 0.0241, + "step": 3064 + }, + { + "epoch": 0.8739663530082692, + "grad_norm": 1.3530250524727738, + "learning_rate": 4.109884483352045e-07, + "loss": 0.0204, + "step": 3065 + }, + { + "epoch": 0.874251497005988, + "grad_norm": 1.0947124737811844, + "learning_rate": 4.091566359819954e-07, + "loss": 0.0418, + "step": 3066 + }, + { + "epoch": 0.8745366410037069, + "grad_norm": 0.9920153082074292, + "learning_rate": 4.0732874085747053e-07, + "loss": 0.0183, + "step": 3067 + }, + { + "epoch": 0.8748217850014257, + "grad_norm": 1.065157293408351, + "learning_rate": 4.055047645213167e-07, + "loss": 0.0162, + "step": 3068 + }, + { + "epoch": 0.8751069289991446, + "grad_norm": 0.8965691490104757, + "learning_rate": 4.0368470852987893e-07, + "loss": 0.024, + "step": 3069 + }, + { + "epoch": 0.8753920729968634, + "grad_norm": 2.346622705630544, + "learning_rate": 4.018685744361539e-07, + "loss": 0.0497, + "step": 3070 + }, + { + "epoch": 0.8756772169945822, + "grad_norm": 0.9532499289760871, + "learning_rate": 4.000563637897964e-07, + "loss": 0.0121, + "step": 3071 + }, + { + "epoch": 0.8759623609923011, + "grad_norm": 0.4104488966130429, + "learning_rate": 3.982480781371106e-07, + "loss": 0.0082, + "step": 3072 + }, + { + "epoch": 0.8762475049900199, + "grad_norm": 1.5437624866770545, + "learning_rate": 3.9644371902105296e-07, + "loss": 0.0588, + "step": 3073 + }, + { + "epoch": 0.8765326489877389, + "grad_norm": 1.0598825987060185, + "learning_rate": 3.9464328798122843e-07, + "loss": 0.011, + "step": 3074 + }, + { + "epoch": 0.8768177929854577, + "grad_norm": 0.7745631171488191, + "learning_rate": 3.928467865538904e-07, + "loss": 0.0277, + "step": 3075 + }, + { + "epoch": 0.8771029369831765, + "grad_norm": 1.1461659365443477, + "learning_rate": 3.910542162719394e-07, + "loss": 0.0379, + "step": 3076 + }, + { + "epoch": 0.8773880809808954, + "grad_norm": 1.0689248460984624, + "learning_rate": 3.8926557866492297e-07, + "loss": 0.0189, + "step": 3077 + }, + { + "epoch": 0.8776732249786142, + "grad_norm": 1.7797425568134084, + "learning_rate": 3.874808752590298e-07, + "loss": 0.0512, + "step": 3078 + }, + { + "epoch": 0.8779583689763331, + "grad_norm": 1.313282350888063, + "learning_rate": 3.8570010757709555e-07, + "loss": 0.0285, + "step": 3079 + }, + { + "epoch": 0.8782435129740519, + "grad_norm": 1.3537466884000247, + "learning_rate": 3.8392327713859644e-07, + "loss": 0.0192, + "step": 3080 + }, + { + "epoch": 0.8785286569717707, + "grad_norm": 1.1739115495562644, + "learning_rate": 3.821503854596459e-07, + "loss": 0.0154, + "step": 3081 + }, + { + "epoch": 0.8788138009694896, + "grad_norm": 0.6978895204370438, + "learning_rate": 3.803814340529999e-07, + "loss": 0.0205, + "step": 3082 + }, + { + "epoch": 0.8790989449672084, + "grad_norm": 2.472278902571292, + "learning_rate": 3.786164244280532e-07, + "loss": 0.0816, + "step": 3083 + }, + { + "epoch": 0.8793840889649273, + "grad_norm": 0.42124346849130617, + "learning_rate": 3.7685535809083406e-07, + "loss": 0.0061, + "step": 3084 + }, + { + "epoch": 0.8796692329626461, + "grad_norm": 0.641144962594694, + "learning_rate": 3.7509823654400757e-07, + "loss": 0.0065, + "step": 3085 + }, + { + "epoch": 0.8799543769603649, + "grad_norm": 0.9745992756194799, + "learning_rate": 3.7334506128687277e-07, + "loss": 0.0127, + "step": 3086 + }, + { + "epoch": 0.8802395209580839, + "grad_norm": 1.0045168845028045, + "learning_rate": 3.715958338153619e-07, + "loss": 0.0247, + "step": 3087 + }, + { + "epoch": 0.8805246649558027, + "grad_norm": 0.49384156816707714, + "learning_rate": 3.698505556220372e-07, + "loss": 0.0134, + "step": 3088 + }, + { + "epoch": 0.8808098089535216, + "grad_norm": 1.153521953839027, + "learning_rate": 3.681092281960935e-07, + "loss": 0.0098, + "step": 3089 + }, + { + "epoch": 0.8810949529512404, + "grad_norm": 1.7322788165810779, + "learning_rate": 3.6637185302335234e-07, + "loss": 0.035, + "step": 3090 + }, + { + "epoch": 0.8813800969489592, + "grad_norm": 1.129612121303509, + "learning_rate": 3.646384315862633e-07, + "loss": 0.0126, + "step": 3091 + }, + { + "epoch": 0.8816652409466781, + "grad_norm": 0.602950178845579, + "learning_rate": 3.629089653639034e-07, + "loss": 0.0165, + "step": 3092 + }, + { + "epoch": 0.8819503849443969, + "grad_norm": 0.6273478691828218, + "learning_rate": 3.6118345583197434e-07, + "loss": 0.0087, + "step": 3093 + }, + { + "epoch": 0.8822355289421158, + "grad_norm": 0.7988486703644966, + "learning_rate": 3.594619044628017e-07, + "loss": 0.0235, + "step": 3094 + }, + { + "epoch": 0.8825206729398346, + "grad_norm": 1.1404611832652243, + "learning_rate": 3.577443127253316e-07, + "loss": 0.0208, + "step": 3095 + }, + { + "epoch": 0.8828058169375534, + "grad_norm": 0.673777489436252, + "learning_rate": 3.5603068208513616e-07, + "loss": 0.0139, + "step": 3096 + }, + { + "epoch": 0.8830909609352723, + "grad_norm": 1.0996498578604454, + "learning_rate": 3.5432101400440456e-07, + "loss": 0.0188, + "step": 3097 + }, + { + "epoch": 0.8833761049329911, + "grad_norm": 1.7534824016132662, + "learning_rate": 3.526153099419427e-07, + "loss": 0.0332, + "step": 3098 + }, + { + "epoch": 0.88366124893071, + "grad_norm": 1.5098939443616157, + "learning_rate": 3.5091357135317917e-07, + "loss": 0.0269, + "step": 3099 + }, + { + "epoch": 0.8839463929284288, + "grad_norm": 0.8049820887786194, + "learning_rate": 3.492157996901552e-07, + "loss": 0.0082, + "step": 3100 + }, + { + "epoch": 0.8842315369261478, + "grad_norm": 0.9473485933582722, + "learning_rate": 3.4752199640152916e-07, + "loss": 0.0215, + "step": 3101 + }, + { + "epoch": 0.8845166809238666, + "grad_norm": 1.2883061493145709, + "learning_rate": 3.458321629325717e-07, + "loss": 0.0306, + "step": 3102 + }, + { + "epoch": 0.8848018249215854, + "grad_norm": 0.9187716869904959, + "learning_rate": 3.441463007251672e-07, + "loss": 0.0132, + "step": 3103 + }, + { + "epoch": 0.8850869689193043, + "grad_norm": 0.6349246018941566, + "learning_rate": 3.4246441121781105e-07, + "loss": 0.0059, + "step": 3104 + }, + { + "epoch": 0.8853721129170231, + "grad_norm": 1.3681391815563542, + "learning_rate": 3.407864958456092e-07, + "loss": 0.016, + "step": 3105 + }, + { + "epoch": 0.885657256914742, + "grad_norm": 1.3040792173489308, + "learning_rate": 3.391125560402764e-07, + "loss": 0.0424, + "step": 3106 + }, + { + "epoch": 0.8859424009124608, + "grad_norm": 1.2333440427099385, + "learning_rate": 3.3744259323013453e-07, + "loss": 0.0154, + "step": 3107 + }, + { + "epoch": 0.8862275449101796, + "grad_norm": 1.150041429446506, + "learning_rate": 3.357766088401149e-07, + "loss": 0.0355, + "step": 3108 + }, + { + "epoch": 0.8865126889078985, + "grad_norm": 0.5551063042416896, + "learning_rate": 3.3411460429174927e-07, + "loss": 0.0093, + "step": 3109 + }, + { + "epoch": 0.8867978329056173, + "grad_norm": 1.2252348783292581, + "learning_rate": 3.324565810031777e-07, + "loss": 0.0247, + "step": 3110 + }, + { + "epoch": 0.8870829769033362, + "grad_norm": 0.7969957705002801, + "learning_rate": 3.3080254038914014e-07, + "loss": 0.0128, + "step": 3111 + }, + { + "epoch": 0.887368120901055, + "grad_norm": 0.5694148413447736, + "learning_rate": 3.2915248386098163e-07, + "loss": 0.0049, + "step": 3112 + }, + { + "epoch": 0.8876532648987738, + "grad_norm": 0.5807428864056374, + "learning_rate": 3.275064128266453e-07, + "loss": 0.0108, + "step": 3113 + }, + { + "epoch": 0.8879384088964928, + "grad_norm": 0.9409478396226437, + "learning_rate": 3.2586432869067263e-07, + "loss": 0.0291, + "step": 3114 + }, + { + "epoch": 0.8882235528942116, + "grad_norm": 1.9926399501252572, + "learning_rate": 3.242262328542067e-07, + "loss": 0.0305, + "step": 3115 + }, + { + "epoch": 0.8885086968919305, + "grad_norm": 0.9596467871013998, + "learning_rate": 3.225921267149845e-07, + "loss": 0.0175, + "step": 3116 + }, + { + "epoch": 0.8887938408896493, + "grad_norm": 1.307498365238962, + "learning_rate": 3.2096201166734007e-07, + "loss": 0.0352, + "step": 3117 + }, + { + "epoch": 0.8890789848873681, + "grad_norm": 1.5303253426806933, + "learning_rate": 3.193358891022008e-07, + "loss": 0.0146, + "step": 3118 + }, + { + "epoch": 0.889364128885087, + "grad_norm": 0.33325411716229314, + "learning_rate": 3.17713760407089e-07, + "loss": 0.0056, + "step": 3119 + }, + { + "epoch": 0.8896492728828058, + "grad_norm": 1.538780520413921, + "learning_rate": 3.160956269661175e-07, + "loss": 0.0188, + "step": 3120 + }, + { + "epoch": 0.8899344168805247, + "grad_norm": 0.7446784766283835, + "learning_rate": 3.1448149015999187e-07, + "loss": 0.013, + "step": 3121 + }, + { + "epoch": 0.8902195608782435, + "grad_norm": 1.3141671766830856, + "learning_rate": 3.1287135136600643e-07, + "loss": 0.0254, + "step": 3122 + }, + { + "epoch": 0.8905047048759623, + "grad_norm": 1.4567278490282125, + "learning_rate": 3.112652119580428e-07, + "loss": 0.0329, + "step": 3123 + }, + { + "epoch": 0.8907898488736812, + "grad_norm": 1.6857368705721907, + "learning_rate": 3.096630733065742e-07, + "loss": 0.0288, + "step": 3124 + }, + { + "epoch": 0.8910749928714, + "grad_norm": 2.0692748305620747, + "learning_rate": 3.0806493677865534e-07, + "loss": 0.0278, + "step": 3125 + }, + { + "epoch": 0.8913601368691189, + "grad_norm": 0.7567155805940998, + "learning_rate": 3.0647080373792824e-07, + "loss": 0.0153, + "step": 3126 + }, + { + "epoch": 0.8916452808668378, + "grad_norm": 2.5094863114875863, + "learning_rate": 3.048806755446182e-07, + "loss": 0.0432, + "step": 3127 + }, + { + "epoch": 0.8919304248645566, + "grad_norm": 1.2661236090125079, + "learning_rate": 3.032945535555354e-07, + "loss": 0.0616, + "step": 3128 + }, + { + "epoch": 0.8922155688622755, + "grad_norm": 0.377347821334773, + "learning_rate": 3.0171243912406944e-07, + "loss": 0.0064, + "step": 3129 + }, + { + "epoch": 0.8925007128599943, + "grad_norm": 0.7116646851338002, + "learning_rate": 3.0013433360019105e-07, + "loss": 0.0234, + "step": 3130 + }, + { + "epoch": 0.8927858568577132, + "grad_norm": 1.280990787410855, + "learning_rate": 2.9856023833045033e-07, + "loss": 0.0218, + "step": 3131 + }, + { + "epoch": 0.893071000855432, + "grad_norm": 0.7586342697071281, + "learning_rate": 2.969901546579751e-07, + "loss": 0.0211, + "step": 3132 + }, + { + "epoch": 0.8933561448531508, + "grad_norm": 0.8495441832875039, + "learning_rate": 2.9542408392247036e-07, + "loss": 0.0106, + "step": 3133 + }, + { + "epoch": 0.8936412888508697, + "grad_norm": 1.7650449817458465, + "learning_rate": 2.9386202746021773e-07, + "loss": 0.0278, + "step": 3134 + }, + { + "epoch": 0.8939264328485885, + "grad_norm": 0.7986477079444706, + "learning_rate": 2.9230398660407277e-07, + "loss": 0.0136, + "step": 3135 + }, + { + "epoch": 0.8942115768463074, + "grad_norm": 0.8463514580140247, + "learning_rate": 2.9074996268346533e-07, + "loss": 0.0252, + "step": 3136 + }, + { + "epoch": 0.8944967208440262, + "grad_norm": 1.3930496907278234, + "learning_rate": 2.8919995702439696e-07, + "loss": 0.0329, + "step": 3137 + }, + { + "epoch": 0.894781864841745, + "grad_norm": 1.0464784961095484, + "learning_rate": 2.8765397094944083e-07, + "loss": 0.0249, + "step": 3138 + }, + { + "epoch": 0.8950670088394639, + "grad_norm": 1.3983247885259522, + "learning_rate": 2.8611200577774e-07, + "loss": 0.0182, + "step": 3139 + }, + { + "epoch": 0.8953521528371827, + "grad_norm": 1.0011589985184415, + "learning_rate": 2.845740628250082e-07, + "loss": 0.0155, + "step": 3140 + }, + { + "epoch": 0.8956372968349017, + "grad_norm": 0.41127888337629276, + "learning_rate": 2.8304014340352625e-07, + "loss": 0.0091, + "step": 3141 + }, + { + "epoch": 0.8959224408326205, + "grad_norm": 0.7008237115353383, + "learning_rate": 2.815102488221394e-07, + "loss": 0.0092, + "step": 3142 + }, + { + "epoch": 0.8962075848303394, + "grad_norm": 1.6731644282839946, + "learning_rate": 2.7998438038626174e-07, + "loss": 0.0176, + "step": 3143 + }, + { + "epoch": 0.8964927288280582, + "grad_norm": 1.606249319903414, + "learning_rate": 2.7846253939787125e-07, + "loss": 0.0415, + "step": 3144 + }, + { + "epoch": 0.896777872825777, + "grad_norm": 1.300144651734939, + "learning_rate": 2.7694472715550925e-07, + "loss": 0.0182, + "step": 3145 + }, + { + "epoch": 0.8970630168234959, + "grad_norm": 0.8102382687014947, + "learning_rate": 2.7543094495427913e-07, + "loss": 0.0164, + "step": 3146 + }, + { + "epoch": 0.8973481608212147, + "grad_norm": 0.4796368067100617, + "learning_rate": 2.7392119408584493e-07, + "loss": 0.0071, + "step": 3147 + }, + { + "epoch": 0.8976333048189336, + "grad_norm": 0.4708253859579334, + "learning_rate": 2.7241547583843286e-07, + "loss": 0.0096, + "step": 3148 + }, + { + "epoch": 0.8979184488166524, + "grad_norm": 1.6685451122274635, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.0186, + "step": 3149 + }, + { + "epoch": 0.8982035928143712, + "grad_norm": 1.3143101566649746, + "learning_rate": 2.6941614234236857e-07, + "loss": 0.0192, + "step": 3150 + }, + { + "epoch": 0.8984887368120901, + "grad_norm": 0.6686035692072104, + "learning_rate": 2.679225296529564e-07, + "loss": 0.0135, + "step": 3151 + }, + { + "epoch": 0.8987738808098089, + "grad_norm": 0.8841163748576049, + "learning_rate": 2.664329547030475e-07, + "loss": 0.0315, + "step": 3152 + }, + { + "epoch": 0.8990590248075278, + "grad_norm": 0.9004178197360575, + "learning_rate": 2.649474187636492e-07, + "loss": 0.0154, + "step": 3153 + }, + { + "epoch": 0.8993441688052467, + "grad_norm": 2.2456953621383815, + "learning_rate": 2.634659231023251e-07, + "loss": 0.0293, + "step": 3154 + }, + { + "epoch": 0.8996293128029655, + "grad_norm": 0.6666588261244653, + "learning_rate": 2.619884689831909e-07, + "loss": 0.0109, + "step": 3155 + }, + { + "epoch": 0.8999144568006844, + "grad_norm": 0.8776064374516797, + "learning_rate": 2.6051505766691464e-07, + "loss": 0.0144, + "step": 3156 + }, + { + "epoch": 0.9001996007984032, + "grad_norm": 1.1268349745247128, + "learning_rate": 2.5904569041071417e-07, + "loss": 0.0245, + "step": 3157 + }, + { + "epoch": 0.9004847447961221, + "grad_norm": 0.4970618468994337, + "learning_rate": 2.5758036846835476e-07, + "loss": 0.0081, + "step": 3158 + }, + { + "epoch": 0.9007698887938409, + "grad_norm": 1.0343525333492938, + "learning_rate": 2.561190930901519e-07, + "loss": 0.0217, + "step": 3159 + }, + { + "epoch": 0.9010550327915597, + "grad_norm": 1.1597997219850231, + "learning_rate": 2.5466186552296933e-07, + "loss": 0.029, + "step": 3160 + }, + { + "epoch": 0.9013401767892786, + "grad_norm": 0.5518918587988524, + "learning_rate": 2.532086870102146e-07, + "loss": 0.0107, + "step": 3161 + }, + { + "epoch": 0.9016253207869974, + "grad_norm": 1.555015490765001, + "learning_rate": 2.5175955879184146e-07, + "loss": 0.0201, + "step": 3162 + }, + { + "epoch": 0.9019104647847163, + "grad_norm": 0.6795609328872417, + "learning_rate": 2.503144821043474e-07, + "loss": 0.0142, + "step": 3163 + }, + { + "epoch": 0.9021956087824351, + "grad_norm": 0.6884525201103593, + "learning_rate": 2.488734581807728e-07, + "loss": 0.0185, + "step": 3164 + }, + { + "epoch": 0.9024807527801539, + "grad_norm": 0.5375639386409782, + "learning_rate": 2.474364882507002e-07, + "loss": 0.0059, + "step": 3165 + }, + { + "epoch": 0.9027658967778728, + "grad_norm": 0.5080847121580953, + "learning_rate": 2.4600357354025275e-07, + "loss": 0.0072, + "step": 3166 + }, + { + "epoch": 0.9030510407755917, + "grad_norm": 0.806467203465794, + "learning_rate": 2.4457471527209343e-07, + "loss": 0.0246, + "step": 3167 + }, + { + "epoch": 0.9033361847733106, + "grad_norm": 1.700259776983858, + "learning_rate": 2.431499146654243e-07, + "loss": 0.0195, + "step": 3168 + }, + { + "epoch": 0.9036213287710294, + "grad_norm": 1.066316674315371, + "learning_rate": 2.4172917293598607e-07, + "loss": 0.0332, + "step": 3169 + }, + { + "epoch": 0.9039064727687482, + "grad_norm": 1.2640803766957995, + "learning_rate": 2.4031249129605305e-07, + "loss": 0.0148, + "step": 3170 + }, + { + "epoch": 0.9041916167664671, + "grad_norm": 2.356961541868472, + "learning_rate": 2.3889987095443657e-07, + "loss": 0.0305, + "step": 3171 + }, + { + "epoch": 0.9044767607641859, + "grad_norm": 0.8215136443424736, + "learning_rate": 2.3749131311648576e-07, + "loss": 0.0111, + "step": 3172 + }, + { + "epoch": 0.9047619047619048, + "grad_norm": 1.21999981493346, + "learning_rate": 2.360868189840787e-07, + "loss": 0.0162, + "step": 3173 + }, + { + "epoch": 0.9050470487596236, + "grad_norm": 1.955946628179618, + "learning_rate": 2.346863897556295e-07, + "loss": 0.033, + "step": 3174 + }, + { + "epoch": 0.9053321927573424, + "grad_norm": 1.3460408883251782, + "learning_rate": 2.3329002662608068e-07, + "loss": 0.0461, + "step": 3175 + }, + { + "epoch": 0.9056173367550613, + "grad_norm": 1.1009867392611814, + "learning_rate": 2.3189773078690748e-07, + "loss": 0.0119, + "step": 3176 + }, + { + "epoch": 0.9059024807527801, + "grad_norm": 0.6976449459445861, + "learning_rate": 2.305095034261151e-07, + "loss": 0.0104, + "step": 3177 + }, + { + "epoch": 0.906187624750499, + "grad_norm": 1.050109166261638, + "learning_rate": 2.2912534572823498e-07, + "loss": 0.0226, + "step": 3178 + }, + { + "epoch": 0.9064727687482178, + "grad_norm": 0.6801788595908712, + "learning_rate": 2.2774525887432786e-07, + "loss": 0.0185, + "step": 3179 + }, + { + "epoch": 0.9067579127459366, + "grad_norm": 0.6869437524292872, + "learning_rate": 2.2636924404198014e-07, + "loss": 0.0104, + "step": 3180 + }, + { + "epoch": 0.9070430567436556, + "grad_norm": 1.4278818202476977, + "learning_rate": 2.2499730240530426e-07, + "loss": 0.0239, + "step": 3181 + }, + { + "epoch": 0.9073282007413744, + "grad_norm": 0.6426812316033088, + "learning_rate": 2.2362943513493662e-07, + "loss": 0.0164, + "step": 3182 + }, + { + "epoch": 0.9076133447390933, + "grad_norm": 0.7907561871363531, + "learning_rate": 2.2226564339803636e-07, + "loss": 0.0096, + "step": 3183 + }, + { + "epoch": 0.9078984887368121, + "grad_norm": 0.9117928880337626, + "learning_rate": 2.2090592835828817e-07, + "loss": 0.017, + "step": 3184 + }, + { + "epoch": 0.908183632734531, + "grad_norm": 0.9775102371730449, + "learning_rate": 2.1955029117589454e-07, + "loss": 0.0145, + "step": 3185 + }, + { + "epoch": 0.9084687767322498, + "grad_norm": 0.6325452457433002, + "learning_rate": 2.1819873300758022e-07, + "loss": 0.0088, + "step": 3186 + }, + { + "epoch": 0.9087539207299686, + "grad_norm": 0.8958197615264045, + "learning_rate": 2.168512550065882e-07, + "loss": 0.0312, + "step": 3187 + }, + { + "epoch": 0.9090390647276875, + "grad_norm": 0.6725945959693137, + "learning_rate": 2.1550785832268217e-07, + "loss": 0.0147, + "step": 3188 + }, + { + "epoch": 0.9093242087254063, + "grad_norm": 0.7200310122010458, + "learning_rate": 2.1416854410214183e-07, + "loss": 0.009, + "step": 3189 + }, + { + "epoch": 0.9096093527231252, + "grad_norm": 1.3177205015746256, + "learning_rate": 2.1283331348776414e-07, + "loss": 0.0207, + "step": 3190 + }, + { + "epoch": 0.909894496720844, + "grad_norm": 1.6289098120134233, + "learning_rate": 2.115021676188611e-07, + "loss": 0.0245, + "step": 3191 + }, + { + "epoch": 0.9101796407185628, + "grad_norm": 1.9024020760670197, + "learning_rate": 2.101751076312586e-07, + "loss": 0.0367, + "step": 3192 + }, + { + "epoch": 0.9104647847162817, + "grad_norm": 1.1389262118067653, + "learning_rate": 2.0885213465729802e-07, + "loss": 0.0238, + "step": 3193 + }, + { + "epoch": 0.9107499287140006, + "grad_norm": 2.0058186055618106, + "learning_rate": 2.0753324982583202e-07, + "loss": 0.0419, + "step": 3194 + }, + { + "epoch": 0.9110350727117195, + "grad_norm": 1.2169233475560073, + "learning_rate": 2.0621845426222587e-07, + "loss": 0.0326, + "step": 3195 + }, + { + "epoch": 0.9113202167094383, + "grad_norm": 0.3754420977930853, + "learning_rate": 2.0490774908835442e-07, + "loss": 0.0057, + "step": 3196 + }, + { + "epoch": 0.9116053607071571, + "grad_norm": 2.673609902784827, + "learning_rate": 2.0360113542260307e-07, + "loss": 0.0349, + "step": 3197 + }, + { + "epoch": 0.911890504704876, + "grad_norm": 1.0286839819383613, + "learning_rate": 2.0229861437986665e-07, + "loss": 0.0309, + "step": 3198 + }, + { + "epoch": 0.9121756487025948, + "grad_norm": 1.100305443067917, + "learning_rate": 2.0100018707154612e-07, + "loss": 0.0127, + "step": 3199 + }, + { + "epoch": 0.9124607927003137, + "grad_norm": 0.9752425027943912, + "learning_rate": 1.9970585460555193e-07, + "loss": 0.0114, + "step": 3200 + }, + { + "epoch": 0.9127459366980325, + "grad_norm": 1.5804332245975838, + "learning_rate": 1.984156180862984e-07, + "loss": 0.036, + "step": 3201 + }, + { + "epoch": 0.9130310806957513, + "grad_norm": 1.2405391166369548, + "learning_rate": 1.97129478614706e-07, + "loss": 0.0198, + "step": 3202 + }, + { + "epoch": 0.9133162246934702, + "grad_norm": 1.4988667995021159, + "learning_rate": 1.958474372881969e-07, + "loss": 0.0165, + "step": 3203 + }, + { + "epoch": 0.913601368691189, + "grad_norm": 2.6758434050413924, + "learning_rate": 1.94569495200701e-07, + "loss": 0.0473, + "step": 3204 + }, + { + "epoch": 0.9138865126889079, + "grad_norm": 0.37648975473173746, + "learning_rate": 1.9329565344264666e-07, + "loss": 0.009, + "step": 3205 + }, + { + "epoch": 0.9141716566866267, + "grad_norm": 1.4270330071786936, + "learning_rate": 1.9202591310096495e-07, + "loss": 0.0318, + "step": 3206 + }, + { + "epoch": 0.9144568006843456, + "grad_norm": 1.1284641851442696, + "learning_rate": 1.9076027525908704e-07, + "loss": 0.0169, + "step": 3207 + }, + { + "epoch": 0.9147419446820645, + "grad_norm": 0.22940010951660872, + "learning_rate": 1.8949874099694344e-07, + "loss": 0.0054, + "step": 3208 + }, + { + "epoch": 0.9150270886797833, + "grad_norm": 0.472117646501281, + "learning_rate": 1.8824131139096424e-07, + "loss": 0.013, + "step": 3209 + }, + { + "epoch": 0.9153122326775022, + "grad_norm": 1.0145391952231864, + "learning_rate": 1.8698798751407566e-07, + "loss": 0.0163, + "step": 3210 + }, + { + "epoch": 0.915597376675221, + "grad_norm": 1.6566652751095565, + "learning_rate": 1.8573877043570166e-07, + "loss": 0.0293, + "step": 3211 + }, + { + "epoch": 0.9158825206729398, + "grad_norm": 0.8730480588302432, + "learning_rate": 1.8449366122176072e-07, + "loss": 0.0211, + "step": 3212 + }, + { + "epoch": 0.9161676646706587, + "grad_norm": 0.762722802208298, + "learning_rate": 1.8325266093466908e-07, + "loss": 0.0167, + "step": 3213 + }, + { + "epoch": 0.9164528086683775, + "grad_norm": 1.8519175215823354, + "learning_rate": 1.820157706333331e-07, + "loss": 0.042, + "step": 3214 + }, + { + "epoch": 0.9167379526660964, + "grad_norm": 0.6178345429669876, + "learning_rate": 1.8078299137315457e-07, + "loss": 0.0115, + "step": 3215 + }, + { + "epoch": 0.9170230966638152, + "grad_norm": 1.8389032255626088, + "learning_rate": 1.7955432420602714e-07, + "loss": 0.0449, + "step": 3216 + }, + { + "epoch": 0.917308240661534, + "grad_norm": 0.8976456881308423, + "learning_rate": 1.7832977018033604e-07, + "loss": 0.0146, + "step": 3217 + }, + { + "epoch": 0.9175933846592529, + "grad_norm": 1.1641398023584142, + "learning_rate": 1.7710933034095658e-07, + "loss": 0.0167, + "step": 3218 + }, + { + "epoch": 0.9178785286569717, + "grad_norm": 0.34100280170072866, + "learning_rate": 1.7589300572925184e-07, + "loss": 0.0044, + "step": 3219 + }, + { + "epoch": 0.9181636726546906, + "grad_norm": 1.7683542198735762, + "learning_rate": 1.7468079738307608e-07, + "loss": 0.0254, + "step": 3220 + }, + { + "epoch": 0.9184488166524095, + "grad_norm": 0.9779660998455367, + "learning_rate": 1.7347270633677082e-07, + "loss": 0.0247, + "step": 3221 + }, + { + "epoch": 0.9187339606501284, + "grad_norm": 0.7604628137905527, + "learning_rate": 1.722687336211626e-07, + "loss": 0.016, + "step": 3222 + }, + { + "epoch": 0.9190191046478472, + "grad_norm": 1.9616843025886015, + "learning_rate": 1.7106888026356626e-07, + "loss": 0.0218, + "step": 3223 + }, + { + "epoch": 0.919304248645566, + "grad_norm": 1.4389747508794408, + "learning_rate": 1.6987314728778014e-07, + "loss": 0.1148, + "step": 3224 + }, + { + "epoch": 0.9195893926432849, + "grad_norm": 1.3027980674511903, + "learning_rate": 1.6868153571408695e-07, + "loss": 0.0248, + "step": 3225 + }, + { + "epoch": 0.9198745366410037, + "grad_norm": 0.4343153104128488, + "learning_rate": 1.6749404655925338e-07, + "loss": 0.009, + "step": 3226 + }, + { + "epoch": 0.9201596806387226, + "grad_norm": 0.47044929809354835, + "learning_rate": 1.663106808365289e-07, + "loss": 0.0077, + "step": 3227 + }, + { + "epoch": 0.9204448246364414, + "grad_norm": 1.7245884546851775, + "learning_rate": 1.6513143955564192e-07, + "loss": 0.0386, + "step": 3228 + }, + { + "epoch": 0.9207299686341602, + "grad_norm": 1.2986893334701157, + "learning_rate": 1.6395632372280646e-07, + "loss": 0.0219, + "step": 3229 + }, + { + "epoch": 0.9210151126318791, + "grad_norm": 1.1072209359887417, + "learning_rate": 1.62785334340711e-07, + "loss": 0.0162, + "step": 3230 + }, + { + "epoch": 0.9213002566295979, + "grad_norm": 1.116053157390405, + "learning_rate": 1.6161847240852624e-07, + "loss": 0.0133, + "step": 3231 + }, + { + "epoch": 0.9215854006273168, + "grad_norm": 0.6591215243411573, + "learning_rate": 1.6045573892190136e-07, + "loss": 0.0161, + "step": 3232 + }, + { + "epoch": 0.9218705446250356, + "grad_norm": 0.44109595747664676, + "learning_rate": 1.5929713487296162e-07, + "loss": 0.0055, + "step": 3233 + }, + { + "epoch": 0.9221556886227545, + "grad_norm": 1.3398968208505428, + "learning_rate": 1.581426612503084e-07, + "loss": 0.0225, + "step": 3234 + }, + { + "epoch": 0.9224408326204734, + "grad_norm": 1.4579154403674046, + "learning_rate": 1.5699231903901934e-07, + "loss": 0.0216, + "step": 3235 + }, + { + "epoch": 0.9227259766181922, + "grad_norm": 2.4188092429370704, + "learning_rate": 1.5584610922064759e-07, + "loss": 0.0523, + "step": 3236 + }, + { + "epoch": 0.9230111206159111, + "grad_norm": 0.5884136082750657, + "learning_rate": 1.547040327732191e-07, + "loss": 0.0091, + "step": 3237 + }, + { + "epoch": 0.9232962646136299, + "grad_norm": 1.108590752242791, + "learning_rate": 1.535660906712333e-07, + "loss": 0.009, + "step": 3238 + }, + { + "epoch": 0.9235814086113487, + "grad_norm": 1.3748359488240862, + "learning_rate": 1.5243228388566233e-07, + "loss": 0.0145, + "step": 3239 + }, + { + "epoch": 0.9238665526090676, + "grad_norm": 0.20338331633106474, + "learning_rate": 1.5130261338394904e-07, + "loss": 0.004, + "step": 3240 + }, + { + "epoch": 0.9241516966067864, + "grad_norm": 0.6798693978374936, + "learning_rate": 1.5017708013000787e-07, + "loss": 0.02, + "step": 3241 + }, + { + "epoch": 0.9244368406045053, + "grad_norm": 1.2945381953335924, + "learning_rate": 1.4905568508422173e-07, + "loss": 0.0221, + "step": 3242 + }, + { + "epoch": 0.9247219846022241, + "grad_norm": 1.1088855335116197, + "learning_rate": 1.4793842920344358e-07, + "loss": 0.0173, + "step": 3243 + }, + { + "epoch": 0.9250071285999429, + "grad_norm": 0.6902543763473945, + "learning_rate": 1.468253134409947e-07, + "loss": 0.0078, + "step": 3244 + }, + { + "epoch": 0.9252922725976618, + "grad_norm": 0.36426685758935945, + "learning_rate": 1.4571633874666313e-07, + "loss": 0.0068, + "step": 3245 + }, + { + "epoch": 0.9255774165953806, + "grad_norm": 1.089534630630725, + "learning_rate": 1.4461150606670414e-07, + "loss": 0.0283, + "step": 3246 + }, + { + "epoch": 0.9258625605930996, + "grad_norm": 1.1642004537200386, + "learning_rate": 1.4351081634383647e-07, + "loss": 0.0257, + "step": 3247 + }, + { + "epoch": 0.9261477045908184, + "grad_norm": 0.7868438826662598, + "learning_rate": 1.4241427051724765e-07, + "loss": 0.0103, + "step": 3248 + }, + { + "epoch": 0.9264328485885372, + "grad_norm": 0.9770444382275083, + "learning_rate": 1.4132186952258653e-07, + "loss": 0.0153, + "step": 3249 + }, + { + "epoch": 0.9267179925862561, + "grad_norm": 1.785958725620067, + "learning_rate": 1.402336142919658e-07, + "loss": 0.0264, + "step": 3250 + }, + { + "epoch": 0.9270031365839749, + "grad_norm": 1.2964838487983503, + "learning_rate": 1.3914950575396102e-07, + "loss": 0.0267, + "step": 3251 + }, + { + "epoch": 0.9272882805816938, + "grad_norm": 1.6603947739477958, + "learning_rate": 1.3806954483361002e-07, + "loss": 0.0208, + "step": 3252 + }, + { + "epoch": 0.9275734245794126, + "grad_norm": 1.9307212298456977, + "learning_rate": 1.369937324524101e-07, + "loss": 0.0458, + "step": 3253 + }, + { + "epoch": 0.9278585685771314, + "grad_norm": 3.4936463480149653, + "learning_rate": 1.3592206952832031e-07, + "loss": 0.0591, + "step": 3254 + }, + { + "epoch": 0.9281437125748503, + "grad_norm": 1.3343590595906798, + "learning_rate": 1.3485455697575755e-07, + "loss": 0.0246, + "step": 3255 + }, + { + "epoch": 0.9284288565725691, + "grad_norm": 1.4183852372661856, + "learning_rate": 1.3379119570559872e-07, + "loss": 0.0362, + "step": 3256 + }, + { + "epoch": 0.928714000570288, + "grad_norm": 1.540850659243285, + "learning_rate": 1.3273198662517917e-07, + "loss": 0.0269, + "step": 3257 + }, + { + "epoch": 0.9289991445680068, + "grad_norm": 1.5560860076422636, + "learning_rate": 1.3167693063828867e-07, + "loss": 0.0163, + "step": 3258 + }, + { + "epoch": 0.9292842885657256, + "grad_norm": 1.484976547707544, + "learning_rate": 1.3062602864517548e-07, + "loss": 0.0472, + "step": 3259 + }, + { + "epoch": 0.9295694325634445, + "grad_norm": 0.6381737168749684, + "learning_rate": 1.2957928154254174e-07, + "loss": 0.0107, + "step": 3260 + }, + { + "epoch": 0.9298545765611634, + "grad_norm": 1.1036012557283326, + "learning_rate": 1.285366902235463e-07, + "loss": 0.0258, + "step": 3261 + }, + { + "epoch": 0.9301397205588823, + "grad_norm": 1.9037929843957349, + "learning_rate": 1.2749825557780148e-07, + "loss": 0.0647, + "step": 3262 + }, + { + "epoch": 0.9304248645566011, + "grad_norm": 1.1279142271655538, + "learning_rate": 1.264639784913707e-07, + "loss": 0.0206, + "step": 3263 + }, + { + "epoch": 0.93071000855432, + "grad_norm": 1.3538375710084851, + "learning_rate": 1.2543385984677249e-07, + "loss": 0.0294, + "step": 3264 + }, + { + "epoch": 0.9309951525520388, + "grad_norm": 1.5857087537071777, + "learning_rate": 1.2440790052297648e-07, + "loss": 0.0262, + "step": 3265 + }, + { + "epoch": 0.9312802965497576, + "grad_norm": 0.8358164956623412, + "learning_rate": 1.233861013954024e-07, + "loss": 0.0128, + "step": 3266 + }, + { + "epoch": 0.9315654405474765, + "grad_norm": 1.0889195632514523, + "learning_rate": 1.2236846333592068e-07, + "loss": 0.0161, + "step": 3267 + }, + { + "epoch": 0.9318505845451953, + "grad_norm": 1.3316230123061, + "learning_rate": 1.2135498721285167e-07, + "loss": 0.0568, + "step": 3268 + }, + { + "epoch": 0.9321357285429142, + "grad_norm": 1.1761990589238678, + "learning_rate": 1.2034567389096364e-07, + "loss": 0.0179, + "step": 3269 + }, + { + "epoch": 0.932420872540633, + "grad_norm": 1.460756188712811, + "learning_rate": 1.193405242314738e-07, + "loss": 0.0242, + "step": 3270 + }, + { + "epoch": 0.9327060165383518, + "grad_norm": 1.4799019491857686, + "learning_rate": 1.1833953909204554e-07, + "loss": 0.0227, + "step": 3271 + }, + { + "epoch": 0.9329911605360707, + "grad_norm": 1.500459177807598, + "learning_rate": 1.1734271932679008e-07, + "loss": 0.0475, + "step": 3272 + }, + { + "epoch": 0.9332763045337895, + "grad_norm": 1.4260297811235727, + "learning_rate": 1.1635006578626374e-07, + "loss": 0.0249, + "step": 3273 + }, + { + "epoch": 0.9335614485315085, + "grad_norm": 2.2936699937794636, + "learning_rate": 1.1536157931746728e-07, + "loss": 0.049, + "step": 3274 + }, + { + "epoch": 0.9338465925292273, + "grad_norm": 1.3381636184621566, + "learning_rate": 1.1437726076384715e-07, + "loss": 0.0273, + "step": 3275 + }, + { + "epoch": 0.9341317365269461, + "grad_norm": 2.3124956924737776, + "learning_rate": 1.1339711096529149e-07, + "loss": 0.0484, + "step": 3276 + }, + { + "epoch": 0.934416880524665, + "grad_norm": 1.3552422519009062, + "learning_rate": 1.1242113075813466e-07, + "loss": 0.0544, + "step": 3277 + }, + { + "epoch": 0.9347020245223838, + "grad_norm": 0.4791576527546003, + "learning_rate": 1.1144932097515048e-07, + "loss": 0.0083, + "step": 3278 + }, + { + "epoch": 0.9349871685201027, + "grad_norm": 1.1383690407288758, + "learning_rate": 1.1048168244555513e-07, + "loss": 0.0321, + "step": 3279 + }, + { + "epoch": 0.9352723125178215, + "grad_norm": 0.5018323683744155, + "learning_rate": 1.0951821599500423e-07, + "loss": 0.0099, + "step": 3280 + }, + { + "epoch": 0.9355574565155403, + "grad_norm": 0.38194627856193947, + "learning_rate": 1.0855892244559573e-07, + "loss": 0.0058, + "step": 3281 + }, + { + "epoch": 0.9358426005132592, + "grad_norm": 1.7992899459503027, + "learning_rate": 1.0760380261586656e-07, + "loss": 0.022, + "step": 3282 + }, + { + "epoch": 0.936127744510978, + "grad_norm": 1.608919056354979, + "learning_rate": 1.0665285732079145e-07, + "loss": 0.0373, + "step": 3283 + }, + { + "epoch": 0.9364128885086969, + "grad_norm": 0.5295751980400589, + "learning_rate": 1.0570608737178245e-07, + "loss": 0.0143, + "step": 3284 + }, + { + "epoch": 0.9366980325064157, + "grad_norm": 1.0884180574425513, + "learning_rate": 1.0476349357669113e-07, + "loss": 0.0176, + "step": 3285 + }, + { + "epoch": 0.9369831765041345, + "grad_norm": 0.8642293290298916, + "learning_rate": 1.0382507673980358e-07, + "loss": 0.0239, + "step": 3286 + }, + { + "epoch": 0.9372683205018535, + "grad_norm": 1.0721029415636114, + "learning_rate": 1.0289083766184371e-07, + "loss": 0.0125, + "step": 3287 + }, + { + "epoch": 0.9375534644995723, + "grad_norm": 1.8058621968187927, + "learning_rate": 1.0196077713996777e-07, + "loss": 0.0268, + "step": 3288 + }, + { + "epoch": 0.9378386084972912, + "grad_norm": 1.6236022683941318, + "learning_rate": 1.0103489596777094e-07, + "loss": 0.0507, + "step": 3289 + }, + { + "epoch": 0.93812375249501, + "grad_norm": 0.3847928622518018, + "learning_rate": 1.0011319493527849e-07, + "loss": 0.0048, + "step": 3290 + }, + { + "epoch": 0.9384088964927289, + "grad_norm": 0.5354601853769945, + "learning_rate": 9.919567482894965e-08, + "loss": 0.0138, + "step": 3291 + }, + { + "epoch": 0.9386940404904477, + "grad_norm": 1.707818158964234, + "learning_rate": 9.828233643167762e-08, + "loss": 0.0157, + "step": 3292 + }, + { + "epoch": 0.9389791844881665, + "grad_norm": 0.8286681216996449, + "learning_rate": 9.737318052278622e-08, + "loss": 0.0227, + "step": 3293 + }, + { + "epoch": 0.9392643284858854, + "grad_norm": 1.638770990611571, + "learning_rate": 9.646820787803102e-08, + "loss": 0.0238, + "step": 3294 + }, + { + "epoch": 0.9395494724836042, + "grad_norm": 1.4844955571277005, + "learning_rate": 9.556741926959878e-08, + "loss": 0.0251, + "step": 3295 + }, + { + "epoch": 0.939834616481323, + "grad_norm": 1.2669550894482697, + "learning_rate": 9.467081546610357e-08, + "loss": 0.0279, + "step": 3296 + }, + { + "epoch": 0.9401197604790419, + "grad_norm": 0.607099774823454, + "learning_rate": 9.377839723259175e-08, + "loss": 0.0121, + "step": 3297 + }, + { + "epoch": 0.9404049044767607, + "grad_norm": 0.8781917053000157, + "learning_rate": 9.289016533053696e-08, + "loss": 0.0104, + "step": 3298 + }, + { + "epoch": 0.9406900484744796, + "grad_norm": 1.3542338084425016, + "learning_rate": 9.200612051784019e-08, + "loss": 0.0167, + "step": 3299 + }, + { + "epoch": 0.9409751924721984, + "grad_norm": 1.1515026257845251, + "learning_rate": 9.112626354883025e-08, + "loss": 0.0166, + "step": 3300 + }, + { + "epoch": 0.9412603364699174, + "grad_norm": 1.2561544765269976, + "learning_rate": 9.025059517426383e-08, + "loss": 0.0202, + "step": 3301 + }, + { + "epoch": 0.9415454804676362, + "grad_norm": 0.8869786585507469, + "learning_rate": 8.937911614132155e-08, + "loss": 0.0113, + "step": 3302 + }, + { + "epoch": 0.941830624465355, + "grad_norm": 1.1028012235771263, + "learning_rate": 8.851182719361029e-08, + "loss": 0.0213, + "step": 3303 + }, + { + "epoch": 0.9421157684630739, + "grad_norm": 0.8016773289830269, + "learning_rate": 8.764872907116084e-08, + "loss": 0.0126, + "step": 3304 + }, + { + "epoch": 0.9424009124607927, + "grad_norm": 2.1285540435739647, + "learning_rate": 8.678982251043078e-08, + "loss": 0.0291, + "step": 3305 + }, + { + "epoch": 0.9426860564585116, + "grad_norm": 0.5550319327426171, + "learning_rate": 8.59351082442983e-08, + "loss": 0.0102, + "step": 3306 + }, + { + "epoch": 0.9429712004562304, + "grad_norm": 0.8827415276408771, + "learning_rate": 8.508458700206501e-08, + "loss": 0.0167, + "step": 3307 + }, + { + "epoch": 0.9432563444539492, + "grad_norm": 1.0858933176242238, + "learning_rate": 8.423825950945541e-08, + "loss": 0.0139, + "step": 3308 + }, + { + "epoch": 0.9435414884516681, + "grad_norm": 0.9312052457091772, + "learning_rate": 8.339612648861573e-08, + "loss": 0.0189, + "step": 3309 + }, + { + "epoch": 0.9438266324493869, + "grad_norm": 0.8767853437349821, + "learning_rate": 8.255818865811226e-08, + "loss": 0.0209, + "step": 3310 + }, + { + "epoch": 0.9441117764471058, + "grad_norm": 1.1912037158829953, + "learning_rate": 8.172444673293201e-08, + "loss": 0.0154, + "step": 3311 + }, + { + "epoch": 0.9443969204448246, + "grad_norm": 1.1446724760051077, + "learning_rate": 8.089490142448254e-08, + "loss": 0.0159, + "step": 3312 + }, + { + "epoch": 0.9446820644425434, + "grad_norm": 1.254371535428258, + "learning_rate": 8.006955344058986e-08, + "loss": 0.0203, + "step": 3313 + }, + { + "epoch": 0.9449672084402624, + "grad_norm": 0.9243205335182174, + "learning_rate": 7.92484034854979e-08, + "loss": 0.0147, + "step": 3314 + }, + { + "epoch": 0.9452523524379812, + "grad_norm": 0.5842500345183507, + "learning_rate": 7.843145225987003e-08, + "loss": 0.0075, + "step": 3315 + }, + { + "epoch": 0.9455374964357001, + "grad_norm": 1.493712093765859, + "learning_rate": 7.761870046078534e-08, + "loss": 0.0231, + "step": 3316 + }, + { + "epoch": 0.9458226404334189, + "grad_norm": 1.162136015617278, + "learning_rate": 7.681014878174187e-08, + "loss": 0.0321, + "step": 3317 + }, + { + "epoch": 0.9461077844311377, + "grad_norm": 0.5474839381848375, + "learning_rate": 7.600579791265161e-08, + "loss": 0.0117, + "step": 3318 + }, + { + "epoch": 0.9463929284288566, + "grad_norm": 1.3755166848139597, + "learning_rate": 7.52056485398428e-08, + "loss": 0.0244, + "step": 3319 + }, + { + "epoch": 0.9466780724265754, + "grad_norm": 1.3943608791399622, + "learning_rate": 7.440970134605819e-08, + "loss": 0.0342, + "step": 3320 + }, + { + "epoch": 0.9469632164242943, + "grad_norm": 0.9763179433503778, + "learning_rate": 7.361795701045726e-08, + "loss": 0.0233, + "step": 3321 + }, + { + "epoch": 0.9472483604220131, + "grad_norm": 0.8089246096238166, + "learning_rate": 7.283041620861131e-08, + "loss": 0.0127, + "step": 3322 + }, + { + "epoch": 0.9475335044197319, + "grad_norm": 0.7679809907075772, + "learning_rate": 7.204707961250446e-08, + "loss": 0.0156, + "step": 3323 + }, + { + "epoch": 0.9478186484174508, + "grad_norm": 0.9473437484277629, + "learning_rate": 7.126794789053426e-08, + "loss": 0.0147, + "step": 3324 + }, + { + "epoch": 0.9481037924151696, + "grad_norm": 1.38358551291656, + "learning_rate": 7.049302170751115e-08, + "loss": 0.0567, + "step": 3325 + }, + { + "epoch": 0.9483889364128885, + "grad_norm": 1.2621054129425233, + "learning_rate": 6.972230172465567e-08, + "loss": 0.0231, + "step": 3326 + }, + { + "epoch": 0.9486740804106074, + "grad_norm": 0.3297509743111443, + "learning_rate": 6.895578859960062e-08, + "loss": 0.0072, + "step": 3327 + }, + { + "epoch": 0.9489592244083263, + "grad_norm": 0.2484790392335108, + "learning_rate": 6.819348298638839e-08, + "loss": 0.0045, + "step": 3328 + }, + { + "epoch": 0.9492443684060451, + "grad_norm": 0.5873609332531889, + "learning_rate": 6.743538553547091e-08, + "loss": 0.0107, + "step": 3329 + }, + { + "epoch": 0.9495295124037639, + "grad_norm": 1.7018257018683238, + "learning_rate": 6.668149689371074e-08, + "loss": 0.0432, + "step": 3330 + }, + { + "epoch": 0.9498146564014828, + "grad_norm": 0.5246171733900967, + "learning_rate": 6.593181770437829e-08, + "loss": 0.0068, + "step": 3331 + }, + { + "epoch": 0.9500998003992016, + "grad_norm": 0.7763822843561252, + "learning_rate": 6.518634860715134e-08, + "loss": 0.0162, + "step": 3332 + }, + { + "epoch": 0.9503849443969205, + "grad_norm": 1.0168579745893171, + "learning_rate": 6.444509023811773e-08, + "loss": 0.0209, + "step": 3333 + }, + { + "epoch": 0.9506700883946393, + "grad_norm": 1.075777400723064, + "learning_rate": 6.370804322977042e-08, + "loss": 0.0189, + "step": 3334 + }, + { + "epoch": 0.9509552323923581, + "grad_norm": 1.5206330880639494, + "learning_rate": 6.297520821100911e-08, + "loss": 0.0462, + "step": 3335 + }, + { + "epoch": 0.951240376390077, + "grad_norm": 1.5897941053837084, + "learning_rate": 6.224658580713971e-08, + "loss": 0.0238, + "step": 3336 + }, + { + "epoch": 0.9515255203877958, + "grad_norm": 2.1685547816279316, + "learning_rate": 6.152217663987437e-08, + "loss": 0.0302, + "step": 3337 + }, + { + "epoch": 0.9518106643855146, + "grad_norm": 0.41072290294514274, + "learning_rate": 6.080198132732917e-08, + "loss": 0.0062, + "step": 3338 + }, + { + "epoch": 0.9520958083832335, + "grad_norm": 1.6595471882531296, + "learning_rate": 6.008600048402647e-08, + "loss": 0.0236, + "step": 3339 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 1.3876995960785814, + "learning_rate": 5.937423472088866e-08, + "loss": 0.0173, + "step": 3340 + }, + { + "epoch": 0.9526660963786713, + "grad_norm": 0.95914211608916, + "learning_rate": 5.866668464524661e-08, + "loss": 0.0146, + "step": 3341 + }, + { + "epoch": 0.9529512403763901, + "grad_norm": 2.367073977432817, + "learning_rate": 5.796335086083016e-08, + "loss": 0.0281, + "step": 3342 + }, + { + "epoch": 0.953236384374109, + "grad_norm": 0.8209213096210055, + "learning_rate": 5.7264233967773696e-08, + "loss": 0.0127, + "step": 3343 + }, + { + "epoch": 0.9535215283718278, + "grad_norm": 1.8947375988470225, + "learning_rate": 5.6569334562611714e-08, + "loss": 0.0205, + "step": 3344 + }, + { + "epoch": 0.9538066723695466, + "grad_norm": 1.043061161634943, + "learning_rate": 5.5878653238281564e-08, + "loss": 0.0209, + "step": 3345 + }, + { + "epoch": 0.9540918163672655, + "grad_norm": 0.8863361955005523, + "learning_rate": 5.519219058412129e-08, + "loss": 0.0098, + "step": 3346 + }, + { + "epoch": 0.9543769603649843, + "grad_norm": 0.5971170239016343, + "learning_rate": 5.4509947185867883e-08, + "loss": 0.0162, + "step": 3347 + }, + { + "epoch": 0.9546621043627032, + "grad_norm": 1.117922239866681, + "learning_rate": 5.3831923625659034e-08, + "loss": 0.0235, + "step": 3348 + }, + { + "epoch": 0.954947248360422, + "grad_norm": 0.4746383532243566, + "learning_rate": 5.315812048203306e-08, + "loss": 0.0081, + "step": 3349 + }, + { + "epoch": 0.9552323923581408, + "grad_norm": 0.7993643810754686, + "learning_rate": 5.2488538329926175e-08, + "loss": 0.0214, + "step": 3350 + }, + { + "epoch": 0.9555175363558597, + "grad_norm": 1.4852695501324207, + "learning_rate": 5.18231777406708e-08, + "loss": 0.032, + "step": 3351 + }, + { + "epoch": 0.9558026803535785, + "grad_norm": 2.15399813077587, + "learning_rate": 5.116203928200003e-08, + "loss": 0.056, + "step": 3352 + }, + { + "epoch": 0.9560878243512974, + "grad_norm": 1.4790577170889294, + "learning_rate": 5.050512351804371e-08, + "loss": 0.035, + "step": 3353 + }, + { + "epoch": 0.9563729683490163, + "grad_norm": 1.5138221982520688, + "learning_rate": 4.9852431009328464e-08, + "loss": 0.0206, + "step": 3354 + }, + { + "epoch": 0.9566581123467351, + "grad_norm": 1.2976796226213914, + "learning_rate": 4.920396231277713e-08, + "loss": 0.0185, + "step": 3355 + }, + { + "epoch": 0.956943256344454, + "grad_norm": 0.8940591579415054, + "learning_rate": 4.855971798170822e-08, + "loss": 0.0161, + "step": 3356 + }, + { + "epoch": 0.9572284003421728, + "grad_norm": 1.3689820258335235, + "learning_rate": 4.7919698565835894e-08, + "loss": 0.0346, + "step": 3357 + }, + { + "epoch": 0.9575135443398917, + "grad_norm": 0.6602976007376902, + "learning_rate": 4.728390461126997e-08, + "loss": 0.0135, + "step": 3358 + }, + { + "epoch": 0.9577986883376105, + "grad_norm": 1.7405971970346032, + "learning_rate": 4.6652336660514275e-08, + "loss": 0.0391, + "step": 3359 + }, + { + "epoch": 0.9580838323353293, + "grad_norm": 0.7948682203377249, + "learning_rate": 4.602499525246606e-08, + "loss": 0.0095, + "step": 3360 + }, + { + "epoch": 0.9583689763330482, + "grad_norm": 2.246065889190937, + "learning_rate": 4.5401880922418264e-08, + "loss": 0.0295, + "step": 3361 + }, + { + "epoch": 0.958654120330767, + "grad_norm": 0.998355740167951, + "learning_rate": 4.478299420205445e-08, + "loss": 0.0134, + "step": 3362 + }, + { + "epoch": 0.9589392643284859, + "grad_norm": 1.4618361724398996, + "learning_rate": 4.416833561945222e-08, + "loss": 0.0361, + "step": 3363 + }, + { + "epoch": 0.9592244083262047, + "grad_norm": 1.3947578420400226, + "learning_rate": 4.355790569908147e-08, + "loss": 0.0316, + "step": 3364 + }, + { + "epoch": 0.9595095523239235, + "grad_norm": 1.092207434346036, + "learning_rate": 4.29517049618039e-08, + "loss": 0.0205, + "step": 3365 + }, + { + "epoch": 0.9597946963216424, + "grad_norm": 1.2842978731112948, + "learning_rate": 4.2349733924872406e-08, + "loss": 0.0261, + "step": 3366 + }, + { + "epoch": 0.9600798403193613, + "grad_norm": 1.9319381855432591, + "learning_rate": 4.1751993101930565e-08, + "loss": 0.0313, + "step": 3367 + }, + { + "epoch": 0.9603649843170802, + "grad_norm": 1.9881438705501773, + "learning_rate": 4.1158483003012614e-08, + "loss": 0.0338, + "step": 3368 + }, + { + "epoch": 0.960650128314799, + "grad_norm": 1.5737829057522759, + "learning_rate": 4.056920413454291e-08, + "loss": 0.0237, + "step": 3369 + }, + { + "epoch": 0.9609352723125179, + "grad_norm": 1.000096415972, + "learning_rate": 3.9984156999335334e-08, + "loss": 0.0145, + "step": 3370 + }, + { + "epoch": 0.9612204163102367, + "grad_norm": 1.35746116703504, + "learning_rate": 3.94033420965928e-08, + "loss": 0.015, + "step": 3371 + }, + { + "epoch": 0.9615055603079555, + "grad_norm": 2.149928406053472, + "learning_rate": 3.882675992190832e-08, + "loss": 0.0354, + "step": 3372 + }, + { + "epoch": 0.9617907043056744, + "grad_norm": 0.7299966913484716, + "learning_rate": 3.825441096726057e-08, + "loss": 0.008, + "step": 3373 + }, + { + "epoch": 0.9620758483033932, + "grad_norm": 0.3263502956977538, + "learning_rate": 3.76862957210189e-08, + "loss": 0.004, + "step": 3374 + }, + { + "epoch": 0.962360992301112, + "grad_norm": 1.2745281137121607, + "learning_rate": 3.712241466793887e-08, + "loss": 0.0148, + "step": 3375 + }, + { + "epoch": 0.9626461362988309, + "grad_norm": 1.0802932953346063, + "learning_rate": 3.6562768289162834e-08, + "loss": 0.0155, + "step": 3376 + }, + { + "epoch": 0.9629312802965497, + "grad_norm": 0.8278153218304807, + "learning_rate": 3.6007357062219914e-08, + "loss": 0.0122, + "step": 3377 + }, + { + "epoch": 0.9632164242942686, + "grad_norm": 0.22430709604137844, + "learning_rate": 3.5456181461026585e-08, + "loss": 0.0042, + "step": 3378 + }, + { + "epoch": 0.9635015682919874, + "grad_norm": 0.8451639389040133, + "learning_rate": 3.4909241955883853e-08, + "loss": 0.014, + "step": 3379 + }, + { + "epoch": 0.9637867122897062, + "grad_norm": 1.3080849693612018, + "learning_rate": 3.4366539013478975e-08, + "loss": 0.024, + "step": 3380 + }, + { + "epoch": 0.9640718562874252, + "grad_norm": 1.4925602998353593, + "learning_rate": 3.3828073096884294e-08, + "loss": 0.0229, + "step": 3381 + }, + { + "epoch": 0.964357000285144, + "grad_norm": 0.9866730078294964, + "learning_rate": 3.329384466555619e-08, + "loss": 0.0332, + "step": 3382 + }, + { + "epoch": 0.9646421442828629, + "grad_norm": 0.53963382342648, + "learning_rate": 3.276385417533612e-08, + "loss": 0.0076, + "step": 3383 + }, + { + "epoch": 0.9649272882805817, + "grad_norm": 2.028169739046422, + "learning_rate": 3.2238102078448466e-08, + "loss": 0.0525, + "step": 3384 + }, + { + "epoch": 0.9652124322783006, + "grad_norm": 0.70221035179003, + "learning_rate": 3.17165888235027e-08, + "loss": 0.0147, + "step": 3385 + }, + { + "epoch": 0.9654975762760194, + "grad_norm": 1.594502446966948, + "learning_rate": 3.1199314855489547e-08, + "loss": 0.0386, + "step": 3386 + }, + { + "epoch": 0.9657827202737382, + "grad_norm": 0.6155279599436443, + "learning_rate": 3.0686280615783156e-08, + "loss": 0.007, + "step": 3387 + }, + { + "epoch": 0.9660678642714571, + "grad_norm": 0.9940196228788715, + "learning_rate": 3.0177486542141144e-08, + "loss": 0.0181, + "step": 3388 + }, + { + "epoch": 0.9663530082691759, + "grad_norm": 0.9662957584867693, + "learning_rate": 2.9672933068701227e-08, + "loss": 0.0191, + "step": 3389 + }, + { + "epoch": 0.9666381522668948, + "grad_norm": 1.3090922190101484, + "learning_rate": 2.9172620625984583e-08, + "loss": 0.0244, + "step": 3390 + }, + { + "epoch": 0.9669232962646136, + "grad_norm": 0.8123264288075666, + "learning_rate": 2.8676549640892502e-08, + "loss": 0.015, + "step": 3391 + }, + { + "epoch": 0.9672084402623324, + "grad_norm": 0.8232507163935322, + "learning_rate": 2.8184720536706956e-08, + "loss": 0.0161, + "step": 3392 + }, + { + "epoch": 0.9674935842600513, + "grad_norm": 0.6151000895965828, + "learning_rate": 2.7697133733091686e-08, + "loss": 0.01, + "step": 3393 + }, + { + "epoch": 0.9677787282577702, + "grad_norm": 1.5924017390252279, + "learning_rate": 2.7213789646088896e-08, + "loss": 0.055, + "step": 3394 + }, + { + "epoch": 0.9680638722554891, + "grad_norm": 1.4169645143655845, + "learning_rate": 2.673468868812312e-08, + "loss": 0.0179, + "step": 3395 + }, + { + "epoch": 0.9683490162532079, + "grad_norm": 0.5948053783440101, + "learning_rate": 2.625983126799514e-08, + "loss": 0.0083, + "step": 3396 + }, + { + "epoch": 0.9686341602509267, + "grad_norm": 1.575445770817084, + "learning_rate": 2.57892177908875e-08, + "loss": 0.0313, + "step": 3397 + }, + { + "epoch": 0.9689193042486456, + "grad_norm": 1.500329123622438, + "learning_rate": 2.53228486583601e-08, + "loss": 0.0241, + "step": 3398 + }, + { + "epoch": 0.9692044482463644, + "grad_norm": 1.311203579000811, + "learning_rate": 2.4860724268351845e-08, + "loss": 0.0378, + "step": 3399 + }, + { + "epoch": 0.9694895922440833, + "grad_norm": 1.7134696799995803, + "learning_rate": 2.4402845015180088e-08, + "loss": 0.0345, + "step": 3400 + }, + { + "epoch": 0.9697747362418021, + "grad_norm": 0.6710349220659272, + "learning_rate": 2.3949211289538975e-08, + "loss": 0.0073, + "step": 3401 + }, + { + "epoch": 0.9700598802395209, + "grad_norm": 1.5504891835178136, + "learning_rate": 2.3499823478499995e-08, + "loss": 0.0237, + "step": 3402 + }, + { + "epoch": 0.9703450242372398, + "grad_norm": 0.7081799782835921, + "learning_rate": 2.305468196551308e-08, + "loss": 0.0148, + "step": 3403 + }, + { + "epoch": 0.9706301682349586, + "grad_norm": 0.9281303546894957, + "learning_rate": 2.2613787130403854e-08, + "loss": 0.0192, + "step": 3404 + }, + { + "epoch": 0.9709153122326775, + "grad_norm": 1.27109390632808, + "learning_rate": 2.217713934937471e-08, + "loss": 0.0453, + "step": 3405 + }, + { + "epoch": 0.9712004562303963, + "grad_norm": 0.6464573130091568, + "learning_rate": 2.1744738995003733e-08, + "loss": 0.0253, + "step": 3406 + }, + { + "epoch": 0.9714856002281153, + "grad_norm": 1.119205251723142, + "learning_rate": 2.131658643624579e-08, + "loss": 0.0356, + "step": 3407 + }, + { + "epoch": 0.9717707442258341, + "grad_norm": 1.4078034602604481, + "learning_rate": 2.0892682038429758e-08, + "loss": 0.0391, + "step": 3408 + }, + { + "epoch": 0.9720558882235529, + "grad_norm": 0.6417882749586009, + "learning_rate": 2.0473026163261302e-08, + "loss": 0.0155, + "step": 3409 + }, + { + "epoch": 0.9723410322212718, + "grad_norm": 0.5777849338634182, + "learning_rate": 2.0057619168819544e-08, + "loss": 0.0132, + "step": 3410 + }, + { + "epoch": 0.9726261762189906, + "grad_norm": 0.7877201908002133, + "learning_rate": 1.964646140955928e-08, + "loss": 0.0282, + "step": 3411 + }, + { + "epoch": 0.9729113202167095, + "grad_norm": 1.1195581875615601, + "learning_rate": 1.923955323630877e-08, + "loss": 0.0124, + "step": 3412 + }, + { + "epoch": 0.9731964642144283, + "grad_norm": 1.1199818626019644, + "learning_rate": 1.883689499627084e-08, + "loss": 0.0156, + "step": 3413 + }, + { + "epoch": 0.9734816082121471, + "grad_norm": 0.600014710134955, + "learning_rate": 1.84384870330212e-08, + "loss": 0.0217, + "step": 3414 + }, + { + "epoch": 0.973766752209866, + "grad_norm": 0.980931450691564, + "learning_rate": 1.8044329686509598e-08, + "loss": 0.0199, + "step": 3415 + }, + { + "epoch": 0.9740518962075848, + "grad_norm": 0.820500426709812, + "learning_rate": 1.7654423293058666e-08, + "loss": 0.0099, + "step": 3416 + }, + { + "epoch": 0.9743370402053037, + "grad_norm": 0.7809793057569531, + "learning_rate": 1.726876818536394e-08, + "loss": 0.0146, + "step": 3417 + }, + { + "epoch": 0.9746221842030225, + "grad_norm": 1.0867941197474569, + "learning_rate": 1.6887364692493303e-08, + "loss": 0.0171, + "step": 3418 + }, + { + "epoch": 0.9749073282007413, + "grad_norm": 1.128048720541707, + "learning_rate": 1.6510213139886987e-08, + "loss": 0.0558, + "step": 3419 + }, + { + "epoch": 0.9751924721984602, + "grad_norm": 1.548358358713432, + "learning_rate": 1.613731384935702e-08, + "loss": 0.0403, + "step": 3420 + }, + { + "epoch": 0.9754776161961791, + "grad_norm": 1.50416589986925, + "learning_rate": 1.5768667139086645e-08, + "loss": 0.0432, + "step": 3421 + }, + { + "epoch": 0.975762760193898, + "grad_norm": 1.3525365106907985, + "learning_rate": 1.540427332363148e-08, + "loss": 0.0193, + "step": 3422 + }, + { + "epoch": 0.9760479041916168, + "grad_norm": 2.136152304647279, + "learning_rate": 1.5044132713917803e-08, + "loss": 0.0434, + "step": 3423 + }, + { + "epoch": 0.9763330481893356, + "grad_norm": 0.703158736191946, + "learning_rate": 1.4688245617243135e-08, + "loss": 0.0141, + "step": 3424 + }, + { + "epoch": 0.9766181921870545, + "grad_norm": 1.7291368031174819, + "learning_rate": 1.4336612337274014e-08, + "loss": 0.0177, + "step": 3425 + }, + { + "epoch": 0.9769033361847733, + "grad_norm": 0.7954994827635179, + "learning_rate": 1.3989233174050431e-08, + "loss": 0.0233, + "step": 3426 + }, + { + "epoch": 0.9771884801824922, + "grad_norm": 0.5057022729587526, + "learning_rate": 1.3646108423978621e-08, + "loss": 0.0083, + "step": 3427 + }, + { + "epoch": 0.977473624180211, + "grad_norm": 1.395543096080145, + "learning_rate": 1.3307238379838273e-08, + "loss": 0.0238, + "step": 3428 + }, + { + "epoch": 0.9777587681779298, + "grad_norm": 0.5812031981674576, + "learning_rate": 1.2972623330775869e-08, + "loss": 0.0112, + "step": 3429 + }, + { + "epoch": 0.9780439121756487, + "grad_norm": 1.9594929069324818, + "learning_rate": 1.2642263562309131e-08, + "loss": 0.0526, + "step": 3430 + }, + { + "epoch": 0.9783290561733675, + "grad_norm": 0.6563286778890409, + "learning_rate": 1.2316159356323132e-08, + "loss": 0.0078, + "step": 3431 + }, + { + "epoch": 0.9786142001710864, + "grad_norm": 0.5324232039060124, + "learning_rate": 1.1994310991074177e-08, + "loss": 0.0108, + "step": 3432 + }, + { + "epoch": 0.9788993441688052, + "grad_norm": 1.2259761094193657, + "learning_rate": 1.1676718741184812e-08, + "loss": 0.033, + "step": 3433 + }, + { + "epoch": 0.9791844881665241, + "grad_norm": 1.365541853973848, + "learning_rate": 1.1363382877647155e-08, + "loss": 0.0183, + "step": 3434 + }, + { + "epoch": 0.979469632164243, + "grad_norm": 1.8621831220471732, + "learning_rate": 1.1054303667821232e-08, + "loss": 0.0299, + "step": 3435 + }, + { + "epoch": 0.9797547761619618, + "grad_norm": 0.6347679936689795, + "learning_rate": 1.0749481375434966e-08, + "loss": 0.0099, + "step": 3436 + }, + { + "epoch": 0.9800399201596807, + "grad_norm": 1.1434239902853023, + "learning_rate": 1.0448916260584752e-08, + "loss": 0.0215, + "step": 3437 + }, + { + "epoch": 0.9803250641573995, + "grad_norm": 0.7640056037278686, + "learning_rate": 1.0152608579733214e-08, + "loss": 0.0187, + "step": 3438 + }, + { + "epoch": 0.9806102081551183, + "grad_norm": 0.9356956746133379, + "learning_rate": 9.860558585710334e-09, + "loss": 0.0122, + "step": 3439 + }, + { + "epoch": 0.9808953521528372, + "grad_norm": 1.1473267478745108, + "learning_rate": 9.57276652771455e-09, + "loss": 0.0166, + "step": 3440 + }, + { + "epoch": 0.981180496150556, + "grad_norm": 1.9021257652076382, + "learning_rate": 9.289232651309432e-09, + "loss": 0.0264, + "step": 3441 + }, + { + "epoch": 0.9814656401482749, + "grad_norm": 0.630448042704614, + "learning_rate": 9.009957198426455e-09, + "loss": 0.0092, + "step": 3442 + }, + { + "epoch": 0.9817507841459937, + "grad_norm": 1.1654682091314135, + "learning_rate": 8.73494040736278e-09, + "loss": 0.0467, + "step": 3443 + }, + { + "epoch": 0.9820359281437125, + "grad_norm": 0.8094012015584077, + "learning_rate": 8.46418251278236e-09, + "loss": 0.0122, + "step": 3444 + }, + { + "epoch": 0.9823210721414314, + "grad_norm": 0.43072831856408855, + "learning_rate": 8.197683745713725e-09, + "loss": 0.0099, + "step": 3445 + }, + { + "epoch": 0.9826062161391502, + "grad_norm": 0.9313780942022813, + "learning_rate": 7.935444333552756e-09, + "loss": 0.0375, + "step": 3446 + }, + { + "epoch": 0.9828913601368692, + "grad_norm": 1.86814045946998, + "learning_rate": 7.677464500061015e-09, + "loss": 0.0354, + "step": 3447 + }, + { + "epoch": 0.983176504134588, + "grad_norm": 0.9242833759734096, + "learning_rate": 7.423744465364091e-09, + "loss": 0.017, + "step": 3448 + }, + { + "epoch": 0.9834616481323069, + "grad_norm": 1.004654257677267, + "learning_rate": 7.1742844459543605e-09, + "loss": 0.0225, + "step": 3449 + }, + { + "epoch": 0.9837467921300257, + "grad_norm": 0.5132997575591574, + "learning_rate": 6.929084654688223e-09, + "loss": 0.0076, + "step": 3450 + }, + { + "epoch": 0.9840319361277445, + "grad_norm": 1.2537143990748647, + "learning_rate": 6.688145300787208e-09, + "loss": 0.0174, + "step": 3451 + }, + { + "epoch": 0.9843170801254634, + "grad_norm": 0.856916087539409, + "learning_rate": 6.451466589837974e-09, + "loss": 0.0076, + "step": 3452 + }, + { + "epoch": 0.9846022241231822, + "grad_norm": 1.2874699118028397, + "learning_rate": 6.219048723790644e-09, + "loss": 0.0161, + "step": 3453 + }, + { + "epoch": 0.984887368120901, + "grad_norm": 0.7191572461319289, + "learning_rate": 5.990891900961582e-09, + "loss": 0.0143, + "step": 3454 + }, + { + "epoch": 0.9851725121186199, + "grad_norm": 1.5720947624205361, + "learning_rate": 5.766996316029505e-09, + "loss": 0.0312, + "step": 3455 + }, + { + "epoch": 0.9854576561163387, + "grad_norm": 1.5965820870491516, + "learning_rate": 5.547362160037151e-09, + "loss": 0.0483, + "step": 3456 + }, + { + "epoch": 0.9857428001140576, + "grad_norm": 2.324661905639617, + "learning_rate": 5.331989620392386e-09, + "loss": 0.0569, + "step": 3457 + }, + { + "epoch": 0.9860279441117764, + "grad_norm": 1.1089308013214734, + "learning_rate": 5.120878880866542e-09, + "loss": 0.0142, + "step": 3458 + }, + { + "epoch": 0.9863130881094953, + "grad_norm": 0.8282424350000585, + "learning_rate": 4.914030121593305e-09, + "loss": 0.025, + "step": 3459 + }, + { + "epoch": 0.9865982321072141, + "grad_norm": 1.2887917701143625, + "learning_rate": 4.711443519070935e-09, + "loss": 0.0357, + "step": 3460 + }, + { + "epoch": 0.986883376104933, + "grad_norm": 0.948215811306342, + "learning_rate": 4.513119246160602e-09, + "loss": 0.0131, + "step": 3461 + }, + { + "epoch": 0.9871685201026519, + "grad_norm": 1.220943268524718, + "learning_rate": 4.3190574720858305e-09, + "loss": 0.0326, + "step": 3462 + }, + { + "epoch": 0.9874536641003707, + "grad_norm": 0.41899450985279857, + "learning_rate": 4.129258362434163e-09, + "loss": 0.0054, + "step": 3463 + }, + { + "epoch": 0.9877388080980896, + "grad_norm": 0.647778329863355, + "learning_rate": 3.943722079155499e-09, + "loss": 0.0151, + "step": 3464 + }, + { + "epoch": 0.9880239520958084, + "grad_norm": 1.802544256763322, + "learning_rate": 3.762448780562089e-09, + "loss": 0.034, + "step": 3465 + }, + { + "epoch": 0.9883090960935272, + "grad_norm": 0.9733016714220596, + "learning_rate": 3.585438621329096e-09, + "loss": 0.0091, + "step": 3466 + }, + { + "epoch": 0.9885942400912461, + "grad_norm": 0.7507618500967945, + "learning_rate": 3.41269175249459e-09, + "loss": 0.0134, + "step": 3467 + }, + { + "epoch": 0.9888793840889649, + "grad_norm": 0.6719411044195046, + "learning_rate": 3.2442083214573316e-09, + "loss": 0.0074, + "step": 3468 + }, + { + "epoch": 0.9891645280866838, + "grad_norm": 2.0659767490048186, + "learning_rate": 3.0799884719795448e-09, + "loss": 0.0223, + "step": 3469 + }, + { + "epoch": 0.9894496720844026, + "grad_norm": 1.0486040613170182, + "learning_rate": 2.920032344185253e-09, + "loss": 0.0154, + "step": 3470 + }, + { + "epoch": 0.9897348160821214, + "grad_norm": 1.226563276780761, + "learning_rate": 2.7643400745602787e-09, + "loss": 0.0185, + "step": 3471 + }, + { + "epoch": 0.9900199600798403, + "grad_norm": 1.1703998955142922, + "learning_rate": 2.612911795951689e-09, + "loss": 0.0124, + "step": 3472 + }, + { + "epoch": 0.9903051040775591, + "grad_norm": 0.6924588334370636, + "learning_rate": 2.465747637568905e-09, + "loss": 0.0075, + "step": 3473 + }, + { + "epoch": 0.9905902480752781, + "grad_norm": 0.9080422538502102, + "learning_rate": 2.322847724982591e-09, + "loss": 0.0115, + "step": 3474 + }, + { + "epoch": 0.9908753920729969, + "grad_norm": 0.8362530707299618, + "learning_rate": 2.1842121801257666e-09, + "loss": 0.0132, + "step": 3475 + }, + { + "epoch": 0.9911605360707157, + "grad_norm": 0.4226196218790714, + "learning_rate": 2.0498411212904746e-09, + "loss": 0.0058, + "step": 3476 + }, + { + "epoch": 0.9914456800684346, + "grad_norm": 1.8399346633917293, + "learning_rate": 1.9197346631327774e-09, + "loss": 0.0283, + "step": 3477 + }, + { + "epoch": 0.9917308240661534, + "grad_norm": 0.7267003973721158, + "learning_rate": 1.7938929166683161e-09, + "loss": 0.0095, + "step": 3478 + }, + { + "epoch": 0.9920159680638723, + "grad_norm": 1.8275007401191372, + "learning_rate": 1.6723159892734209e-09, + "loss": 0.0189, + "step": 3479 + }, + { + "epoch": 0.9923011120615911, + "grad_norm": 0.9793326780518042, + "learning_rate": 1.5550039846867758e-09, + "loss": 0.0273, + "step": 3480 + }, + { + "epoch": 0.99258625605931, + "grad_norm": 1.3801801510684655, + "learning_rate": 1.4419570030071995e-09, + "loss": 0.0356, + "step": 3481 + }, + { + "epoch": 0.9928714000570288, + "grad_norm": 1.5847937802859948, + "learning_rate": 1.3331751406936433e-09, + "loss": 0.0445, + "step": 3482 + }, + { + "epoch": 0.9931565440547476, + "grad_norm": 0.8944423135053186, + "learning_rate": 1.2286584905668587e-09, + "loss": 0.0219, + "step": 3483 + }, + { + "epoch": 0.9934416880524665, + "grad_norm": 1.4836875534178315, + "learning_rate": 1.1284071418077303e-09, + "loss": 0.0213, + "step": 3484 + }, + { + "epoch": 0.9937268320501853, + "grad_norm": 1.3626285135130423, + "learning_rate": 1.0324211799578321e-09, + "loss": 0.0259, + "step": 3485 + }, + { + "epoch": 0.9940119760479041, + "grad_norm": 0.47759862221040755, + "learning_rate": 9.407006869188718e-10, + "loss": 0.0055, + "step": 3486 + }, + { + "epoch": 0.9942971200456231, + "grad_norm": 0.5968701456927105, + "learning_rate": 8.532457409532457e-10, + "loss": 0.0082, + "step": 3487 + }, + { + "epoch": 0.9945822640433419, + "grad_norm": 0.5594488628362712, + "learning_rate": 7.700564166834845e-10, + "loss": 0.0097, + "step": 3488 + }, + { + "epoch": 0.9948674080410608, + "grad_norm": 1.836502535790717, + "learning_rate": 6.911327850928074e-10, + "loss": 0.0312, + "step": 3489 + }, + { + "epoch": 0.9951525520387796, + "grad_norm": 0.7196938824374762, + "learning_rate": 6.164749135240122e-10, + "loss": 0.0261, + "step": 3490 + }, + { + "epoch": 0.9954376960364985, + "grad_norm": 0.9928051386279392, + "learning_rate": 5.460828656811412e-10, + "loss": 0.0251, + "step": 3491 + }, + { + "epoch": 0.9957228400342173, + "grad_norm": 0.9905040540471107, + "learning_rate": 4.799567016267048e-10, + "loss": 0.0167, + "step": 3492 + }, + { + "epoch": 0.9960079840319361, + "grad_norm": 1.4772976262817499, + "learning_rate": 4.1809647778501274e-10, + "loss": 0.0144, + "step": 3493 + }, + { + "epoch": 0.996293128029655, + "grad_norm": 1.617884852382969, + "learning_rate": 3.605022469388431e-10, + "loss": 0.0367, + "step": 3494 + }, + { + "epoch": 0.9965782720273738, + "grad_norm": 1.9532885950196877, + "learning_rate": 3.07174058232218e-10, + "loss": 0.0321, + "step": 3495 + }, + { + "epoch": 0.9968634160250927, + "grad_norm": 1.21126863713605, + "learning_rate": 2.5811195716762827e-10, + "loss": 0.0241, + "step": 3496 + }, + { + "epoch": 0.9971485600228115, + "grad_norm": 0.8578791491329285, + "learning_rate": 2.133159856093636e-10, + "loss": 0.0158, + "step": 3497 + }, + { + "epoch": 0.9974337040205303, + "grad_norm": 0.8397533407417973, + "learning_rate": 1.7278618177962726e-10, + "loss": 0.0104, + "step": 3498 + }, + { + "epoch": 0.9977188480182492, + "grad_norm": 0.8670159276111888, + "learning_rate": 1.3652258026186638e-10, + "loss": 0.0264, + "step": 3499 + }, + { + "epoch": 0.998003992015968, + "grad_norm": 1.4392317009722384, + "learning_rate": 1.045252119979967e-10, + "loss": 0.0371, + "step": 3500 + }, + { + "epoch": 0.998289136013687, + "grad_norm": 0.8578595317936397, + "learning_rate": 7.679410429117795e-11, + "loss": 0.0353, + "step": 3501 + }, + { + "epoch": 0.9985742800114058, + "grad_norm": 0.43156184395377595, + "learning_rate": 5.332928080359345e-11, + "loss": 0.01, + "step": 3502 + }, + { + "epoch": 0.9988594240091246, + "grad_norm": 1.35487670264924, + "learning_rate": 3.413076155645012e-11, + "loss": 0.0233, + "step": 3503 + }, + { + "epoch": 0.9991445680068435, + "grad_norm": 0.8191645811893169, + "learning_rate": 1.9198562931088682e-11, + "loss": 0.0199, + "step": 3504 + }, + { + "epoch": 0.9994297120045623, + "grad_norm": 1.885688610242415, + "learning_rate": 8.532697669538791e-12, + "loss": 0.0381, + "step": 3505 + }, + { + "epoch": 0.9997148560022812, + "grad_norm": 1.563181683513951, + "learning_rate": 2.1331748722985823e-12, + "loss": 0.0318, + "step": 3506 + }, + { + "epoch": 1.0, + "grad_norm": 0.8918846626454352, + "learning_rate": 0.0, + "loss": 0.0097, + "step": 3507 + }, + { + "epoch": 1.0, + "step": 3507, + "total_flos": 28156764236288.0, + "train_loss": 0.031115454253810983, + "train_runtime": 8471.7741, + "train_samples_per_second": 3.312, + "train_steps_per_second": 0.414 + } + ], + "logging_steps": 1.0, + "max_steps": 3507, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1365, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 28156764236288.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}