{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999966971628629, "eval_steps": 500, "global_step": 7569, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013211348548403077, "grad_norm": 1.223802924156189, "learning_rate": 2e-05, "loss": 0.1696, "step": 1 }, { "epoch": 0.00026422697096806154, "grad_norm": 0.8252124190330505, "learning_rate": 4e-05, "loss": 0.1671, "step": 2 }, { "epoch": 0.00039634045645209234, "grad_norm": 0.9697959423065186, "learning_rate": 6e-05, "loss": 0.1425, "step": 3 }, { "epoch": 0.0005284539419361231, "grad_norm": 0.9205353856086731, "learning_rate": 8e-05, "loss": 0.1113, "step": 4 }, { "epoch": 0.0006605674274201539, "grad_norm": 0.6761035323143005, "learning_rate": 0.0001, "loss": 0.1032, "step": 5 }, { "epoch": 0.0007926809129041847, "grad_norm": 0.6447001695632935, "learning_rate": 0.00012, "loss": 0.1314, "step": 6 }, { "epoch": 0.0009247943983882155, "grad_norm": 0.6059455871582031, "learning_rate": 0.00014, "loss": 0.1041, "step": 7 }, { "epoch": 0.0010569078838722462, "grad_norm": 0.4858318567276001, "learning_rate": 0.00016, "loss": 0.067, "step": 8 }, { "epoch": 0.001189021369356277, "grad_norm": 0.4148808717727661, "learning_rate": 0.00018, "loss": 0.0534, "step": 9 }, { "epoch": 0.0013211348548403078, "grad_norm": 0.39029794931411743, "learning_rate": 0.0002, "loss": 0.0882, "step": 10 }, { "epoch": 0.0014532483403243387, "grad_norm": 0.7170138955116272, "learning_rate": 0.00019999999136343486, "loss": 0.1022, "step": 11 }, { "epoch": 0.0015853618258083694, "grad_norm": 0.6317317485809326, "learning_rate": 0.00019999996545374095, "loss": 0.1389, "step": 12 }, { "epoch": 0.0017174753112924003, "grad_norm": 0.4452052116394043, "learning_rate": 0.00019999992227092274, "loss": 0.1099, "step": 13 }, { "epoch": 0.001849588796776431, "grad_norm": 0.37861597537994385, "learning_rate": 0.00019999986181498767, "loss": 0.0748, "step": 14 }, { "epoch": 0.0019817022822604616, "grad_norm": 0.48993363976478577, "learning_rate": 0.00019999978408594618, "loss": 0.0713, "step": 15 }, { "epoch": 0.0021138157677444923, "grad_norm": 0.35126134753227234, "learning_rate": 0.00019999968908381176, "loss": 0.0684, "step": 16 }, { "epoch": 0.0022459292532285235, "grad_norm": 0.6053355932235718, "learning_rate": 0.00019999957680860071, "loss": 0.0986, "step": 17 }, { "epoch": 0.002378042738712554, "grad_norm": 0.44116076827049255, "learning_rate": 0.00019999944726033252, "loss": 0.1038, "step": 18 }, { "epoch": 0.002510156224196585, "grad_norm": 0.3535824418067932, "learning_rate": 0.00019999930043902952, "loss": 0.0556, "step": 19 }, { "epoch": 0.0026422697096806155, "grad_norm": 0.5412662029266357, "learning_rate": 0.0001999991363447171, "loss": 0.069, "step": 20 }, { "epoch": 0.0027743831951646462, "grad_norm": 0.3860626220703125, "learning_rate": 0.0001999989549774236, "loss": 0.0875, "step": 21 }, { "epoch": 0.0029064966806486773, "grad_norm": 0.4982723891735077, "learning_rate": 0.0001999987563371803, "loss": 0.057, "step": 22 }, { "epoch": 0.003038610166132708, "grad_norm": 0.4545145630836487, "learning_rate": 0.00019999854042402157, "loss": 0.0415, "step": 23 }, { "epoch": 0.0031707236516167387, "grad_norm": 0.5430541634559631, "learning_rate": 0.00019999830723798468, "loss": 0.0939, "step": 24 }, { "epoch": 0.0033028371371007694, "grad_norm": 0.39696234464645386, "learning_rate": 0.0001999980567791099, "loss": 0.0624, "step": 25 }, { "epoch": 0.0034349506225848005, "grad_norm": 0.6339076161384583, "learning_rate": 0.0001999977890474405, "loss": 0.0873, "step": 26 }, { "epoch": 0.0035670641080688312, "grad_norm": 0.29136186838150024, "learning_rate": 0.00019999750404302272, "loss": 0.0502, "step": 27 }, { "epoch": 0.003699177593552862, "grad_norm": 0.41455066204071045, "learning_rate": 0.00019999720176590584, "loss": 0.0801, "step": 28 }, { "epoch": 0.0038312910790368926, "grad_norm": 0.40953338146209717, "learning_rate": 0.000199996882216142, "loss": 0.0763, "step": 29 }, { "epoch": 0.003963404564520923, "grad_norm": 0.3564930856227875, "learning_rate": 0.00019999654539378642, "loss": 0.0512, "step": 30 }, { "epoch": 0.004095518050004954, "grad_norm": 0.5764234066009521, "learning_rate": 0.0001999961912988973, "loss": 0.0918, "step": 31 }, { "epoch": 0.004227631535488985, "grad_norm": 0.38000357151031494, "learning_rate": 0.00019999581993153577, "loss": 0.0433, "step": 32 }, { "epoch": 0.004359745020973016, "grad_norm": 0.3675003945827484, "learning_rate": 0.000199995431291766, "loss": 0.0603, "step": 33 }, { "epoch": 0.004491858506457047, "grad_norm": 0.2798750698566437, "learning_rate": 0.00019999502537965512, "loss": 0.0384, "step": 34 }, { "epoch": 0.004623971991941077, "grad_norm": 0.38664910197257996, "learning_rate": 0.00019999460219527327, "loss": 0.0541, "step": 35 }, { "epoch": 0.004756085477425108, "grad_norm": 0.505436897277832, "learning_rate": 0.00019999416173869348, "loss": 0.0639, "step": 36 }, { "epoch": 0.0048881989629091386, "grad_norm": 0.3454754948616028, "learning_rate": 0.00019999370400999186, "loss": 0.0566, "step": 37 }, { "epoch": 0.00502031244839317, "grad_norm": 0.3809961676597595, "learning_rate": 0.00019999322900924753, "loss": 0.0522, "step": 38 }, { "epoch": 0.005152425933877201, "grad_norm": 1.07643723487854, "learning_rate": 0.00019999273673654245, "loss": 0.0714, "step": 39 }, { "epoch": 0.005284539419361231, "grad_norm": 0.5807058811187744, "learning_rate": 0.0001999922271919617, "loss": 0.0534, "step": 40 }, { "epoch": 0.005416652904845262, "grad_norm": 0.5059730410575867, "learning_rate": 0.0001999917003755933, "loss": 0.0627, "step": 41 }, { "epoch": 0.0055487663903292924, "grad_norm": 0.5554924607276917, "learning_rate": 0.0001999911562875282, "loss": 0.063, "step": 42 }, { "epoch": 0.0056808798758133236, "grad_norm": 0.4277496635913849, "learning_rate": 0.00019999059492786044, "loss": 0.0624, "step": 43 }, { "epoch": 0.005812993361297355, "grad_norm": 0.38281354308128357, "learning_rate": 0.00019999001629668692, "loss": 0.0595, "step": 44 }, { "epoch": 0.005945106846781385, "grad_norm": 0.39231449365615845, "learning_rate": 0.00019998942039410765, "loss": 0.0432, "step": 45 }, { "epoch": 0.006077220332265416, "grad_norm": 0.3610371947288513, "learning_rate": 0.00019998880722022557, "loss": 0.0602, "step": 46 }, { "epoch": 0.006209333817749447, "grad_norm": 0.4488823711872101, "learning_rate": 0.0001999881767751465, "loss": 0.0829, "step": 47 }, { "epoch": 0.0063414473032334774, "grad_norm": 0.2863968014717102, "learning_rate": 0.00019998752905897943, "loss": 0.0631, "step": 48 }, { "epoch": 0.006473560788717509, "grad_norm": 0.3344953954219818, "learning_rate": 0.00019998686407183622, "loss": 0.056, "step": 49 }, { "epoch": 0.006605674274201539, "grad_norm": 0.6836997270584106, "learning_rate": 0.0001999861818138317, "loss": 0.0765, "step": 50 }, { "epoch": 0.00673778775968557, "grad_norm": 0.283840149641037, "learning_rate": 0.00019998548228508377, "loss": 0.0491, "step": 51 }, { "epoch": 0.006869901245169601, "grad_norm": 0.35097068548202515, "learning_rate": 0.0001999847654857132, "loss": 0.0498, "step": 52 }, { "epoch": 0.007002014730653631, "grad_norm": 0.4589046537876129, "learning_rate": 0.00019998403141584386, "loss": 0.0507, "step": 53 }, { "epoch": 0.0071341282161376625, "grad_norm": 0.3400501012802124, "learning_rate": 0.0001999832800756025, "loss": 0.0578, "step": 54 }, { "epoch": 0.007266241701621693, "grad_norm": 0.3245796859264374, "learning_rate": 0.00019998251146511893, "loss": 0.0471, "step": 55 }, { "epoch": 0.007398355187105724, "grad_norm": 0.4059467017650604, "learning_rate": 0.0001999817255845259, "loss": 0.0613, "step": 56 }, { "epoch": 0.007530468672589755, "grad_norm": 0.32029154896736145, "learning_rate": 0.00019998092243395918, "loss": 0.0545, "step": 57 }, { "epoch": 0.007662582158073785, "grad_norm": 0.29986247420310974, "learning_rate": 0.00019998010201355745, "loss": 0.0644, "step": 58 }, { "epoch": 0.007794695643557816, "grad_norm": 0.3331080973148346, "learning_rate": 0.00019997926432346245, "loss": 0.0702, "step": 59 }, { "epoch": 0.007926809129041847, "grad_norm": 0.5329291820526123, "learning_rate": 0.00019997840936381893, "loss": 0.0504, "step": 60 }, { "epoch": 0.008058922614525878, "grad_norm": 0.32196447253227234, "learning_rate": 0.00019997753713477448, "loss": 0.0618, "step": 61 }, { "epoch": 0.008191036100009909, "grad_norm": 0.4515661597251892, "learning_rate": 0.00019997664763647977, "loss": 0.0544, "step": 62 }, { "epoch": 0.00832314958549394, "grad_norm": 0.31676095724105835, "learning_rate": 0.0001999757408690885, "loss": 0.0645, "step": 63 }, { "epoch": 0.00845526307097797, "grad_norm": 0.29965102672576904, "learning_rate": 0.00019997481683275728, "loss": 0.0613, "step": 64 }, { "epoch": 0.008587376556462, "grad_norm": 0.3969486355781555, "learning_rate": 0.00019997387552764568, "loss": 0.0481, "step": 65 }, { "epoch": 0.008719490041946032, "grad_norm": 0.27435922622680664, "learning_rate": 0.00019997291695391636, "loss": 0.0383, "step": 66 }, { "epoch": 0.008851603527430063, "grad_norm": 0.37260785698890686, "learning_rate": 0.00019997194111173483, "loss": 0.0459, "step": 67 }, { "epoch": 0.008983717012914094, "grad_norm": 0.44555535912513733, "learning_rate": 0.0001999709480012697, "loss": 0.0576, "step": 68 }, { "epoch": 0.009115830498398123, "grad_norm": 0.5709348917007446, "learning_rate": 0.00019996993762269244, "loss": 0.0774, "step": 69 }, { "epoch": 0.009247943983882154, "grad_norm": 0.48640209436416626, "learning_rate": 0.00019996890997617766, "loss": 0.07, "step": 70 }, { "epoch": 0.009380057469366185, "grad_norm": 0.4809843897819519, "learning_rate": 0.0001999678650619028, "loss": 0.0783, "step": 71 }, { "epoch": 0.009512170954850217, "grad_norm": 0.3329864740371704, "learning_rate": 0.0001999668028800484, "loss": 0.0468, "step": 72 }, { "epoch": 0.009644284440334248, "grad_norm": 0.2965797781944275, "learning_rate": 0.00019996572343079788, "loss": 0.0433, "step": 73 }, { "epoch": 0.009776397925818277, "grad_norm": 0.31055697798728943, "learning_rate": 0.00019996462671433775, "loss": 0.075, "step": 74 }, { "epoch": 0.009908511411302308, "grad_norm": 0.3642573952674866, "learning_rate": 0.00019996351273085744, "loss": 0.0517, "step": 75 }, { "epoch": 0.01004062489678634, "grad_norm": 0.32922685146331787, "learning_rate": 0.0001999623814805493, "loss": 0.0499, "step": 76 }, { "epoch": 0.01017273838227037, "grad_norm": 0.27644476294517517, "learning_rate": 0.00019996123296360882, "loss": 0.0513, "step": 77 }, { "epoch": 0.010304851867754402, "grad_norm": 0.2898384630680084, "learning_rate": 0.00019996006718023433, "loss": 0.0516, "step": 78 }, { "epoch": 0.010436965353238431, "grad_norm": 0.2735963761806488, "learning_rate": 0.00019995888413062724, "loss": 0.0389, "step": 79 }, { "epoch": 0.010569078838722462, "grad_norm": 0.2912638783454895, "learning_rate": 0.00019995768381499186, "loss": 0.053, "step": 80 }, { "epoch": 0.010701192324206493, "grad_norm": 0.42680662870407104, "learning_rate": 0.00019995646623353555, "loss": 0.0569, "step": 81 }, { "epoch": 0.010833305809690524, "grad_norm": 0.43611371517181396, "learning_rate": 0.00019995523138646858, "loss": 0.0447, "step": 82 }, { "epoch": 0.010965419295174555, "grad_norm": 0.419028103351593, "learning_rate": 0.0001999539792740043, "loss": 0.0694, "step": 83 }, { "epoch": 0.011097532780658585, "grad_norm": 0.3377486765384674, "learning_rate": 0.00019995270989635894, "loss": 0.0568, "step": 84 }, { "epoch": 0.011229646266142616, "grad_norm": 0.25879159569740295, "learning_rate": 0.00019995142325375181, "loss": 0.0418, "step": 85 }, { "epoch": 0.011361759751626647, "grad_norm": 0.2732607126235962, "learning_rate": 0.00019995011934640516, "loss": 0.0446, "step": 86 }, { "epoch": 0.011493873237110678, "grad_norm": 0.39738181233406067, "learning_rate": 0.00019994879817454415, "loss": 0.0588, "step": 87 }, { "epoch": 0.01162598672259471, "grad_norm": 0.5449623465538025, "learning_rate": 0.00019994745973839703, "loss": 0.0697, "step": 88 }, { "epoch": 0.01175810020807874, "grad_norm": 0.48834940791130066, "learning_rate": 0.000199946104038195, "loss": 0.0654, "step": 89 }, { "epoch": 0.01189021369356277, "grad_norm": 0.3608017563819885, "learning_rate": 0.0001999447310741722, "loss": 0.0515, "step": 90 }, { "epoch": 0.012022327179046801, "grad_norm": 0.28702256083488464, "learning_rate": 0.0001999433408465658, "loss": 0.05, "step": 91 }, { "epoch": 0.012154440664530832, "grad_norm": 0.8507578372955322, "learning_rate": 0.00019994193335561594, "loss": 0.1108, "step": 92 }, { "epoch": 0.012286554150014863, "grad_norm": 0.35049304366111755, "learning_rate": 0.00019994050860156574, "loss": 0.0513, "step": 93 }, { "epoch": 0.012418667635498894, "grad_norm": 0.23222468793392181, "learning_rate": 0.0001999390665846613, "loss": 0.0363, "step": 94 }, { "epoch": 0.012550781120982924, "grad_norm": 0.5731639862060547, "learning_rate": 0.00019993760730515166, "loss": 0.054, "step": 95 }, { "epoch": 0.012682894606466955, "grad_norm": 0.26577410101890564, "learning_rate": 0.00019993613076328898, "loss": 0.042, "step": 96 }, { "epoch": 0.012815008091950986, "grad_norm": 0.33413466811180115, "learning_rate": 0.0001999346369593282, "loss": 0.031, "step": 97 }, { "epoch": 0.012947121577435017, "grad_norm": 0.311821848154068, "learning_rate": 0.00019993312589352739, "loss": 0.0518, "step": 98 }, { "epoch": 0.013079235062919048, "grad_norm": 0.4056153893470764, "learning_rate": 0.00019993159756614759, "loss": 0.0582, "step": 99 }, { "epoch": 0.013211348548403078, "grad_norm": 0.31561562418937683, "learning_rate": 0.00019993005197745274, "loss": 0.0554, "step": 100 }, { "epoch": 0.013343462033887109, "grad_norm": 0.4322955906391144, "learning_rate": 0.00019992848912770984, "loss": 0.0516, "step": 101 }, { "epoch": 0.01347557551937114, "grad_norm": 0.2842521369457245, "learning_rate": 0.0001999269090171888, "loss": 0.0443, "step": 102 }, { "epoch": 0.013607689004855171, "grad_norm": 0.4293564558029175, "learning_rate": 0.00019992531164616262, "loss": 0.0679, "step": 103 }, { "epoch": 0.013739802490339202, "grad_norm": 0.4246540069580078, "learning_rate": 0.00019992369701490715, "loss": 0.0428, "step": 104 }, { "epoch": 0.013871915975823232, "grad_norm": 0.4045541286468506, "learning_rate": 0.00019992206512370135, "loss": 0.0616, "step": 105 }, { "epoch": 0.014004029461307263, "grad_norm": 0.37344667315483093, "learning_rate": 0.00019992041597282706, "loss": 0.037, "step": 106 }, { "epoch": 0.014136142946791294, "grad_norm": 0.19354045391082764, "learning_rate": 0.00019991874956256918, "loss": 0.0255, "step": 107 }, { "epoch": 0.014268256432275325, "grad_norm": 0.2800353765487671, "learning_rate": 0.00019991706589321548, "loss": 0.0475, "step": 108 }, { "epoch": 0.014400369917759356, "grad_norm": 0.29562219977378845, "learning_rate": 0.00019991536496505682, "loss": 0.0441, "step": 109 }, { "epoch": 0.014532483403243385, "grad_norm": 0.35554012656211853, "learning_rate": 0.00019991364677838705, "loss": 0.0477, "step": 110 }, { "epoch": 0.014664596888727417, "grad_norm": 0.2925634980201721, "learning_rate": 0.00019991191133350287, "loss": 0.031, "step": 111 }, { "epoch": 0.014796710374211448, "grad_norm": 0.29444658756256104, "learning_rate": 0.00019991015863070411, "loss": 0.0532, "step": 112 }, { "epoch": 0.014928823859695479, "grad_norm": 0.3253136873245239, "learning_rate": 0.00019990838867029348, "loss": 0.0579, "step": 113 }, { "epoch": 0.01506093734517951, "grad_norm": 0.31317800283432007, "learning_rate": 0.00019990660145257673, "loss": 0.0486, "step": 114 }, { "epoch": 0.01519305083066354, "grad_norm": 0.3204265236854553, "learning_rate": 0.00019990479697786257, "loss": 0.0497, "step": 115 }, { "epoch": 0.01532516431614757, "grad_norm": 0.3222450911998749, "learning_rate": 0.0001999029752464627, "loss": 0.047, "step": 116 }, { "epoch": 0.015457277801631602, "grad_norm": 0.4164290726184845, "learning_rate": 0.00019990113625869172, "loss": 0.0575, "step": 117 }, { "epoch": 0.015589391287115633, "grad_norm": 0.32476532459259033, "learning_rate": 0.00019989928001486735, "loss": 0.0426, "step": 118 }, { "epoch": 0.015721504772599662, "grad_norm": 0.32922977209091187, "learning_rate": 0.0001998974065153102, "loss": 0.0502, "step": 119 }, { "epoch": 0.015853618258083693, "grad_norm": 0.5189375281333923, "learning_rate": 0.0001998955157603439, "loss": 0.0707, "step": 120 }, { "epoch": 0.015985731743567724, "grad_norm": 0.2372635304927826, "learning_rate": 0.000199893607750295, "loss": 0.0362, "step": 121 }, { "epoch": 0.016117845229051755, "grad_norm": 0.39730942249298096, "learning_rate": 0.00019989168248549312, "loss": 0.0472, "step": 122 }, { "epoch": 0.016249958714535787, "grad_norm": 0.3153408467769623, "learning_rate": 0.00019988973996627076, "loss": 0.0335, "step": 123 }, { "epoch": 0.016382072200019818, "grad_norm": 0.38365423679351807, "learning_rate": 0.0001998877801929635, "loss": 0.0574, "step": 124 }, { "epoch": 0.01651418568550385, "grad_norm": 0.35128024220466614, "learning_rate": 0.00019988580316590985, "loss": 0.039, "step": 125 }, { "epoch": 0.01664629917098788, "grad_norm": 0.23545370995998383, "learning_rate": 0.00019988380888545128, "loss": 0.0374, "step": 126 }, { "epoch": 0.01677841265647191, "grad_norm": 0.35600200295448303, "learning_rate": 0.00019988179735193232, "loss": 0.0625, "step": 127 }, { "epoch": 0.01691052614195594, "grad_norm": 0.3094804286956787, "learning_rate": 0.00019987976856570034, "loss": 0.0505, "step": 128 }, { "epoch": 0.01704263962743997, "grad_norm": 0.41479748487472534, "learning_rate": 0.00019987772252710582, "loss": 0.0468, "step": 129 }, { "epoch": 0.017174753112924, "grad_norm": 0.3505192995071411, "learning_rate": 0.0001998756592365022, "loss": 0.0562, "step": 130 }, { "epoch": 0.017306866598408032, "grad_norm": 0.30042126774787903, "learning_rate": 0.00019987357869424586, "loss": 0.046, "step": 131 }, { "epoch": 0.017438980083892063, "grad_norm": 0.27621573209762573, "learning_rate": 0.00019987148090069617, "loss": 0.0474, "step": 132 }, { "epoch": 0.017571093569376094, "grad_norm": 0.28688856959342957, "learning_rate": 0.00019986936585621542, "loss": 0.0348, "step": 133 }, { "epoch": 0.017703207054860125, "grad_norm": 0.314082533121109, "learning_rate": 0.00019986723356116905, "loss": 0.0699, "step": 134 }, { "epoch": 0.017835320540344157, "grad_norm": 0.3481959104537964, "learning_rate": 0.0001998650840159253, "loss": 0.0376, "step": 135 }, { "epoch": 0.017967434025828188, "grad_norm": 0.31359419226646423, "learning_rate": 0.00019986291722085553, "loss": 0.0456, "step": 136 }, { "epoch": 0.01809954751131222, "grad_norm": 0.29597827792167664, "learning_rate": 0.00019986073317633394, "loss": 0.0481, "step": 137 }, { "epoch": 0.018231660996796246, "grad_norm": 0.31836578249931335, "learning_rate": 0.00019985853188273783, "loss": 0.0566, "step": 138 }, { "epoch": 0.018363774482280278, "grad_norm": 0.3767854869365692, "learning_rate": 0.0001998563133404474, "loss": 0.0456, "step": 139 }, { "epoch": 0.01849588796776431, "grad_norm": 0.5224811434745789, "learning_rate": 0.0001998540775498459, "loss": 0.0605, "step": 140 }, { "epoch": 0.01862800145324834, "grad_norm": 0.6283369660377502, "learning_rate": 0.00019985182451131948, "loss": 0.0737, "step": 141 }, { "epoch": 0.01876011493873237, "grad_norm": 0.29250311851501465, "learning_rate": 0.00019984955422525737, "loss": 0.0393, "step": 142 }, { "epoch": 0.018892228424216402, "grad_norm": 0.3254264295101166, "learning_rate": 0.00019984726669205167, "loss": 0.0511, "step": 143 }, { "epoch": 0.019024341909700433, "grad_norm": 0.5066094398498535, "learning_rate": 0.00019984496191209752, "loss": 0.0519, "step": 144 }, { "epoch": 0.019156455395184464, "grad_norm": 0.4091821312904358, "learning_rate": 0.00019984263988579302, "loss": 0.0602, "step": 145 }, { "epoch": 0.019288568880668495, "grad_norm": 0.3435320556163788, "learning_rate": 0.00019984030061353925, "loss": 0.0508, "step": 146 }, { "epoch": 0.019420682366152527, "grad_norm": 0.2907693386077881, "learning_rate": 0.0001998379440957403, "loss": 0.0469, "step": 147 }, { "epoch": 0.019552795851636554, "grad_norm": 0.38646361231803894, "learning_rate": 0.00019983557033280322, "loss": 0.0435, "step": 148 }, { "epoch": 0.019684909337120585, "grad_norm": 0.3644489347934723, "learning_rate": 0.000199833179325138, "loss": 0.0468, "step": 149 }, { "epoch": 0.019817022822604616, "grad_norm": 0.3132040202617645, "learning_rate": 0.00019983077107315768, "loss": 0.0566, "step": 150 }, { "epoch": 0.019949136308088648, "grad_norm": 0.41234928369522095, "learning_rate": 0.0001998283455772782, "loss": 0.0571, "step": 151 }, { "epoch": 0.02008124979357268, "grad_norm": 0.31212666630744934, "learning_rate": 0.00019982590283791857, "loss": 0.0345, "step": 152 }, { "epoch": 0.02021336327905671, "grad_norm": 0.27527937293052673, "learning_rate": 0.00019982344285550068, "loss": 0.0414, "step": 153 }, { "epoch": 0.02034547676454074, "grad_norm": 0.40870988368988037, "learning_rate": 0.00019982096563044946, "loss": 0.0633, "step": 154 }, { "epoch": 0.020477590250024772, "grad_norm": 0.3136458098888397, "learning_rate": 0.0001998184711631928, "loss": 0.0445, "step": 155 }, { "epoch": 0.020609703735508803, "grad_norm": 0.26755937933921814, "learning_rate": 0.00019981595945416157, "loss": 0.0431, "step": 156 }, { "epoch": 0.020741817220992834, "grad_norm": 0.5987271666526794, "learning_rate": 0.00019981343050378967, "loss": 0.0578, "step": 157 }, { "epoch": 0.020873930706476862, "grad_norm": 1.4551541805267334, "learning_rate": 0.00019981088431251384, "loss": 0.0443, "step": 158 }, { "epoch": 0.021006044191960893, "grad_norm": 0.3044228255748749, "learning_rate": 0.00019980832088077396, "loss": 0.0385, "step": 159 }, { "epoch": 0.021138157677444924, "grad_norm": 0.4092259109020233, "learning_rate": 0.00019980574020901282, "loss": 0.0492, "step": 160 }, { "epoch": 0.021270271162928955, "grad_norm": 0.3194064497947693, "learning_rate": 0.00019980314229767608, "loss": 0.0571, "step": 161 }, { "epoch": 0.021402384648412986, "grad_norm": 0.27775710821151733, "learning_rate": 0.00019980052714721263, "loss": 0.0477, "step": 162 }, { "epoch": 0.021534498133897018, "grad_norm": 0.3487389385700226, "learning_rate": 0.0001997978947580741, "loss": 0.0427, "step": 163 }, { "epoch": 0.02166661161938105, "grad_norm": 0.3832155466079712, "learning_rate": 0.00019979524513071516, "loss": 0.045, "step": 164 }, { "epoch": 0.02179872510486508, "grad_norm": 0.27927878499031067, "learning_rate": 0.00019979257826559357, "loss": 0.0371, "step": 165 }, { "epoch": 0.02193083859034911, "grad_norm": 0.2599792182445526, "learning_rate": 0.00019978989416316988, "loss": 0.0436, "step": 166 }, { "epoch": 0.022062952075833142, "grad_norm": 0.33800897002220154, "learning_rate": 0.00019978719282390782, "loss": 0.0624, "step": 167 }, { "epoch": 0.02219506556131717, "grad_norm": 0.3244200646877289, "learning_rate": 0.00019978447424827392, "loss": 0.0305, "step": 168 }, { "epoch": 0.0223271790468012, "grad_norm": 0.3526539206504822, "learning_rate": 0.00019978173843673779, "loss": 0.0428, "step": 169 }, { "epoch": 0.022459292532285232, "grad_norm": 0.21195152401924133, "learning_rate": 0.00019977898538977201, "loss": 0.0326, "step": 170 }, { "epoch": 0.022591406017769263, "grad_norm": 0.36155325174331665, "learning_rate": 0.00019977621510785208, "loss": 0.0716, "step": 171 }, { "epoch": 0.022723519503253294, "grad_norm": 0.31337928771972656, "learning_rate": 0.00019977342759145653, "loss": 0.0436, "step": 172 }, { "epoch": 0.022855632988737325, "grad_norm": 0.33871617913246155, "learning_rate": 0.00019977062284106688, "loss": 0.0496, "step": 173 }, { "epoch": 0.022987746474221357, "grad_norm": 0.22005482017993927, "learning_rate": 0.00019976780085716758, "loss": 0.0246, "step": 174 }, { "epoch": 0.023119859959705388, "grad_norm": 0.4200701117515564, "learning_rate": 0.00019976496164024604, "loss": 0.0575, "step": 175 }, { "epoch": 0.02325197344518942, "grad_norm": 0.3284203112125397, "learning_rate": 0.0001997621051907927, "loss": 0.0565, "step": 176 }, { "epoch": 0.02338408693067345, "grad_norm": 0.3615087568759918, "learning_rate": 0.000199759231509301, "loss": 0.0535, "step": 177 }, { "epoch": 0.02351620041615748, "grad_norm": 0.2843573987483978, "learning_rate": 0.00019975634059626727, "loss": 0.0453, "step": 178 }, { "epoch": 0.02364831390164151, "grad_norm": 0.3295792043209076, "learning_rate": 0.00019975343245219086, "loss": 0.0551, "step": 179 }, { "epoch": 0.02378042738712554, "grad_norm": 0.3016244173049927, "learning_rate": 0.00019975050707757413, "loss": 0.054, "step": 180 }, { "epoch": 0.02391254087260957, "grad_norm": 0.3135812282562256, "learning_rate": 0.00019974756447292235, "loss": 0.0415, "step": 181 }, { "epoch": 0.024044654358093602, "grad_norm": 0.2880041301250458, "learning_rate": 0.00019974460463874382, "loss": 0.0417, "step": 182 }, { "epoch": 0.024176767843577633, "grad_norm": 0.23650489747524261, "learning_rate": 0.0001997416275755498, "loss": 0.0382, "step": 183 }, { "epoch": 0.024308881329061664, "grad_norm": 0.20587410032749176, "learning_rate": 0.0001997386332838545, "loss": 0.0285, "step": 184 }, { "epoch": 0.024440994814545695, "grad_norm": 0.2216879278421402, "learning_rate": 0.00019973562176417515, "loss": 0.0293, "step": 185 }, { "epoch": 0.024573108300029727, "grad_norm": 0.3186545968055725, "learning_rate": 0.0001997325930170319, "loss": 0.0408, "step": 186 }, { "epoch": 0.024705221785513758, "grad_norm": 0.25373074412345886, "learning_rate": 0.00019972954704294797, "loss": 0.0323, "step": 187 }, { "epoch": 0.02483733527099779, "grad_norm": 0.27224501967430115, "learning_rate": 0.00019972648384244943, "loss": 0.0389, "step": 188 }, { "epoch": 0.024969448756481816, "grad_norm": 0.2983395755290985, "learning_rate": 0.00019972340341606546, "loss": 0.0511, "step": 189 }, { "epoch": 0.025101562241965848, "grad_norm": 0.2117580771446228, "learning_rate": 0.00019972030576432807, "loss": 0.0279, "step": 190 }, { "epoch": 0.02523367572744988, "grad_norm": 0.3268132507801056, "learning_rate": 0.00019971719088777236, "loss": 0.0526, "step": 191 }, { "epoch": 0.02536578921293391, "grad_norm": 0.295146644115448, "learning_rate": 0.00019971405878693637, "loss": 0.0371, "step": 192 }, { "epoch": 0.02549790269841794, "grad_norm": 0.3200048804283142, "learning_rate": 0.00019971090946236108, "loss": 0.0454, "step": 193 }, { "epoch": 0.025630016183901972, "grad_norm": 0.30760833621025085, "learning_rate": 0.00019970774291459053, "loss": 0.0477, "step": 194 }, { "epoch": 0.025762129669386003, "grad_norm": 0.28404486179351807, "learning_rate": 0.00019970455914417165, "loss": 0.0317, "step": 195 }, { "epoch": 0.025894243154870034, "grad_norm": 0.271648645401001, "learning_rate": 0.00019970135815165438, "loss": 0.0354, "step": 196 }, { "epoch": 0.026026356640354065, "grad_norm": 0.3435926139354706, "learning_rate": 0.00019969813993759162, "loss": 0.0443, "step": 197 }, { "epoch": 0.026158470125838097, "grad_norm": 0.31753024458885193, "learning_rate": 0.00019969490450253932, "loss": 0.0439, "step": 198 }, { "epoch": 0.026290583611322124, "grad_norm": 0.4796743094921112, "learning_rate": 0.00019969165184705623, "loss": 0.0335, "step": 199 }, { "epoch": 0.026422697096806155, "grad_norm": 0.47100892663002014, "learning_rate": 0.00019968838197170427, "loss": 0.0516, "step": 200 }, { "epoch": 0.026554810582290186, "grad_norm": 0.32624903321266174, "learning_rate": 0.0001996850948770482, "loss": 0.0431, "step": 201 }, { "epoch": 0.026686924067774218, "grad_norm": 0.2607976198196411, "learning_rate": 0.00019968179056365588, "loss": 0.0306, "step": 202 }, { "epoch": 0.02681903755325825, "grad_norm": 0.3531661331653595, "learning_rate": 0.000199678469032098, "loss": 0.0474, "step": 203 }, { "epoch": 0.02695115103874228, "grad_norm": 0.3134405314922333, "learning_rate": 0.0001996751302829483, "loss": 0.0518, "step": 204 }, { "epoch": 0.02708326452422631, "grad_norm": 0.26866933703422546, "learning_rate": 0.00019967177431678347, "loss": 0.0375, "step": 205 }, { "epoch": 0.027215378009710342, "grad_norm": 0.30498382449150085, "learning_rate": 0.00019966840113418326, "loss": 0.033, "step": 206 }, { "epoch": 0.027347491495194373, "grad_norm": 0.27717041969299316, "learning_rate": 0.00019966501073573025, "loss": 0.0387, "step": 207 }, { "epoch": 0.027479604980678404, "grad_norm": 0.3919484615325928, "learning_rate": 0.0001996616031220101, "loss": 0.0434, "step": 208 }, { "epoch": 0.027611718466162432, "grad_norm": 0.31150931119918823, "learning_rate": 0.00019965817829361145, "loss": 0.0352, "step": 209 }, { "epoch": 0.027743831951646463, "grad_norm": 1.0243334770202637, "learning_rate": 0.0001996547362511258, "loss": 0.0719, "step": 210 }, { "epoch": 0.027875945437130494, "grad_norm": 0.4097023904323578, "learning_rate": 0.00019965127699514773, "loss": 0.0503, "step": 211 }, { "epoch": 0.028008058922614525, "grad_norm": 0.29546260833740234, "learning_rate": 0.00019964780052627478, "loss": 0.0484, "step": 212 }, { "epoch": 0.028140172408098556, "grad_norm": 0.3095148801803589, "learning_rate": 0.00019964430684510744, "loss": 0.0548, "step": 213 }, { "epoch": 0.028272285893582588, "grad_norm": 0.37686797976493835, "learning_rate": 0.00019964079595224919, "loss": 0.0559, "step": 214 }, { "epoch": 0.02840439937906662, "grad_norm": 0.352685809135437, "learning_rate": 0.0001996372678483064, "loss": 0.0424, "step": 215 }, { "epoch": 0.02853651286455065, "grad_norm": 0.30168718099594116, "learning_rate": 0.00019963372253388857, "loss": 0.0392, "step": 216 }, { "epoch": 0.02866862635003468, "grad_norm": 0.3077027201652527, "learning_rate": 0.00019963016000960803, "loss": 0.043, "step": 217 }, { "epoch": 0.028800739835518712, "grad_norm": 0.1910567730665207, "learning_rate": 0.00019962658027608017, "loss": 0.0266, "step": 218 }, { "epoch": 0.02893285332100274, "grad_norm": 0.3267483115196228, "learning_rate": 0.00019962298333392331, "loss": 0.0466, "step": 219 }, { "epoch": 0.02906496680648677, "grad_norm": 0.2665257155895233, "learning_rate": 0.00019961936918375876, "loss": 0.0373, "step": 220 }, { "epoch": 0.029197080291970802, "grad_norm": 0.3253210484981537, "learning_rate": 0.0001996157378262108, "loss": 0.0416, "step": 221 }, { "epoch": 0.029329193777454833, "grad_norm": 0.32253530621528625, "learning_rate": 0.00019961208926190668, "loss": 0.0369, "step": 222 }, { "epoch": 0.029461307262938864, "grad_norm": 0.3137173652648926, "learning_rate": 0.00019960842349147658, "loss": 0.028, "step": 223 }, { "epoch": 0.029593420748422895, "grad_norm": 0.3122856020927429, "learning_rate": 0.00019960474051555372, "loss": 0.0547, "step": 224 }, { "epoch": 0.029725534233906926, "grad_norm": 0.3124530613422394, "learning_rate": 0.00019960104033477433, "loss": 0.0529, "step": 225 }, { "epoch": 0.029857647719390958, "grad_norm": 0.3381507992744446, "learning_rate": 0.00019959732294977744, "loss": 0.0457, "step": 226 }, { "epoch": 0.02998976120487499, "grad_norm": 0.25204360485076904, "learning_rate": 0.00019959358836120524, "loss": 0.0262, "step": 227 }, { "epoch": 0.03012187469035902, "grad_norm": 0.21586604416370392, "learning_rate": 0.00019958983656970277, "loss": 0.0328, "step": 228 }, { "epoch": 0.030253988175843047, "grad_norm": 0.32628804445266724, "learning_rate": 0.00019958606757591806, "loss": 0.0332, "step": 229 }, { "epoch": 0.03038610166132708, "grad_norm": 0.31612610816955566, "learning_rate": 0.00019958228138050222, "loss": 0.0358, "step": 230 }, { "epoch": 0.03051821514681111, "grad_norm": 0.2597368061542511, "learning_rate": 0.00019957847798410914, "loss": 0.0321, "step": 231 }, { "epoch": 0.03065032863229514, "grad_norm": 0.2700968384742737, "learning_rate": 0.00019957465738739587, "loss": 0.0507, "step": 232 }, { "epoch": 0.030782442117779172, "grad_norm": 0.3140323758125305, "learning_rate": 0.0001995708195910223, "loss": 0.06, "step": 233 }, { "epoch": 0.030914555603263203, "grad_norm": 0.2122896909713745, "learning_rate": 0.00019956696459565133, "loss": 0.0308, "step": 234 }, { "epoch": 0.031046669088747234, "grad_norm": 0.30587294697761536, "learning_rate": 0.00019956309240194887, "loss": 0.0398, "step": 235 }, { "epoch": 0.031178782574231265, "grad_norm": 0.3328923285007477, "learning_rate": 0.00019955920301058377, "loss": 0.0703, "step": 236 }, { "epoch": 0.03131089605971529, "grad_norm": 0.25456702709198, "learning_rate": 0.00019955529642222782, "loss": 0.0384, "step": 237 }, { "epoch": 0.031443009545199324, "grad_norm": 0.4323252737522125, "learning_rate": 0.00019955137263755584, "loss": 0.0473, "step": 238 }, { "epoch": 0.031575123030683355, "grad_norm": 0.2853761911392212, "learning_rate": 0.00019954743165724554, "loss": 0.0463, "step": 239 }, { "epoch": 0.031707236516167386, "grad_norm": 0.24068021774291992, "learning_rate": 0.00019954347348197772, "loss": 0.0422, "step": 240 }, { "epoch": 0.03183935000165142, "grad_norm": 0.24711669981479645, "learning_rate": 0.00019953949811243602, "loss": 0.048, "step": 241 }, { "epoch": 0.03197146348713545, "grad_norm": 0.3120286464691162, "learning_rate": 0.00019953550554930715, "loss": 0.0297, "step": 242 }, { "epoch": 0.03210357697261948, "grad_norm": 0.35250943899154663, "learning_rate": 0.00019953149579328075, "loss": 0.028, "step": 243 }, { "epoch": 0.03223569045810351, "grad_norm": 0.26348239183425903, "learning_rate": 0.00019952746884504942, "loss": 0.037, "step": 244 }, { "epoch": 0.03236780394358754, "grad_norm": 0.23028039932250977, "learning_rate": 0.00019952342470530874, "loss": 0.0279, "step": 245 }, { "epoch": 0.03249991742907157, "grad_norm": 0.3633415102958679, "learning_rate": 0.00019951936337475723, "loss": 0.0627, "step": 246 }, { "epoch": 0.032632030914555604, "grad_norm": 0.582521915435791, "learning_rate": 0.00019951528485409646, "loss": 0.0819, "step": 247 }, { "epoch": 0.032764144400039635, "grad_norm": 0.28156954050064087, "learning_rate": 0.0001995111891440309, "loss": 0.044, "step": 248 }, { "epoch": 0.032896257885523666, "grad_norm": 0.4584990441799164, "learning_rate": 0.000199507076245268, "loss": 0.0534, "step": 249 }, { "epoch": 0.0330283713710077, "grad_norm": 0.45770400762557983, "learning_rate": 0.00019950294615851818, "loss": 0.0524, "step": 250 }, { "epoch": 0.03316048485649173, "grad_norm": 0.23415185511112213, "learning_rate": 0.00019949879888449487, "loss": 0.0278, "step": 251 }, { "epoch": 0.03329259834197576, "grad_norm": 0.23550723493099213, "learning_rate": 0.00019949463442391437, "loss": 0.0466, "step": 252 }, { "epoch": 0.03342471182745979, "grad_norm": 0.21547628939151764, "learning_rate": 0.00019949045277749608, "loss": 0.0346, "step": 253 }, { "epoch": 0.03355682531294382, "grad_norm": 0.28360670804977417, "learning_rate": 0.0001994862539459623, "loss": 0.0447, "step": 254 }, { "epoch": 0.033688938798427846, "grad_norm": 0.22443735599517822, "learning_rate": 0.00019948203793003822, "loss": 0.0446, "step": 255 }, { "epoch": 0.03382105228391188, "grad_norm": 0.46867096424102783, "learning_rate": 0.00019947780473045216, "loss": 0.0703, "step": 256 }, { "epoch": 0.03395316576939591, "grad_norm": 0.24217675626277924, "learning_rate": 0.00019947355434793526, "loss": 0.0486, "step": 257 }, { "epoch": 0.03408527925487994, "grad_norm": 0.2256115823984146, "learning_rate": 0.00019946928678322173, "loss": 0.0383, "step": 258 }, { "epoch": 0.03421739274036397, "grad_norm": 0.2267841249704361, "learning_rate": 0.00019946500203704877, "loss": 0.0366, "step": 259 }, { "epoch": 0.034349506225848, "grad_norm": 0.330691397190094, "learning_rate": 0.00019946070011015642, "loss": 0.0473, "step": 260 }, { "epoch": 0.03448161971133203, "grad_norm": 0.3271361291408539, "learning_rate": 0.0001994563810032877, "loss": 0.0391, "step": 261 }, { "epoch": 0.034613733196816064, "grad_norm": 0.23788490891456604, "learning_rate": 0.0001994520447171888, "loss": 0.0251, "step": 262 }, { "epoch": 0.034745846682300095, "grad_norm": 0.37657663226127625, "learning_rate": 0.00019944769125260862, "loss": 0.0442, "step": 263 }, { "epoch": 0.034877960167784126, "grad_norm": 0.37910810112953186, "learning_rate": 0.0001994433206102992, "loss": 0.0411, "step": 264 }, { "epoch": 0.03501007365326816, "grad_norm": 0.4243963956832886, "learning_rate": 0.00019943893279101543, "loss": 0.0621, "step": 265 }, { "epoch": 0.03514218713875219, "grad_norm": 0.3931756019592285, "learning_rate": 0.0001994345277955153, "loss": 0.0544, "step": 266 }, { "epoch": 0.03527430062423622, "grad_norm": 0.5544042587280273, "learning_rate": 0.00019943010562455962, "loss": 0.046, "step": 267 }, { "epoch": 0.03540641410972025, "grad_norm": 0.3359571695327759, "learning_rate": 0.0001994256662789123, "loss": 0.0476, "step": 268 }, { "epoch": 0.03553852759520428, "grad_norm": 0.26182490587234497, "learning_rate": 0.00019942120975934008, "loss": 0.0301, "step": 269 }, { "epoch": 0.03567064108068831, "grad_norm": 0.30689841508865356, "learning_rate": 0.00019941673606661277, "loss": 0.0414, "step": 270 }, { "epoch": 0.035802754566172344, "grad_norm": 1.3792670965194702, "learning_rate": 0.00019941224520150314, "loss": 0.0793, "step": 271 }, { "epoch": 0.035934868051656375, "grad_norm": 0.2693886160850525, "learning_rate": 0.0001994077371647869, "loss": 0.038, "step": 272 }, { "epoch": 0.036066981537140406, "grad_norm": 0.26960980892181396, "learning_rate": 0.0001994032119572427, "loss": 0.0331, "step": 273 }, { "epoch": 0.03619909502262444, "grad_norm": 0.3784126937389374, "learning_rate": 0.00019939866957965224, "loss": 0.0527, "step": 274 }, { "epoch": 0.03633120850810846, "grad_norm": 0.308722585439682, "learning_rate": 0.00019939411003280007, "loss": 0.0576, "step": 275 }, { "epoch": 0.03646332199359249, "grad_norm": 0.3805916905403137, "learning_rate": 0.0001993895333174738, "loss": 0.0417, "step": 276 }, { "epoch": 0.036595435479076524, "grad_norm": 0.39375531673431396, "learning_rate": 0.00019938493943446394, "loss": 0.0544, "step": 277 }, { "epoch": 0.036727548964560555, "grad_norm": 0.37296605110168457, "learning_rate": 0.000199380328384564, "loss": 0.0424, "step": 278 }, { "epoch": 0.036859662450044586, "grad_norm": 0.5127437114715576, "learning_rate": 0.00019937570016857054, "loss": 0.0578, "step": 279 }, { "epoch": 0.03699177593552862, "grad_norm": 0.20692554116249084, "learning_rate": 0.00019937105478728292, "loss": 0.0304, "step": 280 }, { "epoch": 0.03712388942101265, "grad_norm": 0.2890003025531769, "learning_rate": 0.0001993663922415035, "loss": 0.0408, "step": 281 }, { "epoch": 0.03725600290649668, "grad_norm": 0.3227050006389618, "learning_rate": 0.00019936171253203772, "loss": 0.0573, "step": 282 }, { "epoch": 0.03738811639198071, "grad_norm": 0.3141216039657593, "learning_rate": 0.00019935701565969391, "loss": 0.0452, "step": 283 }, { "epoch": 0.03752022987746474, "grad_norm": 0.3515707552433014, "learning_rate": 0.00019935230162528334, "loss": 0.0557, "step": 284 }, { "epoch": 0.03765234336294877, "grad_norm": 0.24839921295642853, "learning_rate": 0.0001993475704296203, "loss": 0.041, "step": 285 }, { "epoch": 0.037784456848432804, "grad_norm": 0.2833830714225769, "learning_rate": 0.00019934282207352197, "loss": 0.0503, "step": 286 }, { "epoch": 0.037916570333916835, "grad_norm": 0.25666195154190063, "learning_rate": 0.0001993380565578086, "loss": 0.0413, "step": 287 }, { "epoch": 0.038048683819400866, "grad_norm": 0.41307997703552246, "learning_rate": 0.00019933327388330327, "loss": 0.042, "step": 288 }, { "epoch": 0.0381807973048849, "grad_norm": 0.28401657938957214, "learning_rate": 0.00019932847405083214, "loss": 0.0391, "step": 289 }, { "epoch": 0.03831291079036893, "grad_norm": 0.2572495937347412, "learning_rate": 0.00019932365706122433, "loss": 0.0392, "step": 290 }, { "epoch": 0.03844502427585296, "grad_norm": 0.2577889561653137, "learning_rate": 0.00019931882291531183, "loss": 0.0304, "step": 291 }, { "epoch": 0.03857713776133699, "grad_norm": 0.25664687156677246, "learning_rate": 0.00019931397161392965, "loss": 0.0364, "step": 292 }, { "epoch": 0.03870925124682102, "grad_norm": 0.3059316873550415, "learning_rate": 0.00019930910315791577, "loss": 0.0412, "step": 293 }, { "epoch": 0.03884136473230505, "grad_norm": 0.20961342751979828, "learning_rate": 0.00019930421754811112, "loss": 0.0317, "step": 294 }, { "epoch": 0.038973478217789084, "grad_norm": 0.2327074110507965, "learning_rate": 0.00019929931478535965, "loss": 0.0391, "step": 295 }, { "epoch": 0.03910559170327311, "grad_norm": 0.2746254503726959, "learning_rate": 0.00019929439487050812, "loss": 0.017, "step": 296 }, { "epoch": 0.03923770518875714, "grad_norm": 0.7959591150283813, "learning_rate": 0.00019928945780440645, "loss": 0.0459, "step": 297 }, { "epoch": 0.03936981867424117, "grad_norm": 0.2347254902124405, "learning_rate": 0.0001992845035879074, "loss": 0.0416, "step": 298 }, { "epoch": 0.0395019321597252, "grad_norm": 0.34481900930404663, "learning_rate": 0.00019927953222186666, "loss": 0.0609, "step": 299 }, { "epoch": 0.03963404564520923, "grad_norm": 0.46709200739860535, "learning_rate": 0.000199274543707143, "loss": 0.0428, "step": 300 }, { "epoch": 0.039766159130693264, "grad_norm": 0.29546868801116943, "learning_rate": 0.0001992695380445981, "loss": 0.0362, "step": 301 }, { "epoch": 0.039898272616177295, "grad_norm": 0.45198675990104675, "learning_rate": 0.00019926451523509653, "loss": 0.0766, "step": 302 }, { "epoch": 0.040030386101661326, "grad_norm": 0.25279825925827026, "learning_rate": 0.00019925947527950596, "loss": 0.051, "step": 303 }, { "epoch": 0.04016249958714536, "grad_norm": 0.22193700075149536, "learning_rate": 0.0001992544181786969, "loss": 0.0299, "step": 304 }, { "epoch": 0.04029461307262939, "grad_norm": 0.3298232853412628, "learning_rate": 0.00019924934393354292, "loss": 0.0468, "step": 305 }, { "epoch": 0.04042672655811342, "grad_norm": 0.22312577068805695, "learning_rate": 0.00019924425254492042, "loss": 0.0385, "step": 306 }, { "epoch": 0.04055884004359745, "grad_norm": 0.27001601457595825, "learning_rate": 0.00019923914401370893, "loss": 0.0292, "step": 307 }, { "epoch": 0.04069095352908148, "grad_norm": 0.2545408010482788, "learning_rate": 0.0001992340183407908, "loss": 0.043, "step": 308 }, { "epoch": 0.04082306701456551, "grad_norm": 0.22949855029582977, "learning_rate": 0.0001992288755270514, "loss": 0.0321, "step": 309 }, { "epoch": 0.040955180500049544, "grad_norm": 0.44335100054740906, "learning_rate": 0.00019922371557337906, "loss": 0.0524, "step": 310 }, { "epoch": 0.041087293985533575, "grad_norm": 0.46752268075942993, "learning_rate": 0.00019921853848066506, "loss": 0.0432, "step": 311 }, { "epoch": 0.041219407471017606, "grad_norm": 0.4074217677116394, "learning_rate": 0.0001992133442498037, "loss": 0.0359, "step": 312 }, { "epoch": 0.04135152095650164, "grad_norm": 0.33330097794532776, "learning_rate": 0.00019920813288169212, "loss": 0.0736, "step": 313 }, { "epoch": 0.04148363444198567, "grad_norm": 0.352234810590744, "learning_rate": 0.00019920290437723046, "loss": 0.0338, "step": 314 }, { "epoch": 0.0416157479274697, "grad_norm": 0.27098697423934937, "learning_rate": 0.00019919765873732193, "loss": 0.0482, "step": 315 }, { "epoch": 0.041747861412953724, "grad_norm": 0.29560789465904236, "learning_rate": 0.00019919239596287257, "loss": 0.0426, "step": 316 }, { "epoch": 0.041879974898437755, "grad_norm": 0.22003328800201416, "learning_rate": 0.00019918711605479146, "loss": 0.0299, "step": 317 }, { "epoch": 0.042012088383921786, "grad_norm": 0.20728716254234314, "learning_rate": 0.00019918181901399057, "loss": 0.0335, "step": 318 }, { "epoch": 0.04214420186940582, "grad_norm": 0.31240910291671753, "learning_rate": 0.00019917650484138486, "loss": 0.042, "step": 319 }, { "epoch": 0.04227631535488985, "grad_norm": 0.3703945279121399, "learning_rate": 0.00019917117353789225, "loss": 0.0403, "step": 320 }, { "epoch": 0.04240842884037388, "grad_norm": 0.2775070071220398, "learning_rate": 0.00019916582510443368, "loss": 0.0405, "step": 321 }, { "epoch": 0.04254054232585791, "grad_norm": 0.2771599292755127, "learning_rate": 0.00019916045954193292, "loss": 0.0455, "step": 322 }, { "epoch": 0.04267265581134194, "grad_norm": 0.4107271432876587, "learning_rate": 0.00019915507685131685, "loss": 0.0458, "step": 323 }, { "epoch": 0.04280476929682597, "grad_norm": 0.38203802704811096, "learning_rate": 0.00019914967703351513, "loss": 0.0495, "step": 324 }, { "epoch": 0.042936882782310004, "grad_norm": 0.26967763900756836, "learning_rate": 0.00019914426008946058, "loss": 0.0364, "step": 325 }, { "epoch": 0.043068996267794035, "grad_norm": 0.39296478033065796, "learning_rate": 0.00019913882602008877, "loss": 0.045, "step": 326 }, { "epoch": 0.043201109753278066, "grad_norm": 0.2375117838382721, "learning_rate": 0.00019913337482633844, "loss": 0.0359, "step": 327 }, { "epoch": 0.0433332232387621, "grad_norm": 0.3346708118915558, "learning_rate": 0.00019912790650915112, "loss": 0.0376, "step": 328 }, { "epoch": 0.04346533672424613, "grad_norm": 0.26708805561065674, "learning_rate": 0.00019912242106947137, "loss": 0.0325, "step": 329 }, { "epoch": 0.04359745020973016, "grad_norm": 0.3279438316822052, "learning_rate": 0.0001991169185082467, "loss": 0.0437, "step": 330 }, { "epoch": 0.04372956369521419, "grad_norm": 0.12221206724643707, "learning_rate": 0.00019911139882642758, "loss": 0.0154, "step": 331 }, { "epoch": 0.04386167718069822, "grad_norm": 0.2495148479938507, "learning_rate": 0.00019910586202496742, "loss": 0.0464, "step": 332 }, { "epoch": 0.04399379066618225, "grad_norm": 0.4149893820285797, "learning_rate": 0.0001991003081048226, "loss": 0.0423, "step": 333 }, { "epoch": 0.044125904151666284, "grad_norm": 0.33313971757888794, "learning_rate": 0.00019909473706695245, "loss": 0.0556, "step": 334 }, { "epoch": 0.044258017637150315, "grad_norm": 0.2713533341884613, "learning_rate": 0.00019908914891231927, "loss": 0.0279, "step": 335 }, { "epoch": 0.04439013112263434, "grad_norm": 0.2693321704864502, "learning_rate": 0.00019908354364188836, "loss": 0.0571, "step": 336 }, { "epoch": 0.04452224460811837, "grad_norm": 0.2995421290397644, "learning_rate": 0.00019907792125662782, "loss": 0.0404, "step": 337 }, { "epoch": 0.0446543580936024, "grad_norm": 0.2042950689792633, "learning_rate": 0.0001990722817575089, "loss": 0.0411, "step": 338 }, { "epoch": 0.04478647157908643, "grad_norm": 0.21087051928043365, "learning_rate": 0.0001990666251455057, "loss": 0.0291, "step": 339 }, { "epoch": 0.044918585064570464, "grad_norm": 0.24781399965286255, "learning_rate": 0.00019906095142159524, "loss": 0.0322, "step": 340 }, { "epoch": 0.045050698550054495, "grad_norm": 0.25378474593162537, "learning_rate": 0.00019905526058675764, "loss": 0.0461, "step": 341 }, { "epoch": 0.045182812035538526, "grad_norm": 0.25275951623916626, "learning_rate": 0.00019904955264197577, "loss": 0.0397, "step": 342 }, { "epoch": 0.04531492552102256, "grad_norm": 0.2260797619819641, "learning_rate": 0.0001990438275882357, "loss": 0.0413, "step": 343 }, { "epoch": 0.04544703900650659, "grad_norm": 0.18803325295448303, "learning_rate": 0.00019903808542652625, "loss": 0.0256, "step": 344 }, { "epoch": 0.04557915249199062, "grad_norm": 0.33592838048934937, "learning_rate": 0.0001990323261578393, "loss": 0.0469, "step": 345 }, { "epoch": 0.04571126597747465, "grad_norm": 0.20926974713802338, "learning_rate": 0.00019902654978316958, "loss": 0.0349, "step": 346 }, { "epoch": 0.04584337946295868, "grad_norm": 0.22873865067958832, "learning_rate": 0.00019902075630351496, "loss": 0.032, "step": 347 }, { "epoch": 0.04597549294844271, "grad_norm": 0.3676367998123169, "learning_rate": 0.0001990149457198761, "loss": 0.0507, "step": 348 }, { "epoch": 0.046107606433926744, "grad_norm": 0.23917162418365479, "learning_rate": 0.0001990091180332567, "loss": 0.0455, "step": 349 }, { "epoch": 0.046239719919410775, "grad_norm": 0.2903183102607727, "learning_rate": 0.0001990032732446633, "loss": 0.0466, "step": 350 }, { "epoch": 0.046371833404894806, "grad_norm": 0.19862964749336243, "learning_rate": 0.0001989974113551056, "loss": 0.0243, "step": 351 }, { "epoch": 0.04650394689037884, "grad_norm": 0.4329935312271118, "learning_rate": 0.00019899153236559603, "loss": 0.0513, "step": 352 }, { "epoch": 0.04663606037586287, "grad_norm": 0.21616338193416595, "learning_rate": 0.0001989856362771501, "loss": 0.0337, "step": 353 }, { "epoch": 0.0467681738613469, "grad_norm": 0.3290994465351105, "learning_rate": 0.00019897972309078628, "loss": 0.0327, "step": 354 }, { "epoch": 0.04690028734683093, "grad_norm": 0.6223381757736206, "learning_rate": 0.00019897379280752598, "loss": 0.06, "step": 355 }, { "epoch": 0.04703240083231496, "grad_norm": 0.2321864515542984, "learning_rate": 0.0001989678454283935, "loss": 0.0413, "step": 356 }, { "epoch": 0.047164514317798986, "grad_norm": 0.2947792708873749, "learning_rate": 0.00019896188095441613, "loss": 0.0541, "step": 357 }, { "epoch": 0.04729662780328302, "grad_norm": 0.307486355304718, "learning_rate": 0.00019895589938662416, "loss": 0.0485, "step": 358 }, { "epoch": 0.04742874128876705, "grad_norm": 0.2538960576057434, "learning_rate": 0.0001989499007260508, "loss": 0.0393, "step": 359 }, { "epoch": 0.04756085477425108, "grad_norm": 0.21938273310661316, "learning_rate": 0.00019894388497373214, "loss": 0.0272, "step": 360 }, { "epoch": 0.04769296825973511, "grad_norm": 0.2789864242076874, "learning_rate": 0.00019893785213070733, "loss": 0.0318, "step": 361 }, { "epoch": 0.04782508174521914, "grad_norm": 0.23993203043937683, "learning_rate": 0.00019893180219801844, "loss": 0.0355, "step": 362 }, { "epoch": 0.04795719523070317, "grad_norm": 0.3103472590446472, "learning_rate": 0.00019892573517671047, "loss": 0.0384, "step": 363 }, { "epoch": 0.048089308716187204, "grad_norm": 0.31523704528808594, "learning_rate": 0.0001989196510678314, "loss": 0.0326, "step": 364 }, { "epoch": 0.048221422201671235, "grad_norm": 0.33698925375938416, "learning_rate": 0.00019891354987243217, "loss": 0.0527, "step": 365 }, { "epoch": 0.048353535687155266, "grad_norm": 0.22326672077178955, "learning_rate": 0.00019890743159156656, "loss": 0.0345, "step": 366 }, { "epoch": 0.0484856491726393, "grad_norm": 0.32215416431427, "learning_rate": 0.00019890129622629146, "loss": 0.0706, "step": 367 }, { "epoch": 0.04861776265812333, "grad_norm": 0.31839343905448914, "learning_rate": 0.00019889514377766662, "loss": 0.038, "step": 368 }, { "epoch": 0.04874987614360736, "grad_norm": 0.23498553037643433, "learning_rate": 0.00019888897424675476, "loss": 0.0279, "step": 369 }, { "epoch": 0.04888198962909139, "grad_norm": 0.29461991786956787, "learning_rate": 0.00019888278763462158, "loss": 0.0339, "step": 370 }, { "epoch": 0.04901410311457542, "grad_norm": 0.27036118507385254, "learning_rate": 0.00019887658394233563, "loss": 0.0353, "step": 371 }, { "epoch": 0.04914621660005945, "grad_norm": 0.24991001188755035, "learning_rate": 0.00019887036317096856, "loss": 0.03, "step": 372 }, { "epoch": 0.049278330085543484, "grad_norm": 0.22536452114582062, "learning_rate": 0.00019886412532159486, "loss": 0.035, "step": 373 }, { "epoch": 0.049410443571027515, "grad_norm": 0.3040442168712616, "learning_rate": 0.00019885787039529198, "loss": 0.0324, "step": 374 }, { "epoch": 0.049542557056511546, "grad_norm": 0.32448676228523254, "learning_rate": 0.00019885159839314035, "loss": 0.0289, "step": 375 }, { "epoch": 0.04967467054199558, "grad_norm": 0.3627474904060364, "learning_rate": 0.0001988453093162234, "loss": 0.0639, "step": 376 }, { "epoch": 0.0498067840274796, "grad_norm": 0.30182796716690063, "learning_rate": 0.00019883900316562735, "loss": 0.037, "step": 377 }, { "epoch": 0.04993889751296363, "grad_norm": 0.21517710387706757, "learning_rate": 0.00019883267994244154, "loss": 0.0351, "step": 378 }, { "epoch": 0.050071010998447664, "grad_norm": 0.22155945003032684, "learning_rate": 0.0001988263396477582, "loss": 0.0228, "step": 379 }, { "epoch": 0.050203124483931695, "grad_norm": 0.2640284299850464, "learning_rate": 0.00019881998228267245, "loss": 0.0475, "step": 380 }, { "epoch": 0.050335237969415726, "grad_norm": 0.38280588388442993, "learning_rate": 0.00019881360784828242, "loss": 0.0468, "step": 381 }, { "epoch": 0.05046735145489976, "grad_norm": 0.23366643488407135, "learning_rate": 0.0001988072163456892, "loss": 0.0317, "step": 382 }, { "epoch": 0.05059946494038379, "grad_norm": 0.2938922643661499, "learning_rate": 0.00019880080777599673, "loss": 0.0419, "step": 383 }, { "epoch": 0.05073157842586782, "grad_norm": 0.23098677396774292, "learning_rate": 0.00019879438214031206, "loss": 0.0291, "step": 384 }, { "epoch": 0.05086369191135185, "grad_norm": 0.32484182715415955, "learning_rate": 0.00019878793943974506, "loss": 0.0349, "step": 385 }, { "epoch": 0.05099580539683588, "grad_norm": 0.356143593788147, "learning_rate": 0.00019878147967540859, "loss": 0.0435, "step": 386 }, { "epoch": 0.05112791888231991, "grad_norm": 0.24577073752880096, "learning_rate": 0.00019877500284841846, "loss": 0.0459, "step": 387 }, { "epoch": 0.051260032367803944, "grad_norm": 0.32349520921707153, "learning_rate": 0.00019876850895989337, "loss": 0.0282, "step": 388 }, { "epoch": 0.051392145853287975, "grad_norm": 0.2956535816192627, "learning_rate": 0.0001987619980109551, "loss": 0.033, "step": 389 }, { "epoch": 0.051524259338772006, "grad_norm": 0.26525723934173584, "learning_rate": 0.00019875547000272823, "loss": 0.0487, "step": 390 }, { "epoch": 0.05165637282425604, "grad_norm": 0.23867838084697723, "learning_rate": 0.00019874892493634038, "loss": 0.0371, "step": 391 }, { "epoch": 0.05178848630974007, "grad_norm": 0.281076580286026, "learning_rate": 0.00019874236281292208, "loss": 0.0373, "step": 392 }, { "epoch": 0.0519205997952241, "grad_norm": 0.2519184648990631, "learning_rate": 0.00019873578363360683, "loss": 0.0384, "step": 393 }, { "epoch": 0.05205271328070813, "grad_norm": 0.4297747015953064, "learning_rate": 0.00019872918739953103, "loss": 0.0717, "step": 394 }, { "epoch": 0.05218482676619216, "grad_norm": 0.203557550907135, "learning_rate": 0.0001987225741118341, "loss": 0.0274, "step": 395 }, { "epoch": 0.05231694025167619, "grad_norm": 0.24573923647403717, "learning_rate": 0.00019871594377165831, "loss": 0.0412, "step": 396 }, { "epoch": 0.05244905373716022, "grad_norm": 0.34401312470436096, "learning_rate": 0.00019870929638014895, "loss": 0.0288, "step": 397 }, { "epoch": 0.05258116722264425, "grad_norm": 0.35726824402809143, "learning_rate": 0.00019870263193845427, "loss": 0.0594, "step": 398 }, { "epoch": 0.05271328070812828, "grad_norm": 0.23975130915641785, "learning_rate": 0.00019869595044772536, "loss": 0.0359, "step": 399 }, { "epoch": 0.05284539419361231, "grad_norm": 0.271397203207016, "learning_rate": 0.00019868925190911636, "loss": 0.0447, "step": 400 }, { "epoch": 0.05297750767909634, "grad_norm": 0.28331097960472107, "learning_rate": 0.0001986825363237843, "loss": 0.0464, "step": 401 }, { "epoch": 0.05310962116458037, "grad_norm": 0.24692410230636597, "learning_rate": 0.0001986758036928892, "loss": 0.0362, "step": 402 }, { "epoch": 0.053241734650064404, "grad_norm": 0.6335957646369934, "learning_rate": 0.000198669054017594, "loss": 0.0383, "step": 403 }, { "epoch": 0.053373848135548435, "grad_norm": 0.26764601469039917, "learning_rate": 0.00019866228729906453, "loss": 0.032, "step": 404 }, { "epoch": 0.053505961621032466, "grad_norm": 0.3402880132198334, "learning_rate": 0.00019865550353846966, "loss": 0.0294, "step": 405 }, { "epoch": 0.0536380751065165, "grad_norm": 0.2578073740005493, "learning_rate": 0.00019864870273698113, "loss": 0.0468, "step": 406 }, { "epoch": 0.05377018859200053, "grad_norm": 0.2212483137845993, "learning_rate": 0.00019864188489577368, "loss": 0.037, "step": 407 }, { "epoch": 0.05390230207748456, "grad_norm": 0.2527289390563965, "learning_rate": 0.00019863505001602492, "loss": 0.0306, "step": 408 }, { "epoch": 0.05403441556296859, "grad_norm": 0.3121415674686432, "learning_rate": 0.00019862819809891548, "loss": 0.0359, "step": 409 }, { "epoch": 0.05416652904845262, "grad_norm": 0.3989887833595276, "learning_rate": 0.00019862132914562892, "loss": 0.0494, "step": 410 }, { "epoch": 0.05429864253393665, "grad_norm": 0.310231477022171, "learning_rate": 0.0001986144431573517, "loss": 0.0343, "step": 411 }, { "epoch": 0.054430756019420684, "grad_norm": 0.30125224590301514, "learning_rate": 0.00019860754013527326, "loss": 0.0351, "step": 412 }, { "epoch": 0.054562869504904715, "grad_norm": 0.28822439908981323, "learning_rate": 0.00019860062008058592, "loss": 0.0505, "step": 413 }, { "epoch": 0.054694982990388746, "grad_norm": 0.3362140357494354, "learning_rate": 0.00019859368299448505, "loss": 0.0489, "step": 414 }, { "epoch": 0.05482709647587278, "grad_norm": 0.22514215111732483, "learning_rate": 0.00019858672887816884, "loss": 0.0393, "step": 415 }, { "epoch": 0.05495920996135681, "grad_norm": 0.3549448251724243, "learning_rate": 0.00019857975773283855, "loss": 0.0313, "step": 416 }, { "epoch": 0.05509132344684084, "grad_norm": 0.238682359457016, "learning_rate": 0.00019857276955969827, "loss": 0.0437, "step": 417 }, { "epoch": 0.055223436932324864, "grad_norm": 0.24690861999988556, "learning_rate": 0.0001985657643599551, "loss": 0.0398, "step": 418 }, { "epoch": 0.055355550417808895, "grad_norm": 0.17593775689601898, "learning_rate": 0.00019855874213481903, "loss": 0.0232, "step": 419 }, { "epoch": 0.055487663903292926, "grad_norm": 0.22385098040103912, "learning_rate": 0.00019855170288550305, "loss": 0.035, "step": 420 }, { "epoch": 0.05561977738877696, "grad_norm": 0.27371707558631897, "learning_rate": 0.00019854464661322302, "loss": 0.038, "step": 421 }, { "epoch": 0.05575189087426099, "grad_norm": 0.2543024718761444, "learning_rate": 0.00019853757331919785, "loss": 0.0441, "step": 422 }, { "epoch": 0.05588400435974502, "grad_norm": 0.18014635145664215, "learning_rate": 0.00019853048300464925, "loss": 0.0296, "step": 423 }, { "epoch": 0.05601611784522905, "grad_norm": 0.2874230742454529, "learning_rate": 0.00019852337567080196, "loss": 0.0396, "step": 424 }, { "epoch": 0.05614823133071308, "grad_norm": 0.24500614404678345, "learning_rate": 0.00019851625131888363, "loss": 0.0405, "step": 425 }, { "epoch": 0.05628034481619711, "grad_norm": 0.277548611164093, "learning_rate": 0.00019850910995012488, "loss": 0.0256, "step": 426 }, { "epoch": 0.056412458301681144, "grad_norm": 0.5045779943466187, "learning_rate": 0.00019850195156575926, "loss": 0.0392, "step": 427 }, { "epoch": 0.056544571787165175, "grad_norm": 0.2808299958705902, "learning_rate": 0.0001984947761670232, "loss": 0.0339, "step": 428 }, { "epoch": 0.056676685272649206, "grad_norm": 0.2322196215391159, "learning_rate": 0.00019848758375515615, "loss": 0.0304, "step": 429 }, { "epoch": 0.05680879875813324, "grad_norm": 0.8448551893234253, "learning_rate": 0.00019848037433140044, "loss": 0.0885, "step": 430 }, { "epoch": 0.05694091224361727, "grad_norm": 0.24392534792423248, "learning_rate": 0.0001984731478970014, "loss": 0.0413, "step": 431 }, { "epoch": 0.0570730257291013, "grad_norm": 0.33474117517471313, "learning_rate": 0.00019846590445320723, "loss": 0.0417, "step": 432 }, { "epoch": 0.05720513921458533, "grad_norm": 0.251472532749176, "learning_rate": 0.0001984586440012691, "loss": 0.0291, "step": 433 }, { "epoch": 0.05733725270006936, "grad_norm": 0.21113821864128113, "learning_rate": 0.00019845136654244114, "loss": 0.0307, "step": 434 }, { "epoch": 0.05746936618555339, "grad_norm": 0.3923201858997345, "learning_rate": 0.00019844407207798037, "loss": 0.0482, "step": 435 }, { "epoch": 0.057601479671037424, "grad_norm": 0.29165613651275635, "learning_rate": 0.0001984367606091468, "loss": 0.0459, "step": 436 }, { "epoch": 0.057733593156521455, "grad_norm": 0.31841304898262024, "learning_rate": 0.00019842943213720332, "loss": 0.0452, "step": 437 }, { "epoch": 0.05786570664200548, "grad_norm": 0.19356763362884521, "learning_rate": 0.00019842208666341583, "loss": 0.0292, "step": 438 }, { "epoch": 0.05799782012748951, "grad_norm": 0.2828764021396637, "learning_rate": 0.00019841472418905305, "loss": 0.0418, "step": 439 }, { "epoch": 0.05812993361297354, "grad_norm": 0.7014762163162231, "learning_rate": 0.00019840734471538677, "loss": 0.0375, "step": 440 }, { "epoch": 0.05826204709845757, "grad_norm": 0.2510444223880768, "learning_rate": 0.00019839994824369167, "loss": 0.0371, "step": 441 }, { "epoch": 0.058394160583941604, "grad_norm": 0.4541257917881012, "learning_rate": 0.00019839253477524528, "loss": 0.0425, "step": 442 }, { "epoch": 0.058526274069425635, "grad_norm": 0.3985843062400818, "learning_rate": 0.0001983851043113282, "loss": 0.046, "step": 443 }, { "epoch": 0.058658387554909666, "grad_norm": 0.2939070165157318, "learning_rate": 0.00019837765685322385, "loss": 0.0465, "step": 444 }, { "epoch": 0.0587905010403937, "grad_norm": 0.2937294542789459, "learning_rate": 0.00019837019240221874, "loss": 0.0367, "step": 445 }, { "epoch": 0.05892261452587773, "grad_norm": 0.24315603077411652, "learning_rate": 0.00019836271095960206, "loss": 0.0344, "step": 446 }, { "epoch": 0.05905472801136176, "grad_norm": 0.3837631642818451, "learning_rate": 0.00019835521252666624, "loss": 0.0412, "step": 447 }, { "epoch": 0.05918684149684579, "grad_norm": 0.2079976350069046, "learning_rate": 0.00019834769710470643, "loss": 0.0312, "step": 448 }, { "epoch": 0.05931895498232982, "grad_norm": 0.35846373438835144, "learning_rate": 0.00019834016469502075, "loss": 0.0512, "step": 449 }, { "epoch": 0.05945106846781385, "grad_norm": 0.2271134853363037, "learning_rate": 0.00019833261529891033, "loss": 0.0344, "step": 450 }, { "epoch": 0.059583181953297884, "grad_norm": 0.26171258091926575, "learning_rate": 0.00019832504891767916, "loss": 0.0443, "step": 451 }, { "epoch": 0.059715295438781915, "grad_norm": 0.32468751072883606, "learning_rate": 0.00019831746555263417, "loss": 0.0275, "step": 452 }, { "epoch": 0.059847408924265946, "grad_norm": 0.3164912462234497, "learning_rate": 0.0001983098652050853, "loss": 0.0537, "step": 453 }, { "epoch": 0.05997952240974998, "grad_norm": 0.29063880443573, "learning_rate": 0.00019830224787634537, "loss": 0.0426, "step": 454 }, { "epoch": 0.06011163589523401, "grad_norm": 0.3254551589488983, "learning_rate": 0.00019829461356773008, "loss": 0.0668, "step": 455 }, { "epoch": 0.06024374938071804, "grad_norm": 0.2961951494216919, "learning_rate": 0.00019828696228055815, "loss": 0.0445, "step": 456 }, { "epoch": 0.06037586286620207, "grad_norm": 0.37507346272468567, "learning_rate": 0.00019827929401615115, "loss": 0.0359, "step": 457 }, { "epoch": 0.060507976351686095, "grad_norm": 0.3069281578063965, "learning_rate": 0.0001982716087758337, "loss": 0.0351, "step": 458 }, { "epoch": 0.060640089837170126, "grad_norm": 0.2815963923931122, "learning_rate": 0.0001982639065609332, "loss": 0.0609, "step": 459 }, { "epoch": 0.06077220332265416, "grad_norm": 0.3931083381175995, "learning_rate": 0.00019825618737278017, "loss": 0.0491, "step": 460 }, { "epoch": 0.06090431680813819, "grad_norm": 0.24349068105220795, "learning_rate": 0.00019824845121270787, "loss": 0.0398, "step": 461 }, { "epoch": 0.06103643029362222, "grad_norm": 0.260797917842865, "learning_rate": 0.00019824069808205259, "loss": 0.0379, "step": 462 }, { "epoch": 0.06116854377910625, "grad_norm": 0.22222347557544708, "learning_rate": 0.00019823292798215353, "loss": 0.0247, "step": 463 }, { "epoch": 0.06130065726459028, "grad_norm": 0.342363566160202, "learning_rate": 0.00019822514091435287, "loss": 0.0509, "step": 464 }, { "epoch": 0.06143277075007431, "grad_norm": 0.2042882740497589, "learning_rate": 0.00019821733687999568, "loss": 0.0351, "step": 465 }, { "epoch": 0.061564884235558344, "grad_norm": 0.18748803436756134, "learning_rate": 0.00019820951588042993, "loss": 0.0222, "step": 466 }, { "epoch": 0.061696997721042375, "grad_norm": 0.3864912688732147, "learning_rate": 0.00019820167791700653, "loss": 0.0499, "step": 467 }, { "epoch": 0.061829111206526406, "grad_norm": 0.2836579978466034, "learning_rate": 0.0001981938229910794, "loss": 0.0469, "step": 468 }, { "epoch": 0.06196122469201044, "grad_norm": 0.2501057982444763, "learning_rate": 0.00019818595110400531, "loss": 0.0336, "step": 469 }, { "epoch": 0.06209333817749447, "grad_norm": 0.1960388571023941, "learning_rate": 0.00019817806225714394, "loss": 0.0267, "step": 470 }, { "epoch": 0.0622254516629785, "grad_norm": 0.20142126083374023, "learning_rate": 0.00019817015645185801, "loss": 0.022, "step": 471 }, { "epoch": 0.06235756514846253, "grad_norm": 0.26167020201683044, "learning_rate": 0.00019816223368951307, "loss": 0.0348, "step": 472 }, { "epoch": 0.06248967863394656, "grad_norm": 0.313064306974411, "learning_rate": 0.00019815429397147764, "loss": 0.0393, "step": 473 }, { "epoch": 0.06262179211943059, "grad_norm": 0.23127563297748566, "learning_rate": 0.0001981463372991231, "loss": 0.029, "step": 474 }, { "epoch": 0.06275390560491462, "grad_norm": 0.2343924194574356, "learning_rate": 0.00019813836367382388, "loss": 0.04, "step": 475 }, { "epoch": 0.06288601909039865, "grad_norm": 0.35813525319099426, "learning_rate": 0.00019813037309695725, "loss": 0.0429, "step": 476 }, { "epoch": 0.06301813257588268, "grad_norm": 0.20709457993507385, "learning_rate": 0.00019812236556990346, "loss": 0.0315, "step": 477 }, { "epoch": 0.06315024606136671, "grad_norm": 0.20351076126098633, "learning_rate": 0.00019811434109404563, "loss": 0.0271, "step": 478 }, { "epoch": 0.06328235954685074, "grad_norm": 0.7306005358695984, "learning_rate": 0.00019810629967076984, "loss": 0.0406, "step": 479 }, { "epoch": 0.06341447303233477, "grad_norm": 0.2870320975780487, "learning_rate": 0.0001980982413014651, "loss": 0.0367, "step": 480 }, { "epoch": 0.0635465865178188, "grad_norm": 0.3635860085487366, "learning_rate": 0.00019809016598752334, "loss": 0.0385, "step": 481 }, { "epoch": 0.06367870000330283, "grad_norm": 0.2811727523803711, "learning_rate": 0.00019808207373033944, "loss": 0.0407, "step": 482 }, { "epoch": 0.06381081348878687, "grad_norm": 0.18366318941116333, "learning_rate": 0.00019807396453131118, "loss": 0.0329, "step": 483 }, { "epoch": 0.0639429269742709, "grad_norm": 0.23548203706741333, "learning_rate": 0.00019806583839183922, "loss": 0.0273, "step": 484 }, { "epoch": 0.06407504045975493, "grad_norm": 0.4355567395687103, "learning_rate": 0.00019805769531332728, "loss": 0.0531, "step": 485 }, { "epoch": 0.06420715394523896, "grad_norm": 0.38908225297927856, "learning_rate": 0.00019804953529718185, "loss": 0.0445, "step": 486 }, { "epoch": 0.06433926743072299, "grad_norm": 0.2681350111961365, "learning_rate": 0.0001980413583448125, "loss": 0.0324, "step": 487 }, { "epoch": 0.06447138091620702, "grad_norm": 0.32850104570388794, "learning_rate": 0.00019803316445763156, "loss": 0.0547, "step": 488 }, { "epoch": 0.06460349440169105, "grad_norm": 0.20159336924552917, "learning_rate": 0.00019802495363705446, "loss": 0.0348, "step": 489 }, { "epoch": 0.06473560788717508, "grad_norm": 0.23026055097579956, "learning_rate": 0.00019801672588449937, "loss": 0.041, "step": 490 }, { "epoch": 0.06486772137265912, "grad_norm": 0.23433181643486023, "learning_rate": 0.00019800848120138755, "loss": 0.0377, "step": 491 }, { "epoch": 0.06499983485814315, "grad_norm": 0.26035311818122864, "learning_rate": 0.0001980002195891431, "loss": 0.0342, "step": 492 }, { "epoch": 0.06513194834362718, "grad_norm": 0.23564809560775757, "learning_rate": 0.00019799194104919306, "loss": 0.0264, "step": 493 }, { "epoch": 0.06526406182911121, "grad_norm": 0.23754185438156128, "learning_rate": 0.00019798364558296737, "loss": 0.0377, "step": 494 }, { "epoch": 0.06539617531459524, "grad_norm": 0.23798996210098267, "learning_rate": 0.0001979753331918989, "loss": 0.039, "step": 495 }, { "epoch": 0.06552828880007927, "grad_norm": 0.2444775253534317, "learning_rate": 0.00019796700387742354, "loss": 0.0228, "step": 496 }, { "epoch": 0.0656604022855633, "grad_norm": 0.3243919909000397, "learning_rate": 0.00019795865764097998, "loss": 0.0501, "step": 497 }, { "epoch": 0.06579251577104733, "grad_norm": 0.29826414585113525, "learning_rate": 0.00019795029448400984, "loss": 0.0343, "step": 498 }, { "epoch": 0.06592462925653136, "grad_norm": 0.30492937564849854, "learning_rate": 0.00019794191440795775, "loss": 0.0368, "step": 499 }, { "epoch": 0.0660567427420154, "grad_norm": 0.3303871750831604, "learning_rate": 0.00019793351741427117, "loss": 0.0397, "step": 500 }, { "epoch": 0.06618885622749943, "grad_norm": 0.2616530656814575, "learning_rate": 0.00019792510350440058, "loss": 0.0295, "step": 501 }, { "epoch": 0.06632096971298346, "grad_norm": 0.2011949121952057, "learning_rate": 0.00019791667267979928, "loss": 0.0206, "step": 502 }, { "epoch": 0.06645308319846749, "grad_norm": 0.985835611820221, "learning_rate": 0.00019790822494192357, "loss": 0.0354, "step": 503 }, { "epoch": 0.06658519668395152, "grad_norm": 0.26946449279785156, "learning_rate": 0.00019789976029223257, "loss": 0.0402, "step": 504 }, { "epoch": 0.06671731016943555, "grad_norm": 0.2122262567281723, "learning_rate": 0.00019789127873218843, "loss": 0.0352, "step": 505 }, { "epoch": 0.06684942365491958, "grad_norm": 0.2970362603664398, "learning_rate": 0.00019788278026325627, "loss": 0.0352, "step": 506 }, { "epoch": 0.06698153714040361, "grad_norm": 0.34310245513916016, "learning_rate": 0.0001978742648869039, "loss": 0.0378, "step": 507 }, { "epoch": 0.06711365062588764, "grad_norm": 0.3422040343284607, "learning_rate": 0.00019786573260460226, "loss": 0.0623, "step": 508 }, { "epoch": 0.06724576411137166, "grad_norm": 0.23203279078006744, "learning_rate": 0.00019785718341782516, "loss": 0.0435, "step": 509 }, { "epoch": 0.06737787759685569, "grad_norm": 0.3923529386520386, "learning_rate": 0.00019784861732804926, "loss": 0.0358, "step": 510 }, { "epoch": 0.06750999108233972, "grad_norm": 0.32772544026374817, "learning_rate": 0.00019784003433675421, "loss": 0.0418, "step": 511 }, { "epoch": 0.06764210456782375, "grad_norm": 0.2983352541923523, "learning_rate": 0.00019783143444542257, "loss": 0.0333, "step": 512 }, { "epoch": 0.06777421805330779, "grad_norm": 0.3037568926811218, "learning_rate": 0.00019782281765553985, "loss": 0.045, "step": 513 }, { "epoch": 0.06790633153879182, "grad_norm": 0.2756015658378601, "learning_rate": 0.00019781418396859436, "loss": 0.0381, "step": 514 }, { "epoch": 0.06803844502427585, "grad_norm": 0.2572304606437683, "learning_rate": 0.00019780553338607745, "loss": 0.0423, "step": 515 }, { "epoch": 0.06817055850975988, "grad_norm": 0.40779799222946167, "learning_rate": 0.00019779686590948336, "loss": 0.0407, "step": 516 }, { "epoch": 0.06830267199524391, "grad_norm": 0.32563602924346924, "learning_rate": 0.00019778818154030922, "loss": 0.058, "step": 517 }, { "epoch": 0.06843478548072794, "grad_norm": 0.19639067351818085, "learning_rate": 0.0001977794802800551, "loss": 0.0232, "step": 518 }, { "epoch": 0.06856689896621197, "grad_norm": 0.3198147118091583, "learning_rate": 0.00019777076213022397, "loss": 0.0426, "step": 519 }, { "epoch": 0.068699012451696, "grad_norm": 0.34367501735687256, "learning_rate": 0.0001977620270923217, "loss": 0.0524, "step": 520 }, { "epoch": 0.06883112593718003, "grad_norm": 0.2864331305027008, "learning_rate": 0.00019775327516785714, "loss": 0.0386, "step": 521 }, { "epoch": 0.06896323942266407, "grad_norm": 0.2509300708770752, "learning_rate": 0.00019774450635834203, "loss": 0.0478, "step": 522 }, { "epoch": 0.0690953529081481, "grad_norm": 0.3212604522705078, "learning_rate": 0.000197735720665291, "loss": 0.0399, "step": 523 }, { "epoch": 0.06922746639363213, "grad_norm": 0.25791066884994507, "learning_rate": 0.00019772691809022161, "loss": 0.0352, "step": 524 }, { "epoch": 0.06935957987911616, "grad_norm": 0.3202870190143585, "learning_rate": 0.00019771809863465437, "loss": 0.0273, "step": 525 }, { "epoch": 0.06949169336460019, "grad_norm": 0.292076975107193, "learning_rate": 0.0001977092623001126, "loss": 0.047, "step": 526 }, { "epoch": 0.06962380685008422, "grad_norm": 0.2345723956823349, "learning_rate": 0.0001977004090881227, "loss": 0.0356, "step": 527 }, { "epoch": 0.06975592033556825, "grad_norm": 0.2687516212463379, "learning_rate": 0.00019769153900021388, "loss": 0.043, "step": 528 }, { "epoch": 0.06988803382105228, "grad_norm": 0.24501895904541016, "learning_rate": 0.00019768265203791826, "loss": 0.0347, "step": 529 }, { "epoch": 0.07002014730653632, "grad_norm": 0.29931098222732544, "learning_rate": 0.00019767374820277086, "loss": 0.0484, "step": 530 }, { "epoch": 0.07015226079202035, "grad_norm": 0.19638241827487946, "learning_rate": 0.0001976648274963097, "loss": 0.0292, "step": 531 }, { "epoch": 0.07028437427750438, "grad_norm": 0.22806896269321442, "learning_rate": 0.00019765588992007568, "loss": 0.0256, "step": 532 }, { "epoch": 0.07041648776298841, "grad_norm": 0.24054238200187683, "learning_rate": 0.00019764693547561255, "loss": 0.025, "step": 533 }, { "epoch": 0.07054860124847244, "grad_norm": 0.24966415762901306, "learning_rate": 0.00019763796416446706, "loss": 0.0322, "step": 534 }, { "epoch": 0.07068071473395647, "grad_norm": 0.25949081778526306, "learning_rate": 0.00019762897598818883, "loss": 0.0268, "step": 535 }, { "epoch": 0.0708128282194405, "grad_norm": 0.34949883818626404, "learning_rate": 0.00019761997094833037, "loss": 0.0434, "step": 536 }, { "epoch": 0.07094494170492453, "grad_norm": 0.3490285873413086, "learning_rate": 0.0001976109490464472, "loss": 0.0231, "step": 537 }, { "epoch": 0.07107705519040856, "grad_norm": 0.22599942982196808, "learning_rate": 0.0001976019102840976, "loss": 0.0242, "step": 538 }, { "epoch": 0.0712091686758926, "grad_norm": 0.2737880051136017, "learning_rate": 0.0001975928546628429, "loss": 0.0341, "step": 539 }, { "epoch": 0.07134128216137663, "grad_norm": 0.2882981598377228, "learning_rate": 0.00019758378218424726, "loss": 0.0378, "step": 540 }, { "epoch": 0.07147339564686066, "grad_norm": 0.20302222669124603, "learning_rate": 0.00019757469284987784, "loss": 0.0371, "step": 541 }, { "epoch": 0.07160550913234469, "grad_norm": 0.23300130665302277, "learning_rate": 0.0001975655866613046, "loss": 0.0258, "step": 542 }, { "epoch": 0.07173762261782872, "grad_norm": 0.23061759769916534, "learning_rate": 0.00019755646362010044, "loss": 0.0262, "step": 543 }, { "epoch": 0.07186973610331275, "grad_norm": 0.3055586516857147, "learning_rate": 0.00019754732372784126, "loss": 0.0406, "step": 544 }, { "epoch": 0.07200184958879678, "grad_norm": 0.20383194088935852, "learning_rate": 0.00019753816698610577, "loss": 0.0263, "step": 545 }, { "epoch": 0.07213396307428081, "grad_norm": 0.25024712085723877, "learning_rate": 0.00019752899339647563, "loss": 0.0299, "step": 546 }, { "epoch": 0.07226607655976484, "grad_norm": 0.21452884376049042, "learning_rate": 0.00019751980296053541, "loss": 0.0293, "step": 547 }, { "epoch": 0.07239819004524888, "grad_norm": 0.24472303688526154, "learning_rate": 0.00019751059567987259, "loss": 0.0251, "step": 548 }, { "epoch": 0.0725303035307329, "grad_norm": 0.28603652119636536, "learning_rate": 0.0001975013715560775, "loss": 0.0439, "step": 549 }, { "epoch": 0.07266241701621692, "grad_norm": 0.349922776222229, "learning_rate": 0.00019749213059074353, "loss": 0.0497, "step": 550 }, { "epoch": 0.07279453050170095, "grad_norm": 0.2149500995874405, "learning_rate": 0.00019748287278546683, "loss": 0.0227, "step": 551 }, { "epoch": 0.07292664398718499, "grad_norm": 0.2354866862297058, "learning_rate": 0.00019747359814184653, "loss": 0.0252, "step": 552 }, { "epoch": 0.07305875747266902, "grad_norm": 0.3029223680496216, "learning_rate": 0.00019746430666148462, "loss": 0.0422, "step": 553 }, { "epoch": 0.07319087095815305, "grad_norm": 0.364507257938385, "learning_rate": 0.00019745499834598605, "loss": 0.0459, "step": 554 }, { "epoch": 0.07332298444363708, "grad_norm": 0.2667244076728821, "learning_rate": 0.00019744567319695869, "loss": 0.0362, "step": 555 }, { "epoch": 0.07345509792912111, "grad_norm": 0.35405200719833374, "learning_rate": 0.00019743633121601322, "loss": 0.04, "step": 556 }, { "epoch": 0.07358721141460514, "grad_norm": 0.21454021334648132, "learning_rate": 0.00019742697240476332, "loss": 0.0365, "step": 557 }, { "epoch": 0.07371932490008917, "grad_norm": 0.29431477189064026, "learning_rate": 0.0001974175967648256, "loss": 0.0334, "step": 558 }, { "epoch": 0.0738514383855732, "grad_norm": 0.22505509853363037, "learning_rate": 0.00019740820429781943, "loss": 0.0398, "step": 559 }, { "epoch": 0.07398355187105723, "grad_norm": 0.22686269879341125, "learning_rate": 0.00019739879500536725, "loss": 0.0317, "step": 560 }, { "epoch": 0.07411566535654127, "grad_norm": 0.2540476322174072, "learning_rate": 0.00019738936888909434, "loss": 0.029, "step": 561 }, { "epoch": 0.0742477788420253, "grad_norm": 0.22462958097457886, "learning_rate": 0.00019737992595062886, "loss": 0.0239, "step": 562 }, { "epoch": 0.07437989232750933, "grad_norm": 0.4206967353820801, "learning_rate": 0.00019737046619160194, "loss": 0.0497, "step": 563 }, { "epoch": 0.07451200581299336, "grad_norm": 0.2678888738155365, "learning_rate": 0.0001973609896136475, "loss": 0.029, "step": 564 }, { "epoch": 0.07464411929847739, "grad_norm": 0.19996492564678192, "learning_rate": 0.0001973514962184025, "loss": 0.023, "step": 565 }, { "epoch": 0.07477623278396142, "grad_norm": 0.32264792919158936, "learning_rate": 0.00019734198600750678, "loss": 0.0366, "step": 566 }, { "epoch": 0.07490834626944545, "grad_norm": 0.23401503264904022, "learning_rate": 0.00019733245898260297, "loss": 0.0279, "step": 567 }, { "epoch": 0.07504045975492948, "grad_norm": 0.19676105678081512, "learning_rate": 0.00019732291514533673, "loss": 0.0406, "step": 568 }, { "epoch": 0.07517257324041351, "grad_norm": 0.37109094858169556, "learning_rate": 0.00019731335449735659, "loss": 0.0452, "step": 569 }, { "epoch": 0.07530468672589755, "grad_norm": 0.19893021881580353, "learning_rate": 0.00019730377704031392, "loss": 0.028, "step": 570 }, { "epoch": 0.07543680021138158, "grad_norm": 0.24675323069095612, "learning_rate": 0.00019729418277586306, "loss": 0.032, "step": 571 }, { "epoch": 0.07556891369686561, "grad_norm": 0.2152172029018402, "learning_rate": 0.00019728457170566132, "loss": 0.0341, "step": 572 }, { "epoch": 0.07570102718234964, "grad_norm": 0.23518556356430054, "learning_rate": 0.00019727494383136874, "loss": 0.0295, "step": 573 }, { "epoch": 0.07583314066783367, "grad_norm": 0.23954501748085022, "learning_rate": 0.00019726529915464842, "loss": 0.0296, "step": 574 }, { "epoch": 0.0759652541533177, "grad_norm": 0.25000494718551636, "learning_rate": 0.00019725563767716625, "loss": 0.0353, "step": 575 }, { "epoch": 0.07609736763880173, "grad_norm": 0.34247976541519165, "learning_rate": 0.00019724595940059106, "loss": 0.0314, "step": 576 }, { "epoch": 0.07622948112428576, "grad_norm": 0.36853498220443726, "learning_rate": 0.00019723626432659462, "loss": 0.0478, "step": 577 }, { "epoch": 0.0763615946097698, "grad_norm": 0.20818112790584564, "learning_rate": 0.0001972265524568516, "loss": 0.0357, "step": 578 }, { "epoch": 0.07649370809525383, "grad_norm": 0.40607398748397827, "learning_rate": 0.0001972168237930395, "loss": 0.053, "step": 579 }, { "epoch": 0.07662582158073786, "grad_norm": 0.2218083292245865, "learning_rate": 0.0001972070783368388, "loss": 0.0375, "step": 580 }, { "epoch": 0.07675793506622189, "grad_norm": 0.27258095145225525, "learning_rate": 0.00019719731608993282, "loss": 0.037, "step": 581 }, { "epoch": 0.07689004855170592, "grad_norm": 0.420881450176239, "learning_rate": 0.0001971875370540078, "loss": 0.0456, "step": 582 }, { "epoch": 0.07702216203718995, "grad_norm": 0.27685117721557617, "learning_rate": 0.0001971777412307529, "loss": 0.0325, "step": 583 }, { "epoch": 0.07715427552267398, "grad_norm": 0.28919875621795654, "learning_rate": 0.00019716792862186014, "loss": 0.0214, "step": 584 }, { "epoch": 0.07728638900815801, "grad_norm": 0.23569947481155396, "learning_rate": 0.0001971580992290245, "loss": 0.0331, "step": 585 }, { "epoch": 0.07741850249364204, "grad_norm": 0.2427627444267273, "learning_rate": 0.0001971482530539438, "loss": 0.0329, "step": 586 }, { "epoch": 0.07755061597912608, "grad_norm": 0.19415737688541412, "learning_rate": 0.0001971383900983188, "loss": 0.0262, "step": 587 }, { "epoch": 0.0776827294646101, "grad_norm": 0.46439921855926514, "learning_rate": 0.00019712851036385315, "loss": 0.0417, "step": 588 }, { "epoch": 0.07781484295009414, "grad_norm": 0.18036745488643646, "learning_rate": 0.00019711861385225338, "loss": 0.0179, "step": 589 }, { "epoch": 0.07794695643557817, "grad_norm": 0.22073324024677277, "learning_rate": 0.00019710870056522889, "loss": 0.0349, "step": 590 }, { "epoch": 0.07807906992106219, "grad_norm": 0.2414667010307312, "learning_rate": 0.00019709877050449204, "loss": 0.0356, "step": 591 }, { "epoch": 0.07821118340654622, "grad_norm": 0.19599366188049316, "learning_rate": 0.0001970888236717581, "loss": 0.0229, "step": 592 }, { "epoch": 0.07834329689203025, "grad_norm": 0.23867139220237732, "learning_rate": 0.00019707886006874515, "loss": 0.0241, "step": 593 }, { "epoch": 0.07847541037751428, "grad_norm": 0.3124195635318756, "learning_rate": 0.0001970688796971742, "loss": 0.0435, "step": 594 }, { "epoch": 0.07860752386299831, "grad_norm": 0.27740946412086487, "learning_rate": 0.00019705888255876927, "loss": 0.0337, "step": 595 }, { "epoch": 0.07873963734848234, "grad_norm": 0.20327328145503998, "learning_rate": 0.00019704886865525706, "loss": 0.0297, "step": 596 }, { "epoch": 0.07887175083396637, "grad_norm": 0.37126624584198, "learning_rate": 0.00019703883798836738, "loss": 0.0553, "step": 597 }, { "epoch": 0.0790038643194504, "grad_norm": 0.22105854749679565, "learning_rate": 0.0001970287905598328, "loss": 0.041, "step": 598 }, { "epoch": 0.07913597780493443, "grad_norm": 0.22515787184238434, "learning_rate": 0.0001970187263713888, "loss": 0.0295, "step": 599 }, { "epoch": 0.07926809129041847, "grad_norm": 0.47252383828163147, "learning_rate": 0.0001970086454247738, "loss": 0.048, "step": 600 }, { "epoch": 0.0794002047759025, "grad_norm": 0.24759109318256378, "learning_rate": 0.0001969985477217291, "loss": 0.0274, "step": 601 }, { "epoch": 0.07953231826138653, "grad_norm": 0.41675540804862976, "learning_rate": 0.0001969884332639989, "loss": 0.0379, "step": 602 }, { "epoch": 0.07966443174687056, "grad_norm": 0.2635309398174286, "learning_rate": 0.0001969783020533303, "loss": 0.0449, "step": 603 }, { "epoch": 0.07979654523235459, "grad_norm": 0.25403887033462524, "learning_rate": 0.00019696815409147317, "loss": 0.0387, "step": 604 }, { "epoch": 0.07992865871783862, "grad_norm": 0.21932649612426758, "learning_rate": 0.00019695798938018053, "loss": 0.036, "step": 605 }, { "epoch": 0.08006077220332265, "grad_norm": 0.23342837393283844, "learning_rate": 0.00019694780792120807, "loss": 0.0431, "step": 606 }, { "epoch": 0.08019288568880668, "grad_norm": 0.5543757081031799, "learning_rate": 0.00019693760971631444, "loss": 0.0539, "step": 607 }, { "epoch": 0.08032499917429071, "grad_norm": 0.29595449566841125, "learning_rate": 0.00019692739476726118, "loss": 0.021, "step": 608 }, { "epoch": 0.08045711265977475, "grad_norm": 0.24539723992347717, "learning_rate": 0.0001969171630758128, "loss": 0.0386, "step": 609 }, { "epoch": 0.08058922614525878, "grad_norm": 0.20920851826667786, "learning_rate": 0.0001969069146437365, "loss": 0.0207, "step": 610 }, { "epoch": 0.08072133963074281, "grad_norm": 0.3386335074901581, "learning_rate": 0.00019689664947280267, "loss": 0.0408, "step": 611 }, { "epoch": 0.08085345311622684, "grad_norm": 0.3359006643295288, "learning_rate": 0.00019688636756478434, "loss": 0.0548, "step": 612 }, { "epoch": 0.08098556660171087, "grad_norm": 0.3495159149169922, "learning_rate": 0.00019687606892145748, "loss": 0.0353, "step": 613 }, { "epoch": 0.0811176800871949, "grad_norm": 0.34049656987190247, "learning_rate": 0.00019686575354460107, "loss": 0.0547, "step": 614 }, { "epoch": 0.08124979357267893, "grad_norm": 0.37891748547554016, "learning_rate": 0.00019685542143599684, "loss": 0.0415, "step": 615 }, { "epoch": 0.08138190705816296, "grad_norm": 0.1867925226688385, "learning_rate": 0.0001968450725974295, "loss": 0.0279, "step": 616 }, { "epoch": 0.081514020543647, "grad_norm": 0.2333107739686966, "learning_rate": 0.00019683470703068664, "loss": 0.0297, "step": 617 }, { "epoch": 0.08164613402913103, "grad_norm": 0.2671178877353668, "learning_rate": 0.0001968243247375586, "loss": 0.0264, "step": 618 }, { "epoch": 0.08177824751461506, "grad_norm": 0.2965892553329468, "learning_rate": 0.00019681392571983887, "loss": 0.0354, "step": 619 }, { "epoch": 0.08191036100009909, "grad_norm": 0.23038817942142487, "learning_rate": 0.00019680350997932364, "loss": 0.0332, "step": 620 }, { "epoch": 0.08204247448558312, "grad_norm": 0.3962733745574951, "learning_rate": 0.000196793077517812, "loss": 0.0359, "step": 621 }, { "epoch": 0.08217458797106715, "grad_norm": 0.26966428756713867, "learning_rate": 0.00019678262833710598, "loss": 0.0279, "step": 622 }, { "epoch": 0.08230670145655118, "grad_norm": 0.31382763385772705, "learning_rate": 0.00019677216243901052, "loss": 0.0317, "step": 623 }, { "epoch": 0.08243881494203521, "grad_norm": 0.23322831094264984, "learning_rate": 0.00019676167982533334, "loss": 0.046, "step": 624 }, { "epoch": 0.08257092842751924, "grad_norm": 0.2607961595058441, "learning_rate": 0.00019675118049788514, "loss": 0.03, "step": 625 }, { "epoch": 0.08270304191300328, "grad_norm": 0.18060070276260376, "learning_rate": 0.00019674066445847952, "loss": 0.0272, "step": 626 }, { "epoch": 0.0828351553984873, "grad_norm": 0.24973896145820618, "learning_rate": 0.0001967301317089329, "loss": 0.0412, "step": 627 }, { "epoch": 0.08296726888397134, "grad_norm": 0.25361204147338867, "learning_rate": 0.00019671958225106462, "loss": 0.0294, "step": 628 }, { "epoch": 0.08309938236945537, "grad_norm": 0.38416653871536255, "learning_rate": 0.00019670901608669685, "loss": 0.0396, "step": 629 }, { "epoch": 0.0832314958549394, "grad_norm": 0.21381421387195587, "learning_rate": 0.0001966984332176548, "loss": 0.0391, "step": 630 }, { "epoch": 0.08336360934042342, "grad_norm": 0.2730841636657715, "learning_rate": 0.0001966878336457664, "loss": 0.0288, "step": 631 }, { "epoch": 0.08349572282590745, "grad_norm": 0.1917334944009781, "learning_rate": 0.00019667721737286252, "loss": 0.0222, "step": 632 }, { "epoch": 0.08362783631139148, "grad_norm": 0.27792686223983765, "learning_rate": 0.00019666658440077695, "loss": 0.035, "step": 633 }, { "epoch": 0.08375994979687551, "grad_norm": 0.4698370397090912, "learning_rate": 0.00019665593473134631, "loss": 0.0394, "step": 634 }, { "epoch": 0.08389206328235954, "grad_norm": 0.38890373706817627, "learning_rate": 0.00019664526836641016, "loss": 0.0304, "step": 635 }, { "epoch": 0.08402417676784357, "grad_norm": 0.25699278712272644, "learning_rate": 0.00019663458530781093, "loss": 0.0305, "step": 636 }, { "epoch": 0.0841562902533276, "grad_norm": 0.3510453999042511, "learning_rate": 0.00019662388555739387, "loss": 0.0341, "step": 637 }, { "epoch": 0.08428840373881163, "grad_norm": 0.21721839904785156, "learning_rate": 0.00019661316911700715, "loss": 0.0276, "step": 638 }, { "epoch": 0.08442051722429567, "grad_norm": 0.3929044008255005, "learning_rate": 0.0001966024359885019, "loss": 0.0459, "step": 639 }, { "epoch": 0.0845526307097797, "grad_norm": 0.3013642430305481, "learning_rate": 0.000196591686173732, "loss": 0.0428, "step": 640 }, { "epoch": 0.08468474419526373, "grad_norm": 0.2829546630382538, "learning_rate": 0.00019658091967455436, "loss": 0.0221, "step": 641 }, { "epoch": 0.08481685768074776, "grad_norm": 0.3992788791656494, "learning_rate": 0.00019657013649282865, "loss": 0.0298, "step": 642 }, { "epoch": 0.08494897116623179, "grad_norm": 0.24115873873233795, "learning_rate": 0.00019655933663041743, "loss": 0.0273, "step": 643 }, { "epoch": 0.08508108465171582, "grad_norm": 0.22903680801391602, "learning_rate": 0.0001965485200891862, "loss": 0.0306, "step": 644 }, { "epoch": 0.08521319813719985, "grad_norm": 0.24627751111984253, "learning_rate": 0.00019653768687100334, "loss": 0.037, "step": 645 }, { "epoch": 0.08534531162268388, "grad_norm": 0.23460929095745087, "learning_rate": 0.00019652683697774008, "loss": 0.0289, "step": 646 }, { "epoch": 0.08547742510816791, "grad_norm": 0.4498720169067383, "learning_rate": 0.0001965159704112705, "loss": 0.0362, "step": 647 }, { "epoch": 0.08560953859365195, "grad_norm": 0.32583916187286377, "learning_rate": 0.0001965050871734716, "loss": 0.038, "step": 648 }, { "epoch": 0.08574165207913598, "grad_norm": 0.20444458723068237, "learning_rate": 0.00019649418726622327, "loss": 0.0233, "step": 649 }, { "epoch": 0.08587376556462001, "grad_norm": 0.20803149044513702, "learning_rate": 0.00019648327069140832, "loss": 0.0187, "step": 650 }, { "epoch": 0.08600587905010404, "grad_norm": 0.21215280890464783, "learning_rate": 0.00019647233745091226, "loss": 0.0331, "step": 651 }, { "epoch": 0.08613799253558807, "grad_norm": 0.24561989307403564, "learning_rate": 0.00019646138754662374, "loss": 0.0301, "step": 652 }, { "epoch": 0.0862701060210721, "grad_norm": 0.3120889961719513, "learning_rate": 0.00019645042098043406, "loss": 0.0407, "step": 653 }, { "epoch": 0.08640221950655613, "grad_norm": 0.22205640375614166, "learning_rate": 0.0001964394377542375, "loss": 0.0402, "step": 654 }, { "epoch": 0.08653433299204016, "grad_norm": 0.24051034450531006, "learning_rate": 0.00019642843786993124, "loss": 0.0182, "step": 655 }, { "epoch": 0.0866664464775242, "grad_norm": 0.23244361579418182, "learning_rate": 0.00019641742132941529, "loss": 0.0268, "step": 656 }, { "epoch": 0.08679855996300823, "grad_norm": 0.30195531249046326, "learning_rate": 0.00019640638813459252, "loss": 0.042, "step": 657 }, { "epoch": 0.08693067344849226, "grad_norm": 0.2843739092350006, "learning_rate": 0.00019639533828736875, "loss": 0.0314, "step": 658 }, { "epoch": 0.08706278693397629, "grad_norm": 0.33743587136268616, "learning_rate": 0.00019638427178965263, "loss": 0.0342, "step": 659 }, { "epoch": 0.08719490041946032, "grad_norm": 0.2209853231906891, "learning_rate": 0.0001963731886433557, "loss": 0.0436, "step": 660 }, { "epoch": 0.08732701390494435, "grad_norm": 0.20948272943496704, "learning_rate": 0.00019636208885039232, "loss": 0.0333, "step": 661 }, { "epoch": 0.08745912739042838, "grad_norm": 0.27300870418548584, "learning_rate": 0.00019635097241267979, "loss": 0.0435, "step": 662 }, { "epoch": 0.08759124087591241, "grad_norm": 0.21323652565479279, "learning_rate": 0.0001963398393321383, "loss": 0.023, "step": 663 }, { "epoch": 0.08772335436139644, "grad_norm": 0.2643250823020935, "learning_rate": 0.00019632868961069085, "loss": 0.0451, "step": 664 }, { "epoch": 0.08785546784688048, "grad_norm": 0.3321099579334259, "learning_rate": 0.00019631752325026335, "loss": 0.0338, "step": 665 }, { "epoch": 0.0879875813323645, "grad_norm": 0.28824859857559204, "learning_rate": 0.0001963063402527846, "loss": 0.0366, "step": 666 }, { "epoch": 0.08811969481784854, "grad_norm": 0.2807556092739105, "learning_rate": 0.00019629514062018618, "loss": 0.028, "step": 667 }, { "epoch": 0.08825180830333257, "grad_norm": 0.25782066583633423, "learning_rate": 0.00019628392435440276, "loss": 0.0271, "step": 668 }, { "epoch": 0.0883839217888166, "grad_norm": 0.31848353147506714, "learning_rate": 0.0001962726914573716, "loss": 0.0391, "step": 669 }, { "epoch": 0.08851603527430063, "grad_norm": 0.20026235282421112, "learning_rate": 0.00019626144193103304, "loss": 0.0303, "step": 670 }, { "epoch": 0.08864814875978466, "grad_norm": 0.2486668825149536, "learning_rate": 0.0001962501757773302, "loss": 0.035, "step": 671 }, { "epoch": 0.08878026224526868, "grad_norm": 0.18474815785884857, "learning_rate": 0.00019623889299820913, "loss": 0.0263, "step": 672 }, { "epoch": 0.08891237573075271, "grad_norm": 0.23487155139446259, "learning_rate": 0.0001962275935956187, "loss": 0.04, "step": 673 }, { "epoch": 0.08904448921623674, "grad_norm": 0.25455838441848755, "learning_rate": 0.00019621627757151065, "loss": 0.0275, "step": 674 }, { "epoch": 0.08917660270172077, "grad_norm": 0.2235615849494934, "learning_rate": 0.00019620494492783962, "loss": 0.0216, "step": 675 }, { "epoch": 0.0893087161872048, "grad_norm": 0.26305070519447327, "learning_rate": 0.00019619359566656316, "loss": 0.0425, "step": 676 }, { "epoch": 0.08944082967268883, "grad_norm": 0.24703732132911682, "learning_rate": 0.0001961822297896416, "loss": 0.0204, "step": 677 }, { "epoch": 0.08957294315817287, "grad_norm": 0.2195281833410263, "learning_rate": 0.00019617084729903818, "loss": 0.0254, "step": 678 }, { "epoch": 0.0897050566436569, "grad_norm": 0.20875030755996704, "learning_rate": 0.000196159448196719, "loss": 0.0398, "step": 679 }, { "epoch": 0.08983717012914093, "grad_norm": 0.23726671934127808, "learning_rate": 0.0001961480324846531, "loss": 0.0346, "step": 680 }, { "epoch": 0.08996928361462496, "grad_norm": 0.2222008854150772, "learning_rate": 0.0001961366001648123, "loss": 0.0246, "step": 681 }, { "epoch": 0.09010139710010899, "grad_norm": 0.2726287245750427, "learning_rate": 0.0001961251512391713, "loss": 0.026, "step": 682 }, { "epoch": 0.09023351058559302, "grad_norm": 0.2012794464826584, "learning_rate": 0.00019611368570970767, "loss": 0.0282, "step": 683 }, { "epoch": 0.09036562407107705, "grad_norm": 0.16400226950645447, "learning_rate": 0.0001961022035784019, "loss": 0.0241, "step": 684 }, { "epoch": 0.09049773755656108, "grad_norm": 0.30417323112487793, "learning_rate": 0.00019609070484723738, "loss": 0.035, "step": 685 }, { "epoch": 0.09062985104204511, "grad_norm": 0.3207133412361145, "learning_rate": 0.0001960791895182002, "loss": 0.0469, "step": 686 }, { "epoch": 0.09076196452752915, "grad_norm": 0.385721355676651, "learning_rate": 0.00019606765759327944, "loss": 0.036, "step": 687 }, { "epoch": 0.09089407801301318, "grad_norm": 0.2257477343082428, "learning_rate": 0.000196056109074467, "loss": 0.033, "step": 688 }, { "epoch": 0.09102619149849721, "grad_norm": 0.238136425614357, "learning_rate": 0.00019604454396375773, "loss": 0.0231, "step": 689 }, { "epoch": 0.09115830498398124, "grad_norm": 0.2879732847213745, "learning_rate": 0.00019603296226314927, "loss": 0.0358, "step": 690 }, { "epoch": 0.09129041846946527, "grad_norm": 0.20712998509407043, "learning_rate": 0.00019602136397464212, "loss": 0.0232, "step": 691 }, { "epoch": 0.0914225319549493, "grad_norm": 0.19827014207839966, "learning_rate": 0.0001960097491002397, "loss": 0.0194, "step": 692 }, { "epoch": 0.09155464544043333, "grad_norm": 0.23387299478054047, "learning_rate": 0.00019599811764194823, "loss": 0.0326, "step": 693 }, { "epoch": 0.09168675892591736, "grad_norm": 0.2525857090950012, "learning_rate": 0.00019598646960177683, "loss": 0.0346, "step": 694 }, { "epoch": 0.0918188724114014, "grad_norm": 0.22527404129505157, "learning_rate": 0.00019597480498173754, "loss": 0.0338, "step": 695 }, { "epoch": 0.09195098589688543, "grad_norm": 0.2621941864490509, "learning_rate": 0.0001959631237838451, "loss": 0.0291, "step": 696 }, { "epoch": 0.09208309938236946, "grad_norm": 0.19432783126831055, "learning_rate": 0.0001959514260101173, "loss": 0.0198, "step": 697 }, { "epoch": 0.09221521286785349, "grad_norm": 0.42948463559150696, "learning_rate": 0.00019593971166257466, "loss": 0.0275, "step": 698 }, { "epoch": 0.09234732635333752, "grad_norm": 0.3384920060634613, "learning_rate": 0.00019592798074324067, "loss": 0.0405, "step": 699 }, { "epoch": 0.09247943983882155, "grad_norm": 0.2368774712085724, "learning_rate": 0.00019591623325414161, "loss": 0.0385, "step": 700 }, { "epoch": 0.09261155332430558, "grad_norm": 0.31908854842185974, "learning_rate": 0.0001959044691973066, "loss": 0.0308, "step": 701 }, { "epoch": 0.09274366680978961, "grad_norm": 0.24914051592350006, "learning_rate": 0.0001958926885747677, "loss": 0.0181, "step": 702 }, { "epoch": 0.09287578029527364, "grad_norm": 0.33457881212234497, "learning_rate": 0.00019588089138855978, "loss": 0.0392, "step": 703 }, { "epoch": 0.09300789378075767, "grad_norm": 0.3331588804721832, "learning_rate": 0.0001958690776407206, "loss": 0.0454, "step": 704 }, { "epoch": 0.0931400072662417, "grad_norm": 0.30309465527534485, "learning_rate": 0.00019585724733329072, "loss": 0.0387, "step": 705 }, { "epoch": 0.09327212075172574, "grad_norm": 0.1881301999092102, "learning_rate": 0.00019584540046831364, "loss": 0.0233, "step": 706 }, { "epoch": 0.09340423423720977, "grad_norm": 0.19208797812461853, "learning_rate": 0.0001958335370478357, "loss": 0.0269, "step": 707 }, { "epoch": 0.0935363477226938, "grad_norm": 0.3077560365200043, "learning_rate": 0.00019582165707390602, "loss": 0.0349, "step": 708 }, { "epoch": 0.09366846120817783, "grad_norm": 0.19938530027866364, "learning_rate": 0.0001958097605485767, "loss": 0.023, "step": 709 }, { "epoch": 0.09380057469366186, "grad_norm": 0.7101864218711853, "learning_rate": 0.00019579784747390263, "loss": 0.0621, "step": 710 }, { "epoch": 0.09393268817914589, "grad_norm": 0.2244912087917328, "learning_rate": 0.00019578591785194156, "loss": 0.0371, "step": 711 }, { "epoch": 0.09406480166462992, "grad_norm": 0.21191316843032837, "learning_rate": 0.00019577397168475414, "loss": 0.0227, "step": 712 }, { "epoch": 0.09419691515011394, "grad_norm": 0.27440908551216125, "learning_rate": 0.0001957620089744038, "loss": 0.025, "step": 713 }, { "epoch": 0.09432902863559797, "grad_norm": 0.3090989291667938, "learning_rate": 0.0001957500297229569, "loss": 0.0304, "step": 714 }, { "epoch": 0.094461142121082, "grad_norm": 0.204986572265625, "learning_rate": 0.00019573803393248263, "loss": 0.0286, "step": 715 }, { "epoch": 0.09459325560656603, "grad_norm": 0.21652153134346008, "learning_rate": 0.00019572602160505305, "loss": 0.0178, "step": 716 }, { "epoch": 0.09472536909205007, "grad_norm": 0.3828398585319519, "learning_rate": 0.00019571399274274305, "loss": 0.0249, "step": 717 }, { "epoch": 0.0948574825775341, "grad_norm": 0.24476584792137146, "learning_rate": 0.00019570194734763038, "loss": 0.0352, "step": 718 }, { "epoch": 0.09498959606301813, "grad_norm": 0.2881144881248474, "learning_rate": 0.00019568988542179567, "loss": 0.0457, "step": 719 }, { "epoch": 0.09512170954850216, "grad_norm": 0.19496379792690277, "learning_rate": 0.0001956778069673224, "loss": 0.0186, "step": 720 }, { "epoch": 0.09525382303398619, "grad_norm": 0.21406958997249603, "learning_rate": 0.00019566571198629694, "loss": 0.0282, "step": 721 }, { "epoch": 0.09538593651947022, "grad_norm": 0.1741507649421692, "learning_rate": 0.00019565360048080837, "loss": 0.0218, "step": 722 }, { "epoch": 0.09551805000495425, "grad_norm": 0.2991659939289093, "learning_rate": 0.00019564147245294876, "loss": 0.0358, "step": 723 }, { "epoch": 0.09565016349043828, "grad_norm": 0.361500084400177, "learning_rate": 0.00019562932790481306, "loss": 0.0533, "step": 724 }, { "epoch": 0.09578227697592231, "grad_norm": 0.24400769174098969, "learning_rate": 0.00019561716683849894, "loss": 0.0336, "step": 725 }, { "epoch": 0.09591439046140635, "grad_norm": 0.22594720125198364, "learning_rate": 0.00019560498925610706, "loss": 0.0296, "step": 726 }, { "epoch": 0.09604650394689038, "grad_norm": 0.22000914812088013, "learning_rate": 0.0001955927951597408, "loss": 0.0327, "step": 727 }, { "epoch": 0.09617861743237441, "grad_norm": 0.42935800552368164, "learning_rate": 0.00019558058455150653, "loss": 0.0432, "step": 728 }, { "epoch": 0.09631073091785844, "grad_norm": 0.30375710129737854, "learning_rate": 0.0001955683574335134, "loss": 0.028, "step": 729 }, { "epoch": 0.09644284440334247, "grad_norm": 0.33745771646499634, "learning_rate": 0.00019555611380787333, "loss": 0.0415, "step": 730 }, { "epoch": 0.0965749578888265, "grad_norm": 0.25258249044418335, "learning_rate": 0.00019554385367670128, "loss": 0.0158, "step": 731 }, { "epoch": 0.09670707137431053, "grad_norm": 0.20375578105449677, "learning_rate": 0.0001955315770421149, "loss": 0.0209, "step": 732 }, { "epoch": 0.09683918485979456, "grad_norm": 0.2016744613647461, "learning_rate": 0.00019551928390623477, "loss": 0.0337, "step": 733 }, { "epoch": 0.0969712983452786, "grad_norm": 0.26688241958618164, "learning_rate": 0.00019550697427118429, "loss": 0.0387, "step": 734 }, { "epoch": 0.09710341183076263, "grad_norm": 0.21267996728420258, "learning_rate": 0.00019549464813908973, "loss": 0.0336, "step": 735 }, { "epoch": 0.09723552531624666, "grad_norm": 0.3066573739051819, "learning_rate": 0.0001954823055120802, "loss": 0.0375, "step": 736 }, { "epoch": 0.09736763880173069, "grad_norm": 0.22104468941688538, "learning_rate": 0.00019546994639228765, "loss": 0.0224, "step": 737 }, { "epoch": 0.09749975228721472, "grad_norm": 0.20909041166305542, "learning_rate": 0.00019545757078184687, "loss": 0.0242, "step": 738 }, { "epoch": 0.09763186577269875, "grad_norm": 0.2227659821510315, "learning_rate": 0.00019544517868289556, "loss": 0.0376, "step": 739 }, { "epoch": 0.09776397925818278, "grad_norm": 0.23300065100193024, "learning_rate": 0.00019543277009757417, "loss": 0.0379, "step": 740 }, { "epoch": 0.09789609274366681, "grad_norm": 0.320353239774704, "learning_rate": 0.0001954203450280261, "loss": 0.0414, "step": 741 }, { "epoch": 0.09802820622915084, "grad_norm": 0.23459066450595856, "learning_rate": 0.00019540790347639752, "loss": 0.0339, "step": 742 }, { "epoch": 0.09816031971463487, "grad_norm": 0.2674216032028198, "learning_rate": 0.00019539544544483746, "loss": 0.0255, "step": 743 }, { "epoch": 0.0982924332001189, "grad_norm": 0.2853393256664276, "learning_rate": 0.00019538297093549788, "loss": 0.0349, "step": 744 }, { "epoch": 0.09842454668560294, "grad_norm": 0.2072666585445404, "learning_rate": 0.00019537047995053347, "loss": 0.0254, "step": 745 }, { "epoch": 0.09855666017108697, "grad_norm": 0.23279407620429993, "learning_rate": 0.00019535797249210177, "loss": 0.035, "step": 746 }, { "epoch": 0.098688773656571, "grad_norm": 0.30693086981773376, "learning_rate": 0.00019534544856236329, "loss": 0.0384, "step": 747 }, { "epoch": 0.09882088714205503, "grad_norm": 0.24419213831424713, "learning_rate": 0.00019533290816348123, "loss": 0.0403, "step": 748 }, { "epoch": 0.09895300062753906, "grad_norm": 0.288097620010376, "learning_rate": 0.0001953203512976218, "loss": 0.034, "step": 749 }, { "epoch": 0.09908511411302309, "grad_norm": 0.26100876927375793, "learning_rate": 0.0001953077779669539, "loss": 0.0331, "step": 750 }, { "epoch": 0.09921722759850712, "grad_norm": 0.2615920305252075, "learning_rate": 0.00019529518817364933, "loss": 0.0389, "step": 751 }, { "epoch": 0.09934934108399116, "grad_norm": 0.28125059604644775, "learning_rate": 0.00019528258191988277, "loss": 0.0344, "step": 752 }, { "epoch": 0.09948145456947517, "grad_norm": 0.268867164850235, "learning_rate": 0.00019526995920783174, "loss": 0.0463, "step": 753 }, { "epoch": 0.0996135680549592, "grad_norm": 0.303653359413147, "learning_rate": 0.00019525732003967651, "loss": 0.0454, "step": 754 }, { "epoch": 0.09974568154044323, "grad_norm": 0.2101089507341385, "learning_rate": 0.0001952446644176003, "loss": 0.0332, "step": 755 }, { "epoch": 0.09987779502592727, "grad_norm": 0.2549467980861664, "learning_rate": 0.00019523199234378915, "loss": 0.0284, "step": 756 }, { "epoch": 0.1000099085114113, "grad_norm": 0.31274503469467163, "learning_rate": 0.00019521930382043187, "loss": 0.0491, "step": 757 }, { "epoch": 0.10014202199689533, "grad_norm": 0.2537939250469208, "learning_rate": 0.0001952065988497202, "loss": 0.0315, "step": 758 }, { "epoch": 0.10027413548237936, "grad_norm": 0.2864026129245758, "learning_rate": 0.00019519387743384872, "loss": 0.0367, "step": 759 }, { "epoch": 0.10040624896786339, "grad_norm": 0.584783673286438, "learning_rate": 0.00019518113957501477, "loss": 0.0467, "step": 760 }, { "epoch": 0.10053836245334742, "grad_norm": 0.14912083745002747, "learning_rate": 0.00019516838527541857, "loss": 0.0225, "step": 761 }, { "epoch": 0.10067047593883145, "grad_norm": 0.19884039461612701, "learning_rate": 0.0001951556145372632, "loss": 0.0204, "step": 762 }, { "epoch": 0.10080258942431548, "grad_norm": 0.20827095210552216, "learning_rate": 0.00019514282736275454, "loss": 0.0291, "step": 763 }, { "epoch": 0.10093470290979951, "grad_norm": 0.23917272686958313, "learning_rate": 0.0001951300237541014, "loss": 0.0256, "step": 764 }, { "epoch": 0.10106681639528355, "grad_norm": 0.23405112326145172, "learning_rate": 0.00019511720371351534, "loss": 0.04, "step": 765 }, { "epoch": 0.10119892988076758, "grad_norm": 0.21369348466396332, "learning_rate": 0.00019510436724321076, "loss": 0.0386, "step": 766 }, { "epoch": 0.10133104336625161, "grad_norm": 0.25339123606681824, "learning_rate": 0.0001950915143454049, "loss": 0.027, "step": 767 }, { "epoch": 0.10146315685173564, "grad_norm": 0.2526932656764984, "learning_rate": 0.00019507864502231792, "loss": 0.0329, "step": 768 }, { "epoch": 0.10159527033721967, "grad_norm": 0.223262757062912, "learning_rate": 0.00019506575927617271, "loss": 0.0298, "step": 769 }, { "epoch": 0.1017273838227037, "grad_norm": 0.22292746603488922, "learning_rate": 0.00019505285710919506, "loss": 0.0378, "step": 770 }, { "epoch": 0.10185949730818773, "grad_norm": 0.20766198635101318, "learning_rate": 0.0001950399385236136, "loss": 0.0321, "step": 771 }, { "epoch": 0.10199161079367176, "grad_norm": 0.25311240553855896, "learning_rate": 0.00019502700352165973, "loss": 0.0397, "step": 772 }, { "epoch": 0.1021237242791558, "grad_norm": 0.25872135162353516, "learning_rate": 0.00019501405210556774, "loss": 0.032, "step": 773 }, { "epoch": 0.10225583776463983, "grad_norm": 0.14971302449703217, "learning_rate": 0.00019500108427757473, "loss": 0.0155, "step": 774 }, { "epoch": 0.10238795125012386, "grad_norm": 0.22067025303840637, "learning_rate": 0.0001949881000399207, "loss": 0.0245, "step": 775 }, { "epoch": 0.10252006473560789, "grad_norm": 0.2329145222902298, "learning_rate": 0.00019497509939484843, "loss": 0.0286, "step": 776 }, { "epoch": 0.10265217822109192, "grad_norm": 0.15475256741046906, "learning_rate": 0.00019496208234460346, "loss": 0.0169, "step": 777 }, { "epoch": 0.10278429170657595, "grad_norm": 0.2959286868572235, "learning_rate": 0.00019494904889143434, "loss": 0.0479, "step": 778 }, { "epoch": 0.10291640519205998, "grad_norm": 0.3313973546028137, "learning_rate": 0.0001949359990375923, "loss": 0.0329, "step": 779 }, { "epoch": 0.10304851867754401, "grad_norm": 0.3937395215034485, "learning_rate": 0.00019492293278533147, "loss": 0.0338, "step": 780 }, { "epoch": 0.10318063216302804, "grad_norm": 0.2434310019016266, "learning_rate": 0.0001949098501369088, "loss": 0.0253, "step": 781 }, { "epoch": 0.10331274564851207, "grad_norm": 0.3571442663669586, "learning_rate": 0.00019489675109458406, "loss": 0.0214, "step": 782 }, { "epoch": 0.1034448591339961, "grad_norm": 0.20219728350639343, "learning_rate": 0.0001948836356606199, "loss": 0.0265, "step": 783 }, { "epoch": 0.10357697261948014, "grad_norm": 0.277512788772583, "learning_rate": 0.00019487050383728175, "loss": 0.0469, "step": 784 }, { "epoch": 0.10370908610496417, "grad_norm": 0.19468040764331818, "learning_rate": 0.00019485735562683784, "loss": 0.027, "step": 785 }, { "epoch": 0.1038411995904482, "grad_norm": 0.23780375719070435, "learning_rate": 0.00019484419103155937, "loss": 0.0275, "step": 786 }, { "epoch": 0.10397331307593223, "grad_norm": 0.17604751884937286, "learning_rate": 0.0001948310100537202, "loss": 0.0139, "step": 787 }, { "epoch": 0.10410542656141626, "grad_norm": 0.25029489398002625, "learning_rate": 0.0001948178126955971, "loss": 0.0381, "step": 788 }, { "epoch": 0.10423754004690029, "grad_norm": 0.20858237147331238, "learning_rate": 0.00019480459895946975, "loss": 0.0299, "step": 789 }, { "epoch": 0.10436965353238432, "grad_norm": 0.28888922929763794, "learning_rate": 0.00019479136884762048, "loss": 0.0289, "step": 790 }, { "epoch": 0.10450176701786835, "grad_norm": 0.2120070606470108, "learning_rate": 0.00019477812236233456, "loss": 0.0457, "step": 791 }, { "epoch": 0.10463388050335239, "grad_norm": 0.248973086476326, "learning_rate": 0.00019476485950590012, "loss": 0.0411, "step": 792 }, { "epoch": 0.10476599398883642, "grad_norm": 0.32294097542762756, "learning_rate": 0.00019475158028060808, "loss": 0.0381, "step": 793 }, { "epoch": 0.10489810747432043, "grad_norm": 0.25859832763671875, "learning_rate": 0.0001947382846887521, "loss": 0.0276, "step": 794 }, { "epoch": 0.10503022095980447, "grad_norm": 0.178748220205307, "learning_rate": 0.0001947249727326288, "loss": 0.0184, "step": 795 }, { "epoch": 0.1051623344452885, "grad_norm": 0.21161046624183655, "learning_rate": 0.00019471164441453755, "loss": 0.0313, "step": 796 }, { "epoch": 0.10529444793077253, "grad_norm": 0.36292991042137146, "learning_rate": 0.0001946982997367806, "loss": 0.0329, "step": 797 }, { "epoch": 0.10542656141625656, "grad_norm": 0.4161984324455261, "learning_rate": 0.00019468493870166293, "loss": 0.0363, "step": 798 }, { "epoch": 0.10555867490174059, "grad_norm": 0.21217572689056396, "learning_rate": 0.00019467156131149248, "loss": 0.0402, "step": 799 }, { "epoch": 0.10569078838722462, "grad_norm": 0.24036584794521332, "learning_rate": 0.00019465816756857992, "loss": 0.0351, "step": 800 }, { "epoch": 0.10582290187270865, "grad_norm": 0.18879947066307068, "learning_rate": 0.00019464475747523876, "loss": 0.0204, "step": 801 }, { "epoch": 0.10595501535819268, "grad_norm": 0.26750242710113525, "learning_rate": 0.00019463133103378533, "loss": 0.0349, "step": 802 }, { "epoch": 0.10608712884367671, "grad_norm": 0.2977040708065033, "learning_rate": 0.0001946178882465388, "loss": 0.0464, "step": 803 }, { "epoch": 0.10621924232916075, "grad_norm": 0.1896338313817978, "learning_rate": 0.0001946044291158212, "loss": 0.0237, "step": 804 }, { "epoch": 0.10635135581464478, "grad_norm": 0.23944416642189026, "learning_rate": 0.00019459095364395728, "loss": 0.0312, "step": 805 }, { "epoch": 0.10648346930012881, "grad_norm": 0.23625792562961578, "learning_rate": 0.00019457746183327475, "loss": 0.0257, "step": 806 }, { "epoch": 0.10661558278561284, "grad_norm": 0.22346678376197815, "learning_rate": 0.000194563953686104, "loss": 0.0202, "step": 807 }, { "epoch": 0.10674769627109687, "grad_norm": 0.269264280796051, "learning_rate": 0.00019455042920477834, "loss": 0.0201, "step": 808 }, { "epoch": 0.1068798097565809, "grad_norm": 0.27811092138290405, "learning_rate": 0.00019453688839163392, "loss": 0.0397, "step": 809 }, { "epoch": 0.10701192324206493, "grad_norm": 0.3348802328109741, "learning_rate": 0.00019452333124900955, "loss": 0.0354, "step": 810 }, { "epoch": 0.10714403672754896, "grad_norm": 0.4341381788253784, "learning_rate": 0.00019450975777924706, "loss": 0.0446, "step": 811 }, { "epoch": 0.107276150213033, "grad_norm": 0.2206539660692215, "learning_rate": 0.00019449616798469097, "loss": 0.0257, "step": 812 }, { "epoch": 0.10740826369851703, "grad_norm": 0.26142534613609314, "learning_rate": 0.00019448256186768869, "loss": 0.0351, "step": 813 }, { "epoch": 0.10754037718400106, "grad_norm": 0.3535195589065552, "learning_rate": 0.00019446893943059044, "loss": 0.0217, "step": 814 }, { "epoch": 0.10767249066948509, "grad_norm": 0.2100822776556015, "learning_rate": 0.0001944553006757492, "loss": 0.0333, "step": 815 }, { "epoch": 0.10780460415496912, "grad_norm": 0.19954076409339905, "learning_rate": 0.00019444164560552082, "loss": 0.0242, "step": 816 }, { "epoch": 0.10793671764045315, "grad_norm": 0.25790032744407654, "learning_rate": 0.00019442797422226398, "loss": 0.0282, "step": 817 }, { "epoch": 0.10806883112593718, "grad_norm": 0.2589845359325409, "learning_rate": 0.0001944142865283401, "loss": 0.0233, "step": 818 }, { "epoch": 0.10820094461142121, "grad_norm": 0.2854527235031128, "learning_rate": 0.00019440058252611354, "loss": 0.0378, "step": 819 }, { "epoch": 0.10833305809690524, "grad_norm": 0.35180917382240295, "learning_rate": 0.00019438686221795137, "loss": 0.028, "step": 820 }, { "epoch": 0.10846517158238927, "grad_norm": 0.22869399189949036, "learning_rate": 0.00019437312560622355, "loss": 0.0215, "step": 821 }, { "epoch": 0.1085972850678733, "grad_norm": 0.25632917881011963, "learning_rate": 0.00019435937269330275, "loss": 0.0395, "step": 822 }, { "epoch": 0.10872939855335734, "grad_norm": 0.22697928547859192, "learning_rate": 0.00019434560348156464, "loss": 0.0341, "step": 823 }, { "epoch": 0.10886151203884137, "grad_norm": 0.285995215177536, "learning_rate": 0.00019433181797338752, "loss": 0.0377, "step": 824 }, { "epoch": 0.1089936255243254, "grad_norm": 0.15859773755073547, "learning_rate": 0.00019431801617115257, "loss": 0.017, "step": 825 }, { "epoch": 0.10912573900980943, "grad_norm": 0.2510858178138733, "learning_rate": 0.00019430419807724383, "loss": 0.0477, "step": 826 }, { "epoch": 0.10925785249529346, "grad_norm": 0.17218337953090668, "learning_rate": 0.0001942903636940481, "loss": 0.0151, "step": 827 }, { "epoch": 0.10938996598077749, "grad_norm": 0.2229837030172348, "learning_rate": 0.000194276513023955, "loss": 0.0365, "step": 828 }, { "epoch": 0.10952207946626152, "grad_norm": 0.25345245003700256, "learning_rate": 0.00019426264606935703, "loss": 0.027, "step": 829 }, { "epoch": 0.10965419295174555, "grad_norm": 0.2927243411540985, "learning_rate": 0.00019424876283264937, "loss": 0.0393, "step": 830 }, { "epoch": 0.10978630643722959, "grad_norm": 0.17304779589176178, "learning_rate": 0.00019423486331623013, "loss": 0.0186, "step": 831 }, { "epoch": 0.10991841992271362, "grad_norm": 0.16232873499393463, "learning_rate": 0.0001942209475225002, "loss": 0.0219, "step": 832 }, { "epoch": 0.11005053340819765, "grad_norm": 0.2516721189022064, "learning_rate": 0.00019420701545386327, "loss": 0.0282, "step": 833 }, { "epoch": 0.11018264689368168, "grad_norm": 0.3067221939563751, "learning_rate": 0.0001941930671127258, "loss": 0.0331, "step": 834 }, { "epoch": 0.1103147603791657, "grad_norm": 0.28294551372528076, "learning_rate": 0.00019417910250149714, "loss": 0.043, "step": 835 }, { "epoch": 0.11044687386464973, "grad_norm": 0.3205573856830597, "learning_rate": 0.00019416512162258944, "loss": 0.0402, "step": 836 }, { "epoch": 0.11057898735013376, "grad_norm": 0.38353288173675537, "learning_rate": 0.00019415112447841764, "loss": 0.0293, "step": 837 }, { "epoch": 0.11071110083561779, "grad_norm": 0.234690859913826, "learning_rate": 0.0001941371110713994, "loss": 0.0396, "step": 838 }, { "epoch": 0.11084321432110182, "grad_norm": 0.292104035615921, "learning_rate": 0.00019412308140395534, "loss": 0.0368, "step": 839 }, { "epoch": 0.11097532780658585, "grad_norm": 0.2527158856391907, "learning_rate": 0.00019410903547850884, "loss": 0.0473, "step": 840 }, { "epoch": 0.11110744129206988, "grad_norm": 0.33742791414260864, "learning_rate": 0.00019409497329748603, "loss": 0.0165, "step": 841 }, { "epoch": 0.11123955477755391, "grad_norm": 0.3314475119113922, "learning_rate": 0.0001940808948633159, "loss": 0.0257, "step": 842 }, { "epoch": 0.11137166826303795, "grad_norm": 0.5029959678649902, "learning_rate": 0.00019406680017843022, "loss": 0.0449, "step": 843 }, { "epoch": 0.11150378174852198, "grad_norm": 0.24230080842971802, "learning_rate": 0.0001940526892452636, "loss": 0.0196, "step": 844 }, { "epoch": 0.11163589523400601, "grad_norm": 0.23190683126449585, "learning_rate": 0.00019403856206625349, "loss": 0.0252, "step": 845 }, { "epoch": 0.11176800871949004, "grad_norm": 0.3320704400539398, "learning_rate": 0.00019402441864384, "loss": 0.0435, "step": 846 }, { "epoch": 0.11190012220497407, "grad_norm": 0.2635941505432129, "learning_rate": 0.00019401025898046622, "loss": 0.0414, "step": 847 }, { "epoch": 0.1120322356904581, "grad_norm": 0.40881574153900146, "learning_rate": 0.00019399608307857792, "loss": 0.0514, "step": 848 }, { "epoch": 0.11216434917594213, "grad_norm": 0.21754924952983856, "learning_rate": 0.00019398189094062374, "loss": 0.0194, "step": 849 }, { "epoch": 0.11229646266142616, "grad_norm": 0.24845930933952332, "learning_rate": 0.0001939676825690551, "loss": 0.0243, "step": 850 }, { "epoch": 0.1124285761469102, "grad_norm": 0.30211737751960754, "learning_rate": 0.00019395345796632626, "loss": 0.034, "step": 851 }, { "epoch": 0.11256068963239423, "grad_norm": 0.25241124629974365, "learning_rate": 0.00019393921713489417, "loss": 0.0238, "step": 852 }, { "epoch": 0.11269280311787826, "grad_norm": 0.26138097047805786, "learning_rate": 0.00019392496007721878, "loss": 0.0257, "step": 853 }, { "epoch": 0.11282491660336229, "grad_norm": 0.2760176956653595, "learning_rate": 0.00019391068679576264, "loss": 0.0304, "step": 854 }, { "epoch": 0.11295703008884632, "grad_norm": 0.1539127230644226, "learning_rate": 0.0001938963972929913, "loss": 0.0276, "step": 855 }, { "epoch": 0.11308914357433035, "grad_norm": 0.31126534938812256, "learning_rate": 0.00019388209157137286, "loss": 0.0529, "step": 856 }, { "epoch": 0.11322125705981438, "grad_norm": 0.2580847442150116, "learning_rate": 0.00019386776963337848, "loss": 0.0368, "step": 857 }, { "epoch": 0.11335337054529841, "grad_norm": 0.3393428921699524, "learning_rate": 0.00019385343148148193, "loss": 0.0195, "step": 858 }, { "epoch": 0.11348548403078244, "grad_norm": 0.22064736485481262, "learning_rate": 0.00019383907711815993, "loss": 0.0238, "step": 859 }, { "epoch": 0.11361759751626647, "grad_norm": 0.2550196945667267, "learning_rate": 0.00019382470654589188, "loss": 0.0234, "step": 860 }, { "epoch": 0.1137497110017505, "grad_norm": 0.1911945790052414, "learning_rate": 0.00019381031976716006, "loss": 0.0199, "step": 861 }, { "epoch": 0.11388182448723454, "grad_norm": 0.2527654469013214, "learning_rate": 0.0001937959167844495, "loss": 0.0298, "step": 862 }, { "epoch": 0.11401393797271857, "grad_norm": 0.20978406071662903, "learning_rate": 0.00019378149760024803, "loss": 0.0205, "step": 863 }, { "epoch": 0.1141460514582026, "grad_norm": 0.38254910707473755, "learning_rate": 0.00019376706221704628, "loss": 0.0315, "step": 864 }, { "epoch": 0.11427816494368663, "grad_norm": 0.33304017782211304, "learning_rate": 0.00019375261063733773, "loss": 0.0209, "step": 865 }, { "epoch": 0.11441027842917066, "grad_norm": 0.2691979706287384, "learning_rate": 0.00019373814286361863, "loss": 0.0342, "step": 866 }, { "epoch": 0.11454239191465469, "grad_norm": 0.21065416932106018, "learning_rate": 0.000193723658898388, "loss": 0.029, "step": 867 }, { "epoch": 0.11467450540013872, "grad_norm": 0.3999660015106201, "learning_rate": 0.00019370915874414765, "loss": 0.0348, "step": 868 }, { "epoch": 0.11480661888562275, "grad_norm": 0.35480427742004395, "learning_rate": 0.00019369464240340226, "loss": 0.0295, "step": 869 }, { "epoch": 0.11493873237110679, "grad_norm": 0.24580347537994385, "learning_rate": 0.00019368010987865918, "loss": 0.0205, "step": 870 }, { "epoch": 0.11507084585659082, "grad_norm": 0.27900341153144836, "learning_rate": 0.00019366556117242874, "loss": 0.0271, "step": 871 }, { "epoch": 0.11520295934207485, "grad_norm": 0.2475052922964096, "learning_rate": 0.00019365099628722388, "loss": 0.0364, "step": 872 }, { "epoch": 0.11533507282755888, "grad_norm": 0.27171263098716736, "learning_rate": 0.00019363641522556038, "loss": 0.0516, "step": 873 }, { "epoch": 0.11546718631304291, "grad_norm": 0.23279814422130585, "learning_rate": 0.000193621817989957, "loss": 0.0496, "step": 874 }, { "epoch": 0.11559929979852693, "grad_norm": 0.26830148696899414, "learning_rate": 0.00019360720458293495, "loss": 0.025, "step": 875 }, { "epoch": 0.11573141328401096, "grad_norm": 0.47811031341552734, "learning_rate": 0.00019359257500701853, "loss": 0.0368, "step": 876 }, { "epoch": 0.11586352676949499, "grad_norm": 0.2844720780849457, "learning_rate": 0.0001935779292647347, "loss": 0.0272, "step": 877 }, { "epoch": 0.11599564025497902, "grad_norm": 0.3734263479709625, "learning_rate": 0.00019356326735861322, "loss": 0.0274, "step": 878 }, { "epoch": 0.11612775374046305, "grad_norm": 0.29889705777168274, "learning_rate": 0.00019354858929118674, "loss": 0.026, "step": 879 }, { "epoch": 0.11625986722594708, "grad_norm": 0.37999603152275085, "learning_rate": 0.00019353389506499054, "loss": 0.0515, "step": 880 }, { "epoch": 0.11639198071143111, "grad_norm": 0.2977294921875, "learning_rate": 0.00019351918468256277, "loss": 0.0431, "step": 881 }, { "epoch": 0.11652409419691515, "grad_norm": 0.25868692994117737, "learning_rate": 0.00019350445814644442, "loss": 0.0348, "step": 882 }, { "epoch": 0.11665620768239918, "grad_norm": 0.2617489993572235, "learning_rate": 0.0001934897154591792, "loss": 0.0292, "step": 883 }, { "epoch": 0.11678832116788321, "grad_norm": 0.21163403987884521, "learning_rate": 0.00019347495662331364, "loss": 0.0355, "step": 884 }, { "epoch": 0.11692043465336724, "grad_norm": 0.2167588174343109, "learning_rate": 0.00019346018164139705, "loss": 0.0316, "step": 885 }, { "epoch": 0.11705254813885127, "grad_norm": 0.30651724338531494, "learning_rate": 0.00019344539051598152, "loss": 0.0546, "step": 886 }, { "epoch": 0.1171846616243353, "grad_norm": 0.1882271021604538, "learning_rate": 0.00019343058324962196, "loss": 0.0294, "step": 887 }, { "epoch": 0.11731677510981933, "grad_norm": 0.17964918911457062, "learning_rate": 0.00019341575984487604, "loss": 0.0254, "step": 888 }, { "epoch": 0.11744888859530336, "grad_norm": 0.16270661354064941, "learning_rate": 0.0001934009203043042, "loss": 0.0252, "step": 889 }, { "epoch": 0.1175810020807874, "grad_norm": 0.8287057876586914, "learning_rate": 0.00019338606463046977, "loss": 0.027, "step": 890 }, { "epoch": 0.11771311556627143, "grad_norm": 0.2975148856639862, "learning_rate": 0.00019337119282593874, "loss": 0.0288, "step": 891 }, { "epoch": 0.11784522905175546, "grad_norm": 0.2509402632713318, "learning_rate": 0.0001933563048932799, "loss": 0.0401, "step": 892 }, { "epoch": 0.11797734253723949, "grad_norm": 0.18912091851234436, "learning_rate": 0.0001933414008350649, "loss": 0.0253, "step": 893 }, { "epoch": 0.11810945602272352, "grad_norm": 0.22809197008609772, "learning_rate": 0.00019332648065386815, "loss": 0.0385, "step": 894 }, { "epoch": 0.11824156950820755, "grad_norm": 0.41790077090263367, "learning_rate": 0.00019331154435226684, "loss": 0.0377, "step": 895 }, { "epoch": 0.11837368299369158, "grad_norm": 0.2439369559288025, "learning_rate": 0.00019329659193284088, "loss": 0.026, "step": 896 }, { "epoch": 0.11850579647917561, "grad_norm": 0.2535647749900818, "learning_rate": 0.00019328162339817307, "loss": 0.024, "step": 897 }, { "epoch": 0.11863790996465964, "grad_norm": 0.36167967319488525, "learning_rate": 0.00019326663875084891, "loss": 0.0273, "step": 898 }, { "epoch": 0.11877002345014367, "grad_norm": 0.19031481444835663, "learning_rate": 0.00019325163799345675, "loss": 0.0311, "step": 899 }, { "epoch": 0.1189021369356277, "grad_norm": 0.31373831629753113, "learning_rate": 0.0001932366211285877, "loss": 0.0328, "step": 900 }, { "epoch": 0.11903425042111174, "grad_norm": 0.30672964453697205, "learning_rate": 0.00019322158815883558, "loss": 0.0529, "step": 901 }, { "epoch": 0.11916636390659577, "grad_norm": 0.20721125602722168, "learning_rate": 0.00019320653908679711, "loss": 0.0366, "step": 902 }, { "epoch": 0.1192984773920798, "grad_norm": 0.19243820011615753, "learning_rate": 0.00019319147391507174, "loss": 0.0185, "step": 903 }, { "epoch": 0.11943059087756383, "grad_norm": 0.2745702564716339, "learning_rate": 0.00019317639264626165, "loss": 0.0356, "step": 904 }, { "epoch": 0.11956270436304786, "grad_norm": 0.3759895861148834, "learning_rate": 0.00019316129528297192, "loss": 0.0121, "step": 905 }, { "epoch": 0.11969481784853189, "grad_norm": 0.2563433349132538, "learning_rate": 0.00019314618182781024, "loss": 0.0356, "step": 906 }, { "epoch": 0.11982693133401592, "grad_norm": 0.2550245225429535, "learning_rate": 0.00019313105228338726, "loss": 0.0267, "step": 907 }, { "epoch": 0.11995904481949995, "grad_norm": 0.27986523509025574, "learning_rate": 0.00019311590665231626, "loss": 0.0295, "step": 908 }, { "epoch": 0.12009115830498399, "grad_norm": 0.24848835170269012, "learning_rate": 0.00019310074493721343, "loss": 0.0395, "step": 909 }, { "epoch": 0.12022327179046802, "grad_norm": 0.2371983379125595, "learning_rate": 0.00019308556714069764, "loss": 0.0377, "step": 910 }, { "epoch": 0.12035538527595205, "grad_norm": 0.26739948987960815, "learning_rate": 0.00019307037326539057, "loss": 0.0215, "step": 911 }, { "epoch": 0.12048749876143608, "grad_norm": 0.19139261543750763, "learning_rate": 0.0001930551633139167, "loss": 0.028, "step": 912 }, { "epoch": 0.12061961224692011, "grad_norm": 0.19188474118709564, "learning_rate": 0.0001930399372889032, "loss": 0.0202, "step": 913 }, { "epoch": 0.12075172573240414, "grad_norm": 0.25728708505630493, "learning_rate": 0.00019302469519298014, "loss": 0.0417, "step": 914 }, { "epoch": 0.12088383921788817, "grad_norm": 0.28429311513900757, "learning_rate": 0.0001930094370287803, "loss": 0.0273, "step": 915 }, { "epoch": 0.12101595270337219, "grad_norm": 0.2718221843242645, "learning_rate": 0.00019299416279893925, "loss": 0.0349, "step": 916 }, { "epoch": 0.12114806618885622, "grad_norm": 0.18933570384979248, "learning_rate": 0.0001929788725060953, "loss": 0.0248, "step": 917 }, { "epoch": 0.12128017967434025, "grad_norm": 0.2467522919178009, "learning_rate": 0.00019296356615288959, "loss": 0.0216, "step": 918 }, { "epoch": 0.12141229315982428, "grad_norm": 0.22754551470279694, "learning_rate": 0.00019294824374196598, "loss": 0.0266, "step": 919 }, { "epoch": 0.12154440664530831, "grad_norm": 0.2605039179325104, "learning_rate": 0.00019293290527597114, "loss": 0.0315, "step": 920 }, { "epoch": 0.12167652013079235, "grad_norm": 0.2127530425786972, "learning_rate": 0.00019291755075755452, "loss": 0.0351, "step": 921 }, { "epoch": 0.12180863361627638, "grad_norm": 0.3158978521823883, "learning_rate": 0.00019290218018936829, "loss": 0.0342, "step": 922 }, { "epoch": 0.12194074710176041, "grad_norm": 0.2638450860977173, "learning_rate": 0.00019288679357406746, "loss": 0.0407, "step": 923 }, { "epoch": 0.12207286058724444, "grad_norm": 0.26631325483322144, "learning_rate": 0.00019287139091430977, "loss": 0.0339, "step": 924 }, { "epoch": 0.12220497407272847, "grad_norm": 0.24103693664073944, "learning_rate": 0.00019285597221275572, "loss": 0.0252, "step": 925 }, { "epoch": 0.1223370875582125, "grad_norm": 0.1970626562833786, "learning_rate": 0.00019284053747206867, "loss": 0.0211, "step": 926 }, { "epoch": 0.12246920104369653, "grad_norm": 0.23220521211624146, "learning_rate": 0.0001928250866949146, "loss": 0.0363, "step": 927 }, { "epoch": 0.12260131452918056, "grad_norm": 0.2567414343357086, "learning_rate": 0.0001928096198839624, "loss": 0.0429, "step": 928 }, { "epoch": 0.1227334280146646, "grad_norm": 0.18246598541736603, "learning_rate": 0.00019279413704188363, "loss": 0.0319, "step": 929 }, { "epoch": 0.12286554150014863, "grad_norm": 0.2592841684818268, "learning_rate": 0.00019277863817135268, "loss": 0.0284, "step": 930 }, { "epoch": 0.12299765498563266, "grad_norm": 0.29718905687332153, "learning_rate": 0.00019276312327504673, "loss": 0.0195, "step": 931 }, { "epoch": 0.12312976847111669, "grad_norm": 0.48700830340385437, "learning_rate": 0.0001927475923556456, "loss": 0.0263, "step": 932 }, { "epoch": 0.12326188195660072, "grad_norm": 0.18568505346775055, "learning_rate": 0.00019273204541583208, "loss": 0.0238, "step": 933 }, { "epoch": 0.12339399544208475, "grad_norm": 0.32222384214401245, "learning_rate": 0.00019271648245829153, "loss": 0.0279, "step": 934 }, { "epoch": 0.12352610892756878, "grad_norm": 0.2924232482910156, "learning_rate": 0.00019270090348571216, "loss": 0.0282, "step": 935 }, { "epoch": 0.12365822241305281, "grad_norm": 0.17929571866989136, "learning_rate": 0.00019268530850078498, "loss": 0.0228, "step": 936 }, { "epoch": 0.12379033589853684, "grad_norm": 0.2127278745174408, "learning_rate": 0.0001926696975062037, "loss": 0.0388, "step": 937 }, { "epoch": 0.12392244938402087, "grad_norm": 0.14027151465415955, "learning_rate": 0.00019265407050466485, "loss": 0.0159, "step": 938 }, { "epoch": 0.1240545628695049, "grad_norm": 0.23516128957271576, "learning_rate": 0.0001926384274988677, "loss": 0.0367, "step": 939 }, { "epoch": 0.12418667635498894, "grad_norm": 0.27905580401420593, "learning_rate": 0.00019262276849151433, "loss": 0.0373, "step": 940 }, { "epoch": 0.12431878984047297, "grad_norm": 0.3128524124622345, "learning_rate": 0.00019260709348530944, "loss": 0.0312, "step": 941 }, { "epoch": 0.124450903325957, "grad_norm": 0.3510471284389496, "learning_rate": 0.00019259140248296068, "loss": 0.0413, "step": 942 }, { "epoch": 0.12458301681144103, "grad_norm": 0.2699235677719116, "learning_rate": 0.00019257569548717832, "loss": 0.0279, "step": 943 }, { "epoch": 0.12471513029692506, "grad_norm": 0.2423953115940094, "learning_rate": 0.00019255997250067553, "loss": 0.0317, "step": 944 }, { "epoch": 0.12484724378240909, "grad_norm": 0.3213600516319275, "learning_rate": 0.00019254423352616805, "loss": 0.0402, "step": 945 }, { "epoch": 0.12497935726789312, "grad_norm": 0.26986581087112427, "learning_rate": 0.00019252847856637457, "loss": 0.0301, "step": 946 }, { "epoch": 0.12511147075337714, "grad_norm": 0.2727492153644562, "learning_rate": 0.00019251270762401647, "loss": 0.0208, "step": 947 }, { "epoch": 0.12524358423886117, "grad_norm": 0.2855754792690277, "learning_rate": 0.00019249692070181785, "loss": 0.0419, "step": 948 }, { "epoch": 0.1253756977243452, "grad_norm": 0.278870165348053, "learning_rate": 0.00019248111780250561, "loss": 0.0214, "step": 949 }, { "epoch": 0.12550781120982923, "grad_norm": 0.18916112184524536, "learning_rate": 0.00019246529892880945, "loss": 0.025, "step": 950 }, { "epoch": 0.12563992469531327, "grad_norm": 0.2686702013015747, "learning_rate": 0.0001924494640834617, "loss": 0.0321, "step": 951 }, { "epoch": 0.1257720381807973, "grad_norm": 0.23873859643936157, "learning_rate": 0.0001924336132691976, "loss": 0.0307, "step": 952 }, { "epoch": 0.12590415166628133, "grad_norm": 0.26889604330062866, "learning_rate": 0.0001924177464887551, "loss": 0.0355, "step": 953 }, { "epoch": 0.12603626515176536, "grad_norm": 0.2223420888185501, "learning_rate": 0.0001924018637448748, "loss": 0.0416, "step": 954 }, { "epoch": 0.1261683786372494, "grad_norm": 0.24047642946243286, "learning_rate": 0.00019238596504030024, "loss": 0.0317, "step": 955 }, { "epoch": 0.12630049212273342, "grad_norm": 0.3237987160682678, "learning_rate": 0.00019237005037777755, "loss": 0.0351, "step": 956 }, { "epoch": 0.12643260560821745, "grad_norm": 0.30036526918411255, "learning_rate": 0.00019235411976005576, "loss": 0.0416, "step": 957 }, { "epoch": 0.12656471909370148, "grad_norm": 0.2027190625667572, "learning_rate": 0.00019233817318988652, "loss": 0.0368, "step": 958 }, { "epoch": 0.12669683257918551, "grad_norm": 0.23805420100688934, "learning_rate": 0.00019232221067002437, "loss": 0.02, "step": 959 }, { "epoch": 0.12682894606466955, "grad_norm": 0.3873671293258667, "learning_rate": 0.00019230623220322648, "loss": 0.0294, "step": 960 }, { "epoch": 0.12696105955015358, "grad_norm": 0.27125123143196106, "learning_rate": 0.00019229023779225284, "loss": 0.0299, "step": 961 }, { "epoch": 0.1270931730356376, "grad_norm": 0.2969513535499573, "learning_rate": 0.0001922742274398662, "loss": 0.031, "step": 962 }, { "epoch": 0.12722528652112164, "grad_norm": 0.244962677359581, "learning_rate": 0.00019225820114883208, "loss": 0.0275, "step": 963 }, { "epoch": 0.12735740000660567, "grad_norm": 0.2579887807369232, "learning_rate": 0.00019224215892191864, "loss": 0.0332, "step": 964 }, { "epoch": 0.1274895134920897, "grad_norm": 0.1939936876296997, "learning_rate": 0.00019222610076189694, "loss": 0.03, "step": 965 }, { "epoch": 0.12762162697757373, "grad_norm": 0.24424876272678375, "learning_rate": 0.0001922100266715407, "loss": 0.0359, "step": 966 }, { "epoch": 0.12775374046305776, "grad_norm": 0.17694316804409027, "learning_rate": 0.00019219393665362647, "loss": 0.0187, "step": 967 }, { "epoch": 0.1278858539485418, "grad_norm": 0.27498042583465576, "learning_rate": 0.00019217783071093342, "loss": 0.0256, "step": 968 }, { "epoch": 0.12801796743402583, "grad_norm": 0.24844923615455627, "learning_rate": 0.0001921617088462436, "loss": 0.0288, "step": 969 }, { "epoch": 0.12815008091950986, "grad_norm": 0.19000475108623505, "learning_rate": 0.00019214557106234174, "loss": 0.0247, "step": 970 }, { "epoch": 0.1282821944049939, "grad_norm": 0.3154200613498688, "learning_rate": 0.00019212941736201537, "loss": 0.031, "step": 971 }, { "epoch": 0.12841430789047792, "grad_norm": 0.3130955398082733, "learning_rate": 0.00019211324774805473, "loss": 0.0349, "step": 972 }, { "epoch": 0.12854642137596195, "grad_norm": 0.2864592671394348, "learning_rate": 0.00019209706222325277, "loss": 0.032, "step": 973 }, { "epoch": 0.12867853486144598, "grad_norm": 0.22307147085666656, "learning_rate": 0.0001920808607904053, "loss": 0.037, "step": 974 }, { "epoch": 0.12881064834693, "grad_norm": 0.3396158814430237, "learning_rate": 0.00019206464345231078, "loss": 0.0337, "step": 975 }, { "epoch": 0.12894276183241404, "grad_norm": 0.24885155260562897, "learning_rate": 0.00019204841021177048, "loss": 0.0291, "step": 976 }, { "epoch": 0.12907487531789807, "grad_norm": 0.1922139674425125, "learning_rate": 0.00019203216107158833, "loss": 0.02, "step": 977 }, { "epoch": 0.1292069888033821, "grad_norm": 0.252600759267807, "learning_rate": 0.00019201589603457114, "loss": 0.0259, "step": 978 }, { "epoch": 0.12933910228886614, "grad_norm": 0.24059033393859863, "learning_rate": 0.00019199961510352832, "loss": 0.0281, "step": 979 }, { "epoch": 0.12947121577435017, "grad_norm": 0.21730482578277588, "learning_rate": 0.00019198331828127217, "loss": 0.034, "step": 980 }, { "epoch": 0.1296033292598342, "grad_norm": 0.3147624433040619, "learning_rate": 0.00019196700557061762, "loss": 0.0323, "step": 981 }, { "epoch": 0.12973544274531823, "grad_norm": 0.21451252698898315, "learning_rate": 0.00019195067697438237, "loss": 0.0254, "step": 982 }, { "epoch": 0.12986755623080226, "grad_norm": 0.230705127120018, "learning_rate": 0.0001919343324953869, "loss": 0.0398, "step": 983 }, { "epoch": 0.1299996697162863, "grad_norm": 0.19270865619182587, "learning_rate": 0.00019191797213645445, "loss": 0.0274, "step": 984 }, { "epoch": 0.13013178320177032, "grad_norm": 0.1896556168794632, "learning_rate": 0.00019190159590041088, "loss": 0.0197, "step": 985 }, { "epoch": 0.13026389668725435, "grad_norm": 0.21830704808235168, "learning_rate": 0.00019188520379008494, "loss": 0.0191, "step": 986 }, { "epoch": 0.13039601017273839, "grad_norm": 0.3246423304080963, "learning_rate": 0.00019186879580830807, "loss": 0.0294, "step": 987 }, { "epoch": 0.13052812365822242, "grad_norm": 0.34824860095977783, "learning_rate": 0.0001918523719579144, "loss": 0.0418, "step": 988 }, { "epoch": 0.13066023714370645, "grad_norm": 0.19474251568317413, "learning_rate": 0.00019183593224174084, "loss": 0.0265, "step": 989 }, { "epoch": 0.13079235062919048, "grad_norm": 0.2300022393465042, "learning_rate": 0.00019181947666262712, "loss": 0.0312, "step": 990 }, { "epoch": 0.1309244641146745, "grad_norm": 0.2592034339904785, "learning_rate": 0.00019180300522341558, "loss": 0.0434, "step": 991 }, { "epoch": 0.13105657760015854, "grad_norm": 0.21826054155826569, "learning_rate": 0.0001917865179269513, "loss": 0.0233, "step": 992 }, { "epoch": 0.13118869108564257, "grad_norm": 0.263738214969635, "learning_rate": 0.00019177001477608226, "loss": 0.0184, "step": 993 }, { "epoch": 0.1313208045711266, "grad_norm": 0.34222137928009033, "learning_rate": 0.000191753495773659, "loss": 0.0295, "step": 994 }, { "epoch": 0.13145291805661063, "grad_norm": 0.3267710208892822, "learning_rate": 0.00019173696092253487, "loss": 0.0307, "step": 995 }, { "epoch": 0.13158503154209467, "grad_norm": 0.22792892158031464, "learning_rate": 0.00019172041022556596, "loss": 0.03, "step": 996 }, { "epoch": 0.1317171450275787, "grad_norm": 0.20261724293231964, "learning_rate": 0.0001917038436856111, "loss": 0.027, "step": 997 }, { "epoch": 0.13184925851306273, "grad_norm": 0.23422667384147644, "learning_rate": 0.0001916872613055319, "loss": 0.0249, "step": 998 }, { "epoch": 0.13198137199854676, "grad_norm": 0.26018065214157104, "learning_rate": 0.0001916706630881926, "loss": 0.0335, "step": 999 }, { "epoch": 0.1321134854840308, "grad_norm": 0.36841535568237305, "learning_rate": 0.00019165404903646023, "loss": 0.0239, "step": 1000 }, { "epoch": 0.13224559896951482, "grad_norm": 0.1476823389530182, "learning_rate": 0.00019163741915320456, "loss": 0.013, "step": 1001 }, { "epoch": 0.13237771245499885, "grad_norm": 0.2931428849697113, "learning_rate": 0.0001916207734412981, "loss": 0.0263, "step": 1002 }, { "epoch": 0.13250982594048288, "grad_norm": 0.2295461893081665, "learning_rate": 0.00019160411190361612, "loss": 0.0243, "step": 1003 }, { "epoch": 0.13264193942596691, "grad_norm": 0.22297543287277222, "learning_rate": 0.00019158743454303654, "loss": 0.0372, "step": 1004 }, { "epoch": 0.13277405291145095, "grad_norm": 0.2877453863620758, "learning_rate": 0.0001915707413624401, "loss": 0.0319, "step": 1005 }, { "epoch": 0.13290616639693498, "grad_norm": 0.21408823132514954, "learning_rate": 0.00019155403236471017, "loss": 0.0229, "step": 1006 }, { "epoch": 0.133038279882419, "grad_norm": 0.29204505681991577, "learning_rate": 0.00019153730755273296, "loss": 0.0408, "step": 1007 }, { "epoch": 0.13317039336790304, "grad_norm": 0.20328344404697418, "learning_rate": 0.0001915205669293974, "loss": 0.0249, "step": 1008 }, { "epoch": 0.13330250685338707, "grad_norm": 0.2546907663345337, "learning_rate": 0.00019150381049759508, "loss": 0.0328, "step": 1009 }, { "epoch": 0.1334346203388711, "grad_norm": 0.19306686520576477, "learning_rate": 0.00019148703826022035, "loss": 0.0287, "step": 1010 }, { "epoch": 0.13356673382435513, "grad_norm": 0.6536433100700378, "learning_rate": 0.0001914702502201703, "loss": 0.0398, "step": 1011 }, { "epoch": 0.13369884730983916, "grad_norm": 0.2630719542503357, "learning_rate": 0.00019145344638034484, "loss": 0.0271, "step": 1012 }, { "epoch": 0.1338309607953232, "grad_norm": 0.2874428331851959, "learning_rate": 0.0001914366267436464, "loss": 0.046, "step": 1013 }, { "epoch": 0.13396307428080723, "grad_norm": 0.25163084268569946, "learning_rate": 0.0001914197913129803, "loss": 0.0335, "step": 1014 }, { "epoch": 0.13409518776629126, "grad_norm": 0.303242564201355, "learning_rate": 0.00019140294009125457, "loss": 0.0281, "step": 1015 }, { "epoch": 0.1342273012517753, "grad_norm": 0.2756708264350891, "learning_rate": 0.0001913860730813799, "loss": 0.0503, "step": 1016 }, { "epoch": 0.13435941473725932, "grad_norm": 0.3068932592868805, "learning_rate": 0.0001913691902862698, "loss": 0.0236, "step": 1017 }, { "epoch": 0.13449152822274332, "grad_norm": 0.21432353556156158, "learning_rate": 0.00019135229170884043, "loss": 0.0367, "step": 1018 }, { "epoch": 0.13462364170822735, "grad_norm": 0.39591842889785767, "learning_rate": 0.0001913353773520107, "loss": 0.0399, "step": 1019 }, { "epoch": 0.13475575519371139, "grad_norm": 0.2623952329158783, "learning_rate": 0.00019131844721870226, "loss": 0.0285, "step": 1020 }, { "epoch": 0.13488786867919542, "grad_norm": 0.4142909646034241, "learning_rate": 0.00019130150131183946, "loss": 0.0419, "step": 1021 }, { "epoch": 0.13501998216467945, "grad_norm": 0.20737366378307343, "learning_rate": 0.0001912845396343494, "loss": 0.0293, "step": 1022 }, { "epoch": 0.13515209565016348, "grad_norm": 0.22168460488319397, "learning_rate": 0.00019126756218916188, "loss": 0.0266, "step": 1023 }, { "epoch": 0.1352842091356475, "grad_norm": 0.21502786874771118, "learning_rate": 0.00019125056897920946, "loss": 0.0326, "step": 1024 }, { "epoch": 0.13541632262113154, "grad_norm": 0.17693819105625153, "learning_rate": 0.0001912335600074274, "loss": 0.0177, "step": 1025 }, { "epoch": 0.13554843610661557, "grad_norm": 0.1689392477273941, "learning_rate": 0.00019121653527675366, "loss": 0.0201, "step": 1026 }, { "epoch": 0.1356805495920996, "grad_norm": 0.3991635739803314, "learning_rate": 0.00019119949479012892, "loss": 0.0385, "step": 1027 }, { "epoch": 0.13581266307758363, "grad_norm": 0.20152747631072998, "learning_rate": 0.00019118243855049667, "loss": 0.0407, "step": 1028 }, { "epoch": 0.13594477656306767, "grad_norm": 0.16736957430839539, "learning_rate": 0.00019116536656080298, "loss": 0.0222, "step": 1029 }, { "epoch": 0.1360768900485517, "grad_norm": 0.2872177064418793, "learning_rate": 0.00019114827882399683, "loss": 0.041, "step": 1030 }, { "epoch": 0.13620900353403573, "grad_norm": 0.22250396013259888, "learning_rate": 0.00019113117534302968, "loss": 0.0381, "step": 1031 }, { "epoch": 0.13634111701951976, "grad_norm": 0.24812956154346466, "learning_rate": 0.00019111405612085594, "loss": 0.0329, "step": 1032 }, { "epoch": 0.1364732305050038, "grad_norm": 0.18522006273269653, "learning_rate": 0.00019109692116043255, "loss": 0.0203, "step": 1033 }, { "epoch": 0.13660534399048782, "grad_norm": 0.17845605313777924, "learning_rate": 0.0001910797704647193, "loss": 0.0202, "step": 1034 }, { "epoch": 0.13673745747597185, "grad_norm": 0.20754919946193695, "learning_rate": 0.00019106260403667865, "loss": 0.0295, "step": 1035 }, { "epoch": 0.13686957096145588, "grad_norm": 0.2204241305589676, "learning_rate": 0.00019104542187927577, "loss": 0.0246, "step": 1036 }, { "epoch": 0.13700168444693991, "grad_norm": 0.2664002776145935, "learning_rate": 0.0001910282239954786, "loss": 0.0258, "step": 1037 }, { "epoch": 0.13713379793242395, "grad_norm": 0.26305004954338074, "learning_rate": 0.00019101101038825766, "loss": 0.0236, "step": 1038 }, { "epoch": 0.13726591141790798, "grad_norm": 0.1791495531797409, "learning_rate": 0.00019099378106058636, "loss": 0.0229, "step": 1039 }, { "epoch": 0.137398024903392, "grad_norm": 0.2402389943599701, "learning_rate": 0.00019097653601544073, "loss": 0.0237, "step": 1040 }, { "epoch": 0.13753013838887604, "grad_norm": 0.24972432851791382, "learning_rate": 0.00019095927525579948, "loss": 0.0289, "step": 1041 }, { "epoch": 0.13766225187436007, "grad_norm": 0.18227741122245789, "learning_rate": 0.00019094199878464413, "loss": 0.0183, "step": 1042 }, { "epoch": 0.1377943653598441, "grad_norm": 0.2381259799003601, "learning_rate": 0.00019092470660495887, "loss": 0.0327, "step": 1043 }, { "epoch": 0.13792647884532813, "grad_norm": 0.2130383551120758, "learning_rate": 0.0001909073987197306, "loss": 0.0251, "step": 1044 }, { "epoch": 0.13805859233081216, "grad_norm": 0.2830989360809326, "learning_rate": 0.0001908900751319489, "loss": 0.0377, "step": 1045 }, { "epoch": 0.1381907058162962, "grad_norm": 0.2945898175239563, "learning_rate": 0.0001908727358446061, "loss": 0.0294, "step": 1046 }, { "epoch": 0.13832281930178023, "grad_norm": 0.23683592677116394, "learning_rate": 0.00019085538086069728, "loss": 0.0188, "step": 1047 }, { "epoch": 0.13845493278726426, "grad_norm": 0.24602121114730835, "learning_rate": 0.0001908380101832202, "loss": 0.0376, "step": 1048 }, { "epoch": 0.1385870462727483, "grad_norm": 0.4324703514575958, "learning_rate": 0.00019082062381517524, "loss": 0.0461, "step": 1049 }, { "epoch": 0.13871915975823232, "grad_norm": 0.28953877091407776, "learning_rate": 0.00019080322175956562, "loss": 0.036, "step": 1050 }, { "epoch": 0.13885127324371635, "grad_norm": 0.1999170482158661, "learning_rate": 0.0001907858040193972, "loss": 0.0326, "step": 1051 }, { "epoch": 0.13898338672920038, "grad_norm": 0.15232303738594055, "learning_rate": 0.0001907683705976786, "loss": 0.0173, "step": 1052 }, { "epoch": 0.1391155002146844, "grad_norm": 0.3097558319568634, "learning_rate": 0.00019075092149742112, "loss": 0.0311, "step": 1053 }, { "epoch": 0.13924761370016844, "grad_norm": 0.1991742104291916, "learning_rate": 0.0001907334567216387, "loss": 0.0302, "step": 1054 }, { "epoch": 0.13937972718565247, "grad_norm": 0.19482475519180298, "learning_rate": 0.00019071597627334815, "loss": 0.031, "step": 1055 }, { "epoch": 0.1395118406711365, "grad_norm": 0.18483558297157288, "learning_rate": 0.00019069848015556878, "loss": 0.0259, "step": 1056 }, { "epoch": 0.13964395415662054, "grad_norm": 0.3096649944782257, "learning_rate": 0.00019068096837132284, "loss": 0.0278, "step": 1057 }, { "epoch": 0.13977606764210457, "grad_norm": 0.3025237023830414, "learning_rate": 0.00019066344092363507, "loss": 0.0363, "step": 1058 }, { "epoch": 0.1399081811275886, "grad_norm": 0.2409946471452713, "learning_rate": 0.00019064589781553305, "loss": 0.0188, "step": 1059 }, { "epoch": 0.14004029461307263, "grad_norm": 0.20228618383407593, "learning_rate": 0.00019062833905004697, "loss": 0.0317, "step": 1060 }, { "epoch": 0.14017240809855666, "grad_norm": 0.17695695161819458, "learning_rate": 0.00019061076463020986, "loss": 0.0269, "step": 1061 }, { "epoch": 0.1403045215840407, "grad_norm": 0.3224967420101166, "learning_rate": 0.0001905931745590573, "loss": 0.0245, "step": 1062 }, { "epoch": 0.14043663506952472, "grad_norm": 0.130575031042099, "learning_rate": 0.00019057556883962776, "loss": 0.0163, "step": 1063 }, { "epoch": 0.14056874855500875, "grad_norm": 0.272037535905838, "learning_rate": 0.00019055794747496215, "loss": 0.0361, "step": 1064 }, { "epoch": 0.14070086204049279, "grad_norm": 0.29713672399520874, "learning_rate": 0.00019054031046810433, "loss": 0.041, "step": 1065 }, { "epoch": 0.14083297552597682, "grad_norm": 0.2734525501728058, "learning_rate": 0.0001905226578221007, "loss": 0.0179, "step": 1066 }, { "epoch": 0.14096508901146085, "grad_norm": 0.32613930106163025, "learning_rate": 0.00019050498954000048, "loss": 0.0436, "step": 1067 }, { "epoch": 0.14109720249694488, "grad_norm": 0.168988898396492, "learning_rate": 0.00019048730562485554, "loss": 0.0198, "step": 1068 }, { "epoch": 0.1412293159824289, "grad_norm": 0.21649277210235596, "learning_rate": 0.00019046960607972037, "loss": 0.0283, "step": 1069 }, { "epoch": 0.14136142946791294, "grad_norm": 0.19166362285614014, "learning_rate": 0.00019045189090765232, "loss": 0.0256, "step": 1070 }, { "epoch": 0.14149354295339697, "grad_norm": 0.23602889478206635, "learning_rate": 0.0001904341601117113, "loss": 0.0366, "step": 1071 }, { "epoch": 0.141625656438881, "grad_norm": 0.2654229700565338, "learning_rate": 0.00019041641369496, "loss": 0.0415, "step": 1072 }, { "epoch": 0.14175776992436503, "grad_norm": 0.2024417668581009, "learning_rate": 0.00019039865166046378, "loss": 0.0247, "step": 1073 }, { "epoch": 0.14188988340984907, "grad_norm": 0.34355026483535767, "learning_rate": 0.00019038087401129067, "loss": 0.036, "step": 1074 }, { "epoch": 0.1420219968953331, "grad_norm": 0.1743433028459549, "learning_rate": 0.00019036308075051148, "loss": 0.033, "step": 1075 }, { "epoch": 0.14215411038081713, "grad_norm": 0.23234279453754425, "learning_rate": 0.00019034527188119962, "loss": 0.0418, "step": 1076 }, { "epoch": 0.14228622386630116, "grad_norm": 0.1761244535446167, "learning_rate": 0.00019032744740643125, "loss": 0.0185, "step": 1077 }, { "epoch": 0.1424183373517852, "grad_norm": 0.18785180151462555, "learning_rate": 0.00019030960732928522, "loss": 0.0242, "step": 1078 }, { "epoch": 0.14255045083726922, "grad_norm": 0.23817138373851776, "learning_rate": 0.0001902917516528431, "loss": 0.0253, "step": 1079 }, { "epoch": 0.14268256432275325, "grad_norm": 0.20377707481384277, "learning_rate": 0.00019027388038018902, "loss": 0.0314, "step": 1080 }, { "epoch": 0.14281467780823728, "grad_norm": 0.21786224842071533, "learning_rate": 0.00019025599351441002, "loss": 0.0321, "step": 1081 }, { "epoch": 0.14294679129372131, "grad_norm": 0.2601945400238037, "learning_rate": 0.00019023809105859569, "loss": 0.0333, "step": 1082 }, { "epoch": 0.14307890477920535, "grad_norm": 0.2133285105228424, "learning_rate": 0.00019022017301583834, "loss": 0.0418, "step": 1083 }, { "epoch": 0.14321101826468938, "grad_norm": 0.16249844431877136, "learning_rate": 0.00019020223938923296, "loss": 0.0197, "step": 1084 }, { "epoch": 0.1433431317501734, "grad_norm": 0.21031540632247925, "learning_rate": 0.00019018429018187723, "loss": 0.0285, "step": 1085 }, { "epoch": 0.14347524523565744, "grad_norm": 0.23109595477581024, "learning_rate": 0.00019016632539687163, "loss": 0.0222, "step": 1086 }, { "epoch": 0.14360735872114147, "grad_norm": 0.2368910163640976, "learning_rate": 0.00019014834503731915, "loss": 0.0315, "step": 1087 }, { "epoch": 0.1437394722066255, "grad_norm": 0.3795262277126312, "learning_rate": 0.00019013034910632558, "loss": 0.039, "step": 1088 }, { "epoch": 0.14387158569210953, "grad_norm": 0.20181721448898315, "learning_rate": 0.00019011233760699942, "loss": 0.0228, "step": 1089 }, { "epoch": 0.14400369917759356, "grad_norm": 0.26354295015335083, "learning_rate": 0.00019009431054245178, "loss": 0.0298, "step": 1090 }, { "epoch": 0.1441358126630776, "grad_norm": 0.3916509449481964, "learning_rate": 0.00019007626791579652, "loss": 0.0325, "step": 1091 }, { "epoch": 0.14426792614856163, "grad_norm": 0.29506227374076843, "learning_rate": 0.00019005820973015016, "loss": 0.0298, "step": 1092 }, { "epoch": 0.14440003963404566, "grad_norm": 0.35943931341171265, "learning_rate": 0.0001900401359886319, "loss": 0.033, "step": 1093 }, { "epoch": 0.1445321531195297, "grad_norm": 0.19208261370658875, "learning_rate": 0.00019002204669436369, "loss": 0.0207, "step": 1094 }, { "epoch": 0.14466426660501372, "grad_norm": 0.22459660470485687, "learning_rate": 0.00019000394185047004, "loss": 0.0358, "step": 1095 }, { "epoch": 0.14479638009049775, "grad_norm": 0.27884823083877563, "learning_rate": 0.00018998582146007825, "loss": 0.0403, "step": 1096 }, { "epoch": 0.14492849357598178, "grad_norm": 0.45688390731811523, "learning_rate": 0.0001899676855263183, "loss": 0.0349, "step": 1097 }, { "epoch": 0.1450606070614658, "grad_norm": 0.28939759731292725, "learning_rate": 0.00018994953405232287, "loss": 0.0439, "step": 1098 }, { "epoch": 0.14519272054694982, "grad_norm": 0.3789207339286804, "learning_rate": 0.0001899313670412272, "loss": 0.0408, "step": 1099 }, { "epoch": 0.14532483403243385, "grad_norm": 0.2404397577047348, "learning_rate": 0.00018991318449616937, "loss": 0.0255, "step": 1100 }, { "epoch": 0.14545694751791788, "grad_norm": 0.2406584769487381, "learning_rate": 0.00018989498642029004, "loss": 0.0301, "step": 1101 }, { "epoch": 0.1455890610034019, "grad_norm": 0.20736750960350037, "learning_rate": 0.0001898767728167326, "loss": 0.0161, "step": 1102 }, { "epoch": 0.14572117448888594, "grad_norm": 0.24288155138492584, "learning_rate": 0.0001898585436886431, "loss": 0.0239, "step": 1103 }, { "epoch": 0.14585328797436997, "grad_norm": 0.2473408728837967, "learning_rate": 0.00018984029903917026, "loss": 0.0272, "step": 1104 }, { "epoch": 0.145985401459854, "grad_norm": 0.22904950380325317, "learning_rate": 0.00018982203887146556, "loss": 0.031, "step": 1105 }, { "epoch": 0.14611751494533803, "grad_norm": 0.23573671281337738, "learning_rate": 0.0001898037631886831, "loss": 0.0296, "step": 1106 }, { "epoch": 0.14624962843082207, "grad_norm": 0.2947499752044678, "learning_rate": 0.00018978547199397959, "loss": 0.0485, "step": 1107 }, { "epoch": 0.1463817419163061, "grad_norm": 0.2720722556114197, "learning_rate": 0.00018976716529051454, "loss": 0.0326, "step": 1108 }, { "epoch": 0.14651385540179013, "grad_norm": 0.20816238224506378, "learning_rate": 0.0001897488430814501, "loss": 0.0231, "step": 1109 }, { "epoch": 0.14664596888727416, "grad_norm": 0.291048526763916, "learning_rate": 0.0001897305053699511, "loss": 0.0399, "step": 1110 }, { "epoch": 0.1467780823727582, "grad_norm": 0.15667644143104553, "learning_rate": 0.00018971215215918497, "loss": 0.0122, "step": 1111 }, { "epoch": 0.14691019585824222, "grad_norm": 0.23087377846240997, "learning_rate": 0.00018969378345232193, "loss": 0.0273, "step": 1112 }, { "epoch": 0.14704230934372625, "grad_norm": 0.2081710398197174, "learning_rate": 0.00018967539925253486, "loss": 0.0376, "step": 1113 }, { "epoch": 0.14717442282921028, "grad_norm": 0.2385694682598114, "learning_rate": 0.00018965699956299923, "loss": 0.0333, "step": 1114 }, { "epoch": 0.14730653631469431, "grad_norm": 0.25243672728538513, "learning_rate": 0.00018963858438689326, "loss": 0.0304, "step": 1115 }, { "epoch": 0.14743864980017835, "grad_norm": 0.1876525729894638, "learning_rate": 0.00018962015372739788, "loss": 0.0178, "step": 1116 }, { "epoch": 0.14757076328566238, "grad_norm": 0.20586282014846802, "learning_rate": 0.00018960170758769654, "loss": 0.0342, "step": 1117 }, { "epoch": 0.1477028767711464, "grad_norm": 0.17124402523040771, "learning_rate": 0.00018958324597097555, "loss": 0.0182, "step": 1118 }, { "epoch": 0.14783499025663044, "grad_norm": 0.165056049823761, "learning_rate": 0.00018956476888042377, "loss": 0.0146, "step": 1119 }, { "epoch": 0.14796710374211447, "grad_norm": 0.24566829204559326, "learning_rate": 0.00018954627631923279, "loss": 0.0296, "step": 1120 }, { "epoch": 0.1480992172275985, "grad_norm": 0.2505076825618744, "learning_rate": 0.00018952776829059685, "loss": 0.033, "step": 1121 }, { "epoch": 0.14823133071308253, "grad_norm": 0.22813905775547028, "learning_rate": 0.00018950924479771287, "loss": 0.0262, "step": 1122 }, { "epoch": 0.14836344419856656, "grad_norm": 0.18311943113803864, "learning_rate": 0.0001894907058437804, "loss": 0.0256, "step": 1123 }, { "epoch": 0.1484955576840506, "grad_norm": 0.2090534269809723, "learning_rate": 0.00018947215143200175, "loss": 0.0286, "step": 1124 }, { "epoch": 0.14862767116953463, "grad_norm": 0.5938106775283813, "learning_rate": 0.00018945358156558184, "loss": 0.0517, "step": 1125 }, { "epoch": 0.14875978465501866, "grad_norm": 0.371894896030426, "learning_rate": 0.0001894349962477282, "loss": 0.0224, "step": 1126 }, { "epoch": 0.1488918981405027, "grad_norm": 0.22545740008354187, "learning_rate": 0.0001894163954816512, "loss": 0.032, "step": 1127 }, { "epoch": 0.14902401162598672, "grad_norm": 0.23922865092754364, "learning_rate": 0.00018939777927056372, "loss": 0.0317, "step": 1128 }, { "epoch": 0.14915612511147075, "grad_norm": 0.3410808742046356, "learning_rate": 0.00018937914761768133, "loss": 0.0278, "step": 1129 }, { "epoch": 0.14928823859695478, "grad_norm": 0.2365533411502838, "learning_rate": 0.00018936050052622237, "loss": 0.0323, "step": 1130 }, { "epoch": 0.1494203520824388, "grad_norm": 0.26630455255508423, "learning_rate": 0.00018934183799940773, "loss": 0.0247, "step": 1131 }, { "epoch": 0.14955246556792284, "grad_norm": 0.2577216625213623, "learning_rate": 0.00018932316004046103, "loss": 0.0396, "step": 1132 }, { "epoch": 0.14968457905340687, "grad_norm": 0.20923064649105072, "learning_rate": 0.00018930446665260854, "loss": 0.0277, "step": 1133 }, { "epoch": 0.1498166925388909, "grad_norm": 0.24694538116455078, "learning_rate": 0.00018928575783907914, "loss": 0.0364, "step": 1134 }, { "epoch": 0.14994880602437494, "grad_norm": 0.28028610348701477, "learning_rate": 0.00018926703360310453, "loss": 0.0367, "step": 1135 }, { "epoch": 0.15008091950985897, "grad_norm": 0.4478507936000824, "learning_rate": 0.00018924829394791886, "loss": 0.0308, "step": 1136 }, { "epoch": 0.150213032995343, "grad_norm": 0.3255590796470642, "learning_rate": 0.00018922953887675915, "loss": 0.0404, "step": 1137 }, { "epoch": 0.15034514648082703, "grad_norm": 0.18644562363624573, "learning_rate": 0.00018921076839286495, "loss": 0.0197, "step": 1138 }, { "epoch": 0.15047725996631106, "grad_norm": 0.2694709897041321, "learning_rate": 0.00018919198249947846, "loss": 0.015, "step": 1139 }, { "epoch": 0.1506093734517951, "grad_norm": 0.23030641674995422, "learning_rate": 0.00018917318119984468, "loss": 0.0326, "step": 1140 }, { "epoch": 0.15074148693727912, "grad_norm": 0.2648636996746063, "learning_rate": 0.00018915436449721117, "loss": 0.0452, "step": 1141 }, { "epoch": 0.15087360042276315, "grad_norm": 0.28663501143455505, "learning_rate": 0.0001891355323948281, "loss": 0.0368, "step": 1142 }, { "epoch": 0.15100571390824719, "grad_norm": 0.21681685745716095, "learning_rate": 0.00018911668489594838, "loss": 0.0222, "step": 1143 }, { "epoch": 0.15113782739373122, "grad_norm": 0.3494594693183899, "learning_rate": 0.00018909782200382763, "loss": 0.0212, "step": 1144 }, { "epoch": 0.15126994087921525, "grad_norm": 0.32084718346595764, "learning_rate": 0.000189078943721724, "loss": 0.0402, "step": 1145 }, { "epoch": 0.15140205436469928, "grad_norm": 0.23676195740699768, "learning_rate": 0.00018906005005289836, "loss": 0.0324, "step": 1146 }, { "epoch": 0.1515341678501833, "grad_norm": 0.16447904706001282, "learning_rate": 0.00018904114100061424, "loss": 0.0174, "step": 1147 }, { "epoch": 0.15166628133566734, "grad_norm": 0.24322609603405, "learning_rate": 0.0001890222165681379, "loss": 0.0327, "step": 1148 }, { "epoch": 0.15179839482115137, "grad_norm": 0.21443983912467957, "learning_rate": 0.00018900327675873806, "loss": 0.035, "step": 1149 }, { "epoch": 0.1519305083066354, "grad_norm": 0.1838637739419937, "learning_rate": 0.0001889843215756863, "loss": 0.0267, "step": 1150 }, { "epoch": 0.15206262179211943, "grad_norm": 0.2316933572292328, "learning_rate": 0.00018896535102225673, "loss": 0.0266, "step": 1151 }, { "epoch": 0.15219473527760347, "grad_norm": 0.25021079182624817, "learning_rate": 0.0001889463651017262, "loss": 0.0207, "step": 1152 }, { "epoch": 0.1523268487630875, "grad_norm": 0.28480058908462524, "learning_rate": 0.00018892736381737418, "loss": 0.0331, "step": 1153 }, { "epoch": 0.15245896224857153, "grad_norm": 0.2881741523742676, "learning_rate": 0.0001889083471724827, "loss": 0.0202, "step": 1154 }, { "epoch": 0.15259107573405556, "grad_norm": 0.31690138578414917, "learning_rate": 0.00018888931517033663, "loss": 0.0391, "step": 1155 }, { "epoch": 0.1527231892195396, "grad_norm": 0.1786164790391922, "learning_rate": 0.00018887026781422338, "loss": 0.0304, "step": 1156 }, { "epoch": 0.15285530270502362, "grad_norm": 0.15002429485321045, "learning_rate": 0.00018885120510743296, "loss": 0.0163, "step": 1157 }, { "epoch": 0.15298741619050765, "grad_norm": 0.1841292530298233, "learning_rate": 0.00018883212705325813, "loss": 0.0276, "step": 1158 }, { "epoch": 0.15311952967599168, "grad_norm": 0.3281814157962799, "learning_rate": 0.00018881303365499426, "loss": 0.0307, "step": 1159 }, { "epoch": 0.15325164316147571, "grad_norm": 0.30686184763908386, "learning_rate": 0.00018879392491593942, "loss": 0.0349, "step": 1160 }, { "epoch": 0.15338375664695975, "grad_norm": 0.19830575585365295, "learning_rate": 0.00018877480083939424, "loss": 0.0247, "step": 1161 }, { "epoch": 0.15351587013244378, "grad_norm": 0.2402205914258957, "learning_rate": 0.00018875566142866204, "loss": 0.0235, "step": 1162 }, { "epoch": 0.1536479836179278, "grad_norm": 0.29031792283058167, "learning_rate": 0.00018873650668704882, "loss": 0.035, "step": 1163 }, { "epoch": 0.15378009710341184, "grad_norm": 0.24618452787399292, "learning_rate": 0.00018871733661786325, "loss": 0.0308, "step": 1164 }, { "epoch": 0.15391221058889587, "grad_norm": 0.2671643793582916, "learning_rate": 0.0001886981512244165, "loss": 0.0331, "step": 1165 }, { "epoch": 0.1540443240743799, "grad_norm": 0.2721196711063385, "learning_rate": 0.00018867895051002256, "loss": 0.037, "step": 1166 }, { "epoch": 0.15417643755986393, "grad_norm": 0.32829850912094116, "learning_rate": 0.000188659734477998, "loss": 0.0387, "step": 1167 }, { "epoch": 0.15430855104534796, "grad_norm": 0.14274796843528748, "learning_rate": 0.00018864050313166194, "loss": 0.0235, "step": 1168 }, { "epoch": 0.154440664530832, "grad_norm": 0.25783032178878784, "learning_rate": 0.00018862125647433635, "loss": 0.03, "step": 1169 }, { "epoch": 0.15457277801631603, "grad_norm": 0.1943584531545639, "learning_rate": 0.00018860199450934566, "loss": 0.035, "step": 1170 }, { "epoch": 0.15470489150180006, "grad_norm": 0.2702155113220215, "learning_rate": 0.00018858271724001707, "loss": 0.0349, "step": 1171 }, { "epoch": 0.1548370049872841, "grad_norm": 0.34101739525794983, "learning_rate": 0.00018856342466968027, "loss": 0.0453, "step": 1172 }, { "epoch": 0.15496911847276812, "grad_norm": 0.23040156066417694, "learning_rate": 0.0001885441168016678, "loss": 0.0289, "step": 1173 }, { "epoch": 0.15510123195825215, "grad_norm": 0.26681602001190186, "learning_rate": 0.00018852479363931467, "loss": 0.0434, "step": 1174 }, { "epoch": 0.15523334544373618, "grad_norm": 0.1774115115404129, "learning_rate": 0.00018850545518595859, "loss": 0.0255, "step": 1175 }, { "epoch": 0.1553654589292202, "grad_norm": 0.24246004223823547, "learning_rate": 0.00018848610144493996, "loss": 0.0279, "step": 1176 }, { "epoch": 0.15549757241470424, "grad_norm": 0.365506112575531, "learning_rate": 0.00018846673241960176, "loss": 0.043, "step": 1177 }, { "epoch": 0.15562968590018827, "grad_norm": 0.3292416036128998, "learning_rate": 0.0001884473481132896, "loss": 0.0359, "step": 1178 }, { "epoch": 0.1557617993856723, "grad_norm": 0.28295496106147766, "learning_rate": 0.00018842794852935181, "loss": 0.0494, "step": 1179 }, { "epoch": 0.15589391287115634, "grad_norm": 0.19957217574119568, "learning_rate": 0.00018840853367113925, "loss": 0.0199, "step": 1180 }, { "epoch": 0.15602602635664034, "grad_norm": 0.251113623380661, "learning_rate": 0.0001883891035420055, "loss": 0.0367, "step": 1181 }, { "epoch": 0.15615813984212437, "grad_norm": 0.31150946021080017, "learning_rate": 0.00018836965814530675, "loss": 0.0312, "step": 1182 }, { "epoch": 0.1562902533276084, "grad_norm": 0.20670191943645477, "learning_rate": 0.00018835019748440185, "loss": 0.0236, "step": 1183 }, { "epoch": 0.15642236681309243, "grad_norm": 0.24962063133716583, "learning_rate": 0.0001883307215626522, "loss": 0.037, "step": 1184 }, { "epoch": 0.15655448029857646, "grad_norm": 0.24450640380382538, "learning_rate": 0.00018831123038342195, "loss": 0.0317, "step": 1185 }, { "epoch": 0.1566865937840605, "grad_norm": 0.1726607382297516, "learning_rate": 0.00018829172395007783, "loss": 0.0251, "step": 1186 }, { "epoch": 0.15681870726954453, "grad_norm": 0.22054699063301086, "learning_rate": 0.00018827220226598924, "loss": 0.0312, "step": 1187 }, { "epoch": 0.15695082075502856, "grad_norm": 0.1949966996908188, "learning_rate": 0.00018825266533452814, "loss": 0.0213, "step": 1188 }, { "epoch": 0.1570829342405126, "grad_norm": 0.27526164054870605, "learning_rate": 0.0001882331131590692, "loss": 0.0274, "step": 1189 }, { "epoch": 0.15721504772599662, "grad_norm": 0.25454092025756836, "learning_rate": 0.00018821354574298965, "loss": 0.0216, "step": 1190 }, { "epoch": 0.15734716121148065, "grad_norm": 0.18084175884723663, "learning_rate": 0.00018819396308966945, "loss": 0.02, "step": 1191 }, { "epoch": 0.15747927469696468, "grad_norm": 0.19691844284534454, "learning_rate": 0.00018817436520249112, "loss": 0.0194, "step": 1192 }, { "epoch": 0.15761138818244871, "grad_norm": 0.26010459661483765, "learning_rate": 0.0001881547520848398, "loss": 0.0266, "step": 1193 }, { "epoch": 0.15774350166793275, "grad_norm": 0.17826038599014282, "learning_rate": 0.00018813512374010332, "loss": 0.0216, "step": 1194 }, { "epoch": 0.15787561515341678, "grad_norm": 0.34981799125671387, "learning_rate": 0.0001881154801716721, "loss": 0.0398, "step": 1195 }, { "epoch": 0.1580077286389008, "grad_norm": 0.23868107795715332, "learning_rate": 0.00018809582138293922, "loss": 0.034, "step": 1196 }, { "epoch": 0.15813984212438484, "grad_norm": 0.23763948678970337, "learning_rate": 0.00018807614737730033, "loss": 0.0251, "step": 1197 }, { "epoch": 0.15827195560986887, "grad_norm": 0.25793859362602234, "learning_rate": 0.00018805645815815373, "loss": 0.0305, "step": 1198 }, { "epoch": 0.1584040690953529, "grad_norm": 0.1698640137910843, "learning_rate": 0.00018803675372890046, "loss": 0.0217, "step": 1199 }, { "epoch": 0.15853618258083693, "grad_norm": 0.20204704999923706, "learning_rate": 0.000188017034092944, "loss": 0.021, "step": 1200 }, { "epoch": 0.15866829606632096, "grad_norm": 0.22710919380187988, "learning_rate": 0.00018799729925369056, "loss": 0.0155, "step": 1201 }, { "epoch": 0.158800409551805, "grad_norm": 0.4746352732181549, "learning_rate": 0.000187977549214549, "loss": 0.0268, "step": 1202 }, { "epoch": 0.15893252303728903, "grad_norm": 0.3901737630367279, "learning_rate": 0.00018795778397893078, "loss": 0.0314, "step": 1203 }, { "epoch": 0.15906463652277306, "grad_norm": 0.22661690413951874, "learning_rate": 0.0001879380035502499, "loss": 0.0375, "step": 1204 }, { "epoch": 0.1591967500082571, "grad_norm": 0.25800031423568726, "learning_rate": 0.00018791820793192314, "loss": 0.0309, "step": 1205 }, { "epoch": 0.15932886349374112, "grad_norm": 0.20021669566631317, "learning_rate": 0.0001878983971273698, "loss": 0.0191, "step": 1206 }, { "epoch": 0.15946097697922515, "grad_norm": 0.2079436480998993, "learning_rate": 0.00018787857114001177, "loss": 0.018, "step": 1207 }, { "epoch": 0.15959309046470918, "grad_norm": 0.22843880951404572, "learning_rate": 0.0001878587299732737, "loss": 0.0241, "step": 1208 }, { "epoch": 0.1597252039501932, "grad_norm": 0.20756269991397858, "learning_rate": 0.00018783887363058274, "loss": 0.0337, "step": 1209 }, { "epoch": 0.15985731743567724, "grad_norm": 0.2768935561180115, "learning_rate": 0.0001878190021153687, "loss": 0.0383, "step": 1210 }, { "epoch": 0.15998943092116127, "grad_norm": 0.2520005702972412, "learning_rate": 0.00018779911543106406, "loss": 0.0373, "step": 1211 }, { "epoch": 0.1601215444066453, "grad_norm": 0.2929930090904236, "learning_rate": 0.0001877792135811038, "loss": 0.0303, "step": 1212 }, { "epoch": 0.16025365789212934, "grad_norm": 0.23010598123073578, "learning_rate": 0.0001877592965689256, "loss": 0.0313, "step": 1213 }, { "epoch": 0.16038577137761337, "grad_norm": 0.21887648105621338, "learning_rate": 0.0001877393643979698, "loss": 0.0317, "step": 1214 }, { "epoch": 0.1605178848630974, "grad_norm": 0.35093334317207336, "learning_rate": 0.0001877194170716793, "loss": 0.0377, "step": 1215 }, { "epoch": 0.16064999834858143, "grad_norm": 0.20710135996341705, "learning_rate": 0.00018769945459349964, "loss": 0.034, "step": 1216 }, { "epoch": 0.16078211183406546, "grad_norm": 0.1625404953956604, "learning_rate": 0.00018767947696687887, "loss": 0.0175, "step": 1217 }, { "epoch": 0.1609142253195495, "grad_norm": 0.2056693285703659, "learning_rate": 0.00018765948419526788, "loss": 0.0203, "step": 1218 }, { "epoch": 0.16104633880503352, "grad_norm": 0.15321901440620422, "learning_rate": 0.00018763947628211994, "loss": 0.0294, "step": 1219 }, { "epoch": 0.16117845229051755, "grad_norm": 0.19081968069076538, "learning_rate": 0.0001876194532308911, "loss": 0.0153, "step": 1220 }, { "epoch": 0.16131056577600159, "grad_norm": 0.19270607829093933, "learning_rate": 0.00018759941504504002, "loss": 0.0311, "step": 1221 }, { "epoch": 0.16144267926148562, "grad_norm": 0.20099586248397827, "learning_rate": 0.0001875793617280278, "loss": 0.0359, "step": 1222 }, { "epoch": 0.16157479274696965, "grad_norm": 0.20773924887180328, "learning_rate": 0.00018755929328331835, "loss": 0.0222, "step": 1223 }, { "epoch": 0.16170690623245368, "grad_norm": 0.256315141916275, "learning_rate": 0.00018753920971437813, "loss": 0.0383, "step": 1224 }, { "epoch": 0.1618390197179377, "grad_norm": 0.20451293885707855, "learning_rate": 0.00018751911102467614, "loss": 0.0281, "step": 1225 }, { "epoch": 0.16197113320342174, "grad_norm": 0.20011462271213531, "learning_rate": 0.0001874989972176841, "loss": 0.0287, "step": 1226 }, { "epoch": 0.16210324668890577, "grad_norm": 0.2945924997329712, "learning_rate": 0.00018747886829687628, "loss": 0.0389, "step": 1227 }, { "epoch": 0.1622353601743898, "grad_norm": 0.1852794885635376, "learning_rate": 0.00018745872426572958, "loss": 0.0214, "step": 1228 }, { "epoch": 0.16236747365987383, "grad_norm": 0.25254741311073303, "learning_rate": 0.0001874385651277235, "loss": 0.0278, "step": 1229 }, { "epoch": 0.16249958714535787, "grad_norm": 0.22638756036758423, "learning_rate": 0.00018741839088634018, "loss": 0.0268, "step": 1230 }, { "epoch": 0.1626317006308419, "grad_norm": 0.27086418867111206, "learning_rate": 0.0001873982015450643, "loss": 0.0394, "step": 1231 }, { "epoch": 0.16276381411632593, "grad_norm": 0.1940222531557083, "learning_rate": 0.00018737799710738325, "loss": 0.0308, "step": 1232 }, { "epoch": 0.16289592760180996, "grad_norm": 0.26498880982398987, "learning_rate": 0.00018735777757678687, "loss": 0.0164, "step": 1233 }, { "epoch": 0.163028041087294, "grad_norm": 0.2891830503940582, "learning_rate": 0.00018733754295676777, "loss": 0.0367, "step": 1234 }, { "epoch": 0.16316015457277802, "grad_norm": 0.32519957423210144, "learning_rate": 0.00018731729325082114, "loss": 0.0219, "step": 1235 }, { "epoch": 0.16329226805826205, "grad_norm": 0.495128333568573, "learning_rate": 0.00018729702846244467, "loss": 0.03, "step": 1236 }, { "epoch": 0.16342438154374608, "grad_norm": 0.16794808208942413, "learning_rate": 0.00018727674859513879, "loss": 0.0142, "step": 1237 }, { "epoch": 0.16355649502923011, "grad_norm": 0.40994521975517273, "learning_rate": 0.00018725645365240637, "loss": 0.037, "step": 1238 }, { "epoch": 0.16368860851471415, "grad_norm": 0.21588881313800812, "learning_rate": 0.00018723614363775303, "loss": 0.0215, "step": 1239 }, { "epoch": 0.16382072200019818, "grad_norm": 0.21787334978580475, "learning_rate": 0.00018721581855468702, "loss": 0.0256, "step": 1240 }, { "epoch": 0.1639528354856822, "grad_norm": 0.31683409214019775, "learning_rate": 0.000187195478406719, "loss": 0.0261, "step": 1241 }, { "epoch": 0.16408494897116624, "grad_norm": 0.18830211460590363, "learning_rate": 0.00018717512319736242, "loss": 0.0153, "step": 1242 }, { "epoch": 0.16421706245665027, "grad_norm": 0.231922909617424, "learning_rate": 0.00018715475293013326, "loss": 0.0333, "step": 1243 }, { "epoch": 0.1643491759421343, "grad_norm": 0.28003785014152527, "learning_rate": 0.00018713436760855006, "loss": 0.0328, "step": 1244 }, { "epoch": 0.16448128942761833, "grad_norm": 0.24703218042850494, "learning_rate": 0.00018711396723613402, "loss": 0.024, "step": 1245 }, { "epoch": 0.16461340291310236, "grad_norm": 0.2914784550666809, "learning_rate": 0.00018709355181640897, "loss": 0.0313, "step": 1246 }, { "epoch": 0.1647455163985864, "grad_norm": 0.20224568247795105, "learning_rate": 0.00018707312135290125, "loss": 0.0334, "step": 1247 }, { "epoch": 0.16487762988407043, "grad_norm": 0.2872107923030853, "learning_rate": 0.00018705267584913983, "loss": 0.0312, "step": 1248 }, { "epoch": 0.16500974336955446, "grad_norm": 0.3120299279689789, "learning_rate": 0.00018703221530865633, "loss": 0.0343, "step": 1249 }, { "epoch": 0.1651418568550385, "grad_norm": 0.23731586337089539, "learning_rate": 0.00018701173973498486, "loss": 0.0277, "step": 1250 }, { "epoch": 0.16527397034052252, "grad_norm": 0.3275063633918762, "learning_rate": 0.00018699124913166228, "loss": 0.0274, "step": 1251 }, { "epoch": 0.16540608382600655, "grad_norm": 0.17656168341636658, "learning_rate": 0.00018697074350222786, "loss": 0.0246, "step": 1252 }, { "epoch": 0.16553819731149058, "grad_norm": 0.27418434619903564, "learning_rate": 0.00018695022285022364, "loss": 0.0419, "step": 1253 }, { "epoch": 0.1656703107969746, "grad_norm": 0.26168766617774963, "learning_rate": 0.0001869296871791942, "loss": 0.0262, "step": 1254 }, { "epoch": 0.16580242428245864, "grad_norm": 0.21798458695411682, "learning_rate": 0.0001869091364926866, "loss": 0.0245, "step": 1255 }, { "epoch": 0.16593453776794267, "grad_norm": 0.4269181191921234, "learning_rate": 0.00018688857079425064, "loss": 0.0287, "step": 1256 }, { "epoch": 0.1660666512534267, "grad_norm": 0.35198020935058594, "learning_rate": 0.00018686799008743864, "loss": 0.0269, "step": 1257 }, { "epoch": 0.16619876473891074, "grad_norm": 0.2965468168258667, "learning_rate": 0.00018684739437580555, "loss": 0.0391, "step": 1258 }, { "epoch": 0.16633087822439477, "grad_norm": 0.2700027823448181, "learning_rate": 0.00018682678366290894, "loss": 0.0316, "step": 1259 }, { "epoch": 0.1664629917098788, "grad_norm": 0.27826032042503357, "learning_rate": 0.0001868061579523088, "loss": 0.043, "step": 1260 }, { "epoch": 0.16659510519536283, "grad_norm": 0.22224481403827667, "learning_rate": 0.00018678551724756796, "loss": 0.0281, "step": 1261 }, { "epoch": 0.16672721868084683, "grad_norm": 0.23873165249824524, "learning_rate": 0.00018676486155225168, "loss": 0.0385, "step": 1262 }, { "epoch": 0.16685933216633086, "grad_norm": 0.16569150984287262, "learning_rate": 0.0001867441908699278, "loss": 0.0256, "step": 1263 }, { "epoch": 0.1669914456518149, "grad_norm": 0.29517295956611633, "learning_rate": 0.00018672350520416683, "loss": 0.0202, "step": 1264 }, { "epoch": 0.16712355913729893, "grad_norm": 0.2139241248369217, "learning_rate": 0.00018670280455854185, "loss": 0.0368, "step": 1265 }, { "epoch": 0.16725567262278296, "grad_norm": 0.25826069712638855, "learning_rate": 0.0001866820889366285, "loss": 0.0276, "step": 1266 }, { "epoch": 0.167387786108267, "grad_norm": 0.205109640955925, "learning_rate": 0.000186661358342005, "loss": 0.0233, "step": 1267 }, { "epoch": 0.16751989959375102, "grad_norm": 0.16500066220760345, "learning_rate": 0.00018664061277825212, "loss": 0.0273, "step": 1268 }, { "epoch": 0.16765201307923505, "grad_norm": 0.38416817784309387, "learning_rate": 0.00018661985224895339, "loss": 0.0339, "step": 1269 }, { "epoch": 0.16778412656471908, "grad_norm": 0.3248102366924286, "learning_rate": 0.0001865990767576947, "loss": 0.0314, "step": 1270 }, { "epoch": 0.1679162400502031, "grad_norm": 0.3818456828594208, "learning_rate": 0.00018657828630806467, "loss": 0.0226, "step": 1271 }, { "epoch": 0.16804835353568714, "grad_norm": 0.22842957079410553, "learning_rate": 0.00018655748090365445, "loss": 0.039, "step": 1272 }, { "epoch": 0.16818046702117118, "grad_norm": 0.18327449262142181, "learning_rate": 0.00018653666054805785, "loss": 0.0211, "step": 1273 }, { "epoch": 0.1683125805066552, "grad_norm": 0.31490403413772583, "learning_rate": 0.0001865158252448711, "loss": 0.0211, "step": 1274 }, { "epoch": 0.16844469399213924, "grad_norm": 0.23804102838039398, "learning_rate": 0.00018649497499769314, "loss": 0.0231, "step": 1275 }, { "epoch": 0.16857680747762327, "grad_norm": 0.17960631847381592, "learning_rate": 0.0001864741098101255, "loss": 0.0226, "step": 1276 }, { "epoch": 0.1687089209631073, "grad_norm": 0.20831550657749176, "learning_rate": 0.00018645322968577216, "loss": 0.0265, "step": 1277 }, { "epoch": 0.16884103444859133, "grad_norm": 0.19507895410060883, "learning_rate": 0.00018643233462823988, "loss": 0.0215, "step": 1278 }, { "epoch": 0.16897314793407536, "grad_norm": 0.19099454581737518, "learning_rate": 0.00018641142464113783, "loss": 0.0239, "step": 1279 }, { "epoch": 0.1691052614195594, "grad_norm": 0.32890570163726807, "learning_rate": 0.00018639049972807783, "loss": 0.0375, "step": 1280 }, { "epoch": 0.16923737490504343, "grad_norm": 0.17816013097763062, "learning_rate": 0.00018636955989267427, "loss": 0.0131, "step": 1281 }, { "epoch": 0.16936948839052746, "grad_norm": 0.2211841195821762, "learning_rate": 0.00018634860513854412, "loss": 0.0266, "step": 1282 }, { "epoch": 0.1695016018760115, "grad_norm": 0.3327048420906067, "learning_rate": 0.00018632763546930692, "loss": 0.0431, "step": 1283 }, { "epoch": 0.16963371536149552, "grad_norm": 0.3703952133655548, "learning_rate": 0.00018630665088858477, "loss": 0.0515, "step": 1284 }, { "epoch": 0.16976582884697955, "grad_norm": 0.4615309536457062, "learning_rate": 0.0001862856514000024, "loss": 0.023, "step": 1285 }, { "epoch": 0.16989794233246358, "grad_norm": 0.23466283082962036, "learning_rate": 0.00018626463700718705, "loss": 0.0348, "step": 1286 }, { "epoch": 0.1700300558179476, "grad_norm": 0.17402033507823944, "learning_rate": 0.00018624360771376855, "loss": 0.0198, "step": 1287 }, { "epoch": 0.17016216930343164, "grad_norm": 0.2167549729347229, "learning_rate": 0.00018622256352337935, "loss": 0.0186, "step": 1288 }, { "epoch": 0.17029428278891567, "grad_norm": 0.20165550708770752, "learning_rate": 0.00018620150443965442, "loss": 0.0208, "step": 1289 }, { "epoch": 0.1704263962743997, "grad_norm": 0.29167264699935913, "learning_rate": 0.00018618043046623136, "loss": 0.0407, "step": 1290 }, { "epoch": 0.17055850975988374, "grad_norm": 0.23466479778289795, "learning_rate": 0.00018615934160675024, "loss": 0.0316, "step": 1291 }, { "epoch": 0.17069062324536777, "grad_norm": 0.16358260810375214, "learning_rate": 0.00018613823786485382, "loss": 0.0139, "step": 1292 }, { "epoch": 0.1708227367308518, "grad_norm": 0.228166401386261, "learning_rate": 0.00018611711924418733, "loss": 0.0328, "step": 1293 }, { "epoch": 0.17095485021633583, "grad_norm": 0.18310043215751648, "learning_rate": 0.00018609598574839868, "loss": 0.022, "step": 1294 }, { "epoch": 0.17108696370181986, "grad_norm": 0.27457547187805176, "learning_rate": 0.00018607483738113825, "loss": 0.0203, "step": 1295 }, { "epoch": 0.1712190771873039, "grad_norm": 0.2539723217487335, "learning_rate": 0.000186053674146059, "loss": 0.0149, "step": 1296 }, { "epoch": 0.17135119067278792, "grad_norm": 0.26341545581817627, "learning_rate": 0.00018603249604681653, "loss": 0.0463, "step": 1297 }, { "epoch": 0.17148330415827195, "grad_norm": 0.9475441575050354, "learning_rate": 0.00018601130308706896, "loss": 0.0245, "step": 1298 }, { "epoch": 0.17161541764375599, "grad_norm": 0.2520473599433899, "learning_rate": 0.00018599009527047692, "loss": 0.0324, "step": 1299 }, { "epoch": 0.17174753112924002, "grad_norm": 0.201473668217659, "learning_rate": 0.00018596887260070375, "loss": 0.0163, "step": 1300 }, { "epoch": 0.17187964461472405, "grad_norm": 0.3438372015953064, "learning_rate": 0.00018594763508141516, "loss": 0.0441, "step": 1301 }, { "epoch": 0.17201175810020808, "grad_norm": 0.33211642503738403, "learning_rate": 0.00018592638271627964, "loss": 0.031, "step": 1302 }, { "epoch": 0.1721438715856921, "grad_norm": 0.3771807551383972, "learning_rate": 0.00018590511550896808, "loss": 0.0374, "step": 1303 }, { "epoch": 0.17227598507117614, "grad_norm": 0.5802851319313049, "learning_rate": 0.00018588383346315404, "loss": 0.0425, "step": 1304 }, { "epoch": 0.17240809855666017, "grad_norm": 0.3841919004917145, "learning_rate": 0.00018586253658251352, "loss": 0.0443, "step": 1305 }, { "epoch": 0.1725402120421442, "grad_norm": 0.25487831234931946, "learning_rate": 0.00018584122487072522, "loss": 0.0164, "step": 1306 }, { "epoch": 0.17267232552762823, "grad_norm": 0.23374485969543457, "learning_rate": 0.0001858198983314703, "loss": 0.024, "step": 1307 }, { "epoch": 0.17280443901311227, "grad_norm": 0.18719379603862762, "learning_rate": 0.00018579855696843257, "loss": 0.0159, "step": 1308 }, { "epoch": 0.1729365524985963, "grad_norm": 0.2631910741329193, "learning_rate": 0.0001857772007852983, "loss": 0.0273, "step": 1309 }, { "epoch": 0.17306866598408033, "grad_norm": 0.2309308648109436, "learning_rate": 0.0001857558297857564, "loss": 0.0201, "step": 1310 }, { "epoch": 0.17320077946956436, "grad_norm": 0.2370709776878357, "learning_rate": 0.0001857344439734983, "loss": 0.0428, "step": 1311 }, { "epoch": 0.1733328929550484, "grad_norm": 0.25953209400177, "learning_rate": 0.00018571304335221803, "loss": 0.0374, "step": 1312 }, { "epoch": 0.17346500644053242, "grad_norm": 0.21717828512191772, "learning_rate": 0.0001856916279256121, "loss": 0.0279, "step": 1313 }, { "epoch": 0.17359711992601645, "grad_norm": 0.1866343766450882, "learning_rate": 0.00018567019769737963, "loss": 0.0181, "step": 1314 }, { "epoch": 0.17372923341150048, "grad_norm": 0.21271054446697235, "learning_rate": 0.0001856487526712223, "loss": 0.0246, "step": 1315 }, { "epoch": 0.17386134689698451, "grad_norm": 0.2034507691860199, "learning_rate": 0.00018562729285084438, "loss": 0.0286, "step": 1316 }, { "epoch": 0.17399346038246855, "grad_norm": 0.3184293508529663, "learning_rate": 0.0001856058182399526, "loss": 0.0478, "step": 1317 }, { "epoch": 0.17412557386795258, "grad_norm": 0.2903006672859192, "learning_rate": 0.00018558432884225633, "loss": 0.0329, "step": 1318 }, { "epoch": 0.1742576873534366, "grad_norm": 0.22487443685531616, "learning_rate": 0.00018556282466146743, "loss": 0.0217, "step": 1319 }, { "epoch": 0.17438980083892064, "grad_norm": 0.2479228377342224, "learning_rate": 0.00018554130570130038, "loss": 0.0225, "step": 1320 }, { "epoch": 0.17452191432440467, "grad_norm": 0.26292628049850464, "learning_rate": 0.00018551977196547213, "loss": 0.0361, "step": 1321 }, { "epoch": 0.1746540278098887, "grad_norm": 0.2891198694705963, "learning_rate": 0.0001854982234577023, "loss": 0.0379, "step": 1322 }, { "epoch": 0.17478614129537273, "grad_norm": 0.22966928780078888, "learning_rate": 0.00018547666018171294, "loss": 0.033, "step": 1323 }, { "epoch": 0.17491825478085676, "grad_norm": 0.291325181722641, "learning_rate": 0.0001854550821412287, "loss": 0.0302, "step": 1324 }, { "epoch": 0.1750503682663408, "grad_norm": 0.4915953576564789, "learning_rate": 0.00018543348933997678, "loss": 0.0372, "step": 1325 }, { "epoch": 0.17518248175182483, "grad_norm": 0.3161768317222595, "learning_rate": 0.00018541188178168696, "loss": 0.0304, "step": 1326 }, { "epoch": 0.17531459523730886, "grad_norm": 0.252323716878891, "learning_rate": 0.00018539025947009153, "loss": 0.0188, "step": 1327 }, { "epoch": 0.1754467087227929, "grad_norm": 0.4096275866031647, "learning_rate": 0.00018536862240892536, "loss": 0.0511, "step": 1328 }, { "epoch": 0.17557882220827692, "grad_norm": 0.22228872776031494, "learning_rate": 0.00018534697060192584, "loss": 0.032, "step": 1329 }, { "epoch": 0.17571093569376095, "grad_norm": 0.20836091041564941, "learning_rate": 0.00018532530405283287, "loss": 0.0239, "step": 1330 }, { "epoch": 0.17584304917924498, "grad_norm": 0.28031957149505615, "learning_rate": 0.00018530362276538898, "loss": 0.0292, "step": 1331 }, { "epoch": 0.175975162664729, "grad_norm": 0.3088838756084442, "learning_rate": 0.00018528192674333922, "loss": 0.0408, "step": 1332 }, { "epoch": 0.17610727615021304, "grad_norm": 0.28220516443252563, "learning_rate": 0.00018526021599043113, "loss": 0.0257, "step": 1333 }, { "epoch": 0.17623938963569707, "grad_norm": 0.19157913327217102, "learning_rate": 0.0001852384905104149, "loss": 0.0297, "step": 1334 }, { "epoch": 0.1763715031211811, "grad_norm": 0.3451235592365265, "learning_rate": 0.00018521675030704312, "loss": 0.027, "step": 1335 }, { "epoch": 0.17650361660666514, "grad_norm": 0.3861960470676422, "learning_rate": 0.00018519499538407105, "loss": 0.041, "step": 1336 }, { "epoch": 0.17663573009214917, "grad_norm": 0.2573603391647339, "learning_rate": 0.00018517322574525648, "loss": 0.0312, "step": 1337 }, { "epoch": 0.1767678435776332, "grad_norm": 0.29054367542266846, "learning_rate": 0.00018515144139435964, "loss": 0.0243, "step": 1338 }, { "epoch": 0.17689995706311723, "grad_norm": 0.24201183021068573, "learning_rate": 0.0001851296423351434, "loss": 0.0269, "step": 1339 }, { "epoch": 0.17703207054860126, "grad_norm": 0.16764526069164276, "learning_rate": 0.0001851078285713731, "loss": 0.018, "step": 1340 }, { "epoch": 0.1771641840340853, "grad_norm": 0.18756064772605896, "learning_rate": 0.0001850860001068168, "loss": 0.0224, "step": 1341 }, { "epoch": 0.17729629751956932, "grad_norm": 0.27146032452583313, "learning_rate": 0.00018506415694524478, "loss": 0.0303, "step": 1342 }, { "epoch": 0.17742841100505333, "grad_norm": 0.2190525382757187, "learning_rate": 0.00018504229909043014, "loss": 0.0218, "step": 1343 }, { "epoch": 0.17756052449053736, "grad_norm": 0.30307960510253906, "learning_rate": 0.00018502042654614838, "loss": 0.0292, "step": 1344 }, { "epoch": 0.1776926379760214, "grad_norm": 0.2004368156194687, "learning_rate": 0.0001849985393161776, "loss": 0.0298, "step": 1345 }, { "epoch": 0.17782475146150542, "grad_norm": 0.22861526906490326, "learning_rate": 0.00018497663740429837, "loss": 0.0254, "step": 1346 }, { "epoch": 0.17795686494698945, "grad_norm": 0.2857377529144287, "learning_rate": 0.00018495472081429386, "loss": 0.0302, "step": 1347 }, { "epoch": 0.17808897843247348, "grad_norm": 0.275430291891098, "learning_rate": 0.00018493278954994976, "loss": 0.0387, "step": 1348 }, { "epoch": 0.1782210919179575, "grad_norm": 0.23096689581871033, "learning_rate": 0.0001849108436150543, "loss": 0.0362, "step": 1349 }, { "epoch": 0.17835320540344154, "grad_norm": 0.1883264034986496, "learning_rate": 0.00018488888301339818, "loss": 0.0239, "step": 1350 }, { "epoch": 0.17848531888892558, "grad_norm": 0.3247344195842743, "learning_rate": 0.00018486690774877472, "loss": 0.04, "step": 1351 }, { "epoch": 0.1786174323744096, "grad_norm": 0.15827877819538116, "learning_rate": 0.00018484491782497974, "loss": 0.0181, "step": 1352 }, { "epoch": 0.17874954585989364, "grad_norm": 0.21765846014022827, "learning_rate": 0.0001848229132458115, "loss": 0.0374, "step": 1353 }, { "epoch": 0.17888165934537767, "grad_norm": 0.29163116216659546, "learning_rate": 0.00018480089401507103, "loss": 0.03, "step": 1354 }, { "epoch": 0.1790137728308617, "grad_norm": 0.1919863075017929, "learning_rate": 0.00018477886013656164, "loss": 0.0226, "step": 1355 }, { "epoch": 0.17914588631634573, "grad_norm": 0.2065700888633728, "learning_rate": 0.0001847568116140893, "loss": 0.0203, "step": 1356 }, { "epoch": 0.17927799980182976, "grad_norm": 0.32423102855682373, "learning_rate": 0.0001847347484514625, "loss": 0.0507, "step": 1357 }, { "epoch": 0.1794101132873138, "grad_norm": 0.31353965401649475, "learning_rate": 0.00018471267065249216, "loss": 0.0267, "step": 1358 }, { "epoch": 0.17954222677279782, "grad_norm": 0.29463842511177063, "learning_rate": 0.00018469057822099192, "loss": 0.0286, "step": 1359 }, { "epoch": 0.17967434025828186, "grad_norm": 0.19099555909633636, "learning_rate": 0.0001846684711607777, "loss": 0.0199, "step": 1360 }, { "epoch": 0.1798064537437659, "grad_norm": 0.3381175398826599, "learning_rate": 0.00018464634947566825, "loss": 0.0416, "step": 1361 }, { "epoch": 0.17993856722924992, "grad_norm": 0.24425239861011505, "learning_rate": 0.00018462421316948452, "loss": 0.0322, "step": 1362 }, { "epoch": 0.18007068071473395, "grad_norm": 0.20561014115810394, "learning_rate": 0.0001846020622460502, "loss": 0.0283, "step": 1363 }, { "epoch": 0.18020279420021798, "grad_norm": 0.16568754613399506, "learning_rate": 0.0001845798967091915, "loss": 0.0153, "step": 1364 }, { "epoch": 0.180334907685702, "grad_norm": 0.24680128693580627, "learning_rate": 0.000184557716562737, "loss": 0.0312, "step": 1365 }, { "epoch": 0.18046702117118604, "grad_norm": 0.2902357876300812, "learning_rate": 0.00018453552181051802, "loss": 0.0385, "step": 1366 }, { "epoch": 0.18059913465667007, "grad_norm": 0.20758208632469177, "learning_rate": 0.00018451331245636818, "loss": 0.0264, "step": 1367 }, { "epoch": 0.1807312481421541, "grad_norm": 0.21378950774669647, "learning_rate": 0.00018449108850412382, "loss": 0.0326, "step": 1368 }, { "epoch": 0.18086336162763814, "grad_norm": 0.32274001836776733, "learning_rate": 0.00018446884995762365, "loss": 0.0337, "step": 1369 }, { "epoch": 0.18099547511312217, "grad_norm": 0.19334742426872253, "learning_rate": 0.000184446596820709, "loss": 0.0246, "step": 1370 }, { "epoch": 0.1811275885986062, "grad_norm": 0.3434712588787079, "learning_rate": 0.00018442432909722366, "loss": 0.036, "step": 1371 }, { "epoch": 0.18125970208409023, "grad_norm": 0.23470188677310944, "learning_rate": 0.000184402046791014, "loss": 0.0285, "step": 1372 }, { "epoch": 0.18139181556957426, "grad_norm": 0.16895225644111633, "learning_rate": 0.00018437974990592884, "loss": 0.0179, "step": 1373 }, { "epoch": 0.1815239290550583, "grad_norm": 0.26691296696662903, "learning_rate": 0.00018435743844581954, "loss": 0.025, "step": 1374 }, { "epoch": 0.18165604254054232, "grad_norm": 0.21763736009597778, "learning_rate": 0.00018433511241454001, "loss": 0.0253, "step": 1375 }, { "epoch": 0.18178815602602635, "grad_norm": 0.37234312295913696, "learning_rate": 0.0001843127718159466, "loss": 0.0342, "step": 1376 }, { "epoch": 0.18192026951151039, "grad_norm": 0.22473712265491486, "learning_rate": 0.00018429041665389835, "loss": 0.0202, "step": 1377 }, { "epoch": 0.18205238299699442, "grad_norm": 0.18173733353614807, "learning_rate": 0.00018426804693225658, "loss": 0.0259, "step": 1378 }, { "epoch": 0.18218449648247845, "grad_norm": 0.1645773947238922, "learning_rate": 0.00018424566265488532, "loss": 0.012, "step": 1379 }, { "epoch": 0.18231660996796248, "grad_norm": 0.3594745695590973, "learning_rate": 0.00018422326382565096, "loss": 0.0333, "step": 1380 }, { "epoch": 0.1824487234534465, "grad_norm": 0.2075868397951126, "learning_rate": 0.00018420085044842255, "loss": 0.0246, "step": 1381 }, { "epoch": 0.18258083693893054, "grad_norm": 0.26188474893569946, "learning_rate": 0.0001841784225270715, "loss": 0.0444, "step": 1382 }, { "epoch": 0.18271295042441457, "grad_norm": 0.2381543070077896, "learning_rate": 0.00018415598006547192, "loss": 0.0156, "step": 1383 }, { "epoch": 0.1828450639098986, "grad_norm": 0.2644082009792328, "learning_rate": 0.00018413352306750026, "loss": 0.0467, "step": 1384 }, { "epoch": 0.18297717739538263, "grad_norm": 0.21643215417861938, "learning_rate": 0.00018411105153703556, "loss": 0.0344, "step": 1385 }, { "epoch": 0.18310929088086667, "grad_norm": 0.2230250984430313, "learning_rate": 0.00018408856547795933, "loss": 0.0287, "step": 1386 }, { "epoch": 0.1832414043663507, "grad_norm": 0.2237335443496704, "learning_rate": 0.00018406606489415568, "loss": 0.0341, "step": 1387 }, { "epoch": 0.18337351785183473, "grad_norm": 0.2967820167541504, "learning_rate": 0.0001840435497895111, "loss": 0.0216, "step": 1388 }, { "epoch": 0.18350563133731876, "grad_norm": 0.21675102412700653, "learning_rate": 0.00018402102016791468, "loss": 0.0246, "step": 1389 }, { "epoch": 0.1836377448228028, "grad_norm": 0.22738157212734222, "learning_rate": 0.000183998476033258, "loss": 0.0367, "step": 1390 }, { "epoch": 0.18376985830828682, "grad_norm": 0.23565442860126495, "learning_rate": 0.0001839759173894351, "loss": 0.0327, "step": 1391 }, { "epoch": 0.18390197179377085, "grad_norm": 0.3125069737434387, "learning_rate": 0.00018395334424034263, "loss": 0.0252, "step": 1392 }, { "epoch": 0.18403408527925488, "grad_norm": 0.2953563630580902, "learning_rate": 0.00018393075658987962, "loss": 0.0383, "step": 1393 }, { "epoch": 0.18416619876473891, "grad_norm": 0.2701183259487152, "learning_rate": 0.00018390815444194766, "loss": 0.0235, "step": 1394 }, { "epoch": 0.18429831225022295, "grad_norm": 0.26304230093955994, "learning_rate": 0.00018388553780045093, "loss": 0.0379, "step": 1395 }, { "epoch": 0.18443042573570698, "grad_norm": 0.2280203104019165, "learning_rate": 0.00018386290666929593, "loss": 0.0302, "step": 1396 }, { "epoch": 0.184562539221191, "grad_norm": 0.2568672001361847, "learning_rate": 0.00018384026105239184, "loss": 0.0227, "step": 1397 }, { "epoch": 0.18469465270667504, "grad_norm": 0.556996762752533, "learning_rate": 0.00018381760095365022, "loss": 0.0346, "step": 1398 }, { "epoch": 0.18482676619215907, "grad_norm": 0.3996849060058594, "learning_rate": 0.0001837949263769852, "loss": 0.0229, "step": 1399 }, { "epoch": 0.1849588796776431, "grad_norm": 0.2925168573856354, "learning_rate": 0.00018377223732631337, "loss": 0.0385, "step": 1400 }, { "epoch": 0.18509099316312713, "grad_norm": 0.26440808176994324, "learning_rate": 0.00018374953380555388, "loss": 0.0262, "step": 1401 }, { "epoch": 0.18522310664861116, "grad_norm": 0.23454102873802185, "learning_rate": 0.0001837268158186283, "loss": 0.0251, "step": 1402 }, { "epoch": 0.1853552201340952, "grad_norm": 0.6885169744491577, "learning_rate": 0.00018370408336946075, "loss": 0.0437, "step": 1403 }, { "epoch": 0.18548733361957923, "grad_norm": 0.3095178008079529, "learning_rate": 0.00018368133646197782, "loss": 0.0287, "step": 1404 }, { "epoch": 0.18561944710506326, "grad_norm": 0.24140727519989014, "learning_rate": 0.00018365857510010866, "loss": 0.0284, "step": 1405 }, { "epoch": 0.1857515605905473, "grad_norm": 0.32140839099884033, "learning_rate": 0.00018363579928778483, "loss": 0.037, "step": 1406 }, { "epoch": 0.18588367407603132, "grad_norm": 0.2660520374774933, "learning_rate": 0.00018361300902894044, "loss": 0.0512, "step": 1407 }, { "epoch": 0.18601578756151535, "grad_norm": 0.2977862060070038, "learning_rate": 0.00018359020432751205, "loss": 0.0348, "step": 1408 }, { "epoch": 0.18614790104699938, "grad_norm": 0.2596471607685089, "learning_rate": 0.0001835673851874388, "loss": 0.0302, "step": 1409 }, { "epoch": 0.1862800145324834, "grad_norm": 0.20368549227714539, "learning_rate": 0.0001835445516126622, "loss": 0.0331, "step": 1410 }, { "epoch": 0.18641212801796744, "grad_norm": 0.24589885771274567, "learning_rate": 0.00018352170360712639, "loss": 0.0294, "step": 1411 }, { "epoch": 0.18654424150345147, "grad_norm": 0.33342570066452026, "learning_rate": 0.0001834988411747779, "loss": 0.0333, "step": 1412 }, { "epoch": 0.1866763549889355, "grad_norm": 0.17387668788433075, "learning_rate": 0.00018347596431956582, "loss": 0.0204, "step": 1413 }, { "epoch": 0.18680846847441954, "grad_norm": 0.25326329469680786, "learning_rate": 0.0001834530730454417, "loss": 0.0354, "step": 1414 }, { "epoch": 0.18694058195990357, "grad_norm": 0.18185511231422424, "learning_rate": 0.0001834301673563595, "loss": 0.0292, "step": 1415 }, { "epoch": 0.1870726954453876, "grad_norm": 0.16677400469779968, "learning_rate": 0.00018340724725627583, "loss": 0.0204, "step": 1416 }, { "epoch": 0.18720480893087163, "grad_norm": 0.2312796413898468, "learning_rate": 0.0001833843127491497, "loss": 0.0294, "step": 1417 }, { "epoch": 0.18733692241635566, "grad_norm": 0.21785087883472443, "learning_rate": 0.00018336136383894256, "loss": 0.033, "step": 1418 }, { "epoch": 0.1874690359018397, "grad_norm": 0.22082215547561646, "learning_rate": 0.0001833384005296185, "loss": 0.0252, "step": 1419 }, { "epoch": 0.18760114938732372, "grad_norm": 0.22463281452655792, "learning_rate": 0.0001833154228251439, "loss": 0.0298, "step": 1420 }, { "epoch": 0.18773326287280775, "grad_norm": 0.20574955642223358, "learning_rate": 0.0001832924307294878, "loss": 0.0192, "step": 1421 }, { "epoch": 0.18786537635829179, "grad_norm": 0.20774230360984802, "learning_rate": 0.00018326942424662165, "loss": 0.0211, "step": 1422 }, { "epoch": 0.18799748984377582, "grad_norm": 0.2312815636396408, "learning_rate": 0.00018324640338051934, "loss": 0.0402, "step": 1423 }, { "epoch": 0.18812960332925985, "grad_norm": 0.18057630956172943, "learning_rate": 0.00018322336813515733, "loss": 0.0284, "step": 1424 }, { "epoch": 0.18826171681474385, "grad_norm": 0.20045863091945648, "learning_rate": 0.00018320031851451452, "loss": 0.0193, "step": 1425 }, { "epoch": 0.18839383030022788, "grad_norm": 0.21414987742900848, "learning_rate": 0.00018317725452257234, "loss": 0.0313, "step": 1426 }, { "epoch": 0.1885259437857119, "grad_norm": 0.2546866834163666, "learning_rate": 0.0001831541761633146, "loss": 0.0342, "step": 1427 }, { "epoch": 0.18865805727119594, "grad_norm": 0.23323996365070343, "learning_rate": 0.0001831310834407277, "loss": 0.0256, "step": 1428 }, { "epoch": 0.18879017075667998, "grad_norm": 0.1575809270143509, "learning_rate": 0.00018310797635880043, "loss": 0.0178, "step": 1429 }, { "epoch": 0.188922284242164, "grad_norm": 0.2791820466518402, "learning_rate": 0.0001830848549215242, "loss": 0.0385, "step": 1430 }, { "epoch": 0.18905439772764804, "grad_norm": 0.18286937475204468, "learning_rate": 0.00018306171913289268, "loss": 0.0152, "step": 1431 }, { "epoch": 0.18918651121313207, "grad_norm": 0.2981870770454407, "learning_rate": 0.00018303856899690223, "loss": 0.0248, "step": 1432 }, { "epoch": 0.1893186246986161, "grad_norm": 0.2948318123817444, "learning_rate": 0.00018301540451755158, "loss": 0.0206, "step": 1433 }, { "epoch": 0.18945073818410013, "grad_norm": 0.25580301880836487, "learning_rate": 0.00018299222569884198, "loss": 0.0245, "step": 1434 }, { "epoch": 0.18958285166958416, "grad_norm": 0.23851770162582397, "learning_rate": 0.0001829690325447771, "loss": 0.0293, "step": 1435 }, { "epoch": 0.1897149651550682, "grad_norm": 0.35320428013801575, "learning_rate": 0.00018294582505936312, "loss": 0.0419, "step": 1436 }, { "epoch": 0.18984707864055222, "grad_norm": 0.2724175453186035, "learning_rate": 0.00018292260324660875, "loss": 0.0336, "step": 1437 }, { "epoch": 0.18997919212603626, "grad_norm": 0.1679764837026596, "learning_rate": 0.0001828993671105251, "loss": 0.0232, "step": 1438 }, { "epoch": 0.1901113056115203, "grad_norm": 0.19844774901866913, "learning_rate": 0.00018287611665512575, "loss": 0.0216, "step": 1439 }, { "epoch": 0.19024341909700432, "grad_norm": 0.26759544014930725, "learning_rate": 0.00018285285188442683, "loss": 0.0296, "step": 1440 }, { "epoch": 0.19037553258248835, "grad_norm": 0.6173081994056702, "learning_rate": 0.00018282957280244685, "loss": 0.0172, "step": 1441 }, { "epoch": 0.19050764606797238, "grad_norm": 0.2865166664123535, "learning_rate": 0.00018280627941320688, "loss": 0.0303, "step": 1442 }, { "epoch": 0.1906397595534564, "grad_norm": 0.24494263529777527, "learning_rate": 0.00018278297172073037, "loss": 0.0247, "step": 1443 }, { "epoch": 0.19077187303894044, "grad_norm": 0.1931535005569458, "learning_rate": 0.0001827596497290433, "loss": 0.0251, "step": 1444 }, { "epoch": 0.19090398652442447, "grad_norm": 0.2376992106437683, "learning_rate": 0.00018273631344217415, "loss": 0.022, "step": 1445 }, { "epoch": 0.1910361000099085, "grad_norm": 0.23395435512065887, "learning_rate": 0.00018271296286415377, "loss": 0.0311, "step": 1446 }, { "epoch": 0.19116821349539254, "grad_norm": 0.2956295311450958, "learning_rate": 0.00018268959799901558, "loss": 0.0255, "step": 1447 }, { "epoch": 0.19130032698087657, "grad_norm": 0.31732040643692017, "learning_rate": 0.0001826662188507954, "loss": 0.0342, "step": 1448 }, { "epoch": 0.1914324404663606, "grad_norm": 0.20662644505500793, "learning_rate": 0.00018264282542353156, "loss": 0.0177, "step": 1449 }, { "epoch": 0.19156455395184463, "grad_norm": 0.15632778406143188, "learning_rate": 0.00018261941772126477, "loss": 0.0147, "step": 1450 }, { "epoch": 0.19169666743732866, "grad_norm": 0.18582721054553986, "learning_rate": 0.0001825959957480384, "loss": 0.0253, "step": 1451 }, { "epoch": 0.1918287809228127, "grad_norm": 0.3005513846874237, "learning_rate": 0.00018257255950789803, "loss": 0.0497, "step": 1452 }, { "epoch": 0.19196089440829672, "grad_norm": 0.20745569467544556, "learning_rate": 0.0001825491090048919, "loss": 0.0277, "step": 1453 }, { "epoch": 0.19209300789378075, "grad_norm": 0.22888556122779846, "learning_rate": 0.00018252564424307065, "loss": 0.0304, "step": 1454 }, { "epoch": 0.19222512137926479, "grad_norm": 0.2664090394973755, "learning_rate": 0.00018250216522648738, "loss": 0.0344, "step": 1455 }, { "epoch": 0.19235723486474882, "grad_norm": 0.27466508746147156, "learning_rate": 0.0001824786719591976, "loss": 0.0235, "step": 1456 }, { "epoch": 0.19248934835023285, "grad_norm": 0.21832376718521118, "learning_rate": 0.00018245516444525937, "loss": 0.0137, "step": 1457 }, { "epoch": 0.19262146183571688, "grad_norm": 0.163347527384758, "learning_rate": 0.00018243164268873317, "loss": 0.0133, "step": 1458 }, { "epoch": 0.1927535753212009, "grad_norm": 0.19356457889080048, "learning_rate": 0.00018240810669368194, "loss": 0.026, "step": 1459 }, { "epoch": 0.19288568880668494, "grad_norm": 0.336757093667984, "learning_rate": 0.00018238455646417108, "loss": 0.034, "step": 1460 }, { "epoch": 0.19301780229216897, "grad_norm": 0.197519913315773, "learning_rate": 0.00018236099200426845, "loss": 0.0303, "step": 1461 }, { "epoch": 0.193149915777653, "grad_norm": 0.40165066719055176, "learning_rate": 0.0001823374133180444, "loss": 0.0409, "step": 1462 }, { "epoch": 0.19328202926313703, "grad_norm": 0.167204812169075, "learning_rate": 0.00018231382040957166, "loss": 0.0193, "step": 1463 }, { "epoch": 0.19341414274862107, "grad_norm": 0.1733560860157013, "learning_rate": 0.0001822902132829255, "loss": 0.0189, "step": 1464 }, { "epoch": 0.1935462562341051, "grad_norm": 0.22970251739025116, "learning_rate": 0.00018226659194218363, "loss": 0.0337, "step": 1465 }, { "epoch": 0.19367836971958913, "grad_norm": 0.5224602818489075, "learning_rate": 0.00018224295639142612, "loss": 0.0214, "step": 1466 }, { "epoch": 0.19381048320507316, "grad_norm": 0.18081384897232056, "learning_rate": 0.0001822193066347356, "loss": 0.0196, "step": 1467 }, { "epoch": 0.1939425966905572, "grad_norm": 0.2021227478981018, "learning_rate": 0.00018219564267619719, "loss": 0.0253, "step": 1468 }, { "epoch": 0.19407471017604122, "grad_norm": 0.17635184526443481, "learning_rate": 0.00018217196451989832, "loss": 0.0187, "step": 1469 }, { "epoch": 0.19420682366152525, "grad_norm": 0.36210498213768005, "learning_rate": 0.00018214827216992893, "loss": 0.0383, "step": 1470 }, { "epoch": 0.19433893714700928, "grad_norm": 0.2603173553943634, "learning_rate": 0.00018212456563038151, "loss": 0.0212, "step": 1471 }, { "epoch": 0.19447105063249331, "grad_norm": 0.1690969467163086, "learning_rate": 0.00018210084490535088, "loss": 0.0252, "step": 1472 }, { "epoch": 0.19460316411797735, "grad_norm": 0.17981064319610596, "learning_rate": 0.00018207710999893436, "loss": 0.0257, "step": 1473 }, { "epoch": 0.19473527760346138, "grad_norm": 0.2037472426891327, "learning_rate": 0.00018205336091523167, "loss": 0.0228, "step": 1474 }, { "epoch": 0.1948673910889454, "grad_norm": 0.3084849715232849, "learning_rate": 0.0001820295976583451, "loss": 0.0321, "step": 1475 }, { "epoch": 0.19499950457442944, "grad_norm": 0.24606458842754364, "learning_rate": 0.00018200582023237925, "loss": 0.0327, "step": 1476 }, { "epoch": 0.19513161805991347, "grad_norm": 0.2386983186006546, "learning_rate": 0.00018198202864144124, "loss": 0.0245, "step": 1477 }, { "epoch": 0.1952637315453975, "grad_norm": 0.19429266452789307, "learning_rate": 0.00018195822288964063, "loss": 0.0266, "step": 1478 }, { "epoch": 0.19539584503088153, "grad_norm": 0.20982322096824646, "learning_rate": 0.00018193440298108939, "loss": 0.0227, "step": 1479 }, { "epoch": 0.19552795851636556, "grad_norm": 0.20643344521522522, "learning_rate": 0.00018191056891990202, "loss": 0.0294, "step": 1480 }, { "epoch": 0.1956600720018496, "grad_norm": 0.2400667667388916, "learning_rate": 0.00018188672071019535, "loss": 0.0242, "step": 1481 }, { "epoch": 0.19579218548733363, "grad_norm": 0.3001692593097687, "learning_rate": 0.0001818628583560887, "loss": 0.0316, "step": 1482 }, { "epoch": 0.19592429897281766, "grad_norm": 0.22269077599048615, "learning_rate": 0.00018183898186170395, "loss": 0.0338, "step": 1483 }, { "epoch": 0.1960564124583017, "grad_norm": 0.26167190074920654, "learning_rate": 0.0001818150912311652, "loss": 0.0361, "step": 1484 }, { "epoch": 0.19618852594378572, "grad_norm": 0.2652554512023926, "learning_rate": 0.00018179118646859918, "loss": 0.0194, "step": 1485 }, { "epoch": 0.19632063942926975, "grad_norm": 0.26757028698921204, "learning_rate": 0.00018176726757813497, "loss": 0.0173, "step": 1486 }, { "epoch": 0.19645275291475378, "grad_norm": 0.28542211651802063, "learning_rate": 0.00018174333456390409, "loss": 0.0436, "step": 1487 }, { "epoch": 0.1965848664002378, "grad_norm": 0.25279921293258667, "learning_rate": 0.00018171938743004055, "loss": 0.0358, "step": 1488 }, { "epoch": 0.19671697988572184, "grad_norm": 0.2647570073604584, "learning_rate": 0.00018169542618068078, "loss": 0.0168, "step": 1489 }, { "epoch": 0.19684909337120587, "grad_norm": 0.20313741266727448, "learning_rate": 0.00018167145081996358, "loss": 0.0248, "step": 1490 }, { "epoch": 0.1969812068566899, "grad_norm": 0.8464295864105225, "learning_rate": 0.00018164746135203034, "loss": 0.0314, "step": 1491 }, { "epoch": 0.19711332034217394, "grad_norm": 0.15083986520767212, "learning_rate": 0.0001816234577810247, "loss": 0.0111, "step": 1492 }, { "epoch": 0.19724543382765797, "grad_norm": 0.30844977498054504, "learning_rate": 0.0001815994401110929, "loss": 0.029, "step": 1493 }, { "epoch": 0.197377547313142, "grad_norm": 0.2315860390663147, "learning_rate": 0.00018157540834638346, "loss": 0.0282, "step": 1494 }, { "epoch": 0.19750966079862603, "grad_norm": 0.249103382229805, "learning_rate": 0.00018155136249104747, "loss": 0.0227, "step": 1495 }, { "epoch": 0.19764177428411006, "grad_norm": 0.1728508025407791, "learning_rate": 0.00018152730254923841, "loss": 0.0255, "step": 1496 }, { "epoch": 0.1977738877695941, "grad_norm": 0.35926568508148193, "learning_rate": 0.00018150322852511218, "loss": 0.021, "step": 1497 }, { "epoch": 0.19790600125507812, "grad_norm": 0.22047758102416992, "learning_rate": 0.0001814791404228271, "loss": 0.0345, "step": 1498 }, { "epoch": 0.19803811474056215, "grad_norm": 0.28702178597450256, "learning_rate": 0.00018145503824654394, "loss": 0.0246, "step": 1499 }, { "epoch": 0.19817022822604619, "grad_norm": 0.24854514002799988, "learning_rate": 0.00018143092200042596, "loss": 0.0303, "step": 1500 }, { "epoch": 0.19830234171153022, "grad_norm": 0.21608246862888336, "learning_rate": 0.0001814067916886387, "loss": 0.0155, "step": 1501 }, { "epoch": 0.19843445519701425, "grad_norm": 0.21416838467121124, "learning_rate": 0.00018138264731535025, "loss": 0.0243, "step": 1502 }, { "epoch": 0.19856656868249828, "grad_norm": 0.21499444544315338, "learning_rate": 0.00018135848888473115, "loss": 0.0268, "step": 1503 }, { "epoch": 0.1986986821679823, "grad_norm": 0.2473350167274475, "learning_rate": 0.00018133431640095425, "loss": 0.0181, "step": 1504 }, { "epoch": 0.19883079565346634, "grad_norm": 0.15252315998077393, "learning_rate": 0.0001813101298681949, "loss": 0.0165, "step": 1505 }, { "epoch": 0.19896290913895034, "grad_norm": 0.2195468693971634, "learning_rate": 0.00018128592929063093, "loss": 0.0302, "step": 1506 }, { "epoch": 0.19909502262443438, "grad_norm": 0.19480378925800323, "learning_rate": 0.00018126171467244248, "loss": 0.026, "step": 1507 }, { "epoch": 0.1992271361099184, "grad_norm": 0.3477564752101898, "learning_rate": 0.0001812374860178122, "loss": 0.0353, "step": 1508 }, { "epoch": 0.19935924959540244, "grad_norm": 0.19011445343494415, "learning_rate": 0.00018121324333092513, "loss": 0.0203, "step": 1509 }, { "epoch": 0.19949136308088647, "grad_norm": 0.2625371515750885, "learning_rate": 0.00018118898661596876, "loss": 0.0191, "step": 1510 }, { "epoch": 0.1996234765663705, "grad_norm": 0.18926642835140228, "learning_rate": 0.00018116471587713293, "loss": 0.0178, "step": 1511 }, { "epoch": 0.19975559005185453, "grad_norm": 0.17896400392055511, "learning_rate": 0.00018114043111861, "loss": 0.025, "step": 1512 }, { "epoch": 0.19988770353733856, "grad_norm": 0.2485428750514984, "learning_rate": 0.00018111613234459472, "loss": 0.0186, "step": 1513 }, { "epoch": 0.2000198170228226, "grad_norm": 0.3549768030643463, "learning_rate": 0.0001810918195592842, "loss": 0.047, "step": 1514 }, { "epoch": 0.20015193050830662, "grad_norm": 0.2386510819196701, "learning_rate": 0.00018106749276687806, "loss": 0.0269, "step": 1515 }, { "epoch": 0.20028404399379066, "grad_norm": 0.21441778540611267, "learning_rate": 0.0001810431519715783, "loss": 0.0271, "step": 1516 }, { "epoch": 0.2004161574792747, "grad_norm": 0.3943035900592804, "learning_rate": 0.00018101879717758931, "loss": 0.0253, "step": 1517 }, { "epoch": 0.20054827096475872, "grad_norm": 0.2370191365480423, "learning_rate": 0.00018099442838911793, "loss": 0.0418, "step": 1518 }, { "epoch": 0.20068038445024275, "grad_norm": 0.2918333113193512, "learning_rate": 0.00018097004561037344, "loss": 0.0371, "step": 1519 }, { "epoch": 0.20081249793572678, "grad_norm": 0.23465055227279663, "learning_rate": 0.00018094564884556745, "loss": 0.0301, "step": 1520 }, { "epoch": 0.2009446114212108, "grad_norm": 0.1612727791070938, "learning_rate": 0.00018092123809891413, "loss": 0.0219, "step": 1521 }, { "epoch": 0.20107672490669484, "grad_norm": 0.21482056379318237, "learning_rate": 0.0001808968133746299, "loss": 0.0307, "step": 1522 }, { "epoch": 0.20120883839217887, "grad_norm": 0.24555975198745728, "learning_rate": 0.00018087237467693374, "loss": 0.0233, "step": 1523 }, { "epoch": 0.2013409518776629, "grad_norm": 0.1954948455095291, "learning_rate": 0.0001808479220100469, "loss": 0.0236, "step": 1524 }, { "epoch": 0.20147306536314694, "grad_norm": 0.2907003164291382, "learning_rate": 0.00018082345537819326, "loss": 0.0356, "step": 1525 }, { "epoch": 0.20160517884863097, "grad_norm": 0.17122356593608856, "learning_rate": 0.00018079897478559878, "loss": 0.026, "step": 1526 }, { "epoch": 0.201737292334115, "grad_norm": 0.2171349972486496, "learning_rate": 0.00018077448023649218, "loss": 0.0284, "step": 1527 }, { "epoch": 0.20186940581959903, "grad_norm": 0.18378032743930817, "learning_rate": 0.00018074997173510437, "loss": 0.0255, "step": 1528 }, { "epoch": 0.20200151930508306, "grad_norm": 0.7698642611503601, "learning_rate": 0.00018072544928566874, "loss": 0.0277, "step": 1529 }, { "epoch": 0.2021336327905671, "grad_norm": 0.23853057622909546, "learning_rate": 0.00018070091289242114, "loss": 0.0294, "step": 1530 }, { "epoch": 0.20226574627605112, "grad_norm": 0.3057100176811218, "learning_rate": 0.00018067636255959964, "loss": 0.0363, "step": 1531 }, { "epoch": 0.20239785976153515, "grad_norm": 0.28446629643440247, "learning_rate": 0.00018065179829144498, "loss": 0.034, "step": 1532 }, { "epoch": 0.20252997324701918, "grad_norm": 0.4760570526123047, "learning_rate": 0.00018062722009220015, "loss": 0.0383, "step": 1533 }, { "epoch": 0.20266208673250322, "grad_norm": 0.36639007925987244, "learning_rate": 0.00018060262796611057, "loss": 0.0224, "step": 1534 }, { "epoch": 0.20279420021798725, "grad_norm": 0.23543117940425873, "learning_rate": 0.00018057802191742402, "loss": 0.0393, "step": 1535 }, { "epoch": 0.20292631370347128, "grad_norm": 0.21014980971813202, "learning_rate": 0.00018055340195039077, "loss": 0.0271, "step": 1536 }, { "epoch": 0.2030584271889553, "grad_norm": 0.2565693259239197, "learning_rate": 0.00018052876806926347, "loss": 0.0347, "step": 1537 }, { "epoch": 0.20319054067443934, "grad_norm": 0.24108761548995972, "learning_rate": 0.00018050412027829715, "loss": 0.0182, "step": 1538 }, { "epoch": 0.20332265415992337, "grad_norm": 0.2666507363319397, "learning_rate": 0.00018047945858174925, "loss": 0.0289, "step": 1539 }, { "epoch": 0.2034547676454074, "grad_norm": 0.2039230316877365, "learning_rate": 0.00018045478298387967, "loss": 0.0244, "step": 1540 }, { "epoch": 0.20358688113089143, "grad_norm": 0.26694339513778687, "learning_rate": 0.00018043009348895058, "loss": 0.0285, "step": 1541 }, { "epoch": 0.20371899461637547, "grad_norm": 0.23415853083133698, "learning_rate": 0.00018040539010122668, "loss": 0.0325, "step": 1542 }, { "epoch": 0.2038511081018595, "grad_norm": 0.23588255047798157, "learning_rate": 0.000180380672824975, "loss": 0.0282, "step": 1543 }, { "epoch": 0.20398322158734353, "grad_norm": 0.2112555056810379, "learning_rate": 0.00018035594166446498, "loss": 0.0322, "step": 1544 }, { "epoch": 0.20411533507282756, "grad_norm": 0.21087202429771423, "learning_rate": 0.00018033119662396846, "loss": 0.0237, "step": 1545 }, { "epoch": 0.2042474485583116, "grad_norm": 0.24182988703250885, "learning_rate": 0.00018030643770775972, "loss": 0.0251, "step": 1546 }, { "epoch": 0.20437956204379562, "grad_norm": 0.205749049782753, "learning_rate": 0.00018028166492011538, "loss": 0.0242, "step": 1547 }, { "epoch": 0.20451167552927965, "grad_norm": 0.26967766880989075, "learning_rate": 0.00018025687826531445, "loss": 0.0312, "step": 1548 }, { "epoch": 0.20464378901476368, "grad_norm": 0.3510602116584778, "learning_rate": 0.0001802320777476384, "loss": 0.0512, "step": 1549 }, { "epoch": 0.20477590250024771, "grad_norm": 0.1554529070854187, "learning_rate": 0.00018020726337137106, "loss": 0.0141, "step": 1550 }, { "epoch": 0.20490801598573175, "grad_norm": 0.22048431634902954, "learning_rate": 0.00018018243514079861, "loss": 0.0319, "step": 1551 }, { "epoch": 0.20504012947121578, "grad_norm": 0.24141453206539154, "learning_rate": 0.00018015759306020968, "loss": 0.0299, "step": 1552 }, { "epoch": 0.2051722429566998, "grad_norm": 0.2439783215522766, "learning_rate": 0.00018013273713389527, "loss": 0.035, "step": 1553 }, { "epoch": 0.20530435644218384, "grad_norm": 0.2585470676422119, "learning_rate": 0.0001801078673661488, "loss": 0.0367, "step": 1554 }, { "epoch": 0.20543646992766787, "grad_norm": 0.33572009205818176, "learning_rate": 0.00018008298376126605, "loss": 0.0409, "step": 1555 }, { "epoch": 0.2055685834131519, "grad_norm": 0.2367447465658188, "learning_rate": 0.00018005808632354516, "loss": 0.0269, "step": 1556 }, { "epoch": 0.20570069689863593, "grad_norm": 0.20082278549671173, "learning_rate": 0.00018003317505728674, "loss": 0.0285, "step": 1557 }, { "epoch": 0.20583281038411996, "grad_norm": 0.1698249727487564, "learning_rate": 0.0001800082499667937, "loss": 0.0203, "step": 1558 }, { "epoch": 0.205964923869604, "grad_norm": 0.1803530901670456, "learning_rate": 0.00017998331105637148, "loss": 0.0226, "step": 1559 }, { "epoch": 0.20609703735508803, "grad_norm": 0.24770233035087585, "learning_rate": 0.0001799583583303277, "loss": 0.0187, "step": 1560 }, { "epoch": 0.20622915084057206, "grad_norm": 0.09762029349803925, "learning_rate": 0.0001799333917929725, "loss": 0.0057, "step": 1561 }, { "epoch": 0.2063612643260561, "grad_norm": 0.1758822500705719, "learning_rate": 0.00017990841144861845, "loss": 0.0139, "step": 1562 }, { "epoch": 0.20649337781154012, "grad_norm": 0.24245113134384155, "learning_rate": 0.00017988341730158037, "loss": 0.0368, "step": 1563 }, { "epoch": 0.20662549129702415, "grad_norm": 0.21039935946464539, "learning_rate": 0.00017985840935617558, "loss": 0.0267, "step": 1564 }, { "epoch": 0.20675760478250818, "grad_norm": 0.12615539133548737, "learning_rate": 0.00017983338761672367, "loss": 0.0162, "step": 1565 }, { "epoch": 0.2068897182679922, "grad_norm": 0.21313214302062988, "learning_rate": 0.00017980835208754675, "loss": 0.0256, "step": 1566 }, { "epoch": 0.20702183175347624, "grad_norm": 0.18838942050933838, "learning_rate": 0.00017978330277296917, "loss": 0.0268, "step": 1567 }, { "epoch": 0.20715394523896027, "grad_norm": 0.2906305491924286, "learning_rate": 0.00017975823967731778, "loss": 0.0321, "step": 1568 }, { "epoch": 0.2072860587244443, "grad_norm": 0.2979860305786133, "learning_rate": 0.00017973316280492173, "loss": 0.0415, "step": 1569 }, { "epoch": 0.20741817220992834, "grad_norm": 0.19764262437820435, "learning_rate": 0.00017970807216011262, "loss": 0.0201, "step": 1570 }, { "epoch": 0.20755028569541237, "grad_norm": 0.22368620336055756, "learning_rate": 0.00017968296774722436, "loss": 0.0385, "step": 1571 }, { "epoch": 0.2076823991808964, "grad_norm": 0.1834963858127594, "learning_rate": 0.0001796578495705933, "loss": 0.0175, "step": 1572 }, { "epoch": 0.20781451266638043, "grad_norm": 0.20226474106311798, "learning_rate": 0.0001796327176345581, "loss": 0.0202, "step": 1573 }, { "epoch": 0.20794662615186446, "grad_norm": 0.21934787929058075, "learning_rate": 0.00017960757194345983, "loss": 0.0244, "step": 1574 }, { "epoch": 0.2080787396373485, "grad_norm": 0.21891097724437714, "learning_rate": 0.00017958241250164196, "loss": 0.0183, "step": 1575 }, { "epoch": 0.20821085312283252, "grad_norm": 0.21612125635147095, "learning_rate": 0.0001795572393134503, "loss": 0.0305, "step": 1576 }, { "epoch": 0.20834296660831655, "grad_norm": 0.18277354538440704, "learning_rate": 0.00017953205238323305, "loss": 0.021, "step": 1577 }, { "epoch": 0.20847508009380059, "grad_norm": 0.1836683601140976, "learning_rate": 0.0001795068517153408, "loss": 0.0272, "step": 1578 }, { "epoch": 0.20860719357928462, "grad_norm": 0.22064363956451416, "learning_rate": 0.00017948163731412647, "loss": 0.0257, "step": 1579 }, { "epoch": 0.20873930706476865, "grad_norm": 0.24008288979530334, "learning_rate": 0.00017945640918394536, "loss": 0.0249, "step": 1580 }, { "epoch": 0.20887142055025268, "grad_norm": 0.32040950655937195, "learning_rate": 0.00017943116732915522, "loss": 0.0238, "step": 1581 }, { "epoch": 0.2090035340357367, "grad_norm": 0.2716364562511444, "learning_rate": 0.00017940591175411602, "loss": 0.0323, "step": 1582 }, { "epoch": 0.20913564752122074, "grad_norm": 0.2920228838920593, "learning_rate": 0.0001793806424631903, "loss": 0.0272, "step": 1583 }, { "epoch": 0.20926776100670477, "grad_norm": 0.30364248156547546, "learning_rate": 0.00017935535946074277, "loss": 0.028, "step": 1584 }, { "epoch": 0.2093998744921888, "grad_norm": 0.19998852908611298, "learning_rate": 0.00017933006275114058, "loss": 0.026, "step": 1585 }, { "epoch": 0.20953198797767283, "grad_norm": 0.23407015204429626, "learning_rate": 0.00017930475233875334, "loss": 0.0262, "step": 1586 }, { "epoch": 0.20966410146315687, "grad_norm": 0.31622499227523804, "learning_rate": 0.00017927942822795295, "loss": 0.0443, "step": 1587 }, { "epoch": 0.20979621494864087, "grad_norm": 0.2934754192829132, "learning_rate": 0.0001792540904231136, "loss": 0.0319, "step": 1588 }, { "epoch": 0.2099283284341249, "grad_norm": 0.21690823137760162, "learning_rate": 0.00017922873892861198, "loss": 0.0333, "step": 1589 }, { "epoch": 0.21006044191960893, "grad_norm": 0.20228062570095062, "learning_rate": 0.00017920337374882707, "loss": 0.0179, "step": 1590 }, { "epoch": 0.21019255540509296, "grad_norm": 0.19345952570438385, "learning_rate": 0.00017917799488814022, "loss": 0.0169, "step": 1591 }, { "epoch": 0.210324668890577, "grad_norm": 0.22918011248111725, "learning_rate": 0.00017915260235093516, "loss": 0.0198, "step": 1592 }, { "epoch": 0.21045678237606102, "grad_norm": 0.26978784799575806, "learning_rate": 0.000179127196141598, "loss": 0.0354, "step": 1593 }, { "epoch": 0.21058889586154506, "grad_norm": 0.19669629633426666, "learning_rate": 0.00017910177626451716, "loss": 0.0204, "step": 1594 }, { "epoch": 0.2107210093470291, "grad_norm": 0.2608644962310791, "learning_rate": 0.00017907634272408348, "loss": 0.0236, "step": 1595 }, { "epoch": 0.21085312283251312, "grad_norm": 0.3554353713989258, "learning_rate": 0.00017905089552469006, "loss": 0.0375, "step": 1596 }, { "epoch": 0.21098523631799715, "grad_norm": 0.21859480440616608, "learning_rate": 0.00017902543467073251, "loss": 0.0349, "step": 1597 }, { "epoch": 0.21111734980348118, "grad_norm": 0.21692459285259247, "learning_rate": 0.00017899996016660868, "loss": 0.0291, "step": 1598 }, { "epoch": 0.2112494632889652, "grad_norm": 0.1959286630153656, "learning_rate": 0.00017897447201671883, "loss": 0.0299, "step": 1599 }, { "epoch": 0.21138157677444924, "grad_norm": 0.21266604959964752, "learning_rate": 0.00017894897022546552, "loss": 0.0291, "step": 1600 }, { "epoch": 0.21151369025993327, "grad_norm": 0.36550846695899963, "learning_rate": 0.00017892345479725373, "loss": 0.0302, "step": 1601 }, { "epoch": 0.2116458037454173, "grad_norm": 0.25008660554885864, "learning_rate": 0.0001788979257364908, "loss": 0.0293, "step": 1602 }, { "epoch": 0.21177791723090134, "grad_norm": 0.21736475825309753, "learning_rate": 0.00017887238304758633, "loss": 0.0166, "step": 1603 }, { "epoch": 0.21191003071638537, "grad_norm": 0.19466808438301086, "learning_rate": 0.00017884682673495244, "loss": 0.025, "step": 1604 }, { "epoch": 0.2120421442018694, "grad_norm": 0.22048614919185638, "learning_rate": 0.00017882125680300344, "loss": 0.0217, "step": 1605 }, { "epoch": 0.21217425768735343, "grad_norm": 0.2242555469274521, "learning_rate": 0.00017879567325615605, "loss": 0.0216, "step": 1606 }, { "epoch": 0.21230637117283746, "grad_norm": 0.17746785283088684, "learning_rate": 0.00017877007609882938, "loss": 0.0173, "step": 1607 }, { "epoch": 0.2124384846583215, "grad_norm": 0.3593146502971649, "learning_rate": 0.00017874446533544484, "loss": 0.0235, "step": 1608 }, { "epoch": 0.21257059814380552, "grad_norm": 0.22502164542675018, "learning_rate": 0.0001787188409704262, "loss": 0.0221, "step": 1609 }, { "epoch": 0.21270271162928955, "grad_norm": 0.2735959589481354, "learning_rate": 0.00017869320300819967, "loss": 0.0367, "step": 1610 }, { "epoch": 0.21283482511477358, "grad_norm": 0.4019820988178253, "learning_rate": 0.00017866755145319366, "loss": 0.0412, "step": 1611 }, { "epoch": 0.21296693860025762, "grad_norm": 0.2791774868965149, "learning_rate": 0.00017864188630983897, "loss": 0.0387, "step": 1612 }, { "epoch": 0.21309905208574165, "grad_norm": 0.28183433413505554, "learning_rate": 0.0001786162075825688, "loss": 0.0318, "step": 1613 }, { "epoch": 0.21323116557122568, "grad_norm": 0.20986947417259216, "learning_rate": 0.0001785905152758187, "loss": 0.0197, "step": 1614 }, { "epoch": 0.2133632790567097, "grad_norm": 0.26751548051834106, "learning_rate": 0.0001785648093940265, "loss": 0.0268, "step": 1615 }, { "epoch": 0.21349539254219374, "grad_norm": 0.24319139122962952, "learning_rate": 0.00017853908994163248, "loss": 0.0338, "step": 1616 }, { "epoch": 0.21362750602767777, "grad_norm": 0.18239794671535492, "learning_rate": 0.00017851335692307905, "loss": 0.0236, "step": 1617 }, { "epoch": 0.2137596195131618, "grad_norm": 0.12741148471832275, "learning_rate": 0.00017848761034281127, "loss": 0.0139, "step": 1618 }, { "epoch": 0.21389173299864583, "grad_norm": 0.28356799483299255, "learning_rate": 0.00017846185020527628, "loss": 0.0263, "step": 1619 }, { "epoch": 0.21402384648412986, "grad_norm": 0.18482592701911926, "learning_rate": 0.00017843607651492368, "loss": 0.0223, "step": 1620 }, { "epoch": 0.2141559599696139, "grad_norm": 0.21175946295261383, "learning_rate": 0.00017841028927620544, "loss": 0.0304, "step": 1621 }, { "epoch": 0.21428807345509793, "grad_norm": 0.23961223661899567, "learning_rate": 0.00017838448849357574, "loss": 0.0377, "step": 1622 }, { "epoch": 0.21442018694058196, "grad_norm": 0.2638351023197174, "learning_rate": 0.00017835867417149127, "loss": 0.0268, "step": 1623 }, { "epoch": 0.214552300426066, "grad_norm": 0.16874513030052185, "learning_rate": 0.0001783328463144109, "loss": 0.0157, "step": 1624 }, { "epoch": 0.21468441391155002, "grad_norm": 0.22855417430400848, "learning_rate": 0.00017830700492679595, "loss": 0.0269, "step": 1625 }, { "epoch": 0.21481652739703405, "grad_norm": 0.23026318848133087, "learning_rate": 0.00017828115001311003, "loss": 0.0193, "step": 1626 }, { "epoch": 0.21494864088251808, "grad_norm": 0.1460496187210083, "learning_rate": 0.00017825528157781908, "loss": 0.0164, "step": 1627 }, { "epoch": 0.21508075436800211, "grad_norm": 0.15897943079471588, "learning_rate": 0.00017822939962539142, "loss": 0.0219, "step": 1628 }, { "epoch": 0.21521286785348615, "grad_norm": 0.32363682985305786, "learning_rate": 0.00017820350416029762, "loss": 0.0342, "step": 1629 }, { "epoch": 0.21534498133897018, "grad_norm": 0.2849627733230591, "learning_rate": 0.0001781775951870107, "loss": 0.0333, "step": 1630 }, { "epoch": 0.2154770948244542, "grad_norm": 0.22426821291446686, "learning_rate": 0.00017815167271000587, "loss": 0.0349, "step": 1631 }, { "epoch": 0.21560920830993824, "grad_norm": 0.18805508315563202, "learning_rate": 0.00017812573673376086, "loss": 0.0321, "step": 1632 }, { "epoch": 0.21574132179542227, "grad_norm": 0.13759900629520416, "learning_rate": 0.00017809978726275553, "loss": 0.0159, "step": 1633 }, { "epoch": 0.2158734352809063, "grad_norm": 0.17568650841712952, "learning_rate": 0.00017807382430147221, "loss": 0.0268, "step": 1634 }, { "epoch": 0.21600554876639033, "grad_norm": 0.1804172843694687, "learning_rate": 0.00017804784785439552, "loss": 0.0237, "step": 1635 }, { "epoch": 0.21613766225187436, "grad_norm": 0.18389666080474854, "learning_rate": 0.0001780218579260124, "loss": 0.019, "step": 1636 }, { "epoch": 0.2162697757373584, "grad_norm": 0.3894127607345581, "learning_rate": 0.00017799585452081212, "loss": 0.0375, "step": 1637 }, { "epoch": 0.21640188922284243, "grad_norm": 0.21016249060630798, "learning_rate": 0.00017796983764328627, "loss": 0.0217, "step": 1638 }, { "epoch": 0.21653400270832646, "grad_norm": 0.22238297760486603, "learning_rate": 0.0001779438072979288, "loss": 0.0208, "step": 1639 }, { "epoch": 0.2166661161938105, "grad_norm": 0.22341783344745636, "learning_rate": 0.00017791776348923593, "loss": 0.0332, "step": 1640 }, { "epoch": 0.21679822967929452, "grad_norm": 0.19672025740146637, "learning_rate": 0.00017789170622170626, "loss": 0.0135, "step": 1641 }, { "epoch": 0.21693034316477855, "grad_norm": 0.3600621223449707, "learning_rate": 0.00017786563549984074, "loss": 0.0325, "step": 1642 }, { "epoch": 0.21706245665026258, "grad_norm": 0.1935514360666275, "learning_rate": 0.00017783955132814257, "loss": 0.0223, "step": 1643 }, { "epoch": 0.2171945701357466, "grad_norm": 0.19862335920333862, "learning_rate": 0.00017781345371111726, "loss": 0.0241, "step": 1644 }, { "epoch": 0.21732668362123064, "grad_norm": 0.2320193350315094, "learning_rate": 0.0001777873426532727, "loss": 0.0311, "step": 1645 }, { "epoch": 0.21745879710671467, "grad_norm": 0.3330860435962677, "learning_rate": 0.00017776121815911915, "loss": 0.0334, "step": 1646 }, { "epoch": 0.2175909105921987, "grad_norm": 0.23526433110237122, "learning_rate": 0.00017773508023316909, "loss": 0.0337, "step": 1647 }, { "epoch": 0.21772302407768274, "grad_norm": 0.22359345853328705, "learning_rate": 0.00017770892887993735, "loss": 0.0213, "step": 1648 }, { "epoch": 0.21785513756316677, "grad_norm": 0.18130530416965485, "learning_rate": 0.0001776827641039411, "loss": 0.0235, "step": 1649 }, { "epoch": 0.2179872510486508, "grad_norm": 0.2357502579689026, "learning_rate": 0.00017765658590969977, "loss": 0.0275, "step": 1650 }, { "epoch": 0.21811936453413483, "grad_norm": 0.21433457732200623, "learning_rate": 0.00017763039430173522, "loss": 0.0355, "step": 1651 }, { "epoch": 0.21825147801961886, "grad_norm": 0.38811975717544556, "learning_rate": 0.00017760418928457149, "loss": 0.0403, "step": 1652 }, { "epoch": 0.2183835915051029, "grad_norm": 0.22088828682899475, "learning_rate": 0.0001775779708627351, "loss": 0.0258, "step": 1653 }, { "epoch": 0.21851570499058692, "grad_norm": 0.1678798496723175, "learning_rate": 0.0001775517390407547, "loss": 0.0196, "step": 1654 }, { "epoch": 0.21864781847607095, "grad_norm": 0.1645585298538208, "learning_rate": 0.00017752549382316142, "loss": 0.0195, "step": 1655 }, { "epoch": 0.21877993196155499, "grad_norm": 0.40038180351257324, "learning_rate": 0.00017749923521448858, "loss": 0.0281, "step": 1656 }, { "epoch": 0.21891204544703902, "grad_norm": 0.24812577664852142, "learning_rate": 0.0001774729632192719, "loss": 0.0308, "step": 1657 }, { "epoch": 0.21904415893252305, "grad_norm": 0.2897437810897827, "learning_rate": 0.00017744667784204933, "loss": 0.0366, "step": 1658 }, { "epoch": 0.21917627241800708, "grad_norm": 0.32864317297935486, "learning_rate": 0.0001774203790873612, "loss": 0.0446, "step": 1659 }, { "epoch": 0.2193083859034911, "grad_norm": 0.23558707535266876, "learning_rate": 0.00017739406695975015, "loss": 0.0205, "step": 1660 }, { "epoch": 0.21944049938897514, "grad_norm": 0.1841057389974594, "learning_rate": 0.0001773677414637611, "loss": 0.0269, "step": 1661 }, { "epoch": 0.21957261287445917, "grad_norm": 0.19524601101875305, "learning_rate": 0.00017734140260394126, "loss": 0.0277, "step": 1662 }, { "epoch": 0.2197047263599432, "grad_norm": 0.14408576488494873, "learning_rate": 0.0001773150503848402, "loss": 0.0124, "step": 1663 }, { "epoch": 0.21983683984542723, "grad_norm": 0.16591165959835052, "learning_rate": 0.00017728868481100977, "loss": 0.0221, "step": 1664 }, { "epoch": 0.21996895333091127, "grad_norm": 0.15631432831287384, "learning_rate": 0.00017726230588700412, "loss": 0.0242, "step": 1665 }, { "epoch": 0.2201010668163953, "grad_norm": 0.12172911316156387, "learning_rate": 0.0001772359136173797, "loss": 0.0191, "step": 1666 }, { "epoch": 0.22023318030187933, "grad_norm": 0.14911390841007233, "learning_rate": 0.00017720950800669533, "loss": 0.0173, "step": 1667 }, { "epoch": 0.22036529378736336, "grad_norm": 0.27882853150367737, "learning_rate": 0.0001771830890595121, "loss": 0.0553, "step": 1668 }, { "epoch": 0.22049740727284736, "grad_norm": 0.2508869171142578, "learning_rate": 0.0001771566567803933, "loss": 0.032, "step": 1669 }, { "epoch": 0.2206295207583314, "grad_norm": 0.1741028130054474, "learning_rate": 0.00017713021117390465, "loss": 0.0189, "step": 1670 }, { "epoch": 0.22076163424381542, "grad_norm": 0.256998747587204, "learning_rate": 0.00017710375224461416, "loss": 0.023, "step": 1671 }, { "epoch": 0.22089374772929946, "grad_norm": 0.18835949897766113, "learning_rate": 0.00017707727999709207, "loss": 0.0146, "step": 1672 }, { "epoch": 0.2210258612147835, "grad_norm": 0.22253838181495667, "learning_rate": 0.00017705079443591104, "loss": 0.0212, "step": 1673 }, { "epoch": 0.22115797470026752, "grad_norm": 0.15436488389968872, "learning_rate": 0.0001770242955656459, "loss": 0.0181, "step": 1674 }, { "epoch": 0.22129008818575155, "grad_norm": 0.24255327880382538, "learning_rate": 0.00017699778339087384, "loss": 0.0284, "step": 1675 }, { "epoch": 0.22142220167123558, "grad_norm": 0.23516564071178436, "learning_rate": 0.00017697125791617434, "loss": 0.0184, "step": 1676 }, { "epoch": 0.2215543151567196, "grad_norm": 0.15337681770324707, "learning_rate": 0.0001769447191461292, "loss": 0.0159, "step": 1677 }, { "epoch": 0.22168642864220364, "grad_norm": 0.16270548105239868, "learning_rate": 0.00017691816708532247, "loss": 0.0171, "step": 1678 }, { "epoch": 0.22181854212768767, "grad_norm": 0.19286991655826569, "learning_rate": 0.00017689160173834054, "loss": 0.0166, "step": 1679 }, { "epoch": 0.2219506556131717, "grad_norm": 0.1711857169866562, "learning_rate": 0.0001768650231097721, "loss": 0.0271, "step": 1680 }, { "epoch": 0.22208276909865574, "grad_norm": 0.30671536922454834, "learning_rate": 0.00017683843120420804, "loss": 0.027, "step": 1681 }, { "epoch": 0.22221488258413977, "grad_norm": 0.2321193814277649, "learning_rate": 0.00017681182602624168, "loss": 0.0084, "step": 1682 }, { "epoch": 0.2223469960696238, "grad_norm": 0.1957147866487503, "learning_rate": 0.00017678520758046857, "loss": 0.0169, "step": 1683 }, { "epoch": 0.22247910955510783, "grad_norm": 0.1827395260334015, "learning_rate": 0.0001767585758714865, "loss": 0.0147, "step": 1684 }, { "epoch": 0.22261122304059186, "grad_norm": 0.1572861671447754, "learning_rate": 0.00017673193090389562, "loss": 0.0136, "step": 1685 }, { "epoch": 0.2227433365260759, "grad_norm": 0.21025492250919342, "learning_rate": 0.00017670527268229838, "loss": 0.0221, "step": 1686 }, { "epoch": 0.22287545001155992, "grad_norm": 0.2409404218196869, "learning_rate": 0.00017667860121129943, "loss": 0.0229, "step": 1687 }, { "epoch": 0.22300756349704395, "grad_norm": 0.20603077113628387, "learning_rate": 0.0001766519164955058, "loss": 0.0223, "step": 1688 }, { "epoch": 0.22313967698252798, "grad_norm": 0.2035643309354782, "learning_rate": 0.00017662521853952678, "loss": 0.0169, "step": 1689 }, { "epoch": 0.22327179046801202, "grad_norm": 0.2209031730890274, "learning_rate": 0.00017659850734797397, "loss": 0.0341, "step": 1690 }, { "epoch": 0.22340390395349605, "grad_norm": 0.25767406821250916, "learning_rate": 0.00017657178292546118, "loss": 0.026, "step": 1691 }, { "epoch": 0.22353601743898008, "grad_norm": 0.2129616141319275, "learning_rate": 0.00017654504527660455, "loss": 0.0163, "step": 1692 }, { "epoch": 0.2236681309244641, "grad_norm": 0.28341618180274963, "learning_rate": 0.00017651829440602258, "loss": 0.0237, "step": 1693 }, { "epoch": 0.22380024440994814, "grad_norm": 0.2251022607088089, "learning_rate": 0.00017649153031833593, "loss": 0.025, "step": 1694 }, { "epoch": 0.22393235789543217, "grad_norm": 0.2455792874097824, "learning_rate": 0.00017646475301816755, "loss": 0.032, "step": 1695 }, { "epoch": 0.2240644713809162, "grad_norm": 0.18342851102352142, "learning_rate": 0.0001764379625101428, "loss": 0.0188, "step": 1696 }, { "epoch": 0.22419658486640023, "grad_norm": 0.2210848480463028, "learning_rate": 0.0001764111587988892, "loss": 0.0233, "step": 1697 }, { "epoch": 0.22432869835188426, "grad_norm": 0.2323235422372818, "learning_rate": 0.0001763843418890366, "loss": 0.0111, "step": 1698 }, { "epoch": 0.2244608118373683, "grad_norm": 0.228493332862854, "learning_rate": 0.00017635751178521716, "loss": 0.0279, "step": 1699 }, { "epoch": 0.22459292532285233, "grad_norm": 0.3134079575538635, "learning_rate": 0.00017633066849206518, "loss": 0.0272, "step": 1700 }, { "epoch": 0.22472503880833636, "grad_norm": 0.1924765408039093, "learning_rate": 0.0001763038120142174, "loss": 0.0229, "step": 1701 }, { "epoch": 0.2248571522938204, "grad_norm": 0.4358731806278229, "learning_rate": 0.00017627694235631278, "loss": 0.0174, "step": 1702 }, { "epoch": 0.22498926577930442, "grad_norm": 0.24608589708805084, "learning_rate": 0.00017625005952299255, "loss": 0.0192, "step": 1703 }, { "epoch": 0.22512137926478845, "grad_norm": 0.32105961441993713, "learning_rate": 0.00017622316351890017, "loss": 0.0254, "step": 1704 }, { "epoch": 0.22525349275027248, "grad_norm": 0.33401522040367126, "learning_rate": 0.0001761962543486815, "loss": 0.0393, "step": 1705 }, { "epoch": 0.2253856062357565, "grad_norm": 0.34425655007362366, "learning_rate": 0.00017616933201698452, "loss": 0.0305, "step": 1706 }, { "epoch": 0.22551771972124054, "grad_norm": 0.26156890392303467, "learning_rate": 0.0001761423965284596, "loss": 0.0349, "step": 1707 }, { "epoch": 0.22564983320672458, "grad_norm": 0.22440393269062042, "learning_rate": 0.00017611544788775937, "loss": 0.0278, "step": 1708 }, { "epoch": 0.2257819466922086, "grad_norm": 0.4424617886543274, "learning_rate": 0.0001760884860995386, "loss": 0.0276, "step": 1709 }, { "epoch": 0.22591406017769264, "grad_norm": 0.1829511821269989, "learning_rate": 0.00017606151116845458, "loss": 0.0251, "step": 1710 }, { "epoch": 0.22604617366317667, "grad_norm": 0.21703237295150757, "learning_rate": 0.0001760345230991666, "loss": 0.0191, "step": 1711 }, { "epoch": 0.2261782871486607, "grad_norm": 0.23436713218688965, "learning_rate": 0.0001760075218963364, "loss": 0.0269, "step": 1712 }, { "epoch": 0.22631040063414473, "grad_norm": 0.5055571794509888, "learning_rate": 0.00017598050756462795, "loss": 0.0277, "step": 1713 }, { "epoch": 0.22644251411962876, "grad_norm": 0.23705795407295227, "learning_rate": 0.0001759534801087074, "loss": 0.0209, "step": 1714 }, { "epoch": 0.2265746276051128, "grad_norm": 0.2245938926935196, "learning_rate": 0.00017592643953324332, "loss": 0.0143, "step": 1715 }, { "epoch": 0.22670674109059682, "grad_norm": 0.24844376742839813, "learning_rate": 0.0001758993858429064, "loss": 0.0324, "step": 1716 }, { "epoch": 0.22683885457608086, "grad_norm": 0.26490744948387146, "learning_rate": 0.0001758723190423697, "loss": 0.0295, "step": 1717 }, { "epoch": 0.2269709680615649, "grad_norm": 0.3163183033466339, "learning_rate": 0.00017584523913630851, "loss": 0.046, "step": 1718 }, { "epoch": 0.22710308154704892, "grad_norm": 0.27821382880210876, "learning_rate": 0.00017581814612940036, "loss": 0.0419, "step": 1719 }, { "epoch": 0.22723519503253295, "grad_norm": 0.16733022034168243, "learning_rate": 0.00017579104002632504, "loss": 0.0228, "step": 1720 }, { "epoch": 0.22736730851801698, "grad_norm": 0.3289214074611664, "learning_rate": 0.00017576392083176466, "loss": 0.0416, "step": 1721 }, { "epoch": 0.227499422003501, "grad_norm": 0.205794095993042, "learning_rate": 0.0001757367885504035, "loss": 0.016, "step": 1722 }, { "epoch": 0.22763153548898504, "grad_norm": 0.2669104337692261, "learning_rate": 0.0001757096431869282, "loss": 0.0335, "step": 1723 }, { "epoch": 0.22776364897446907, "grad_norm": 0.22329342365264893, "learning_rate": 0.0001756824847460276, "loss": 0.021, "step": 1724 }, { "epoch": 0.2278957624599531, "grad_norm": 0.2545218765735626, "learning_rate": 0.00017565531323239286, "loss": 0.0306, "step": 1725 }, { "epoch": 0.22802787594543714, "grad_norm": 0.2121114581823349, "learning_rate": 0.00017562812865071727, "loss": 0.0161, "step": 1726 }, { "epoch": 0.22815998943092117, "grad_norm": 0.2927865982055664, "learning_rate": 0.0001756009310056965, "loss": 0.0574, "step": 1727 }, { "epoch": 0.2282921029164052, "grad_norm": 0.18407411873340607, "learning_rate": 0.00017557372030202844, "loss": 0.0239, "step": 1728 }, { "epoch": 0.22842421640188923, "grad_norm": 0.2130606770515442, "learning_rate": 0.00017554649654441323, "loss": 0.0303, "step": 1729 }, { "epoch": 0.22855632988737326, "grad_norm": 0.3313738703727722, "learning_rate": 0.0001755192597375532, "loss": 0.0481, "step": 1730 }, { "epoch": 0.2286884433728573, "grad_norm": 0.24702952802181244, "learning_rate": 0.00017549200988615311, "loss": 0.0234, "step": 1731 }, { "epoch": 0.22882055685834132, "grad_norm": 0.19129116833209991, "learning_rate": 0.00017546474699491976, "loss": 0.0299, "step": 1732 }, { "epoch": 0.22895267034382535, "grad_norm": 0.2197098731994629, "learning_rate": 0.0001754374710685624, "loss": 0.0359, "step": 1733 }, { "epoch": 0.22908478382930939, "grad_norm": 0.1769767850637436, "learning_rate": 0.00017541018211179236, "loss": 0.0155, "step": 1734 }, { "epoch": 0.22921689731479342, "grad_norm": 0.22637642920017242, "learning_rate": 0.00017538288012932334, "loss": 0.0276, "step": 1735 }, { "epoch": 0.22934901080027745, "grad_norm": 0.23079346120357513, "learning_rate": 0.0001753555651258712, "loss": 0.0204, "step": 1736 }, { "epoch": 0.22948112428576148, "grad_norm": 0.3650188148021698, "learning_rate": 0.00017532823710615417, "loss": 0.0407, "step": 1737 }, { "epoch": 0.2296132377712455, "grad_norm": 0.30013394355773926, "learning_rate": 0.0001753008960748926, "loss": 0.0426, "step": 1738 }, { "epoch": 0.22974535125672954, "grad_norm": 0.22095589339733124, "learning_rate": 0.00017527354203680914, "loss": 0.0312, "step": 1739 }, { "epoch": 0.22987746474221357, "grad_norm": 0.29758599400520325, "learning_rate": 0.00017524617499662873, "loss": 0.038, "step": 1740 }, { "epoch": 0.2300095782276976, "grad_norm": 0.19875991344451904, "learning_rate": 0.00017521879495907845, "loss": 0.022, "step": 1741 }, { "epoch": 0.23014169171318163, "grad_norm": 0.318978875875473, "learning_rate": 0.00017519140192888774, "loss": 0.0352, "step": 1742 }, { "epoch": 0.23027380519866567, "grad_norm": 0.2085612267255783, "learning_rate": 0.00017516399591078823, "loss": 0.0246, "step": 1743 }, { "epoch": 0.2304059186841497, "grad_norm": 0.26876693964004517, "learning_rate": 0.00017513657690951378, "loss": 0.0247, "step": 1744 }, { "epoch": 0.23053803216963373, "grad_norm": 0.1887684464454651, "learning_rate": 0.00017510914492980054, "loss": 0.0325, "step": 1745 }, { "epoch": 0.23067014565511776, "grad_norm": 0.16390034556388855, "learning_rate": 0.0001750816999763868, "loss": 0.0208, "step": 1746 }, { "epoch": 0.2308022591406018, "grad_norm": 0.15387281775474548, "learning_rate": 0.0001750542420540133, "loss": 0.0109, "step": 1747 }, { "epoch": 0.23093437262608582, "grad_norm": 0.17003598809242249, "learning_rate": 0.00017502677116742273, "loss": 0.0165, "step": 1748 }, { "epoch": 0.23106648611156985, "grad_norm": 0.19222618639469147, "learning_rate": 0.0001749992873213602, "loss": 0.0195, "step": 1749 }, { "epoch": 0.23119859959705386, "grad_norm": 0.203046977519989, "learning_rate": 0.00017497179052057313, "loss": 0.0217, "step": 1750 }, { "epoch": 0.2313307130825379, "grad_norm": 0.23541304469108582, "learning_rate": 0.000174944280769811, "loss": 0.0229, "step": 1751 }, { "epoch": 0.23146282656802192, "grad_norm": 0.1804003268480301, "learning_rate": 0.0001749167580738256, "loss": 0.0274, "step": 1752 }, { "epoch": 0.23159494005350595, "grad_norm": 0.2763790786266327, "learning_rate": 0.00017488922243737103, "loss": 0.0334, "step": 1753 }, { "epoch": 0.23172705353898998, "grad_norm": 0.17673511803150177, "learning_rate": 0.0001748616738652035, "loss": 0.0243, "step": 1754 }, { "epoch": 0.231859167024474, "grad_norm": 0.23072902858257294, "learning_rate": 0.00017483411236208149, "loss": 0.0263, "step": 1755 }, { "epoch": 0.23199128050995804, "grad_norm": 0.1810344159603119, "learning_rate": 0.00017480653793276578, "loss": 0.0222, "step": 1756 }, { "epoch": 0.23212339399544207, "grad_norm": 0.19396093487739563, "learning_rate": 0.0001747789505820193, "loss": 0.0265, "step": 1757 }, { "epoch": 0.2322555074809261, "grad_norm": 0.2533824145793915, "learning_rate": 0.00017475135031460727, "loss": 0.0401, "step": 1758 }, { "epoch": 0.23238762096641014, "grad_norm": 0.18334946036338806, "learning_rate": 0.00017472373713529714, "loss": 0.0204, "step": 1759 }, { "epoch": 0.23251973445189417, "grad_norm": 0.23319047689437866, "learning_rate": 0.0001746961110488585, "loss": 0.0294, "step": 1760 }, { "epoch": 0.2326518479373782, "grad_norm": 0.2602129876613617, "learning_rate": 0.00017466847206006335, "loss": 0.0276, "step": 1761 }, { "epoch": 0.23278396142286223, "grad_norm": 0.1965998113155365, "learning_rate": 0.00017464082017368574, "loss": 0.0239, "step": 1762 }, { "epoch": 0.23291607490834626, "grad_norm": 0.27252528071403503, "learning_rate": 0.00017461315539450204, "loss": 0.0349, "step": 1763 }, { "epoch": 0.2330481883938303, "grad_norm": 0.20603559911251068, "learning_rate": 0.00017458547772729075, "loss": 0.0281, "step": 1764 }, { "epoch": 0.23318030187931432, "grad_norm": 0.17887108027935028, "learning_rate": 0.00017455778717683277, "loss": 0.0163, "step": 1765 }, { "epoch": 0.23331241536479835, "grad_norm": 0.30276936292648315, "learning_rate": 0.00017453008374791106, "loss": 0.0526, "step": 1766 }, { "epoch": 0.23344452885028238, "grad_norm": 0.37227141857147217, "learning_rate": 0.00017450236744531093, "loss": 0.0261, "step": 1767 }, { "epoch": 0.23357664233576642, "grad_norm": 0.24804899096488953, "learning_rate": 0.00017447463827381977, "loss": 0.0391, "step": 1768 }, { "epoch": 0.23370875582125045, "grad_norm": 0.2147999256849289, "learning_rate": 0.00017444689623822735, "loss": 0.0264, "step": 1769 }, { "epoch": 0.23384086930673448, "grad_norm": 0.64190274477005, "learning_rate": 0.00017441914134332556, "loss": 0.0392, "step": 1770 }, { "epoch": 0.2339729827922185, "grad_norm": 0.24241581559181213, "learning_rate": 0.0001743913735939085, "loss": 0.0238, "step": 1771 }, { "epoch": 0.23410509627770254, "grad_norm": 0.19723357260227203, "learning_rate": 0.0001743635929947726, "loss": 0.023, "step": 1772 }, { "epoch": 0.23423720976318657, "grad_norm": 0.16124692559242249, "learning_rate": 0.0001743357995507164, "loss": 0.0176, "step": 1773 }, { "epoch": 0.2343693232486706, "grad_norm": 0.16757133603096008, "learning_rate": 0.0001743079932665407, "loss": 0.0138, "step": 1774 }, { "epoch": 0.23450143673415463, "grad_norm": 0.36387279629707336, "learning_rate": 0.00017428017414704853, "loss": 0.037, "step": 1775 }, { "epoch": 0.23463355021963866, "grad_norm": 0.20576192438602448, "learning_rate": 0.0001742523421970451, "loss": 0.0151, "step": 1776 }, { "epoch": 0.2347656637051227, "grad_norm": 0.25506216287612915, "learning_rate": 0.00017422449742133787, "loss": 0.0311, "step": 1777 }, { "epoch": 0.23489777719060673, "grad_norm": 0.20788130164146423, "learning_rate": 0.0001741966398247365, "loss": 0.0229, "step": 1778 }, { "epoch": 0.23502989067609076, "grad_norm": 0.2590482532978058, "learning_rate": 0.0001741687694120529, "loss": 0.0257, "step": 1779 }, { "epoch": 0.2351620041615748, "grad_norm": 0.23437197506427765, "learning_rate": 0.00017414088618810113, "loss": 0.0246, "step": 1780 }, { "epoch": 0.23529411764705882, "grad_norm": 0.23272240161895752, "learning_rate": 0.00017411299015769754, "loss": 0.0256, "step": 1781 }, { "epoch": 0.23542623113254285, "grad_norm": 0.20408686995506287, "learning_rate": 0.00017408508132566055, "loss": 0.0257, "step": 1782 }, { "epoch": 0.23555834461802688, "grad_norm": 0.2479366958141327, "learning_rate": 0.00017405715969681098, "loss": 0.0357, "step": 1783 }, { "epoch": 0.2356904581035109, "grad_norm": 0.19560597836971283, "learning_rate": 0.00017402922527597173, "loss": 0.0191, "step": 1784 }, { "epoch": 0.23582257158899494, "grad_norm": 0.2366217076778412, "learning_rate": 0.00017400127806796792, "loss": 0.0219, "step": 1785 }, { "epoch": 0.23595468507447898, "grad_norm": 0.1941092610359192, "learning_rate": 0.00017397331807762702, "loss": 0.0298, "step": 1786 }, { "epoch": 0.236086798559963, "grad_norm": 0.17476750910282135, "learning_rate": 0.0001739453453097785, "loss": 0.0165, "step": 1787 }, { "epoch": 0.23621891204544704, "grad_norm": 0.17220233380794525, "learning_rate": 0.00017391735976925412, "loss": 0.0218, "step": 1788 }, { "epoch": 0.23635102553093107, "grad_norm": 0.21363332867622375, "learning_rate": 0.0001738893614608879, "loss": 0.021, "step": 1789 }, { "epoch": 0.2364831390164151, "grad_norm": 0.2519174814224243, "learning_rate": 0.00017386135038951602, "loss": 0.0352, "step": 1790 }, { "epoch": 0.23661525250189913, "grad_norm": 0.23778265714645386, "learning_rate": 0.0001738333265599769, "loss": 0.0252, "step": 1791 }, { "epoch": 0.23674736598738316, "grad_norm": 0.2772628366947174, "learning_rate": 0.00017380528997711108, "loss": 0.0264, "step": 1792 }, { "epoch": 0.2368794794728672, "grad_norm": 0.20920205116271973, "learning_rate": 0.00017377724064576136, "loss": 0.0277, "step": 1793 }, { "epoch": 0.23701159295835122, "grad_norm": 0.2609867453575134, "learning_rate": 0.00017374917857077276, "loss": 0.0273, "step": 1794 }, { "epoch": 0.23714370644383526, "grad_norm": 0.34655800461769104, "learning_rate": 0.00017372110375699247, "loss": 0.0311, "step": 1795 }, { "epoch": 0.2372758199293193, "grad_norm": 0.3078354001045227, "learning_rate": 0.0001736930162092699, "loss": 0.0495, "step": 1796 }, { "epoch": 0.23740793341480332, "grad_norm": 0.3296651542186737, "learning_rate": 0.0001736649159324566, "loss": 0.0357, "step": 1797 }, { "epoch": 0.23754004690028735, "grad_norm": 0.27013999223709106, "learning_rate": 0.00017363680293140644, "loss": 0.0315, "step": 1798 }, { "epoch": 0.23767216038577138, "grad_norm": 0.35716310143470764, "learning_rate": 0.00017360867721097538, "loss": 0.018, "step": 1799 }, { "epoch": 0.2378042738712554, "grad_norm": 0.22914241254329681, "learning_rate": 0.0001735805387760216, "loss": 0.0282, "step": 1800 }, { "epoch": 0.23793638735673944, "grad_norm": 0.2120794951915741, "learning_rate": 0.00017355238763140554, "loss": 0.0293, "step": 1801 }, { "epoch": 0.23806850084222347, "grad_norm": 0.24935783445835114, "learning_rate": 0.00017352422378198973, "loss": 0.0336, "step": 1802 }, { "epoch": 0.2382006143277075, "grad_norm": 0.2915463447570801, "learning_rate": 0.00017349604723263892, "loss": 0.03, "step": 1803 }, { "epoch": 0.23833272781319154, "grad_norm": 0.21356649696826935, "learning_rate": 0.00017346785798822017, "loss": 0.0258, "step": 1804 }, { "epoch": 0.23846484129867557, "grad_norm": 0.3095000386238098, "learning_rate": 0.00017343965605360256, "loss": 0.0232, "step": 1805 }, { "epoch": 0.2385969547841596, "grad_norm": 0.1942126750946045, "learning_rate": 0.00017341144143365753, "loss": 0.0218, "step": 1806 }, { "epoch": 0.23872906826964363, "grad_norm": 0.4227634072303772, "learning_rate": 0.00017338321413325859, "loss": 0.0407, "step": 1807 }, { "epoch": 0.23886118175512766, "grad_norm": 0.19542503356933594, "learning_rate": 0.00017335497415728145, "loss": 0.0193, "step": 1808 }, { "epoch": 0.2389932952406117, "grad_norm": 0.18369914591312408, "learning_rate": 0.00017332672151060404, "loss": 0.0229, "step": 1809 }, { "epoch": 0.23912540872609572, "grad_norm": 0.2526663839817047, "learning_rate": 0.00017329845619810653, "loss": 0.041, "step": 1810 }, { "epoch": 0.23925752221157975, "grad_norm": 0.20717869699001312, "learning_rate": 0.00017327017822467119, "loss": 0.0187, "step": 1811 }, { "epoch": 0.23938963569706379, "grad_norm": 0.23846372961997986, "learning_rate": 0.00017324188759518252, "loss": 0.0212, "step": 1812 }, { "epoch": 0.23952174918254782, "grad_norm": 0.1714830845594406, "learning_rate": 0.00017321358431452718, "loss": 0.0239, "step": 1813 }, { "epoch": 0.23965386266803185, "grad_norm": 0.18644586205482483, "learning_rate": 0.00017318526838759404, "loss": 0.0317, "step": 1814 }, { "epoch": 0.23978597615351588, "grad_norm": 0.14925703406333923, "learning_rate": 0.00017315693981927416, "loss": 0.0154, "step": 1815 }, { "epoch": 0.2399180896389999, "grad_norm": 0.16923202574253082, "learning_rate": 0.00017312859861446075, "loss": 0.0229, "step": 1816 }, { "epoch": 0.24005020312448394, "grad_norm": 0.30296310782432556, "learning_rate": 0.00017310024477804926, "loss": 0.0346, "step": 1817 }, { "epoch": 0.24018231660996797, "grad_norm": 0.2446068674325943, "learning_rate": 0.00017307187831493726, "loss": 0.0268, "step": 1818 }, { "epoch": 0.240314430095452, "grad_norm": 0.26852089166641235, "learning_rate": 0.0001730434992300245, "loss": 0.0407, "step": 1819 }, { "epoch": 0.24044654358093603, "grad_norm": 0.18075965344905853, "learning_rate": 0.00017301510752821297, "loss": 0.024, "step": 1820 }, { "epoch": 0.24057865706642007, "grad_norm": 0.1839093118906021, "learning_rate": 0.0001729867032144068, "loss": 0.0278, "step": 1821 }, { "epoch": 0.2407107705519041, "grad_norm": 0.17958417534828186, "learning_rate": 0.00017295828629351233, "loss": 0.0214, "step": 1822 }, { "epoch": 0.24084288403738813, "grad_norm": 0.17468973994255066, "learning_rate": 0.00017292985677043796, "loss": 0.0236, "step": 1823 }, { "epoch": 0.24097499752287216, "grad_norm": 0.27770498394966125, "learning_rate": 0.00017290141465009447, "loss": 0.032, "step": 1824 }, { "epoch": 0.2411071110083562, "grad_norm": 0.16993235051631927, "learning_rate": 0.00017287295993739465, "loss": 0.0146, "step": 1825 }, { "epoch": 0.24123922449384022, "grad_norm": 0.2006664276123047, "learning_rate": 0.00017284449263725354, "loss": 0.0316, "step": 1826 }, { "epoch": 0.24137133797932425, "grad_norm": 0.33198291063308716, "learning_rate": 0.00017281601275458827, "loss": 0.0185, "step": 1827 }, { "epoch": 0.24150345146480828, "grad_norm": 0.3680172562599182, "learning_rate": 0.0001727875202943183, "loss": 0.0266, "step": 1828 }, { "epoch": 0.24163556495029231, "grad_norm": 0.18102778494358063, "learning_rate": 0.00017275901526136512, "loss": 0.0172, "step": 1829 }, { "epoch": 0.24176767843577635, "grad_norm": 0.36317577958106995, "learning_rate": 0.00017273049766065244, "loss": 0.029, "step": 1830 }, { "epoch": 0.24189979192126038, "grad_norm": 0.15369760990142822, "learning_rate": 0.00017270196749710618, "loss": 0.0192, "step": 1831 }, { "epoch": 0.24203190540674438, "grad_norm": 0.16851592063903809, "learning_rate": 0.00017267342477565433, "loss": 0.0219, "step": 1832 }, { "epoch": 0.2421640188922284, "grad_norm": 0.22588814795017242, "learning_rate": 0.00017264486950122716, "loss": 0.0261, "step": 1833 }, { "epoch": 0.24229613237771244, "grad_norm": 0.20349568128585815, "learning_rate": 0.000172616301678757, "loss": 0.0323, "step": 1834 }, { "epoch": 0.24242824586319647, "grad_norm": 0.17781049013137817, "learning_rate": 0.00017258772131317852, "loss": 0.0319, "step": 1835 }, { "epoch": 0.2425603593486805, "grad_norm": 0.25952428579330444, "learning_rate": 0.00017255912840942833, "loss": 0.0293, "step": 1836 }, { "epoch": 0.24269247283416454, "grad_norm": 0.23489899933338165, "learning_rate": 0.00017253052297244538, "loss": 0.0387, "step": 1837 }, { "epoch": 0.24282458631964857, "grad_norm": 0.3075209856033325, "learning_rate": 0.00017250190500717075, "loss": 0.0252, "step": 1838 }, { "epoch": 0.2429566998051326, "grad_norm": 0.3768205940723419, "learning_rate": 0.00017247327451854757, "loss": 0.0508, "step": 1839 }, { "epoch": 0.24308881329061663, "grad_norm": 0.15840856730937958, "learning_rate": 0.00017244463151152125, "loss": 0.0137, "step": 1840 }, { "epoch": 0.24322092677610066, "grad_norm": 0.31808412075042725, "learning_rate": 0.0001724159759910394, "loss": 0.0395, "step": 1841 }, { "epoch": 0.2433530402615847, "grad_norm": 0.7312543392181396, "learning_rate": 0.00017238730796205167, "loss": 0.0197, "step": 1842 }, { "epoch": 0.24348515374706872, "grad_norm": 0.15382172167301178, "learning_rate": 0.0001723586274295099, "loss": 0.017, "step": 1843 }, { "epoch": 0.24361726723255275, "grad_norm": 0.25445330142974854, "learning_rate": 0.0001723299343983682, "loss": 0.0198, "step": 1844 }, { "epoch": 0.24374938071803678, "grad_norm": 0.2107013463973999, "learning_rate": 0.00017230122887358269, "loss": 0.0278, "step": 1845 }, { "epoch": 0.24388149420352082, "grad_norm": 0.25929173827171326, "learning_rate": 0.0001722725108601117, "loss": 0.0141, "step": 1846 }, { "epoch": 0.24401360768900485, "grad_norm": 0.2503575384616852, "learning_rate": 0.00017224378036291576, "loss": 0.022, "step": 1847 }, { "epoch": 0.24414572117448888, "grad_norm": 0.24697549641132355, "learning_rate": 0.00017221503738695757, "loss": 0.0301, "step": 1848 }, { "epoch": 0.2442778346599729, "grad_norm": 0.25922176241874695, "learning_rate": 0.00017218628193720186, "loss": 0.0295, "step": 1849 }, { "epoch": 0.24440994814545694, "grad_norm": 0.20222648978233337, "learning_rate": 0.00017215751401861563, "loss": 0.0134, "step": 1850 }, { "epoch": 0.24454206163094097, "grad_norm": 0.19815732538700104, "learning_rate": 0.00017212873363616803, "loss": 0.0257, "step": 1851 }, { "epoch": 0.244674175116425, "grad_norm": 0.16871704161167145, "learning_rate": 0.00017209994079483027, "loss": 0.016, "step": 1852 }, { "epoch": 0.24480628860190903, "grad_norm": 0.21894827485084534, "learning_rate": 0.00017207113549957582, "loss": 0.0182, "step": 1853 }, { "epoch": 0.24493840208739306, "grad_norm": 0.211959108710289, "learning_rate": 0.00017204231775538027, "loss": 0.0315, "step": 1854 }, { "epoch": 0.2450705155728771, "grad_norm": 0.2773512899875641, "learning_rate": 0.0001720134875672213, "loss": 0.0371, "step": 1855 }, { "epoch": 0.24520262905836113, "grad_norm": 0.23648853600025177, "learning_rate": 0.00017198464494007883, "loss": 0.0279, "step": 1856 }, { "epoch": 0.24533474254384516, "grad_norm": 0.569492757320404, "learning_rate": 0.00017195578987893483, "loss": 0.0586, "step": 1857 }, { "epoch": 0.2454668560293292, "grad_norm": 0.23297181725502014, "learning_rate": 0.00017192692238877352, "loss": 0.0214, "step": 1858 }, { "epoch": 0.24559896951481322, "grad_norm": 0.28752514719963074, "learning_rate": 0.0001718980424745812, "loss": 0.0361, "step": 1859 }, { "epoch": 0.24573108300029725, "grad_norm": 0.25073346495628357, "learning_rate": 0.00017186915014134635, "loss": 0.0255, "step": 1860 }, { "epoch": 0.24586319648578128, "grad_norm": 0.2746346890926361, "learning_rate": 0.00017184024539405957, "loss": 0.0297, "step": 1861 }, { "epoch": 0.2459953099712653, "grad_norm": 0.2706051170825958, "learning_rate": 0.0001718113282377136, "loss": 0.0269, "step": 1862 }, { "epoch": 0.24612742345674934, "grad_norm": 0.27474790811538696, "learning_rate": 0.00017178239867730337, "loss": 0.0192, "step": 1863 }, { "epoch": 0.24625953694223338, "grad_norm": 0.18939033150672913, "learning_rate": 0.00017175345671782588, "loss": 0.0306, "step": 1864 }, { "epoch": 0.2463916504277174, "grad_norm": 0.16039957106113434, "learning_rate": 0.00017172450236428035, "loss": 0.0264, "step": 1865 }, { "epoch": 0.24652376391320144, "grad_norm": 0.27674245834350586, "learning_rate": 0.00017169553562166811, "loss": 0.0292, "step": 1866 }, { "epoch": 0.24665587739868547, "grad_norm": 0.26449286937713623, "learning_rate": 0.00017166655649499256, "loss": 0.0411, "step": 1867 }, { "epoch": 0.2467879908841695, "grad_norm": 0.24633260071277618, "learning_rate": 0.00017163756498925938, "loss": 0.031, "step": 1868 }, { "epoch": 0.24692010436965353, "grad_norm": 0.34218135476112366, "learning_rate": 0.00017160856110947625, "loss": 0.022, "step": 1869 }, { "epoch": 0.24705221785513756, "grad_norm": 0.3518677055835724, "learning_rate": 0.00017157954486065311, "loss": 0.0481, "step": 1870 }, { "epoch": 0.2471843313406216, "grad_norm": 0.1745559424161911, "learning_rate": 0.0001715505162478019, "loss": 0.023, "step": 1871 }, { "epoch": 0.24731644482610562, "grad_norm": 0.19944533705711365, "learning_rate": 0.00017152147527593682, "loss": 0.0229, "step": 1872 }, { "epoch": 0.24744855831158966, "grad_norm": 0.1665343940258026, "learning_rate": 0.00017149242195007417, "loss": 0.0192, "step": 1873 }, { "epoch": 0.2475806717970737, "grad_norm": 0.19697602093219757, "learning_rate": 0.0001714633562752323, "loss": 0.0183, "step": 1874 }, { "epoch": 0.24771278528255772, "grad_norm": 0.3007800877094269, "learning_rate": 0.00017143427825643182, "loss": 0.0288, "step": 1875 }, { "epoch": 0.24784489876804175, "grad_norm": 0.194051593542099, "learning_rate": 0.0001714051878986954, "loss": 0.0279, "step": 1876 }, { "epoch": 0.24797701225352578, "grad_norm": 0.18599578738212585, "learning_rate": 0.00017137608520704785, "loss": 0.0171, "step": 1877 }, { "epoch": 0.2481091257390098, "grad_norm": 0.3460446000099182, "learning_rate": 0.00017134697018651612, "loss": 0.0382, "step": 1878 }, { "epoch": 0.24824123922449384, "grad_norm": 0.23564647138118744, "learning_rate": 0.00017131784284212927, "loss": 0.038, "step": 1879 }, { "epoch": 0.24837335270997787, "grad_norm": 0.2204003930091858, "learning_rate": 0.00017128870317891854, "loss": 0.0343, "step": 1880 }, { "epoch": 0.2485054661954619, "grad_norm": 0.1897745579481125, "learning_rate": 0.00017125955120191725, "loss": 0.02, "step": 1881 }, { "epoch": 0.24863757968094594, "grad_norm": 0.31301870942115784, "learning_rate": 0.0001712303869161608, "loss": 0.0233, "step": 1882 }, { "epoch": 0.24876969316642997, "grad_norm": 0.2227194607257843, "learning_rate": 0.00017120121032668687, "loss": 0.038, "step": 1883 }, { "epoch": 0.248901806651914, "grad_norm": 0.25485947728157043, "learning_rate": 0.0001711720214385351, "loss": 0.0435, "step": 1884 }, { "epoch": 0.24903392013739803, "grad_norm": 0.2976076602935791, "learning_rate": 0.00017114282025674734, "loss": 0.0284, "step": 1885 }, { "epoch": 0.24916603362288206, "grad_norm": 0.18369312584400177, "learning_rate": 0.0001711136067863676, "loss": 0.02, "step": 1886 }, { "epoch": 0.2492981471083661, "grad_norm": 0.24397863447666168, "learning_rate": 0.00017108438103244188, "loss": 0.0365, "step": 1887 }, { "epoch": 0.24943026059385012, "grad_norm": 0.1897364854812622, "learning_rate": 0.0001710551430000184, "loss": 0.0183, "step": 1888 }, { "epoch": 0.24956237407933415, "grad_norm": 0.24037641286849976, "learning_rate": 0.00017102589269414758, "loss": 0.0392, "step": 1889 }, { "epoch": 0.24969448756481818, "grad_norm": 0.20401312410831451, "learning_rate": 0.00017099663011988173, "loss": 0.0231, "step": 1890 }, { "epoch": 0.24982660105030222, "grad_norm": 0.2197640836238861, "learning_rate": 0.00017096735528227547, "loss": 0.0214, "step": 1891 }, { "epoch": 0.24995871453578625, "grad_norm": 0.4697629511356354, "learning_rate": 0.0001709380681863855, "loss": 0.0183, "step": 1892 }, { "epoch": 0.25009082802127025, "grad_norm": 0.3376876413822174, "learning_rate": 0.0001709087688372706, "loss": 0.0324, "step": 1893 }, { "epoch": 0.2502229415067543, "grad_norm": 0.3038407862186432, "learning_rate": 0.0001708794572399917, "loss": 0.034, "step": 1894 }, { "epoch": 0.2503550549922383, "grad_norm": 0.2301395833492279, "learning_rate": 0.00017085013339961178, "loss": 0.0342, "step": 1895 }, { "epoch": 0.25048716847772234, "grad_norm": 0.20109249651432037, "learning_rate": 0.000170820797321196, "loss": 0.029, "step": 1896 }, { "epoch": 0.2506192819632064, "grad_norm": 0.16469945013523102, "learning_rate": 0.0001707914490098117, "loss": 0.0158, "step": 1897 }, { "epoch": 0.2507513954486904, "grad_norm": 0.174901083111763, "learning_rate": 0.00017076208847052816, "loss": 0.0272, "step": 1898 }, { "epoch": 0.25088350893417444, "grad_norm": 0.2955951988697052, "learning_rate": 0.00017073271570841692, "loss": 0.0432, "step": 1899 }, { "epoch": 0.25101562241965847, "grad_norm": 0.47816672921180725, "learning_rate": 0.0001707033307285515, "loss": 0.0292, "step": 1900 }, { "epoch": 0.2511477359051425, "grad_norm": 0.25690802931785583, "learning_rate": 0.0001706739335360077, "loss": 0.018, "step": 1901 }, { "epoch": 0.25127984939062653, "grad_norm": 0.37199512124061584, "learning_rate": 0.00017064452413586328, "loss": 0.0442, "step": 1902 }, { "epoch": 0.25141196287611056, "grad_norm": 0.21501943469047546, "learning_rate": 0.00017061510253319815, "loss": 0.0256, "step": 1903 }, { "epoch": 0.2515440763615946, "grad_norm": 0.33642202615737915, "learning_rate": 0.00017058566873309438, "loss": 0.0381, "step": 1904 }, { "epoch": 0.2516761898470786, "grad_norm": 0.1645767092704773, "learning_rate": 0.0001705562227406361, "loss": 0.0355, "step": 1905 }, { "epoch": 0.25180830333256266, "grad_norm": 0.24579696357250214, "learning_rate": 0.00017052676456090956, "loss": 0.0257, "step": 1906 }, { "epoch": 0.2519404168180467, "grad_norm": 0.2509557008743286, "learning_rate": 0.0001704972941990031, "loss": 0.0295, "step": 1907 }, { "epoch": 0.2520725303035307, "grad_norm": 0.23168303072452545, "learning_rate": 0.00017046781166000716, "loss": 0.0193, "step": 1908 }, { "epoch": 0.25220464378901475, "grad_norm": 0.15511181950569153, "learning_rate": 0.00017043831694901434, "loss": 0.024, "step": 1909 }, { "epoch": 0.2523367572744988, "grad_norm": 0.24736307561397552, "learning_rate": 0.00017040881007111925, "loss": 0.0275, "step": 1910 }, { "epoch": 0.2524688707599828, "grad_norm": 0.19190946221351624, "learning_rate": 0.00017037929103141865, "loss": 0.0241, "step": 1911 }, { "epoch": 0.25260098424546684, "grad_norm": 0.14852671325206757, "learning_rate": 0.00017034975983501146, "loss": 0.0174, "step": 1912 }, { "epoch": 0.2527330977309509, "grad_norm": 0.42580118775367737, "learning_rate": 0.00017032021648699858, "loss": 0.02, "step": 1913 }, { "epoch": 0.2528652112164349, "grad_norm": 0.23213063180446625, "learning_rate": 0.00017029066099248313, "loss": 0.0464, "step": 1914 }, { "epoch": 0.25299732470191894, "grad_norm": 0.40431395173072815, "learning_rate": 0.00017026109335657022, "loss": 0.0389, "step": 1915 }, { "epoch": 0.25312943818740297, "grad_norm": 0.26642554998397827, "learning_rate": 0.0001702315135843671, "loss": 0.0217, "step": 1916 }, { "epoch": 0.253261551672887, "grad_norm": 0.3669357895851135, "learning_rate": 0.0001702019216809832, "loss": 0.0404, "step": 1917 }, { "epoch": 0.25339366515837103, "grad_norm": 0.14874513447284698, "learning_rate": 0.00017017231765152987, "loss": 0.0166, "step": 1918 }, { "epoch": 0.25352577864385506, "grad_norm": 0.29495328664779663, "learning_rate": 0.0001701427015011207, "loss": 0.0228, "step": 1919 }, { "epoch": 0.2536578921293391, "grad_norm": 0.3058466613292694, "learning_rate": 0.00017011307323487132, "loss": 0.023, "step": 1920 }, { "epoch": 0.2537900056148231, "grad_norm": 0.2111206203699112, "learning_rate": 0.00017008343285789953, "loss": 0.0319, "step": 1921 }, { "epoch": 0.25392211910030715, "grad_norm": 0.26292023062705994, "learning_rate": 0.000170053780375325, "loss": 0.0314, "step": 1922 }, { "epoch": 0.2540542325857912, "grad_norm": 0.18287408351898193, "learning_rate": 0.00017002411579226974, "loss": 0.0282, "step": 1923 }, { "epoch": 0.2541863460712752, "grad_norm": 0.21615292131900787, "learning_rate": 0.00016999443911385774, "loss": 0.0266, "step": 1924 }, { "epoch": 0.25431845955675925, "grad_norm": 0.240324005484581, "learning_rate": 0.00016996475034521512, "loss": 0.0378, "step": 1925 }, { "epoch": 0.2544505730422433, "grad_norm": 0.24561507999897003, "learning_rate": 0.00016993504949147, "loss": 0.0387, "step": 1926 }, { "epoch": 0.2545826865277273, "grad_norm": 0.26725414395332336, "learning_rate": 0.00016990533655775268, "loss": 0.0189, "step": 1927 }, { "epoch": 0.25471480001321134, "grad_norm": 0.21090959012508392, "learning_rate": 0.0001698756115491955, "loss": 0.0205, "step": 1928 }, { "epoch": 0.25484691349869537, "grad_norm": 0.25421762466430664, "learning_rate": 0.00016984587447093293, "loss": 0.0341, "step": 1929 }, { "epoch": 0.2549790269841794, "grad_norm": 0.14478729665279388, "learning_rate": 0.00016981612532810145, "loss": 0.0202, "step": 1930 }, { "epoch": 0.25511114046966343, "grad_norm": 0.18194665014743805, "learning_rate": 0.0001697863641258397, "loss": 0.0281, "step": 1931 }, { "epoch": 0.25524325395514746, "grad_norm": 0.2574337124824524, "learning_rate": 0.00016975659086928836, "loss": 0.0294, "step": 1932 }, { "epoch": 0.2553753674406315, "grad_norm": 0.19785122573375702, "learning_rate": 0.0001697268055635902, "loss": 0.0201, "step": 1933 }, { "epoch": 0.2555074809261155, "grad_norm": 0.37554481625556946, "learning_rate": 0.00016969700821389008, "loss": 0.0463, "step": 1934 }, { "epoch": 0.25563959441159956, "grad_norm": 0.3940991461277008, "learning_rate": 0.00016966719882533497, "loss": 0.0197, "step": 1935 }, { "epoch": 0.2557717078970836, "grad_norm": 0.21494190394878387, "learning_rate": 0.00016963737740307381, "loss": 0.0378, "step": 1936 }, { "epoch": 0.2559038213825676, "grad_norm": 0.24298927187919617, "learning_rate": 0.00016960754395225775, "loss": 0.0306, "step": 1937 }, { "epoch": 0.25603593486805165, "grad_norm": 0.2017381191253662, "learning_rate": 0.00016957769847803994, "loss": 0.021, "step": 1938 }, { "epoch": 0.2561680483535357, "grad_norm": 0.21786369383335114, "learning_rate": 0.00016954784098557565, "loss": 0.0253, "step": 1939 }, { "epoch": 0.2563001618390197, "grad_norm": 0.28797072172164917, "learning_rate": 0.00016951797148002216, "loss": 0.0306, "step": 1940 }, { "epoch": 0.25643227532450374, "grad_norm": 0.17211690545082092, "learning_rate": 0.00016948808996653889, "loss": 0.0189, "step": 1941 }, { "epoch": 0.2565643888099878, "grad_norm": 0.2314436137676239, "learning_rate": 0.00016945819645028731, "loss": 0.0194, "step": 1942 }, { "epoch": 0.2566965022954718, "grad_norm": 0.3278854489326477, "learning_rate": 0.000169428290936431, "loss": 0.0297, "step": 1943 }, { "epoch": 0.25682861578095584, "grad_norm": 0.1829918473958969, "learning_rate": 0.00016939837343013552, "loss": 0.0245, "step": 1944 }, { "epoch": 0.25696072926643987, "grad_norm": 0.3587496876716614, "learning_rate": 0.00016936844393656864, "loss": 0.027, "step": 1945 }, { "epoch": 0.2570928427519239, "grad_norm": 0.20004841685295105, "learning_rate": 0.0001693385024609, "loss": 0.018, "step": 1946 }, { "epoch": 0.25722495623740793, "grad_norm": 0.24667544662952423, "learning_rate": 0.00016930854900830156, "loss": 0.0203, "step": 1947 }, { "epoch": 0.25735706972289196, "grad_norm": 0.34233590960502625, "learning_rate": 0.00016927858358394712, "loss": 0.0307, "step": 1948 }, { "epoch": 0.257489183208376, "grad_norm": 0.16642144322395325, "learning_rate": 0.00016924860619301271, "loss": 0.0186, "step": 1949 }, { "epoch": 0.25762129669386, "grad_norm": 0.22433346509933472, "learning_rate": 0.00016921861684067633, "loss": 0.0145, "step": 1950 }, { "epoch": 0.25775341017934406, "grad_norm": 0.9970558285713196, "learning_rate": 0.0001691886155321181, "loss": 0.0281, "step": 1951 }, { "epoch": 0.2578855236648281, "grad_norm": 0.2507546842098236, "learning_rate": 0.0001691586022725202, "loss": 0.0133, "step": 1952 }, { "epoch": 0.2580176371503121, "grad_norm": 0.15097548067569733, "learning_rate": 0.0001691285770670668, "loss": 0.0192, "step": 1953 }, { "epoch": 0.25814975063579615, "grad_norm": 0.22995054721832275, "learning_rate": 0.0001690985399209442, "loss": 0.036, "step": 1954 }, { "epoch": 0.2582818641212802, "grad_norm": 0.2987491190433502, "learning_rate": 0.00016906849083934083, "loss": 0.0299, "step": 1955 }, { "epoch": 0.2584139776067642, "grad_norm": 0.3527694344520569, "learning_rate": 0.00016903842982744704, "loss": 0.0222, "step": 1956 }, { "epoch": 0.25854609109224824, "grad_norm": 0.4488008916378021, "learning_rate": 0.00016900835689045535, "loss": 0.0305, "step": 1957 }, { "epoch": 0.2586782045777323, "grad_norm": 0.18324284255504608, "learning_rate": 0.00016897827203356025, "loss": 0.0204, "step": 1958 }, { "epoch": 0.2588103180632163, "grad_norm": 0.303751677274704, "learning_rate": 0.00016894817526195833, "loss": 0.0362, "step": 1959 }, { "epoch": 0.25894243154870034, "grad_norm": 0.29179519414901733, "learning_rate": 0.0001689180665808483, "loss": 0.0364, "step": 1960 }, { "epoch": 0.25907454503418437, "grad_norm": 0.16983579099178314, "learning_rate": 0.00016888794599543089, "loss": 0.0266, "step": 1961 }, { "epoch": 0.2592066585196684, "grad_norm": 0.23435205221176147, "learning_rate": 0.0001688578135109088, "loss": 0.0293, "step": 1962 }, { "epoch": 0.25933877200515243, "grad_norm": 0.20603543519973755, "learning_rate": 0.00016882766913248686, "loss": 0.0168, "step": 1963 }, { "epoch": 0.25947088549063646, "grad_norm": 0.32107990980148315, "learning_rate": 0.000168797512865372, "loss": 0.0268, "step": 1964 }, { "epoch": 0.2596029989761205, "grad_norm": 0.22059336304664612, "learning_rate": 0.00016876734471477312, "loss": 0.0187, "step": 1965 }, { "epoch": 0.2597351124616045, "grad_norm": 0.23701448738574982, "learning_rate": 0.00016873716468590117, "loss": 0.0211, "step": 1966 }, { "epoch": 0.25986722594708855, "grad_norm": 0.21445463597774506, "learning_rate": 0.00016870697278396923, "loss": 0.0318, "step": 1967 }, { "epoch": 0.2599993394325726, "grad_norm": 0.1952168196439743, "learning_rate": 0.00016867676901419237, "loss": 0.0284, "step": 1968 }, { "epoch": 0.2601314529180566, "grad_norm": 0.2025042474269867, "learning_rate": 0.00016864655338178777, "loss": 0.016, "step": 1969 }, { "epoch": 0.26026356640354065, "grad_norm": 0.20277979969978333, "learning_rate": 0.00016861632589197453, "loss": 0.0275, "step": 1970 }, { "epoch": 0.2603956798890247, "grad_norm": 0.18800503015518188, "learning_rate": 0.00016858608654997395, "loss": 0.0198, "step": 1971 }, { "epoch": 0.2605277933745087, "grad_norm": 0.1867101937532425, "learning_rate": 0.00016855583536100926, "loss": 0.0243, "step": 1972 }, { "epoch": 0.26065990685999274, "grad_norm": 0.28561997413635254, "learning_rate": 0.00016852557233030586, "loss": 0.03, "step": 1973 }, { "epoch": 0.26079202034547677, "grad_norm": 0.24048177897930145, "learning_rate": 0.00016849529746309108, "loss": 0.0296, "step": 1974 }, { "epoch": 0.2609241338309608, "grad_norm": 0.24759027361869812, "learning_rate": 0.00016846501076459434, "loss": 0.0327, "step": 1975 }, { "epoch": 0.26105624731644483, "grad_norm": 0.27090364694595337, "learning_rate": 0.00016843471224004704, "loss": 0.0362, "step": 1976 }, { "epoch": 0.26118836080192886, "grad_norm": 0.29066452383995056, "learning_rate": 0.0001684044018946828, "loss": 0.0434, "step": 1977 }, { "epoch": 0.2613204742874129, "grad_norm": 0.2134980857372284, "learning_rate": 0.0001683740797337371, "loss": 0.0242, "step": 1978 }, { "epoch": 0.2614525877728969, "grad_norm": 0.256011426448822, "learning_rate": 0.00016834374576244753, "loss": 0.0407, "step": 1979 }, { "epoch": 0.26158470125838096, "grad_norm": 0.22266724705696106, "learning_rate": 0.00016831339998605373, "loss": 0.0241, "step": 1980 }, { "epoch": 0.261716814743865, "grad_norm": 0.29499000310897827, "learning_rate": 0.00016828304240979735, "loss": 0.0221, "step": 1981 }, { "epoch": 0.261848928229349, "grad_norm": 0.19685974717140198, "learning_rate": 0.0001682526730389221, "loss": 0.0189, "step": 1982 }, { "epoch": 0.26198104171483305, "grad_norm": 0.2072879672050476, "learning_rate": 0.00016822229187867373, "loss": 0.0209, "step": 1983 }, { "epoch": 0.2621131552003171, "grad_norm": 0.33101019263267517, "learning_rate": 0.00016819189893429998, "loss": 0.031, "step": 1984 }, { "epoch": 0.2622452686858011, "grad_norm": 0.14240136742591858, "learning_rate": 0.00016816149421105072, "loss": 0.0179, "step": 1985 }, { "epoch": 0.26237738217128515, "grad_norm": 0.23045289516448975, "learning_rate": 0.00016813107771417775, "loss": 0.0238, "step": 1986 }, { "epoch": 0.2625094956567692, "grad_norm": 0.2774489223957062, "learning_rate": 0.000168100649448935, "loss": 0.0327, "step": 1987 }, { "epoch": 0.2626416091422532, "grad_norm": 0.20476806163787842, "learning_rate": 0.0001680702094205783, "loss": 0.0251, "step": 1988 }, { "epoch": 0.26277372262773724, "grad_norm": 0.28308263421058655, "learning_rate": 0.0001680397576343657, "loss": 0.0411, "step": 1989 }, { "epoch": 0.26290583611322127, "grad_norm": 0.12856468558311462, "learning_rate": 0.0001680092940955571, "loss": 0.0145, "step": 1990 }, { "epoch": 0.2630379495987053, "grad_norm": 0.3462192118167877, "learning_rate": 0.00016797881880941455, "loss": 0.0345, "step": 1991 }, { "epoch": 0.26317006308418933, "grad_norm": 0.18518102169036865, "learning_rate": 0.00016794833178120205, "loss": 0.0263, "step": 1992 }, { "epoch": 0.26330217656967336, "grad_norm": 0.2983401119709015, "learning_rate": 0.00016791783301618572, "loss": 0.0222, "step": 1993 }, { "epoch": 0.2634342900551574, "grad_norm": 0.20992180705070496, "learning_rate": 0.00016788732251963356, "loss": 0.0222, "step": 1994 }, { "epoch": 0.2635664035406414, "grad_norm": 0.24977165460586548, "learning_rate": 0.0001678568002968158, "loss": 0.0386, "step": 1995 }, { "epoch": 0.26369851702612546, "grad_norm": 0.2119644284248352, "learning_rate": 0.0001678262663530045, "loss": 0.0137, "step": 1996 }, { "epoch": 0.2638306305116095, "grad_norm": 0.2339543104171753, "learning_rate": 0.00016779572069347385, "loss": 0.0329, "step": 1997 }, { "epoch": 0.2639627439970935, "grad_norm": 0.25529757142066956, "learning_rate": 0.00016776516332350005, "loss": 0.0397, "step": 1998 }, { "epoch": 0.26409485748257755, "grad_norm": 0.22438396513462067, "learning_rate": 0.0001677345942483613, "loss": 0.0247, "step": 1999 }, { "epoch": 0.2642269709680616, "grad_norm": 0.24435588717460632, "learning_rate": 0.00016770401347333786, "loss": 0.0139, "step": 2000 }, { "epoch": 0.2643590844535456, "grad_norm": 0.2665388286113739, "learning_rate": 0.00016767342100371195, "loss": 0.0234, "step": 2001 }, { "epoch": 0.26449119793902964, "grad_norm": 0.28348082304000854, "learning_rate": 0.0001676428168447679, "loss": 0.0291, "step": 2002 }, { "epoch": 0.2646233114245137, "grad_norm": 0.20156100392341614, "learning_rate": 0.00016761220100179196, "loss": 0.0266, "step": 2003 }, { "epoch": 0.2647554249099977, "grad_norm": 0.21052148938179016, "learning_rate": 0.00016758157348007246, "loss": 0.0264, "step": 2004 }, { "epoch": 0.26488753839548174, "grad_norm": 0.2750470042228699, "learning_rate": 0.00016755093428489975, "loss": 0.048, "step": 2005 }, { "epoch": 0.26501965188096577, "grad_norm": 0.2317686378955841, "learning_rate": 0.0001675202834215661, "loss": 0.0203, "step": 2006 }, { "epoch": 0.2651517653664498, "grad_norm": 0.2592511773109436, "learning_rate": 0.00016748962089536601, "loss": 0.0229, "step": 2007 }, { "epoch": 0.26528387885193383, "grad_norm": 0.20439298450946808, "learning_rate": 0.00016745894671159578, "loss": 0.0217, "step": 2008 }, { "epoch": 0.26541599233741786, "grad_norm": 0.1797826588153839, "learning_rate": 0.00016742826087555375, "loss": 0.0248, "step": 2009 }, { "epoch": 0.2655481058229019, "grad_norm": 0.21490655839443207, "learning_rate": 0.0001673975633925404, "loss": 0.0224, "step": 2010 }, { "epoch": 0.2656802193083859, "grad_norm": 0.23416008055210114, "learning_rate": 0.00016736685426785815, "loss": 0.0201, "step": 2011 }, { "epoch": 0.26581233279386995, "grad_norm": 0.2559327483177185, "learning_rate": 0.00016733613350681137, "loss": 0.0268, "step": 2012 }, { "epoch": 0.265944446279354, "grad_norm": 0.29835590720176697, "learning_rate": 0.00016730540111470652, "loss": 0.034, "step": 2013 }, { "epoch": 0.266076559764838, "grad_norm": 0.22217904031276703, "learning_rate": 0.00016727465709685208, "loss": 0.0296, "step": 2014 }, { "epoch": 0.26620867325032205, "grad_norm": 0.23027774691581726, "learning_rate": 0.00016724390145855846, "loss": 0.0234, "step": 2015 }, { "epoch": 0.2663407867358061, "grad_norm": 0.24703939259052277, "learning_rate": 0.00016721313420513817, "loss": 0.0248, "step": 2016 }, { "epoch": 0.2664729002212901, "grad_norm": 0.26446980237960815, "learning_rate": 0.00016718235534190563, "loss": 0.0216, "step": 2017 }, { "epoch": 0.26660501370677414, "grad_norm": 0.16694585978984833, "learning_rate": 0.0001671515648741773, "loss": 0.0203, "step": 2018 }, { "epoch": 0.26673712719225817, "grad_norm": 0.21245832741260529, "learning_rate": 0.00016712076280727173, "loss": 0.0237, "step": 2019 }, { "epoch": 0.2668692406777422, "grad_norm": 0.18598133325576782, "learning_rate": 0.00016708994914650934, "loss": 0.0207, "step": 2020 }, { "epoch": 0.26700135416322623, "grad_norm": 0.24841630458831787, "learning_rate": 0.00016705912389721267, "loss": 0.0348, "step": 2021 }, { "epoch": 0.26713346764871027, "grad_norm": 0.17656932771205902, "learning_rate": 0.00016702828706470615, "loss": 0.0202, "step": 2022 }, { "epoch": 0.2672655811341943, "grad_norm": 0.26282253861427307, "learning_rate": 0.00016699743865431627, "loss": 0.0303, "step": 2023 }, { "epoch": 0.2673976946196783, "grad_norm": 0.2911767065525055, "learning_rate": 0.00016696657867137156, "loss": 0.0286, "step": 2024 }, { "epoch": 0.26752980810516236, "grad_norm": 0.16267457604408264, "learning_rate": 0.00016693570712120247, "loss": 0.0095, "step": 2025 }, { "epoch": 0.2676619215906464, "grad_norm": 0.37082239985466003, "learning_rate": 0.00016690482400914144, "loss": 0.0298, "step": 2026 }, { "epoch": 0.2677940350761304, "grad_norm": 0.18662765622138977, "learning_rate": 0.00016687392934052305, "loss": 0.0238, "step": 2027 }, { "epoch": 0.26792614856161445, "grad_norm": 0.22798240184783936, "learning_rate": 0.00016684302312068374, "loss": 0.0289, "step": 2028 }, { "epoch": 0.2680582620470985, "grad_norm": 0.23374059796333313, "learning_rate": 0.00016681210535496194, "loss": 0.0281, "step": 2029 }, { "epoch": 0.2681903755325825, "grad_norm": 0.19504158198833466, "learning_rate": 0.00016678117604869815, "loss": 0.0175, "step": 2030 }, { "epoch": 0.26832248901806655, "grad_norm": 0.23058202862739563, "learning_rate": 0.0001667502352072348, "loss": 0.0166, "step": 2031 }, { "epoch": 0.2684546025035506, "grad_norm": 0.23326201736927032, "learning_rate": 0.0001667192828359164, "loss": 0.0317, "step": 2032 }, { "epoch": 0.2685867159890346, "grad_norm": 0.2174236923456192, "learning_rate": 0.00016668831894008936, "loss": 0.0244, "step": 2033 }, { "epoch": 0.26871882947451864, "grad_norm": 0.21321603655815125, "learning_rate": 0.00016665734352510207, "loss": 0.0211, "step": 2034 }, { "epoch": 0.2688509429600026, "grad_norm": 0.29613959789276123, "learning_rate": 0.00016662635659630504, "loss": 0.0411, "step": 2035 }, { "epoch": 0.26898305644548665, "grad_norm": 0.2365838587284088, "learning_rate": 0.00016659535815905064, "loss": 0.0223, "step": 2036 }, { "epoch": 0.2691151699309707, "grad_norm": 0.31860825419425964, "learning_rate": 0.00016656434821869323, "loss": 0.035, "step": 2037 }, { "epoch": 0.2692472834164547, "grad_norm": 0.17729634046554565, "learning_rate": 0.00016653332678058928, "loss": 0.0176, "step": 2038 }, { "epoch": 0.26937939690193874, "grad_norm": 0.19119992852210999, "learning_rate": 0.0001665022938500971, "loss": 0.018, "step": 2039 }, { "epoch": 0.26951151038742277, "grad_norm": 0.18928834795951843, "learning_rate": 0.0001664712494325771, "loss": 0.0336, "step": 2040 }, { "epoch": 0.2696436238729068, "grad_norm": 0.191539004445076, "learning_rate": 0.00016644019353339153, "loss": 0.0225, "step": 2041 }, { "epoch": 0.26977573735839083, "grad_norm": 0.3990846872329712, "learning_rate": 0.00016640912615790483, "loss": 0.0205, "step": 2042 }, { "epoch": 0.26990785084387486, "grad_norm": 0.17020075023174286, "learning_rate": 0.00016637804731148322, "loss": 0.0117, "step": 2043 }, { "epoch": 0.2700399643293589, "grad_norm": 0.17086337506771088, "learning_rate": 0.00016634695699949505, "loss": 0.0297, "step": 2044 }, { "epoch": 0.2701720778148429, "grad_norm": 0.16119834780693054, "learning_rate": 0.00016631585522731054, "loss": 0.0198, "step": 2045 }, { "epoch": 0.27030419130032696, "grad_norm": 0.23394055664539337, "learning_rate": 0.00016628474200030196, "loss": 0.0298, "step": 2046 }, { "epoch": 0.270436304785811, "grad_norm": 0.21679936349391937, "learning_rate": 0.0001662536173238436, "loss": 0.0174, "step": 2047 }, { "epoch": 0.270568418271295, "grad_norm": 0.4186050295829773, "learning_rate": 0.00016622248120331157, "loss": 0.0397, "step": 2048 }, { "epoch": 0.27070053175677905, "grad_norm": 0.22252652049064636, "learning_rate": 0.0001661913336440841, "loss": 0.0165, "step": 2049 }, { "epoch": 0.2708326452422631, "grad_norm": 0.16150662302970886, "learning_rate": 0.00016616017465154133, "loss": 0.0286, "step": 2050 }, { "epoch": 0.2709647587277471, "grad_norm": 0.20743726193904877, "learning_rate": 0.0001661290042310654, "loss": 0.0216, "step": 2051 }, { "epoch": 0.27109687221323114, "grad_norm": 0.2305314987897873, "learning_rate": 0.0001660978223880404, "loss": 0.0281, "step": 2052 }, { "epoch": 0.2712289856987152, "grad_norm": 0.17613615095615387, "learning_rate": 0.0001660666291278525, "loss": 0.0144, "step": 2053 }, { "epoch": 0.2713610991841992, "grad_norm": 0.18410120904445648, "learning_rate": 0.00016603542445588963, "loss": 0.0183, "step": 2054 }, { "epoch": 0.27149321266968324, "grad_norm": 0.16890189051628113, "learning_rate": 0.0001660042083775419, "loss": 0.0215, "step": 2055 }, { "epoch": 0.27162532615516727, "grad_norm": 0.16772271692752838, "learning_rate": 0.00016597298089820125, "loss": 0.0168, "step": 2056 }, { "epoch": 0.2717574396406513, "grad_norm": 0.172120600938797, "learning_rate": 0.00016594174202326167, "loss": 0.0159, "step": 2057 }, { "epoch": 0.27188955312613533, "grad_norm": 0.20007994771003723, "learning_rate": 0.00016591049175811908, "loss": 0.0315, "step": 2058 }, { "epoch": 0.27202166661161936, "grad_norm": 0.15204408764839172, "learning_rate": 0.00016587923010817138, "loss": 0.0112, "step": 2059 }, { "epoch": 0.2721537800971034, "grad_norm": 0.26047348976135254, "learning_rate": 0.00016584795707881846, "loss": 0.0296, "step": 2060 }, { "epoch": 0.2722858935825874, "grad_norm": 0.16377569735050201, "learning_rate": 0.00016581667267546213, "loss": 0.0173, "step": 2061 }, { "epoch": 0.27241800706807145, "grad_norm": 0.2574384808540344, "learning_rate": 0.00016578537690350618, "loss": 0.0282, "step": 2062 }, { "epoch": 0.2725501205535555, "grad_norm": 0.2776723802089691, "learning_rate": 0.00016575406976835637, "loss": 0.0301, "step": 2063 }, { "epoch": 0.2726822340390395, "grad_norm": 0.23735906183719635, "learning_rate": 0.00016572275127542044, "loss": 0.0253, "step": 2064 }, { "epoch": 0.27281434752452355, "grad_norm": 0.332064688205719, "learning_rate": 0.00016569142143010805, "loss": 0.0394, "step": 2065 }, { "epoch": 0.2729464610100076, "grad_norm": 0.29127272963523865, "learning_rate": 0.00016566008023783087, "loss": 0.0308, "step": 2066 }, { "epoch": 0.2730785744954916, "grad_norm": 0.25889071822166443, "learning_rate": 0.00016562872770400252, "loss": 0.0247, "step": 2067 }, { "epoch": 0.27321068798097564, "grad_norm": 0.17592589557170868, "learning_rate": 0.0001655973638340385, "loss": 0.0202, "step": 2068 }, { "epoch": 0.2733428014664597, "grad_norm": 0.20446552336215973, "learning_rate": 0.00016556598863335634, "loss": 0.0243, "step": 2069 }, { "epoch": 0.2734749149519437, "grad_norm": 0.20881833136081696, "learning_rate": 0.00016553460210737563, "loss": 0.0297, "step": 2070 }, { "epoch": 0.27360702843742774, "grad_norm": 0.17322196066379547, "learning_rate": 0.00016550320426151767, "loss": 0.0163, "step": 2071 }, { "epoch": 0.27373914192291177, "grad_norm": 0.15687014162540436, "learning_rate": 0.00016547179510120592, "loss": 0.0175, "step": 2072 }, { "epoch": 0.2738712554083958, "grad_norm": 0.19147679209709167, "learning_rate": 0.0001654403746318657, "loss": 0.0176, "step": 2073 }, { "epoch": 0.27400336889387983, "grad_norm": 0.2648773193359375, "learning_rate": 0.00016540894285892432, "loss": 0.0317, "step": 2074 }, { "epoch": 0.27413548237936386, "grad_norm": 0.26171812415122986, "learning_rate": 0.00016537749978781102, "loss": 0.0244, "step": 2075 }, { "epoch": 0.2742675958648479, "grad_norm": 0.24321086704730988, "learning_rate": 0.00016534604542395705, "loss": 0.0123, "step": 2076 }, { "epoch": 0.2743997093503319, "grad_norm": 0.23158743977546692, "learning_rate": 0.00016531457977279548, "loss": 0.0345, "step": 2077 }, { "epoch": 0.27453182283581595, "grad_norm": 0.2649182975292206, "learning_rate": 0.00016528310283976148, "loss": 0.0213, "step": 2078 }, { "epoch": 0.2746639363213, "grad_norm": 0.21584641933441162, "learning_rate": 0.00016525161463029208, "loss": 0.0164, "step": 2079 }, { "epoch": 0.274796049806784, "grad_norm": 0.2687309682369232, "learning_rate": 0.00016522011514982633, "loss": 0.0124, "step": 2080 }, { "epoch": 0.27492816329226805, "grad_norm": 0.20673182606697083, "learning_rate": 0.00016518860440380503, "loss": 0.0246, "step": 2081 }, { "epoch": 0.2750602767777521, "grad_norm": 0.27289971709251404, "learning_rate": 0.00016515708239767124, "loss": 0.0206, "step": 2082 }, { "epoch": 0.2751923902632361, "grad_norm": 0.17888841032981873, "learning_rate": 0.00016512554913686967, "loss": 0.0197, "step": 2083 }, { "epoch": 0.27532450374872014, "grad_norm": 0.16519111394882202, "learning_rate": 0.0001650940046268472, "loss": 0.0166, "step": 2084 }, { "epoch": 0.27545661723420417, "grad_norm": 0.23809735476970673, "learning_rate": 0.00016506244887305252, "loss": 0.0393, "step": 2085 }, { "epoch": 0.2755887307196882, "grad_norm": 0.2258623093366623, "learning_rate": 0.00016503088188093626, "loss": 0.027, "step": 2086 }, { "epoch": 0.27572084420517223, "grad_norm": 0.21751005947589874, "learning_rate": 0.0001649993036559511, "loss": 0.0245, "step": 2087 }, { "epoch": 0.27585295769065626, "grad_norm": 0.3354283571243286, "learning_rate": 0.0001649677142035515, "loss": 0.0269, "step": 2088 }, { "epoch": 0.2759850711761403, "grad_norm": 0.3530503511428833, "learning_rate": 0.000164936113529194, "loss": 0.0255, "step": 2089 }, { "epoch": 0.2761171846616243, "grad_norm": 0.2316848635673523, "learning_rate": 0.000164904501638337, "loss": 0.0171, "step": 2090 }, { "epoch": 0.27624929814710836, "grad_norm": 0.20455540716648102, "learning_rate": 0.00016487287853644088, "loss": 0.027, "step": 2091 }, { "epoch": 0.2763814116325924, "grad_norm": 0.2040807604789734, "learning_rate": 0.00016484124422896796, "loss": 0.0201, "step": 2092 }, { "epoch": 0.2765135251180764, "grad_norm": 0.2649083435535431, "learning_rate": 0.00016480959872138245, "loss": 0.0214, "step": 2093 }, { "epoch": 0.27664563860356045, "grad_norm": 0.5068869590759277, "learning_rate": 0.00016477794201915052, "loss": 0.0265, "step": 2094 }, { "epoch": 0.2767777520890445, "grad_norm": 0.26571395993232727, "learning_rate": 0.00016474627412774027, "loss": 0.0426, "step": 2095 }, { "epoch": 0.2769098655745285, "grad_norm": 0.37573423981666565, "learning_rate": 0.00016471459505262176, "loss": 0.0294, "step": 2096 }, { "epoch": 0.27704197906001254, "grad_norm": 0.2801987826824188, "learning_rate": 0.0001646829047992669, "loss": 0.0207, "step": 2097 }, { "epoch": 0.2771740925454966, "grad_norm": 0.32490137219429016, "learning_rate": 0.00016465120337314968, "loss": 0.0378, "step": 2098 }, { "epoch": 0.2773062060309806, "grad_norm": 0.29033592343330383, "learning_rate": 0.00016461949077974585, "loss": 0.0248, "step": 2099 }, { "epoch": 0.27743831951646464, "grad_norm": 0.2312108725309372, "learning_rate": 0.0001645877670245332, "loss": 0.0358, "step": 2100 }, { "epoch": 0.27757043300194867, "grad_norm": 0.2771279215812683, "learning_rate": 0.0001645560321129914, "loss": 0.0313, "step": 2101 }, { "epoch": 0.2777025464874327, "grad_norm": 0.2926751375198364, "learning_rate": 0.0001645242860506021, "loss": 0.0162, "step": 2102 }, { "epoch": 0.27783465997291673, "grad_norm": 0.2527139186859131, "learning_rate": 0.0001644925288428488, "loss": 0.0244, "step": 2103 }, { "epoch": 0.27796677345840076, "grad_norm": 0.20468686521053314, "learning_rate": 0.000164460760495217, "loss": 0.0209, "step": 2104 }, { "epoch": 0.2780988869438848, "grad_norm": 0.26166459918022156, "learning_rate": 0.000164428981013194, "loss": 0.0389, "step": 2105 }, { "epoch": 0.2782310004293688, "grad_norm": 0.25610142946243286, "learning_rate": 0.00016439719040226925, "loss": 0.0198, "step": 2106 }, { "epoch": 0.27836311391485286, "grad_norm": 0.23919682204723358, "learning_rate": 0.00016436538866793386, "loss": 0.0256, "step": 2107 }, { "epoch": 0.2784952274003369, "grad_norm": 0.29429468512535095, "learning_rate": 0.00016433357581568107, "loss": 0.0216, "step": 2108 }, { "epoch": 0.2786273408858209, "grad_norm": 0.17881804704666138, "learning_rate": 0.0001643017518510059, "loss": 0.0229, "step": 2109 }, { "epoch": 0.27875945437130495, "grad_norm": 0.19791945815086365, "learning_rate": 0.00016426991677940538, "loss": 0.0165, "step": 2110 }, { "epoch": 0.278891567856789, "grad_norm": 0.18446460366249084, "learning_rate": 0.00016423807060637836, "loss": 0.0177, "step": 2111 }, { "epoch": 0.279023681342273, "grad_norm": 0.19182991981506348, "learning_rate": 0.0001642062133374258, "loss": 0.0262, "step": 2112 }, { "epoch": 0.27915579482775704, "grad_norm": 0.23441122472286224, "learning_rate": 0.0001641743449780503, "loss": 0.0237, "step": 2113 }, { "epoch": 0.2792879083132411, "grad_norm": 0.21251298487186432, "learning_rate": 0.00016414246553375663, "loss": 0.0247, "step": 2114 }, { "epoch": 0.2794200217987251, "grad_norm": 0.19433574378490448, "learning_rate": 0.0001641105750100513, "loss": 0.0278, "step": 2115 }, { "epoch": 0.27955213528420914, "grad_norm": 0.16363725066184998, "learning_rate": 0.00016407867341244282, "loss": 0.0306, "step": 2116 }, { "epoch": 0.27968424876969317, "grad_norm": 0.1874036192893982, "learning_rate": 0.00016404676074644167, "loss": 0.0299, "step": 2117 }, { "epoch": 0.2798163622551772, "grad_norm": 0.27652040123939514, "learning_rate": 0.00016401483701756003, "loss": 0.0403, "step": 2118 }, { "epoch": 0.27994847574066123, "grad_norm": 0.23305629193782806, "learning_rate": 0.00016398290223131222, "loss": 0.0247, "step": 2119 }, { "epoch": 0.28008058922614526, "grad_norm": 0.24826198816299438, "learning_rate": 0.00016395095639321438, "loss": 0.0175, "step": 2120 }, { "epoch": 0.2802127027116293, "grad_norm": 0.16378138959407806, "learning_rate": 0.0001639189995087845, "loss": 0.0184, "step": 2121 }, { "epoch": 0.2803448161971133, "grad_norm": 0.4624858498573303, "learning_rate": 0.0001638870315835426, "loss": 0.0349, "step": 2122 }, { "epoch": 0.28047692968259735, "grad_norm": 0.17329254746437073, "learning_rate": 0.0001638550526230105, "loss": 0.0355, "step": 2123 }, { "epoch": 0.2806090431680814, "grad_norm": 0.258650541305542, "learning_rate": 0.00016382306263271193, "loss": 0.0317, "step": 2124 }, { "epoch": 0.2807411566535654, "grad_norm": 0.2232353240251541, "learning_rate": 0.00016379106161817263, "loss": 0.0315, "step": 2125 }, { "epoch": 0.28087327013904945, "grad_norm": 0.17142353951931, "learning_rate": 0.00016375904958492016, "loss": 0.0183, "step": 2126 }, { "epoch": 0.2810053836245335, "grad_norm": 0.1361655443906784, "learning_rate": 0.00016372702653848402, "loss": 0.0169, "step": 2127 }, { "epoch": 0.2811374971100175, "grad_norm": 0.3477037250995636, "learning_rate": 0.00016369499248439554, "loss": 0.0349, "step": 2128 }, { "epoch": 0.28126961059550154, "grad_norm": 0.20569737255573273, "learning_rate": 0.000163662947428188, "loss": 0.0244, "step": 2129 }, { "epoch": 0.28140172408098557, "grad_norm": 0.17859184741973877, "learning_rate": 0.0001636308913753967, "loss": 0.0232, "step": 2130 }, { "epoch": 0.2815338375664696, "grad_norm": 0.22174854576587677, "learning_rate": 0.00016359882433155857, "loss": 0.0256, "step": 2131 }, { "epoch": 0.28166595105195363, "grad_norm": 0.2234748899936676, "learning_rate": 0.00016356674630221268, "loss": 0.0192, "step": 2132 }, { "epoch": 0.28179806453743766, "grad_norm": 0.3553476333618164, "learning_rate": 0.0001635346572928999, "loss": 0.0308, "step": 2133 }, { "epoch": 0.2819301780229217, "grad_norm": 0.3469673991203308, "learning_rate": 0.000163502557309163, "loss": 0.0322, "step": 2134 }, { "epoch": 0.2820622915084057, "grad_norm": 0.27370384335517883, "learning_rate": 0.00016347044635654662, "loss": 0.0271, "step": 2135 }, { "epoch": 0.28219440499388976, "grad_norm": 0.20264363288879395, "learning_rate": 0.00016343832444059737, "loss": 0.016, "step": 2136 }, { "epoch": 0.2823265184793738, "grad_norm": 0.1757168471813202, "learning_rate": 0.0001634061915668637, "loss": 0.0159, "step": 2137 }, { "epoch": 0.2824586319648578, "grad_norm": 0.24907803535461426, "learning_rate": 0.00016337404774089596, "loss": 0.0259, "step": 2138 }, { "epoch": 0.28259074545034185, "grad_norm": 0.20025473833084106, "learning_rate": 0.0001633418929682464, "loss": 0.0262, "step": 2139 }, { "epoch": 0.2827228589358259, "grad_norm": 0.33397263288497925, "learning_rate": 0.00016330972725446915, "loss": 0.0384, "step": 2140 }, { "epoch": 0.2828549724213099, "grad_norm": 0.16113078594207764, "learning_rate": 0.0001632775506051202, "loss": 0.0185, "step": 2141 }, { "epoch": 0.28298708590679394, "grad_norm": 0.19310183823108673, "learning_rate": 0.00016324536302575755, "loss": 0.0281, "step": 2142 }, { "epoch": 0.283119199392278, "grad_norm": 0.2211921066045761, "learning_rate": 0.00016321316452194094, "loss": 0.0346, "step": 2143 }, { "epoch": 0.283251312877762, "grad_norm": 0.27257010340690613, "learning_rate": 0.0001631809550992321, "loss": 0.0225, "step": 2144 }, { "epoch": 0.28338342636324604, "grad_norm": 0.1994791179895401, "learning_rate": 0.0001631487347631945, "loss": 0.0302, "step": 2145 }, { "epoch": 0.28351553984873007, "grad_norm": 0.22019606828689575, "learning_rate": 0.00016311650351939373, "loss": 0.0255, "step": 2146 }, { "epoch": 0.2836476533342141, "grad_norm": 0.22622746229171753, "learning_rate": 0.00016308426137339707, "loss": 0.0221, "step": 2147 }, { "epoch": 0.28377976681969813, "grad_norm": 0.1966571807861328, "learning_rate": 0.0001630520083307738, "loss": 0.0286, "step": 2148 }, { "epoch": 0.28391188030518216, "grad_norm": 0.18262293934822083, "learning_rate": 0.00016301974439709494, "loss": 0.0197, "step": 2149 }, { "epoch": 0.2840439937906662, "grad_norm": 0.21323782205581665, "learning_rate": 0.00016298746957793355, "loss": 0.0349, "step": 2150 }, { "epoch": 0.2841761072761502, "grad_norm": 0.13299968838691711, "learning_rate": 0.0001629551838788645, "loss": 0.012, "step": 2151 }, { "epoch": 0.28430822076163426, "grad_norm": 0.19067250192165375, "learning_rate": 0.0001629228873054645, "loss": 0.0235, "step": 2152 }, { "epoch": 0.2844403342471183, "grad_norm": 0.3284483551979065, "learning_rate": 0.00016289057986331221, "loss": 0.0185, "step": 2153 }, { "epoch": 0.2845724477326023, "grad_norm": 0.2838576138019562, "learning_rate": 0.00016285826155798815, "loss": 0.0288, "step": 2154 }, { "epoch": 0.28470456121808635, "grad_norm": 0.20192408561706543, "learning_rate": 0.00016282593239507466, "loss": 0.0205, "step": 2155 }, { "epoch": 0.2848366747035704, "grad_norm": 0.23177526891231537, "learning_rate": 0.00016279359238015605, "loss": 0.0195, "step": 2156 }, { "epoch": 0.2849687881890544, "grad_norm": 0.20849239826202393, "learning_rate": 0.0001627612415188184, "loss": 0.0469, "step": 2157 }, { "epoch": 0.28510090167453844, "grad_norm": 0.30044230818748474, "learning_rate": 0.00016272887981664974, "loss": 0.0241, "step": 2158 }, { "epoch": 0.2852330151600225, "grad_norm": 0.22462965548038483, "learning_rate": 0.00016269650727923998, "loss": 0.0173, "step": 2159 }, { "epoch": 0.2853651286455065, "grad_norm": 0.2355768233537674, "learning_rate": 0.0001626641239121808, "loss": 0.0286, "step": 2160 }, { "epoch": 0.28549724213099054, "grad_norm": 0.24778874218463898, "learning_rate": 0.0001626317297210659, "loss": 0.0329, "step": 2161 }, { "epoch": 0.28562935561647457, "grad_norm": 0.3856741189956665, "learning_rate": 0.00016259932471149072, "loss": 0.0369, "step": 2162 }, { "epoch": 0.2857614691019586, "grad_norm": 0.21599425375461578, "learning_rate": 0.00016256690888905264, "loss": 0.0271, "step": 2163 }, { "epoch": 0.28589358258744263, "grad_norm": 0.2502656579017639, "learning_rate": 0.00016253448225935087, "loss": 0.0243, "step": 2164 }, { "epoch": 0.28602569607292666, "grad_norm": 0.30705583095550537, "learning_rate": 0.0001625020448279865, "loss": 0.0418, "step": 2165 }, { "epoch": 0.2861578095584107, "grad_norm": 0.43476346135139465, "learning_rate": 0.0001624695966005625, "loss": 0.0356, "step": 2166 }, { "epoch": 0.2862899230438947, "grad_norm": 0.19399316608905792, "learning_rate": 0.00016243713758268372, "loss": 0.0331, "step": 2167 }, { "epoch": 0.28642203652937875, "grad_norm": 0.19930878281593323, "learning_rate": 0.00016240466777995685, "loss": 0.0241, "step": 2168 }, { "epoch": 0.2865541500148628, "grad_norm": 0.18226459622383118, "learning_rate": 0.00016237218719799035, "loss": 0.0179, "step": 2169 }, { "epoch": 0.2866862635003468, "grad_norm": 0.35922908782958984, "learning_rate": 0.00016233969584239478, "loss": 0.0303, "step": 2170 }, { "epoch": 0.28681837698583085, "grad_norm": 0.2960500717163086, "learning_rate": 0.0001623071937187823, "loss": 0.0334, "step": 2171 }, { "epoch": 0.2869504904713149, "grad_norm": 0.21738512814044952, "learning_rate": 0.00016227468083276707, "loss": 0.014, "step": 2172 }, { "epoch": 0.2870826039567989, "grad_norm": 0.18073824048042297, "learning_rate": 0.0001622421571899651, "loss": 0.0172, "step": 2173 }, { "epoch": 0.28721471744228294, "grad_norm": 0.1971302479505539, "learning_rate": 0.00016220962279599424, "loss": 0.0254, "step": 2174 }, { "epoch": 0.28734683092776697, "grad_norm": 0.2494899183511734, "learning_rate": 0.0001621770776564742, "loss": 0.0341, "step": 2175 }, { "epoch": 0.287478944413251, "grad_norm": 0.45942479372024536, "learning_rate": 0.0001621445217770265, "loss": 0.0218, "step": 2176 }, { "epoch": 0.28761105789873503, "grad_norm": 0.13894154131412506, "learning_rate": 0.0001621119551632746, "loss": 0.0157, "step": 2177 }, { "epoch": 0.28774317138421907, "grad_norm": 0.18475857377052307, "learning_rate": 0.00016207937782084382, "loss": 0.0234, "step": 2178 }, { "epoch": 0.2878752848697031, "grad_norm": 0.2803318500518799, "learning_rate": 0.0001620467897553612, "loss": 0.0352, "step": 2179 }, { "epoch": 0.2880073983551871, "grad_norm": 0.16757351160049438, "learning_rate": 0.00016201419097245577, "loss": 0.0228, "step": 2180 }, { "epoch": 0.28813951184067116, "grad_norm": 0.1853777915239334, "learning_rate": 0.00016198158147775834, "loss": 0.0169, "step": 2181 }, { "epoch": 0.2882716253261552, "grad_norm": 0.20891882479190826, "learning_rate": 0.0001619489612769016, "loss": 0.0237, "step": 2182 }, { "epoch": 0.2884037388116392, "grad_norm": 0.3805196285247803, "learning_rate": 0.00016191633037552006, "loss": 0.0295, "step": 2183 }, { "epoch": 0.28853585229712325, "grad_norm": 0.2402908354997635, "learning_rate": 0.00016188368877925012, "loss": 0.0203, "step": 2184 }, { "epoch": 0.2886679657826073, "grad_norm": 0.22193406522274017, "learning_rate": 0.00016185103649373, "loss": 0.0171, "step": 2185 }, { "epoch": 0.2888000792680913, "grad_norm": 0.23514831066131592, "learning_rate": 0.00016181837352459977, "loss": 0.0396, "step": 2186 }, { "epoch": 0.28893219275357535, "grad_norm": 0.2036595493555069, "learning_rate": 0.00016178569987750137, "loss": 0.0344, "step": 2187 }, { "epoch": 0.2890643062390594, "grad_norm": 0.21834520995616913, "learning_rate": 0.0001617530155580785, "loss": 0.0271, "step": 2188 }, { "epoch": 0.2891964197245434, "grad_norm": 0.23453760147094727, "learning_rate": 0.00016172032057197683, "loss": 0.0217, "step": 2189 }, { "epoch": 0.28932853321002744, "grad_norm": 0.16586275398731232, "learning_rate": 0.00016168761492484378, "loss": 0.0217, "step": 2190 }, { "epoch": 0.28946064669551147, "grad_norm": 0.46942102909088135, "learning_rate": 0.00016165489862232866, "loss": 0.0177, "step": 2191 }, { "epoch": 0.2895927601809955, "grad_norm": 0.18020497262477875, "learning_rate": 0.00016162217167008255, "loss": 0.0154, "step": 2192 }, { "epoch": 0.28972487366647953, "grad_norm": 0.20823246240615845, "learning_rate": 0.00016158943407375845, "loss": 0.0289, "step": 2193 }, { "epoch": 0.28985698715196356, "grad_norm": 0.32893314957618713, "learning_rate": 0.0001615566858390112, "loss": 0.0334, "step": 2194 }, { "epoch": 0.2899891006374476, "grad_norm": 0.3370470404624939, "learning_rate": 0.0001615239269714974, "loss": 0.0206, "step": 2195 }, { "epoch": 0.2901212141229316, "grad_norm": 0.19687312841415405, "learning_rate": 0.00016149115747687552, "loss": 0.0249, "step": 2196 }, { "epoch": 0.29025332760841566, "grad_norm": 0.19295763969421387, "learning_rate": 0.00016145837736080592, "loss": 0.0253, "step": 2197 }, { "epoch": 0.29038544109389963, "grad_norm": 0.19667404890060425, "learning_rate": 0.00016142558662895072, "loss": 0.0172, "step": 2198 }, { "epoch": 0.29051755457938366, "grad_norm": 0.17432467639446259, "learning_rate": 0.00016139278528697396, "loss": 0.0268, "step": 2199 }, { "epoch": 0.2906496680648677, "grad_norm": 0.18398909270763397, "learning_rate": 0.0001613599733405414, "loss": 0.028, "step": 2200 }, { "epoch": 0.2907817815503517, "grad_norm": 0.19623810052871704, "learning_rate": 0.00016132715079532074, "loss": 0.0286, "step": 2201 }, { "epoch": 0.29091389503583576, "grad_norm": 0.18011881411075592, "learning_rate": 0.00016129431765698137, "loss": 0.0263, "step": 2202 }, { "epoch": 0.2910460085213198, "grad_norm": 0.4542388916015625, "learning_rate": 0.00016126147393119472, "loss": 0.023, "step": 2203 }, { "epoch": 0.2911781220068038, "grad_norm": 0.19564567506313324, "learning_rate": 0.0001612286196236338, "loss": 0.0181, "step": 2204 }, { "epoch": 0.29131023549228785, "grad_norm": 0.1872696876525879, "learning_rate": 0.00016119575473997372, "loss": 0.0147, "step": 2205 }, { "epoch": 0.2914423489777719, "grad_norm": 0.2464790791273117, "learning_rate": 0.00016116287928589115, "loss": 0.0228, "step": 2206 }, { "epoch": 0.2915744624632559, "grad_norm": 0.19536122679710388, "learning_rate": 0.00016112999326706482, "loss": 0.0372, "step": 2207 }, { "epoch": 0.29170657594873994, "grad_norm": 0.2512054443359375, "learning_rate": 0.00016109709668917508, "loss": 0.0218, "step": 2208 }, { "epoch": 0.291838689434224, "grad_norm": 0.22209592163562775, "learning_rate": 0.00016106418955790422, "loss": 0.0208, "step": 2209 }, { "epoch": 0.291970802919708, "grad_norm": 0.25197240710258484, "learning_rate": 0.00016103127187893637, "loss": 0.0146, "step": 2210 }, { "epoch": 0.29210291640519204, "grad_norm": 0.34588807821273804, "learning_rate": 0.0001609983436579574, "loss": 0.0285, "step": 2211 }, { "epoch": 0.29223502989067607, "grad_norm": 0.2277306318283081, "learning_rate": 0.00016096540490065508, "loss": 0.0155, "step": 2212 }, { "epoch": 0.2923671433761601, "grad_norm": 0.14773651957511902, "learning_rate": 0.00016093245561271896, "loss": 0.0167, "step": 2213 }, { "epoch": 0.29249925686164413, "grad_norm": 0.13434603810310364, "learning_rate": 0.0001608994957998404, "loss": 0.0188, "step": 2214 }, { "epoch": 0.29263137034712816, "grad_norm": 0.1497490257024765, "learning_rate": 0.0001608665254677126, "loss": 0.0203, "step": 2215 }, { "epoch": 0.2927634838326122, "grad_norm": 0.25678667426109314, "learning_rate": 0.00016083354462203056, "loss": 0.0314, "step": 2216 }, { "epoch": 0.2928955973180962, "grad_norm": 0.2165631651878357, "learning_rate": 0.00016080055326849109, "loss": 0.0346, "step": 2217 }, { "epoch": 0.29302771080358025, "grad_norm": 0.16122478246688843, "learning_rate": 0.00016076755141279287, "loss": 0.0234, "step": 2218 }, { "epoch": 0.2931598242890643, "grad_norm": 0.1963433027267456, "learning_rate": 0.0001607345390606363, "loss": 0.0243, "step": 2219 }, { "epoch": 0.2932919377745483, "grad_norm": 0.21741873025894165, "learning_rate": 0.00016070151621772372, "loss": 0.0264, "step": 2220 }, { "epoch": 0.29342405126003235, "grad_norm": 0.1384367048740387, "learning_rate": 0.00016066848288975912, "loss": 0.0131, "step": 2221 }, { "epoch": 0.2935561647455164, "grad_norm": 0.23075279593467712, "learning_rate": 0.00016063543908244847, "loss": 0.0259, "step": 2222 }, { "epoch": 0.2936882782310004, "grad_norm": 0.33965376019477844, "learning_rate": 0.0001606023848014994, "loss": 0.0331, "step": 2223 }, { "epoch": 0.29382039171648444, "grad_norm": 0.15662816166877747, "learning_rate": 0.00016056932005262148, "loss": 0.0189, "step": 2224 }, { "epoch": 0.2939525052019685, "grad_norm": 0.21762795746326447, "learning_rate": 0.000160536244841526, "loss": 0.0253, "step": 2225 }, { "epoch": 0.2940846186874525, "grad_norm": 0.20064249634742737, "learning_rate": 0.00016050315917392612, "loss": 0.0229, "step": 2226 }, { "epoch": 0.29421673217293653, "grad_norm": 0.16303327679634094, "learning_rate": 0.0001604700630555367, "loss": 0.0217, "step": 2227 }, { "epoch": 0.29434884565842057, "grad_norm": 0.18921849131584167, "learning_rate": 0.0001604369564920745, "loss": 0.0244, "step": 2228 }, { "epoch": 0.2944809591439046, "grad_norm": 0.24990400671958923, "learning_rate": 0.0001604038394892581, "loss": 0.0232, "step": 2229 }, { "epoch": 0.29461307262938863, "grad_norm": 0.23241092264652252, "learning_rate": 0.00016037071205280781, "loss": 0.0254, "step": 2230 }, { "epoch": 0.29474518611487266, "grad_norm": 0.17506669461727142, "learning_rate": 0.00016033757418844577, "loss": 0.0195, "step": 2231 }, { "epoch": 0.2948772996003567, "grad_norm": 0.1743784099817276, "learning_rate": 0.00016030442590189595, "loss": 0.0151, "step": 2232 }, { "epoch": 0.2950094130858407, "grad_norm": 0.45965102314949036, "learning_rate": 0.00016027126719888408, "loss": 0.0377, "step": 2233 }, { "epoch": 0.29514152657132475, "grad_norm": 0.17976310849189758, "learning_rate": 0.0001602380980851377, "loss": 0.0187, "step": 2234 }, { "epoch": 0.2952736400568088, "grad_norm": 0.16373927891254425, "learning_rate": 0.00016020491856638618, "loss": 0.0194, "step": 2235 }, { "epoch": 0.2954057535422928, "grad_norm": 0.19986402988433838, "learning_rate": 0.00016017172864836064, "loss": 0.0164, "step": 2236 }, { "epoch": 0.29553786702777685, "grad_norm": 0.2965887486934662, "learning_rate": 0.00016013852833679398, "loss": 0.0441, "step": 2237 }, { "epoch": 0.2956699805132609, "grad_norm": 0.22505910694599152, "learning_rate": 0.00016010531763742104, "loss": 0.025, "step": 2238 }, { "epoch": 0.2958020939987449, "grad_norm": 0.3020710051059723, "learning_rate": 0.00016007209655597828, "loss": 0.0244, "step": 2239 }, { "epoch": 0.29593420748422894, "grad_norm": 0.6665115356445312, "learning_rate": 0.00016003886509820397, "loss": 0.0346, "step": 2240 }, { "epoch": 0.29606632096971297, "grad_norm": 0.17169487476348877, "learning_rate": 0.0001600056232698383, "loss": 0.0193, "step": 2241 }, { "epoch": 0.296198434455197, "grad_norm": 0.18180319666862488, "learning_rate": 0.00015997237107662318, "loss": 0.0169, "step": 2242 }, { "epoch": 0.29633054794068103, "grad_norm": 0.23043014109134674, "learning_rate": 0.00015993910852430228, "loss": 0.0195, "step": 2243 }, { "epoch": 0.29646266142616506, "grad_norm": 0.2865215241909027, "learning_rate": 0.00015990583561862102, "loss": 0.0388, "step": 2244 }, { "epoch": 0.2965947749116491, "grad_norm": 0.20309293270111084, "learning_rate": 0.0001598725523653268, "loss": 0.0355, "step": 2245 }, { "epoch": 0.2967268883971331, "grad_norm": 0.31159254908561707, "learning_rate": 0.0001598392587701686, "loss": 0.0252, "step": 2246 }, { "epoch": 0.29685900188261716, "grad_norm": 0.15543049573898315, "learning_rate": 0.00015980595483889725, "loss": 0.0052, "step": 2247 }, { "epoch": 0.2969911153681012, "grad_norm": 0.29889604449272156, "learning_rate": 0.00015977264057726539, "loss": 0.0229, "step": 2248 }, { "epoch": 0.2971232288535852, "grad_norm": 0.14694826304912567, "learning_rate": 0.0001597393159910275, "loss": 0.0188, "step": 2249 }, { "epoch": 0.29725534233906925, "grad_norm": 0.2545098066329956, "learning_rate": 0.00015970598108593973, "loss": 0.032, "step": 2250 }, { "epoch": 0.2973874558245533, "grad_norm": 0.3052002191543579, "learning_rate": 0.00015967263586776006, "loss": 0.0253, "step": 2251 }, { "epoch": 0.2975195693100373, "grad_norm": 0.13028715550899506, "learning_rate": 0.00015963928034224824, "loss": 0.0144, "step": 2252 }, { "epoch": 0.29765168279552134, "grad_norm": 0.16676437854766846, "learning_rate": 0.00015960591451516585, "loss": 0.023, "step": 2253 }, { "epoch": 0.2977837962810054, "grad_norm": 0.2732032835483551, "learning_rate": 0.0001595725383922762, "loss": 0.0344, "step": 2254 }, { "epoch": 0.2979159097664894, "grad_norm": 0.24554814398288727, "learning_rate": 0.00015953915197934436, "loss": 0.0309, "step": 2255 }, { "epoch": 0.29804802325197344, "grad_norm": 0.1905503273010254, "learning_rate": 0.0001595057552821373, "loss": 0.0243, "step": 2256 }, { "epoch": 0.29818013673745747, "grad_norm": 0.1754557490348816, "learning_rate": 0.00015947234830642355, "loss": 0.0185, "step": 2257 }, { "epoch": 0.2983122502229415, "grad_norm": 0.1937703639268875, "learning_rate": 0.00015943893105797364, "loss": 0.0133, "step": 2258 }, { "epoch": 0.29844436370842553, "grad_norm": 0.2528638541698456, "learning_rate": 0.0001594055035425597, "loss": 0.0425, "step": 2259 }, { "epoch": 0.29857647719390956, "grad_norm": 0.16026671230793, "learning_rate": 0.00015937206576595574, "loss": 0.0116, "step": 2260 }, { "epoch": 0.2987085906793936, "grad_norm": 0.23767255246639252, "learning_rate": 0.00015933861773393754, "loss": 0.0244, "step": 2261 }, { "epoch": 0.2988407041648776, "grad_norm": 0.40182456374168396, "learning_rate": 0.0001593051594522826, "loss": 0.0279, "step": 2262 }, { "epoch": 0.29897281765036166, "grad_norm": 0.16921398043632507, "learning_rate": 0.0001592716909267702, "loss": 0.0243, "step": 2263 }, { "epoch": 0.2991049311358457, "grad_norm": 0.21883520483970642, "learning_rate": 0.00015923821216318141, "loss": 0.0262, "step": 2264 }, { "epoch": 0.2992370446213297, "grad_norm": 0.15632914006710052, "learning_rate": 0.00015920472316729908, "loss": 0.0164, "step": 2265 }, { "epoch": 0.29936915810681375, "grad_norm": 0.19226789474487305, "learning_rate": 0.00015917122394490775, "loss": 0.0159, "step": 2266 }, { "epoch": 0.2995012715922978, "grad_norm": 0.2588377296924591, "learning_rate": 0.00015913771450179384, "loss": 0.0199, "step": 2267 }, { "epoch": 0.2996333850777818, "grad_norm": 0.2027762234210968, "learning_rate": 0.00015910419484374547, "loss": 0.0195, "step": 2268 }, { "epoch": 0.29976549856326584, "grad_norm": 0.1688690334558487, "learning_rate": 0.00015907066497655253, "loss": 0.0119, "step": 2269 }, { "epoch": 0.2998976120487499, "grad_norm": 0.18758751451969147, "learning_rate": 0.00015903712490600668, "loss": 0.0247, "step": 2270 }, { "epoch": 0.3000297255342339, "grad_norm": 0.23641707003116608, "learning_rate": 0.00015900357463790132, "loss": 0.0288, "step": 2271 }, { "epoch": 0.30016183901971794, "grad_norm": 0.22951290011405945, "learning_rate": 0.00015897001417803166, "loss": 0.026, "step": 2272 }, { "epoch": 0.30029395250520197, "grad_norm": 0.20448489487171173, "learning_rate": 0.00015893644353219463, "loss": 0.0196, "step": 2273 }, { "epoch": 0.300426065990686, "grad_norm": 0.1670645922422409, "learning_rate": 0.00015890286270618892, "loss": 0.015, "step": 2274 }, { "epoch": 0.30055817947617003, "grad_norm": 0.1759205311536789, "learning_rate": 0.000158869271705815, "loss": 0.023, "step": 2275 }, { "epoch": 0.30069029296165406, "grad_norm": 0.23238608241081238, "learning_rate": 0.00015883567053687512, "loss": 0.0234, "step": 2276 }, { "epoch": 0.3008224064471381, "grad_norm": 0.14851292967796326, "learning_rate": 0.0001588020592051732, "loss": 0.0178, "step": 2277 }, { "epoch": 0.3009545199326221, "grad_norm": 0.17358282208442688, "learning_rate": 0.00015876843771651497, "loss": 0.0224, "step": 2278 }, { "epoch": 0.30108663341810615, "grad_norm": 0.25880980491638184, "learning_rate": 0.00015873480607670793, "loss": 0.0276, "step": 2279 }, { "epoch": 0.3012187469035902, "grad_norm": 0.1913393884897232, "learning_rate": 0.00015870116429156136, "loss": 0.0193, "step": 2280 }, { "epoch": 0.3013508603890742, "grad_norm": 0.16971909999847412, "learning_rate": 0.00015866751236688617, "loss": 0.0249, "step": 2281 }, { "epoch": 0.30148297387455825, "grad_norm": 0.17388664186000824, "learning_rate": 0.00015863385030849515, "loss": 0.0133, "step": 2282 }, { "epoch": 0.3016150873600423, "grad_norm": 0.26276373863220215, "learning_rate": 0.0001586001781222028, "loss": 0.0361, "step": 2283 }, { "epoch": 0.3017472008455263, "grad_norm": 0.21703331172466278, "learning_rate": 0.00015856649581382534, "loss": 0.0186, "step": 2284 }, { "epoch": 0.30187931433101034, "grad_norm": 0.24544142186641693, "learning_rate": 0.00015853280338918078, "loss": 0.0146, "step": 2285 }, { "epoch": 0.30201142781649437, "grad_norm": 0.1939631551504135, "learning_rate": 0.00015849910085408882, "loss": 0.0295, "step": 2286 }, { "epoch": 0.3021435413019784, "grad_norm": 0.21825960278511047, "learning_rate": 0.00015846538821437094, "loss": 0.0218, "step": 2287 }, { "epoch": 0.30227565478746243, "grad_norm": 0.20250838994979858, "learning_rate": 0.00015843166547585043, "loss": 0.0236, "step": 2288 }, { "epoch": 0.30240776827294646, "grad_norm": 0.20301519334316254, "learning_rate": 0.0001583979326443522, "loss": 0.0256, "step": 2289 }, { "epoch": 0.3025398817584305, "grad_norm": 0.12368103861808777, "learning_rate": 0.00015836418972570298, "loss": 0.0106, "step": 2290 }, { "epoch": 0.3026719952439145, "grad_norm": 0.25658977031707764, "learning_rate": 0.00015833043672573122, "loss": 0.0391, "step": 2291 }, { "epoch": 0.30280410872939856, "grad_norm": 0.22816288471221924, "learning_rate": 0.00015829667365026718, "loss": 0.0217, "step": 2292 }, { "epoch": 0.3029362222148826, "grad_norm": 0.34003958106040955, "learning_rate": 0.00015826290050514273, "loss": 0.0237, "step": 2293 }, { "epoch": 0.3030683357003666, "grad_norm": 0.2503148317337036, "learning_rate": 0.00015822911729619158, "loss": 0.0133, "step": 2294 }, { "epoch": 0.30320044918585065, "grad_norm": 0.24932456016540527, "learning_rate": 0.00015819532402924912, "loss": 0.0277, "step": 2295 }, { "epoch": 0.3033325626713347, "grad_norm": 0.27211788296699524, "learning_rate": 0.00015816152071015255, "loss": 0.0179, "step": 2296 }, { "epoch": 0.3034646761568187, "grad_norm": 0.15673963725566864, "learning_rate": 0.00015812770734474074, "loss": 0.0196, "step": 2297 }, { "epoch": 0.30359678964230274, "grad_norm": 0.3486553728580475, "learning_rate": 0.00015809388393885434, "loss": 0.0349, "step": 2298 }, { "epoch": 0.3037289031277868, "grad_norm": 0.34395694732666016, "learning_rate": 0.00015806005049833564, "loss": 0.023, "step": 2299 }, { "epoch": 0.3038610166132708, "grad_norm": 0.23733146488666534, "learning_rate": 0.00015802620702902882, "loss": 0.0192, "step": 2300 }, { "epoch": 0.30399313009875484, "grad_norm": 0.18981397151947021, "learning_rate": 0.00015799235353677963, "loss": 0.0216, "step": 2301 }, { "epoch": 0.30412524358423887, "grad_norm": 0.34877878427505493, "learning_rate": 0.0001579584900274357, "loss": 0.0279, "step": 2302 }, { "epoch": 0.3042573570697229, "grad_norm": 0.5037104487419128, "learning_rate": 0.00015792461650684624, "loss": 0.0301, "step": 2303 }, { "epoch": 0.30438947055520693, "grad_norm": 0.16604360938072205, "learning_rate": 0.00015789073298086236, "loss": 0.0193, "step": 2304 }, { "epoch": 0.30452158404069096, "grad_norm": 0.24134650826454163, "learning_rate": 0.00015785683945533673, "loss": 0.0307, "step": 2305 }, { "epoch": 0.304653697526175, "grad_norm": 0.2895773947238922, "learning_rate": 0.00015782293593612386, "loss": 0.0243, "step": 2306 }, { "epoch": 0.304785811011659, "grad_norm": 0.17948000133037567, "learning_rate": 0.00015778902242907995, "loss": 0.0243, "step": 2307 }, { "epoch": 0.30491792449714306, "grad_norm": 0.2832844853401184, "learning_rate": 0.00015775509894006286, "loss": 0.0295, "step": 2308 }, { "epoch": 0.3050500379826271, "grad_norm": 0.29266154766082764, "learning_rate": 0.00015772116547493233, "loss": 0.018, "step": 2309 }, { "epoch": 0.3051821514681111, "grad_norm": 0.23423396050930023, "learning_rate": 0.0001576872220395497, "loss": 0.0154, "step": 2310 }, { "epoch": 0.30531426495359515, "grad_norm": 0.2711523175239563, "learning_rate": 0.00015765326863977804, "loss": 0.0352, "step": 2311 }, { "epoch": 0.3054463784390792, "grad_norm": 0.13761219382286072, "learning_rate": 0.00015761930528148218, "loss": 0.0139, "step": 2312 }, { "epoch": 0.3055784919245632, "grad_norm": 0.21768812835216522, "learning_rate": 0.00015758533197052867, "loss": 0.0251, "step": 2313 }, { "epoch": 0.30571060541004724, "grad_norm": 0.22113913297653198, "learning_rate": 0.00015755134871278575, "loss": 0.0271, "step": 2314 }, { "epoch": 0.3058427188955313, "grad_norm": 0.31142762303352356, "learning_rate": 0.00015751735551412338, "loss": 0.0319, "step": 2315 }, { "epoch": 0.3059748323810153, "grad_norm": 0.22142787277698517, "learning_rate": 0.00015748335238041324, "loss": 0.0299, "step": 2316 }, { "epoch": 0.30610694586649934, "grad_norm": 0.20579847693443298, "learning_rate": 0.00015744933931752882, "loss": 0.0238, "step": 2317 }, { "epoch": 0.30623905935198337, "grad_norm": 0.2536778151988983, "learning_rate": 0.00015741531633134512, "loss": 0.038, "step": 2318 }, { "epoch": 0.3063711728374674, "grad_norm": 0.18283496797084808, "learning_rate": 0.00015738128342773907, "loss": 0.0243, "step": 2319 }, { "epoch": 0.30650328632295143, "grad_norm": 0.23785123229026794, "learning_rate": 0.0001573472406125892, "loss": 0.0284, "step": 2320 }, { "epoch": 0.30663539980843546, "grad_norm": 0.19473259150981903, "learning_rate": 0.0001573131878917757, "loss": 0.0213, "step": 2321 }, { "epoch": 0.3067675132939195, "grad_norm": 0.33988869190216064, "learning_rate": 0.00015727912527118063, "loss": 0.0422, "step": 2322 }, { "epoch": 0.3068996267794035, "grad_norm": 0.22191791236400604, "learning_rate": 0.00015724505275668758, "loss": 0.0344, "step": 2323 }, { "epoch": 0.30703174026488755, "grad_norm": 0.25878584384918213, "learning_rate": 0.00015721097035418206, "loss": 0.0322, "step": 2324 }, { "epoch": 0.3071638537503716, "grad_norm": 0.18412941694259644, "learning_rate": 0.00015717687806955107, "loss": 0.021, "step": 2325 }, { "epoch": 0.3072959672358556, "grad_norm": 0.19447560608386993, "learning_rate": 0.00015714277590868345, "loss": 0.0161, "step": 2326 }, { "epoch": 0.30742808072133965, "grad_norm": 0.19321192800998688, "learning_rate": 0.0001571086638774697, "loss": 0.0298, "step": 2327 }, { "epoch": 0.3075601942068237, "grad_norm": 0.3516335189342499, "learning_rate": 0.00015707454198180204, "loss": 0.0274, "step": 2328 }, { "epoch": 0.3076923076923077, "grad_norm": 0.3699778914451599, "learning_rate": 0.00015704041022757438, "loss": 0.0549, "step": 2329 }, { "epoch": 0.30782442117779174, "grad_norm": 0.2168762981891632, "learning_rate": 0.00015700626862068237, "loss": 0.0216, "step": 2330 }, { "epoch": 0.30795653466327577, "grad_norm": 0.20153917372226715, "learning_rate": 0.0001569721171670233, "loss": 0.0151, "step": 2331 }, { "epoch": 0.3080886481487598, "grad_norm": 0.15755203366279602, "learning_rate": 0.00015693795587249623, "loss": 0.0204, "step": 2332 }, { "epoch": 0.30822076163424383, "grad_norm": 0.1630401760339737, "learning_rate": 0.0001569037847430019, "loss": 0.0113, "step": 2333 }, { "epoch": 0.30835287511972787, "grad_norm": 0.252340167760849, "learning_rate": 0.00015686960378444266, "loss": 0.0231, "step": 2334 }, { "epoch": 0.3084849886052119, "grad_norm": 0.20554135739803314, "learning_rate": 0.00015683541300272268, "loss": 0.0239, "step": 2335 }, { "epoch": 0.3086171020906959, "grad_norm": 0.36748287081718445, "learning_rate": 0.00015680121240374775, "loss": 0.0345, "step": 2336 }, { "epoch": 0.30874921557617996, "grad_norm": 0.32464292645454407, "learning_rate": 0.00015676700199342543, "loss": 0.0169, "step": 2337 }, { "epoch": 0.308881329061664, "grad_norm": 0.18028365075588226, "learning_rate": 0.00015673278177766488, "loss": 0.017, "step": 2338 }, { "epoch": 0.309013442547148, "grad_norm": 0.2513674199581146, "learning_rate": 0.00015669855176237704, "loss": 0.0261, "step": 2339 }, { "epoch": 0.30914555603263205, "grad_norm": 0.21183346211910248, "learning_rate": 0.0001566643119534745, "loss": 0.0246, "step": 2340 }, { "epoch": 0.3092776695181161, "grad_norm": 0.1958574652671814, "learning_rate": 0.00015663006235687153, "loss": 0.0178, "step": 2341 }, { "epoch": 0.3094097830036001, "grad_norm": 0.16935515403747559, "learning_rate": 0.00015659580297848412, "loss": 0.025, "step": 2342 }, { "epoch": 0.30954189648908415, "grad_norm": 0.19944345951080322, "learning_rate": 0.00015656153382422993, "loss": 0.0236, "step": 2343 }, { "epoch": 0.3096740099745682, "grad_norm": 0.22058166563510895, "learning_rate": 0.00015652725490002833, "loss": 0.0389, "step": 2344 }, { "epoch": 0.3098061234600522, "grad_norm": 0.13750414550304413, "learning_rate": 0.00015649296621180034, "loss": 0.0164, "step": 2345 }, { "epoch": 0.30993823694553624, "grad_norm": 0.14292161166667938, "learning_rate": 0.00015645866776546868, "loss": 0.0189, "step": 2346 }, { "epoch": 0.31007035043102027, "grad_norm": 0.246055006980896, "learning_rate": 0.0001564243595669578, "loss": 0.029, "step": 2347 }, { "epoch": 0.3102024639165043, "grad_norm": 0.33867743611335754, "learning_rate": 0.00015639004162219382, "loss": 0.0326, "step": 2348 }, { "epoch": 0.31033457740198833, "grad_norm": 0.17780427634716034, "learning_rate": 0.00015635571393710445, "loss": 0.0209, "step": 2349 }, { "epoch": 0.31046669088747236, "grad_norm": 0.20596635341644287, "learning_rate": 0.00015632137651761923, "loss": 0.0118, "step": 2350 }, { "epoch": 0.3105988043729564, "grad_norm": 0.15004736185073853, "learning_rate": 0.00015628702936966926, "loss": 0.0091, "step": 2351 }, { "epoch": 0.3107309178584404, "grad_norm": 0.34517210721969604, "learning_rate": 0.00015625267249918737, "loss": 0.0362, "step": 2352 }, { "epoch": 0.31086303134392446, "grad_norm": 0.3376559317111969, "learning_rate": 0.00015621830591210808, "loss": 0.0372, "step": 2353 }, { "epoch": 0.3109951448294085, "grad_norm": 0.2009844183921814, "learning_rate": 0.00015618392961436756, "loss": 0.026, "step": 2354 }, { "epoch": 0.3111272583148925, "grad_norm": 0.23881836235523224, "learning_rate": 0.0001561495436119037, "loss": 0.0215, "step": 2355 }, { "epoch": 0.31125937180037655, "grad_norm": 0.28877967596054077, "learning_rate": 0.00015611514791065602, "loss": 0.0376, "step": 2356 }, { "epoch": 0.3113914852858606, "grad_norm": 0.20852817595005035, "learning_rate": 0.00015608074251656574, "loss": 0.023, "step": 2357 }, { "epoch": 0.3115235987713446, "grad_norm": 0.22282958030700684, "learning_rate": 0.00015604632743557577, "loss": 0.0257, "step": 2358 }, { "epoch": 0.31165571225682864, "grad_norm": 0.22633077204227448, "learning_rate": 0.00015601190267363062, "loss": 0.0495, "step": 2359 }, { "epoch": 0.3117878257423127, "grad_norm": 0.6149209141731262, "learning_rate": 0.00015597746823667655, "loss": 0.0493, "step": 2360 }, { "epoch": 0.31191993922779665, "grad_norm": 0.14970439672470093, "learning_rate": 0.0001559430241306615, "loss": 0.0164, "step": 2361 }, { "epoch": 0.3120520527132807, "grad_norm": 0.2662661373615265, "learning_rate": 0.00015590857036153498, "loss": 0.0258, "step": 2362 }, { "epoch": 0.3121841661987647, "grad_norm": 0.1701253205537796, "learning_rate": 0.0001558741069352483, "loss": 0.0278, "step": 2363 }, { "epoch": 0.31231627968424874, "grad_norm": 0.20933538675308228, "learning_rate": 0.0001558396338577543, "loss": 0.0187, "step": 2364 }, { "epoch": 0.3124483931697328, "grad_norm": 0.24754363298416138, "learning_rate": 0.00015580515113500763, "loss": 0.0559, "step": 2365 }, { "epoch": 0.3125805066552168, "grad_norm": 0.32924720644950867, "learning_rate": 0.00015577065877296452, "loss": 0.0303, "step": 2366 }, { "epoch": 0.31271262014070084, "grad_norm": 0.3301711678504944, "learning_rate": 0.0001557361567775828, "loss": 0.0436, "step": 2367 }, { "epoch": 0.31284473362618487, "grad_norm": 0.1682203859090805, "learning_rate": 0.00015570164515482215, "loss": 0.0202, "step": 2368 }, { "epoch": 0.3129768471116689, "grad_norm": 0.1786518096923828, "learning_rate": 0.00015566712391064378, "loss": 0.0312, "step": 2369 }, { "epoch": 0.31310896059715293, "grad_norm": 0.13824589550495148, "learning_rate": 0.00015563259305101057, "loss": 0.0191, "step": 2370 }, { "epoch": 0.31324107408263696, "grad_norm": 0.2650575637817383, "learning_rate": 0.00015559805258188707, "loss": 0.0264, "step": 2371 }, { "epoch": 0.313373187568121, "grad_norm": 0.220457062125206, "learning_rate": 0.00015556350250923954, "loss": 0.0208, "step": 2372 }, { "epoch": 0.313505301053605, "grad_norm": 0.25818371772766113, "learning_rate": 0.00015552894283903584, "loss": 0.0266, "step": 2373 }, { "epoch": 0.31363741453908905, "grad_norm": 0.25876984000205994, "learning_rate": 0.00015549437357724547, "loss": 0.0366, "step": 2374 }, { "epoch": 0.3137695280245731, "grad_norm": 0.17257654666900635, "learning_rate": 0.00015545979472983968, "loss": 0.0229, "step": 2375 }, { "epoch": 0.3139016415100571, "grad_norm": 0.21076619625091553, "learning_rate": 0.00015542520630279133, "loss": 0.0135, "step": 2376 }, { "epoch": 0.31403375499554115, "grad_norm": 0.2625904083251953, "learning_rate": 0.00015539060830207485, "loss": 0.0358, "step": 2377 }, { "epoch": 0.3141658684810252, "grad_norm": 0.2343330979347229, "learning_rate": 0.00015535600073366649, "loss": 0.0247, "step": 2378 }, { "epoch": 0.3142979819665092, "grad_norm": 0.3292272984981537, "learning_rate": 0.00015532138360354396, "loss": 0.0307, "step": 2379 }, { "epoch": 0.31443009545199324, "grad_norm": 0.24976593255996704, "learning_rate": 0.00015528675691768676, "loss": 0.0325, "step": 2380 }, { "epoch": 0.3145622089374773, "grad_norm": 0.15806466341018677, "learning_rate": 0.00015525212068207605, "loss": 0.0166, "step": 2381 }, { "epoch": 0.3146943224229613, "grad_norm": 0.2826812267303467, "learning_rate": 0.00015521747490269454, "loss": 0.024, "step": 2382 }, { "epoch": 0.31482643590844533, "grad_norm": 0.1427195966243744, "learning_rate": 0.00015518281958552666, "loss": 0.0176, "step": 2383 }, { "epoch": 0.31495854939392937, "grad_norm": 0.26324498653411865, "learning_rate": 0.00015514815473655847, "loss": 0.0364, "step": 2384 }, { "epoch": 0.3150906628794134, "grad_norm": 0.31639760732650757, "learning_rate": 0.00015511348036177766, "loss": 0.0447, "step": 2385 }, { "epoch": 0.31522277636489743, "grad_norm": 0.22099065780639648, "learning_rate": 0.0001550787964671736, "loss": 0.0293, "step": 2386 }, { "epoch": 0.31535488985038146, "grad_norm": 0.16949567198753357, "learning_rate": 0.00015504410305873726, "loss": 0.0203, "step": 2387 }, { "epoch": 0.3154870033358655, "grad_norm": 0.18645323812961578, "learning_rate": 0.0001550094001424613, "loss": 0.0236, "step": 2388 }, { "epoch": 0.3156191168213495, "grad_norm": 0.23583142459392548, "learning_rate": 0.00015497468772434, "loss": 0.0153, "step": 2389 }, { "epoch": 0.31575123030683355, "grad_norm": 0.41702818870544434, "learning_rate": 0.00015493996581036928, "loss": 0.0274, "step": 2390 }, { "epoch": 0.3158833437923176, "grad_norm": 0.24233517050743103, "learning_rate": 0.00015490523440654666, "loss": 0.0124, "step": 2391 }, { "epoch": 0.3160154572778016, "grad_norm": 0.16888493299484253, "learning_rate": 0.0001548704935188714, "loss": 0.0211, "step": 2392 }, { "epoch": 0.31614757076328565, "grad_norm": 0.24889791011810303, "learning_rate": 0.00015483574315334431, "loss": 0.0192, "step": 2393 }, { "epoch": 0.3162796842487697, "grad_norm": 0.31684616208076477, "learning_rate": 0.00015480098331596784, "loss": 0.0249, "step": 2394 }, { "epoch": 0.3164117977342537, "grad_norm": 0.2519291937351227, "learning_rate": 0.00015476621401274617, "loss": 0.0287, "step": 2395 }, { "epoch": 0.31654391121973774, "grad_norm": 0.15193960070610046, "learning_rate": 0.00015473143524968497, "loss": 0.0164, "step": 2396 }, { "epoch": 0.31667602470522177, "grad_norm": 0.12480553984642029, "learning_rate": 0.0001546966470327917, "loss": 0.0167, "step": 2397 }, { "epoch": 0.3168081381907058, "grad_norm": 0.22492715716362, "learning_rate": 0.00015466184936807528, "loss": 0.0213, "step": 2398 }, { "epoch": 0.31694025167618983, "grad_norm": 0.2097017616033554, "learning_rate": 0.00015462704226154646, "loss": 0.031, "step": 2399 }, { "epoch": 0.31707236516167386, "grad_norm": 0.18435464799404144, "learning_rate": 0.00015459222571921743, "loss": 0.0264, "step": 2400 }, { "epoch": 0.3172044786471579, "grad_norm": 0.23296067118644714, "learning_rate": 0.00015455739974710214, "loss": 0.0199, "step": 2401 }, { "epoch": 0.3173365921326419, "grad_norm": 0.15619218349456787, "learning_rate": 0.00015452256435121616, "loss": 0.026, "step": 2402 }, { "epoch": 0.31746870561812596, "grad_norm": 0.31736814975738525, "learning_rate": 0.00015448771953757658, "loss": 0.0259, "step": 2403 }, { "epoch": 0.31760081910361, "grad_norm": 0.21897679567337036, "learning_rate": 0.0001544528653122022, "loss": 0.0342, "step": 2404 }, { "epoch": 0.317732932589094, "grad_norm": 0.15073131024837494, "learning_rate": 0.0001544180016811135, "loss": 0.0206, "step": 2405 }, { "epoch": 0.31786504607457805, "grad_norm": 0.199736088514328, "learning_rate": 0.00015438312865033245, "loss": 0.0265, "step": 2406 }, { "epoch": 0.3179971595600621, "grad_norm": 0.369244247674942, "learning_rate": 0.00015434824622588276, "loss": 0.0283, "step": 2407 }, { "epoch": 0.3181292730455461, "grad_norm": 0.21546632051467896, "learning_rate": 0.00015431335441378968, "loss": 0.0118, "step": 2408 }, { "epoch": 0.31826138653103014, "grad_norm": 0.14493218064308167, "learning_rate": 0.00015427845322008013, "loss": 0.0137, "step": 2409 }, { "epoch": 0.3183935000165142, "grad_norm": 0.1289505809545517, "learning_rate": 0.00015424354265078266, "loss": 0.0123, "step": 2410 }, { "epoch": 0.3185256135019982, "grad_norm": 0.27677011489868164, "learning_rate": 0.00015420862271192743, "loss": 0.0401, "step": 2411 }, { "epoch": 0.31865772698748224, "grad_norm": 0.19438651204109192, "learning_rate": 0.00015417369340954616, "loss": 0.0292, "step": 2412 }, { "epoch": 0.31878984047296627, "grad_norm": 0.2415550947189331, "learning_rate": 0.00015413875474967222, "loss": 0.0437, "step": 2413 }, { "epoch": 0.3189219539584503, "grad_norm": 0.23373529314994812, "learning_rate": 0.00015410380673834068, "loss": 0.0366, "step": 2414 }, { "epoch": 0.31905406744393433, "grad_norm": 0.45507702231407166, "learning_rate": 0.0001540688493815881, "loss": 0.0236, "step": 2415 }, { "epoch": 0.31918618092941836, "grad_norm": 0.26520341634750366, "learning_rate": 0.00015403388268545276, "loss": 0.0169, "step": 2416 }, { "epoch": 0.3193182944149024, "grad_norm": 0.17684026062488556, "learning_rate": 0.00015399890665597442, "loss": 0.0273, "step": 2417 }, { "epoch": 0.3194504079003864, "grad_norm": 0.22508570551872253, "learning_rate": 0.00015396392129919467, "loss": 0.0176, "step": 2418 }, { "epoch": 0.31958252138587045, "grad_norm": 0.20308654010295868, "learning_rate": 0.00015392892662115644, "loss": 0.0331, "step": 2419 }, { "epoch": 0.3197146348713545, "grad_norm": 0.31760069727897644, "learning_rate": 0.00015389392262790444, "loss": 0.0253, "step": 2420 }, { "epoch": 0.3198467483568385, "grad_norm": 0.2112942934036255, "learning_rate": 0.00015385890932548502, "loss": 0.0279, "step": 2421 }, { "epoch": 0.31997886184232255, "grad_norm": 0.25001874566078186, "learning_rate": 0.00015382388671994599, "loss": 0.0361, "step": 2422 }, { "epoch": 0.3201109753278066, "grad_norm": 0.30203571915626526, "learning_rate": 0.00015378885481733692, "loss": 0.0189, "step": 2423 }, { "epoch": 0.3202430888132906, "grad_norm": 0.1823374330997467, "learning_rate": 0.00015375381362370884, "loss": 0.0217, "step": 2424 }, { "epoch": 0.32037520229877464, "grad_norm": 0.2312767654657364, "learning_rate": 0.00015371876314511455, "loss": 0.0212, "step": 2425 }, { "epoch": 0.3205073157842587, "grad_norm": 0.2587668001651764, "learning_rate": 0.0001536837033876083, "loss": 0.024, "step": 2426 }, { "epoch": 0.3206394292697427, "grad_norm": 0.29623347520828247, "learning_rate": 0.00015364863435724606, "loss": 0.0307, "step": 2427 }, { "epoch": 0.32077154275522674, "grad_norm": 0.18185120820999146, "learning_rate": 0.00015361355606008527, "loss": 0.0184, "step": 2428 }, { "epoch": 0.32090365624071077, "grad_norm": 0.2674174904823303, "learning_rate": 0.00015357846850218513, "loss": 0.0335, "step": 2429 }, { "epoch": 0.3210357697261948, "grad_norm": 0.22505123913288116, "learning_rate": 0.00015354337168960633, "loss": 0.0297, "step": 2430 }, { "epoch": 0.32116788321167883, "grad_norm": 0.21133509278297424, "learning_rate": 0.00015350826562841117, "loss": 0.0198, "step": 2431 }, { "epoch": 0.32129999669716286, "grad_norm": 0.1815750002861023, "learning_rate": 0.00015347315032466358, "loss": 0.0222, "step": 2432 }, { "epoch": 0.3214321101826469, "grad_norm": 0.16259314119815826, "learning_rate": 0.0001534380257844291, "loss": 0.0172, "step": 2433 }, { "epoch": 0.3215642236681309, "grad_norm": 0.2182605117559433, "learning_rate": 0.00015340289201377477, "loss": 0.027, "step": 2434 }, { "epoch": 0.32169633715361495, "grad_norm": 0.1356114149093628, "learning_rate": 0.00015336774901876936, "loss": 0.0198, "step": 2435 }, { "epoch": 0.321828450639099, "grad_norm": 0.2803022861480713, "learning_rate": 0.00015333259680548313, "loss": 0.0238, "step": 2436 }, { "epoch": 0.321960564124583, "grad_norm": 0.23340195417404175, "learning_rate": 0.000153297435379988, "loss": 0.0224, "step": 2437 }, { "epoch": 0.32209267761006705, "grad_norm": 0.15403622388839722, "learning_rate": 0.0001532622647483574, "loss": 0.021, "step": 2438 }, { "epoch": 0.3222247910955511, "grad_norm": 0.21697868406772614, "learning_rate": 0.00015322708491666642, "loss": 0.0309, "step": 2439 }, { "epoch": 0.3223569045810351, "grad_norm": 0.27215054631233215, "learning_rate": 0.00015319189589099174, "loss": 0.0216, "step": 2440 }, { "epoch": 0.32248901806651914, "grad_norm": 0.18713687360286713, "learning_rate": 0.00015315669767741155, "loss": 0.0211, "step": 2441 }, { "epoch": 0.32262113155200317, "grad_norm": 0.24193894863128662, "learning_rate": 0.00015312149028200576, "loss": 0.033, "step": 2442 }, { "epoch": 0.3227532450374872, "grad_norm": 0.5523065328598022, "learning_rate": 0.00015308627371085574, "loss": 0.0321, "step": 2443 }, { "epoch": 0.32288535852297123, "grad_norm": 0.37535157799720764, "learning_rate": 0.00015305104797004452, "loss": 0.0162, "step": 2444 }, { "epoch": 0.32301747200845526, "grad_norm": 0.220017671585083, "learning_rate": 0.00015301581306565666, "loss": 0.0286, "step": 2445 }, { "epoch": 0.3231495854939393, "grad_norm": 0.15219157934188843, "learning_rate": 0.00015298056900377833, "loss": 0.0165, "step": 2446 }, { "epoch": 0.3232816989794233, "grad_norm": 0.2553863525390625, "learning_rate": 0.00015294531579049733, "loss": 0.0212, "step": 2447 }, { "epoch": 0.32341381246490736, "grad_norm": 0.2089850753545761, "learning_rate": 0.00015291005343190292, "loss": 0.0222, "step": 2448 }, { "epoch": 0.3235459259503914, "grad_norm": 0.19783416390419006, "learning_rate": 0.00015287478193408608, "loss": 0.0214, "step": 2449 }, { "epoch": 0.3236780394358754, "grad_norm": 0.2705092132091522, "learning_rate": 0.00015283950130313926, "loss": 0.0409, "step": 2450 }, { "epoch": 0.32381015292135945, "grad_norm": 0.28372472524642944, "learning_rate": 0.00015280421154515656, "loss": 0.0274, "step": 2451 }, { "epoch": 0.3239422664068435, "grad_norm": 0.17090560495853424, "learning_rate": 0.00015276891266623362, "loss": 0.0151, "step": 2452 }, { "epoch": 0.3240743798923275, "grad_norm": 0.17509298026561737, "learning_rate": 0.00015273360467246762, "loss": 0.0206, "step": 2453 }, { "epoch": 0.32420649337781154, "grad_norm": 0.27054014801979065, "learning_rate": 0.0001526982875699574, "loss": 0.025, "step": 2454 }, { "epoch": 0.3243386068632956, "grad_norm": 0.17075131833553314, "learning_rate": 0.00015266296136480333, "loss": 0.017, "step": 2455 }, { "epoch": 0.3244707203487796, "grad_norm": 0.21123848855495453, "learning_rate": 0.0001526276260631073, "loss": 0.0274, "step": 2456 }, { "epoch": 0.32460283383426364, "grad_norm": 0.15472885966300964, "learning_rate": 0.00015259228167097287, "loss": 0.0106, "step": 2457 }, { "epoch": 0.32473494731974767, "grad_norm": 0.264280766248703, "learning_rate": 0.00015255692819450512, "loss": 0.0288, "step": 2458 }, { "epoch": 0.3248670608052317, "grad_norm": 0.3066900670528412, "learning_rate": 0.00015252156563981073, "loss": 0.0302, "step": 2459 }, { "epoch": 0.32499917429071573, "grad_norm": 0.24605877697467804, "learning_rate": 0.00015248619401299785, "loss": 0.0322, "step": 2460 }, { "epoch": 0.32513128777619976, "grad_norm": 0.2953263819217682, "learning_rate": 0.0001524508133201763, "loss": 0.0308, "step": 2461 }, { "epoch": 0.3252634012616838, "grad_norm": 0.25516271591186523, "learning_rate": 0.00015241542356745749, "loss": 0.025, "step": 2462 }, { "epoch": 0.3253955147471678, "grad_norm": 0.23421679437160492, "learning_rate": 0.00015238002476095422, "loss": 0.022, "step": 2463 }, { "epoch": 0.32552762823265186, "grad_norm": 0.1896619200706482, "learning_rate": 0.0001523446169067811, "loss": 0.0235, "step": 2464 }, { "epoch": 0.3256597417181359, "grad_norm": 0.1484084129333496, "learning_rate": 0.00015230920001105405, "loss": 0.0216, "step": 2465 }, { "epoch": 0.3257918552036199, "grad_norm": 0.20647959411144257, "learning_rate": 0.00015227377407989073, "loss": 0.0196, "step": 2466 }, { "epoch": 0.32592396868910395, "grad_norm": 0.2000134289264679, "learning_rate": 0.00015223833911941036, "loss": 0.0289, "step": 2467 }, { "epoch": 0.326056082174588, "grad_norm": 0.15682335197925568, "learning_rate": 0.00015220289513573362, "loss": 0.0122, "step": 2468 }, { "epoch": 0.326188195660072, "grad_norm": 0.2056158483028412, "learning_rate": 0.0001521674421349828, "loss": 0.0246, "step": 2469 }, { "epoch": 0.32632030914555604, "grad_norm": 0.47918254137039185, "learning_rate": 0.0001521319801232817, "loss": 0.0349, "step": 2470 }, { "epoch": 0.3264524226310401, "grad_norm": 0.14794687926769257, "learning_rate": 0.00015209650910675578, "loss": 0.0148, "step": 2471 }, { "epoch": 0.3265845361165241, "grad_norm": 0.21690905094146729, "learning_rate": 0.00015206102909153197, "loss": 0.0227, "step": 2472 }, { "epoch": 0.32671664960200814, "grad_norm": 0.3061348795890808, "learning_rate": 0.0001520255400837388, "loss": 0.032, "step": 2473 }, { "epoch": 0.32684876308749217, "grad_norm": 0.21932987868785858, "learning_rate": 0.0001519900420895063, "loss": 0.0317, "step": 2474 }, { "epoch": 0.3269808765729762, "grad_norm": 0.23280304670333862, "learning_rate": 0.0001519545351149661, "loss": 0.0315, "step": 2475 }, { "epoch": 0.32711299005846023, "grad_norm": 0.3209573030471802, "learning_rate": 0.0001519190191662514, "loss": 0.0353, "step": 2476 }, { "epoch": 0.32724510354394426, "grad_norm": 0.2334751933813095, "learning_rate": 0.00015188349424949683, "loss": 0.031, "step": 2477 }, { "epoch": 0.3273772170294283, "grad_norm": 0.1476392149925232, "learning_rate": 0.00015184796037083875, "loss": 0.0164, "step": 2478 }, { "epoch": 0.3275093305149123, "grad_norm": 0.34847354888916016, "learning_rate": 0.0001518124175364149, "loss": 0.0243, "step": 2479 }, { "epoch": 0.32764144400039635, "grad_norm": 0.2759470045566559, "learning_rate": 0.0001517768657523647, "loss": 0.0244, "step": 2480 }, { "epoch": 0.3277735574858804, "grad_norm": 0.28842800855636597, "learning_rate": 0.00015174130502482899, "loss": 0.039, "step": 2481 }, { "epoch": 0.3279056709713644, "grad_norm": 0.14972680807113647, "learning_rate": 0.00015170573535995029, "loss": 0.0162, "step": 2482 }, { "epoch": 0.32803778445684845, "grad_norm": 0.23471418023109436, "learning_rate": 0.00015167015676387257, "loss": 0.0304, "step": 2483 }, { "epoch": 0.3281698979423325, "grad_norm": 0.1672179400920868, "learning_rate": 0.00015163456924274134, "loss": 0.0155, "step": 2484 }, { "epoch": 0.3283020114278165, "grad_norm": 0.24049347639083862, "learning_rate": 0.00015159897280270373, "loss": 0.0271, "step": 2485 }, { "epoch": 0.32843412491330054, "grad_norm": 0.288613885641098, "learning_rate": 0.00015156336744990827, "loss": 0.0335, "step": 2486 }, { "epoch": 0.32856623839878457, "grad_norm": 0.2593197822570801, "learning_rate": 0.00015152775319050523, "loss": 0.0194, "step": 2487 }, { "epoch": 0.3286983518842686, "grad_norm": 0.1788790225982666, "learning_rate": 0.00015149213003064622, "loss": 0.0198, "step": 2488 }, { "epoch": 0.32883046536975263, "grad_norm": 0.21538689732551575, "learning_rate": 0.00015145649797648455, "loss": 0.0231, "step": 2489 }, { "epoch": 0.32896257885523666, "grad_norm": 0.2356831431388855, "learning_rate": 0.0001514208570341749, "loss": 0.0231, "step": 2490 }, { "epoch": 0.3290946923407207, "grad_norm": 0.12992119789123535, "learning_rate": 0.00015138520720987366, "loss": 0.0129, "step": 2491 }, { "epoch": 0.3292268058262047, "grad_norm": 0.19276019930839539, "learning_rate": 0.00015134954850973864, "loss": 0.0189, "step": 2492 }, { "epoch": 0.32935891931168876, "grad_norm": 0.20604351162910461, "learning_rate": 0.00015131388093992916, "loss": 0.0141, "step": 2493 }, { "epoch": 0.3294910327971728, "grad_norm": 0.3511555790901184, "learning_rate": 0.00015127820450660625, "loss": 0.0428, "step": 2494 }, { "epoch": 0.3296231462826568, "grad_norm": 0.18908363580703735, "learning_rate": 0.00015124251921593227, "loss": 0.0233, "step": 2495 }, { "epoch": 0.32975525976814085, "grad_norm": 0.1390237957239151, "learning_rate": 0.00015120682507407113, "loss": 0.0225, "step": 2496 }, { "epoch": 0.3298873732536249, "grad_norm": 0.20404070615768433, "learning_rate": 0.00015117112208718844, "loss": 0.0336, "step": 2497 }, { "epoch": 0.3300194867391089, "grad_norm": 0.14441095292568207, "learning_rate": 0.00015113541026145114, "loss": 0.0142, "step": 2498 }, { "epoch": 0.33015160022459294, "grad_norm": 0.11000459641218185, "learning_rate": 0.00015109968960302784, "loss": 0.0122, "step": 2499 }, { "epoch": 0.330283713710077, "grad_norm": 0.21273118257522583, "learning_rate": 0.00015106396011808855, "loss": 0.0184, "step": 2500 }, { "epoch": 0.330415827195561, "grad_norm": 0.19047781825065613, "learning_rate": 0.0001510282218128049, "loss": 0.0199, "step": 2501 }, { "epoch": 0.33054794068104504, "grad_norm": 0.3213901221752167, "learning_rate": 0.00015099247469335008, "loss": 0.0299, "step": 2502 }, { "epoch": 0.33068005416652907, "grad_norm": 0.2068183869123459, "learning_rate": 0.00015095671876589863, "loss": 0.0237, "step": 2503 }, { "epoch": 0.3308121676520131, "grad_norm": 0.13071802258491516, "learning_rate": 0.00015092095403662677, "loss": 0.0169, "step": 2504 }, { "epoch": 0.33094428113749713, "grad_norm": 0.1935047060251236, "learning_rate": 0.00015088518051171218, "loss": 0.0188, "step": 2505 }, { "epoch": 0.33107639462298116, "grad_norm": 0.20110556483268738, "learning_rate": 0.0001508493981973341, "loss": 0.0247, "step": 2506 }, { "epoch": 0.3312085081084652, "grad_norm": 0.19901902973651886, "learning_rate": 0.00015081360709967318, "loss": 0.0262, "step": 2507 }, { "epoch": 0.3313406215939492, "grad_norm": 0.24266882240772247, "learning_rate": 0.00015077780722491175, "loss": 0.0291, "step": 2508 }, { "epoch": 0.33147273507943326, "grad_norm": 0.1843455731868744, "learning_rate": 0.00015074199857923352, "loss": 0.0205, "step": 2509 }, { "epoch": 0.3316048485649173, "grad_norm": 0.30212709307670593, "learning_rate": 0.00015070618116882375, "loss": 0.0368, "step": 2510 }, { "epoch": 0.3317369620504013, "grad_norm": 0.23659828305244446, "learning_rate": 0.00015067035499986928, "loss": 0.0154, "step": 2511 }, { "epoch": 0.33186907553588535, "grad_norm": 0.2850700318813324, "learning_rate": 0.00015063452007855834, "loss": 0.0235, "step": 2512 }, { "epoch": 0.3320011890213694, "grad_norm": 0.3431922495365143, "learning_rate": 0.00015059867641108082, "loss": 0.0425, "step": 2513 }, { "epoch": 0.3321333025068534, "grad_norm": 0.15030327439308167, "learning_rate": 0.000150562824003628, "loss": 0.016, "step": 2514 }, { "epoch": 0.33226541599233744, "grad_norm": 0.19416648149490356, "learning_rate": 0.00015052696286239274, "loss": 0.0147, "step": 2515 }, { "epoch": 0.3323975294778215, "grad_norm": 0.2212238758802414, "learning_rate": 0.00015049109299356933, "loss": 0.0273, "step": 2516 }, { "epoch": 0.3325296429633055, "grad_norm": 0.47250092029571533, "learning_rate": 0.00015045521440335363, "loss": 0.0266, "step": 2517 }, { "epoch": 0.33266175644878954, "grad_norm": 0.21963316202163696, "learning_rate": 0.00015041932709794308, "loss": 0.0239, "step": 2518 }, { "epoch": 0.33279386993427357, "grad_norm": 0.1677628606557846, "learning_rate": 0.00015038343108353646, "loss": 0.0202, "step": 2519 }, { "epoch": 0.3329259834197576, "grad_norm": 0.21540825068950653, "learning_rate": 0.0001503475263663341, "loss": 0.0252, "step": 2520 }, { "epoch": 0.33305809690524163, "grad_norm": 0.23379550874233246, "learning_rate": 0.00015031161295253796, "loss": 0.0292, "step": 2521 }, { "epoch": 0.33319021039072566, "grad_norm": 0.22972875833511353, "learning_rate": 0.00015027569084835138, "loss": 0.0269, "step": 2522 }, { "epoch": 0.3333223238762097, "grad_norm": 0.18469804525375366, "learning_rate": 0.0001502397600599792, "loss": 0.0145, "step": 2523 }, { "epoch": 0.33345443736169367, "grad_norm": 0.22447286546230316, "learning_rate": 0.00015020382059362786, "loss": 0.022, "step": 2524 }, { "epoch": 0.3335865508471777, "grad_norm": 0.1887979358434677, "learning_rate": 0.00015016787245550515, "loss": 0.0211, "step": 2525 }, { "epoch": 0.33371866433266173, "grad_norm": 0.19186192750930786, "learning_rate": 0.0001501319156518205, "loss": 0.0188, "step": 2526 }, { "epoch": 0.33385077781814576, "grad_norm": 0.22555392980575562, "learning_rate": 0.00015009595018878472, "loss": 0.0274, "step": 2527 }, { "epoch": 0.3339828913036298, "grad_norm": 0.18418656289577484, "learning_rate": 0.00015005997607261024, "loss": 0.0196, "step": 2528 }, { "epoch": 0.3341150047891138, "grad_norm": 0.19265905022621155, "learning_rate": 0.00015002399330951084, "loss": 0.0291, "step": 2529 }, { "epoch": 0.33424711827459785, "grad_norm": 0.374276340007782, "learning_rate": 0.00014998800190570193, "loss": 0.0331, "step": 2530 }, { "epoch": 0.3343792317600819, "grad_norm": 0.17162999510765076, "learning_rate": 0.00014995200186740032, "loss": 0.0182, "step": 2531 }, { "epoch": 0.3345113452455659, "grad_norm": 0.2706623077392578, "learning_rate": 0.00014991599320082438, "loss": 0.0287, "step": 2532 }, { "epoch": 0.33464345873104995, "grad_norm": 0.2407575100660324, "learning_rate": 0.00014987997591219386, "loss": 0.028, "step": 2533 }, { "epoch": 0.334775572216534, "grad_norm": 0.2555379271507263, "learning_rate": 0.00014984395000773015, "loss": 0.0269, "step": 2534 }, { "epoch": 0.334907685702018, "grad_norm": 0.1932975947856903, "learning_rate": 0.00014980791549365602, "loss": 0.023, "step": 2535 }, { "epoch": 0.33503979918750204, "grad_norm": 0.15046212077140808, "learning_rate": 0.00014977187237619576, "loss": 0.016, "step": 2536 }, { "epoch": 0.33517191267298607, "grad_norm": 0.3287968933582306, "learning_rate": 0.00014973582066157514, "loss": 0.0333, "step": 2537 }, { "epoch": 0.3353040261584701, "grad_norm": 0.10586533695459366, "learning_rate": 0.00014969976035602144, "loss": 0.0084, "step": 2538 }, { "epoch": 0.33543613964395413, "grad_norm": 0.25480660796165466, "learning_rate": 0.00014966369146576338, "loss": 0.0367, "step": 2539 }, { "epoch": 0.33556825312943817, "grad_norm": 0.1262090504169464, "learning_rate": 0.0001496276139970312, "loss": 0.0148, "step": 2540 }, { "epoch": 0.3357003666149222, "grad_norm": 0.21672266721725464, "learning_rate": 0.0001495915279560566, "loss": 0.0272, "step": 2541 }, { "epoch": 0.3358324801004062, "grad_norm": 0.2435304820537567, "learning_rate": 0.00014955543334907277, "loss": 0.0271, "step": 2542 }, { "epoch": 0.33596459358589026, "grad_norm": 0.21179917454719543, "learning_rate": 0.00014951933018231435, "loss": 0.0213, "step": 2543 }, { "epoch": 0.3360967070713743, "grad_norm": 0.20296365022659302, "learning_rate": 0.00014948321846201758, "loss": 0.0239, "step": 2544 }, { "epoch": 0.3362288205568583, "grad_norm": 0.24866227805614471, "learning_rate": 0.00014944709819441994, "loss": 0.0235, "step": 2545 }, { "epoch": 0.33636093404234235, "grad_norm": 0.2591897249221802, "learning_rate": 0.00014941096938576068, "loss": 0.0348, "step": 2546 }, { "epoch": 0.3364930475278264, "grad_norm": 0.24188803136348724, "learning_rate": 0.00014937483204228029, "loss": 0.0199, "step": 2547 }, { "epoch": 0.3366251610133104, "grad_norm": 0.20329833030700684, "learning_rate": 0.00014933868617022085, "loss": 0.0195, "step": 2548 }, { "epoch": 0.33675727449879445, "grad_norm": 0.1957000344991684, "learning_rate": 0.00014930253177582585, "loss": 0.0198, "step": 2549 }, { "epoch": 0.3368893879842785, "grad_norm": 0.25972720980644226, "learning_rate": 0.00014926636886534032, "loss": 0.0277, "step": 2550 }, { "epoch": 0.3370215014697625, "grad_norm": 0.16775253415107727, "learning_rate": 0.00014923019744501073, "loss": 0.0125, "step": 2551 }, { "epoch": 0.33715361495524654, "grad_norm": 0.22409114241600037, "learning_rate": 0.000149194017521085, "loss": 0.026, "step": 2552 }, { "epoch": 0.33728572844073057, "grad_norm": 0.33436572551727295, "learning_rate": 0.00014915782909981248, "loss": 0.0411, "step": 2553 }, { "epoch": 0.3374178419262146, "grad_norm": 0.21022121608257294, "learning_rate": 0.00014912163218744418, "loss": 0.0336, "step": 2554 }, { "epoch": 0.33754995541169863, "grad_norm": 0.1965954452753067, "learning_rate": 0.0001490854267902323, "loss": 0.0166, "step": 2555 }, { "epoch": 0.33768206889718266, "grad_norm": 0.2767995595932007, "learning_rate": 0.00014904921291443074, "loss": 0.0345, "step": 2556 }, { "epoch": 0.3378141823826667, "grad_norm": 0.10635353624820709, "learning_rate": 0.00014901299056629475, "loss": 0.0149, "step": 2557 }, { "epoch": 0.3379462958681507, "grad_norm": 0.23388898372650146, "learning_rate": 0.000148976759752081, "loss": 0.0204, "step": 2558 }, { "epoch": 0.33807840935363476, "grad_norm": 0.19081705808639526, "learning_rate": 0.00014894052047804775, "loss": 0.0254, "step": 2559 }, { "epoch": 0.3382105228391188, "grad_norm": 0.2938932478427887, "learning_rate": 0.00014890427275045468, "loss": 0.0442, "step": 2560 }, { "epoch": 0.3383426363246028, "grad_norm": 0.2676193118095398, "learning_rate": 0.00014886801657556283, "loss": 0.0197, "step": 2561 }, { "epoch": 0.33847474981008685, "grad_norm": 0.15962931513786316, "learning_rate": 0.00014883175195963482, "loss": 0.0188, "step": 2562 }, { "epoch": 0.3386068632955709, "grad_norm": 0.4894774854183197, "learning_rate": 0.00014879547890893469, "loss": 0.0276, "step": 2563 }, { "epoch": 0.3387389767810549, "grad_norm": 0.33002814650535583, "learning_rate": 0.00014875919742972794, "loss": 0.0333, "step": 2564 }, { "epoch": 0.33887109026653894, "grad_norm": 0.27643507719039917, "learning_rate": 0.00014872290752828145, "loss": 0.0223, "step": 2565 }, { "epoch": 0.339003203752023, "grad_norm": 0.2616123557090759, "learning_rate": 0.0001486866092108637, "loss": 0.02, "step": 2566 }, { "epoch": 0.339135317237507, "grad_norm": 0.1584719866514206, "learning_rate": 0.0001486503024837445, "loss": 0.0191, "step": 2567 }, { "epoch": 0.33926743072299104, "grad_norm": 0.14762555062770844, "learning_rate": 0.00014861398735319518, "loss": 0.0138, "step": 2568 }, { "epoch": 0.33939954420847507, "grad_norm": 0.198564350605011, "learning_rate": 0.0001485776638254885, "loss": 0.035, "step": 2569 }, { "epoch": 0.3395316576939591, "grad_norm": 0.15630686283111572, "learning_rate": 0.00014854133190689867, "loss": 0.0166, "step": 2570 }, { "epoch": 0.33966377117944313, "grad_norm": 0.23185116052627563, "learning_rate": 0.00014850499160370134, "loss": 0.0179, "step": 2571 }, { "epoch": 0.33979588466492716, "grad_norm": 0.5001983046531677, "learning_rate": 0.0001484686429221736, "loss": 0.0247, "step": 2572 }, { "epoch": 0.3399279981504112, "grad_norm": 0.2338925302028656, "learning_rate": 0.00014843228586859406, "loss": 0.021, "step": 2573 }, { "epoch": 0.3400601116358952, "grad_norm": 0.2724718153476715, "learning_rate": 0.00014839592044924265, "loss": 0.0215, "step": 2574 }, { "epoch": 0.34019222512137925, "grad_norm": 0.1371961534023285, "learning_rate": 0.00014835954667040085, "loss": 0.0134, "step": 2575 }, { "epoch": 0.3403243386068633, "grad_norm": 0.24581897258758545, "learning_rate": 0.0001483231645383516, "loss": 0.029, "step": 2576 }, { "epoch": 0.3404564520923473, "grad_norm": 0.30089905858039856, "learning_rate": 0.00014828677405937917, "loss": 0.0341, "step": 2577 }, { "epoch": 0.34058856557783135, "grad_norm": 0.22530049085617065, "learning_rate": 0.00014825037523976935, "loss": 0.0243, "step": 2578 }, { "epoch": 0.3407206790633154, "grad_norm": 0.18343088030815125, "learning_rate": 0.00014821396808580934, "loss": 0.0186, "step": 2579 }, { "epoch": 0.3408527925487994, "grad_norm": 0.2015973925590515, "learning_rate": 0.00014817755260378786, "loss": 0.0213, "step": 2580 }, { "epoch": 0.34098490603428344, "grad_norm": 0.24317368865013123, "learning_rate": 0.00014814112879999488, "loss": 0.0309, "step": 2581 }, { "epoch": 0.3411170195197675, "grad_norm": 0.2161998599767685, "learning_rate": 0.00014810469668072207, "loss": 0.0305, "step": 2582 }, { "epoch": 0.3412491330052515, "grad_norm": 0.22956828773021698, "learning_rate": 0.00014806825625226234, "loss": 0.0189, "step": 2583 }, { "epoch": 0.34138124649073553, "grad_norm": 0.1916801631450653, "learning_rate": 0.00014803180752091005, "loss": 0.0298, "step": 2584 }, { "epoch": 0.34151335997621957, "grad_norm": 0.2665329873561859, "learning_rate": 0.0001479953504929611, "loss": 0.0266, "step": 2585 }, { "epoch": 0.3416454734617036, "grad_norm": 0.2349788397550583, "learning_rate": 0.0001479588851747127, "loss": 0.0302, "step": 2586 }, { "epoch": 0.34177758694718763, "grad_norm": 0.14647245407104492, "learning_rate": 0.00014792241157246362, "loss": 0.0168, "step": 2587 }, { "epoch": 0.34190970043267166, "grad_norm": 0.17012257874011993, "learning_rate": 0.00014788592969251397, "loss": 0.021, "step": 2588 }, { "epoch": 0.3420418139181557, "grad_norm": 0.30817341804504395, "learning_rate": 0.0001478494395411653, "loss": 0.0369, "step": 2589 }, { "epoch": 0.3421739274036397, "grad_norm": 0.23504842817783356, "learning_rate": 0.00014781294112472057, "loss": 0.0213, "step": 2590 }, { "epoch": 0.34230604088912375, "grad_norm": 0.18608969449996948, "learning_rate": 0.00014777643444948424, "loss": 0.0193, "step": 2591 }, { "epoch": 0.3424381543746078, "grad_norm": 0.2825300991535187, "learning_rate": 0.00014773991952176215, "loss": 0.0246, "step": 2592 }, { "epoch": 0.3425702678600918, "grad_norm": 0.1629979908466339, "learning_rate": 0.00014770339634786157, "loss": 0.016, "step": 2593 }, { "epoch": 0.34270238134557585, "grad_norm": 0.18643341958522797, "learning_rate": 0.00014766686493409122, "loss": 0.0105, "step": 2594 }, { "epoch": 0.3428344948310599, "grad_norm": 0.1917845457792282, "learning_rate": 0.00014763032528676114, "loss": 0.017, "step": 2595 }, { "epoch": 0.3429666083165439, "grad_norm": 0.2119200974702835, "learning_rate": 0.00014759377741218298, "loss": 0.0203, "step": 2596 }, { "epoch": 0.34309872180202794, "grad_norm": 0.153628870844841, "learning_rate": 0.00014755722131666962, "loss": 0.0165, "step": 2597 }, { "epoch": 0.34323083528751197, "grad_norm": 0.15713736414909363, "learning_rate": 0.00014752065700653546, "loss": 0.0208, "step": 2598 }, { "epoch": 0.343362948772996, "grad_norm": 0.18878145515918732, "learning_rate": 0.00014748408448809631, "loss": 0.0267, "step": 2599 }, { "epoch": 0.34349506225848003, "grad_norm": 0.21092022955417633, "learning_rate": 0.0001474475037676694, "loss": 0.023, "step": 2600 }, { "epoch": 0.34362717574396406, "grad_norm": 0.22435013949871063, "learning_rate": 0.00014741091485157335, "loss": 0.0242, "step": 2601 }, { "epoch": 0.3437592892294481, "grad_norm": 0.24122808873653412, "learning_rate": 0.0001473743177461282, "loss": 0.0226, "step": 2602 }, { "epoch": 0.3438914027149321, "grad_norm": 0.28979021310806274, "learning_rate": 0.00014733771245765544, "loss": 0.0236, "step": 2603 }, { "epoch": 0.34402351620041616, "grad_norm": 0.1095261350274086, "learning_rate": 0.00014730109899247794, "loss": 0.0104, "step": 2604 }, { "epoch": 0.3441556296859002, "grad_norm": 0.2334039807319641, "learning_rate": 0.00014726447735692, "loss": 0.0262, "step": 2605 }, { "epoch": 0.3442877431713842, "grad_norm": 0.144916832447052, "learning_rate": 0.00014722784755730732, "loss": 0.0168, "step": 2606 }, { "epoch": 0.34441985665686825, "grad_norm": 0.1582469791173935, "learning_rate": 0.000147191209599967, "loss": 0.018, "step": 2607 }, { "epoch": 0.3445519701423523, "grad_norm": 0.19044512510299683, "learning_rate": 0.00014715456349122754, "loss": 0.0261, "step": 2608 }, { "epoch": 0.3446840836278363, "grad_norm": 0.15183739364147186, "learning_rate": 0.00014711790923741894, "loss": 0.0181, "step": 2609 }, { "epoch": 0.34481619711332034, "grad_norm": 0.580545961856842, "learning_rate": 0.00014708124684487245, "loss": 0.0423, "step": 2610 }, { "epoch": 0.3449483105988044, "grad_norm": 0.19683608412742615, "learning_rate": 0.00014704457631992091, "loss": 0.0108, "step": 2611 }, { "epoch": 0.3450804240842884, "grad_norm": 0.2760066092014313, "learning_rate": 0.00014700789766889836, "loss": 0.0229, "step": 2612 }, { "epoch": 0.34521253756977244, "grad_norm": 0.310825914144516, "learning_rate": 0.00014697121089814042, "loss": 0.0274, "step": 2613 }, { "epoch": 0.34534465105525647, "grad_norm": 0.28714632987976074, "learning_rate": 0.00014693451601398408, "loss": 0.025, "step": 2614 }, { "epoch": 0.3454767645407405, "grad_norm": 0.15742076933383942, "learning_rate": 0.0001468978130227676, "loss": 0.0214, "step": 2615 }, { "epoch": 0.34560887802622453, "grad_norm": 0.24934737384319305, "learning_rate": 0.0001468611019308308, "loss": 0.0278, "step": 2616 }, { "epoch": 0.34574099151170856, "grad_norm": 0.18599407374858856, "learning_rate": 0.0001468243827445148, "loss": 0.0309, "step": 2617 }, { "epoch": 0.3458731049971926, "grad_norm": 0.13141511380672455, "learning_rate": 0.0001467876554701622, "loss": 0.0152, "step": 2618 }, { "epoch": 0.3460052184826766, "grad_norm": 0.4108513295650482, "learning_rate": 0.00014675092011411689, "loss": 0.028, "step": 2619 }, { "epoch": 0.34613733196816066, "grad_norm": 0.1940169632434845, "learning_rate": 0.00014671417668272424, "loss": 0.0333, "step": 2620 }, { "epoch": 0.3462694454536447, "grad_norm": 0.24660934507846832, "learning_rate": 0.00014667742518233103, "loss": 0.0296, "step": 2621 }, { "epoch": 0.3464015589391287, "grad_norm": 0.31359758973121643, "learning_rate": 0.00014664066561928532, "loss": 0.0197, "step": 2622 }, { "epoch": 0.34653367242461275, "grad_norm": 0.24189671874046326, "learning_rate": 0.00014660389799993673, "loss": 0.0292, "step": 2623 }, { "epoch": 0.3466657859100968, "grad_norm": 0.17720560729503632, "learning_rate": 0.00014656712233063608, "loss": 0.0225, "step": 2624 }, { "epoch": 0.3467978993955808, "grad_norm": 0.2419784665107727, "learning_rate": 0.00014653033861773573, "loss": 0.0338, "step": 2625 }, { "epoch": 0.34693001288106484, "grad_norm": 0.202085480093956, "learning_rate": 0.0001464935468675894, "loss": 0.0251, "step": 2626 }, { "epoch": 0.3470621263665489, "grad_norm": 0.18005435168743134, "learning_rate": 0.00014645674708655212, "loss": 0.0188, "step": 2627 }, { "epoch": 0.3471942398520329, "grad_norm": 0.2608417272567749, "learning_rate": 0.00014641993928098042, "loss": 0.0249, "step": 2628 }, { "epoch": 0.34732635333751694, "grad_norm": 0.3798268437385559, "learning_rate": 0.0001463831234572321, "loss": 0.0262, "step": 2629 }, { "epoch": 0.34745846682300097, "grad_norm": 0.189011350274086, "learning_rate": 0.0001463462996216665, "loss": 0.0191, "step": 2630 }, { "epoch": 0.347590580308485, "grad_norm": 0.26774030923843384, "learning_rate": 0.00014630946778064415, "loss": 0.027, "step": 2631 }, { "epoch": 0.34772269379396903, "grad_norm": 0.25474798679351807, "learning_rate": 0.0001462726279405271, "loss": 0.0317, "step": 2632 }, { "epoch": 0.34785480727945306, "grad_norm": 0.2341891974210739, "learning_rate": 0.00014623578010767874, "loss": 0.0268, "step": 2633 }, { "epoch": 0.3479869207649371, "grad_norm": 0.18379223346710205, "learning_rate": 0.00014619892428846388, "loss": 0.0248, "step": 2634 }, { "epoch": 0.3481190342504211, "grad_norm": 0.21996241807937622, "learning_rate": 0.00014616206048924862, "loss": 0.0211, "step": 2635 }, { "epoch": 0.34825114773590515, "grad_norm": 0.16686783730983734, "learning_rate": 0.00014612518871640049, "loss": 0.0264, "step": 2636 }, { "epoch": 0.3483832612213892, "grad_norm": 0.20649857819080353, "learning_rate": 0.00014608830897628846, "loss": 0.0211, "step": 2637 }, { "epoch": 0.3485153747068732, "grad_norm": 0.2630440890789032, "learning_rate": 0.00014605142127528277, "loss": 0.0245, "step": 2638 }, { "epoch": 0.34864748819235725, "grad_norm": 0.1711365431547165, "learning_rate": 0.0001460145256197551, "loss": 0.0101, "step": 2639 }, { "epoch": 0.3487796016778413, "grad_norm": 0.1501888632774353, "learning_rate": 0.0001459776220160785, "loss": 0.0122, "step": 2640 }, { "epoch": 0.3489117151633253, "grad_norm": 0.3269023895263672, "learning_rate": 0.0001459407104706273, "loss": 0.0533, "step": 2641 }, { "epoch": 0.34904382864880934, "grad_norm": 0.12305083870887756, "learning_rate": 0.00014590379098977736, "loss": 0.0146, "step": 2642 }, { "epoch": 0.34917594213429337, "grad_norm": 0.5508819222450256, "learning_rate": 0.00014586686357990578, "loss": 0.024, "step": 2643 }, { "epoch": 0.3493080556197774, "grad_norm": 0.15577666461467743, "learning_rate": 0.00014582992824739113, "loss": 0.0169, "step": 2644 }, { "epoch": 0.34944016910526143, "grad_norm": 0.3088664710521698, "learning_rate": 0.00014579298499861325, "loss": 0.0216, "step": 2645 }, { "epoch": 0.34957228259074546, "grad_norm": 0.20767951011657715, "learning_rate": 0.00014575603383995344, "loss": 0.024, "step": 2646 }, { "epoch": 0.3497043960762295, "grad_norm": 0.2066233605146408, "learning_rate": 0.0001457190747777943, "loss": 0.0252, "step": 2647 }, { "epoch": 0.3498365095617135, "grad_norm": 0.3334047198295593, "learning_rate": 0.00014568210781851977, "loss": 0.0488, "step": 2648 }, { "epoch": 0.34996862304719756, "grad_norm": 0.272562175989151, "learning_rate": 0.0001456451329685153, "loss": 0.0285, "step": 2649 }, { "epoch": 0.3501007365326816, "grad_norm": 0.2628403902053833, "learning_rate": 0.0001456081502341675, "loss": 0.0179, "step": 2650 }, { "epoch": 0.3502328500181656, "grad_norm": 0.26784005761146545, "learning_rate": 0.00014557115962186452, "loss": 0.0237, "step": 2651 }, { "epoch": 0.35036496350364965, "grad_norm": 0.2535072863101959, "learning_rate": 0.00014553416113799575, "loss": 0.035, "step": 2652 }, { "epoch": 0.3504970769891337, "grad_norm": 0.14403533935546875, "learning_rate": 0.00014549715478895202, "loss": 0.0223, "step": 2653 }, { "epoch": 0.3506291904746177, "grad_norm": 0.17470040917396545, "learning_rate": 0.00014546014058112552, "loss": 0.024, "step": 2654 }, { "epoch": 0.35076130396010174, "grad_norm": 0.29080143570899963, "learning_rate": 0.00014542311852090963, "loss": 0.0245, "step": 2655 }, { "epoch": 0.3508934174455858, "grad_norm": 0.17105811834335327, "learning_rate": 0.00014538608861469938, "loss": 0.0211, "step": 2656 }, { "epoch": 0.3510255309310698, "grad_norm": 0.14506390690803528, "learning_rate": 0.00014534905086889085, "loss": 0.0197, "step": 2657 }, { "epoch": 0.35115764441655384, "grad_norm": 0.3682821989059448, "learning_rate": 0.00014531200528988174, "loss": 0.0184, "step": 2658 }, { "epoch": 0.35128975790203787, "grad_norm": 0.2343844622373581, "learning_rate": 0.0001452749518840709, "loss": 0.0227, "step": 2659 }, { "epoch": 0.3514218713875219, "grad_norm": 0.22336915135383606, "learning_rate": 0.00014523789065785866, "loss": 0.0274, "step": 2660 }, { "epoch": 0.35155398487300593, "grad_norm": 0.2997768521308899, "learning_rate": 0.00014520082161764665, "loss": 0.0301, "step": 2661 }, { "epoch": 0.35168609835848996, "grad_norm": 0.2349299192428589, "learning_rate": 0.00014516374476983775, "loss": 0.0219, "step": 2662 }, { "epoch": 0.351818211843974, "grad_norm": 0.17483294010162354, "learning_rate": 0.00014512666012083644, "loss": 0.0166, "step": 2663 }, { "epoch": 0.351950325329458, "grad_norm": 0.18980465829372406, "learning_rate": 0.00014508956767704835, "loss": 0.0191, "step": 2664 }, { "epoch": 0.35208243881494206, "grad_norm": 0.22129544615745544, "learning_rate": 0.00014505246744488047, "loss": 0.0225, "step": 2665 }, { "epoch": 0.3522145523004261, "grad_norm": 0.2243020087480545, "learning_rate": 0.00014501535943074124, "loss": 0.0411, "step": 2666 }, { "epoch": 0.3523466657859101, "grad_norm": 0.15792229771614075, "learning_rate": 0.0001449782436410403, "loss": 0.0218, "step": 2667 }, { "epoch": 0.35247877927139415, "grad_norm": 0.23417407274246216, "learning_rate": 0.00014494112008218873, "loss": 0.0248, "step": 2668 }, { "epoch": 0.3526108927568782, "grad_norm": 0.16839639842510223, "learning_rate": 0.00014490398876059897, "loss": 0.012, "step": 2669 }, { "epoch": 0.3527430062423622, "grad_norm": 0.2418825328350067, "learning_rate": 0.0001448668496826847, "loss": 0.0211, "step": 2670 }, { "epoch": 0.35287511972784624, "grad_norm": 0.17806826531887054, "learning_rate": 0.00014482970285486108, "loss": 0.0133, "step": 2671 }, { "epoch": 0.3530072332133303, "grad_norm": 0.2244100570678711, "learning_rate": 0.00014479254828354444, "loss": 0.0206, "step": 2672 }, { "epoch": 0.3531393466988143, "grad_norm": 0.1828577071428299, "learning_rate": 0.00014475538597515263, "loss": 0.0195, "step": 2673 }, { "epoch": 0.35327146018429834, "grad_norm": 0.1585683524608612, "learning_rate": 0.00014471821593610467, "loss": 0.0166, "step": 2674 }, { "epoch": 0.35340357366978237, "grad_norm": 0.18276363611221313, "learning_rate": 0.00014468103817282102, "loss": 0.0211, "step": 2675 }, { "epoch": 0.3535356871552664, "grad_norm": 0.22013506293296814, "learning_rate": 0.00014464385269172343, "loss": 0.0219, "step": 2676 }, { "epoch": 0.35366780064075043, "grad_norm": 0.3715534508228302, "learning_rate": 0.000144606659499235, "loss": 0.0198, "step": 2677 }, { "epoch": 0.35379991412623446, "grad_norm": 0.17348001897335052, "learning_rate": 0.00014456945860178019, "loss": 0.0226, "step": 2678 }, { "epoch": 0.3539320276117185, "grad_norm": 0.16937151551246643, "learning_rate": 0.00014453225000578472, "loss": 0.021, "step": 2679 }, { "epoch": 0.3540641410972025, "grad_norm": 0.17919118702411652, "learning_rate": 0.0001444950337176757, "loss": 0.0254, "step": 2680 }, { "epoch": 0.35419625458268655, "grad_norm": 0.3098297119140625, "learning_rate": 0.00014445780974388153, "loss": 0.0531, "step": 2681 }, { "epoch": 0.3543283680681706, "grad_norm": 0.1814904361963272, "learning_rate": 0.00014442057809083196, "loss": 0.0242, "step": 2682 }, { "epoch": 0.3544604815536546, "grad_norm": 0.20038634538650513, "learning_rate": 0.00014438333876495808, "loss": 0.027, "step": 2683 }, { "epoch": 0.35459259503913865, "grad_norm": 0.2531159222126007, "learning_rate": 0.00014434609177269226, "loss": 0.033, "step": 2684 }, { "epoch": 0.3547247085246227, "grad_norm": 0.18288713693618774, "learning_rate": 0.00014430883712046827, "loss": 0.023, "step": 2685 }, { "epoch": 0.35485682201010665, "grad_norm": 0.2389320433139801, "learning_rate": 0.00014427157481472112, "loss": 0.024, "step": 2686 }, { "epoch": 0.3549889354955907, "grad_norm": 0.15338629484176636, "learning_rate": 0.00014423430486188715, "loss": 0.0093, "step": 2687 }, { "epoch": 0.3551210489810747, "grad_norm": 0.28040778636932373, "learning_rate": 0.00014419702726840408, "loss": 0.036, "step": 2688 }, { "epoch": 0.35525316246655875, "grad_norm": 0.1844131052494049, "learning_rate": 0.00014415974204071093, "loss": 0.0268, "step": 2689 }, { "epoch": 0.3553852759520428, "grad_norm": 0.23858432471752167, "learning_rate": 0.00014412244918524797, "loss": 0.0274, "step": 2690 }, { "epoch": 0.3555173894375268, "grad_norm": 0.22725573182106018, "learning_rate": 0.0001440851487084569, "loss": 0.0231, "step": 2691 }, { "epoch": 0.35564950292301084, "grad_norm": 0.2215961068868637, "learning_rate": 0.0001440478406167807, "loss": 0.0254, "step": 2692 }, { "epoch": 0.35578161640849487, "grad_norm": 0.26224878430366516, "learning_rate": 0.00014401052491666357, "loss": 0.0309, "step": 2693 }, { "epoch": 0.3559137298939789, "grad_norm": 0.18535034358501434, "learning_rate": 0.00014397320161455116, "loss": 0.0211, "step": 2694 }, { "epoch": 0.35604584337946293, "grad_norm": 0.24484148621559143, "learning_rate": 0.00014393587071689034, "loss": 0.0345, "step": 2695 }, { "epoch": 0.35617795686494697, "grad_norm": 0.17378197610378265, "learning_rate": 0.00014389853223012935, "loss": 0.0159, "step": 2696 }, { "epoch": 0.356310070350431, "grad_norm": 0.28994354605674744, "learning_rate": 0.00014386118616071768, "loss": 0.0461, "step": 2697 }, { "epoch": 0.356442183835915, "grad_norm": 0.2110566794872284, "learning_rate": 0.00014382383251510618, "loss": 0.0195, "step": 2698 }, { "epoch": 0.35657429732139906, "grad_norm": 0.23441585898399353, "learning_rate": 0.00014378647129974703, "loss": 0.0268, "step": 2699 }, { "epoch": 0.3567064108068831, "grad_norm": 0.3104618191719055, "learning_rate": 0.00014374910252109362, "loss": 0.0259, "step": 2700 }, { "epoch": 0.3568385242923671, "grad_norm": 0.21284493803977966, "learning_rate": 0.00014371172618560073, "loss": 0.0296, "step": 2701 }, { "epoch": 0.35697063777785115, "grad_norm": 0.1908929944038391, "learning_rate": 0.00014367434229972445, "loss": 0.0269, "step": 2702 }, { "epoch": 0.3571027512633352, "grad_norm": 0.21304036676883698, "learning_rate": 0.00014363695086992215, "loss": 0.0278, "step": 2703 }, { "epoch": 0.3572348647488192, "grad_norm": 0.14423377811908722, "learning_rate": 0.00014359955190265246, "loss": 0.0217, "step": 2704 }, { "epoch": 0.35736697823430325, "grad_norm": 0.2462548464536667, "learning_rate": 0.00014356214540437535, "loss": 0.0264, "step": 2705 }, { "epoch": 0.3574990917197873, "grad_norm": 0.21595892310142517, "learning_rate": 0.00014352473138155215, "loss": 0.0247, "step": 2706 }, { "epoch": 0.3576312052052713, "grad_norm": 0.2566458582878113, "learning_rate": 0.00014348730984064539, "loss": 0.0322, "step": 2707 }, { "epoch": 0.35776331869075534, "grad_norm": 0.22518129646778107, "learning_rate": 0.00014344988078811893, "loss": 0.0161, "step": 2708 }, { "epoch": 0.35789543217623937, "grad_norm": 0.14789043366909027, "learning_rate": 0.00014341244423043796, "loss": 0.0127, "step": 2709 }, { "epoch": 0.3580275456617234, "grad_norm": 0.21939575672149658, "learning_rate": 0.00014337500017406899, "loss": 0.0213, "step": 2710 }, { "epoch": 0.35815965914720743, "grad_norm": 0.31216418743133545, "learning_rate": 0.00014333754862547968, "loss": 0.0146, "step": 2711 }, { "epoch": 0.35829177263269146, "grad_norm": 0.20372824370861053, "learning_rate": 0.00014330008959113915, "loss": 0.0254, "step": 2712 }, { "epoch": 0.3584238861181755, "grad_norm": 0.33901652693748474, "learning_rate": 0.00014326262307751773, "loss": 0.0356, "step": 2713 }, { "epoch": 0.3585559996036595, "grad_norm": 0.2312634438276291, "learning_rate": 0.00014322514909108708, "loss": 0.0278, "step": 2714 }, { "epoch": 0.35868811308914356, "grad_norm": 0.3031659722328186, "learning_rate": 0.0001431876676383201, "loss": 0.0255, "step": 2715 }, { "epoch": 0.3588202265746276, "grad_norm": 0.1266086995601654, "learning_rate": 0.00014315017872569105, "loss": 0.0165, "step": 2716 }, { "epoch": 0.3589523400601116, "grad_norm": 0.16536574065685272, "learning_rate": 0.00014311268235967544, "loss": 0.0149, "step": 2717 }, { "epoch": 0.35908445354559565, "grad_norm": 0.12575633823871613, "learning_rate": 0.00014307517854675, "loss": 0.0128, "step": 2718 }, { "epoch": 0.3592165670310797, "grad_norm": 0.12982703745365143, "learning_rate": 0.00014303766729339284, "loss": 0.0121, "step": 2719 }, { "epoch": 0.3593486805165637, "grad_norm": 0.18847040832042694, "learning_rate": 0.0001430001486060834, "loss": 0.0125, "step": 2720 }, { "epoch": 0.35948079400204774, "grad_norm": 0.1142876073718071, "learning_rate": 0.00014296262249130224, "loss": 0.0091, "step": 2721 }, { "epoch": 0.3596129074875318, "grad_norm": 0.18693424761295319, "learning_rate": 0.0001429250889555313, "loss": 0.0243, "step": 2722 }, { "epoch": 0.3597450209730158, "grad_norm": 0.22312524914741516, "learning_rate": 0.00014288754800525385, "loss": 0.0262, "step": 2723 }, { "epoch": 0.35987713445849984, "grad_norm": 0.2658948600292206, "learning_rate": 0.00014284999964695437, "loss": 0.0195, "step": 2724 }, { "epoch": 0.36000924794398387, "grad_norm": 0.3503406345844269, "learning_rate": 0.00014281244388711863, "loss": 0.0273, "step": 2725 }, { "epoch": 0.3601413614294679, "grad_norm": 0.22179050743579865, "learning_rate": 0.0001427748807322337, "loss": 0.018, "step": 2726 }, { "epoch": 0.36027347491495193, "grad_norm": 0.18278402090072632, "learning_rate": 0.00014273731018878784, "loss": 0.0164, "step": 2727 }, { "epoch": 0.36040558840043596, "grad_norm": 0.16992351412773132, "learning_rate": 0.00014269973226327078, "loss": 0.0191, "step": 2728 }, { "epoch": 0.36053770188592, "grad_norm": 0.16629379987716675, "learning_rate": 0.0001426621469621733, "loss": 0.0215, "step": 2729 }, { "epoch": 0.360669815371404, "grad_norm": 0.211905837059021, "learning_rate": 0.00014262455429198763, "loss": 0.0218, "step": 2730 }, { "epoch": 0.36080192885688805, "grad_norm": 0.12857602536678314, "learning_rate": 0.00014258695425920713, "loss": 0.0106, "step": 2731 }, { "epoch": 0.3609340423423721, "grad_norm": 0.1673976629972458, "learning_rate": 0.00014254934687032658, "loss": 0.0162, "step": 2732 }, { "epoch": 0.3610661558278561, "grad_norm": 0.1890508532524109, "learning_rate": 0.0001425117321318419, "loss": 0.0189, "step": 2733 }, { "epoch": 0.36119826931334015, "grad_norm": 0.24128776788711548, "learning_rate": 0.00014247411005025034, "loss": 0.0184, "step": 2734 }, { "epoch": 0.3613303827988242, "grad_norm": 0.264330118894577, "learning_rate": 0.00014243648063205042, "loss": 0.0254, "step": 2735 }, { "epoch": 0.3614624962843082, "grad_norm": 0.18204659223556519, "learning_rate": 0.0001423988438837419, "loss": 0.0303, "step": 2736 }, { "epoch": 0.36159460976979224, "grad_norm": 0.49082961678504944, "learning_rate": 0.00014236119981182589, "loss": 0.0168, "step": 2737 }, { "epoch": 0.3617267232552763, "grad_norm": 0.228216752409935, "learning_rate": 0.0001423235484228046, "loss": 0.0227, "step": 2738 }, { "epoch": 0.3618588367407603, "grad_norm": 0.24391178786754608, "learning_rate": 0.00014228588972318168, "loss": 0.0323, "step": 2739 }, { "epoch": 0.36199095022624433, "grad_norm": 0.20811639726161957, "learning_rate": 0.00014224822371946194, "loss": 0.0255, "step": 2740 }, { "epoch": 0.36212306371172837, "grad_norm": 0.21693377196788788, "learning_rate": 0.00014221055041815147, "loss": 0.0252, "step": 2741 }, { "epoch": 0.3622551771972124, "grad_norm": 0.21688027679920197, "learning_rate": 0.00014217286982575765, "loss": 0.0359, "step": 2742 }, { "epoch": 0.36238729068269643, "grad_norm": 0.1932186633348465, "learning_rate": 0.0001421351819487891, "loss": 0.0208, "step": 2743 }, { "epoch": 0.36251940416818046, "grad_norm": 0.17899347841739655, "learning_rate": 0.00014209748679375566, "loss": 0.0324, "step": 2744 }, { "epoch": 0.3626515176536645, "grad_norm": 0.20702287554740906, "learning_rate": 0.00014205978436716848, "loss": 0.0204, "step": 2745 }, { "epoch": 0.3627836311391485, "grad_norm": 0.18478304147720337, "learning_rate": 0.00014202207467553995, "loss": 0.0143, "step": 2746 }, { "epoch": 0.36291574462463255, "grad_norm": 0.2355339080095291, "learning_rate": 0.0001419843577253837, "loss": 0.0328, "step": 2747 }, { "epoch": 0.3630478581101166, "grad_norm": 0.17565876245498657, "learning_rate": 0.0001419466335232147, "loss": 0.0218, "step": 2748 }, { "epoch": 0.3631799715956006, "grad_norm": 0.35660335421562195, "learning_rate": 0.00014190890207554902, "loss": 0.0225, "step": 2749 }, { "epoch": 0.36331208508108465, "grad_norm": 0.17033158242702484, "learning_rate": 0.0001418711633889041, "loss": 0.0164, "step": 2750 }, { "epoch": 0.3634441985665687, "grad_norm": 0.17524130642414093, "learning_rate": 0.00014183341746979857, "loss": 0.0185, "step": 2751 }, { "epoch": 0.3635763120520527, "grad_norm": 0.23012326657772064, "learning_rate": 0.0001417956643247523, "loss": 0.0256, "step": 2752 }, { "epoch": 0.36370842553753674, "grad_norm": 0.3006463646888733, "learning_rate": 0.00014175790396028657, "loss": 0.0366, "step": 2753 }, { "epoch": 0.36384053902302077, "grad_norm": 0.16686567664146423, "learning_rate": 0.00014172013638292366, "loss": 0.0114, "step": 2754 }, { "epoch": 0.3639726525085048, "grad_norm": 0.2867545783519745, "learning_rate": 0.0001416823615991872, "loss": 0.0168, "step": 2755 }, { "epoch": 0.36410476599398883, "grad_norm": 0.2075122892856598, "learning_rate": 0.00014164457961560217, "loss": 0.0168, "step": 2756 }, { "epoch": 0.36423687947947286, "grad_norm": 0.13762469589710236, "learning_rate": 0.0001416067904386946, "loss": 0.0065, "step": 2757 }, { "epoch": 0.3643689929649569, "grad_norm": 0.17714820802211761, "learning_rate": 0.00014156899407499196, "loss": 0.0176, "step": 2758 }, { "epoch": 0.3645011064504409, "grad_norm": 0.37875422835350037, "learning_rate": 0.0001415311905310228, "loss": 0.0254, "step": 2759 }, { "epoch": 0.36463321993592496, "grad_norm": 0.23140572011470795, "learning_rate": 0.000141493379813317, "loss": 0.0231, "step": 2760 }, { "epoch": 0.364765333421409, "grad_norm": 0.1881604641675949, "learning_rate": 0.00014145556192840566, "loss": 0.0206, "step": 2761 }, { "epoch": 0.364897446906893, "grad_norm": 0.2795705795288086, "learning_rate": 0.00014141773688282108, "loss": 0.0337, "step": 2762 }, { "epoch": 0.36502956039237705, "grad_norm": 0.3115374445915222, "learning_rate": 0.0001413799046830969, "loss": 0.0386, "step": 2763 }, { "epoch": 0.3651616738778611, "grad_norm": 0.22487610578536987, "learning_rate": 0.0001413420653357678, "loss": 0.0394, "step": 2764 }, { "epoch": 0.3652937873633451, "grad_norm": 0.3041042387485504, "learning_rate": 0.00014130421884736997, "loss": 0.0266, "step": 2765 }, { "epoch": 0.36542590084882914, "grad_norm": 0.30442002415657043, "learning_rate": 0.00014126636522444056, "loss": 0.0325, "step": 2766 }, { "epoch": 0.3655580143343132, "grad_norm": 0.23418278992176056, "learning_rate": 0.00014122850447351816, "loss": 0.0268, "step": 2767 }, { "epoch": 0.3656901278197972, "grad_norm": 0.25879621505737305, "learning_rate": 0.00014119063660114246, "loss": 0.0317, "step": 2768 }, { "epoch": 0.36582224130528124, "grad_norm": 0.1678416132926941, "learning_rate": 0.00014115276161385444, "loss": 0.0115, "step": 2769 }, { "epoch": 0.36595435479076527, "grad_norm": 0.1974228322505951, "learning_rate": 0.0001411148795181963, "loss": 0.0336, "step": 2770 }, { "epoch": 0.3660864682762493, "grad_norm": 0.18305741250514984, "learning_rate": 0.00014107699032071144, "loss": 0.0148, "step": 2771 }, { "epoch": 0.36621858176173333, "grad_norm": 0.24715334177017212, "learning_rate": 0.00014103909402794456, "loss": 0.022, "step": 2772 }, { "epoch": 0.36635069524721736, "grad_norm": 0.25955766439437866, "learning_rate": 0.00014100119064644148, "loss": 0.0381, "step": 2773 }, { "epoch": 0.3664828087327014, "grad_norm": 0.14129871129989624, "learning_rate": 0.00014096328018274937, "loss": 0.01, "step": 2774 }, { "epoch": 0.3666149222181854, "grad_norm": 0.17851941287517548, "learning_rate": 0.00014092536264341646, "loss": 0.0146, "step": 2775 }, { "epoch": 0.36674703570366946, "grad_norm": 0.13794377446174622, "learning_rate": 0.00014088743803499236, "loss": 0.0108, "step": 2776 }, { "epoch": 0.3668791491891535, "grad_norm": 0.2250049114227295, "learning_rate": 0.00014084950636402782, "loss": 0.0031, "step": 2777 }, { "epoch": 0.3670112626746375, "grad_norm": 0.20712882280349731, "learning_rate": 0.00014081156763707484, "loss": 0.0224, "step": 2778 }, { "epoch": 0.36714337616012155, "grad_norm": 0.15495552122592926, "learning_rate": 0.00014077362186068658, "loss": 0.0232, "step": 2779 }, { "epoch": 0.3672754896456056, "grad_norm": 0.1719774305820465, "learning_rate": 0.00014073566904141755, "loss": 0.0231, "step": 2780 }, { "epoch": 0.3674076031310896, "grad_norm": 0.35171160101890564, "learning_rate": 0.0001406977091858233, "loss": 0.0271, "step": 2781 }, { "epoch": 0.36753971661657364, "grad_norm": 0.16442163288593292, "learning_rate": 0.00014065974230046075, "loss": 0.0215, "step": 2782 }, { "epoch": 0.3676718301020577, "grad_norm": 0.18466730415821075, "learning_rate": 0.00014062176839188794, "loss": 0.0191, "step": 2783 }, { "epoch": 0.3678039435875417, "grad_norm": 0.1335364282131195, "learning_rate": 0.00014058378746666417, "loss": 0.0141, "step": 2784 }, { "epoch": 0.36793605707302574, "grad_norm": 0.21508878469467163, "learning_rate": 0.0001405457995313499, "loss": 0.0217, "step": 2785 }, { "epoch": 0.36806817055850977, "grad_norm": 0.17799223959445953, "learning_rate": 0.00014050780459250685, "loss": 0.0121, "step": 2786 }, { "epoch": 0.3682002840439938, "grad_norm": 0.2416561394929886, "learning_rate": 0.00014046980265669797, "loss": 0.0308, "step": 2787 }, { "epoch": 0.36833239752947783, "grad_norm": 0.2069653421640396, "learning_rate": 0.0001404317937304873, "loss": 0.018, "step": 2788 }, { "epoch": 0.36846451101496186, "grad_norm": 0.15654677152633667, "learning_rate": 0.0001403937778204403, "loss": 0.0148, "step": 2789 }, { "epoch": 0.3685966245004459, "grad_norm": 0.20482538640499115, "learning_rate": 0.00014035575493312341, "loss": 0.0322, "step": 2790 }, { "epoch": 0.3687287379859299, "grad_norm": 0.29183149337768555, "learning_rate": 0.0001403177250751044, "loss": 0.0367, "step": 2791 }, { "epoch": 0.36886085147141395, "grad_norm": 0.19543787837028503, "learning_rate": 0.00014027968825295218, "loss": 0.0246, "step": 2792 }, { "epoch": 0.368992964956898, "grad_norm": 0.19467125833034515, "learning_rate": 0.00014024164447323697, "loss": 0.0296, "step": 2793 }, { "epoch": 0.369125078442382, "grad_norm": 0.19719550013542175, "learning_rate": 0.00014020359374253006, "loss": 0.0207, "step": 2794 }, { "epoch": 0.36925719192786605, "grad_norm": 0.20642030239105225, "learning_rate": 0.00014016553606740405, "loss": 0.0233, "step": 2795 }, { "epoch": 0.3693893054133501, "grad_norm": 0.17306675016880035, "learning_rate": 0.00014012747145443269, "loss": 0.0228, "step": 2796 }, { "epoch": 0.3695214188988341, "grad_norm": 0.19165189564228058, "learning_rate": 0.00014008939991019085, "loss": 0.0146, "step": 2797 }, { "epoch": 0.36965353238431814, "grad_norm": 0.35364142060279846, "learning_rate": 0.0001400513214412548, "loss": 0.0464, "step": 2798 }, { "epoch": 0.36978564586980217, "grad_norm": 0.16079017519950867, "learning_rate": 0.0001400132360542018, "loss": 0.016, "step": 2799 }, { "epoch": 0.3699177593552862, "grad_norm": 0.2617528736591339, "learning_rate": 0.0001399751437556104, "loss": 0.0185, "step": 2800 }, { "epoch": 0.37004987284077023, "grad_norm": 0.23893366754055023, "learning_rate": 0.00013993704455206034, "loss": 0.027, "step": 2801 }, { "epoch": 0.37018198632625426, "grad_norm": 0.24953344464302063, "learning_rate": 0.00013989893845013255, "loss": 0.0291, "step": 2802 }, { "epoch": 0.3703140998117383, "grad_norm": 0.27270838618278503, "learning_rate": 0.00013986082545640915, "loss": 0.043, "step": 2803 }, { "epoch": 0.3704462132972223, "grad_norm": 0.29569360613822937, "learning_rate": 0.00013982270557747343, "loss": 0.0282, "step": 2804 }, { "epoch": 0.37057832678270636, "grad_norm": 0.25201651453971863, "learning_rate": 0.00013978457881990992, "loss": 0.0146, "step": 2805 }, { "epoch": 0.3707104402681904, "grad_norm": 0.17621220648288727, "learning_rate": 0.0001397464451903043, "loss": 0.0171, "step": 2806 }, { "epoch": 0.3708425537536744, "grad_norm": 0.2043362408876419, "learning_rate": 0.00013970830469524337, "loss": 0.0305, "step": 2807 }, { "epoch": 0.37097466723915845, "grad_norm": 0.1952420324087143, "learning_rate": 0.00013967015734131527, "loss": 0.0134, "step": 2808 }, { "epoch": 0.3711067807246425, "grad_norm": 0.2854112386703491, "learning_rate": 0.0001396320031351092, "loss": 0.0349, "step": 2809 }, { "epoch": 0.3712388942101265, "grad_norm": 0.1703397035598755, "learning_rate": 0.00013959384208321558, "loss": 0.0166, "step": 2810 }, { "epoch": 0.37137100769561054, "grad_norm": 0.2521183490753174, "learning_rate": 0.00013955567419222606, "loss": 0.0367, "step": 2811 }, { "epoch": 0.3715031211810946, "grad_norm": 0.20728209614753723, "learning_rate": 0.0001395174994687334, "loss": 0.0177, "step": 2812 }, { "epoch": 0.3716352346665786, "grad_norm": 0.09481014311313629, "learning_rate": 0.0001394793179193316, "loss": 0.0135, "step": 2813 }, { "epoch": 0.37176734815206264, "grad_norm": 0.17332907021045685, "learning_rate": 0.00013944112955061575, "loss": 0.0194, "step": 2814 }, { "epoch": 0.37189946163754667, "grad_norm": 0.1680062860250473, "learning_rate": 0.00013940293436918226, "loss": 0.0282, "step": 2815 }, { "epoch": 0.3720315751230307, "grad_norm": 0.1549157202243805, "learning_rate": 0.00013936473238162854, "loss": 0.0146, "step": 2816 }, { "epoch": 0.37216368860851473, "grad_norm": 0.14746297895908356, "learning_rate": 0.00013932652359455335, "loss": 0.018, "step": 2817 }, { "epoch": 0.37229580209399876, "grad_norm": 0.206549271941185, "learning_rate": 0.00013928830801455649, "loss": 0.0292, "step": 2818 }, { "epoch": 0.3724279155794828, "grad_norm": 0.16838571429252625, "learning_rate": 0.00013925008564823899, "loss": 0.0188, "step": 2819 }, { "epoch": 0.3725600290649668, "grad_norm": 0.1660108119249344, "learning_rate": 0.00013921185650220307, "loss": 0.0212, "step": 2820 }, { "epoch": 0.37269214255045086, "grad_norm": 0.2400975227355957, "learning_rate": 0.0001391736205830521, "loss": 0.0206, "step": 2821 }, { "epoch": 0.3728242560359349, "grad_norm": 0.18147924542427063, "learning_rate": 0.00013913537789739063, "loss": 0.0146, "step": 2822 }, { "epoch": 0.3729563695214189, "grad_norm": 0.17276743054389954, "learning_rate": 0.00013909712845182432, "loss": 0.0128, "step": 2823 }, { "epoch": 0.37308848300690295, "grad_norm": 0.17892836034297943, "learning_rate": 0.0001390588722529601, "loss": 0.0267, "step": 2824 }, { "epoch": 0.373220596492387, "grad_norm": 0.19831377267837524, "learning_rate": 0.00013902060930740602, "loss": 0.0202, "step": 2825 }, { "epoch": 0.373352709977871, "grad_norm": 0.2546497583389282, "learning_rate": 0.0001389823396217712, "loss": 0.0253, "step": 2826 }, { "epoch": 0.37348482346335504, "grad_norm": 0.20438458025455475, "learning_rate": 0.00013894406320266614, "loss": 0.0191, "step": 2827 }, { "epoch": 0.3736169369488391, "grad_norm": 0.165092334151268, "learning_rate": 0.00013890578005670224, "loss": 0.0233, "step": 2828 }, { "epoch": 0.3737490504343231, "grad_norm": 0.16347329318523407, "learning_rate": 0.00013886749019049232, "loss": 0.0255, "step": 2829 }, { "epoch": 0.37388116391980714, "grad_norm": 0.1961696892976761, "learning_rate": 0.00013882919361065014, "loss": 0.0123, "step": 2830 }, { "epoch": 0.37401327740529117, "grad_norm": 0.15506936609745026, "learning_rate": 0.0001387908903237908, "loss": 0.0189, "step": 2831 }, { "epoch": 0.3741453908907752, "grad_norm": 0.1768009215593338, "learning_rate": 0.00013875258033653045, "loss": 0.0144, "step": 2832 }, { "epoch": 0.37427750437625923, "grad_norm": 0.19626346230506897, "learning_rate": 0.0001387142636554864, "loss": 0.014, "step": 2833 }, { "epoch": 0.37440961786174326, "grad_norm": 0.22314384579658508, "learning_rate": 0.00013867594028727715, "loss": 0.0168, "step": 2834 }, { "epoch": 0.3745417313472273, "grad_norm": 0.2579716444015503, "learning_rate": 0.00013863761023852233, "loss": 0.0249, "step": 2835 }, { "epoch": 0.3746738448327113, "grad_norm": 0.14838150143623352, "learning_rate": 0.00013859927351584278, "loss": 0.0185, "step": 2836 }, { "epoch": 0.37480595831819535, "grad_norm": 0.20027662813663483, "learning_rate": 0.00013856093012586045, "loss": 0.0222, "step": 2837 }, { "epoch": 0.3749380718036794, "grad_norm": 0.337117463350296, "learning_rate": 0.00013852258007519838, "loss": 0.0249, "step": 2838 }, { "epoch": 0.3750701852891634, "grad_norm": 0.20685425400733948, "learning_rate": 0.0001384842233704809, "loss": 0.0206, "step": 2839 }, { "epoch": 0.37520229877464745, "grad_norm": 0.20184406638145447, "learning_rate": 0.00013844586001833338, "loss": 0.0198, "step": 2840 }, { "epoch": 0.3753344122601315, "grad_norm": 0.21701376140117645, "learning_rate": 0.00013840749002538236, "loss": 0.021, "step": 2841 }, { "epoch": 0.3754665257456155, "grad_norm": 0.2415953427553177, "learning_rate": 0.00013836911339825558, "loss": 0.0219, "step": 2842 }, { "epoch": 0.37559863923109954, "grad_norm": 0.29480111598968506, "learning_rate": 0.00013833073014358184, "loss": 0.0213, "step": 2843 }, { "epoch": 0.37573075271658357, "grad_norm": 0.251859575510025, "learning_rate": 0.00013829234026799115, "loss": 0.0293, "step": 2844 }, { "epoch": 0.3758628662020676, "grad_norm": 0.23018434643745422, "learning_rate": 0.00013825394377811465, "loss": 0.0311, "step": 2845 }, { "epoch": 0.37599497968755163, "grad_norm": 0.21823716163635254, "learning_rate": 0.0001382155406805846, "loss": 0.0326, "step": 2846 }, { "epoch": 0.37612709317303566, "grad_norm": 0.2705758213996887, "learning_rate": 0.00013817713098203442, "loss": 0.0168, "step": 2847 }, { "epoch": 0.3762592066585197, "grad_norm": 0.1876915544271469, "learning_rate": 0.00013813871468909867, "loss": 0.0201, "step": 2848 }, { "epoch": 0.37639132014400367, "grad_norm": 0.19940240681171417, "learning_rate": 0.00013810029180841307, "loss": 0.0202, "step": 2849 }, { "epoch": 0.3765234336294877, "grad_norm": 0.22496294975280762, "learning_rate": 0.0001380618623466144, "loss": 0.0335, "step": 2850 }, { "epoch": 0.37665554711497173, "grad_norm": 0.31435123085975647, "learning_rate": 0.00013802342631034068, "loss": 0.0101, "step": 2851 }, { "epoch": 0.37678766060045576, "grad_norm": 0.14207758009433746, "learning_rate": 0.000137984983706231, "loss": 0.0152, "step": 2852 }, { "epoch": 0.3769197740859398, "grad_norm": 0.18659062683582306, "learning_rate": 0.00013794653454092559, "loss": 0.0213, "step": 2853 }, { "epoch": 0.3770518875714238, "grad_norm": 0.3208668529987335, "learning_rate": 0.0001379080788210658, "loss": 0.0271, "step": 2854 }, { "epoch": 0.37718400105690786, "grad_norm": 0.2228097915649414, "learning_rate": 0.00013786961655329425, "loss": 0.0354, "step": 2855 }, { "epoch": 0.3773161145423919, "grad_norm": 0.19546012580394745, "learning_rate": 0.00013783114774425448, "loss": 0.0229, "step": 2856 }, { "epoch": 0.3774482280278759, "grad_norm": 0.3273359537124634, "learning_rate": 0.00013779267240059123, "loss": 0.0272, "step": 2857 }, { "epoch": 0.37758034151335995, "grad_norm": 0.266356885433197, "learning_rate": 0.0001377541905289505, "loss": 0.0136, "step": 2858 }, { "epoch": 0.377712454998844, "grad_norm": 0.22096264362335205, "learning_rate": 0.0001377157021359792, "loss": 0.0327, "step": 2859 }, { "epoch": 0.377844568484328, "grad_norm": 0.1980806142091751, "learning_rate": 0.00013767720722832557, "loss": 0.0131, "step": 2860 }, { "epoch": 0.37797668196981205, "grad_norm": 0.16720090806484222, "learning_rate": 0.00013763870581263882, "loss": 0.0253, "step": 2861 }, { "epoch": 0.3781087954552961, "grad_norm": 0.1335599720478058, "learning_rate": 0.00013760019789556944, "loss": 0.0214, "step": 2862 }, { "epoch": 0.3782409089407801, "grad_norm": 0.20969679951667786, "learning_rate": 0.00013756168348376884, "loss": 0.0371, "step": 2863 }, { "epoch": 0.37837302242626414, "grad_norm": 0.17305636405944824, "learning_rate": 0.00013752316258388976, "loss": 0.0207, "step": 2864 }, { "epoch": 0.37850513591174817, "grad_norm": 0.29161009192466736, "learning_rate": 0.0001374846352025859, "loss": 0.0263, "step": 2865 }, { "epoch": 0.3786372493972322, "grad_norm": 0.21840567886829376, "learning_rate": 0.00013744610134651218, "loss": 0.0178, "step": 2866 }, { "epoch": 0.37876936288271623, "grad_norm": 0.1723618358373642, "learning_rate": 0.00013740756102232458, "loss": 0.0219, "step": 2867 }, { "epoch": 0.37890147636820026, "grad_norm": 0.1875240057706833, "learning_rate": 0.00013736901423668023, "loss": 0.02, "step": 2868 }, { "epoch": 0.3790335898536843, "grad_norm": 0.19212710857391357, "learning_rate": 0.00013733046099623737, "loss": 0.0179, "step": 2869 }, { "epoch": 0.3791657033391683, "grad_norm": 0.1740020364522934, "learning_rate": 0.00013729190130765538, "loss": 0.0285, "step": 2870 }, { "epoch": 0.37929781682465236, "grad_norm": 0.15244339406490326, "learning_rate": 0.00013725333517759463, "loss": 0.0195, "step": 2871 }, { "epoch": 0.3794299303101364, "grad_norm": 0.17750827968120575, "learning_rate": 0.00013721476261271685, "loss": 0.0193, "step": 2872 }, { "epoch": 0.3795620437956204, "grad_norm": 0.2849121391773224, "learning_rate": 0.0001371761836196846, "loss": 0.0382, "step": 2873 }, { "epoch": 0.37969415728110445, "grad_norm": 0.18970602750778198, "learning_rate": 0.00013713759820516171, "loss": 0.0157, "step": 2874 }, { "epoch": 0.3798262707665885, "grad_norm": 0.16193434596061707, "learning_rate": 0.0001370990063758131, "loss": 0.0185, "step": 2875 }, { "epoch": 0.3799583842520725, "grad_norm": 0.18522220849990845, "learning_rate": 0.00013706040813830483, "loss": 0.0232, "step": 2876 }, { "epoch": 0.38009049773755654, "grad_norm": 0.15757068991661072, "learning_rate": 0.00013702180349930396, "loss": 0.013, "step": 2877 }, { "epoch": 0.3802226112230406, "grad_norm": 0.14084728062152863, "learning_rate": 0.00013698319246547872, "loss": 0.0189, "step": 2878 }, { "epoch": 0.3803547247085246, "grad_norm": 0.1844385862350464, "learning_rate": 0.0001369445750434985, "loss": 0.0257, "step": 2879 }, { "epoch": 0.38048683819400864, "grad_norm": 0.11293325573205948, "learning_rate": 0.0001369059512400337, "loss": 0.0131, "step": 2880 }, { "epoch": 0.38061895167949267, "grad_norm": 0.22674749791622162, "learning_rate": 0.00013686732106175587, "loss": 0.0245, "step": 2881 }, { "epoch": 0.3807510651649767, "grad_norm": 0.2409743368625641, "learning_rate": 0.0001368286845153376, "loss": 0.0207, "step": 2882 }, { "epoch": 0.38088317865046073, "grad_norm": 0.15085680782794952, "learning_rate": 0.00013679004160745272, "loss": 0.0232, "step": 2883 }, { "epoch": 0.38101529213594476, "grad_norm": 0.15232190489768982, "learning_rate": 0.00013675139234477603, "loss": 0.0161, "step": 2884 }, { "epoch": 0.3811474056214288, "grad_norm": 0.17391395568847656, "learning_rate": 0.0001367127367339834, "loss": 0.0147, "step": 2885 }, { "epoch": 0.3812795191069128, "grad_norm": 0.4284667670726776, "learning_rate": 0.000136674074781752, "loss": 0.0393, "step": 2886 }, { "epoch": 0.38141163259239685, "grad_norm": 0.20392243564128876, "learning_rate": 0.00013663540649475986, "loss": 0.0149, "step": 2887 }, { "epoch": 0.3815437460778809, "grad_norm": 0.14438848197460175, "learning_rate": 0.00013659673187968623, "loss": 0.0195, "step": 2888 }, { "epoch": 0.3816758595633649, "grad_norm": 0.27825459837913513, "learning_rate": 0.00013655805094321143, "loss": 0.0132, "step": 2889 }, { "epoch": 0.38180797304884895, "grad_norm": 0.13513772189617157, "learning_rate": 0.00013651936369201686, "loss": 0.0195, "step": 2890 }, { "epoch": 0.381940086534333, "grad_norm": 0.1930549293756485, "learning_rate": 0.00013648067013278503, "loss": 0.025, "step": 2891 }, { "epoch": 0.382072200019817, "grad_norm": 0.18280315399169922, "learning_rate": 0.0001364419702721995, "loss": 0.026, "step": 2892 }, { "epoch": 0.38220431350530104, "grad_norm": 0.13674134016036987, "learning_rate": 0.000136403264116945, "loss": 0.009, "step": 2893 }, { "epoch": 0.38233642699078507, "grad_norm": 0.24852058291435242, "learning_rate": 0.00013636455167370724, "loss": 0.0282, "step": 2894 }, { "epoch": 0.3824685404762691, "grad_norm": 0.4249047636985779, "learning_rate": 0.00013632583294917314, "loss": 0.0331, "step": 2895 }, { "epoch": 0.38260065396175313, "grad_norm": 0.19219887256622314, "learning_rate": 0.00013628710795003055, "loss": 0.0248, "step": 2896 }, { "epoch": 0.38273276744723717, "grad_norm": 0.1995341181755066, "learning_rate": 0.00013624837668296854, "loss": 0.0237, "step": 2897 }, { "epoch": 0.3828648809327212, "grad_norm": 0.21090322732925415, "learning_rate": 0.00013620963915467722, "loss": 0.0203, "step": 2898 }, { "epoch": 0.3829969944182052, "grad_norm": 0.2628128230571747, "learning_rate": 0.00013617089537184776, "loss": 0.0318, "step": 2899 }, { "epoch": 0.38312910790368926, "grad_norm": 0.3117437958717346, "learning_rate": 0.00013613214534117238, "loss": 0.0215, "step": 2900 }, { "epoch": 0.3832612213891733, "grad_norm": 0.22514411807060242, "learning_rate": 0.0001360933890693445, "loss": 0.0198, "step": 2901 }, { "epoch": 0.3833933348746573, "grad_norm": 0.19629830121994019, "learning_rate": 0.0001360546265630585, "loss": 0.0251, "step": 2902 }, { "epoch": 0.38352544836014135, "grad_norm": 0.16926242411136627, "learning_rate": 0.00013601585782900988, "loss": 0.0229, "step": 2903 }, { "epoch": 0.3836575618456254, "grad_norm": 0.1834266632795334, "learning_rate": 0.00013597708287389524, "loss": 0.0331, "step": 2904 }, { "epoch": 0.3837896753311094, "grad_norm": 0.18873266875743866, "learning_rate": 0.00013593830170441218, "loss": 0.0309, "step": 2905 }, { "epoch": 0.38392178881659345, "grad_norm": 0.1950349062681198, "learning_rate": 0.00013589951432725947, "loss": 0.0213, "step": 2906 }, { "epoch": 0.3840539023020775, "grad_norm": 0.18346445262432098, "learning_rate": 0.00013586072074913685, "loss": 0.0185, "step": 2907 }, { "epoch": 0.3841860157875615, "grad_norm": 0.3406682014465332, "learning_rate": 0.00013582192097674525, "loss": 0.0222, "step": 2908 }, { "epoch": 0.38431812927304554, "grad_norm": 0.2803246080875397, "learning_rate": 0.00013578311501678657, "loss": 0.0266, "step": 2909 }, { "epoch": 0.38445024275852957, "grad_norm": 0.24295170605182648, "learning_rate": 0.0001357443028759638, "loss": 0.0288, "step": 2910 }, { "epoch": 0.3845823562440136, "grad_norm": 0.21152372658252716, "learning_rate": 0.00013570548456098104, "loss": 0.0256, "step": 2911 }, { "epoch": 0.38471446972949763, "grad_norm": 0.2268865704536438, "learning_rate": 0.00013566666007854342, "loss": 0.028, "step": 2912 }, { "epoch": 0.38484658321498166, "grad_norm": 0.2392731010913849, "learning_rate": 0.00013562782943535716, "loss": 0.0198, "step": 2913 }, { "epoch": 0.3849786967004657, "grad_norm": 0.13897490501403809, "learning_rate": 0.00013558899263812945, "loss": 0.0218, "step": 2914 }, { "epoch": 0.3851108101859497, "grad_norm": 0.1503855586051941, "learning_rate": 0.00013555014969356873, "loss": 0.0134, "step": 2915 }, { "epoch": 0.38524292367143376, "grad_norm": 0.15839877724647522, "learning_rate": 0.00013551130060838432, "loss": 0.0182, "step": 2916 }, { "epoch": 0.3853750371569178, "grad_norm": 0.23495671153068542, "learning_rate": 0.00013547244538928668, "loss": 0.0315, "step": 2917 }, { "epoch": 0.3855071506424018, "grad_norm": 0.17712470889091492, "learning_rate": 0.00013543358404298736, "loss": 0.0197, "step": 2918 }, { "epoch": 0.38563926412788585, "grad_norm": 0.23429818451404572, "learning_rate": 0.00013539471657619893, "loss": 0.0242, "step": 2919 }, { "epoch": 0.3857713776133699, "grad_norm": 0.16615261137485504, "learning_rate": 0.00013535584299563498, "loss": 0.0273, "step": 2920 }, { "epoch": 0.3859034910988539, "grad_norm": 0.20191910862922668, "learning_rate": 0.00013531696330801017, "loss": 0.0208, "step": 2921 }, { "epoch": 0.38603560458433794, "grad_norm": 0.20236913859844208, "learning_rate": 0.00013527807752004034, "loss": 0.0225, "step": 2922 }, { "epoch": 0.386167718069822, "grad_norm": 0.18063655495643616, "learning_rate": 0.00013523918563844217, "loss": 0.0239, "step": 2923 }, { "epoch": 0.386299831555306, "grad_norm": 0.21345975995063782, "learning_rate": 0.0001352002876699336, "loss": 0.0273, "step": 2924 }, { "epoch": 0.38643194504079004, "grad_norm": 0.20318594574928284, "learning_rate": 0.00013516138362123346, "loss": 0.0247, "step": 2925 }, { "epoch": 0.38656405852627407, "grad_norm": 0.1923476904630661, "learning_rate": 0.0001351224734990617, "loss": 0.0237, "step": 2926 }, { "epoch": 0.3866961720117581, "grad_norm": 0.26162242889404297, "learning_rate": 0.00013508355731013937, "loss": 0.026, "step": 2927 }, { "epoch": 0.38682828549724213, "grad_norm": 0.41296452283859253, "learning_rate": 0.00013504463506118847, "loss": 0.0247, "step": 2928 }, { "epoch": 0.38696039898272616, "grad_norm": 0.1592453122138977, "learning_rate": 0.0001350057067589321, "loss": 0.0149, "step": 2929 }, { "epoch": 0.3870925124682102, "grad_norm": 0.20988403260707855, "learning_rate": 0.00013496677241009442, "loss": 0.0237, "step": 2930 }, { "epoch": 0.3872246259536942, "grad_norm": 0.2678348422050476, "learning_rate": 0.00013492783202140058, "loss": 0.0307, "step": 2931 }, { "epoch": 0.38735673943917825, "grad_norm": 0.1757078468799591, "learning_rate": 0.00013488888559957683, "loss": 0.0338, "step": 2932 }, { "epoch": 0.3874888529246623, "grad_norm": 0.19529110193252563, "learning_rate": 0.00013484993315135036, "loss": 0.0296, "step": 2933 }, { "epoch": 0.3876209664101463, "grad_norm": 0.21215179562568665, "learning_rate": 0.0001348109746834496, "loss": 0.0217, "step": 2934 }, { "epoch": 0.38775307989563035, "grad_norm": 0.1827051341533661, "learning_rate": 0.0001347720102026038, "loss": 0.0193, "step": 2935 }, { "epoch": 0.3878851933811144, "grad_norm": 0.1964787244796753, "learning_rate": 0.00013473303971554338, "loss": 0.0165, "step": 2936 }, { "epoch": 0.3880173068665984, "grad_norm": 0.21198433637619019, "learning_rate": 0.0001346940632289998, "loss": 0.0249, "step": 2937 }, { "epoch": 0.38814942035208244, "grad_norm": 0.2188272625207901, "learning_rate": 0.00013465508074970544, "loss": 0.0178, "step": 2938 }, { "epoch": 0.3882815338375665, "grad_norm": 0.17410850524902344, "learning_rate": 0.00013461609228439386, "loss": 0.0277, "step": 2939 }, { "epoch": 0.3884136473230505, "grad_norm": 0.2147447019815445, "learning_rate": 0.00013457709783979956, "loss": 0.0237, "step": 2940 }, { "epoch": 0.38854576080853453, "grad_norm": 0.19408878684043884, "learning_rate": 0.0001345380974226581, "loss": 0.0174, "step": 2941 }, { "epoch": 0.38867787429401857, "grad_norm": 0.17176926136016846, "learning_rate": 0.00013449909103970605, "loss": 0.0173, "step": 2942 }, { "epoch": 0.3888099877795026, "grad_norm": 0.16017231345176697, "learning_rate": 0.00013446007869768109, "loss": 0.0247, "step": 2943 }, { "epoch": 0.38894210126498663, "grad_norm": 0.24421115219593048, "learning_rate": 0.00013442106040332182, "loss": 0.0284, "step": 2944 }, { "epoch": 0.38907421475047066, "grad_norm": 0.21438637375831604, "learning_rate": 0.00013438203616336798, "loss": 0.0236, "step": 2945 }, { "epoch": 0.3892063282359547, "grad_norm": 0.3065054416656494, "learning_rate": 0.00013434300598456024, "loss": 0.0249, "step": 2946 }, { "epoch": 0.3893384417214387, "grad_norm": 0.15419889986515045, "learning_rate": 0.00013430396987364033, "loss": 0.0155, "step": 2947 }, { "epoch": 0.38947055520692275, "grad_norm": 0.20932449400424957, "learning_rate": 0.00013426492783735102, "loss": 0.0212, "step": 2948 }, { "epoch": 0.3896026686924068, "grad_norm": 0.22069528698921204, "learning_rate": 0.00013422587988243606, "loss": 0.0219, "step": 2949 }, { "epoch": 0.3897347821778908, "grad_norm": 0.16955885291099548, "learning_rate": 0.00013418682601564033, "loss": 0.0147, "step": 2950 }, { "epoch": 0.38986689566337485, "grad_norm": 0.23010531067848206, "learning_rate": 0.00013414776624370958, "loss": 0.0245, "step": 2951 }, { "epoch": 0.3899990091488589, "grad_norm": 0.2735253870487213, "learning_rate": 0.00013410870057339067, "loss": 0.0251, "step": 2952 }, { "epoch": 0.3901311226343429, "grad_norm": 0.20499961078166962, "learning_rate": 0.00013406962901143146, "loss": 0.0159, "step": 2953 }, { "epoch": 0.39026323611982694, "grad_norm": 0.2517299950122833, "learning_rate": 0.0001340305515645809, "loss": 0.0217, "step": 2954 }, { "epoch": 0.39039534960531097, "grad_norm": 0.13152022659778595, "learning_rate": 0.00013399146823958878, "loss": 0.0143, "step": 2955 }, { "epoch": 0.390527463090795, "grad_norm": 0.23188278079032898, "learning_rate": 0.0001339523790432061, "loss": 0.0208, "step": 2956 }, { "epoch": 0.39065957657627903, "grad_norm": 0.20910461246967316, "learning_rate": 0.00013391328398218474, "loss": 0.0316, "step": 2957 }, { "epoch": 0.39079169006176306, "grad_norm": 0.13782212138175964, "learning_rate": 0.0001338741830632777, "loss": 0.015, "step": 2958 }, { "epoch": 0.3909238035472471, "grad_norm": 0.22674843668937683, "learning_rate": 0.00013383507629323883, "loss": 0.0252, "step": 2959 }, { "epoch": 0.3910559170327311, "grad_norm": 0.19780975580215454, "learning_rate": 0.0001337959636788232, "loss": 0.0251, "step": 2960 }, { "epoch": 0.39118803051821516, "grad_norm": 0.21379712224006653, "learning_rate": 0.00013375684522678674, "loss": 0.0199, "step": 2961 }, { "epoch": 0.3913201440036992, "grad_norm": 0.22542434930801392, "learning_rate": 0.0001337177209438864, "loss": 0.0288, "step": 2962 }, { "epoch": 0.3914522574891832, "grad_norm": 0.14902955293655396, "learning_rate": 0.0001336785908368802, "loss": 0.0119, "step": 2963 }, { "epoch": 0.39158437097466725, "grad_norm": 0.38387978076934814, "learning_rate": 0.00013363945491252715, "loss": 0.0341, "step": 2964 }, { "epoch": 0.3917164844601513, "grad_norm": 0.17453815042972565, "learning_rate": 0.0001336003131775872, "loss": 0.0222, "step": 2965 }, { "epoch": 0.3918485979456353, "grad_norm": 0.23411858081817627, "learning_rate": 0.00013356116563882143, "loss": 0.0224, "step": 2966 }, { "epoch": 0.39198071143111934, "grad_norm": 0.17240062355995178, "learning_rate": 0.00013352201230299176, "loss": 0.0228, "step": 2967 }, { "epoch": 0.3921128249166034, "grad_norm": 0.18300558626651764, "learning_rate": 0.00013348285317686123, "loss": 0.0235, "step": 2968 }, { "epoch": 0.3922449384020874, "grad_norm": 0.3427295386791229, "learning_rate": 0.00013344368826719388, "loss": 0.0277, "step": 2969 }, { "epoch": 0.39237705188757144, "grad_norm": 0.45966774225234985, "learning_rate": 0.00013340451758075468, "loss": 0.0399, "step": 2970 }, { "epoch": 0.39250916537305547, "grad_norm": 0.26486414670944214, "learning_rate": 0.0001333653411243096, "loss": 0.0185, "step": 2971 }, { "epoch": 0.3926412788585395, "grad_norm": 0.18045035004615784, "learning_rate": 0.00013332615890462575, "loss": 0.0186, "step": 2972 }, { "epoch": 0.39277339234402353, "grad_norm": 0.28566309809684753, "learning_rate": 0.000133286970928471, "loss": 0.0348, "step": 2973 }, { "epoch": 0.39290550582950756, "grad_norm": 0.209380641579628, "learning_rate": 0.00013324777720261443, "loss": 0.0149, "step": 2974 }, { "epoch": 0.3930376193149916, "grad_norm": 0.40362977981567383, "learning_rate": 0.00013320857773382598, "loss": 0.0138, "step": 2975 }, { "epoch": 0.3931697328004756, "grad_norm": 0.1985737681388855, "learning_rate": 0.00013316937252887665, "loss": 0.0188, "step": 2976 }, { "epoch": 0.39330184628595966, "grad_norm": 0.22340957820415497, "learning_rate": 0.0001331301615945384, "loss": 0.0262, "step": 2977 }, { "epoch": 0.3934339597714437, "grad_norm": 0.2279328852891922, "learning_rate": 0.00013309094493758411, "loss": 0.0186, "step": 2978 }, { "epoch": 0.3935660732569277, "grad_norm": 0.29355117678642273, "learning_rate": 0.00013305172256478787, "loss": 0.0292, "step": 2979 }, { "epoch": 0.39369818674241175, "grad_norm": 0.4233926832675934, "learning_rate": 0.0001330124944829245, "loss": 0.0333, "step": 2980 }, { "epoch": 0.3938303002278958, "grad_norm": 0.16877886652946472, "learning_rate": 0.00013297326069876996, "loss": 0.0214, "step": 2981 }, { "epoch": 0.3939624137133798, "grad_norm": 0.2501515746116638, "learning_rate": 0.00013293402121910113, "loss": 0.0257, "step": 2982 }, { "epoch": 0.39409452719886384, "grad_norm": 0.26275867223739624, "learning_rate": 0.00013289477605069595, "loss": 0.0245, "step": 2983 }, { "epoch": 0.3942266406843479, "grad_norm": 0.21009047329425812, "learning_rate": 0.00013285552520033318, "loss": 0.0286, "step": 2984 }, { "epoch": 0.3943587541698319, "grad_norm": 0.1921718567609787, "learning_rate": 0.00013281626867479276, "loss": 0.0221, "step": 2985 }, { "epoch": 0.39449086765531594, "grad_norm": 0.23634794354438782, "learning_rate": 0.00013277700648085556, "loss": 0.0263, "step": 2986 }, { "epoch": 0.39462298114079997, "grad_norm": 0.3247498571872711, "learning_rate": 0.0001327377386253033, "loss": 0.0315, "step": 2987 }, { "epoch": 0.394755094626284, "grad_norm": 0.21689558029174805, "learning_rate": 0.0001326984651149188, "loss": 0.0248, "step": 2988 }, { "epoch": 0.39488720811176803, "grad_norm": 0.23579855263233185, "learning_rate": 0.0001326591859564858, "loss": 0.0239, "step": 2989 }, { "epoch": 0.39501932159725206, "grad_norm": 0.21000924706459045, "learning_rate": 0.00013261990115678905, "loss": 0.0137, "step": 2990 }, { "epoch": 0.3951514350827361, "grad_norm": 0.15919966995716095, "learning_rate": 0.00013258061072261433, "loss": 0.0173, "step": 2991 }, { "epoch": 0.3952835485682201, "grad_norm": 0.15629446506500244, "learning_rate": 0.00013254131466074824, "loss": 0.0201, "step": 2992 }, { "epoch": 0.39541566205370415, "grad_norm": 0.20342610776424408, "learning_rate": 0.0001325020129779785, "loss": 0.0345, "step": 2993 }, { "epoch": 0.3955477755391882, "grad_norm": 0.17629854381084442, "learning_rate": 0.00013246270568109374, "loss": 0.0189, "step": 2994 }, { "epoch": 0.3956798890246722, "grad_norm": 0.18273794651031494, "learning_rate": 0.0001324233927768835, "loss": 0.0168, "step": 2995 }, { "epoch": 0.39581200251015625, "grad_norm": 0.28699859976768494, "learning_rate": 0.0001323840742721384, "loss": 0.037, "step": 2996 }, { "epoch": 0.3959441159956403, "grad_norm": 0.1269131451845169, "learning_rate": 0.00013234475017364993, "loss": 0.0137, "step": 2997 }, { "epoch": 0.3960762294811243, "grad_norm": 0.30667126178741455, "learning_rate": 0.00013230542048821067, "loss": 0.0324, "step": 2998 }, { "epoch": 0.39620834296660834, "grad_norm": 0.30856379866600037, "learning_rate": 0.000132266085222614, "loss": 0.0181, "step": 2999 }, { "epoch": 0.39634045645209237, "grad_norm": 0.21341513097286224, "learning_rate": 0.00013222674438365442, "loss": 0.0209, "step": 3000 }, { "epoch": 0.3964725699375764, "grad_norm": 0.22583794593811035, "learning_rate": 0.00013218739797812731, "loss": 0.0271, "step": 3001 }, { "epoch": 0.39660468342306043, "grad_norm": 0.19691592454910278, "learning_rate": 0.00013214804601282903, "loss": 0.0245, "step": 3002 }, { "epoch": 0.39673679690854446, "grad_norm": 0.15087178349494934, "learning_rate": 0.00013210868849455686, "loss": 0.0209, "step": 3003 }, { "epoch": 0.3968689103940285, "grad_norm": 0.196724995970726, "learning_rate": 0.0001320693254301091, "loss": 0.0231, "step": 3004 }, { "epoch": 0.3970010238795125, "grad_norm": 0.28341200947761536, "learning_rate": 0.00013202995682628496, "loss": 0.0291, "step": 3005 }, { "epoch": 0.39713313736499656, "grad_norm": 0.1786683052778244, "learning_rate": 0.00013199058268988467, "loss": 0.0194, "step": 3006 }, { "epoch": 0.3972652508504806, "grad_norm": 0.19349396228790283, "learning_rate": 0.00013195120302770936, "loss": 0.0226, "step": 3007 }, { "epoch": 0.3973973643359646, "grad_norm": 0.13992834091186523, "learning_rate": 0.00013191181784656114, "loss": 0.0215, "step": 3008 }, { "epoch": 0.39752947782144865, "grad_norm": 0.14852946996688843, "learning_rate": 0.000131872427153243, "loss": 0.0172, "step": 3009 }, { "epoch": 0.3976615913069327, "grad_norm": 0.24151255190372467, "learning_rate": 0.00013183303095455905, "loss": 0.0317, "step": 3010 }, { "epoch": 0.3977937047924167, "grad_norm": 0.22274500131607056, "learning_rate": 0.00013179362925731415, "loss": 0.0186, "step": 3011 }, { "epoch": 0.3979258182779007, "grad_norm": 0.2599247694015503, "learning_rate": 0.0001317542220683143, "loss": 0.0183, "step": 3012 }, { "epoch": 0.3980579317633847, "grad_norm": 0.2223990559577942, "learning_rate": 0.00013171480939436626, "loss": 0.0226, "step": 3013 }, { "epoch": 0.39819004524886875, "grad_norm": 0.20143398642539978, "learning_rate": 0.0001316753912422779, "loss": 0.0247, "step": 3014 }, { "epoch": 0.3983221587343528, "grad_norm": 0.1691291183233261, "learning_rate": 0.00013163596761885796, "loss": 0.0129, "step": 3015 }, { "epoch": 0.3984542722198368, "grad_norm": 0.2109353244304657, "learning_rate": 0.00013159653853091606, "loss": 0.0182, "step": 3016 }, { "epoch": 0.39858638570532084, "grad_norm": 0.08470373600721359, "learning_rate": 0.00013155710398526295, "loss": 0.0085, "step": 3017 }, { "epoch": 0.3987184991908049, "grad_norm": 0.16234825551509857, "learning_rate": 0.00013151766398871015, "loss": 0.015, "step": 3018 }, { "epoch": 0.3988506126762889, "grad_norm": 0.30378457903862, "learning_rate": 0.00013147821854807022, "loss": 0.036, "step": 3019 }, { "epoch": 0.39898272616177294, "grad_norm": 0.16837498545646667, "learning_rate": 0.00013143876767015655, "loss": 0.0182, "step": 3020 }, { "epoch": 0.39911483964725697, "grad_norm": 0.3299632668495178, "learning_rate": 0.00013139931136178359, "loss": 0.035, "step": 3021 }, { "epoch": 0.399246953132741, "grad_norm": 0.39903292059898376, "learning_rate": 0.0001313598496297667, "loss": 0.025, "step": 3022 }, { "epoch": 0.39937906661822503, "grad_norm": 0.1631740927696228, "learning_rate": 0.00013132038248092208, "loss": 0.0199, "step": 3023 }, { "epoch": 0.39951118010370906, "grad_norm": 0.17485398054122925, "learning_rate": 0.00013128090992206703, "loss": 0.0121, "step": 3024 }, { "epoch": 0.3996432935891931, "grad_norm": 0.17089051008224487, "learning_rate": 0.00013124143196001963, "loss": 0.0159, "step": 3025 }, { "epoch": 0.3997754070746771, "grad_norm": 0.21537205576896667, "learning_rate": 0.00013120194860159902, "loss": 0.0209, "step": 3026 }, { "epoch": 0.39990752056016116, "grad_norm": 0.46143659949302673, "learning_rate": 0.0001311624598536252, "loss": 0.0314, "step": 3027 }, { "epoch": 0.4000396340456452, "grad_norm": 0.1878209412097931, "learning_rate": 0.00013112296572291904, "loss": 0.0216, "step": 3028 }, { "epoch": 0.4001717475311292, "grad_norm": 0.13695985078811646, "learning_rate": 0.0001310834662163025, "loss": 0.0125, "step": 3029 }, { "epoch": 0.40030386101661325, "grad_norm": 0.14801840484142303, "learning_rate": 0.0001310439613405983, "loss": 0.0215, "step": 3030 }, { "epoch": 0.4004359745020973, "grad_norm": 0.2693830728530884, "learning_rate": 0.0001310044511026303, "loss": 0.023, "step": 3031 }, { "epoch": 0.4005680879875813, "grad_norm": 0.27653369307518005, "learning_rate": 0.000130964935509223, "loss": 0.0229, "step": 3032 }, { "epoch": 0.40070020147306534, "grad_norm": 0.15918390452861786, "learning_rate": 0.0001309254145672021, "loss": 0.0188, "step": 3033 }, { "epoch": 0.4008323149585494, "grad_norm": 0.23531582951545715, "learning_rate": 0.00013088588828339402, "loss": 0.0182, "step": 3034 }, { "epoch": 0.4009644284440334, "grad_norm": 0.13156545162200928, "learning_rate": 0.00013084635666462622, "loss": 0.0131, "step": 3035 }, { "epoch": 0.40109654192951744, "grad_norm": 0.17341113090515137, "learning_rate": 0.00013080681971772707, "loss": 0.0271, "step": 3036 }, { "epoch": 0.40122865541500147, "grad_norm": 0.1439688354730606, "learning_rate": 0.0001307672774495258, "loss": 0.0167, "step": 3037 }, { "epoch": 0.4013607689004855, "grad_norm": 0.18817555904388428, "learning_rate": 0.0001307277298668526, "loss": 0.0197, "step": 3038 }, { "epoch": 0.40149288238596953, "grad_norm": 0.28514885902404785, "learning_rate": 0.0001306881769765386, "loss": 0.0239, "step": 3039 }, { "epoch": 0.40162499587145356, "grad_norm": 0.2204063981771469, "learning_rate": 0.00013064861878541584, "loss": 0.0158, "step": 3040 }, { "epoch": 0.4017571093569376, "grad_norm": 0.1329529732465744, "learning_rate": 0.0001306090553003172, "loss": 0.0125, "step": 3041 }, { "epoch": 0.4018892228424216, "grad_norm": 0.1567203551530838, "learning_rate": 0.0001305694865280766, "loss": 0.0156, "step": 3042 }, { "epoch": 0.40202133632790565, "grad_norm": 0.258801132440567, "learning_rate": 0.00013052991247552873, "loss": 0.0284, "step": 3043 }, { "epoch": 0.4021534498133897, "grad_norm": 0.14416727423667908, "learning_rate": 0.00013049033314950931, "loss": 0.0191, "step": 3044 }, { "epoch": 0.4022855632988737, "grad_norm": 0.24458062648773193, "learning_rate": 0.00013045074855685493, "loss": 0.0358, "step": 3045 }, { "epoch": 0.40241767678435775, "grad_norm": 0.187479630112648, "learning_rate": 0.0001304111587044031, "loss": 0.0177, "step": 3046 }, { "epoch": 0.4025497902698418, "grad_norm": 0.17222760617733002, "learning_rate": 0.00013037156359899216, "loss": 0.0195, "step": 3047 }, { "epoch": 0.4026819037553258, "grad_norm": 0.20241984724998474, "learning_rate": 0.00013033196324746153, "loss": 0.023, "step": 3048 }, { "epoch": 0.40281401724080984, "grad_norm": 0.19136063754558563, "learning_rate": 0.00013029235765665134, "loss": 0.023, "step": 3049 }, { "epoch": 0.40294613072629387, "grad_norm": 0.17290174961090088, "learning_rate": 0.0001302527468334028, "loss": 0.0132, "step": 3050 }, { "epoch": 0.4030782442117779, "grad_norm": 0.26126572489738464, "learning_rate": 0.00013021313078455783, "loss": 0.0211, "step": 3051 }, { "epoch": 0.40321035769726193, "grad_norm": 0.30257081985473633, "learning_rate": 0.00013017350951695944, "loss": 0.0237, "step": 3052 }, { "epoch": 0.40334247118274597, "grad_norm": 0.1518670916557312, "learning_rate": 0.00013013388303745145, "loss": 0.0164, "step": 3053 }, { "epoch": 0.40347458466823, "grad_norm": 0.26053333282470703, "learning_rate": 0.0001300942513528786, "loss": 0.0167, "step": 3054 }, { "epoch": 0.403606698153714, "grad_norm": 0.2704867422580719, "learning_rate": 0.00013005461447008647, "loss": 0.0238, "step": 3055 }, { "epoch": 0.40373881163919806, "grad_norm": 0.1770847886800766, "learning_rate": 0.00013001497239592164, "loss": 0.0103, "step": 3056 }, { "epoch": 0.4038709251246821, "grad_norm": 0.44425147771835327, "learning_rate": 0.00012997532513723154, "loss": 0.024, "step": 3057 }, { "epoch": 0.4040030386101661, "grad_norm": 0.1942521631717682, "learning_rate": 0.0001299356727008645, "loss": 0.0178, "step": 3058 }, { "epoch": 0.40413515209565015, "grad_norm": 0.24964120984077454, "learning_rate": 0.00012989601509366967, "loss": 0.022, "step": 3059 }, { "epoch": 0.4042672655811342, "grad_norm": 0.34468597173690796, "learning_rate": 0.00012985635232249724, "loss": 0.0253, "step": 3060 }, { "epoch": 0.4043993790666182, "grad_norm": 0.19694958627223969, "learning_rate": 0.00012981668439419815, "loss": 0.0332, "step": 3061 }, { "epoch": 0.40453149255210225, "grad_norm": 0.09895392507314682, "learning_rate": 0.00012977701131562436, "loss": 0.009, "step": 3062 }, { "epoch": 0.4046636060375863, "grad_norm": 0.14473307132720947, "learning_rate": 0.00012973733309362857, "loss": 0.02, "step": 3063 }, { "epoch": 0.4047957195230703, "grad_norm": 0.2015502005815506, "learning_rate": 0.00012969764973506454, "loss": 0.0279, "step": 3064 }, { "epoch": 0.40492783300855434, "grad_norm": 0.17858926951885223, "learning_rate": 0.00012965796124678677, "loss": 0.024, "step": 3065 }, { "epoch": 0.40505994649403837, "grad_norm": 0.3196292519569397, "learning_rate": 0.0001296182676356507, "loss": 0.0288, "step": 3066 }, { "epoch": 0.4051920599795224, "grad_norm": 0.20552773773670197, "learning_rate": 0.0001295785689085127, "loss": 0.0251, "step": 3067 }, { "epoch": 0.40532417346500643, "grad_norm": 0.19985134899616241, "learning_rate": 0.00012953886507222992, "loss": 0.0173, "step": 3068 }, { "epoch": 0.40545628695049046, "grad_norm": 0.14639391005039215, "learning_rate": 0.0001294991561336605, "loss": 0.0225, "step": 3069 }, { "epoch": 0.4055884004359745, "grad_norm": 1.9710235595703125, "learning_rate": 0.00012945944209966345, "loss": 0.0663, "step": 3070 }, { "epoch": 0.4057205139214585, "grad_norm": 0.275114506483078, "learning_rate": 0.0001294197229770986, "loss": 0.0314, "step": 3071 }, { "epoch": 0.40585262740694256, "grad_norm": 0.12577217817306519, "learning_rate": 0.00012937999877282662, "loss": 0.0104, "step": 3072 }, { "epoch": 0.4059847408924266, "grad_norm": 0.12943196296691895, "learning_rate": 0.0001293402694937092, "loss": 0.0144, "step": 3073 }, { "epoch": 0.4061168543779106, "grad_norm": 0.18152198195457458, "learning_rate": 0.00012930053514660883, "loss": 0.0144, "step": 3074 }, { "epoch": 0.40624896786339465, "grad_norm": 0.1707521229982376, "learning_rate": 0.0001292607957383888, "loss": 0.0163, "step": 3075 }, { "epoch": 0.4063810813488787, "grad_norm": 0.1473418027162552, "learning_rate": 0.00012922105127591348, "loss": 0.0152, "step": 3076 }, { "epoch": 0.4065131948343627, "grad_norm": 0.19073860347270966, "learning_rate": 0.00012918130176604783, "loss": 0.0218, "step": 3077 }, { "epoch": 0.40664530831984674, "grad_norm": 0.23325873911380768, "learning_rate": 0.00012914154721565795, "loss": 0.0187, "step": 3078 }, { "epoch": 0.4067774218053308, "grad_norm": 0.1258496791124344, "learning_rate": 0.00012910178763161066, "loss": 0.0132, "step": 3079 }, { "epoch": 0.4069095352908148, "grad_norm": 0.25707176327705383, "learning_rate": 0.00012906202302077365, "loss": 0.0192, "step": 3080 }, { "epoch": 0.40704164877629884, "grad_norm": 0.2727690041065216, "learning_rate": 0.00012902225339001558, "loss": 0.043, "step": 3081 }, { "epoch": 0.40717376226178287, "grad_norm": 0.2927294075489044, "learning_rate": 0.00012898247874620585, "loss": 0.0261, "step": 3082 }, { "epoch": 0.4073058757472669, "grad_norm": 0.20090535283088684, "learning_rate": 0.0001289426990962148, "loss": 0.0243, "step": 3083 }, { "epoch": 0.40743798923275093, "grad_norm": 0.2168041467666626, "learning_rate": 0.0001289029144469137, "loss": 0.0344, "step": 3084 }, { "epoch": 0.40757010271823496, "grad_norm": 0.48691752552986145, "learning_rate": 0.00012886312480517447, "loss": 0.0255, "step": 3085 }, { "epoch": 0.407702216203719, "grad_norm": 0.2309977412223816, "learning_rate": 0.0001288233301778701, "loss": 0.0221, "step": 3086 }, { "epoch": 0.407834329689203, "grad_norm": 0.17883218824863434, "learning_rate": 0.00012878353057187435, "loss": 0.0205, "step": 3087 }, { "epoch": 0.40796644317468705, "grad_norm": 0.24214334785938263, "learning_rate": 0.00012874372599406192, "loss": 0.027, "step": 3088 }, { "epoch": 0.4080985566601711, "grad_norm": 0.15543243288993835, "learning_rate": 0.00012870391645130818, "loss": 0.0111, "step": 3089 }, { "epoch": 0.4082306701456551, "grad_norm": 0.25050976872444153, "learning_rate": 0.0001286641019504896, "loss": 0.0391, "step": 3090 }, { "epoch": 0.40836278363113915, "grad_norm": 0.2022220343351364, "learning_rate": 0.00012862428249848335, "loss": 0.0196, "step": 3091 }, { "epoch": 0.4084948971166232, "grad_norm": 0.19390860199928284, "learning_rate": 0.00012858445810216747, "loss": 0.0157, "step": 3092 }, { "epoch": 0.4086270106021072, "grad_norm": 0.22381038963794708, "learning_rate": 0.00012854462876842095, "loss": 0.0267, "step": 3093 }, { "epoch": 0.40875912408759124, "grad_norm": 0.32017675042152405, "learning_rate": 0.00012850479450412348, "loss": 0.0227, "step": 3094 }, { "epoch": 0.4088912375730753, "grad_norm": 0.253348708152771, "learning_rate": 0.00012846495531615573, "loss": 0.0343, "step": 3095 }, { "epoch": 0.4090233510585593, "grad_norm": 0.3195996582508087, "learning_rate": 0.00012842511121139916, "loss": 0.0157, "step": 3096 }, { "epoch": 0.40915546454404333, "grad_norm": 0.1614832580089569, "learning_rate": 0.0001283852621967361, "loss": 0.0201, "step": 3097 }, { "epoch": 0.40928757802952737, "grad_norm": 0.1978360265493393, "learning_rate": 0.00012834540827904976, "loss": 0.0217, "step": 3098 }, { "epoch": 0.4094196915150114, "grad_norm": 0.1857193112373352, "learning_rate": 0.00012830554946522405, "loss": 0.0137, "step": 3099 }, { "epoch": 0.40955180500049543, "grad_norm": 0.2626390755176544, "learning_rate": 0.00012826568576214398, "loss": 0.0146, "step": 3100 }, { "epoch": 0.40968391848597946, "grad_norm": 0.20259742438793182, "learning_rate": 0.00012822581717669514, "loss": 0.0226, "step": 3101 }, { "epoch": 0.4098160319714635, "grad_norm": 0.1670604795217514, "learning_rate": 0.00012818594371576412, "loss": 0.0168, "step": 3102 }, { "epoch": 0.4099481454569475, "grad_norm": 0.15540018677711487, "learning_rate": 0.00012814606538623835, "loss": 0.0151, "step": 3103 }, { "epoch": 0.41008025894243155, "grad_norm": 0.31598779559135437, "learning_rate": 0.00012810618219500603, "loss": 0.0346, "step": 3104 }, { "epoch": 0.4102123724279156, "grad_norm": 0.20833928883075714, "learning_rate": 0.00012806629414895625, "loss": 0.0234, "step": 3105 }, { "epoch": 0.4103444859133996, "grad_norm": 0.12443934381008148, "learning_rate": 0.00012802640125497892, "loss": 0.0135, "step": 3106 }, { "epoch": 0.41047659939888365, "grad_norm": 0.12392992526292801, "learning_rate": 0.00012798650351996478, "loss": 0.0127, "step": 3107 }, { "epoch": 0.4106087128843677, "grad_norm": 0.26349079608917236, "learning_rate": 0.00012794660095080543, "loss": 0.0233, "step": 3108 }, { "epoch": 0.4107408263698517, "grad_norm": 0.14821572601795197, "learning_rate": 0.0001279066935543933, "loss": 0.0161, "step": 3109 }, { "epoch": 0.41087293985533574, "grad_norm": 0.305846244096756, "learning_rate": 0.00012786678133762164, "loss": 0.0147, "step": 3110 }, { "epoch": 0.41100505334081977, "grad_norm": 0.2815128266811371, "learning_rate": 0.00012782686430738453, "loss": 0.0324, "step": 3111 }, { "epoch": 0.4111371668263038, "grad_norm": 0.5089820027351379, "learning_rate": 0.0001277869424705769, "loss": 0.0328, "step": 3112 }, { "epoch": 0.41126928031178783, "grad_norm": 0.24389733374118805, "learning_rate": 0.0001277470158340945, "loss": 0.0256, "step": 3113 }, { "epoch": 0.41140139379727186, "grad_norm": 0.281982958316803, "learning_rate": 0.0001277070844048339, "loss": 0.0348, "step": 3114 }, { "epoch": 0.4115335072827559, "grad_norm": 0.20420657098293304, "learning_rate": 0.00012766714818969254, "loss": 0.0229, "step": 3115 }, { "epoch": 0.4116656207682399, "grad_norm": 0.19268015027046204, "learning_rate": 0.0001276272071955686, "loss": 0.0111, "step": 3116 }, { "epoch": 0.41179773425372396, "grad_norm": 0.3611801266670227, "learning_rate": 0.00012758726142936117, "loss": 0.0402, "step": 3117 }, { "epoch": 0.411929847739208, "grad_norm": 0.18338996171951294, "learning_rate": 0.00012754731089797015, "loss": 0.0175, "step": 3118 }, { "epoch": 0.412061961224692, "grad_norm": 0.20748326182365417, "learning_rate": 0.00012750735560829624, "loss": 0.0255, "step": 3119 }, { "epoch": 0.41219407471017605, "grad_norm": 0.20322351157665253, "learning_rate": 0.00012746739556724091, "loss": 0.0112, "step": 3120 }, { "epoch": 0.4123261881956601, "grad_norm": 0.07140868902206421, "learning_rate": 0.00012742743078170664, "loss": 0.0043, "step": 3121 }, { "epoch": 0.4124583016811441, "grad_norm": 0.2231295257806778, "learning_rate": 0.0001273874612585965, "loss": 0.0225, "step": 3122 }, { "epoch": 0.41259041516662814, "grad_norm": 0.21522819995880127, "learning_rate": 0.00012734748700481444, "loss": 0.0333, "step": 3123 }, { "epoch": 0.4127225286521122, "grad_norm": 0.1936907023191452, "learning_rate": 0.0001273075080272654, "loss": 0.0216, "step": 3124 }, { "epoch": 0.4128546421375962, "grad_norm": 0.2655859887599945, "learning_rate": 0.00012726752433285486, "loss": 0.0306, "step": 3125 }, { "epoch": 0.41298675562308024, "grad_norm": 0.2395084798336029, "learning_rate": 0.00012722753592848935, "loss": 0.0302, "step": 3126 }, { "epoch": 0.41311886910856427, "grad_norm": 0.23985159397125244, "learning_rate": 0.00012718754282107608, "loss": 0.0232, "step": 3127 }, { "epoch": 0.4132509825940483, "grad_norm": 0.3043936491012573, "learning_rate": 0.00012714754501752312, "loss": 0.0345, "step": 3128 }, { "epoch": 0.41338309607953233, "grad_norm": 0.20778505504131317, "learning_rate": 0.00012710754252473935, "loss": 0.0189, "step": 3129 }, { "epoch": 0.41351520956501636, "grad_norm": 0.21860454976558685, "learning_rate": 0.00012706753534963444, "loss": 0.0239, "step": 3130 }, { "epoch": 0.4136473230505004, "grad_norm": 0.2815905213356018, "learning_rate": 0.00012702752349911888, "loss": 0.0339, "step": 3131 }, { "epoch": 0.4137794365359844, "grad_norm": 0.22736093401908875, "learning_rate": 0.000126987506980104, "loss": 0.025, "step": 3132 }, { "epoch": 0.41391155002146846, "grad_norm": 0.41342923045158386, "learning_rate": 0.00012694748579950187, "loss": 0.0222, "step": 3133 }, { "epoch": 0.4140436635069525, "grad_norm": 0.3165312707424164, "learning_rate": 0.00012690745996422542, "loss": 0.0362, "step": 3134 }, { "epoch": 0.4141757769924365, "grad_norm": 0.17335399985313416, "learning_rate": 0.00012686742948118831, "loss": 0.0216, "step": 3135 }, { "epoch": 0.41430789047792055, "grad_norm": 0.2092505544424057, "learning_rate": 0.0001268273943573052, "loss": 0.0275, "step": 3136 }, { "epoch": 0.4144400039634046, "grad_norm": 0.25818854570388794, "learning_rate": 0.0001267873545994912, "loss": 0.0275, "step": 3137 }, { "epoch": 0.4145721174488886, "grad_norm": 0.1769459843635559, "learning_rate": 0.00012674731021466263, "loss": 0.0203, "step": 3138 }, { "epoch": 0.41470423093437264, "grad_norm": 0.26153334975242615, "learning_rate": 0.00012670726120973631, "loss": 0.016, "step": 3139 }, { "epoch": 0.4148363444198567, "grad_norm": 0.33175545930862427, "learning_rate": 0.00012666720759162996, "loss": 0.034, "step": 3140 }, { "epoch": 0.4149684579053407, "grad_norm": 0.17180666327476501, "learning_rate": 0.0001266271493672621, "loss": 0.0187, "step": 3141 }, { "epoch": 0.41510057139082474, "grad_norm": 0.18626075983047485, "learning_rate": 0.00012658708654355203, "loss": 0.0193, "step": 3142 }, { "epoch": 0.41523268487630877, "grad_norm": 0.32776767015457153, "learning_rate": 0.00012654701912741988, "loss": 0.019, "step": 3143 }, { "epoch": 0.4153647983617928, "grad_norm": 0.1480986624956131, "learning_rate": 0.00012650694712578652, "loss": 0.0105, "step": 3144 }, { "epoch": 0.41549691184727683, "grad_norm": 0.18982647359371185, "learning_rate": 0.00012646687054557366, "loss": 0.0246, "step": 3145 }, { "epoch": 0.41562902533276086, "grad_norm": 0.3359149396419525, "learning_rate": 0.00012642678939370376, "loss": 0.0178, "step": 3146 }, { "epoch": 0.4157611388182449, "grad_norm": 0.1366925984621048, "learning_rate": 0.00012638670367710013, "loss": 0.0204, "step": 3147 }, { "epoch": 0.4158932523037289, "grad_norm": 0.18426810204982758, "learning_rate": 0.0001263466134026868, "loss": 0.019, "step": 3148 }, { "epoch": 0.41602536578921295, "grad_norm": 0.2569717764854431, "learning_rate": 0.00012630651857738854, "loss": 0.0189, "step": 3149 }, { "epoch": 0.416157479274697, "grad_norm": 0.17694905400276184, "learning_rate": 0.00012626641920813114, "loss": 0.0182, "step": 3150 }, { "epoch": 0.416289592760181, "grad_norm": 0.3232481777667999, "learning_rate": 0.0001262263153018409, "loss": 0.0192, "step": 3151 }, { "epoch": 0.41642170624566505, "grad_norm": 0.23598027229309082, "learning_rate": 0.00012618620686544505, "loss": 0.0315, "step": 3152 }, { "epoch": 0.4165538197311491, "grad_norm": 0.2639772891998291, "learning_rate": 0.00012614609390587157, "loss": 0.024, "step": 3153 }, { "epoch": 0.4166859332166331, "grad_norm": 0.17504766583442688, "learning_rate": 0.00012610597643004926, "loss": 0.0241, "step": 3154 }, { "epoch": 0.41681804670211714, "grad_norm": 0.1681995987892151, "learning_rate": 0.00012606585444490762, "loss": 0.0191, "step": 3155 }, { "epoch": 0.41695016018760117, "grad_norm": 0.13266129791736603, "learning_rate": 0.00012602572795737695, "loss": 0.015, "step": 3156 }, { "epoch": 0.4170822736730852, "grad_norm": 0.14186996221542358, "learning_rate": 0.00012598559697438844, "loss": 0.019, "step": 3157 }, { "epoch": 0.41721438715856923, "grad_norm": 0.20277422666549683, "learning_rate": 0.00012594546150287384, "loss": 0.0177, "step": 3158 }, { "epoch": 0.41734650064405326, "grad_norm": 0.18437455594539642, "learning_rate": 0.00012590532154976595, "loss": 0.0211, "step": 3159 }, { "epoch": 0.4174786141295373, "grad_norm": 0.22293388843536377, "learning_rate": 0.00012586517712199807, "loss": 0.0173, "step": 3160 }, { "epoch": 0.4176107276150213, "grad_norm": 0.1834786832332611, "learning_rate": 0.00012582502822650445, "loss": 0.0168, "step": 3161 }, { "epoch": 0.41774284110050536, "grad_norm": 0.16545066237449646, "learning_rate": 0.0001257848748702201, "loss": 0.012, "step": 3162 }, { "epoch": 0.4178749545859894, "grad_norm": 0.16071240603923798, "learning_rate": 0.00012574471706008067, "loss": 0.0217, "step": 3163 }, { "epoch": 0.4180070680714734, "grad_norm": 0.2613682746887207, "learning_rate": 0.00012570455480302278, "loss": 0.0285, "step": 3164 }, { "epoch": 0.41813918155695745, "grad_norm": 0.1711345762014389, "learning_rate": 0.00012566438810598365, "loss": 0.0248, "step": 3165 }, { "epoch": 0.4182712950424415, "grad_norm": 0.29203635454177856, "learning_rate": 0.0001256242169759013, "loss": 0.03, "step": 3166 }, { "epoch": 0.4184034085279255, "grad_norm": 0.18163099884986877, "learning_rate": 0.0001255840414197146, "loss": 0.0185, "step": 3167 }, { "epoch": 0.41853552201340954, "grad_norm": 0.21773065626621246, "learning_rate": 0.00012554386144436304, "loss": 0.0199, "step": 3168 }, { "epoch": 0.4186676354988936, "grad_norm": 0.19037318229675293, "learning_rate": 0.00012550367705678708, "loss": 0.0278, "step": 3169 }, { "epoch": 0.4187997489843776, "grad_norm": 0.23356559872627258, "learning_rate": 0.00012546348826392772, "loss": 0.021, "step": 3170 }, { "epoch": 0.41893186246986164, "grad_norm": 0.2311427742242813, "learning_rate": 0.00012542329507272688, "loss": 0.0234, "step": 3171 }, { "epoch": 0.41906397595534567, "grad_norm": 0.17222066223621368, "learning_rate": 0.00012538309749012715, "loss": 0.0198, "step": 3172 }, { "epoch": 0.4191960894408297, "grad_norm": 0.24496857821941376, "learning_rate": 0.0001253428955230719, "loss": 0.037, "step": 3173 }, { "epoch": 0.41932820292631373, "grad_norm": 0.2687060534954071, "learning_rate": 0.00012530268917850535, "loss": 0.0192, "step": 3174 }, { "epoch": 0.4194603164117977, "grad_norm": 0.24776218831539154, "learning_rate": 0.00012526247846337228, "loss": 0.0215, "step": 3175 }, { "epoch": 0.41959242989728174, "grad_norm": 0.15776574611663818, "learning_rate": 0.00012522226338461842, "loss": 0.0153, "step": 3176 }, { "epoch": 0.41972454338276577, "grad_norm": 0.3436722159385681, "learning_rate": 0.00012518204394919015, "loss": 0.0237, "step": 3177 }, { "epoch": 0.4198566568682498, "grad_norm": 0.17449195683002472, "learning_rate": 0.00012514182016403461, "loss": 0.0139, "step": 3178 }, { "epoch": 0.41998877035373383, "grad_norm": 0.21825307607650757, "learning_rate": 0.00012510159203609974, "loss": 0.0312, "step": 3179 }, { "epoch": 0.42012088383921786, "grad_norm": 0.207487091422081, "learning_rate": 0.00012506135957233416, "loss": 0.0302, "step": 3180 }, { "epoch": 0.4202529973247019, "grad_norm": 0.17855414748191833, "learning_rate": 0.0001250211227796873, "loss": 0.0176, "step": 3181 }, { "epoch": 0.4203851108101859, "grad_norm": 0.20130109786987305, "learning_rate": 0.00012498088166510931, "loss": 0.018, "step": 3182 }, { "epoch": 0.42051722429566996, "grad_norm": 0.25873619318008423, "learning_rate": 0.00012494063623555107, "loss": 0.022, "step": 3183 }, { "epoch": 0.420649337781154, "grad_norm": 0.1681220680475235, "learning_rate": 0.00012490038649796425, "loss": 0.0241, "step": 3184 }, { "epoch": 0.420781451266638, "grad_norm": 0.24111413955688477, "learning_rate": 0.00012486013245930125, "loss": 0.0265, "step": 3185 }, { "epoch": 0.42091356475212205, "grad_norm": 0.1893637329339981, "learning_rate": 0.0001248198741265152, "loss": 0.0233, "step": 3186 }, { "epoch": 0.4210456782376061, "grad_norm": 0.1877831071615219, "learning_rate": 0.0001247796115065599, "loss": 0.0292, "step": 3187 }, { "epoch": 0.4211777917230901, "grad_norm": 0.21083886921405792, "learning_rate": 0.00012473934460639007, "loss": 0.0336, "step": 3188 }, { "epoch": 0.42130990520857414, "grad_norm": 0.1846085637807846, "learning_rate": 0.00012469907343296097, "loss": 0.0175, "step": 3189 }, { "epoch": 0.4214420186940582, "grad_norm": 0.17249713838100433, "learning_rate": 0.00012465879799322877, "loss": 0.0168, "step": 3190 }, { "epoch": 0.4215741321795422, "grad_norm": 0.2629503011703491, "learning_rate": 0.00012461851829415028, "loss": 0.0222, "step": 3191 }, { "epoch": 0.42170624566502624, "grad_norm": 0.18024857342243195, "learning_rate": 0.00012457823434268303, "loss": 0.0196, "step": 3192 }, { "epoch": 0.42183835915051027, "grad_norm": 0.23293425142765045, "learning_rate": 0.00012453794614578537, "loss": 0.0248, "step": 3193 }, { "epoch": 0.4219704726359943, "grad_norm": 0.16819629073143005, "learning_rate": 0.00012449765371041628, "loss": 0.0108, "step": 3194 }, { "epoch": 0.42210258612147833, "grad_norm": 0.20160499215126038, "learning_rate": 0.00012445735704353557, "loss": 0.0185, "step": 3195 }, { "epoch": 0.42223469960696236, "grad_norm": 0.18579092621803284, "learning_rate": 0.00012441705615210368, "loss": 0.0225, "step": 3196 }, { "epoch": 0.4223668130924464, "grad_norm": 0.16678574681282043, "learning_rate": 0.00012437675104308194, "loss": 0.0222, "step": 3197 }, { "epoch": 0.4224989265779304, "grad_norm": 0.22756989300251007, "learning_rate": 0.0001243364417234322, "loss": 0.0211, "step": 3198 }, { "epoch": 0.42263104006341445, "grad_norm": 0.2957361042499542, "learning_rate": 0.00012429612820011717, "loss": 0.0244, "step": 3199 }, { "epoch": 0.4227631535488985, "grad_norm": 0.1491062194108963, "learning_rate": 0.00012425581048010028, "loss": 0.0145, "step": 3200 }, { "epoch": 0.4228952670343825, "grad_norm": 0.1475045531988144, "learning_rate": 0.0001242154885703456, "loss": 0.0145, "step": 3201 }, { "epoch": 0.42302738051986655, "grad_norm": 0.23000866174697876, "learning_rate": 0.0001241751624778181, "loss": 0.0158, "step": 3202 }, { "epoch": 0.4231594940053506, "grad_norm": 0.22191113233566284, "learning_rate": 0.00012413483220948324, "loss": 0.0203, "step": 3203 }, { "epoch": 0.4232916074908346, "grad_norm": 0.14425142109394073, "learning_rate": 0.0001240944977723074, "loss": 0.0112, "step": 3204 }, { "epoch": 0.42342372097631864, "grad_norm": 0.5771812796592712, "learning_rate": 0.00012405415917325757, "loss": 0.049, "step": 3205 }, { "epoch": 0.42355583446180267, "grad_norm": 0.19761916995048523, "learning_rate": 0.0001240138164193015, "loss": 0.0186, "step": 3206 }, { "epoch": 0.4236879479472867, "grad_norm": 0.19133833050727844, "learning_rate": 0.0001239734695174076, "loss": 0.0244, "step": 3207 }, { "epoch": 0.42382006143277073, "grad_norm": 0.16783583164215088, "learning_rate": 0.0001239331184745451, "loss": 0.0202, "step": 3208 }, { "epoch": 0.42395217491825476, "grad_norm": 0.23732124269008636, "learning_rate": 0.00012389276329768386, "loss": 0.0196, "step": 3209 }, { "epoch": 0.4240842884037388, "grad_norm": 0.0561022087931633, "learning_rate": 0.0001238524039937945, "loss": 0.0046, "step": 3210 }, { "epoch": 0.4242164018892228, "grad_norm": 0.18584446609020233, "learning_rate": 0.00012381204056984832, "loss": 0.0241, "step": 3211 }, { "epoch": 0.42434851537470686, "grad_norm": 0.6407673954963684, "learning_rate": 0.00012377167303281736, "loss": 0.0226, "step": 3212 }, { "epoch": 0.4244806288601909, "grad_norm": 0.2582055926322937, "learning_rate": 0.00012373130138967434, "loss": 0.0223, "step": 3213 }, { "epoch": 0.4246127423456749, "grad_norm": 0.18671056628227234, "learning_rate": 0.0001236909256473927, "loss": 0.0242, "step": 3214 }, { "epoch": 0.42474485583115895, "grad_norm": 0.18192487955093384, "learning_rate": 0.00012365054581294665, "loss": 0.0214, "step": 3215 }, { "epoch": 0.424876969316643, "grad_norm": 0.19129694998264313, "learning_rate": 0.00012361016189331098, "loss": 0.027, "step": 3216 }, { "epoch": 0.425009082802127, "grad_norm": 0.23282450437545776, "learning_rate": 0.0001235697738954613, "loss": 0.0234, "step": 3217 }, { "epoch": 0.42514119628761105, "grad_norm": 0.22248545289039612, "learning_rate": 0.00012352938182637387, "loss": 0.0178, "step": 3218 }, { "epoch": 0.4252733097730951, "grad_norm": 0.20974159240722656, "learning_rate": 0.00012348898569302565, "loss": 0.0286, "step": 3219 }, { "epoch": 0.4254054232585791, "grad_norm": 0.13747277855873108, "learning_rate": 0.00012344858550239433, "loss": 0.0152, "step": 3220 }, { "epoch": 0.42553753674406314, "grad_norm": 0.2712138593196869, "learning_rate": 0.0001234081812614583, "loss": 0.0259, "step": 3221 }, { "epoch": 0.42566965022954717, "grad_norm": 0.21853908896446228, "learning_rate": 0.00012336777297719667, "loss": 0.0219, "step": 3222 }, { "epoch": 0.4258017637150312, "grad_norm": 0.17728565633296967, "learning_rate": 0.0001233273606565891, "loss": 0.0233, "step": 3223 }, { "epoch": 0.42593387720051523, "grad_norm": 0.209276482462883, "learning_rate": 0.00012328694430661618, "loss": 0.0219, "step": 3224 }, { "epoch": 0.42606599068599926, "grad_norm": 0.15013103187084198, "learning_rate": 0.000123246523934259, "loss": 0.0143, "step": 3225 }, { "epoch": 0.4261981041714833, "grad_norm": 0.19634680449962616, "learning_rate": 0.0001232060995464995, "loss": 0.028, "step": 3226 }, { "epoch": 0.4263302176569673, "grad_norm": 0.19637438654899597, "learning_rate": 0.00012316567115032014, "loss": 0.0363, "step": 3227 }, { "epoch": 0.42646233114245136, "grad_norm": 0.3099977374076843, "learning_rate": 0.00012312523875270426, "loss": 0.0282, "step": 3228 }, { "epoch": 0.4265944446279354, "grad_norm": 0.18816916644573212, "learning_rate": 0.00012308480236063578, "loss": 0.0264, "step": 3229 }, { "epoch": 0.4267265581134194, "grad_norm": 0.14757655560970306, "learning_rate": 0.00012304436198109928, "loss": 0.0201, "step": 3230 }, { "epoch": 0.42685867159890345, "grad_norm": 0.24353700876235962, "learning_rate": 0.00012300391762108014, "loss": 0.0253, "step": 3231 }, { "epoch": 0.4269907850843875, "grad_norm": 0.2125053107738495, "learning_rate": 0.00012296346928756432, "loss": 0.0192, "step": 3232 }, { "epoch": 0.4271228985698715, "grad_norm": 0.1806890368461609, "learning_rate": 0.00012292301698753853, "loss": 0.021, "step": 3233 }, { "epoch": 0.42725501205535554, "grad_norm": 0.2877216637134552, "learning_rate": 0.00012288256072799018, "loss": 0.0249, "step": 3234 }, { "epoch": 0.4273871255408396, "grad_norm": 0.18381038308143616, "learning_rate": 0.0001228421005159073, "loss": 0.014, "step": 3235 }, { "epoch": 0.4275192390263236, "grad_norm": 0.23033744096755981, "learning_rate": 0.00012280163635827865, "loss": 0.0199, "step": 3236 }, { "epoch": 0.42765135251180764, "grad_norm": 0.2439228892326355, "learning_rate": 0.00012276116826209362, "loss": 0.0279, "step": 3237 }, { "epoch": 0.42778346599729167, "grad_norm": 0.1486399620771408, "learning_rate": 0.00012272069623434236, "loss": 0.0106, "step": 3238 }, { "epoch": 0.4279155794827757, "grad_norm": 0.20771166682243347, "learning_rate": 0.00012268022028201562, "loss": 0.0336, "step": 3239 }, { "epoch": 0.42804769296825973, "grad_norm": 0.18823282420635223, "learning_rate": 0.0001226397404121049, "loss": 0.0185, "step": 3240 }, { "epoch": 0.42817980645374376, "grad_norm": 0.14146701991558075, "learning_rate": 0.00012259925663160232, "loss": 0.0067, "step": 3241 }, { "epoch": 0.4283119199392278, "grad_norm": 0.16572389006614685, "learning_rate": 0.0001225587689475007, "loss": 0.0222, "step": 3242 }, { "epoch": 0.4284440334247118, "grad_norm": 0.20891334116458893, "learning_rate": 0.00012251827736679358, "loss": 0.0331, "step": 3243 }, { "epoch": 0.42857614691019585, "grad_norm": 0.16869033873081207, "learning_rate": 0.00012247778189647502, "loss": 0.0183, "step": 3244 }, { "epoch": 0.4287082603956799, "grad_norm": 0.2835995554924011, "learning_rate": 0.00012243728254353992, "loss": 0.0224, "step": 3245 }, { "epoch": 0.4288403738811639, "grad_norm": 0.23434291779994965, "learning_rate": 0.00012239677931498376, "loss": 0.0227, "step": 3246 }, { "epoch": 0.42897248736664795, "grad_norm": 0.1688721477985382, "learning_rate": 0.00012235627221780278, "loss": 0.0204, "step": 3247 }, { "epoch": 0.429104600852132, "grad_norm": 0.30692258477211, "learning_rate": 0.00012231576125899373, "loss": 0.0328, "step": 3248 }, { "epoch": 0.429236714337616, "grad_norm": 0.18616972863674164, "learning_rate": 0.00012227524644555418, "loss": 0.0162, "step": 3249 }, { "epoch": 0.42936882782310004, "grad_norm": 0.12355251610279083, "learning_rate": 0.0001222347277844823, "loss": 0.0116, "step": 3250 }, { "epoch": 0.42950094130858407, "grad_norm": 0.2097376137971878, "learning_rate": 0.00012219420528277692, "loss": 0.0148, "step": 3251 }, { "epoch": 0.4296330547940681, "grad_norm": 0.39561423659324646, "learning_rate": 0.00012215367894743756, "loss": 0.017, "step": 3252 }, { "epoch": 0.42976516827955213, "grad_norm": 0.18149615824222565, "learning_rate": 0.00012211314878546436, "loss": 0.0201, "step": 3253 }, { "epoch": 0.42989728176503617, "grad_norm": 0.20877254009246826, "learning_rate": 0.00012207261480385817, "loss": 0.0189, "step": 3254 }, { "epoch": 0.4300293952505202, "grad_norm": 0.17631439864635468, "learning_rate": 0.00012203207700962047, "loss": 0.0246, "step": 3255 }, { "epoch": 0.43016150873600423, "grad_norm": 0.24336519837379456, "learning_rate": 0.00012199153540975342, "loss": 0.0263, "step": 3256 }, { "epoch": 0.43029362222148826, "grad_norm": 0.16430045664310455, "learning_rate": 0.00012195099001125978, "loss": 0.0089, "step": 3257 }, { "epoch": 0.4304257357069723, "grad_norm": 0.12992851436138153, "learning_rate": 0.00012191044082114305, "loss": 0.0113, "step": 3258 }, { "epoch": 0.4305578491924563, "grad_norm": 0.1720288246870041, "learning_rate": 0.00012186988784640736, "loss": 0.015, "step": 3259 }, { "epoch": 0.43068996267794035, "grad_norm": 0.16994862258434296, "learning_rate": 0.0001218293310940574, "loss": 0.0256, "step": 3260 }, { "epoch": 0.4308220761634244, "grad_norm": 0.24179445207118988, "learning_rate": 0.0001217887705710987, "loss": 0.0196, "step": 3261 }, { "epoch": 0.4309541896489084, "grad_norm": 0.20141147077083588, "learning_rate": 0.00012174820628453725, "loss": 0.0191, "step": 3262 }, { "epoch": 0.43108630313439245, "grad_norm": 0.11286444216966629, "learning_rate": 0.00012170763824137978, "loss": 0.0186, "step": 3263 }, { "epoch": 0.4312184166198765, "grad_norm": 0.24733056128025055, "learning_rate": 0.0001216670664486337, "loss": 0.0177, "step": 3264 }, { "epoch": 0.4313505301053605, "grad_norm": 0.35599496960639954, "learning_rate": 0.00012162649091330698, "loss": 0.0328, "step": 3265 }, { "epoch": 0.43148264359084454, "grad_norm": 0.13181497156620026, "learning_rate": 0.00012158591164240833, "loss": 0.0189, "step": 3266 }, { "epoch": 0.43161475707632857, "grad_norm": 0.1531556248664856, "learning_rate": 0.00012154532864294703, "loss": 0.0171, "step": 3267 }, { "epoch": 0.4317468705618126, "grad_norm": 0.3520362377166748, "learning_rate": 0.00012150474192193306, "loss": 0.0141, "step": 3268 }, { "epoch": 0.43187898404729663, "grad_norm": 0.18968984484672546, "learning_rate": 0.00012146415148637702, "loss": 0.0133, "step": 3269 }, { "epoch": 0.43201109753278066, "grad_norm": 0.1430044025182724, "learning_rate": 0.0001214235573432901, "loss": 0.0119, "step": 3270 }, { "epoch": 0.4321432110182647, "grad_norm": 0.15614572167396545, "learning_rate": 0.00012138295949968424, "loss": 0.0108, "step": 3271 }, { "epoch": 0.4322753245037487, "grad_norm": 0.2793543338775635, "learning_rate": 0.0001213423579625719, "loss": 0.0239, "step": 3272 }, { "epoch": 0.43240743798923276, "grad_norm": 0.20650598406791687, "learning_rate": 0.00012130175273896626, "loss": 0.0326, "step": 3273 }, { "epoch": 0.4325395514747168, "grad_norm": 0.21166659891605377, "learning_rate": 0.00012126114383588114, "loss": 0.0181, "step": 3274 }, { "epoch": 0.4326716649602008, "grad_norm": 0.22816048562526703, "learning_rate": 0.00012122053126033096, "loss": 0.0248, "step": 3275 }, { "epoch": 0.43280377844568485, "grad_norm": 0.21115653216838837, "learning_rate": 0.00012117991501933074, "loss": 0.0341, "step": 3276 }, { "epoch": 0.4329358919311689, "grad_norm": 0.1442337930202484, "learning_rate": 0.00012113929511989619, "loss": 0.0105, "step": 3277 }, { "epoch": 0.4330680054166529, "grad_norm": 0.2589665949344635, "learning_rate": 0.00012109867156904371, "loss": 0.0185, "step": 3278 }, { "epoch": 0.43320011890213694, "grad_norm": 0.17965416610240936, "learning_rate": 0.00012105804437379018, "loss": 0.0283, "step": 3279 }, { "epoch": 0.433332232387621, "grad_norm": 0.19684933125972748, "learning_rate": 0.00012101741354115321, "loss": 0.0313, "step": 3280 }, { "epoch": 0.433464345873105, "grad_norm": 0.1718248873949051, "learning_rate": 0.00012097677907815103, "loss": 0.0165, "step": 3281 }, { "epoch": 0.43359645935858904, "grad_norm": 0.22733500599861145, "learning_rate": 0.00012093614099180246, "loss": 0.0212, "step": 3282 }, { "epoch": 0.43372857284407307, "grad_norm": 0.16537611186504364, "learning_rate": 0.00012089549928912698, "loss": 0.0161, "step": 3283 }, { "epoch": 0.4338606863295571, "grad_norm": 0.22515685856342316, "learning_rate": 0.00012085485397714469, "loss": 0.0244, "step": 3284 }, { "epoch": 0.43399279981504113, "grad_norm": 0.31087931990623474, "learning_rate": 0.0001208142050628763, "loss": 0.027, "step": 3285 }, { "epoch": 0.43412491330052516, "grad_norm": 0.26226183772087097, "learning_rate": 0.0001207735525533432, "loss": 0.0215, "step": 3286 }, { "epoch": 0.4342570267860092, "grad_norm": 0.14036330580711365, "learning_rate": 0.00012073289645556724, "loss": 0.0113, "step": 3287 }, { "epoch": 0.4343891402714932, "grad_norm": 0.32060906291007996, "learning_rate": 0.00012069223677657112, "loss": 0.0258, "step": 3288 }, { "epoch": 0.43452125375697725, "grad_norm": 0.2714420258998871, "learning_rate": 0.00012065157352337793, "loss": 0.0214, "step": 3289 }, { "epoch": 0.4346533672424613, "grad_norm": 0.2144080549478531, "learning_rate": 0.00012061090670301158, "loss": 0.0329, "step": 3290 }, { "epoch": 0.4347854807279453, "grad_norm": 0.20472444593906403, "learning_rate": 0.00012057023632249645, "loss": 0.016, "step": 3291 }, { "epoch": 0.43491759421342935, "grad_norm": 0.19094641506671906, "learning_rate": 0.00012052956238885762, "loss": 0.0245, "step": 3292 }, { "epoch": 0.4350497076989134, "grad_norm": 0.12641161680221558, "learning_rate": 0.00012048888490912071, "loss": 0.0177, "step": 3293 }, { "epoch": 0.4351818211843974, "grad_norm": 0.22624894976615906, "learning_rate": 0.00012044820389031203, "loss": 0.024, "step": 3294 }, { "epoch": 0.43531393466988144, "grad_norm": 0.33124491572380066, "learning_rate": 0.00012040751933945847, "loss": 0.0286, "step": 3295 }, { "epoch": 0.4354460481553655, "grad_norm": 0.24567212164402008, "learning_rate": 0.00012036683126358747, "loss": 0.0382, "step": 3296 }, { "epoch": 0.4355781616408495, "grad_norm": 0.296768456697464, "learning_rate": 0.00012032613966972721, "loss": 0.0238, "step": 3297 }, { "epoch": 0.43571027512633353, "grad_norm": 0.28525975346565247, "learning_rate": 0.00012028544456490634, "loss": 0.0274, "step": 3298 }, { "epoch": 0.43584238861181757, "grad_norm": 0.19230809807777405, "learning_rate": 0.00012024474595615422, "loss": 0.0246, "step": 3299 }, { "epoch": 0.4359745020973016, "grad_norm": 0.16705505549907684, "learning_rate": 0.00012020404385050078, "loss": 0.0072, "step": 3300 }, { "epoch": 0.43610661558278563, "grad_norm": 0.1971859335899353, "learning_rate": 0.00012016333825497647, "loss": 0.0167, "step": 3301 }, { "epoch": 0.43623872906826966, "grad_norm": 0.17014151811599731, "learning_rate": 0.00012012262917661252, "loss": 0.0236, "step": 3302 }, { "epoch": 0.4363708425537537, "grad_norm": 0.08884875476360321, "learning_rate": 0.00012008191662244059, "loss": 0.0083, "step": 3303 }, { "epoch": 0.4365029560392377, "grad_norm": 0.11667829751968384, "learning_rate": 0.00012004120059949307, "loss": 0.0147, "step": 3304 }, { "epoch": 0.43663506952472175, "grad_norm": 0.17145667970180511, "learning_rate": 0.00012000048111480283, "loss": 0.0202, "step": 3305 }, { "epoch": 0.4367671830102058, "grad_norm": 0.25768762826919556, "learning_rate": 0.00011995975817540346, "loss": 0.0228, "step": 3306 }, { "epoch": 0.4368992964956898, "grad_norm": 0.11496775597333908, "learning_rate": 0.00011991903178832902, "loss": 0.0103, "step": 3307 }, { "epoch": 0.43703140998117385, "grad_norm": 0.2076563835144043, "learning_rate": 0.00011987830196061429, "loss": 0.0183, "step": 3308 }, { "epoch": 0.4371635234666579, "grad_norm": 0.47022193670272827, "learning_rate": 0.00011983756869929456, "loss": 0.0437, "step": 3309 }, { "epoch": 0.4372956369521419, "grad_norm": 0.11586523801088333, "learning_rate": 0.00011979683201140577, "loss": 0.0163, "step": 3310 }, { "epoch": 0.43742775043762594, "grad_norm": 0.2560293972492218, "learning_rate": 0.00011975609190398438, "loss": 0.0398, "step": 3311 }, { "epoch": 0.43755986392310997, "grad_norm": 0.1786472499370575, "learning_rate": 0.00011971534838406753, "loss": 0.0278, "step": 3312 }, { "epoch": 0.437691977408594, "grad_norm": 0.1825864315032959, "learning_rate": 0.00011967460145869282, "loss": 0.0322, "step": 3313 }, { "epoch": 0.43782409089407803, "grad_norm": 0.31292450428009033, "learning_rate": 0.0001196338511348986, "loss": 0.0311, "step": 3314 }, { "epoch": 0.43795620437956206, "grad_norm": 0.21444229781627655, "learning_rate": 0.00011959309741972369, "loss": 0.025, "step": 3315 }, { "epoch": 0.4380883178650461, "grad_norm": 0.2052425593137741, "learning_rate": 0.00011955234032020752, "loss": 0.0196, "step": 3316 }, { "epoch": 0.4382204313505301, "grad_norm": 0.3350735306739807, "learning_rate": 0.00011951157984339014, "loss": 0.0315, "step": 3317 }, { "epoch": 0.43835254483601416, "grad_norm": 0.1974852830171585, "learning_rate": 0.00011947081599631218, "loss": 0.0219, "step": 3318 }, { "epoch": 0.4384846583214982, "grad_norm": 0.1646255999803543, "learning_rate": 0.0001194300487860148, "loss": 0.0301, "step": 3319 }, { "epoch": 0.4386167718069822, "grad_norm": 0.15746988356113434, "learning_rate": 0.00011938927821953978, "loss": 0.0203, "step": 3320 }, { "epoch": 0.43874888529246625, "grad_norm": 0.10100744664669037, "learning_rate": 0.00011934850430392948, "loss": 0.0134, "step": 3321 }, { "epoch": 0.4388809987779503, "grad_norm": 0.21147461235523224, "learning_rate": 0.00011930772704622679, "loss": 0.0242, "step": 3322 }, { "epoch": 0.4390131122634343, "grad_norm": 0.17001758515834808, "learning_rate": 0.00011926694645347529, "loss": 0.0154, "step": 3323 }, { "epoch": 0.43914522574891834, "grad_norm": 0.11639127880334854, "learning_rate": 0.00011922616253271901, "loss": 0.0125, "step": 3324 }, { "epoch": 0.4392773392344024, "grad_norm": 0.19367121160030365, "learning_rate": 0.00011918537529100264, "loss": 0.0214, "step": 3325 }, { "epoch": 0.4394094527198864, "grad_norm": 0.08695891499519348, "learning_rate": 0.00011914458473537142, "loss": 0.0081, "step": 3326 }, { "epoch": 0.43954156620537044, "grad_norm": 0.17832736670970917, "learning_rate": 0.00011910379087287111, "loss": 0.0195, "step": 3327 }, { "epoch": 0.43967367969085447, "grad_norm": 0.2097679078578949, "learning_rate": 0.00011906299371054814, "loss": 0.0252, "step": 3328 }, { "epoch": 0.4398057931763385, "grad_norm": 0.1070525124669075, "learning_rate": 0.00011902219325544939, "loss": 0.0087, "step": 3329 }, { "epoch": 0.43993790666182253, "grad_norm": 0.2027360051870346, "learning_rate": 0.00011898138951462248, "loss": 0.0172, "step": 3330 }, { "epoch": 0.44007002014730656, "grad_norm": 0.22899967432022095, "learning_rate": 0.00011894058249511537, "loss": 0.0203, "step": 3331 }, { "epoch": 0.4402021336327906, "grad_norm": 0.2035856544971466, "learning_rate": 0.00011889977220397682, "loss": 0.0221, "step": 3332 }, { "epoch": 0.4403342471182746, "grad_norm": 0.17923541367053986, "learning_rate": 0.00011885895864825599, "loss": 0.0147, "step": 3333 }, { "epoch": 0.44046636060375866, "grad_norm": 0.16749948263168335, "learning_rate": 0.00011881814183500262, "loss": 0.0203, "step": 3334 }, { "epoch": 0.4405984740892427, "grad_norm": 0.1700792908668518, "learning_rate": 0.00011877732177126715, "loss": 0.0207, "step": 3335 }, { "epoch": 0.4407305875747267, "grad_norm": 0.19713635742664337, "learning_rate": 0.00011873649846410038, "loss": 0.0146, "step": 3336 }, { "epoch": 0.4408627010602107, "grad_norm": 0.2588673233985901, "learning_rate": 0.00011869567192055382, "loss": 0.0296, "step": 3337 }, { "epoch": 0.4409948145456947, "grad_norm": 0.25432080030441284, "learning_rate": 0.00011865484214767955, "loss": 0.0244, "step": 3338 }, { "epoch": 0.44112692803117876, "grad_norm": 0.23010925948619843, "learning_rate": 0.00011861400915253005, "loss": 0.0267, "step": 3339 }, { "epoch": 0.4412590415166628, "grad_norm": 0.18061229586601257, "learning_rate": 0.00011857317294215851, "loss": 0.0214, "step": 3340 }, { "epoch": 0.4413911550021468, "grad_norm": 0.10983574390411377, "learning_rate": 0.0001185323335236186, "loss": 0.0131, "step": 3341 }, { "epoch": 0.44152326848763085, "grad_norm": 0.5960947275161743, "learning_rate": 0.00011849149090396461, "loss": 0.0207, "step": 3342 }, { "epoch": 0.4416553819731149, "grad_norm": 0.13170459866523743, "learning_rate": 0.0001184506450902513, "loss": 0.0177, "step": 3343 }, { "epoch": 0.4417874954585989, "grad_norm": 0.14053009450435638, "learning_rate": 0.000118409796089534, "loss": 0.0157, "step": 3344 }, { "epoch": 0.44191960894408294, "grad_norm": 0.17476029694080353, "learning_rate": 0.00011836894390886866, "loss": 0.013, "step": 3345 }, { "epoch": 0.442051722429567, "grad_norm": 0.1530439704656601, "learning_rate": 0.00011832808855531171, "loss": 0.0108, "step": 3346 }, { "epoch": 0.442183835915051, "grad_norm": 0.2883946895599365, "learning_rate": 0.00011828723003592015, "loss": 0.0223, "step": 3347 }, { "epoch": 0.44231594940053504, "grad_norm": 0.16099697351455688, "learning_rate": 0.00011824636835775149, "loss": 0.0209, "step": 3348 }, { "epoch": 0.44244806288601907, "grad_norm": 0.1955551952123642, "learning_rate": 0.00011820550352786388, "loss": 0.0174, "step": 3349 }, { "epoch": 0.4425801763715031, "grad_norm": 0.2711743414402008, "learning_rate": 0.00011816463555331594, "loss": 0.0186, "step": 3350 }, { "epoch": 0.44271228985698713, "grad_norm": 0.2542886734008789, "learning_rate": 0.00011812376444116681, "loss": 0.0306, "step": 3351 }, { "epoch": 0.44284440334247116, "grad_norm": 0.33619576692581177, "learning_rate": 0.00011808289019847627, "loss": 0.0232, "step": 3352 }, { "epoch": 0.4429765168279552, "grad_norm": 0.21356289088726044, "learning_rate": 0.00011804201283230452, "loss": 0.022, "step": 3353 }, { "epoch": 0.4431086303134392, "grad_norm": 0.28175997734069824, "learning_rate": 0.00011800113234971242, "loss": 0.0265, "step": 3354 }, { "epoch": 0.44324074379892325, "grad_norm": 0.16907796263694763, "learning_rate": 0.00011796024875776123, "loss": 0.0251, "step": 3355 }, { "epoch": 0.4433728572844073, "grad_norm": 0.16458839178085327, "learning_rate": 0.00011791936206351293, "loss": 0.021, "step": 3356 }, { "epoch": 0.4435049707698913, "grad_norm": 0.12611019611358643, "learning_rate": 0.00011787847227402986, "loss": 0.0074, "step": 3357 }, { "epoch": 0.44363708425537535, "grad_norm": 0.2491726577281952, "learning_rate": 0.00011783757939637494, "loss": 0.0272, "step": 3358 }, { "epoch": 0.4437691977408594, "grad_norm": 0.124544657766819, "learning_rate": 0.00011779668343761174, "loss": 0.01, "step": 3359 }, { "epoch": 0.4439013112263434, "grad_norm": 0.2654660940170288, "learning_rate": 0.00011775578440480421, "loss": 0.025, "step": 3360 }, { "epoch": 0.44403342471182744, "grad_norm": 0.24884164333343506, "learning_rate": 0.00011771488230501692, "loss": 0.025, "step": 3361 }, { "epoch": 0.44416553819731147, "grad_norm": 0.11097685992717743, "learning_rate": 0.00011767397714531492, "loss": 0.0079, "step": 3362 }, { "epoch": 0.4442976516827955, "grad_norm": 0.21636100113391876, "learning_rate": 0.00011763306893276382, "loss": 0.0261, "step": 3363 }, { "epoch": 0.44442976516827953, "grad_norm": 0.1294645369052887, "learning_rate": 0.00011759215767442977, "loss": 0.0091, "step": 3364 }, { "epoch": 0.44456187865376356, "grad_norm": 0.20058712363243103, "learning_rate": 0.00011755124337737937, "loss": 0.0178, "step": 3365 }, { "epoch": 0.4446939921392476, "grad_norm": 0.13472117483615875, "learning_rate": 0.00011751032604867987, "loss": 0.0172, "step": 3366 }, { "epoch": 0.4448261056247316, "grad_norm": 0.2171383798122406, "learning_rate": 0.00011746940569539893, "loss": 0.025, "step": 3367 }, { "epoch": 0.44495821911021566, "grad_norm": 0.18214093148708344, "learning_rate": 0.00011742848232460479, "loss": 0.0239, "step": 3368 }, { "epoch": 0.4450903325956997, "grad_norm": 0.23197366297245026, "learning_rate": 0.0001173875559433662, "loss": 0.0231, "step": 3369 }, { "epoch": 0.4452224460811837, "grad_norm": 0.24646830558776855, "learning_rate": 0.00011734662655875242, "loss": 0.0222, "step": 3370 }, { "epoch": 0.44535455956666775, "grad_norm": 0.25738704204559326, "learning_rate": 0.00011730569417783322, "loss": 0.0339, "step": 3371 }, { "epoch": 0.4454866730521518, "grad_norm": 0.2571242153644562, "learning_rate": 0.00011726475880767893, "loss": 0.0306, "step": 3372 }, { "epoch": 0.4456187865376358, "grad_norm": 0.18332485854625702, "learning_rate": 0.00011722382045536036, "loss": 0.0189, "step": 3373 }, { "epoch": 0.44575090002311984, "grad_norm": 0.28679078817367554, "learning_rate": 0.00011718287912794885, "loss": 0.0228, "step": 3374 }, { "epoch": 0.4458830135086039, "grad_norm": 0.21024517714977264, "learning_rate": 0.00011714193483251623, "loss": 0.0338, "step": 3375 }, { "epoch": 0.4460151269940879, "grad_norm": 0.2537991404533386, "learning_rate": 0.0001171009875761349, "loss": 0.0211, "step": 3376 }, { "epoch": 0.44614724047957194, "grad_norm": 0.21349625289440155, "learning_rate": 0.00011706003736587768, "loss": 0.0214, "step": 3377 }, { "epoch": 0.44627935396505597, "grad_norm": 0.21972428262233734, "learning_rate": 0.00011701908420881799, "loss": 0.0275, "step": 3378 }, { "epoch": 0.44641146745054, "grad_norm": 0.16195261478424072, "learning_rate": 0.00011697812811202971, "loss": 0.0194, "step": 3379 }, { "epoch": 0.44654358093602403, "grad_norm": 0.3250785171985626, "learning_rate": 0.00011693716908258727, "loss": 0.0283, "step": 3380 }, { "epoch": 0.44667569442150806, "grad_norm": 0.1459767371416092, "learning_rate": 0.00011689620712756553, "loss": 0.0197, "step": 3381 }, { "epoch": 0.4468078079069921, "grad_norm": 0.18223801255226135, "learning_rate": 0.00011685524225403993, "loss": 0.0247, "step": 3382 }, { "epoch": 0.4469399213924761, "grad_norm": 0.14529427886009216, "learning_rate": 0.00011681427446908637, "loss": 0.0164, "step": 3383 }, { "epoch": 0.44707203487796016, "grad_norm": 0.1351812779903412, "learning_rate": 0.00011677330377978127, "loss": 0.0119, "step": 3384 }, { "epoch": 0.4472041483634442, "grad_norm": 0.20510388910770416, "learning_rate": 0.00011673233019320155, "loss": 0.0201, "step": 3385 }, { "epoch": 0.4473362618489282, "grad_norm": 0.2545018196105957, "learning_rate": 0.00011669135371642465, "loss": 0.0263, "step": 3386 }, { "epoch": 0.44746837533441225, "grad_norm": 0.1861943006515503, "learning_rate": 0.00011665037435652849, "loss": 0.0287, "step": 3387 }, { "epoch": 0.4476004888198963, "grad_norm": 0.2189074158668518, "learning_rate": 0.00011660939212059147, "loss": 0.0224, "step": 3388 }, { "epoch": 0.4477326023053803, "grad_norm": 0.18213927745819092, "learning_rate": 0.0001165684070156925, "loss": 0.0196, "step": 3389 }, { "epoch": 0.44786471579086434, "grad_norm": 0.2507062256336212, "learning_rate": 0.000116527419048911, "loss": 0.0192, "step": 3390 }, { "epoch": 0.4479968292763484, "grad_norm": 0.17222446203231812, "learning_rate": 0.0001164864282273269, "loss": 0.016, "step": 3391 }, { "epoch": 0.4481289427618324, "grad_norm": 0.1732499599456787, "learning_rate": 0.00011644543455802055, "loss": 0.0221, "step": 3392 }, { "epoch": 0.44826105624731644, "grad_norm": 0.22399833798408508, "learning_rate": 0.00011640443804807286, "loss": 0.0317, "step": 3393 }, { "epoch": 0.44839316973280047, "grad_norm": 0.21018847823143005, "learning_rate": 0.00011636343870456523, "loss": 0.0217, "step": 3394 }, { "epoch": 0.4485252832182845, "grad_norm": 0.231175497174263, "learning_rate": 0.00011632243653457952, "loss": 0.0271, "step": 3395 }, { "epoch": 0.44865739670376853, "grad_norm": 0.26954904198646545, "learning_rate": 0.00011628143154519806, "loss": 0.023, "step": 3396 }, { "epoch": 0.44878951018925256, "grad_norm": 0.19411055743694305, "learning_rate": 0.00011624042374350377, "loss": 0.0201, "step": 3397 }, { "epoch": 0.4489216236747366, "grad_norm": 0.1828741729259491, "learning_rate": 0.00011619941313657987, "loss": 0.0093, "step": 3398 }, { "epoch": 0.4490537371602206, "grad_norm": 0.14031386375427246, "learning_rate": 0.00011615839973151028, "loss": 0.0148, "step": 3399 }, { "epoch": 0.44918585064570465, "grad_norm": 0.1643749177455902, "learning_rate": 0.00011611738353537924, "loss": 0.0149, "step": 3400 }, { "epoch": 0.4493179641311887, "grad_norm": 0.16316166520118713, "learning_rate": 0.00011607636455527155, "loss": 0.0116, "step": 3401 }, { "epoch": 0.4494500776166727, "grad_norm": 0.2305441051721573, "learning_rate": 0.00011603534279827246, "loss": 0.031, "step": 3402 }, { "epoch": 0.44958219110215675, "grad_norm": 0.20265796780586243, "learning_rate": 0.00011599431827146772, "loss": 0.0157, "step": 3403 }, { "epoch": 0.4497143045876408, "grad_norm": 0.19166412949562073, "learning_rate": 0.00011595329098194354, "loss": 0.0215, "step": 3404 }, { "epoch": 0.4498464180731248, "grad_norm": 0.19884948432445526, "learning_rate": 0.00011591226093678665, "loss": 0.0213, "step": 3405 }, { "epoch": 0.44997853155860884, "grad_norm": 0.15096840262413025, "learning_rate": 0.00011587122814308418, "loss": 0.0101, "step": 3406 }, { "epoch": 0.45011064504409287, "grad_norm": 0.17579104006290436, "learning_rate": 0.0001158301926079238, "loss": 0.0151, "step": 3407 }, { "epoch": 0.4502427585295769, "grad_norm": 0.2230728715658188, "learning_rate": 0.00011578915433839364, "loss": 0.0164, "step": 3408 }, { "epoch": 0.45037487201506093, "grad_norm": 0.17997904121875763, "learning_rate": 0.00011574811334158227, "loss": 0.0257, "step": 3409 }, { "epoch": 0.45050698550054497, "grad_norm": 0.16918472945690155, "learning_rate": 0.00011570706962457876, "loss": 0.0124, "step": 3410 }, { "epoch": 0.450639098986029, "grad_norm": 0.2919192910194397, "learning_rate": 0.00011566602319447266, "loss": 0.0235, "step": 3411 }, { "epoch": 0.450771212471513, "grad_norm": 0.20782966911792755, "learning_rate": 0.00011562497405835396, "loss": 0.0208, "step": 3412 }, { "epoch": 0.45090332595699706, "grad_norm": 0.17539459466934204, "learning_rate": 0.00011558392222331313, "loss": 0.0162, "step": 3413 }, { "epoch": 0.4510354394424811, "grad_norm": 0.17549258470535278, "learning_rate": 0.00011554286769644113, "loss": 0.0139, "step": 3414 }, { "epoch": 0.4511675529279651, "grad_norm": 0.1104462742805481, "learning_rate": 0.00011550181048482936, "loss": 0.0116, "step": 3415 }, { "epoch": 0.45129966641344915, "grad_norm": 0.17143550515174866, "learning_rate": 0.00011546075059556965, "loss": 0.0175, "step": 3416 }, { "epoch": 0.4514317798989332, "grad_norm": 0.18821991980075836, "learning_rate": 0.00011541968803575433, "loss": 0.0196, "step": 3417 }, { "epoch": 0.4515638933844172, "grad_norm": 0.178142711520195, "learning_rate": 0.00011537862281247624, "loss": 0.0218, "step": 3418 }, { "epoch": 0.45169600686990125, "grad_norm": 0.3631950616836548, "learning_rate": 0.00011533755493282857, "loss": 0.0212, "step": 3419 }, { "epoch": 0.4518281203553853, "grad_norm": 0.16244377195835114, "learning_rate": 0.00011529648440390508, "loss": 0.0195, "step": 3420 }, { "epoch": 0.4519602338408693, "grad_norm": 0.09488275647163391, "learning_rate": 0.00011525541123279991, "loss": 0.0109, "step": 3421 }, { "epoch": 0.45209234732635334, "grad_norm": 0.24750275909900665, "learning_rate": 0.00011521433542660767, "loss": 0.0297, "step": 3422 }, { "epoch": 0.45222446081183737, "grad_norm": 0.13846619427204132, "learning_rate": 0.00011517325699242345, "loss": 0.0183, "step": 3423 }, { "epoch": 0.4523565742973214, "grad_norm": 0.15736238658428192, "learning_rate": 0.00011513217593734277, "loss": 0.0101, "step": 3424 }, { "epoch": 0.45248868778280543, "grad_norm": 0.19229941070079803, "learning_rate": 0.00011509109226846164, "loss": 0.0167, "step": 3425 }, { "epoch": 0.45262080126828946, "grad_norm": 0.178132563829422, "learning_rate": 0.0001150500059928765, "loss": 0.0194, "step": 3426 }, { "epoch": 0.4527529147537735, "grad_norm": 0.21312348544597626, "learning_rate": 0.0001150089171176842, "loss": 0.0297, "step": 3427 }, { "epoch": 0.4528850282392575, "grad_norm": 0.20051683485507965, "learning_rate": 0.0001149678256499821, "loss": 0.0126, "step": 3428 }, { "epoch": 0.45301714172474156, "grad_norm": 0.22905012965202332, "learning_rate": 0.00011492673159686797, "loss": 0.0224, "step": 3429 }, { "epoch": 0.4531492552102256, "grad_norm": 0.25302499532699585, "learning_rate": 0.00011488563496544007, "loss": 0.0322, "step": 3430 }, { "epoch": 0.4532813686957096, "grad_norm": 0.20484831929206848, "learning_rate": 0.00011484453576279703, "loss": 0.0148, "step": 3431 }, { "epoch": 0.45341348218119365, "grad_norm": 0.22035667300224304, "learning_rate": 0.00011480343399603799, "loss": 0.0113, "step": 3432 }, { "epoch": 0.4535455956666777, "grad_norm": 0.2351701557636261, "learning_rate": 0.00011476232967226252, "loss": 0.0262, "step": 3433 }, { "epoch": 0.4536777091521617, "grad_norm": 0.14787419140338898, "learning_rate": 0.00011472122279857061, "loss": 0.0113, "step": 3434 }, { "epoch": 0.45380982263764574, "grad_norm": 0.47793006896972656, "learning_rate": 0.00011468011338206271, "loss": 0.037, "step": 3435 }, { "epoch": 0.4539419361231298, "grad_norm": 0.1750878393650055, "learning_rate": 0.0001146390014298397, "loss": 0.0179, "step": 3436 }, { "epoch": 0.4540740496086138, "grad_norm": 0.3732263147830963, "learning_rate": 0.00011459788694900289, "loss": 0.0284, "step": 3437 }, { "epoch": 0.45420616309409784, "grad_norm": 0.3407856822013855, "learning_rate": 0.00011455676994665407, "loss": 0.0347, "step": 3438 }, { "epoch": 0.45433827657958187, "grad_norm": 0.2009911835193634, "learning_rate": 0.0001145156504298954, "loss": 0.0255, "step": 3439 }, { "epoch": 0.4544703900650659, "grad_norm": 0.14986108243465424, "learning_rate": 0.00011447452840582952, "loss": 0.0158, "step": 3440 }, { "epoch": 0.45460250355054993, "grad_norm": 0.264070987701416, "learning_rate": 0.0001144334038815595, "loss": 0.023, "step": 3441 }, { "epoch": 0.45473461703603396, "grad_norm": 0.27384451031684875, "learning_rate": 0.00011439227686418883, "loss": 0.0191, "step": 3442 }, { "epoch": 0.454866730521518, "grad_norm": 0.2228957712650299, "learning_rate": 0.00011435114736082142, "loss": 0.0204, "step": 3443 }, { "epoch": 0.454998844007002, "grad_norm": 0.144685298204422, "learning_rate": 0.00011431001537856163, "loss": 0.0122, "step": 3444 }, { "epoch": 0.45513095749248605, "grad_norm": 0.1886610984802246, "learning_rate": 0.00011426888092451427, "loss": 0.0216, "step": 3445 }, { "epoch": 0.4552630709779701, "grad_norm": 0.17720267176628113, "learning_rate": 0.00011422774400578446, "loss": 0.0191, "step": 3446 }, { "epoch": 0.4553951844634541, "grad_norm": 0.30038321018218994, "learning_rate": 0.00011418660462947795, "loss": 0.0262, "step": 3447 }, { "epoch": 0.45552729794893815, "grad_norm": 0.18168781697750092, "learning_rate": 0.0001141454628027007, "loss": 0.0343, "step": 3448 }, { "epoch": 0.4556594114344222, "grad_norm": 0.2796495258808136, "learning_rate": 0.00011410431853255925, "loss": 0.0247, "step": 3449 }, { "epoch": 0.4557915249199062, "grad_norm": 0.23623229563236237, "learning_rate": 0.00011406317182616049, "loss": 0.0114, "step": 3450 }, { "epoch": 0.45592363840539024, "grad_norm": 0.1999683976173401, "learning_rate": 0.00011402202269061173, "loss": 0.0274, "step": 3451 }, { "epoch": 0.4560557518908743, "grad_norm": 0.2691020965576172, "learning_rate": 0.00011398087113302074, "loss": 0.0258, "step": 3452 }, { "epoch": 0.4561878653763583, "grad_norm": 0.31196510791778564, "learning_rate": 0.00011393971716049563, "loss": 0.0261, "step": 3453 }, { "epoch": 0.45631997886184233, "grad_norm": 0.14454197883605957, "learning_rate": 0.00011389856078014504, "loss": 0.0219, "step": 3454 }, { "epoch": 0.45645209234732637, "grad_norm": 0.20809254050254822, "learning_rate": 0.00011385740199907792, "loss": 0.0218, "step": 3455 }, { "epoch": 0.4565842058328104, "grad_norm": 0.21281687915325165, "learning_rate": 0.00011381624082440374, "loss": 0.0202, "step": 3456 }, { "epoch": 0.45671631931829443, "grad_norm": 0.11839515715837479, "learning_rate": 0.00011377507726323227, "loss": 0.0125, "step": 3457 }, { "epoch": 0.45684843280377846, "grad_norm": 0.18069903552532196, "learning_rate": 0.00011373391132267374, "loss": 0.0206, "step": 3458 }, { "epoch": 0.4569805462892625, "grad_norm": 0.11462843418121338, "learning_rate": 0.00011369274300983886, "loss": 0.0157, "step": 3459 }, { "epoch": 0.4571126597747465, "grad_norm": 0.2575992941856384, "learning_rate": 0.00011365157233183858, "loss": 0.0355, "step": 3460 }, { "epoch": 0.45724477326023055, "grad_norm": 0.1611703485250473, "learning_rate": 0.00011361039929578447, "loss": 0.0135, "step": 3461 }, { "epoch": 0.4573768867457146, "grad_norm": 0.2228485494852066, "learning_rate": 0.00011356922390878834, "loss": 0.0359, "step": 3462 }, { "epoch": 0.4575090002311986, "grad_norm": 0.15648458898067474, "learning_rate": 0.00011352804617796251, "loss": 0.013, "step": 3463 }, { "epoch": 0.45764111371668265, "grad_norm": 0.1394144892692566, "learning_rate": 0.00011348686611041963, "loss": 0.0195, "step": 3464 }, { "epoch": 0.4577732272021667, "grad_norm": 0.17651647329330444, "learning_rate": 0.00011344568371327277, "loss": 0.0174, "step": 3465 }, { "epoch": 0.4579053406876507, "grad_norm": 0.15418827533721924, "learning_rate": 0.00011340449899363547, "loss": 0.0238, "step": 3466 }, { "epoch": 0.45803745417313474, "grad_norm": 0.09896305948495865, "learning_rate": 0.00011336331195862159, "loss": 0.0098, "step": 3467 }, { "epoch": 0.45816956765861877, "grad_norm": 0.17593730986118317, "learning_rate": 0.00011332212261534545, "loss": 0.0188, "step": 3468 }, { "epoch": 0.4583016811441028, "grad_norm": 0.13243882358074188, "learning_rate": 0.00011328093097092168, "loss": 0.0102, "step": 3469 }, { "epoch": 0.45843379462958683, "grad_norm": 0.16697779297828674, "learning_rate": 0.00011323973703246542, "loss": 0.0152, "step": 3470 }, { "epoch": 0.45856590811507086, "grad_norm": 0.15453162789344788, "learning_rate": 0.00011319854080709215, "loss": 0.0202, "step": 3471 }, { "epoch": 0.4586980216005549, "grad_norm": 0.200933039188385, "learning_rate": 0.0001131573423019177, "loss": 0.0288, "step": 3472 }, { "epoch": 0.4588301350860389, "grad_norm": 0.21381884813308716, "learning_rate": 0.0001131161415240584, "loss": 0.0181, "step": 3473 }, { "epoch": 0.45896224857152296, "grad_norm": 0.32991519570350647, "learning_rate": 0.00011307493848063086, "loss": 0.033, "step": 3474 }, { "epoch": 0.459094362057007, "grad_norm": 0.21740446984767914, "learning_rate": 0.0001130337331787522, "loss": 0.0132, "step": 3475 }, { "epoch": 0.459226475542491, "grad_norm": 0.11426756531000137, "learning_rate": 0.00011299252562553979, "loss": 0.0115, "step": 3476 }, { "epoch": 0.45935858902797505, "grad_norm": 0.14595068991184235, "learning_rate": 0.00011295131582811153, "loss": 0.0149, "step": 3477 }, { "epoch": 0.4594907025134591, "grad_norm": 0.1178387925028801, "learning_rate": 0.00011291010379358563, "loss": 0.0137, "step": 3478 }, { "epoch": 0.4596228159989431, "grad_norm": 0.23638631403446198, "learning_rate": 0.00011286888952908063, "loss": 0.0201, "step": 3479 }, { "epoch": 0.45975492948442714, "grad_norm": 0.1705041527748108, "learning_rate": 0.00011282767304171562, "loss": 0.0146, "step": 3480 }, { "epoch": 0.4598870429699112, "grad_norm": 0.12411267310380936, "learning_rate": 0.00011278645433860991, "loss": 0.0216, "step": 3481 }, { "epoch": 0.4600191564553952, "grad_norm": 0.11123984307050705, "learning_rate": 0.00011274523342688328, "loss": 0.0155, "step": 3482 }, { "epoch": 0.46015126994087924, "grad_norm": 0.17560520768165588, "learning_rate": 0.00011270401031365592, "loss": 0.0209, "step": 3483 }, { "epoch": 0.46028338342636327, "grad_norm": 0.15862911939620972, "learning_rate": 0.00011266278500604826, "loss": 0.0203, "step": 3484 }, { "epoch": 0.4604154969118473, "grad_norm": 0.14899875223636627, "learning_rate": 0.00011262155751118128, "loss": 0.0154, "step": 3485 }, { "epoch": 0.46054761039733133, "grad_norm": 0.1562994122505188, "learning_rate": 0.0001125803278361762, "loss": 0.0148, "step": 3486 }, { "epoch": 0.46067972388281536, "grad_norm": 0.11963161826133728, "learning_rate": 0.00011253909598815474, "loss": 0.0105, "step": 3487 }, { "epoch": 0.4608118373682994, "grad_norm": 0.14713045954704285, "learning_rate": 0.00011249786197423888, "loss": 0.0233, "step": 3488 }, { "epoch": 0.4609439508537834, "grad_norm": 0.1988755166530609, "learning_rate": 0.00011245662580155102, "loss": 0.0227, "step": 3489 }, { "epoch": 0.46107606433926746, "grad_norm": 0.15347225964069366, "learning_rate": 0.00011241538747721401, "loss": 0.0145, "step": 3490 }, { "epoch": 0.4612081778247515, "grad_norm": 0.18890933692455292, "learning_rate": 0.00011237414700835089, "loss": 0.0163, "step": 3491 }, { "epoch": 0.4613402913102355, "grad_norm": 0.19923219084739685, "learning_rate": 0.00011233290440208528, "loss": 0.0233, "step": 3492 }, { "epoch": 0.46147240479571955, "grad_norm": 0.12134901434183121, "learning_rate": 0.00011229165966554098, "loss": 0.013, "step": 3493 }, { "epoch": 0.4616045182812036, "grad_norm": 0.17246650159358978, "learning_rate": 0.00011225041280584234, "loss": 0.0226, "step": 3494 }, { "epoch": 0.4617366317666876, "grad_norm": 0.1895778924226761, "learning_rate": 0.00011220916383011393, "loss": 0.0143, "step": 3495 }, { "epoch": 0.46186874525217164, "grad_norm": 0.3674432337284088, "learning_rate": 0.0001121679127454807, "loss": 0.0286, "step": 3496 }, { "epoch": 0.4620008587376557, "grad_norm": 0.24747756123542786, "learning_rate": 0.00011212665955906812, "loss": 0.0117, "step": 3497 }, { "epoch": 0.4621329722231397, "grad_norm": 0.17657174170017242, "learning_rate": 0.00011208540427800178, "loss": 0.0206, "step": 3498 }, { "epoch": 0.46226508570862374, "grad_norm": 0.18776048719882965, "learning_rate": 0.00011204414690940783, "loss": 0.0157, "step": 3499 }, { "epoch": 0.4623971991941077, "grad_norm": 0.16772229969501495, "learning_rate": 0.0001120028874604127, "loss": 0.0163, "step": 3500 }, { "epoch": 0.46252931267959174, "grad_norm": 0.19655610620975494, "learning_rate": 0.00011196162593814319, "loss": 0.0223, "step": 3501 }, { "epoch": 0.4626614261650758, "grad_norm": 0.1830878108739853, "learning_rate": 0.00011192036234972645, "loss": 0.0224, "step": 3502 }, { "epoch": 0.4627935396505598, "grad_norm": 0.13783776760101318, "learning_rate": 0.00011187909670228998, "loss": 0.0154, "step": 3503 }, { "epoch": 0.46292565313604384, "grad_norm": 0.5649703741073608, "learning_rate": 0.00011183782900296168, "loss": 0.0479, "step": 3504 }, { "epoch": 0.46305776662152787, "grad_norm": 0.16268661618232727, "learning_rate": 0.00011179655925886971, "loss": 0.0173, "step": 3505 }, { "epoch": 0.4631898801070119, "grad_norm": 0.15541048347949982, "learning_rate": 0.00011175528747714272, "loss": 0.0201, "step": 3506 }, { "epoch": 0.46332199359249593, "grad_norm": 0.33864355087280273, "learning_rate": 0.00011171401366490961, "loss": 0.0281, "step": 3507 }, { "epoch": 0.46345410707797996, "grad_norm": 0.1839621514081955, "learning_rate": 0.00011167273782929968, "loss": 0.0165, "step": 3508 }, { "epoch": 0.463586220563464, "grad_norm": 0.17850792407989502, "learning_rate": 0.0001116314599774425, "loss": 0.0181, "step": 3509 }, { "epoch": 0.463718334048948, "grad_norm": 0.18538732826709747, "learning_rate": 0.00011159018011646811, "loss": 0.021, "step": 3510 }, { "epoch": 0.46385044753443205, "grad_norm": 0.21953551471233368, "learning_rate": 0.00011154889825350681, "loss": 0.0184, "step": 3511 }, { "epoch": 0.4639825610199161, "grad_norm": 0.10858046263456345, "learning_rate": 0.00011150761439568925, "loss": 0.0121, "step": 3512 }, { "epoch": 0.4641146745054001, "grad_norm": 0.14788410067558289, "learning_rate": 0.00011146632855014647, "loss": 0.0142, "step": 3513 }, { "epoch": 0.46424678799088415, "grad_norm": 0.219330295920372, "learning_rate": 0.00011142504072400983, "loss": 0.0296, "step": 3514 }, { "epoch": 0.4643789014763682, "grad_norm": 0.12793508172035217, "learning_rate": 0.00011138375092441102, "loss": 0.0138, "step": 3515 }, { "epoch": 0.4645110149618522, "grad_norm": 0.11018357425928116, "learning_rate": 0.00011134245915848209, "loss": 0.0143, "step": 3516 }, { "epoch": 0.46464312844733624, "grad_norm": 0.15088607370853424, "learning_rate": 0.00011130116543335541, "loss": 0.0108, "step": 3517 }, { "epoch": 0.46477524193282027, "grad_norm": 0.19519521296024323, "learning_rate": 0.0001112598697561637, "loss": 0.0208, "step": 3518 }, { "epoch": 0.4649073554183043, "grad_norm": 0.17513087391853333, "learning_rate": 0.00011121857213404, "loss": 0.0241, "step": 3519 }, { "epoch": 0.46503946890378833, "grad_norm": 0.19763897359371185, "learning_rate": 0.00011117727257411776, "loss": 0.0158, "step": 3520 }, { "epoch": 0.46517158238927236, "grad_norm": 0.28380870819091797, "learning_rate": 0.00011113597108353064, "loss": 0.0134, "step": 3521 }, { "epoch": 0.4653036958747564, "grad_norm": 0.3162674307823181, "learning_rate": 0.00011109466766941275, "loss": 0.0371, "step": 3522 }, { "epoch": 0.4654358093602404, "grad_norm": 0.21053746342658997, "learning_rate": 0.00011105336233889845, "loss": 0.0265, "step": 3523 }, { "epoch": 0.46556792284572446, "grad_norm": 0.15278670191764832, "learning_rate": 0.00011101205509912245, "loss": 0.0103, "step": 3524 }, { "epoch": 0.4657000363312085, "grad_norm": 0.127748042345047, "learning_rate": 0.00011097074595721985, "loss": 0.0146, "step": 3525 }, { "epoch": 0.4658321498166925, "grad_norm": 0.3685021698474884, "learning_rate": 0.000110929434920326, "loss": 0.0292, "step": 3526 }, { "epoch": 0.46596426330217655, "grad_norm": 0.23161841928958893, "learning_rate": 0.00011088812199557663, "loss": 0.0256, "step": 3527 }, { "epoch": 0.4660963767876606, "grad_norm": 0.31696006655693054, "learning_rate": 0.00011084680719010777, "loss": 0.0109, "step": 3528 }, { "epoch": 0.4662284902731446, "grad_norm": 0.13080094754695892, "learning_rate": 0.00011080549051105573, "loss": 0.0108, "step": 3529 }, { "epoch": 0.46636060375862864, "grad_norm": 0.2266928255558014, "learning_rate": 0.00011076417196555728, "loss": 0.0227, "step": 3530 }, { "epoch": 0.4664927172441127, "grad_norm": 0.27095434069633484, "learning_rate": 0.00011072285156074935, "loss": 0.0289, "step": 3531 }, { "epoch": 0.4666248307295967, "grad_norm": 0.18671102821826935, "learning_rate": 0.00011068152930376933, "loss": 0.0199, "step": 3532 }, { "epoch": 0.46675694421508074, "grad_norm": 0.18002068996429443, "learning_rate": 0.00011064020520175482, "loss": 0.0119, "step": 3533 }, { "epoch": 0.46688905770056477, "grad_norm": 0.32328853011131287, "learning_rate": 0.00011059887926184382, "loss": 0.0199, "step": 3534 }, { "epoch": 0.4670211711860488, "grad_norm": 0.2010609656572342, "learning_rate": 0.00011055755149117462, "loss": 0.0247, "step": 3535 }, { "epoch": 0.46715328467153283, "grad_norm": 0.2535361647605896, "learning_rate": 0.00011051622189688575, "loss": 0.0382, "step": 3536 }, { "epoch": 0.46728539815701686, "grad_norm": 0.13432317972183228, "learning_rate": 0.00011047489048611619, "loss": 0.0161, "step": 3537 }, { "epoch": 0.4674175116425009, "grad_norm": 0.15032632648944855, "learning_rate": 0.00011043355726600516, "loss": 0.0145, "step": 3538 }, { "epoch": 0.4675496251279849, "grad_norm": 0.15258188545703888, "learning_rate": 0.0001103922222436922, "loss": 0.0233, "step": 3539 }, { "epoch": 0.46768173861346896, "grad_norm": 0.32376575469970703, "learning_rate": 0.0001103508854263171, "loss": 0.0367, "step": 3540 }, { "epoch": 0.467813852098953, "grad_norm": 0.18361371755599976, "learning_rate": 0.00011030954682102011, "loss": 0.0231, "step": 3541 }, { "epoch": 0.467945965584437, "grad_norm": 0.15508785843849182, "learning_rate": 0.00011026820643494167, "loss": 0.0156, "step": 3542 }, { "epoch": 0.46807807906992105, "grad_norm": 0.18575936555862427, "learning_rate": 0.00011022686427522255, "loss": 0.0214, "step": 3543 }, { "epoch": 0.4682101925554051, "grad_norm": 0.23005272448062897, "learning_rate": 0.00011018552034900385, "loss": 0.0256, "step": 3544 }, { "epoch": 0.4683423060408891, "grad_norm": 0.08953981846570969, "learning_rate": 0.00011014417466342695, "loss": 0.0095, "step": 3545 }, { "epoch": 0.46847441952637314, "grad_norm": 0.19710010290145874, "learning_rate": 0.00011010282722563354, "loss": 0.0146, "step": 3546 }, { "epoch": 0.4686065330118572, "grad_norm": 0.20660652220249176, "learning_rate": 0.00011006147804276563, "loss": 0.0091, "step": 3547 }, { "epoch": 0.4687386464973412, "grad_norm": 4.406435489654541, "learning_rate": 0.0001100201271219655, "loss": 0.0727, "step": 3548 }, { "epoch": 0.46887075998282524, "grad_norm": 0.14570850133895874, "learning_rate": 0.00010997877447037577, "loss": 0.0171, "step": 3549 }, { "epoch": 0.46900287346830927, "grad_norm": 0.23842819035053253, "learning_rate": 0.0001099374200951393, "loss": 0.0273, "step": 3550 }, { "epoch": 0.4691349869537933, "grad_norm": 0.15830504894256592, "learning_rate": 0.00010989606400339933, "loss": 0.0166, "step": 3551 }, { "epoch": 0.46926710043927733, "grad_norm": 0.27519121766090393, "learning_rate": 0.00010985470620229937, "loss": 0.0268, "step": 3552 }, { "epoch": 0.46939921392476136, "grad_norm": 0.21721608936786652, "learning_rate": 0.00010981334669898311, "loss": 0.0126, "step": 3553 }, { "epoch": 0.4695313274102454, "grad_norm": 0.16585609316825867, "learning_rate": 0.00010977198550059471, "loss": 0.0194, "step": 3554 }, { "epoch": 0.4696634408957294, "grad_norm": 0.2184048593044281, "learning_rate": 0.00010973062261427853, "loss": 0.0365, "step": 3555 }, { "epoch": 0.46979555438121345, "grad_norm": 0.17115822434425354, "learning_rate": 0.00010968925804717925, "loss": 0.0128, "step": 3556 }, { "epoch": 0.4699276678666975, "grad_norm": 0.10518503934144974, "learning_rate": 0.00010964789180644175, "loss": 0.0101, "step": 3557 }, { "epoch": 0.4700597813521815, "grad_norm": 0.27113738656044006, "learning_rate": 0.00010960652389921137, "loss": 0.0217, "step": 3558 }, { "epoch": 0.47019189483766555, "grad_norm": 0.15465222299098969, "learning_rate": 0.00010956515433263361, "loss": 0.0168, "step": 3559 }, { "epoch": 0.4703240083231496, "grad_norm": 0.1740829050540924, "learning_rate": 0.00010952378311385426, "loss": 0.0215, "step": 3560 }, { "epoch": 0.4704561218086336, "grad_norm": 0.2015177458524704, "learning_rate": 0.00010948241025001947, "loss": 0.021, "step": 3561 }, { "epoch": 0.47058823529411764, "grad_norm": 0.14692965149879456, "learning_rate": 0.00010944103574827555, "loss": 0.0231, "step": 3562 }, { "epoch": 0.47072034877960167, "grad_norm": 0.1983291208744049, "learning_rate": 0.00010939965961576927, "loss": 0.0203, "step": 3563 }, { "epoch": 0.4708524622650857, "grad_norm": 0.17785078287124634, "learning_rate": 0.00010935828185964754, "loss": 0.0087, "step": 3564 }, { "epoch": 0.47098457575056973, "grad_norm": 0.1965969204902649, "learning_rate": 0.00010931690248705759, "loss": 0.0202, "step": 3565 }, { "epoch": 0.47111668923605377, "grad_norm": 0.2205072045326233, "learning_rate": 0.00010927552150514693, "loss": 0.0209, "step": 3566 }, { "epoch": 0.4712488027215378, "grad_norm": 0.20020027458667755, "learning_rate": 0.00010923413892106335, "loss": 0.0207, "step": 3567 }, { "epoch": 0.4713809162070218, "grad_norm": 0.17927613854408264, "learning_rate": 0.00010919275474195494, "loss": 0.021, "step": 3568 }, { "epoch": 0.47151302969250586, "grad_norm": 0.22265446186065674, "learning_rate": 0.00010915136897497, "loss": 0.0246, "step": 3569 }, { "epoch": 0.4716451431779899, "grad_norm": 0.14600390195846558, "learning_rate": 0.00010910998162725718, "loss": 0.0116, "step": 3570 }, { "epoch": 0.4717772566634739, "grad_norm": 0.2514609098434448, "learning_rate": 0.00010906859270596541, "loss": 0.0244, "step": 3571 }, { "epoch": 0.47190937014895795, "grad_norm": 0.2012319266796112, "learning_rate": 0.00010902720221824376, "loss": 0.0261, "step": 3572 }, { "epoch": 0.472041483634442, "grad_norm": 0.28338226675987244, "learning_rate": 0.00010898581017124174, "loss": 0.0229, "step": 3573 }, { "epoch": 0.472173597119926, "grad_norm": 0.2096048891544342, "learning_rate": 0.00010894441657210898, "loss": 0.0372, "step": 3574 }, { "epoch": 0.47230571060541005, "grad_norm": 0.1837790608406067, "learning_rate": 0.00010890302142799555, "loss": 0.0142, "step": 3575 }, { "epoch": 0.4724378240908941, "grad_norm": 0.14642560482025146, "learning_rate": 0.00010886162474605159, "loss": 0.0154, "step": 3576 }, { "epoch": 0.4725699375763781, "grad_norm": 0.2872735559940338, "learning_rate": 0.00010882022653342767, "loss": 0.0238, "step": 3577 }, { "epoch": 0.47270205106186214, "grad_norm": 0.16493089497089386, "learning_rate": 0.00010877882679727453, "loss": 0.0247, "step": 3578 }, { "epoch": 0.47283416454734617, "grad_norm": 0.4759635329246521, "learning_rate": 0.00010873742554474317, "loss": 0.0378, "step": 3579 }, { "epoch": 0.4729662780328302, "grad_norm": 0.4725508391857147, "learning_rate": 0.00010869602278298496, "loss": 0.0363, "step": 3580 }, { "epoch": 0.47309839151831423, "grad_norm": 0.16512827575206757, "learning_rate": 0.00010865461851915138, "loss": 0.015, "step": 3581 }, { "epoch": 0.47323050500379826, "grad_norm": 0.18740400671958923, "learning_rate": 0.00010861321276039426, "loss": 0.0234, "step": 3582 }, { "epoch": 0.4733626184892823, "grad_norm": 0.14816854894161224, "learning_rate": 0.00010857180551386568, "loss": 0.0206, "step": 3583 }, { "epoch": 0.4734947319747663, "grad_norm": 0.3054364025592804, "learning_rate": 0.00010853039678671799, "loss": 0.0143, "step": 3584 }, { "epoch": 0.47362684546025036, "grad_norm": 0.22532625496387482, "learning_rate": 0.00010848898658610374, "loss": 0.0153, "step": 3585 }, { "epoch": 0.4737589589457344, "grad_norm": 0.19474582374095917, "learning_rate": 0.00010844757491917577, "loss": 0.0163, "step": 3586 }, { "epoch": 0.4738910724312184, "grad_norm": 0.11799314618110657, "learning_rate": 0.00010840616179308718, "loss": 0.0071, "step": 3587 }, { "epoch": 0.47402318591670245, "grad_norm": 0.3400368392467499, "learning_rate": 0.0001083647472149913, "loss": 0.0263, "step": 3588 }, { "epoch": 0.4741552994021865, "grad_norm": 0.2943856120109558, "learning_rate": 0.00010832333119204177, "loss": 0.0295, "step": 3589 }, { "epoch": 0.4742874128876705, "grad_norm": 0.16527043282985687, "learning_rate": 0.00010828191373139238, "loss": 0.0184, "step": 3590 }, { "epoch": 0.47441952637315454, "grad_norm": 0.2068243771791458, "learning_rate": 0.00010824049484019725, "loss": 0.029, "step": 3591 }, { "epoch": 0.4745516398586386, "grad_norm": 0.1771349161863327, "learning_rate": 0.00010819907452561071, "loss": 0.0189, "step": 3592 }, { "epoch": 0.4746837533441226, "grad_norm": 0.1710856556892395, "learning_rate": 0.00010815765279478733, "loss": 0.0162, "step": 3593 }, { "epoch": 0.47481586682960664, "grad_norm": 0.34175270795822144, "learning_rate": 0.00010811622965488198, "loss": 0.0243, "step": 3594 }, { "epoch": 0.47494798031509067, "grad_norm": 0.21491940319538116, "learning_rate": 0.00010807480511304968, "loss": 0.0161, "step": 3595 }, { "epoch": 0.4750800938005747, "grad_norm": 0.17027001082897186, "learning_rate": 0.0001080333791764458, "loss": 0.0174, "step": 3596 }, { "epoch": 0.47521220728605873, "grad_norm": 0.3166252672672272, "learning_rate": 0.00010799195185222584, "loss": 0.0326, "step": 3597 }, { "epoch": 0.47534432077154276, "grad_norm": 0.16182473301887512, "learning_rate": 0.00010795052314754565, "loss": 0.0167, "step": 3598 }, { "epoch": 0.4754764342570268, "grad_norm": 0.26130416989326477, "learning_rate": 0.00010790909306956125, "loss": 0.0296, "step": 3599 }, { "epoch": 0.4756085477425108, "grad_norm": 0.19988104701042175, "learning_rate": 0.00010786766162542884, "loss": 0.0229, "step": 3600 }, { "epoch": 0.47574066122799485, "grad_norm": 0.1847897619009018, "learning_rate": 0.00010782622882230504, "loss": 0.0323, "step": 3601 }, { "epoch": 0.4758727747134789, "grad_norm": 0.2483040988445282, "learning_rate": 0.00010778479466734654, "loss": 0.0287, "step": 3602 }, { "epoch": 0.4760048881989629, "grad_norm": 0.21745510399341583, "learning_rate": 0.0001077433591677103, "loss": 0.0226, "step": 3603 }, { "epoch": 0.47613700168444695, "grad_norm": 0.4640471041202545, "learning_rate": 0.00010770192233055356, "loss": 0.0218, "step": 3604 }, { "epoch": 0.476269115169931, "grad_norm": 0.3053547441959381, "learning_rate": 0.00010766048416303373, "loss": 0.0304, "step": 3605 }, { "epoch": 0.476401228655415, "grad_norm": 0.2106274515390396, "learning_rate": 0.0001076190446723085, "loss": 0.0201, "step": 3606 }, { "epoch": 0.47653334214089904, "grad_norm": 0.187311053276062, "learning_rate": 0.00010757760386553574, "loss": 0.0195, "step": 3607 }, { "epoch": 0.47666545562638307, "grad_norm": 0.13241535425186157, "learning_rate": 0.00010753616174987362, "loss": 0.0199, "step": 3608 }, { "epoch": 0.4767975691118671, "grad_norm": 0.18745078146457672, "learning_rate": 0.00010749471833248046, "loss": 0.0294, "step": 3609 }, { "epoch": 0.47692968259735113, "grad_norm": 0.5134204626083374, "learning_rate": 0.0001074532736205148, "loss": 0.0321, "step": 3610 }, { "epoch": 0.47706179608283517, "grad_norm": 0.43000027537345886, "learning_rate": 0.00010741182762113553, "loss": 0.0283, "step": 3611 }, { "epoch": 0.4771939095683192, "grad_norm": 0.17699387669563293, "learning_rate": 0.00010737038034150158, "loss": 0.0178, "step": 3612 }, { "epoch": 0.47732602305380323, "grad_norm": 0.19420090317726135, "learning_rate": 0.00010732893178877225, "loss": 0.021, "step": 3613 }, { "epoch": 0.47745813653928726, "grad_norm": 0.25928303599357605, "learning_rate": 0.00010728748197010699, "loss": 0.0215, "step": 3614 }, { "epoch": 0.4775902500247713, "grad_norm": 0.149773508310318, "learning_rate": 0.00010724603089266547, "loss": 0.0166, "step": 3615 }, { "epoch": 0.4777223635102553, "grad_norm": 0.1963023990392685, "learning_rate": 0.0001072045785636076, "loss": 0.0231, "step": 3616 }, { "epoch": 0.47785447699573935, "grad_norm": 0.22853727638721466, "learning_rate": 0.00010716312499009346, "loss": 0.0211, "step": 3617 }, { "epoch": 0.4779865904812234, "grad_norm": 0.31241101026535034, "learning_rate": 0.00010712167017928345, "loss": 0.0287, "step": 3618 }, { "epoch": 0.4781187039667074, "grad_norm": 0.1355995535850525, "learning_rate": 0.00010708021413833804, "loss": 0.0135, "step": 3619 }, { "epoch": 0.47825081745219145, "grad_norm": 0.23037013411521912, "learning_rate": 0.00010703875687441804, "loss": 0.0226, "step": 3620 }, { "epoch": 0.4783829309376755, "grad_norm": 0.272693008184433, "learning_rate": 0.00010699729839468437, "loss": 0.0315, "step": 3621 }, { "epoch": 0.4785150444231595, "grad_norm": 0.1348901391029358, "learning_rate": 0.00010695583870629827, "loss": 0.0135, "step": 3622 }, { "epoch": 0.47864715790864354, "grad_norm": 0.3382134437561035, "learning_rate": 0.00010691437781642107, "loss": 0.0371, "step": 3623 }, { "epoch": 0.47877927139412757, "grad_norm": 0.197440043091774, "learning_rate": 0.00010687291573221436, "loss": 0.0207, "step": 3624 }, { "epoch": 0.4789113848796116, "grad_norm": 0.2662707567214966, "learning_rate": 0.00010683145246083999, "loss": 0.0239, "step": 3625 }, { "epoch": 0.47904349836509563, "grad_norm": 0.20408916473388672, "learning_rate": 0.00010678998800945991, "loss": 0.0284, "step": 3626 }, { "epoch": 0.47917561185057966, "grad_norm": 0.16986006498336792, "learning_rate": 0.00010674852238523639, "loss": 0.0239, "step": 3627 }, { "epoch": 0.4793077253360637, "grad_norm": 0.2010790854692459, "learning_rate": 0.00010670705559533178, "loss": 0.0131, "step": 3628 }, { "epoch": 0.4794398388215477, "grad_norm": 0.14181357622146606, "learning_rate": 0.00010666558764690871, "loss": 0.0083, "step": 3629 }, { "epoch": 0.47957195230703176, "grad_norm": 0.3691260814666748, "learning_rate": 0.00010662411854713004, "loss": 0.0312, "step": 3630 }, { "epoch": 0.4797040657925158, "grad_norm": 0.18499897420406342, "learning_rate": 0.00010658264830315872, "loss": 0.0146, "step": 3631 }, { "epoch": 0.4798361792779998, "grad_norm": 0.17759235203266144, "learning_rate": 0.00010654117692215799, "loss": 0.0245, "step": 3632 }, { "epoch": 0.47996829276348385, "grad_norm": 0.21312375366687775, "learning_rate": 0.00010649970441129124, "loss": 0.0209, "step": 3633 }, { "epoch": 0.4801004062489679, "grad_norm": 0.17901498079299927, "learning_rate": 0.0001064582307777221, "loss": 0.0136, "step": 3634 }, { "epoch": 0.4802325197344519, "grad_norm": 0.2698601484298706, "learning_rate": 0.00010641675602861434, "loss": 0.0277, "step": 3635 }, { "epoch": 0.48036463321993594, "grad_norm": 0.17690856754779816, "learning_rate": 0.00010637528017113192, "loss": 0.0177, "step": 3636 }, { "epoch": 0.48049674670542, "grad_norm": 0.21805188059806824, "learning_rate": 0.00010633380321243909, "loss": 0.0103, "step": 3637 }, { "epoch": 0.480628860190904, "grad_norm": 0.16631975769996643, "learning_rate": 0.00010629232515970015, "loss": 0.0256, "step": 3638 }, { "epoch": 0.48076097367638804, "grad_norm": 0.21511219441890717, "learning_rate": 0.00010625084602007972, "loss": 0.0336, "step": 3639 }, { "epoch": 0.48089308716187207, "grad_norm": 0.22643572092056274, "learning_rate": 0.00010620936580074248, "loss": 0.0223, "step": 3640 }, { "epoch": 0.4810252006473561, "grad_norm": 0.13022497296333313, "learning_rate": 0.00010616788450885342, "loss": 0.0157, "step": 3641 }, { "epoch": 0.48115731413284013, "grad_norm": 0.5486253499984741, "learning_rate": 0.00010612640215157766, "loss": 0.0341, "step": 3642 }, { "epoch": 0.48128942761832416, "grad_norm": 0.18713150918483734, "learning_rate": 0.00010608491873608041, "loss": 0.0258, "step": 3643 }, { "epoch": 0.4814215411038082, "grad_norm": 0.24169377982616425, "learning_rate": 0.00010604343426952728, "loss": 0.0195, "step": 3644 }, { "epoch": 0.4815536545892922, "grad_norm": 0.13835279643535614, "learning_rate": 0.00010600194875908382, "loss": 0.0102, "step": 3645 }, { "epoch": 0.48168576807477625, "grad_norm": 0.21671341359615326, "learning_rate": 0.00010596046221191598, "loss": 0.0235, "step": 3646 }, { "epoch": 0.4818178815602603, "grad_norm": 0.17495031654834747, "learning_rate": 0.00010591897463518969, "loss": 0.018, "step": 3647 }, { "epoch": 0.4819499950457443, "grad_norm": 0.2383643090724945, "learning_rate": 0.00010587748603607124, "loss": 0.0145, "step": 3648 }, { "epoch": 0.48208210853122835, "grad_norm": 0.1689545065164566, "learning_rate": 0.00010583599642172697, "loss": 0.0169, "step": 3649 }, { "epoch": 0.4822142220167124, "grad_norm": 0.12825123965740204, "learning_rate": 0.00010579450579932342, "loss": 0.0108, "step": 3650 }, { "epoch": 0.4823463355021964, "grad_norm": 0.17009027302265167, "learning_rate": 0.00010575301417602734, "loss": 0.0175, "step": 3651 }, { "epoch": 0.48247844898768044, "grad_norm": 0.14984194934368134, "learning_rate": 0.00010571152155900561, "loss": 0.0186, "step": 3652 }, { "epoch": 0.4826105624731645, "grad_norm": 0.15858642756938934, "learning_rate": 0.00010567002795542535, "loss": 0.0225, "step": 3653 }, { "epoch": 0.4827426759586485, "grad_norm": 0.24578054249286652, "learning_rate": 0.00010562853337245373, "loss": 0.022, "step": 3654 }, { "epoch": 0.48287478944413254, "grad_norm": 0.14686349034309387, "learning_rate": 0.00010558703781725825, "loss": 0.0254, "step": 3655 }, { "epoch": 0.48300690292961657, "grad_norm": 0.23595882952213287, "learning_rate": 0.00010554554129700646, "loss": 0.0196, "step": 3656 }, { "epoch": 0.4831390164151006, "grad_norm": 0.1301383078098297, "learning_rate": 0.00010550404381886605, "loss": 0.0151, "step": 3657 }, { "epoch": 0.48327112990058463, "grad_norm": 0.15810106694698334, "learning_rate": 0.000105462545390005, "loss": 0.0238, "step": 3658 }, { "epoch": 0.48340324338606866, "grad_norm": 0.19346168637275696, "learning_rate": 0.00010542104601759137, "loss": 0.0224, "step": 3659 }, { "epoch": 0.4835353568715527, "grad_norm": 0.1737697869539261, "learning_rate": 0.0001053795457087934, "loss": 0.0182, "step": 3660 }, { "epoch": 0.4836674703570367, "grad_norm": 0.21048392355442047, "learning_rate": 0.0001053380444707795, "loss": 0.0143, "step": 3661 }, { "epoch": 0.48379958384252075, "grad_norm": 0.10285349935293198, "learning_rate": 0.00010529654231071821, "loss": 0.0127, "step": 3662 }, { "epoch": 0.48393169732800473, "grad_norm": 0.26792213320732117, "learning_rate": 0.00010525503923577829, "loss": 0.0287, "step": 3663 }, { "epoch": 0.48406381081348876, "grad_norm": 0.18912583589553833, "learning_rate": 0.00010521353525312859, "loss": 0.0225, "step": 3664 }, { "epoch": 0.4841959242989728, "grad_norm": 0.20273041725158691, "learning_rate": 0.00010517203036993815, "loss": 0.0325, "step": 3665 }, { "epoch": 0.4843280377844568, "grad_norm": 0.14839552342891693, "learning_rate": 0.00010513052459337616, "loss": 0.0114, "step": 3666 }, { "epoch": 0.48446015126994085, "grad_norm": 0.4060331881046295, "learning_rate": 0.00010508901793061196, "loss": 0.0209, "step": 3667 }, { "epoch": 0.4845922647554249, "grad_norm": 0.15798570215702057, "learning_rate": 0.00010504751038881511, "loss": 0.0161, "step": 3668 }, { "epoch": 0.4847243782409089, "grad_norm": 0.19073081016540527, "learning_rate": 0.00010500600197515515, "loss": 0.0183, "step": 3669 }, { "epoch": 0.48485649172639295, "grad_norm": 0.16638392210006714, "learning_rate": 0.000104964492696802, "loss": 0.0169, "step": 3670 }, { "epoch": 0.484988605211877, "grad_norm": 0.2253982424736023, "learning_rate": 0.00010492298256092552, "loss": 0.0277, "step": 3671 }, { "epoch": 0.485120718697361, "grad_norm": 0.20915861427783966, "learning_rate": 0.0001048814715746959, "loss": 0.0256, "step": 3672 }, { "epoch": 0.48525283218284504, "grad_norm": 0.24739712476730347, "learning_rate": 0.00010483995974528332, "loss": 0.0228, "step": 3673 }, { "epoch": 0.48538494566832907, "grad_norm": 0.16098754107952118, "learning_rate": 0.00010479844707985816, "loss": 0.0219, "step": 3674 }, { "epoch": 0.4855170591538131, "grad_norm": 0.18969130516052246, "learning_rate": 0.000104756933585591, "loss": 0.0191, "step": 3675 }, { "epoch": 0.48564917263929713, "grad_norm": 0.29885485768318176, "learning_rate": 0.00010471541926965249, "loss": 0.0327, "step": 3676 }, { "epoch": 0.48578128612478116, "grad_norm": 0.3478846848011017, "learning_rate": 0.0001046739041392135, "loss": 0.0399, "step": 3677 }, { "epoch": 0.4859133996102652, "grad_norm": 0.21352167427539825, "learning_rate": 0.00010463238820144492, "loss": 0.0203, "step": 3678 }, { "epoch": 0.4860455130957492, "grad_norm": 0.2575002908706665, "learning_rate": 0.00010459087146351791, "loss": 0.0243, "step": 3679 }, { "epoch": 0.48617762658123326, "grad_norm": 0.447734534740448, "learning_rate": 0.0001045493539326037, "loss": 0.0204, "step": 3680 }, { "epoch": 0.4863097400667173, "grad_norm": 0.19277682900428772, "learning_rate": 0.00010450783561587365, "loss": 0.0145, "step": 3681 }, { "epoch": 0.4864418535522013, "grad_norm": 0.18014197051525116, "learning_rate": 0.0001044663165204993, "loss": 0.0253, "step": 3682 }, { "epoch": 0.48657396703768535, "grad_norm": 0.1811141073703766, "learning_rate": 0.00010442479665365224, "loss": 0.0192, "step": 3683 }, { "epoch": 0.4867060805231694, "grad_norm": 0.18892107903957367, "learning_rate": 0.00010438327602250433, "loss": 0.0209, "step": 3684 }, { "epoch": 0.4868381940086534, "grad_norm": 0.17446766793727875, "learning_rate": 0.00010434175463422739, "loss": 0.0284, "step": 3685 }, { "epoch": 0.48697030749413744, "grad_norm": 0.29287025332450867, "learning_rate": 0.00010430023249599357, "loss": 0.0131, "step": 3686 }, { "epoch": 0.4871024209796215, "grad_norm": 0.16245250403881073, "learning_rate": 0.00010425870961497495, "loss": 0.0168, "step": 3687 }, { "epoch": 0.4872345344651055, "grad_norm": 0.17298054695129395, "learning_rate": 0.00010421718599834389, "loss": 0.0176, "step": 3688 }, { "epoch": 0.48736664795058954, "grad_norm": 0.21618597209453583, "learning_rate": 0.00010417566165327279, "loss": 0.0233, "step": 3689 }, { "epoch": 0.48749876143607357, "grad_norm": 0.0877385288476944, "learning_rate": 0.00010413413658693423, "loss": 0.0057, "step": 3690 }, { "epoch": 0.4876308749215576, "grad_norm": 0.19247932732105255, "learning_rate": 0.00010409261080650086, "loss": 0.0323, "step": 3691 }, { "epoch": 0.48776298840704163, "grad_norm": 0.23241962492465973, "learning_rate": 0.0001040510843191455, "loss": 0.0208, "step": 3692 }, { "epoch": 0.48789510189252566, "grad_norm": 0.17630332708358765, "learning_rate": 0.00010400955713204106, "loss": 0.0307, "step": 3693 }, { "epoch": 0.4880272153780097, "grad_norm": 0.16653984785079956, "learning_rate": 0.0001039680292523606, "loss": 0.0151, "step": 3694 }, { "epoch": 0.4881593288634937, "grad_norm": 0.2570105493068695, "learning_rate": 0.00010392650068727728, "loss": 0.0288, "step": 3695 }, { "epoch": 0.48829144234897776, "grad_norm": 0.1397189050912857, "learning_rate": 0.00010388497144396436, "loss": 0.0123, "step": 3696 }, { "epoch": 0.4884235558344618, "grad_norm": 0.24010951817035675, "learning_rate": 0.00010384344152959529, "loss": 0.0179, "step": 3697 }, { "epoch": 0.4885556693199458, "grad_norm": 0.26753032207489014, "learning_rate": 0.00010380191095134355, "loss": 0.0366, "step": 3698 }, { "epoch": 0.48868778280542985, "grad_norm": 0.28516536951065063, "learning_rate": 0.00010376037971638278, "loss": 0.0245, "step": 3699 }, { "epoch": 0.4888198962909139, "grad_norm": 0.1718878448009491, "learning_rate": 0.0001037188478318867, "loss": 0.0229, "step": 3700 }, { "epoch": 0.4889520097763979, "grad_norm": 0.1643366664648056, "learning_rate": 0.00010367731530502923, "loss": 0.0217, "step": 3701 }, { "epoch": 0.48908412326188194, "grad_norm": 0.15962187945842743, "learning_rate": 0.00010363578214298424, "loss": 0.0137, "step": 3702 }, { "epoch": 0.489216236747366, "grad_norm": 0.2725326418876648, "learning_rate": 0.00010359424835292591, "loss": 0.0187, "step": 3703 }, { "epoch": 0.48934835023285, "grad_norm": 0.38736462593078613, "learning_rate": 0.00010355271394202834, "loss": 0.0292, "step": 3704 }, { "epoch": 0.48948046371833404, "grad_norm": 0.23041263222694397, "learning_rate": 0.00010351117891746592, "loss": 0.0247, "step": 3705 }, { "epoch": 0.48961257720381807, "grad_norm": 0.19857461750507355, "learning_rate": 0.00010346964328641297, "loss": 0.0305, "step": 3706 }, { "epoch": 0.4897446906893021, "grad_norm": 0.18201592564582825, "learning_rate": 0.000103428107056044, "loss": 0.0176, "step": 3707 }, { "epoch": 0.48987680417478613, "grad_norm": 0.29200470447540283, "learning_rate": 0.00010338657023353364, "loss": 0.0278, "step": 3708 }, { "epoch": 0.49000891766027016, "grad_norm": 0.11148899048566818, "learning_rate": 0.00010334503282605656, "loss": 0.0163, "step": 3709 }, { "epoch": 0.4901410311457542, "grad_norm": 0.1690640151500702, "learning_rate": 0.00010330349484078765, "loss": 0.0148, "step": 3710 }, { "epoch": 0.4902731446312382, "grad_norm": 0.1554546058177948, "learning_rate": 0.00010326195628490174, "loss": 0.0125, "step": 3711 }, { "epoch": 0.49040525811672225, "grad_norm": 0.22738389670848846, "learning_rate": 0.00010322041716557391, "loss": 0.0214, "step": 3712 }, { "epoch": 0.4905373716022063, "grad_norm": 0.1931189000606537, "learning_rate": 0.0001031788774899792, "loss": 0.0271, "step": 3713 }, { "epoch": 0.4906694850876903, "grad_norm": 0.19231070578098297, "learning_rate": 0.00010313733726529284, "loss": 0.0238, "step": 3714 }, { "epoch": 0.49080159857317435, "grad_norm": 0.2267046421766281, "learning_rate": 0.00010309579649869014, "loss": 0.0178, "step": 3715 }, { "epoch": 0.4909337120586584, "grad_norm": 0.18220418691635132, "learning_rate": 0.00010305425519734646, "loss": 0.0134, "step": 3716 }, { "epoch": 0.4910658255441424, "grad_norm": 0.17011044919490814, "learning_rate": 0.0001030127133684373, "loss": 0.0142, "step": 3717 }, { "epoch": 0.49119793902962644, "grad_norm": 0.16121694445610046, "learning_rate": 0.00010297117101913825, "loss": 0.0149, "step": 3718 }, { "epoch": 0.49133005251511047, "grad_norm": 0.1913376748561859, "learning_rate": 0.00010292962815662494, "loss": 0.0221, "step": 3719 }, { "epoch": 0.4914621660005945, "grad_norm": 0.09859661012887955, "learning_rate": 0.00010288808478807316, "loss": 0.0187, "step": 3720 }, { "epoch": 0.49159427948607853, "grad_norm": 0.23269788920879364, "learning_rate": 0.00010284654092065873, "loss": 0.0333, "step": 3721 }, { "epoch": 0.49172639297156256, "grad_norm": 0.24885809421539307, "learning_rate": 0.00010280499656155757, "loss": 0.0169, "step": 3722 }, { "epoch": 0.4918585064570466, "grad_norm": 0.2113824039697647, "learning_rate": 0.00010276345171794573, "loss": 0.0287, "step": 3723 }, { "epoch": 0.4919906199425306, "grad_norm": 0.24789120256900787, "learning_rate": 0.00010272190639699924, "loss": 0.0205, "step": 3724 }, { "epoch": 0.49212273342801466, "grad_norm": 0.09954213351011276, "learning_rate": 0.00010268036060589432, "loss": 0.0098, "step": 3725 }, { "epoch": 0.4922548469134987, "grad_norm": 0.3465690314769745, "learning_rate": 0.00010263881435180722, "loss": 0.0237, "step": 3726 }, { "epoch": 0.4923869603989827, "grad_norm": 0.13831698894500732, "learning_rate": 0.00010259726764191428, "loss": 0.0129, "step": 3727 }, { "epoch": 0.49251907388446675, "grad_norm": 0.24895714223384857, "learning_rate": 0.0001025557204833919, "loss": 0.0227, "step": 3728 }, { "epoch": 0.4926511873699508, "grad_norm": 0.261017769575119, "learning_rate": 0.00010251417288341662, "loss": 0.0233, "step": 3729 }, { "epoch": 0.4927833008554348, "grad_norm": 0.22980886697769165, "learning_rate": 0.000102472624849165, "loss": 0.019, "step": 3730 }, { "epoch": 0.49291541434091884, "grad_norm": 0.2138877809047699, "learning_rate": 0.00010243107638781365, "loss": 0.0182, "step": 3731 }, { "epoch": 0.4930475278264029, "grad_norm": 0.16170454025268555, "learning_rate": 0.00010238952750653929, "loss": 0.0109, "step": 3732 }, { "epoch": 0.4931796413118869, "grad_norm": 0.1737380027770996, "learning_rate": 0.00010234797821251873, "loss": 0.026, "step": 3733 }, { "epoch": 0.49331175479737094, "grad_norm": 0.12878692150115967, "learning_rate": 0.00010230642851292887, "loss": 0.0073, "step": 3734 }, { "epoch": 0.49344386828285497, "grad_norm": 0.24098946154117584, "learning_rate": 0.00010226487841494656, "loss": 0.0277, "step": 3735 }, { "epoch": 0.493575981768339, "grad_norm": 0.24644453823566437, "learning_rate": 0.00010222332792574889, "loss": 0.024, "step": 3736 }, { "epoch": 0.49370809525382303, "grad_norm": 0.12515372037887573, "learning_rate": 0.00010218177705251289, "loss": 0.0164, "step": 3737 }, { "epoch": 0.49384020873930706, "grad_norm": 0.3598020672798157, "learning_rate": 0.00010214022580241567, "loss": 0.0173, "step": 3738 }, { "epoch": 0.4939723222247911, "grad_norm": 0.17398720979690552, "learning_rate": 0.00010209867418263448, "loss": 0.0154, "step": 3739 }, { "epoch": 0.4941044357102751, "grad_norm": 0.13843418657779694, "learning_rate": 0.00010205712220034654, "loss": 0.0202, "step": 3740 }, { "epoch": 0.49423654919575916, "grad_norm": 0.1310289353132248, "learning_rate": 0.00010201556986272922, "loss": 0.0148, "step": 3741 }, { "epoch": 0.4943686626812432, "grad_norm": 0.133134126663208, "learning_rate": 0.0001019740171769599, "loss": 0.0113, "step": 3742 }, { "epoch": 0.4945007761667272, "grad_norm": 0.19342927634716034, "learning_rate": 0.00010193246415021602, "loss": 0.0235, "step": 3743 }, { "epoch": 0.49463288965221125, "grad_norm": 0.19014842808246613, "learning_rate": 0.0001018909107896751, "loss": 0.0174, "step": 3744 }, { "epoch": 0.4947650031376953, "grad_norm": 0.2186693698167801, "learning_rate": 0.00010184935710251467, "loss": 0.0155, "step": 3745 }, { "epoch": 0.4948971166231793, "grad_norm": 0.17435306310653687, "learning_rate": 0.00010180780309591236, "loss": 0.0117, "step": 3746 }, { "epoch": 0.49502923010866334, "grad_norm": 0.14553245902061462, "learning_rate": 0.00010176624877704588, "loss": 0.0176, "step": 3747 }, { "epoch": 0.4951613435941474, "grad_norm": 0.19681425392627716, "learning_rate": 0.00010172469415309297, "loss": 0.0193, "step": 3748 }, { "epoch": 0.4952934570796314, "grad_norm": 0.14469482004642487, "learning_rate": 0.00010168313923123141, "loss": 0.0134, "step": 3749 }, { "epoch": 0.49542557056511544, "grad_norm": 0.16757139563560486, "learning_rate": 0.00010164158401863896, "loss": 0.0189, "step": 3750 }, { "epoch": 0.49555768405059947, "grad_norm": 0.22932220995426178, "learning_rate": 0.00010160002852249361, "loss": 0.0213, "step": 3751 }, { "epoch": 0.4956897975360835, "grad_norm": 0.13100813329219818, "learning_rate": 0.00010155847274997323, "loss": 0.0163, "step": 3752 }, { "epoch": 0.49582191102156753, "grad_norm": 0.2130032479763031, "learning_rate": 0.00010151691670825582, "loss": 0.0155, "step": 3753 }, { "epoch": 0.49595402450705156, "grad_norm": 0.14983811974525452, "learning_rate": 0.00010147536040451942, "loss": 0.021, "step": 3754 }, { "epoch": 0.4960861379925356, "grad_norm": 0.14002719521522522, "learning_rate": 0.0001014338038459421, "loss": 0.0152, "step": 3755 }, { "epoch": 0.4962182514780196, "grad_norm": 0.21298381686210632, "learning_rate": 0.00010139224703970198, "loss": 0.0276, "step": 3756 }, { "epoch": 0.49635036496350365, "grad_norm": 0.15027806162834167, "learning_rate": 0.0001013506899929772, "loss": 0.0171, "step": 3757 }, { "epoch": 0.4964824784489877, "grad_norm": 0.15074260532855988, "learning_rate": 0.00010130913271294598, "loss": 0.013, "step": 3758 }, { "epoch": 0.4966145919344717, "grad_norm": 0.22300437092781067, "learning_rate": 0.00010126757520678653, "loss": 0.0154, "step": 3759 }, { "epoch": 0.49674670541995575, "grad_norm": 0.15333257615566254, "learning_rate": 0.00010122601748167722, "loss": 0.0135, "step": 3760 }, { "epoch": 0.4968788189054398, "grad_norm": 0.13082996010780334, "learning_rate": 0.00010118445954479627, "loss": 0.0082, "step": 3761 }, { "epoch": 0.4970109323909238, "grad_norm": 0.21996119618415833, "learning_rate": 0.0001011429014033221, "loss": 0.0268, "step": 3762 }, { "epoch": 0.49714304587640784, "grad_norm": 0.15659058094024658, "learning_rate": 0.00010110134306443308, "loss": 0.011, "step": 3763 }, { "epoch": 0.49727515936189187, "grad_norm": 0.2219391167163849, "learning_rate": 0.00010105978453530765, "loss": 0.0265, "step": 3764 }, { "epoch": 0.4974072728473759, "grad_norm": 0.1443316787481308, "learning_rate": 0.00010101822582312424, "loss": 0.0182, "step": 3765 }, { "epoch": 0.49753938633285993, "grad_norm": 0.2249058187007904, "learning_rate": 0.00010097666693506134, "loss": 0.0244, "step": 3766 }, { "epoch": 0.49767149981834397, "grad_norm": 0.25690579414367676, "learning_rate": 0.00010093510787829752, "loss": 0.0223, "step": 3767 }, { "epoch": 0.497803613303828, "grad_norm": 0.19646599888801575, "learning_rate": 0.00010089354866001129, "loss": 0.0195, "step": 3768 }, { "epoch": 0.497935726789312, "grad_norm": 0.19536004960536957, "learning_rate": 0.00010085198928738122, "loss": 0.0373, "step": 3769 }, { "epoch": 0.49806784027479606, "grad_norm": 0.16124306619167328, "learning_rate": 0.00010081042976758597, "loss": 0.022, "step": 3770 }, { "epoch": 0.4981999537602801, "grad_norm": 0.22762157022953033, "learning_rate": 0.00010076887010780407, "loss": 0.0214, "step": 3771 }, { "epoch": 0.4983320672457641, "grad_norm": 0.14339573681354523, "learning_rate": 0.00010072731031521428, "loss": 0.0174, "step": 3772 }, { "epoch": 0.49846418073124815, "grad_norm": 0.22775132954120636, "learning_rate": 0.00010068575039699521, "loss": 0.0198, "step": 3773 }, { "epoch": 0.4985962942167322, "grad_norm": 0.209895059466362, "learning_rate": 0.0001006441903603256, "loss": 0.0167, "step": 3774 }, { "epoch": 0.4987284077022162, "grad_norm": 0.23028625547885895, "learning_rate": 0.00010060263021238412, "loss": 0.0132, "step": 3775 }, { "epoch": 0.49886052118770025, "grad_norm": 0.17354623973369598, "learning_rate": 0.00010056106996034955, "loss": 0.0119, "step": 3776 }, { "epoch": 0.4989926346731843, "grad_norm": 0.1872815638780594, "learning_rate": 0.00010051950961140066, "loss": 0.0255, "step": 3777 }, { "epoch": 0.4991247481586683, "grad_norm": 0.18477414548397064, "learning_rate": 0.00010047794917271615, "loss": 0.0186, "step": 3778 }, { "epoch": 0.49925686164415234, "grad_norm": 0.11314979195594788, "learning_rate": 0.0001004363886514749, "loss": 0.0125, "step": 3779 }, { "epoch": 0.49938897512963637, "grad_norm": 0.1584259271621704, "learning_rate": 0.00010039482805485567, "loss": 0.0163, "step": 3780 }, { "epoch": 0.4995210886151204, "grad_norm": 0.18248772621154785, "learning_rate": 0.00010035326739003726, "loss": 0.0255, "step": 3781 }, { "epoch": 0.49965320210060443, "grad_norm": 0.1315157413482666, "learning_rate": 0.00010031170666419853, "loss": 0.0158, "step": 3782 }, { "epoch": 0.49978531558608846, "grad_norm": 0.11246932297945023, "learning_rate": 0.00010027014588451827, "loss": 0.0085, "step": 3783 }, { "epoch": 0.4999174290715725, "grad_norm": 0.22276318073272705, "learning_rate": 0.00010022858505817539, "loss": 0.025, "step": 3784 }, { "epoch": 0.5000495425570565, "grad_norm": 0.15550485253334045, "learning_rate": 0.0001001870241923487, "loss": 0.0146, "step": 3785 }, { "epoch": 0.5001816560425405, "grad_norm": 0.123907171189785, "learning_rate": 0.00010014546329421707, "loss": 0.0146, "step": 3786 }, { "epoch": 0.5003137695280245, "grad_norm": 0.22745291888713837, "learning_rate": 0.00010010390237095941, "loss": 0.0361, "step": 3787 }, { "epoch": 0.5004458830135086, "grad_norm": 0.15615154802799225, "learning_rate": 0.00010006234142975452, "loss": 0.0172, "step": 3788 }, { "epoch": 0.5005779964989926, "grad_norm": 0.13986553251743317, "learning_rate": 0.00010002078047778134, "loss": 0.0163, "step": 3789 }, { "epoch": 0.5007101099844766, "grad_norm": 0.2259240746498108, "learning_rate": 9.99792195222187e-05, "loss": 0.0304, "step": 3790 }, { "epoch": 0.5008422234699607, "grad_norm": 0.21802827715873718, "learning_rate": 9.993765857024549e-05, "loss": 0.0186, "step": 3791 }, { "epoch": 0.5009743369554447, "grad_norm": 0.13896140456199646, "learning_rate": 9.989609762904061e-05, "loss": 0.0197, "step": 3792 }, { "epoch": 0.5011064504409287, "grad_norm": 0.24788185954093933, "learning_rate": 9.985453670578292e-05, "loss": 0.0257, "step": 3793 }, { "epoch": 0.5012385639264128, "grad_norm": 0.23262043297290802, "learning_rate": 9.981297580765132e-05, "loss": 0.0283, "step": 3794 }, { "epoch": 0.5013706774118968, "grad_norm": 0.1629173457622528, "learning_rate": 9.977141494182461e-05, "loss": 0.0123, "step": 3795 }, { "epoch": 0.5015027908973808, "grad_norm": 0.22046636044979095, "learning_rate": 9.972985411548173e-05, "loss": 0.0234, "step": 3796 }, { "epoch": 0.5016349043828648, "grad_norm": 0.1676786243915558, "learning_rate": 9.96882933358015e-05, "loss": 0.023, "step": 3797 }, { "epoch": 0.5017670178683489, "grad_norm": 0.13438016176223755, "learning_rate": 9.964673260996274e-05, "loss": 0.014, "step": 3798 }, { "epoch": 0.5018991313538329, "grad_norm": 0.15927131474018097, "learning_rate": 9.960517194514435e-05, "loss": 0.0166, "step": 3799 }, { "epoch": 0.5020312448393169, "grad_norm": 0.23379889130592346, "learning_rate": 9.956361134852509e-05, "loss": 0.0204, "step": 3800 }, { "epoch": 0.502163358324801, "grad_norm": 0.1917094886302948, "learning_rate": 9.952205082728384e-05, "loss": 0.0235, "step": 3801 }, { "epoch": 0.502295471810285, "grad_norm": 0.20136989653110504, "learning_rate": 9.948049038859935e-05, "loss": 0.0096, "step": 3802 }, { "epoch": 0.502427585295769, "grad_norm": 0.16332395374774933, "learning_rate": 9.943893003965044e-05, "loss": 0.0223, "step": 3803 }, { "epoch": 0.5025596987812531, "grad_norm": 0.3075923025608063, "learning_rate": 9.939736978761589e-05, "loss": 0.0225, "step": 3804 }, { "epoch": 0.5026918122667371, "grad_norm": 0.1782737374305725, "learning_rate": 9.935580963967442e-05, "loss": 0.0246, "step": 3805 }, { "epoch": 0.5028239257522211, "grad_norm": 0.11710914969444275, "learning_rate": 9.93142496030048e-05, "loss": 0.0092, "step": 3806 }, { "epoch": 0.5029560392377052, "grad_norm": 0.21303799748420715, "learning_rate": 9.927268968478573e-05, "loss": 0.0124, "step": 3807 }, { "epoch": 0.5030881527231892, "grad_norm": 0.41062068939208984, "learning_rate": 9.923112989219594e-05, "loss": 0.0311, "step": 3808 }, { "epoch": 0.5032202662086732, "grad_norm": 0.3300868570804596, "learning_rate": 9.918957023241406e-05, "loss": 0.0231, "step": 3809 }, { "epoch": 0.5033523796941572, "grad_norm": 0.19093488156795502, "learning_rate": 9.91480107126188e-05, "loss": 0.0232, "step": 3810 }, { "epoch": 0.5034844931796413, "grad_norm": 0.1572832465171814, "learning_rate": 9.910645133998875e-05, "loss": 0.0131, "step": 3811 }, { "epoch": 0.5036166066651253, "grad_norm": 0.1398879587650299, "learning_rate": 9.90648921217025e-05, "loss": 0.0141, "step": 3812 }, { "epoch": 0.5037487201506093, "grad_norm": 0.2715822160243988, "learning_rate": 9.902333306493868e-05, "loss": 0.0215, "step": 3813 }, { "epoch": 0.5038808336360934, "grad_norm": 0.18688692152500153, "learning_rate": 9.898177417687578e-05, "loss": 0.0224, "step": 3814 }, { "epoch": 0.5040129471215774, "grad_norm": 0.1852293759584427, "learning_rate": 9.894021546469239e-05, "loss": 0.0176, "step": 3815 }, { "epoch": 0.5041450606070614, "grad_norm": 0.28092116117477417, "learning_rate": 9.889865693556694e-05, "loss": 0.0187, "step": 3816 }, { "epoch": 0.5042771740925455, "grad_norm": 0.20824526250362396, "learning_rate": 9.885709859667792e-05, "loss": 0.0267, "step": 3817 }, { "epoch": 0.5044092875780295, "grad_norm": 0.19389167428016663, "learning_rate": 9.881554045520376e-05, "loss": 0.0348, "step": 3818 }, { "epoch": 0.5045414010635135, "grad_norm": 0.20891070365905762, "learning_rate": 9.87739825183228e-05, "loss": 0.0182, "step": 3819 }, { "epoch": 0.5046735145489976, "grad_norm": 0.17443668842315674, "learning_rate": 9.873242479321348e-05, "loss": 0.0123, "step": 3820 }, { "epoch": 0.5048056280344816, "grad_norm": 0.17366589605808258, "learning_rate": 9.869086728705406e-05, "loss": 0.0171, "step": 3821 }, { "epoch": 0.5049377415199656, "grad_norm": 0.17065440118312836, "learning_rate": 9.864931000702284e-05, "loss": 0.0126, "step": 3822 }, { "epoch": 0.5050698550054497, "grad_norm": 0.29523712396621704, "learning_rate": 9.860775296029805e-05, "loss": 0.023, "step": 3823 }, { "epoch": 0.5052019684909337, "grad_norm": 0.21273942291736603, "learning_rate": 9.856619615405793e-05, "loss": 0.0204, "step": 3824 }, { "epoch": 0.5053340819764177, "grad_norm": 0.2513734698295593, "learning_rate": 9.85246395954806e-05, "loss": 0.0222, "step": 3825 }, { "epoch": 0.5054661954619017, "grad_norm": 0.18025194108486176, "learning_rate": 9.848308329174419e-05, "loss": 0.0157, "step": 3826 }, { "epoch": 0.5055983089473858, "grad_norm": 0.13815920054912567, "learning_rate": 9.84415272500268e-05, "loss": 0.0125, "step": 3827 }, { "epoch": 0.5057304224328698, "grad_norm": 0.246177539229393, "learning_rate": 9.839997147750641e-05, "loss": 0.028, "step": 3828 }, { "epoch": 0.5058625359183538, "grad_norm": 0.10835841298103333, "learning_rate": 9.835841598136105e-05, "loss": 0.0079, "step": 3829 }, { "epoch": 0.5059946494038379, "grad_norm": 0.1650642603635788, "learning_rate": 9.831686076876863e-05, "loss": 0.0211, "step": 3830 }, { "epoch": 0.5061267628893219, "grad_norm": 0.1597212553024292, "learning_rate": 9.827530584690705e-05, "loss": 0.0147, "step": 3831 }, { "epoch": 0.5062588763748059, "grad_norm": 0.24264469742774963, "learning_rate": 9.823375122295414e-05, "loss": 0.0219, "step": 3832 }, { "epoch": 0.50639098986029, "grad_norm": 0.3036757707595825, "learning_rate": 9.819219690408766e-05, "loss": 0.0323, "step": 3833 }, { "epoch": 0.506523103345774, "grad_norm": 0.20398838818073273, "learning_rate": 9.815064289748538e-05, "loss": 0.0177, "step": 3834 }, { "epoch": 0.506655216831258, "grad_norm": 0.16952387988567352, "learning_rate": 9.810908921032495e-05, "loss": 0.013, "step": 3835 }, { "epoch": 0.5067873303167421, "grad_norm": 0.19409286975860596, "learning_rate": 9.806753584978403e-05, "loss": 0.0182, "step": 3836 }, { "epoch": 0.5069194438022261, "grad_norm": 0.1850726455450058, "learning_rate": 9.802598282304013e-05, "loss": 0.0206, "step": 3837 }, { "epoch": 0.5070515572877101, "grad_norm": 0.19560378789901733, "learning_rate": 9.798443013727082e-05, "loss": 0.0242, "step": 3838 }, { "epoch": 0.5071836707731942, "grad_norm": 0.13183735311031342, "learning_rate": 9.79428777996535e-05, "loss": 0.0229, "step": 3839 }, { "epoch": 0.5073157842586782, "grad_norm": 0.24583682417869568, "learning_rate": 9.790132581736557e-05, "loss": 0.029, "step": 3840 }, { "epoch": 0.5074478977441622, "grad_norm": 0.1971963793039322, "learning_rate": 9.785977419758439e-05, "loss": 0.0151, "step": 3841 }, { "epoch": 0.5075800112296462, "grad_norm": 0.12109408527612686, "learning_rate": 9.781822294748716e-05, "loss": 0.0132, "step": 3842 }, { "epoch": 0.5077121247151303, "grad_norm": 0.21014884114265442, "learning_rate": 9.777667207425116e-05, "loss": 0.0189, "step": 3843 }, { "epoch": 0.5078442382006143, "grad_norm": 0.16626541316509247, "learning_rate": 9.773512158505345e-05, "loss": 0.0097, "step": 3844 }, { "epoch": 0.5079763516860983, "grad_norm": 0.16020405292510986, "learning_rate": 9.76935714870712e-05, "loss": 0.0188, "step": 3845 }, { "epoch": 0.5081084651715824, "grad_norm": 0.3006601631641388, "learning_rate": 9.765202178748132e-05, "loss": 0.0337, "step": 3846 }, { "epoch": 0.5082405786570664, "grad_norm": 0.26183125376701355, "learning_rate": 9.761047249346076e-05, "loss": 0.0328, "step": 3847 }, { "epoch": 0.5083726921425504, "grad_norm": 0.1901049017906189, "learning_rate": 9.756892361218642e-05, "loss": 0.0175, "step": 3848 }, { "epoch": 0.5085048056280345, "grad_norm": 0.17531642317771912, "learning_rate": 9.752737515083501e-05, "loss": 0.0192, "step": 3849 }, { "epoch": 0.5086369191135185, "grad_norm": 0.2943665683269501, "learning_rate": 9.748582711658336e-05, "loss": 0.0346, "step": 3850 }, { "epoch": 0.5087690325990025, "grad_norm": 0.12566107511520386, "learning_rate": 9.744427951660809e-05, "loss": 0.0142, "step": 3851 }, { "epoch": 0.5089011460844866, "grad_norm": 0.1161152645945549, "learning_rate": 9.740273235808572e-05, "loss": 0.0037, "step": 3852 }, { "epoch": 0.5090332595699706, "grad_norm": 0.1520073562860489, "learning_rate": 9.736118564819279e-05, "loss": 0.0155, "step": 3853 }, { "epoch": 0.5091653730554546, "grad_norm": 0.13466641306877136, "learning_rate": 9.73196393941057e-05, "loss": 0.0089, "step": 3854 }, { "epoch": 0.5092974865409386, "grad_norm": 0.309459388256073, "learning_rate": 9.727809360300077e-05, "loss": 0.0203, "step": 3855 }, { "epoch": 0.5094296000264227, "grad_norm": 0.30888691544532776, "learning_rate": 9.72365482820543e-05, "loss": 0.0256, "step": 3856 }, { "epoch": 0.5095617135119067, "grad_norm": 0.16353288292884827, "learning_rate": 9.719500343844242e-05, "loss": 0.0181, "step": 3857 }, { "epoch": 0.5096938269973907, "grad_norm": 0.16413989663124084, "learning_rate": 9.715345907934128e-05, "loss": 0.0145, "step": 3858 }, { "epoch": 0.5098259404828748, "grad_norm": 0.37346822023391724, "learning_rate": 9.711191521192685e-05, "loss": 0.0271, "step": 3859 }, { "epoch": 0.5099580539683588, "grad_norm": 0.1723538339138031, "learning_rate": 9.707037184337506e-05, "loss": 0.0176, "step": 3860 }, { "epoch": 0.5100901674538428, "grad_norm": 0.18036048114299774, "learning_rate": 9.702882898086177e-05, "loss": 0.0241, "step": 3861 }, { "epoch": 0.5102222809393269, "grad_norm": 0.19181881844997406, "learning_rate": 9.698728663156271e-05, "loss": 0.0185, "step": 3862 }, { "epoch": 0.5103543944248109, "grad_norm": 0.17694401741027832, "learning_rate": 9.694574480265357e-05, "loss": 0.016, "step": 3863 }, { "epoch": 0.5104865079102949, "grad_norm": 0.20160824060440063, "learning_rate": 9.690420350130988e-05, "loss": 0.028, "step": 3864 }, { "epoch": 0.510618621395779, "grad_norm": 0.18602226674556732, "learning_rate": 9.686266273470718e-05, "loss": 0.0214, "step": 3865 }, { "epoch": 0.510750734881263, "grad_norm": 0.2467876374721527, "learning_rate": 9.682112251002082e-05, "loss": 0.0225, "step": 3866 }, { "epoch": 0.510882848366747, "grad_norm": 0.13355334103107452, "learning_rate": 9.677958283442612e-05, "loss": 0.0158, "step": 3867 }, { "epoch": 0.511014961852231, "grad_norm": 0.1554861217737198, "learning_rate": 9.673804371509827e-05, "loss": 0.0228, "step": 3868 }, { "epoch": 0.5111470753377151, "grad_norm": 0.2639622092247009, "learning_rate": 9.669650515921236e-05, "loss": 0.0184, "step": 3869 }, { "epoch": 0.5112791888231991, "grad_norm": 0.16291341185569763, "learning_rate": 9.665496717394345e-05, "loss": 0.0133, "step": 3870 }, { "epoch": 0.5114113023086831, "grad_norm": 0.1974400132894516, "learning_rate": 9.661342976646638e-05, "loss": 0.0234, "step": 3871 }, { "epoch": 0.5115434157941672, "grad_norm": 0.18314112722873688, "learning_rate": 9.657189294395603e-05, "loss": 0.0208, "step": 3872 }, { "epoch": 0.5116755292796512, "grad_norm": 0.17760257422924042, "learning_rate": 9.653035671358705e-05, "loss": 0.0186, "step": 3873 }, { "epoch": 0.5118076427651352, "grad_norm": 0.18622662127017975, "learning_rate": 9.64888210825341e-05, "loss": 0.0251, "step": 3874 }, { "epoch": 0.5119397562506193, "grad_norm": 0.19988639652729034, "learning_rate": 9.644728605797167e-05, "loss": 0.0244, "step": 3875 }, { "epoch": 0.5120718697361033, "grad_norm": 0.11412037163972855, "learning_rate": 9.64057516470741e-05, "loss": 0.0152, "step": 3876 }, { "epoch": 0.5122039832215873, "grad_norm": 0.13845624029636383, "learning_rate": 9.636421785701577e-05, "loss": 0.0189, "step": 3877 }, { "epoch": 0.5123360967070714, "grad_norm": 0.24381454288959503, "learning_rate": 9.632268469497081e-05, "loss": 0.0156, "step": 3878 }, { "epoch": 0.5124682101925554, "grad_norm": 0.18773894011974335, "learning_rate": 9.628115216811332e-05, "loss": 0.0212, "step": 3879 }, { "epoch": 0.5126003236780394, "grad_norm": 0.08962608873844147, "learning_rate": 9.623962028361725e-05, "loss": 0.0095, "step": 3880 }, { "epoch": 0.5127324371635235, "grad_norm": 0.12506498396396637, "learning_rate": 9.619808904865649e-05, "loss": 0.0146, "step": 3881 }, { "epoch": 0.5128645506490075, "grad_norm": 0.251169353723526, "learning_rate": 9.615655847040475e-05, "loss": 0.0267, "step": 3882 }, { "epoch": 0.5129966641344915, "grad_norm": 0.22696994245052338, "learning_rate": 9.611502855603565e-05, "loss": 0.0224, "step": 3883 }, { "epoch": 0.5131287776199756, "grad_norm": 0.08587751537561417, "learning_rate": 9.607349931272276e-05, "loss": 0.007, "step": 3884 }, { "epoch": 0.5132608911054596, "grad_norm": 0.2083951234817505, "learning_rate": 9.603197074763942e-05, "loss": 0.0168, "step": 3885 }, { "epoch": 0.5133930045909436, "grad_norm": 0.24723173677921295, "learning_rate": 9.599044286795896e-05, "loss": 0.0174, "step": 3886 }, { "epoch": 0.5135251180764276, "grad_norm": 0.1207137405872345, "learning_rate": 9.594891568085452e-05, "loss": 0.011, "step": 3887 }, { "epoch": 0.5136572315619117, "grad_norm": 0.15802474319934845, "learning_rate": 9.590738919349917e-05, "loss": 0.0164, "step": 3888 }, { "epoch": 0.5137893450473957, "grad_norm": 0.2264273762702942, "learning_rate": 9.58658634130658e-05, "loss": 0.0261, "step": 3889 }, { "epoch": 0.5139214585328797, "grad_norm": 0.09568583220243454, "learning_rate": 9.582433834672723e-05, "loss": 0.0094, "step": 3890 }, { "epoch": 0.5140535720183638, "grad_norm": 0.19316567480564117, "learning_rate": 9.578281400165614e-05, "loss": 0.0138, "step": 3891 }, { "epoch": 0.5141856855038478, "grad_norm": 0.1675315797328949, "learning_rate": 9.574129038502506e-05, "loss": 0.0195, "step": 3892 }, { "epoch": 0.5143177989893318, "grad_norm": 0.1676243245601654, "learning_rate": 9.569976750400648e-05, "loss": 0.0144, "step": 3893 }, { "epoch": 0.5144499124748159, "grad_norm": 0.1998102366924286, "learning_rate": 9.565824536577262e-05, "loss": 0.013, "step": 3894 }, { "epoch": 0.5145820259602999, "grad_norm": 0.2167922407388687, "learning_rate": 9.561672397749572e-05, "loss": 0.0206, "step": 3895 }, { "epoch": 0.5147141394457839, "grad_norm": 0.15804733335971832, "learning_rate": 9.557520334634781e-05, "loss": 0.0194, "step": 3896 }, { "epoch": 0.514846252931268, "grad_norm": 0.14819398522377014, "learning_rate": 9.553368347950076e-05, "loss": 0.0166, "step": 3897 }, { "epoch": 0.514978366416752, "grad_norm": 0.10181767493486404, "learning_rate": 9.549216438412639e-05, "loss": 0.0106, "step": 3898 }, { "epoch": 0.515110479902236, "grad_norm": 0.22075830399990082, "learning_rate": 9.545064606739633e-05, "loss": 0.0218, "step": 3899 }, { "epoch": 0.51524259338772, "grad_norm": 0.15722963213920593, "learning_rate": 9.540912853648212e-05, "loss": 0.0218, "step": 3900 }, { "epoch": 0.5153747068732041, "grad_norm": 0.17477941513061523, "learning_rate": 9.53676117985551e-05, "loss": 0.0229, "step": 3901 }, { "epoch": 0.5155068203586881, "grad_norm": 0.18030881881713867, "learning_rate": 9.532609586078655e-05, "loss": 0.0244, "step": 3902 }, { "epoch": 0.5156389338441721, "grad_norm": 0.15806345641613007, "learning_rate": 9.528458073034755e-05, "loss": 0.0272, "step": 3903 }, { "epoch": 0.5157710473296562, "grad_norm": 0.2096640020608902, "learning_rate": 9.524306641440904e-05, "loss": 0.0136, "step": 3904 }, { "epoch": 0.5159031608151402, "grad_norm": 0.11178679019212723, "learning_rate": 9.52015529201419e-05, "loss": 0.0101, "step": 3905 }, { "epoch": 0.5160352743006242, "grad_norm": 0.19791501760482788, "learning_rate": 9.516004025471675e-05, "loss": 0.0317, "step": 3906 }, { "epoch": 0.5161673877861083, "grad_norm": 0.24150808155536652, "learning_rate": 9.51185284253041e-05, "loss": 0.031, "step": 3907 }, { "epoch": 0.5162995012715923, "grad_norm": 0.2426425963640213, "learning_rate": 9.507701743907446e-05, "loss": 0.0318, "step": 3908 }, { "epoch": 0.5164316147570763, "grad_norm": 0.23894216120243073, "learning_rate": 9.5035507303198e-05, "loss": 0.0318, "step": 3909 }, { "epoch": 0.5165637282425604, "grad_norm": 0.21717692911624908, "learning_rate": 9.499399802484485e-05, "loss": 0.0199, "step": 3910 }, { "epoch": 0.5166958417280444, "grad_norm": 0.1702679842710495, "learning_rate": 9.495248961118492e-05, "loss": 0.0118, "step": 3911 }, { "epoch": 0.5168279552135284, "grad_norm": 0.17146004736423492, "learning_rate": 9.491098206938803e-05, "loss": 0.0149, "step": 3912 }, { "epoch": 0.5169600686990125, "grad_norm": 0.3598407506942749, "learning_rate": 9.486947540662385e-05, "loss": 0.0236, "step": 3913 }, { "epoch": 0.5170921821844965, "grad_norm": 0.19947464764118195, "learning_rate": 9.482796963006186e-05, "loss": 0.0369, "step": 3914 }, { "epoch": 0.5172242956699805, "grad_norm": 0.15635952353477478, "learning_rate": 9.478646474687142e-05, "loss": 0.0188, "step": 3915 }, { "epoch": 0.5173564091554645, "grad_norm": 0.1626300811767578, "learning_rate": 9.47449607642217e-05, "loss": 0.0167, "step": 3916 }, { "epoch": 0.5174885226409486, "grad_norm": 0.18757295608520508, "learning_rate": 9.470345768928178e-05, "loss": 0.0157, "step": 3917 }, { "epoch": 0.5176206361264326, "grad_norm": 0.28251126408576965, "learning_rate": 9.466195552922052e-05, "loss": 0.0313, "step": 3918 }, { "epoch": 0.5177527496119166, "grad_norm": 0.12264434248209, "learning_rate": 9.46204542912066e-05, "loss": 0.0192, "step": 3919 }, { "epoch": 0.5178848630974007, "grad_norm": 0.1789926439523697, "learning_rate": 9.457895398240864e-05, "loss": 0.0135, "step": 3920 }, { "epoch": 0.5180169765828847, "grad_norm": 0.14935386180877686, "learning_rate": 9.4537454609995e-05, "loss": 0.0115, "step": 3921 }, { "epoch": 0.5181490900683687, "grad_norm": 0.47989845275878906, "learning_rate": 9.449595618113395e-05, "loss": 0.0112, "step": 3922 }, { "epoch": 0.5182812035538528, "grad_norm": 0.14519643783569336, "learning_rate": 9.445445870299356e-05, "loss": 0.014, "step": 3923 }, { "epoch": 0.5184133170393368, "grad_norm": 0.15153749287128448, "learning_rate": 9.441296218274176e-05, "loss": 0.0181, "step": 3924 }, { "epoch": 0.5185454305248208, "grad_norm": 0.11909378319978714, "learning_rate": 9.437146662754628e-05, "loss": 0.018, "step": 3925 }, { "epoch": 0.5186775440103049, "grad_norm": 0.1426568329334259, "learning_rate": 9.432997204457467e-05, "loss": 0.014, "step": 3926 }, { "epoch": 0.5188096574957889, "grad_norm": 0.21028953790664673, "learning_rate": 9.428847844099441e-05, "loss": 0.0168, "step": 3927 }, { "epoch": 0.5189417709812729, "grad_norm": 0.37740516662597656, "learning_rate": 9.424698582397269e-05, "loss": 0.0188, "step": 3928 }, { "epoch": 0.519073884466757, "grad_norm": 0.1522783786058426, "learning_rate": 9.420549420067661e-05, "loss": 0.0218, "step": 3929 }, { "epoch": 0.519205997952241, "grad_norm": 0.11716365814208984, "learning_rate": 9.416400357827306e-05, "loss": 0.0085, "step": 3930 }, { "epoch": 0.519338111437725, "grad_norm": 0.24627836048603058, "learning_rate": 9.412251396392878e-05, "loss": 0.0115, "step": 3931 }, { "epoch": 0.519470224923209, "grad_norm": 0.22646081447601318, "learning_rate": 9.408102536481033e-05, "loss": 0.0314, "step": 3932 }, { "epoch": 0.5196023384086931, "grad_norm": 0.11769570410251617, "learning_rate": 9.403953778808406e-05, "loss": 0.0086, "step": 3933 }, { "epoch": 0.5197344518941771, "grad_norm": 0.1721094697713852, "learning_rate": 9.399805124091619e-05, "loss": 0.0216, "step": 3934 }, { "epoch": 0.5198665653796611, "grad_norm": 0.2551160752773285, "learning_rate": 9.395656573047276e-05, "loss": 0.0272, "step": 3935 }, { "epoch": 0.5199986788651452, "grad_norm": 0.23127736151218414, "learning_rate": 9.39150812639196e-05, "loss": 0.0174, "step": 3936 }, { "epoch": 0.5201307923506292, "grad_norm": 0.12697872519493103, "learning_rate": 9.387359784842238e-05, "loss": 0.0222, "step": 3937 }, { "epoch": 0.5202629058361132, "grad_norm": 0.19013381004333496, "learning_rate": 9.383211549114659e-05, "loss": 0.0246, "step": 3938 }, { "epoch": 0.5203950193215973, "grad_norm": 0.17478132247924805, "learning_rate": 9.379063419925753e-05, "loss": 0.0168, "step": 3939 }, { "epoch": 0.5205271328070813, "grad_norm": 0.1878492385149002, "learning_rate": 9.37491539799203e-05, "loss": 0.017, "step": 3940 }, { "epoch": 0.5206592462925653, "grad_norm": 0.2478381097316742, "learning_rate": 9.370767484029987e-05, "loss": 0.0159, "step": 3941 }, { "epoch": 0.5207913597780494, "grad_norm": 0.19272209703922272, "learning_rate": 9.366619678756092e-05, "loss": 0.0191, "step": 3942 }, { "epoch": 0.5209234732635334, "grad_norm": 0.16208535432815552, "learning_rate": 9.362471982886809e-05, "loss": 0.0122, "step": 3943 }, { "epoch": 0.5210555867490174, "grad_norm": 0.25687727332115173, "learning_rate": 9.358324397138568e-05, "loss": 0.0327, "step": 3944 }, { "epoch": 0.5211877002345014, "grad_norm": 0.1902291625738144, "learning_rate": 9.354176922227793e-05, "loss": 0.013, "step": 3945 }, { "epoch": 0.5213198137199855, "grad_norm": 0.22686885297298431, "learning_rate": 9.350029558870879e-05, "loss": 0.0269, "step": 3946 }, { "epoch": 0.5214519272054695, "grad_norm": 0.16572003066539764, "learning_rate": 9.345882307784203e-05, "loss": 0.0188, "step": 3947 }, { "epoch": 0.5215840406909535, "grad_norm": 0.23944619297981262, "learning_rate": 9.341735169684132e-05, "loss": 0.022, "step": 3948 }, { "epoch": 0.5217161541764376, "grad_norm": 0.2262609750032425, "learning_rate": 9.337588145286998e-05, "loss": 0.0295, "step": 3949 }, { "epoch": 0.5218482676619216, "grad_norm": 0.12378781288862228, "learning_rate": 9.33344123530913e-05, "loss": 0.0123, "step": 3950 }, { "epoch": 0.5219803811474056, "grad_norm": 0.26705402135849, "learning_rate": 9.329294440466825e-05, "loss": 0.0167, "step": 3951 }, { "epoch": 0.5221124946328897, "grad_norm": 0.22340580821037292, "learning_rate": 9.325147761476365e-05, "loss": 0.0229, "step": 3952 }, { "epoch": 0.5222446081183737, "grad_norm": 0.2640053331851959, "learning_rate": 9.321001199054012e-05, "loss": 0.0259, "step": 3953 }, { "epoch": 0.5223767216038577, "grad_norm": 0.18717001378536224, "learning_rate": 9.316854753916006e-05, "loss": 0.0145, "step": 3954 }, { "epoch": 0.5225088350893418, "grad_norm": 0.17452330887317657, "learning_rate": 9.312708426778568e-05, "loss": 0.022, "step": 3955 }, { "epoch": 0.5226409485748258, "grad_norm": 0.1928424835205078, "learning_rate": 9.308562218357898e-05, "loss": 0.0206, "step": 3956 }, { "epoch": 0.5227730620603098, "grad_norm": 0.16473954916000366, "learning_rate": 9.30441612937018e-05, "loss": 0.018, "step": 3957 }, { "epoch": 0.5229051755457939, "grad_norm": 0.21050867438316345, "learning_rate": 9.300270160531566e-05, "loss": 0.0154, "step": 3958 }, { "epoch": 0.5230372890312779, "grad_norm": 0.23349924385547638, "learning_rate": 9.296124312558201e-05, "loss": 0.0165, "step": 3959 }, { "epoch": 0.5231694025167619, "grad_norm": 0.17704570293426514, "learning_rate": 9.291978586166201e-05, "loss": 0.0283, "step": 3960 }, { "epoch": 0.523301516002246, "grad_norm": 0.3553520143032074, "learning_rate": 9.28783298207166e-05, "loss": 0.0396, "step": 3961 }, { "epoch": 0.52343362948773, "grad_norm": 0.19322533905506134, "learning_rate": 9.283687500990658e-05, "loss": 0.011, "step": 3962 }, { "epoch": 0.523565742973214, "grad_norm": 0.20268426835536957, "learning_rate": 9.279542143639245e-05, "loss": 0.0134, "step": 3963 }, { "epoch": 0.523697856458698, "grad_norm": 0.2077493667602539, "learning_rate": 9.275396910733458e-05, "loss": 0.034, "step": 3964 }, { "epoch": 0.5238299699441821, "grad_norm": 0.17809870839118958, "learning_rate": 9.271251802989305e-05, "loss": 0.0131, "step": 3965 }, { "epoch": 0.5239620834296661, "grad_norm": 0.22379878163337708, "learning_rate": 9.267106821122774e-05, "loss": 0.0083, "step": 3966 }, { "epoch": 0.5240941969151501, "grad_norm": 0.2034342736005783, "learning_rate": 9.26296196584984e-05, "loss": 0.0161, "step": 3967 }, { "epoch": 0.5242263104006342, "grad_norm": 0.20792457461357117, "learning_rate": 9.25881723788645e-05, "loss": 0.0298, "step": 3968 }, { "epoch": 0.5243584238861182, "grad_norm": 0.28438884019851685, "learning_rate": 9.254672637948518e-05, "loss": 0.029, "step": 3969 }, { "epoch": 0.5244905373716022, "grad_norm": 0.20338405668735504, "learning_rate": 9.250528166751956e-05, "loss": 0.0183, "step": 3970 }, { "epoch": 0.5246226508570863, "grad_norm": 0.22804833948612213, "learning_rate": 9.246383825012637e-05, "loss": 0.021, "step": 3971 }, { "epoch": 0.5247547643425703, "grad_norm": 0.16153642535209656, "learning_rate": 9.242239613446425e-05, "loss": 0.0119, "step": 3972 }, { "epoch": 0.5248868778280543, "grad_norm": 0.2589012086391449, "learning_rate": 9.238095532769149e-05, "loss": 0.025, "step": 3973 }, { "epoch": 0.5250189913135384, "grad_norm": 0.14929302036762238, "learning_rate": 9.233951583696628e-05, "loss": 0.015, "step": 3974 }, { "epoch": 0.5251511047990224, "grad_norm": 0.2125391960144043, "learning_rate": 9.229807766944645e-05, "loss": 0.0186, "step": 3975 }, { "epoch": 0.5252832182845064, "grad_norm": 0.16655296087265015, "learning_rate": 9.225664083228969e-05, "loss": 0.0156, "step": 3976 }, { "epoch": 0.5254153317699904, "grad_norm": 0.19410564005374908, "learning_rate": 9.221520533265347e-05, "loss": 0.0251, "step": 3977 }, { "epoch": 0.5255474452554745, "grad_norm": 0.30204856395721436, "learning_rate": 9.217377117769494e-05, "loss": 0.0303, "step": 3978 }, { "epoch": 0.5256795587409585, "grad_norm": 0.13079041242599487, "learning_rate": 9.213233837457115e-05, "loss": 0.0186, "step": 3979 }, { "epoch": 0.5258116722264425, "grad_norm": 0.18816164135932922, "learning_rate": 9.209090693043877e-05, "loss": 0.0179, "step": 3980 }, { "epoch": 0.5259437857119266, "grad_norm": 0.16152295470237732, "learning_rate": 9.204947685245436e-05, "loss": 0.0142, "step": 3981 }, { "epoch": 0.5260758991974106, "grad_norm": 0.1895074099302292, "learning_rate": 9.200804814777417e-05, "loss": 0.0314, "step": 3982 }, { "epoch": 0.5262080126828946, "grad_norm": 0.18929599225521088, "learning_rate": 9.196662082355423e-05, "loss": 0.0262, "step": 3983 }, { "epoch": 0.5263401261683787, "grad_norm": 0.10806319117546082, "learning_rate": 9.192519488695034e-05, "loss": 0.0119, "step": 3984 }, { "epoch": 0.5264722396538627, "grad_norm": 0.2200048416852951, "learning_rate": 9.188377034511805e-05, "loss": 0.0283, "step": 3985 }, { "epoch": 0.5266043531393467, "grad_norm": 0.14298328757286072, "learning_rate": 9.18423472052127e-05, "loss": 0.0184, "step": 3986 }, { "epoch": 0.5267364666248308, "grad_norm": 0.27281659841537476, "learning_rate": 9.180092547438931e-05, "loss": 0.0204, "step": 3987 }, { "epoch": 0.5268685801103148, "grad_norm": 0.13350260257720947, "learning_rate": 9.175950515980277e-05, "loss": 0.0133, "step": 3988 }, { "epoch": 0.5270006935957988, "grad_norm": 0.1405045986175537, "learning_rate": 9.171808626860765e-05, "loss": 0.0174, "step": 3989 }, { "epoch": 0.5271328070812829, "grad_norm": 0.18542882800102234, "learning_rate": 9.167666880795824e-05, "loss": 0.0148, "step": 3990 }, { "epoch": 0.5272649205667669, "grad_norm": 0.1905541568994522, "learning_rate": 9.16352527850087e-05, "loss": 0.018, "step": 3991 }, { "epoch": 0.5273970340522509, "grad_norm": 0.16615454852581024, "learning_rate": 9.159383820691283e-05, "loss": 0.015, "step": 3992 }, { "epoch": 0.5275291475377349, "grad_norm": 0.48656389117240906, "learning_rate": 9.155242508082425e-05, "loss": 0.0272, "step": 3993 }, { "epoch": 0.527661261023219, "grad_norm": 0.1482691913843155, "learning_rate": 9.151101341389627e-05, "loss": 0.0185, "step": 3994 }, { "epoch": 0.527793374508703, "grad_norm": 0.21181003749370575, "learning_rate": 9.146960321328205e-05, "loss": 0.0217, "step": 3995 }, { "epoch": 0.527925487994187, "grad_norm": 0.18101364374160767, "learning_rate": 9.142819448613433e-05, "loss": 0.0224, "step": 3996 }, { "epoch": 0.5280576014796711, "grad_norm": 0.1256485879421234, "learning_rate": 9.138678723960575e-05, "loss": 0.0128, "step": 3997 }, { "epoch": 0.5281897149651551, "grad_norm": 0.17414726316928864, "learning_rate": 9.134538148084866e-05, "loss": 0.0182, "step": 3998 }, { "epoch": 0.5283218284506391, "grad_norm": 0.15414351224899292, "learning_rate": 9.130397721701506e-05, "loss": 0.02, "step": 3999 }, { "epoch": 0.5284539419361232, "grad_norm": 0.14810499548912048, "learning_rate": 9.126257445525684e-05, "loss": 0.0152, "step": 4000 }, { "epoch": 0.5285860554216072, "grad_norm": 0.2823812961578369, "learning_rate": 9.122117320272549e-05, "loss": 0.0301, "step": 4001 }, { "epoch": 0.5287181689070912, "grad_norm": 0.14143690466880798, "learning_rate": 9.117977346657235e-05, "loss": 0.015, "step": 4002 }, { "epoch": 0.5288502823925753, "grad_norm": 0.17588815093040466, "learning_rate": 9.113837525394843e-05, "loss": 0.0164, "step": 4003 }, { "epoch": 0.5289823958780593, "grad_norm": 0.14937280118465424, "learning_rate": 9.109697857200447e-05, "loss": 0.0204, "step": 4004 }, { "epoch": 0.5291145093635433, "grad_norm": 0.19677433371543884, "learning_rate": 9.105558342789103e-05, "loss": 0.0117, "step": 4005 }, { "epoch": 0.5292466228490273, "grad_norm": 0.3028697371482849, "learning_rate": 9.10141898287583e-05, "loss": 0.0197, "step": 4006 }, { "epoch": 0.5293787363345114, "grad_norm": 0.19180864095687866, "learning_rate": 9.097279778175627e-05, "loss": 0.0145, "step": 4007 }, { "epoch": 0.5295108498199954, "grad_norm": 0.1871991902589798, "learning_rate": 9.093140729403463e-05, "loss": 0.0253, "step": 4008 }, { "epoch": 0.5296429633054794, "grad_norm": 0.20815619826316833, "learning_rate": 9.089001837274284e-05, "loss": 0.0211, "step": 4009 }, { "epoch": 0.5297750767909635, "grad_norm": 0.19759541749954224, "learning_rate": 9.084863102503003e-05, "loss": 0.0119, "step": 4010 }, { "epoch": 0.5299071902764475, "grad_norm": 0.19330792129039764, "learning_rate": 9.08072452580451e-05, "loss": 0.0276, "step": 4011 }, { "epoch": 0.5300393037619315, "grad_norm": 0.25872206687927246, "learning_rate": 9.07658610789367e-05, "loss": 0.0285, "step": 4012 }, { "epoch": 0.5301714172474156, "grad_norm": 0.30612707138061523, "learning_rate": 9.072447849485311e-05, "loss": 0.0369, "step": 4013 }, { "epoch": 0.5303035307328996, "grad_norm": 0.1953236609697342, "learning_rate": 9.068309751294246e-05, "loss": 0.028, "step": 4014 }, { "epoch": 0.5304356442183836, "grad_norm": 0.15855740010738373, "learning_rate": 9.06417181403525e-05, "loss": 0.0181, "step": 4015 }, { "epoch": 0.5305677577038677, "grad_norm": 0.1605386883020401, "learning_rate": 9.060034038423076e-05, "loss": 0.0262, "step": 4016 }, { "epoch": 0.5306998711893517, "grad_norm": 0.15657903254032135, "learning_rate": 9.055896425172448e-05, "loss": 0.0142, "step": 4017 }, { "epoch": 0.5308319846748357, "grad_norm": 0.14779461920261383, "learning_rate": 9.05175897499806e-05, "loss": 0.022, "step": 4018 }, { "epoch": 0.5309640981603198, "grad_norm": 0.1896156370639801, "learning_rate": 9.04762168861458e-05, "loss": 0.0133, "step": 4019 }, { "epoch": 0.5310962116458038, "grad_norm": 0.11784496158361435, "learning_rate": 9.043484566736644e-05, "loss": 0.0137, "step": 4020 }, { "epoch": 0.5312283251312878, "grad_norm": 0.13881589472293854, "learning_rate": 9.039347610078866e-05, "loss": 0.0144, "step": 4021 }, { "epoch": 0.5313604386167718, "grad_norm": 0.1399046927690506, "learning_rate": 9.035210819355827e-05, "loss": 0.0129, "step": 4022 }, { "epoch": 0.5314925521022559, "grad_norm": 0.14713436365127563, "learning_rate": 9.03107419528208e-05, "loss": 0.0095, "step": 4023 }, { "epoch": 0.5316246655877399, "grad_norm": 0.20945364236831665, "learning_rate": 9.026937738572148e-05, "loss": 0.0175, "step": 4024 }, { "epoch": 0.5317567790732239, "grad_norm": 0.10825034230947495, "learning_rate": 9.02280144994053e-05, "loss": 0.0128, "step": 4025 }, { "epoch": 0.531888892558708, "grad_norm": 0.1814744919538498, "learning_rate": 9.01866533010169e-05, "loss": 0.0164, "step": 4026 }, { "epoch": 0.532021006044192, "grad_norm": 0.3017287254333496, "learning_rate": 9.014529379770067e-05, "loss": 0.0222, "step": 4027 }, { "epoch": 0.532153119529676, "grad_norm": 0.177540123462677, "learning_rate": 9.010393599660065e-05, "loss": 0.0147, "step": 4028 }, { "epoch": 0.5322852330151601, "grad_norm": 0.16495585441589355, "learning_rate": 9.00625799048607e-05, "loss": 0.0132, "step": 4029 }, { "epoch": 0.5324173465006441, "grad_norm": 0.1539219319820404, "learning_rate": 9.002122552962423e-05, "loss": 0.0123, "step": 4030 }, { "epoch": 0.5325494599861281, "grad_norm": 0.19247393310070038, "learning_rate": 8.997987287803451e-05, "loss": 0.0192, "step": 4031 }, { "epoch": 0.5326815734716122, "grad_norm": 0.17403817176818848, "learning_rate": 8.993852195723438e-05, "loss": 0.0201, "step": 4032 }, { "epoch": 0.5328136869570962, "grad_norm": 0.20865043997764587, "learning_rate": 8.989717277436647e-05, "loss": 0.0225, "step": 4033 }, { "epoch": 0.5329458004425802, "grad_norm": 0.1501004695892334, "learning_rate": 8.985582533657306e-05, "loss": 0.0106, "step": 4034 }, { "epoch": 0.5330779139280643, "grad_norm": 0.329297810792923, "learning_rate": 8.981447965099616e-05, "loss": 0.0267, "step": 4035 }, { "epoch": 0.5332100274135483, "grad_norm": 0.12887880206108093, "learning_rate": 8.977313572477745e-05, "loss": 0.0172, "step": 4036 }, { "epoch": 0.5333421408990323, "grad_norm": 0.2528422772884369, "learning_rate": 8.973179356505834e-05, "loss": 0.0219, "step": 4037 }, { "epoch": 0.5334742543845163, "grad_norm": 0.2842426002025604, "learning_rate": 8.96904531789799e-05, "loss": 0.0413, "step": 4038 }, { "epoch": 0.5336063678700004, "grad_norm": 0.18451958894729614, "learning_rate": 8.964911457368292e-05, "loss": 0.0192, "step": 4039 }, { "epoch": 0.5337384813554844, "grad_norm": 0.17478865385055542, "learning_rate": 8.960777775630784e-05, "loss": 0.0169, "step": 4040 }, { "epoch": 0.5338705948409684, "grad_norm": 0.19136327505111694, "learning_rate": 8.956644273399487e-05, "loss": 0.0202, "step": 4041 }, { "epoch": 0.5340027083264525, "grad_norm": 0.3356846868991852, "learning_rate": 8.952510951388382e-05, "loss": 0.0192, "step": 4042 }, { "epoch": 0.5341348218119365, "grad_norm": 0.2409062683582306, "learning_rate": 8.948377810311427e-05, "loss": 0.023, "step": 4043 }, { "epoch": 0.5342669352974205, "grad_norm": 0.13506470620632172, "learning_rate": 8.94424485088254e-05, "loss": 0.0097, "step": 4044 }, { "epoch": 0.5343990487829046, "grad_norm": 0.21863852441310883, "learning_rate": 8.940112073815619e-05, "loss": 0.0208, "step": 4045 }, { "epoch": 0.5345311622683886, "grad_norm": 0.2842344641685486, "learning_rate": 8.935979479824519e-05, "loss": 0.0256, "step": 4046 }, { "epoch": 0.5346632757538726, "grad_norm": 0.12600763142108917, "learning_rate": 8.931847069623068e-05, "loss": 0.0183, "step": 4047 }, { "epoch": 0.5347953892393567, "grad_norm": 0.18402507901191711, "learning_rate": 8.927714843925066e-05, "loss": 0.0331, "step": 4048 }, { "epoch": 0.5349275027248407, "grad_norm": 0.18134433031082153, "learning_rate": 8.923582803444274e-05, "loss": 0.0118, "step": 4049 }, { "epoch": 0.5350596162103247, "grad_norm": 0.14058002829551697, "learning_rate": 8.919450948894429e-05, "loss": 0.0143, "step": 4050 }, { "epoch": 0.5351917296958087, "grad_norm": 0.11900404840707779, "learning_rate": 8.915319280989226e-05, "loss": 0.0085, "step": 4051 }, { "epoch": 0.5353238431812928, "grad_norm": 0.13689814507961273, "learning_rate": 8.91118780044234e-05, "loss": 0.0122, "step": 4052 }, { "epoch": 0.5354559566667768, "grad_norm": 0.13677994906902313, "learning_rate": 8.907056507967402e-05, "loss": 0.0186, "step": 4053 }, { "epoch": 0.5355880701522608, "grad_norm": 0.0793822780251503, "learning_rate": 8.902925404278017e-05, "loss": 0.0083, "step": 4054 }, { "epoch": 0.5357201836377449, "grad_norm": 0.18173062801361084, "learning_rate": 8.898794490087757e-05, "loss": 0.0259, "step": 4055 }, { "epoch": 0.5358522971232289, "grad_norm": 0.17125774919986725, "learning_rate": 8.894663766110159e-05, "loss": 0.0168, "step": 4056 }, { "epoch": 0.5359844106087129, "grad_norm": 0.19805628061294556, "learning_rate": 8.890533233058729e-05, "loss": 0.0237, "step": 4057 }, { "epoch": 0.536116524094197, "grad_norm": 0.15703165531158447, "learning_rate": 8.886402891646937e-05, "loss": 0.0157, "step": 4058 }, { "epoch": 0.536248637579681, "grad_norm": 0.12223298847675323, "learning_rate": 8.882272742588226e-05, "loss": 0.0155, "step": 4059 }, { "epoch": 0.536380751065165, "grad_norm": 0.2299620360136032, "learning_rate": 8.878142786596002e-05, "loss": 0.0076, "step": 4060 }, { "epoch": 0.5365128645506491, "grad_norm": 0.21877168118953705, "learning_rate": 8.874013024383631e-05, "loss": 0.0221, "step": 4061 }, { "epoch": 0.5366449780361331, "grad_norm": 0.20613844692707062, "learning_rate": 8.869883456664462e-05, "loss": 0.0192, "step": 4062 }, { "epoch": 0.5367770915216171, "grad_norm": 0.10324428975582123, "learning_rate": 8.865754084151792e-05, "loss": 0.0126, "step": 4063 }, { "epoch": 0.5369092050071012, "grad_norm": 0.0655319094657898, "learning_rate": 8.861624907558899e-05, "loss": 0.0048, "step": 4064 }, { "epoch": 0.5370413184925852, "grad_norm": 0.17869147658348083, "learning_rate": 8.857495927599018e-05, "loss": 0.0197, "step": 4065 }, { "epoch": 0.5371734319780692, "grad_norm": 0.1799241304397583, "learning_rate": 8.853367144985355e-05, "loss": 0.0275, "step": 4066 }, { "epoch": 0.5373055454635532, "grad_norm": 0.12295150011777878, "learning_rate": 8.849238560431079e-05, "loss": 0.0092, "step": 4067 }, { "epoch": 0.5374376589490373, "grad_norm": 0.23537562787532806, "learning_rate": 8.845110174649323e-05, "loss": 0.0248, "step": 4068 }, { "epoch": 0.5375697724345213, "grad_norm": 0.1479044109582901, "learning_rate": 8.840981988353193e-05, "loss": 0.0126, "step": 4069 }, { "epoch": 0.5377018859200052, "grad_norm": 0.14745278656482697, "learning_rate": 8.836854002255752e-05, "loss": 0.0151, "step": 4070 }, { "epoch": 0.5378339994054893, "grad_norm": 0.1753583401441574, "learning_rate": 8.832726217070037e-05, "loss": 0.0187, "step": 4071 }, { "epoch": 0.5379661128909733, "grad_norm": 0.1836109310388565, "learning_rate": 8.828598633509041e-05, "loss": 0.0173, "step": 4072 }, { "epoch": 0.5380982263764573, "grad_norm": 0.15662620961666107, "learning_rate": 8.82447125228573e-05, "loss": 0.0126, "step": 4073 }, { "epoch": 0.5382303398619414, "grad_norm": 0.1305164247751236, "learning_rate": 8.820344074113034e-05, "loss": 0.0105, "step": 4074 }, { "epoch": 0.5383624533474254, "grad_norm": 0.19191725552082062, "learning_rate": 8.816217099703839e-05, "loss": 0.0152, "step": 4075 }, { "epoch": 0.5384945668329094, "grad_norm": 0.23539984226226807, "learning_rate": 8.812090329771007e-05, "loss": 0.0161, "step": 4076 }, { "epoch": 0.5386266803183934, "grad_norm": 0.2325924038887024, "learning_rate": 8.807963765027359e-05, "loss": 0.0258, "step": 4077 }, { "epoch": 0.5387587938038775, "grad_norm": 0.15382160246372223, "learning_rate": 8.803837406185686e-05, "loss": 0.0154, "step": 4078 }, { "epoch": 0.5388909072893615, "grad_norm": 0.1661686897277832, "learning_rate": 8.799711253958733e-05, "loss": 0.0154, "step": 4079 }, { "epoch": 0.5390230207748455, "grad_norm": 0.16271163523197174, "learning_rate": 8.79558530905922e-05, "loss": 0.0139, "step": 4080 }, { "epoch": 0.5391551342603296, "grad_norm": 0.28479182720184326, "learning_rate": 8.791459572199827e-05, "loss": 0.023, "step": 4081 }, { "epoch": 0.5392872477458136, "grad_norm": 0.22662101686000824, "learning_rate": 8.787334044093195e-05, "loss": 0.0359, "step": 4082 }, { "epoch": 0.5394193612312976, "grad_norm": 0.16315026581287384, "learning_rate": 8.783208725451929e-05, "loss": 0.0142, "step": 4083 }, { "epoch": 0.5395514747167817, "grad_norm": 0.15125629305839539, "learning_rate": 8.779083616988611e-05, "loss": 0.0158, "step": 4084 }, { "epoch": 0.5396835882022657, "grad_norm": 0.11059519648551941, "learning_rate": 8.774958719415767e-05, "loss": 0.0094, "step": 4085 }, { "epoch": 0.5398157016877497, "grad_norm": 0.16920366883277893, "learning_rate": 8.770834033445901e-05, "loss": 0.0203, "step": 4086 }, { "epoch": 0.5399478151732338, "grad_norm": 0.15966933965682983, "learning_rate": 8.766709559791473e-05, "loss": 0.0122, "step": 4087 }, { "epoch": 0.5400799286587178, "grad_norm": 0.1668696254491806, "learning_rate": 8.762585299164912e-05, "loss": 0.0148, "step": 4088 }, { "epoch": 0.5402120421442018, "grad_norm": 0.14590130746364594, "learning_rate": 8.758461252278603e-05, "loss": 0.0142, "step": 4089 }, { "epoch": 0.5403441556296859, "grad_norm": 0.2944794297218323, "learning_rate": 8.754337419844897e-05, "loss": 0.0271, "step": 4090 }, { "epoch": 0.5404762691151699, "grad_norm": 0.20897763967514038, "learning_rate": 8.750213802576114e-05, "loss": 0.0183, "step": 4091 }, { "epoch": 0.5406083826006539, "grad_norm": 0.18604964017868042, "learning_rate": 8.746090401184526e-05, "loss": 0.0174, "step": 4092 }, { "epoch": 0.540740496086138, "grad_norm": 0.2030273675918579, "learning_rate": 8.74196721638238e-05, "loss": 0.0216, "step": 4093 }, { "epoch": 0.540872609571622, "grad_norm": 0.25758877396583557, "learning_rate": 8.737844248881873e-05, "loss": 0.0168, "step": 4094 }, { "epoch": 0.541004723057106, "grad_norm": 0.18899019062519073, "learning_rate": 8.733721499395174e-05, "loss": 0.0111, "step": 4095 }, { "epoch": 0.54113683654259, "grad_norm": 0.20160093903541565, "learning_rate": 8.729598968634412e-05, "loss": 0.0205, "step": 4096 }, { "epoch": 0.5412689500280741, "grad_norm": 0.14404238760471344, "learning_rate": 8.725476657311671e-05, "loss": 0.0146, "step": 4097 }, { "epoch": 0.5414010635135581, "grad_norm": 0.1566837728023529, "learning_rate": 8.72135456613901e-05, "loss": 0.0165, "step": 4098 }, { "epoch": 0.5415331769990421, "grad_norm": 0.18331080675125122, "learning_rate": 8.71723269582844e-05, "loss": 0.0209, "step": 4099 }, { "epoch": 0.5416652904845262, "grad_norm": 0.20835472643375397, "learning_rate": 8.713111047091939e-05, "loss": 0.0258, "step": 4100 }, { "epoch": 0.5417974039700102, "grad_norm": 0.263864666223526, "learning_rate": 8.70898962064144e-05, "loss": 0.0237, "step": 4101 }, { "epoch": 0.5419295174554942, "grad_norm": 0.20449554920196533, "learning_rate": 8.704868417188849e-05, "loss": 0.0284, "step": 4102 }, { "epoch": 0.5420616309409783, "grad_norm": 0.22227758169174194, "learning_rate": 8.700747437446023e-05, "loss": 0.018, "step": 4103 }, { "epoch": 0.5421937444264623, "grad_norm": 0.18106292188167572, "learning_rate": 8.696626682124782e-05, "loss": 0.0101, "step": 4104 }, { "epoch": 0.5423258579119463, "grad_norm": 0.28230977058410645, "learning_rate": 8.692506151936916e-05, "loss": 0.0135, "step": 4105 }, { "epoch": 0.5424579713974303, "grad_norm": 0.16065078973770142, "learning_rate": 8.688385847594162e-05, "loss": 0.0204, "step": 4106 }, { "epoch": 0.5425900848829144, "grad_norm": 0.41624608635902405, "learning_rate": 8.684265769808232e-05, "loss": 0.0154, "step": 4107 }, { "epoch": 0.5427221983683984, "grad_norm": 0.17645595967769623, "learning_rate": 8.680145919290787e-05, "loss": 0.0354, "step": 4108 }, { "epoch": 0.5428543118538824, "grad_norm": 0.16414207220077515, "learning_rate": 8.676026296753459e-05, "loss": 0.0138, "step": 4109 }, { "epoch": 0.5429864253393665, "grad_norm": 0.20613645017147064, "learning_rate": 8.671906902907833e-05, "loss": 0.0144, "step": 4110 }, { "epoch": 0.5431185388248505, "grad_norm": 0.2929520010948181, "learning_rate": 8.667787738465458e-05, "loss": 0.0298, "step": 4111 }, { "epoch": 0.5432506523103345, "grad_norm": 0.2482895404100418, "learning_rate": 8.663668804137843e-05, "loss": 0.0263, "step": 4112 }, { "epoch": 0.5433827657958186, "grad_norm": 0.15026050806045532, "learning_rate": 8.659550100636454e-05, "loss": 0.0185, "step": 4113 }, { "epoch": 0.5435148792813026, "grad_norm": 0.1841093897819519, "learning_rate": 8.655431628672725e-05, "loss": 0.0171, "step": 4114 }, { "epoch": 0.5436469927667866, "grad_norm": 0.36599624156951904, "learning_rate": 8.65131338895804e-05, "loss": 0.0197, "step": 4115 }, { "epoch": 0.5437791062522707, "grad_norm": 0.24211275577545166, "learning_rate": 8.647195382203753e-05, "loss": 0.025, "step": 4116 }, { "epoch": 0.5439112197377547, "grad_norm": 0.20414648950099945, "learning_rate": 8.643077609121168e-05, "loss": 0.0254, "step": 4117 }, { "epoch": 0.5440433332232387, "grad_norm": 0.15253716707229614, "learning_rate": 8.638960070421554e-05, "loss": 0.0194, "step": 4118 }, { "epoch": 0.5441754467087228, "grad_norm": 0.19736583530902863, "learning_rate": 8.634842766816143e-05, "loss": 0.0179, "step": 4119 }, { "epoch": 0.5443075601942068, "grad_norm": 0.06327956169843674, "learning_rate": 8.630725699016118e-05, "loss": 0.0053, "step": 4120 }, { "epoch": 0.5444396736796908, "grad_norm": 0.19995535910129547, "learning_rate": 8.626608867732627e-05, "loss": 0.0205, "step": 4121 }, { "epoch": 0.5445717871651748, "grad_norm": 0.2537860572338104, "learning_rate": 8.622492273676774e-05, "loss": 0.0158, "step": 4122 }, { "epoch": 0.5447039006506589, "grad_norm": 0.180791437625885, "learning_rate": 8.618375917559627e-05, "loss": 0.0214, "step": 4123 }, { "epoch": 0.5448360141361429, "grad_norm": 0.1629365086555481, "learning_rate": 8.614259800092209e-05, "loss": 0.0267, "step": 4124 }, { "epoch": 0.5449681276216269, "grad_norm": 0.12369240075349808, "learning_rate": 8.610143921985498e-05, "loss": 0.0117, "step": 4125 }, { "epoch": 0.545100241107111, "grad_norm": 0.16354741156101227, "learning_rate": 8.606028283950441e-05, "loss": 0.0138, "step": 4126 }, { "epoch": 0.545232354592595, "grad_norm": 0.12069833278656006, "learning_rate": 8.60191288669793e-05, "loss": 0.0123, "step": 4127 }, { "epoch": 0.545364468078079, "grad_norm": 0.1605876088142395, "learning_rate": 8.59779773093883e-05, "loss": 0.0168, "step": 4128 }, { "epoch": 0.5454965815635631, "grad_norm": 0.21643571555614471, "learning_rate": 8.593682817383955e-05, "loss": 0.0171, "step": 4129 }, { "epoch": 0.5456286950490471, "grad_norm": 0.10620002448558807, "learning_rate": 8.589568146744078e-05, "loss": 0.0105, "step": 4130 }, { "epoch": 0.5457608085345311, "grad_norm": 0.21386931836605072, "learning_rate": 8.585453719729935e-05, "loss": 0.0162, "step": 4131 }, { "epoch": 0.5458929220200152, "grad_norm": 0.18839512765407562, "learning_rate": 8.58133953705221e-05, "loss": 0.024, "step": 4132 }, { "epoch": 0.5460250355054992, "grad_norm": 0.2717703878879547, "learning_rate": 8.577225599421558e-05, "loss": 0.0203, "step": 4133 }, { "epoch": 0.5461571489909832, "grad_norm": 0.16906102001667023, "learning_rate": 8.573111907548578e-05, "loss": 0.0206, "step": 4134 }, { "epoch": 0.5462892624764673, "grad_norm": 1.166779637336731, "learning_rate": 8.56899846214384e-05, "loss": 0.0125, "step": 4135 }, { "epoch": 0.5464213759619513, "grad_norm": 0.21797332167625427, "learning_rate": 8.564885263917861e-05, "loss": 0.0299, "step": 4136 }, { "epoch": 0.5465534894474353, "grad_norm": 0.11769171804189682, "learning_rate": 8.56077231358112e-05, "loss": 0.0096, "step": 4137 }, { "epoch": 0.5466856029329193, "grad_norm": 0.18901102244853973, "learning_rate": 8.556659611844054e-05, "loss": 0.0107, "step": 4138 }, { "epoch": 0.5468177164184034, "grad_norm": 0.16076532006263733, "learning_rate": 8.55254715941705e-05, "loss": 0.0184, "step": 4139 }, { "epoch": 0.5469498299038874, "grad_norm": 0.3321286141872406, "learning_rate": 8.548434957010464e-05, "loss": 0.0244, "step": 4140 }, { "epoch": 0.5470819433893714, "grad_norm": 0.23241344094276428, "learning_rate": 8.544323005334596e-05, "loss": 0.017, "step": 4141 }, { "epoch": 0.5472140568748555, "grad_norm": 0.17156289517879486, "learning_rate": 8.54021130509971e-05, "loss": 0.0198, "step": 4142 }, { "epoch": 0.5473461703603395, "grad_norm": 0.1776096373796463, "learning_rate": 8.536099857016031e-05, "loss": 0.0207, "step": 4143 }, { "epoch": 0.5474782838458235, "grad_norm": 0.17695477604866028, "learning_rate": 8.531988661793729e-05, "loss": 0.0239, "step": 4144 }, { "epoch": 0.5476103973313076, "grad_norm": 0.1847931146621704, "learning_rate": 8.52787772014294e-05, "loss": 0.0209, "step": 4145 }, { "epoch": 0.5477425108167916, "grad_norm": 0.10361135005950928, "learning_rate": 8.52376703277375e-05, "loss": 0.0096, "step": 4146 }, { "epoch": 0.5478746243022756, "grad_norm": 0.19445562362670898, "learning_rate": 8.5196566003962e-05, "loss": 0.0168, "step": 4147 }, { "epoch": 0.5480067377877597, "grad_norm": 0.24059665203094482, "learning_rate": 8.515546423720298e-05, "loss": 0.0118, "step": 4148 }, { "epoch": 0.5481388512732437, "grad_norm": 0.201985165476799, "learning_rate": 8.511436503455994e-05, "loss": 0.0134, "step": 4149 }, { "epoch": 0.5482709647587277, "grad_norm": 0.21374210715293884, "learning_rate": 8.507326840313204e-05, "loss": 0.0167, "step": 4150 }, { "epoch": 0.5484030782442118, "grad_norm": 0.16954916715621948, "learning_rate": 8.503217435001788e-05, "loss": 0.0146, "step": 4151 }, { "epoch": 0.5485351917296958, "grad_norm": 0.21463601291179657, "learning_rate": 8.499108288231581e-05, "loss": 0.0216, "step": 4152 }, { "epoch": 0.5486673052151798, "grad_norm": 0.21518591046333313, "learning_rate": 8.494999400712352e-05, "loss": 0.0226, "step": 4153 }, { "epoch": 0.5487994187006638, "grad_norm": 0.20929618179798126, "learning_rate": 8.490890773153835e-05, "loss": 0.0154, "step": 4154 }, { "epoch": 0.5489315321861479, "grad_norm": 0.27093571424484253, "learning_rate": 8.486782406265724e-05, "loss": 0.0306, "step": 4155 }, { "epoch": 0.5490636456716319, "grad_norm": 0.14851485192775726, "learning_rate": 8.482674300757657e-05, "loss": 0.0174, "step": 4156 }, { "epoch": 0.5491957591571159, "grad_norm": 0.11216623336076736, "learning_rate": 8.478566457339237e-05, "loss": 0.0087, "step": 4157 }, { "epoch": 0.5493278726426, "grad_norm": 0.14842119812965393, "learning_rate": 8.474458876720011e-05, "loss": 0.0114, "step": 4158 }, { "epoch": 0.549459986128084, "grad_norm": 0.27710989117622375, "learning_rate": 8.470351559609494e-05, "loss": 0.0226, "step": 4159 }, { "epoch": 0.549592099613568, "grad_norm": 0.16247709095478058, "learning_rate": 8.466244506717146e-05, "loss": 0.0166, "step": 4160 }, { "epoch": 0.5497242130990521, "grad_norm": 0.22627033293247223, "learning_rate": 8.462137718752378e-05, "loss": 0.0114, "step": 4161 }, { "epoch": 0.5498563265845361, "grad_norm": 0.12567301094532013, "learning_rate": 8.458031196424569e-05, "loss": 0.0117, "step": 4162 }, { "epoch": 0.5499884400700201, "grad_norm": 0.11331895738840103, "learning_rate": 8.453924940443037e-05, "loss": 0.0095, "step": 4163 }, { "epoch": 0.5501205535555042, "grad_norm": 0.13814541697502136, "learning_rate": 8.449818951517068e-05, "loss": 0.0152, "step": 4164 }, { "epoch": 0.5502526670409882, "grad_norm": 0.15670281648635864, "learning_rate": 8.445713230355888e-05, "loss": 0.0152, "step": 4165 }, { "epoch": 0.5503847805264722, "grad_norm": 0.1425250917673111, "learning_rate": 8.441607777668688e-05, "loss": 0.0143, "step": 4166 }, { "epoch": 0.5505168940119562, "grad_norm": 0.16183023154735565, "learning_rate": 8.437502594164607e-05, "loss": 0.0121, "step": 4167 }, { "epoch": 0.5506490074974403, "grad_norm": 0.18989704549312592, "learning_rate": 8.433397680552735e-05, "loss": 0.0173, "step": 4168 }, { "epoch": 0.5507811209829243, "grad_norm": 0.24186430871486664, "learning_rate": 8.429293037542127e-05, "loss": 0.0211, "step": 4169 }, { "epoch": 0.5509132344684083, "grad_norm": 0.1806151121854782, "learning_rate": 8.425188665841775e-05, "loss": 0.0216, "step": 4170 }, { "epoch": 0.5510453479538924, "grad_norm": 0.2237834483385086, "learning_rate": 8.42108456616064e-05, "loss": 0.0426, "step": 4171 }, { "epoch": 0.5511774614393764, "grad_norm": 0.23168422281742096, "learning_rate": 8.416980739207621e-05, "loss": 0.0285, "step": 4172 }, { "epoch": 0.5513095749248604, "grad_norm": 0.14686016738414764, "learning_rate": 8.412877185691584e-05, "loss": 0.0128, "step": 4173 }, { "epoch": 0.5514416884103445, "grad_norm": 0.14953650534152985, "learning_rate": 8.408773906321339e-05, "loss": 0.0159, "step": 4174 }, { "epoch": 0.5515738018958285, "grad_norm": 0.21378695964813232, "learning_rate": 8.404670901805647e-05, "loss": 0.022, "step": 4175 }, { "epoch": 0.5517059153813125, "grad_norm": 0.22658562660217285, "learning_rate": 8.400568172853232e-05, "loss": 0.0148, "step": 4176 }, { "epoch": 0.5518380288667966, "grad_norm": 0.25272002816200256, "learning_rate": 8.396465720172755e-05, "loss": 0.0254, "step": 4177 }, { "epoch": 0.5519701423522806, "grad_norm": 0.18372265994548798, "learning_rate": 8.392363544472848e-05, "loss": 0.0182, "step": 4178 }, { "epoch": 0.5521022558377646, "grad_norm": 0.2196115404367447, "learning_rate": 8.388261646462077e-05, "loss": 0.024, "step": 4179 }, { "epoch": 0.5522343693232487, "grad_norm": 0.19392231106758118, "learning_rate": 8.384160026848974e-05, "loss": 0.023, "step": 4180 }, { "epoch": 0.5523664828087327, "grad_norm": 0.1629837453365326, "learning_rate": 8.380058686342014e-05, "loss": 0.0183, "step": 4181 }, { "epoch": 0.5524985962942167, "grad_norm": 0.28126442432403564, "learning_rate": 8.375957625649627e-05, "loss": 0.0321, "step": 4182 }, { "epoch": 0.5526307097797007, "grad_norm": 0.2287890464067459, "learning_rate": 8.371856845480195e-05, "loss": 0.0204, "step": 4183 }, { "epoch": 0.5527628232651848, "grad_norm": 0.18556039035320282, "learning_rate": 8.36775634654205e-05, "loss": 0.0273, "step": 4184 }, { "epoch": 0.5528949367506688, "grad_norm": 0.21663334965705872, "learning_rate": 8.363656129543478e-05, "loss": 0.0238, "step": 4185 }, { "epoch": 0.5530270502361528, "grad_norm": 0.24293342232704163, "learning_rate": 8.359556195192715e-05, "loss": 0.0249, "step": 4186 }, { "epoch": 0.5531591637216369, "grad_norm": 0.18286862969398499, "learning_rate": 8.355456544197949e-05, "loss": 0.0322, "step": 4187 }, { "epoch": 0.5532912772071209, "grad_norm": 0.12769746780395508, "learning_rate": 8.351357177267317e-05, "loss": 0.0161, "step": 4188 }, { "epoch": 0.5534233906926049, "grad_norm": 0.17266884446144104, "learning_rate": 8.347258095108902e-05, "loss": 0.0196, "step": 4189 }, { "epoch": 0.553555504178089, "grad_norm": 0.2900910973548889, "learning_rate": 8.343159298430755e-05, "loss": 0.0199, "step": 4190 }, { "epoch": 0.553687617663573, "grad_norm": 0.2615102529525757, "learning_rate": 8.339060787940858e-05, "loss": 0.0328, "step": 4191 }, { "epoch": 0.553819731149057, "grad_norm": 0.19165071845054626, "learning_rate": 8.334962564347156e-05, "loss": 0.0214, "step": 4192 }, { "epoch": 0.5539518446345411, "grad_norm": 0.2141731232404709, "learning_rate": 8.330864628357537e-05, "loss": 0.0236, "step": 4193 }, { "epoch": 0.5540839581200251, "grad_norm": 0.471525639295578, "learning_rate": 8.326766980679849e-05, "loss": 0.0303, "step": 4194 }, { "epoch": 0.5542160716055091, "grad_norm": 0.15691663324832916, "learning_rate": 8.32266962202188e-05, "loss": 0.0087, "step": 4195 }, { "epoch": 0.5543481850909932, "grad_norm": 0.1748238503932953, "learning_rate": 8.318572553091368e-05, "loss": 0.0155, "step": 4196 }, { "epoch": 0.5544802985764772, "grad_norm": 0.2593716084957123, "learning_rate": 8.314475774596014e-05, "loss": 0.0246, "step": 4197 }, { "epoch": 0.5546124120619612, "grad_norm": 0.2592805325984955, "learning_rate": 8.31037928724345e-05, "loss": 0.0277, "step": 4198 }, { "epoch": 0.5547445255474452, "grad_norm": 0.15535563230514526, "learning_rate": 8.306283091741278e-05, "loss": 0.0171, "step": 4199 }, { "epoch": 0.5548766390329293, "grad_norm": 0.18834006786346436, "learning_rate": 8.302187188797029e-05, "loss": 0.0163, "step": 4200 }, { "epoch": 0.5550087525184133, "grad_norm": 0.1360701322555542, "learning_rate": 8.2980915791182e-05, "loss": 0.013, "step": 4201 }, { "epoch": 0.5551408660038973, "grad_norm": 0.15888836979866028, "learning_rate": 8.293996263412233e-05, "loss": 0.0106, "step": 4202 }, { "epoch": 0.5552729794893814, "grad_norm": 0.13162779808044434, "learning_rate": 8.289901242386513e-05, "loss": 0.0194, "step": 4203 }, { "epoch": 0.5554050929748654, "grad_norm": 0.28755760192871094, "learning_rate": 8.285806516748377e-05, "loss": 0.0243, "step": 4204 }, { "epoch": 0.5555372064603494, "grad_norm": 0.22045432031154633, "learning_rate": 8.281712087205115e-05, "loss": 0.0285, "step": 4205 }, { "epoch": 0.5556693199458335, "grad_norm": 0.19695398211479187, "learning_rate": 8.277617954463964e-05, "loss": 0.0201, "step": 4206 }, { "epoch": 0.5558014334313175, "grad_norm": 0.17969001829624176, "learning_rate": 8.273524119232108e-05, "loss": 0.0223, "step": 4207 }, { "epoch": 0.5559335469168015, "grad_norm": 0.1344221979379654, "learning_rate": 8.269430582216678e-05, "loss": 0.0181, "step": 4208 }, { "epoch": 0.5560656604022856, "grad_norm": 0.2272825688123703, "learning_rate": 8.26533734412476e-05, "loss": 0.0226, "step": 4209 }, { "epoch": 0.5561977738877696, "grad_norm": 0.17867231369018555, "learning_rate": 8.261244405663382e-05, "loss": 0.0221, "step": 4210 }, { "epoch": 0.5563298873732536, "grad_norm": 0.1968875676393509, "learning_rate": 8.25715176753952e-05, "loss": 0.019, "step": 4211 }, { "epoch": 0.5564620008587376, "grad_norm": 0.1409069150686264, "learning_rate": 8.253059430460108e-05, "loss": 0.0194, "step": 4212 }, { "epoch": 0.5565941143442217, "grad_norm": 0.19733133912086487, "learning_rate": 8.248967395132013e-05, "loss": 0.0168, "step": 4213 }, { "epoch": 0.5567262278297057, "grad_norm": 0.3398911654949188, "learning_rate": 8.244875662262064e-05, "loss": 0.0266, "step": 4214 }, { "epoch": 0.5568583413151897, "grad_norm": 0.1569708287715912, "learning_rate": 8.240784232557024e-05, "loss": 0.0151, "step": 4215 }, { "epoch": 0.5569904548006738, "grad_norm": 0.12734025716781616, "learning_rate": 8.23669310672362e-05, "loss": 0.0092, "step": 4216 }, { "epoch": 0.5571225682861578, "grad_norm": 0.18991941213607788, "learning_rate": 8.232602285468512e-05, "loss": 0.0159, "step": 4217 }, { "epoch": 0.5572546817716418, "grad_norm": 0.14975979924201965, "learning_rate": 8.22851176949831e-05, "loss": 0.0202, "step": 4218 }, { "epoch": 0.5573867952571259, "grad_norm": 0.17158475518226624, "learning_rate": 8.224421559519581e-05, "loss": 0.0249, "step": 4219 }, { "epoch": 0.5575189087426099, "grad_norm": 0.08295943588018417, "learning_rate": 8.220331656238827e-05, "loss": 0.0058, "step": 4220 }, { "epoch": 0.5576510222280939, "grad_norm": 0.13717104494571686, "learning_rate": 8.216242060362507e-05, "loss": 0.0139, "step": 4221 }, { "epoch": 0.557783135713578, "grad_norm": 0.2490658164024353, "learning_rate": 8.212152772597018e-05, "loss": 0.008, "step": 4222 }, { "epoch": 0.557915249199062, "grad_norm": 0.17430169880390167, "learning_rate": 8.208063793648711e-05, "loss": 0.0185, "step": 4223 }, { "epoch": 0.558047362684546, "grad_norm": 0.19915857911109924, "learning_rate": 8.203975124223878e-05, "loss": 0.018, "step": 4224 }, { "epoch": 0.55817947617003, "grad_norm": 0.14223940670490265, "learning_rate": 8.199886765028762e-05, "loss": 0.0127, "step": 4225 }, { "epoch": 0.5583115896555141, "grad_norm": 0.13848844170570374, "learning_rate": 8.19579871676955e-05, "loss": 0.017, "step": 4226 }, { "epoch": 0.5584437031409981, "grad_norm": 0.16251258552074432, "learning_rate": 8.191710980152374e-05, "loss": 0.0156, "step": 4227 }, { "epoch": 0.5585758166264821, "grad_norm": 0.24074551463127136, "learning_rate": 8.187623555883321e-05, "loss": 0.0162, "step": 4228 }, { "epoch": 0.5587079301119662, "grad_norm": 0.1603114753961563, "learning_rate": 8.183536444668407e-05, "loss": 0.0222, "step": 4229 }, { "epoch": 0.5588400435974502, "grad_norm": 0.37383100390434265, "learning_rate": 8.179449647213613e-05, "loss": 0.0278, "step": 4230 }, { "epoch": 0.5589721570829342, "grad_norm": 0.1535310596227646, "learning_rate": 8.175363164224853e-05, "loss": 0.0116, "step": 4231 }, { "epoch": 0.5591042705684183, "grad_norm": 0.16033555567264557, "learning_rate": 8.171276996407989e-05, "loss": 0.0128, "step": 4232 }, { "epoch": 0.5592363840539023, "grad_norm": 0.17384503781795502, "learning_rate": 8.167191144468832e-05, "loss": 0.0182, "step": 4233 }, { "epoch": 0.5593684975393863, "grad_norm": 0.1818269044160843, "learning_rate": 8.163105609113135e-05, "loss": 0.0169, "step": 4234 }, { "epoch": 0.5595006110248704, "grad_norm": 0.12374034523963928, "learning_rate": 8.159020391046601e-05, "loss": 0.0147, "step": 4235 }, { "epoch": 0.5596327245103544, "grad_norm": 0.13295878469944, "learning_rate": 8.154935490974873e-05, "loss": 0.0155, "step": 4236 }, { "epoch": 0.5597648379958384, "grad_norm": 0.25557607412338257, "learning_rate": 8.150850909603541e-05, "loss": 0.0318, "step": 4237 }, { "epoch": 0.5598969514813225, "grad_norm": 0.2126423716545105, "learning_rate": 8.146766647638142e-05, "loss": 0.0335, "step": 4238 }, { "epoch": 0.5600290649668065, "grad_norm": 0.19664879143238068, "learning_rate": 8.14268270578415e-05, "loss": 0.03, "step": 4239 }, { "epoch": 0.5601611784522905, "grad_norm": 0.1645100712776184, "learning_rate": 8.138599084746998e-05, "loss": 0.0226, "step": 4240 }, { "epoch": 0.5602932919377746, "grad_norm": 0.1636127531528473, "learning_rate": 8.134515785232049e-05, "loss": 0.0279, "step": 4241 }, { "epoch": 0.5604254054232586, "grad_norm": 0.1539396494626999, "learning_rate": 8.130432807944618e-05, "loss": 0.0113, "step": 4242 }, { "epoch": 0.5605575189087426, "grad_norm": 0.16480429470539093, "learning_rate": 8.126350153589964e-05, "loss": 0.0306, "step": 4243 }, { "epoch": 0.5606896323942266, "grad_norm": 0.16860733926296234, "learning_rate": 8.12226782287329e-05, "loss": 0.0256, "step": 4244 }, { "epoch": 0.5608217458797107, "grad_norm": 0.19238042831420898, "learning_rate": 8.118185816499743e-05, "loss": 0.0203, "step": 4245 }, { "epoch": 0.5609538593651947, "grad_norm": 0.176408052444458, "learning_rate": 8.114104135174408e-05, "loss": 0.0273, "step": 4246 }, { "epoch": 0.5610859728506787, "grad_norm": 0.19235467910766602, "learning_rate": 8.110022779602323e-05, "loss": 0.02, "step": 4247 }, { "epoch": 0.5612180863361628, "grad_norm": 0.1220388188958168, "learning_rate": 8.105941750488465e-05, "loss": 0.008, "step": 4248 }, { "epoch": 0.5613501998216468, "grad_norm": 0.15233349800109863, "learning_rate": 8.101861048537757e-05, "loss": 0.0237, "step": 4249 }, { "epoch": 0.5614823133071308, "grad_norm": 0.2459915727376938, "learning_rate": 8.097780674455062e-05, "loss": 0.0249, "step": 4250 }, { "epoch": 0.5616144267926149, "grad_norm": 0.14926408231258392, "learning_rate": 8.093700628945191e-05, "loss": 0.016, "step": 4251 }, { "epoch": 0.5617465402780989, "grad_norm": 0.2099609375, "learning_rate": 8.089620912712894e-05, "loss": 0.0302, "step": 4252 }, { "epoch": 0.5618786537635829, "grad_norm": 0.24913211166858673, "learning_rate": 8.085541526462862e-05, "loss": 0.0193, "step": 4253 }, { "epoch": 0.562010767249067, "grad_norm": 0.1699565201997757, "learning_rate": 8.081462470899738e-05, "loss": 0.0135, "step": 4254 }, { "epoch": 0.562142880734551, "grad_norm": 0.14638663828372955, "learning_rate": 8.077383746728101e-05, "loss": 0.0176, "step": 4255 }, { "epoch": 0.562274994220035, "grad_norm": 0.22291651368141174, "learning_rate": 8.073305354652475e-05, "loss": 0.0208, "step": 4256 }, { "epoch": 0.562407107705519, "grad_norm": 0.1744988113641739, "learning_rate": 8.069227295377322e-05, "loss": 0.0161, "step": 4257 }, { "epoch": 0.5625392211910031, "grad_norm": 0.17007562518119812, "learning_rate": 8.065149569607057e-05, "loss": 0.029, "step": 4258 }, { "epoch": 0.5626713346764871, "grad_norm": 0.19890165328979492, "learning_rate": 8.061072178046023e-05, "loss": 0.0136, "step": 4259 }, { "epoch": 0.5628034481619711, "grad_norm": 0.19456276297569275, "learning_rate": 8.05699512139852e-05, "loss": 0.0153, "step": 4260 }, { "epoch": 0.5629355616474552, "grad_norm": 0.21431830525398254, "learning_rate": 8.05291840036878e-05, "loss": 0.02, "step": 4261 }, { "epoch": 0.5630676751329392, "grad_norm": 0.1389990746974945, "learning_rate": 8.048842015660984e-05, "loss": 0.017, "step": 4262 }, { "epoch": 0.5631997886184232, "grad_norm": 0.11417852342128754, "learning_rate": 8.044765967979247e-05, "loss": 0.0113, "step": 4263 }, { "epoch": 0.5633319021039073, "grad_norm": 0.23878905177116394, "learning_rate": 8.040690258027632e-05, "loss": 0.0196, "step": 4264 }, { "epoch": 0.5634640155893913, "grad_norm": 0.2255990356206894, "learning_rate": 8.03661488651014e-05, "loss": 0.0174, "step": 4265 }, { "epoch": 0.5635961290748753, "grad_norm": 0.2871243357658386, "learning_rate": 8.032539854130719e-05, "loss": 0.0168, "step": 4266 }, { "epoch": 0.5637282425603594, "grad_norm": 0.3738422393798828, "learning_rate": 8.028465161593251e-05, "loss": 0.0372, "step": 4267 }, { "epoch": 0.5638603560458434, "grad_norm": 0.19977661967277527, "learning_rate": 8.02439080960156e-05, "loss": 0.0156, "step": 4268 }, { "epoch": 0.5639924695313274, "grad_norm": 0.2826353907585144, "learning_rate": 8.020316798859424e-05, "loss": 0.0253, "step": 4269 }, { "epoch": 0.5641245830168115, "grad_norm": 0.1400228589773178, "learning_rate": 8.016243130070542e-05, "loss": 0.015, "step": 4270 }, { "epoch": 0.5642566965022955, "grad_norm": 0.14575998485088348, "learning_rate": 8.012169803938572e-05, "loss": 0.0108, "step": 4271 }, { "epoch": 0.5643888099877795, "grad_norm": 0.4165917932987213, "learning_rate": 8.008096821167097e-05, "loss": 0.0185, "step": 4272 }, { "epoch": 0.5645209234732635, "grad_norm": 0.1631278544664383, "learning_rate": 8.004024182459657e-05, "loss": 0.0149, "step": 4273 }, { "epoch": 0.5646530369587476, "grad_norm": 0.22125643491744995, "learning_rate": 7.99995188851972e-05, "loss": 0.0294, "step": 4274 }, { "epoch": 0.5647851504442316, "grad_norm": 0.12258799374103546, "learning_rate": 7.995879940050695e-05, "loss": 0.017, "step": 4275 }, { "epoch": 0.5649172639297156, "grad_norm": 0.11797162145376205, "learning_rate": 7.991808337755944e-05, "loss": 0.0111, "step": 4276 }, { "epoch": 0.5650493774151997, "grad_norm": 0.1532120555639267, "learning_rate": 7.98773708233875e-05, "loss": 0.0177, "step": 4277 }, { "epoch": 0.5651814909006837, "grad_norm": 0.24839898943901062, "learning_rate": 7.983666174502355e-05, "loss": 0.0164, "step": 4278 }, { "epoch": 0.5653136043861677, "grad_norm": 0.19382832944393158, "learning_rate": 7.979595614949925e-05, "loss": 0.0235, "step": 4279 }, { "epoch": 0.5654457178716518, "grad_norm": 0.11616852134466171, "learning_rate": 7.97552540438458e-05, "loss": 0.0089, "step": 4280 }, { "epoch": 0.5655778313571358, "grad_norm": 0.24005387723445892, "learning_rate": 7.971455543509367e-05, "loss": 0.0167, "step": 4281 }, { "epoch": 0.5657099448426198, "grad_norm": 0.14314687252044678, "learning_rate": 7.967386033027281e-05, "loss": 0.0177, "step": 4282 }, { "epoch": 0.5658420583281039, "grad_norm": 0.23170839250087738, "learning_rate": 7.963316873641254e-05, "loss": 0.0205, "step": 4283 }, { "epoch": 0.5659741718135879, "grad_norm": 0.4849940538406372, "learning_rate": 7.959248066054155e-05, "loss": 0.057, "step": 4284 }, { "epoch": 0.5661062852990719, "grad_norm": 0.17027738690376282, "learning_rate": 7.955179610968799e-05, "loss": 0.0163, "step": 4285 }, { "epoch": 0.566238398784556, "grad_norm": 0.12950539588928223, "learning_rate": 7.95111150908793e-05, "loss": 0.0125, "step": 4286 }, { "epoch": 0.56637051227004, "grad_norm": 0.18599557876586914, "learning_rate": 7.947043761114241e-05, "loss": 0.0203, "step": 4287 }, { "epoch": 0.566502625755524, "grad_norm": 0.1637999415397644, "learning_rate": 7.942976367750357e-05, "loss": 0.0165, "step": 4288 }, { "epoch": 0.566634739241008, "grad_norm": 0.21928507089614868, "learning_rate": 7.938909329698844e-05, "loss": 0.0244, "step": 4289 }, { "epoch": 0.5667668527264921, "grad_norm": 0.15104596316814423, "learning_rate": 7.934842647662208e-05, "loss": 0.0174, "step": 4290 }, { "epoch": 0.5668989662119761, "grad_norm": 0.16194121539592743, "learning_rate": 7.930776322342892e-05, "loss": 0.0125, "step": 4291 }, { "epoch": 0.5670310796974601, "grad_norm": 0.12635424733161926, "learning_rate": 7.926710354443278e-05, "loss": 0.0162, "step": 4292 }, { "epoch": 0.5671631931829442, "grad_norm": 0.1685585230588913, "learning_rate": 7.922644744665684e-05, "loss": 0.0245, "step": 4293 }, { "epoch": 0.5672953066684282, "grad_norm": 0.1646813303232193, "learning_rate": 7.91857949371237e-05, "loss": 0.0138, "step": 4294 }, { "epoch": 0.5674274201539122, "grad_norm": 0.2017247974872589, "learning_rate": 7.914514602285534e-05, "loss": 0.0163, "step": 4295 }, { "epoch": 0.5675595336393963, "grad_norm": 0.1613525003194809, "learning_rate": 7.910450071087303e-05, "loss": 0.0177, "step": 4296 }, { "epoch": 0.5676916471248803, "grad_norm": 0.1867039054632187, "learning_rate": 7.906385900819757e-05, "loss": 0.0201, "step": 4297 }, { "epoch": 0.5678237606103643, "grad_norm": 0.17872785031795502, "learning_rate": 7.902322092184899e-05, "loss": 0.0206, "step": 4298 }, { "epoch": 0.5679558740958484, "grad_norm": 0.12369637936353683, "learning_rate": 7.898258645884681e-05, "loss": 0.0104, "step": 4299 }, { "epoch": 0.5680879875813324, "grad_norm": 0.2565552294254303, "learning_rate": 7.894195562620983e-05, "loss": 0.0167, "step": 4300 }, { "epoch": 0.5682201010668164, "grad_norm": 0.3674928843975067, "learning_rate": 7.890132843095631e-05, "loss": 0.0251, "step": 4301 }, { "epoch": 0.5683522145523004, "grad_norm": 0.14947600662708282, "learning_rate": 7.886070488010382e-05, "loss": 0.011, "step": 4302 }, { "epoch": 0.5684843280377845, "grad_norm": 0.18126291036605835, "learning_rate": 7.882008498066928e-05, "loss": 0.0196, "step": 4303 }, { "epoch": 0.5686164415232685, "grad_norm": 0.159513458609581, "learning_rate": 7.87794687396691e-05, "loss": 0.0181, "step": 4304 }, { "epoch": 0.5687485550087525, "grad_norm": 0.2177513837814331, "learning_rate": 7.873885616411888e-05, "loss": 0.0315, "step": 4305 }, { "epoch": 0.5688806684942366, "grad_norm": 0.2580743432044983, "learning_rate": 7.869824726103376e-05, "loss": 0.017, "step": 4306 }, { "epoch": 0.5690127819797206, "grad_norm": 0.16138285398483276, "learning_rate": 7.865764203742813e-05, "loss": 0.0165, "step": 4307 }, { "epoch": 0.5691448954652046, "grad_norm": 0.14814524352550507, "learning_rate": 7.861704050031583e-05, "loss": 0.0224, "step": 4308 }, { "epoch": 0.5692770089506887, "grad_norm": 0.15521246194839478, "learning_rate": 7.857644265670994e-05, "loss": 0.0187, "step": 4309 }, { "epoch": 0.5694091224361727, "grad_norm": 0.11792432516813278, "learning_rate": 7.853584851362302e-05, "loss": 0.0113, "step": 4310 }, { "epoch": 0.5695412359216567, "grad_norm": 0.17021100223064423, "learning_rate": 7.849525807806697e-05, "loss": 0.0143, "step": 4311 }, { "epoch": 0.5696733494071408, "grad_norm": 0.18499933183193207, "learning_rate": 7.845467135705298e-05, "loss": 0.0183, "step": 4312 }, { "epoch": 0.5698054628926248, "grad_norm": 0.1422792673110962, "learning_rate": 7.84140883575917e-05, "loss": 0.0179, "step": 4313 }, { "epoch": 0.5699375763781088, "grad_norm": 0.17566844820976257, "learning_rate": 7.837350908669302e-05, "loss": 0.0133, "step": 4314 }, { "epoch": 0.5700696898635929, "grad_norm": 0.19467930495738983, "learning_rate": 7.833293355136635e-05, "loss": 0.0183, "step": 4315 }, { "epoch": 0.5702018033490769, "grad_norm": 0.12969674170017242, "learning_rate": 7.829236175862027e-05, "loss": 0.0142, "step": 4316 }, { "epoch": 0.5703339168345609, "grad_norm": 0.10846174508333206, "learning_rate": 7.825179371546277e-05, "loss": 0.0071, "step": 4317 }, { "epoch": 0.570466030320045, "grad_norm": 0.5153217315673828, "learning_rate": 7.82112294289013e-05, "loss": 0.0175, "step": 4318 }, { "epoch": 0.570598143805529, "grad_norm": 0.21997900307178497, "learning_rate": 7.817066890594259e-05, "loss": 0.0217, "step": 4319 }, { "epoch": 0.570730257291013, "grad_norm": 0.23901112377643585, "learning_rate": 7.813011215359265e-05, "loss": 0.0165, "step": 4320 }, { "epoch": 0.570862370776497, "grad_norm": 0.22473230957984924, "learning_rate": 7.808955917885694e-05, "loss": 0.0319, "step": 4321 }, { "epoch": 0.5709944842619811, "grad_norm": 0.1821734756231308, "learning_rate": 7.80490099887402e-05, "loss": 0.015, "step": 4322 }, { "epoch": 0.5711265977474651, "grad_norm": 0.2418357878923416, "learning_rate": 7.80084645902466e-05, "loss": 0.0323, "step": 4323 }, { "epoch": 0.5712587112329491, "grad_norm": 0.18403179943561554, "learning_rate": 7.796792299037954e-05, "loss": 0.0228, "step": 4324 }, { "epoch": 0.5713908247184332, "grad_norm": 0.2135356068611145, "learning_rate": 7.792738519614182e-05, "loss": 0.0168, "step": 4325 }, { "epoch": 0.5715229382039172, "grad_norm": 0.20723068714141846, "learning_rate": 7.788685121453564e-05, "loss": 0.0203, "step": 4326 }, { "epoch": 0.5716550516894012, "grad_norm": 0.14351293444633484, "learning_rate": 7.784632105256244e-05, "loss": 0.019, "step": 4327 }, { "epoch": 0.5717871651748853, "grad_norm": 0.14696793258190155, "learning_rate": 7.780579471722308e-05, "loss": 0.0201, "step": 4328 }, { "epoch": 0.5719192786603693, "grad_norm": 0.22235284745693207, "learning_rate": 7.776527221551769e-05, "loss": 0.0246, "step": 4329 }, { "epoch": 0.5720513921458533, "grad_norm": 0.1321786344051361, "learning_rate": 7.772475355444582e-05, "loss": 0.0125, "step": 4330 }, { "epoch": 0.5721835056313374, "grad_norm": 0.14376619458198547, "learning_rate": 7.768423874100629e-05, "loss": 0.02, "step": 4331 }, { "epoch": 0.5723156191168214, "grad_norm": 0.285086065530777, "learning_rate": 7.764372778219723e-05, "loss": 0.0258, "step": 4332 }, { "epoch": 0.5724477326023054, "grad_norm": 0.32190439105033875, "learning_rate": 7.760322068501624e-05, "loss": 0.0201, "step": 4333 }, { "epoch": 0.5725798460877894, "grad_norm": 0.17368465662002563, "learning_rate": 7.75627174564601e-05, "loss": 0.0255, "step": 4334 }, { "epoch": 0.5727119595732735, "grad_norm": 0.2123628556728363, "learning_rate": 7.752221810352501e-05, "loss": 0.0213, "step": 4335 }, { "epoch": 0.5728440730587575, "grad_norm": 0.16958920657634735, "learning_rate": 7.748172263320646e-05, "loss": 0.0137, "step": 4336 }, { "epoch": 0.5729761865442415, "grad_norm": 0.22543765604496002, "learning_rate": 7.74412310524993e-05, "loss": 0.0142, "step": 4337 }, { "epoch": 0.5731083000297256, "grad_norm": 0.24494275450706482, "learning_rate": 7.74007433683977e-05, "loss": 0.0175, "step": 4338 }, { "epoch": 0.5732404135152096, "grad_norm": 0.22296825051307678, "learning_rate": 7.736025958789512e-05, "loss": 0.0145, "step": 4339 }, { "epoch": 0.5733725270006936, "grad_norm": 0.152304545044899, "learning_rate": 7.73197797179844e-05, "loss": 0.0164, "step": 4340 }, { "epoch": 0.5735046404861777, "grad_norm": 0.14745329320430756, "learning_rate": 7.727930376565766e-05, "loss": 0.017, "step": 4341 }, { "epoch": 0.5736367539716617, "grad_norm": 0.13277247548103333, "learning_rate": 7.723883173790641e-05, "loss": 0.0153, "step": 4342 }, { "epoch": 0.5737688674571457, "grad_norm": 0.13933366537094116, "learning_rate": 7.719836364172138e-05, "loss": 0.0128, "step": 4343 }, { "epoch": 0.5739009809426298, "grad_norm": 0.21639874577522278, "learning_rate": 7.715789948409274e-05, "loss": 0.0259, "step": 4344 }, { "epoch": 0.5740330944281138, "grad_norm": 0.29860708117485046, "learning_rate": 7.711743927200985e-05, "loss": 0.0138, "step": 4345 }, { "epoch": 0.5741652079135978, "grad_norm": 0.2232387363910675, "learning_rate": 7.707698301246146e-05, "loss": 0.0233, "step": 4346 }, { "epoch": 0.5742973213990819, "grad_norm": 0.15625309944152832, "learning_rate": 7.703653071243571e-05, "loss": 0.0132, "step": 4347 }, { "epoch": 0.5744294348845659, "grad_norm": 0.084880031645298, "learning_rate": 7.699608237891988e-05, "loss": 0.0066, "step": 4348 }, { "epoch": 0.5745615483700499, "grad_norm": 0.18512706458568573, "learning_rate": 7.695563801890074e-05, "loss": 0.0319, "step": 4349 }, { "epoch": 0.5746936618555339, "grad_norm": 0.13333538174629211, "learning_rate": 7.691519763936424e-05, "loss": 0.0111, "step": 4350 }, { "epoch": 0.574825775341018, "grad_norm": 0.3957698345184326, "learning_rate": 7.687476124729576e-05, "loss": 0.0348, "step": 4351 }, { "epoch": 0.574957888826502, "grad_norm": 0.1441449224948883, "learning_rate": 7.683432884967987e-05, "loss": 0.0177, "step": 4352 }, { "epoch": 0.575090002311986, "grad_norm": 0.16597506403923035, "learning_rate": 7.679390045350054e-05, "loss": 0.0125, "step": 4353 }, { "epoch": 0.5752221157974701, "grad_norm": 0.13051097095012665, "learning_rate": 7.675347606574102e-05, "loss": 0.0144, "step": 4354 }, { "epoch": 0.5753542292829541, "grad_norm": 0.27841895818710327, "learning_rate": 7.671305569338385e-05, "loss": 0.0337, "step": 4355 }, { "epoch": 0.5754863427684381, "grad_norm": 0.29610154032707214, "learning_rate": 7.667263934341092e-05, "loss": 0.0182, "step": 4356 }, { "epoch": 0.5756184562539222, "grad_norm": 0.13614515960216522, "learning_rate": 7.663222702280337e-05, "loss": 0.0121, "step": 4357 }, { "epoch": 0.5757505697394062, "grad_norm": 0.1613374799489975, "learning_rate": 7.659181873854171e-05, "loss": 0.0178, "step": 4358 }, { "epoch": 0.5758826832248902, "grad_norm": 0.35530921816825867, "learning_rate": 7.655141449760569e-05, "loss": 0.0233, "step": 4359 }, { "epoch": 0.5760147967103743, "grad_norm": 0.2377680391073227, "learning_rate": 7.651101430697439e-05, "loss": 0.024, "step": 4360 }, { "epoch": 0.5761469101958583, "grad_norm": 0.20939378440380096, "learning_rate": 7.647061817362617e-05, "loss": 0.02, "step": 4361 }, { "epoch": 0.5762790236813423, "grad_norm": 0.1147724911570549, "learning_rate": 7.643022610453874e-05, "loss": 0.0101, "step": 4362 }, { "epoch": 0.5764111371668263, "grad_norm": 0.10789413750171661, "learning_rate": 7.638983810668906e-05, "loss": 0.0087, "step": 4363 }, { "epoch": 0.5765432506523104, "grad_norm": 0.18182159960269928, "learning_rate": 7.634945418705339e-05, "loss": 0.012, "step": 4364 }, { "epoch": 0.5766753641377944, "grad_norm": 0.18856218457221985, "learning_rate": 7.630907435260733e-05, "loss": 0.0304, "step": 4365 }, { "epoch": 0.5768074776232784, "grad_norm": 0.1378273069858551, "learning_rate": 7.626869861032571e-05, "loss": 0.015, "step": 4366 }, { "epoch": 0.5769395911087625, "grad_norm": 0.1456385999917984, "learning_rate": 7.622832696718269e-05, "loss": 0.0139, "step": 4367 }, { "epoch": 0.5770717045942465, "grad_norm": 0.17599926888942719, "learning_rate": 7.618795943015172e-05, "loss": 0.0217, "step": 4368 }, { "epoch": 0.5772038180797305, "grad_norm": 0.2156069576740265, "learning_rate": 7.614759600620553e-05, "loss": 0.0237, "step": 4369 }, { "epoch": 0.5773359315652146, "grad_norm": 0.18197335302829742, "learning_rate": 7.610723670231619e-05, "loss": 0.0274, "step": 4370 }, { "epoch": 0.5774680450506986, "grad_norm": 0.17185929417610168, "learning_rate": 7.606688152545494e-05, "loss": 0.0288, "step": 4371 }, { "epoch": 0.5776001585361826, "grad_norm": 0.2738763391971588, "learning_rate": 7.602653048259244e-05, "loss": 0.0267, "step": 4372 }, { "epoch": 0.5777322720216667, "grad_norm": 0.13118204474449158, "learning_rate": 7.598618358069858e-05, "loss": 0.0241, "step": 4373 }, { "epoch": 0.5778643855071507, "grad_norm": 0.22741925716400146, "learning_rate": 7.594584082674248e-05, "loss": 0.0388, "step": 4374 }, { "epoch": 0.5779964989926347, "grad_norm": 0.1998399943113327, "learning_rate": 7.590550222769265e-05, "loss": 0.0165, "step": 4375 }, { "epoch": 0.5781286124781188, "grad_norm": 0.19354160130023956, "learning_rate": 7.586516779051677e-05, "loss": 0.0127, "step": 4376 }, { "epoch": 0.5782607259636028, "grad_norm": 0.32591480016708374, "learning_rate": 7.582483752218192e-05, "loss": 0.0128, "step": 4377 }, { "epoch": 0.5783928394490868, "grad_norm": 0.142011359333992, "learning_rate": 7.57845114296544e-05, "loss": 0.0174, "step": 4378 }, { "epoch": 0.5785249529345708, "grad_norm": 0.21151968836784363, "learning_rate": 7.574418951989975e-05, "loss": 0.0183, "step": 4379 }, { "epoch": 0.5786570664200549, "grad_norm": 0.20516203343868256, "learning_rate": 7.570387179988286e-05, "loss": 0.0234, "step": 4380 }, { "epoch": 0.5787891799055389, "grad_norm": 0.17113912105560303, "learning_rate": 7.566355827656783e-05, "loss": 0.0242, "step": 4381 }, { "epoch": 0.5789212933910229, "grad_norm": 0.2233685404062271, "learning_rate": 7.562324895691809e-05, "loss": 0.0204, "step": 4382 }, { "epoch": 0.579053406876507, "grad_norm": 0.14263029396533966, "learning_rate": 7.55829438478963e-05, "loss": 0.0179, "step": 4383 }, { "epoch": 0.579185520361991, "grad_norm": 0.16608430445194244, "learning_rate": 7.554264295646444e-05, "loss": 0.0152, "step": 4384 }, { "epoch": 0.579317633847475, "grad_norm": 0.1453695148229599, "learning_rate": 7.550234628958373e-05, "loss": 0.0106, "step": 4385 }, { "epoch": 0.5794497473329591, "grad_norm": 0.10571756213903427, "learning_rate": 7.546205385421463e-05, "loss": 0.0098, "step": 4386 }, { "epoch": 0.5795818608184431, "grad_norm": 0.1574452966451645, "learning_rate": 7.542176565731698e-05, "loss": 0.0143, "step": 4387 }, { "epoch": 0.5797139743039271, "grad_norm": 0.17890842258930206, "learning_rate": 7.538148170584974e-05, "loss": 0.0215, "step": 4388 }, { "epoch": 0.5798460877894112, "grad_norm": 0.1550913155078888, "learning_rate": 7.534120200677122e-05, "loss": 0.0111, "step": 4389 }, { "epoch": 0.5799782012748952, "grad_norm": 0.14007329940795898, "learning_rate": 7.530092656703904e-05, "loss": 0.0231, "step": 4390 }, { "epoch": 0.5801103147603792, "grad_norm": 0.13921970129013062, "learning_rate": 7.526065539360996e-05, "loss": 0.0155, "step": 4391 }, { "epoch": 0.5802424282458633, "grad_norm": 0.20098300278186798, "learning_rate": 7.522038849344012e-05, "loss": 0.0273, "step": 4392 }, { "epoch": 0.5803745417313473, "grad_norm": 0.1520303636789322, "learning_rate": 7.518012587348483e-05, "loss": 0.0162, "step": 4393 }, { "epoch": 0.5805066552168313, "grad_norm": 0.24330918490886688, "learning_rate": 7.513986754069877e-05, "loss": 0.0273, "step": 4394 }, { "epoch": 0.5806387687023153, "grad_norm": 0.18926988542079926, "learning_rate": 7.509961350203576e-05, "loss": 0.0124, "step": 4395 }, { "epoch": 0.5807708821877993, "grad_norm": 0.2736714780330658, "learning_rate": 7.505936376444893e-05, "loss": 0.0119, "step": 4396 }, { "epoch": 0.5809029956732833, "grad_norm": 0.1956966370344162, "learning_rate": 7.501911833489071e-05, "loss": 0.0224, "step": 4397 }, { "epoch": 0.5810351091587673, "grad_norm": 0.21423551440238953, "learning_rate": 7.497887722031272e-05, "loss": 0.0316, "step": 4398 }, { "epoch": 0.5811672226442514, "grad_norm": 0.21427178382873535, "learning_rate": 7.493864042766585e-05, "loss": 0.0249, "step": 4399 }, { "epoch": 0.5812993361297354, "grad_norm": 0.15347111225128174, "learning_rate": 7.489840796390028e-05, "loss": 0.0188, "step": 4400 }, { "epoch": 0.5814314496152194, "grad_norm": 0.2119879424571991, "learning_rate": 7.485817983596541e-05, "loss": 0.0237, "step": 4401 }, { "epoch": 0.5815635631007035, "grad_norm": 0.16195791959762573, "learning_rate": 7.481795605080987e-05, "loss": 0.0177, "step": 4402 }, { "epoch": 0.5816956765861875, "grad_norm": 0.18902483582496643, "learning_rate": 7.477773661538159e-05, "loss": 0.0362, "step": 4403 }, { "epoch": 0.5818277900716715, "grad_norm": 0.15727530419826508, "learning_rate": 7.473752153662774e-05, "loss": 0.0161, "step": 4404 }, { "epoch": 0.5819599035571555, "grad_norm": 0.19354747235774994, "learning_rate": 7.469731082149467e-05, "loss": 0.0112, "step": 4405 }, { "epoch": 0.5820920170426396, "grad_norm": 0.17567463219165802, "learning_rate": 7.465710447692811e-05, "loss": 0.0279, "step": 4406 }, { "epoch": 0.5822241305281236, "grad_norm": 0.1853395253419876, "learning_rate": 7.461690250987287e-05, "loss": 0.0147, "step": 4407 }, { "epoch": 0.5823562440136076, "grad_norm": 0.1291511058807373, "learning_rate": 7.457670492727316e-05, "loss": 0.0178, "step": 4408 }, { "epoch": 0.5824883574990917, "grad_norm": 0.11233451217412949, "learning_rate": 7.45365117360723e-05, "loss": 0.0179, "step": 4409 }, { "epoch": 0.5826204709845757, "grad_norm": 0.1814539134502411, "learning_rate": 7.449632294321294e-05, "loss": 0.0142, "step": 4410 }, { "epoch": 0.5827525844700597, "grad_norm": 0.22415396571159363, "learning_rate": 7.445613855563698e-05, "loss": 0.0247, "step": 4411 }, { "epoch": 0.5828846979555438, "grad_norm": 0.1492513120174408, "learning_rate": 7.441595858028543e-05, "loss": 0.0123, "step": 4412 }, { "epoch": 0.5830168114410278, "grad_norm": 0.32033640146255493, "learning_rate": 7.437578302409873e-05, "loss": 0.0279, "step": 4413 }, { "epoch": 0.5831489249265118, "grad_norm": 0.15683256089687347, "learning_rate": 7.433561189401637e-05, "loss": 0.0069, "step": 4414 }, { "epoch": 0.5832810384119959, "grad_norm": 0.16468288004398346, "learning_rate": 7.429544519697723e-05, "loss": 0.021, "step": 4415 }, { "epoch": 0.5834131518974799, "grad_norm": 0.14463230967521667, "learning_rate": 7.425528293991932e-05, "loss": 0.0182, "step": 4416 }, { "epoch": 0.5835452653829639, "grad_norm": 0.16453178226947784, "learning_rate": 7.421512512977993e-05, "loss": 0.0123, "step": 4417 }, { "epoch": 0.583677378868448, "grad_norm": 0.16732649505138397, "learning_rate": 7.417497177349556e-05, "loss": 0.0157, "step": 4418 }, { "epoch": 0.583809492353932, "grad_norm": 0.20836827158927917, "learning_rate": 7.413482287800195e-05, "loss": 0.0265, "step": 4419 }, { "epoch": 0.583941605839416, "grad_norm": 0.2837749421596527, "learning_rate": 7.40946784502341e-05, "loss": 0.0238, "step": 4420 }, { "epoch": 0.5840737193249, "grad_norm": 0.18477781116962433, "learning_rate": 7.405453849712616e-05, "loss": 0.0198, "step": 4421 }, { "epoch": 0.5842058328103841, "grad_norm": 0.17079903185367584, "learning_rate": 7.401440302561162e-05, "loss": 0.0172, "step": 4422 }, { "epoch": 0.5843379462958681, "grad_norm": 0.183742955327034, "learning_rate": 7.397427204262308e-05, "loss": 0.0179, "step": 4423 }, { "epoch": 0.5844700597813521, "grad_norm": 0.12258812040090561, "learning_rate": 7.393414555509243e-05, "loss": 0.0137, "step": 4424 }, { "epoch": 0.5846021732668362, "grad_norm": 0.28085893392562866, "learning_rate": 7.389402356995078e-05, "loss": 0.0274, "step": 4425 }, { "epoch": 0.5847342867523202, "grad_norm": 0.2764669358730316, "learning_rate": 7.385390609412844e-05, "loss": 0.0289, "step": 4426 }, { "epoch": 0.5848664002378042, "grad_norm": 0.08393105864524841, "learning_rate": 7.381379313455499e-05, "loss": 0.0075, "step": 4427 }, { "epoch": 0.5849985137232883, "grad_norm": 0.1453171968460083, "learning_rate": 7.377368469815913e-05, "loss": 0.0172, "step": 4428 }, { "epoch": 0.5851306272087723, "grad_norm": 0.1916455179452896, "learning_rate": 7.37335807918689e-05, "loss": 0.0234, "step": 4429 }, { "epoch": 0.5852627406942563, "grad_norm": 0.1560536026954651, "learning_rate": 7.369348142261148e-05, "loss": 0.0177, "step": 4430 }, { "epoch": 0.5853948541797404, "grad_norm": 0.17626263201236725, "learning_rate": 7.365338659731327e-05, "loss": 0.0182, "step": 4431 }, { "epoch": 0.5855269676652244, "grad_norm": 0.12068506330251694, "learning_rate": 7.361329632289992e-05, "loss": 0.011, "step": 4432 }, { "epoch": 0.5856590811507084, "grad_norm": 0.2062414288520813, "learning_rate": 7.357321060629626e-05, "loss": 0.0071, "step": 4433 }, { "epoch": 0.5857911946361924, "grad_norm": 0.14255057275295258, "learning_rate": 7.353312945442639e-05, "loss": 0.0212, "step": 4434 }, { "epoch": 0.5859233081216765, "grad_norm": 0.259384423494339, "learning_rate": 7.349305287421348e-05, "loss": 0.0251, "step": 4435 }, { "epoch": 0.5860554216071605, "grad_norm": 0.11826782673597336, "learning_rate": 7.345298087258013e-05, "loss": 0.0099, "step": 4436 }, { "epoch": 0.5861875350926445, "grad_norm": 0.15347795188426971, "learning_rate": 7.341291345644797e-05, "loss": 0.0169, "step": 4437 }, { "epoch": 0.5863196485781286, "grad_norm": 0.19067321717739105, "learning_rate": 7.337285063273793e-05, "loss": 0.0142, "step": 4438 }, { "epoch": 0.5864517620636126, "grad_norm": 0.21550928056240082, "learning_rate": 7.333279240837005e-05, "loss": 0.0282, "step": 4439 }, { "epoch": 0.5865838755490966, "grad_norm": 0.12859563529491425, "learning_rate": 7.329273879026371e-05, "loss": 0.0073, "step": 4440 }, { "epoch": 0.5867159890345807, "grad_norm": 0.16383865475654602, "learning_rate": 7.325268978533735e-05, "loss": 0.0114, "step": 4441 }, { "epoch": 0.5868481025200647, "grad_norm": 0.15545353293418884, "learning_rate": 7.321264540050876e-05, "loss": 0.0154, "step": 4442 }, { "epoch": 0.5869802160055487, "grad_norm": 0.1711602658033371, "learning_rate": 7.317260564269482e-05, "loss": 0.0202, "step": 4443 }, { "epoch": 0.5871123294910328, "grad_norm": 0.15408815443515778, "learning_rate": 7.313257051881165e-05, "loss": 0.0142, "step": 4444 }, { "epoch": 0.5872444429765168, "grad_norm": 0.10001006722450256, "learning_rate": 7.309254003577459e-05, "loss": 0.0097, "step": 4445 }, { "epoch": 0.5873765564620008, "grad_norm": 0.17244599759578705, "learning_rate": 7.305251420049813e-05, "loss": 0.016, "step": 4446 }, { "epoch": 0.5875086699474849, "grad_norm": 0.20834384858608246, "learning_rate": 7.301249301989601e-05, "loss": 0.0201, "step": 4447 }, { "epoch": 0.5876407834329689, "grad_norm": 0.21501821279525757, "learning_rate": 7.29724765008811e-05, "loss": 0.027, "step": 4448 }, { "epoch": 0.5877728969184529, "grad_norm": 0.293439656496048, "learning_rate": 7.293246465036557e-05, "loss": 0.0206, "step": 4449 }, { "epoch": 0.587905010403937, "grad_norm": 0.20541538298130035, "learning_rate": 7.289245747526066e-05, "loss": 0.0277, "step": 4450 }, { "epoch": 0.588037123889421, "grad_norm": 0.21135063469409943, "learning_rate": 7.285245498247689e-05, "loss": 0.0151, "step": 4451 }, { "epoch": 0.588169237374905, "grad_norm": 0.21523725986480713, "learning_rate": 7.281245717892396e-05, "loss": 0.0092, "step": 4452 }, { "epoch": 0.588301350860389, "grad_norm": 0.4050748646259308, "learning_rate": 7.277246407151067e-05, "loss": 0.0288, "step": 4453 }, { "epoch": 0.5884334643458731, "grad_norm": 0.15043921768665314, "learning_rate": 7.273247566714517e-05, "loss": 0.0172, "step": 4454 }, { "epoch": 0.5885655778313571, "grad_norm": 0.12210755795240402, "learning_rate": 7.269249197273465e-05, "loss": 0.0135, "step": 4455 }, { "epoch": 0.5886976913168411, "grad_norm": 0.16365686058998108, "learning_rate": 7.265251299518558e-05, "loss": 0.0168, "step": 4456 }, { "epoch": 0.5888298048023252, "grad_norm": 0.15842820703983307, "learning_rate": 7.261253874140354e-05, "loss": 0.0164, "step": 4457 }, { "epoch": 0.5889619182878092, "grad_norm": 0.11781178414821625, "learning_rate": 7.25725692182934e-05, "loss": 0.0102, "step": 4458 }, { "epoch": 0.5890940317732932, "grad_norm": 0.13468272984027863, "learning_rate": 7.253260443275908e-05, "loss": 0.0168, "step": 4459 }, { "epoch": 0.5892261452587773, "grad_norm": 0.20080485939979553, "learning_rate": 7.249264439170378e-05, "loss": 0.0284, "step": 4460 }, { "epoch": 0.5893582587442613, "grad_norm": 0.1445317268371582, "learning_rate": 7.245268910202988e-05, "loss": 0.0131, "step": 4461 }, { "epoch": 0.5894903722297453, "grad_norm": 0.21400819718837738, "learning_rate": 7.241273857063884e-05, "loss": 0.0206, "step": 4462 }, { "epoch": 0.5896224857152293, "grad_norm": 0.42813754081726074, "learning_rate": 7.237279280443143e-05, "loss": 0.0348, "step": 4463 }, { "epoch": 0.5897545992007134, "grad_norm": 0.1779608279466629, "learning_rate": 7.23328518103075e-05, "loss": 0.0245, "step": 4464 }, { "epoch": 0.5898867126861974, "grad_norm": 0.1536242812871933, "learning_rate": 7.229291559516612e-05, "loss": 0.0205, "step": 4465 }, { "epoch": 0.5900188261716814, "grad_norm": 0.17254947125911713, "learning_rate": 7.225298416590554e-05, "loss": 0.0207, "step": 4466 }, { "epoch": 0.5901509396571655, "grad_norm": 0.2188318520784378, "learning_rate": 7.221305752942313e-05, "loss": 0.0275, "step": 4467 }, { "epoch": 0.5902830531426495, "grad_norm": 0.1342228502035141, "learning_rate": 7.21731356926155e-05, "loss": 0.0155, "step": 4468 }, { "epoch": 0.5904151666281335, "grad_norm": 0.1679522544145584, "learning_rate": 7.213321866237837e-05, "loss": 0.0157, "step": 4469 }, { "epoch": 0.5905472801136176, "grad_norm": 0.12442659586668015, "learning_rate": 7.209330644560673e-05, "loss": 0.0132, "step": 4470 }, { "epoch": 0.5906793935991016, "grad_norm": 0.30356472730636597, "learning_rate": 7.205339904919456e-05, "loss": 0.0325, "step": 4471 }, { "epoch": 0.5908115070845856, "grad_norm": 0.2188597321510315, "learning_rate": 7.201349648003524e-05, "loss": 0.0227, "step": 4472 }, { "epoch": 0.5909436205700697, "grad_norm": 0.2081504613161087, "learning_rate": 7.197359874502112e-05, "loss": 0.0213, "step": 4473 }, { "epoch": 0.5910757340555537, "grad_norm": 0.09858710318803787, "learning_rate": 7.193370585104377e-05, "loss": 0.0121, "step": 4474 }, { "epoch": 0.5912078475410377, "grad_norm": 0.18282747268676758, "learning_rate": 7.1893817804994e-05, "loss": 0.013, "step": 4475 }, { "epoch": 0.5913399610265218, "grad_norm": 0.16425664722919464, "learning_rate": 7.185393461376166e-05, "loss": 0.019, "step": 4476 }, { "epoch": 0.5914720745120058, "grad_norm": 0.22384493052959442, "learning_rate": 7.18140562842359e-05, "loss": 0.0207, "step": 4477 }, { "epoch": 0.5916041879974898, "grad_norm": 0.18941235542297363, "learning_rate": 7.17741828233049e-05, "loss": 0.0345, "step": 4478 }, { "epoch": 0.5917363014829738, "grad_norm": 0.34682533144950867, "learning_rate": 7.173431423785609e-05, "loss": 0.051, "step": 4479 }, { "epoch": 0.5918684149684579, "grad_norm": 0.1639723926782608, "learning_rate": 7.169445053477599e-05, "loss": 0.0274, "step": 4480 }, { "epoch": 0.5920005284539419, "grad_norm": 0.1771511286497116, "learning_rate": 7.16545917209503e-05, "loss": 0.0151, "step": 4481 }, { "epoch": 0.5921326419394259, "grad_norm": 0.1359596848487854, "learning_rate": 7.161473780326393e-05, "loss": 0.0103, "step": 4482 }, { "epoch": 0.59226475542491, "grad_norm": 0.14128483831882477, "learning_rate": 7.157488878860087e-05, "loss": 0.015, "step": 4483 }, { "epoch": 0.592396868910394, "grad_norm": 0.1506226509809494, "learning_rate": 7.153504468384431e-05, "loss": 0.02, "step": 4484 }, { "epoch": 0.592528982395878, "grad_norm": 0.1360141783952713, "learning_rate": 7.149520549587656e-05, "loss": 0.0097, "step": 4485 }, { "epoch": 0.5926610958813621, "grad_norm": 0.19670647382736206, "learning_rate": 7.14553712315791e-05, "loss": 0.0167, "step": 4486 }, { "epoch": 0.5927932093668461, "grad_norm": 0.2583443224430084, "learning_rate": 7.141554189783256e-05, "loss": 0.0243, "step": 4487 }, { "epoch": 0.5929253228523301, "grad_norm": 0.22510045766830444, "learning_rate": 7.137571750151668e-05, "loss": 0.0223, "step": 4488 }, { "epoch": 0.5930574363378142, "grad_norm": 0.201186403632164, "learning_rate": 7.133589804951044e-05, "loss": 0.0226, "step": 4489 }, { "epoch": 0.5931895498232982, "grad_norm": 0.13508693873882294, "learning_rate": 7.129608354869184e-05, "loss": 0.0128, "step": 4490 }, { "epoch": 0.5933216633087822, "grad_norm": 0.12510472536087036, "learning_rate": 7.125627400593815e-05, "loss": 0.0139, "step": 4491 }, { "epoch": 0.5934537767942663, "grad_norm": 0.2334831953048706, "learning_rate": 7.121646942812566e-05, "loss": 0.0264, "step": 4492 }, { "epoch": 0.5935858902797503, "grad_norm": 0.13933241367340088, "learning_rate": 7.117666982212989e-05, "loss": 0.0182, "step": 4493 }, { "epoch": 0.5937180037652343, "grad_norm": 0.19615226984024048, "learning_rate": 7.113687519482555e-05, "loss": 0.0239, "step": 4494 }, { "epoch": 0.5938501172507183, "grad_norm": 0.1652141511440277, "learning_rate": 7.109708555308634e-05, "loss": 0.017, "step": 4495 }, { "epoch": 0.5939822307362024, "grad_norm": 0.1287144422531128, "learning_rate": 7.105730090378517e-05, "loss": 0.0182, "step": 4496 }, { "epoch": 0.5941143442216864, "grad_norm": 0.1629124879837036, "learning_rate": 7.101752125379414e-05, "loss": 0.0137, "step": 4497 }, { "epoch": 0.5942464577071704, "grad_norm": 0.13514606654644012, "learning_rate": 7.097774660998442e-05, "loss": 0.0172, "step": 4498 }, { "epoch": 0.5943785711926545, "grad_norm": 0.1825205534696579, "learning_rate": 7.093797697922635e-05, "loss": 0.0172, "step": 4499 }, { "epoch": 0.5945106846781385, "grad_norm": 0.1886797845363617, "learning_rate": 7.089821236838934e-05, "loss": 0.024, "step": 4500 }, { "epoch": 0.5946427981636225, "grad_norm": 0.2030637115240097, "learning_rate": 7.085845278434206e-05, "loss": 0.0206, "step": 4501 }, { "epoch": 0.5947749116491066, "grad_norm": 0.2834545075893402, "learning_rate": 7.081869823395217e-05, "loss": 0.0249, "step": 4502 }, { "epoch": 0.5949070251345906, "grad_norm": 0.11763806641101837, "learning_rate": 7.077894872408655e-05, "loss": 0.0086, "step": 4503 }, { "epoch": 0.5950391386200746, "grad_norm": 0.11526691913604736, "learning_rate": 7.073920426161121e-05, "loss": 0.0067, "step": 4504 }, { "epoch": 0.5951712521055587, "grad_norm": 0.12258782237768173, "learning_rate": 7.069946485339118e-05, "loss": 0.0105, "step": 4505 }, { "epoch": 0.5953033655910427, "grad_norm": 0.36307042837142944, "learning_rate": 7.065973050629081e-05, "loss": 0.0201, "step": 4506 }, { "epoch": 0.5954354790765267, "grad_norm": 0.26752519607543945, "learning_rate": 7.062000122717338e-05, "loss": 0.0321, "step": 4507 }, { "epoch": 0.5955675925620108, "grad_norm": 0.17262883484363556, "learning_rate": 7.058027702290144e-05, "loss": 0.0253, "step": 4508 }, { "epoch": 0.5956997060474948, "grad_norm": 0.132182776927948, "learning_rate": 7.054055790033655e-05, "loss": 0.0148, "step": 4509 }, { "epoch": 0.5958318195329788, "grad_norm": 0.1420915126800537, "learning_rate": 7.050084386633948e-05, "loss": 0.0138, "step": 4510 }, { "epoch": 0.5959639330184628, "grad_norm": 0.3453371226787567, "learning_rate": 7.046113492777009e-05, "loss": 0.016, "step": 4511 }, { "epoch": 0.5960960465039469, "grad_norm": 0.12592832744121552, "learning_rate": 7.042143109148733e-05, "loss": 0.0098, "step": 4512 }, { "epoch": 0.5962281599894309, "grad_norm": 0.17428047955036163, "learning_rate": 7.038173236434933e-05, "loss": 0.0183, "step": 4513 }, { "epoch": 0.5963602734749149, "grad_norm": 0.17273180186748505, "learning_rate": 7.034203875321326e-05, "loss": 0.013, "step": 4514 }, { "epoch": 0.596492386960399, "grad_norm": 0.13108059763908386, "learning_rate": 7.030235026493548e-05, "loss": 0.0184, "step": 4515 }, { "epoch": 0.596624500445883, "grad_norm": 0.20756517350673676, "learning_rate": 7.026266690637145e-05, "loss": 0.0193, "step": 4516 }, { "epoch": 0.596756613931367, "grad_norm": 0.13016660511493683, "learning_rate": 7.022298868437567e-05, "loss": 0.0165, "step": 4517 }, { "epoch": 0.5968887274168511, "grad_norm": 0.11081884056329727, "learning_rate": 7.018331560580187e-05, "loss": 0.0108, "step": 4518 }, { "epoch": 0.5970208409023351, "grad_norm": 0.11399253457784653, "learning_rate": 7.014364767750277e-05, "loss": 0.0186, "step": 4519 }, { "epoch": 0.5971529543878191, "grad_norm": 0.11999227106571198, "learning_rate": 7.010398490633035e-05, "loss": 0.0147, "step": 4520 }, { "epoch": 0.5972850678733032, "grad_norm": 0.13482487201690674, "learning_rate": 7.006432729913552e-05, "loss": 0.0121, "step": 4521 }, { "epoch": 0.5974171813587872, "grad_norm": 0.16840508580207825, "learning_rate": 7.002467486276847e-05, "loss": 0.0078, "step": 4522 }, { "epoch": 0.5975492948442712, "grad_norm": 0.13563968241214752, "learning_rate": 6.998502760407838e-05, "loss": 0.0125, "step": 4523 }, { "epoch": 0.5976814083297552, "grad_norm": 0.18716301023960114, "learning_rate": 6.994538552991354e-05, "loss": 0.0224, "step": 4524 }, { "epoch": 0.5978135218152393, "grad_norm": 0.16288627684116364, "learning_rate": 6.990574864712144e-05, "loss": 0.0164, "step": 4525 }, { "epoch": 0.5979456353007233, "grad_norm": 0.19571024179458618, "learning_rate": 6.986611696254857e-05, "loss": 0.0232, "step": 4526 }, { "epoch": 0.5980777487862073, "grad_norm": 0.09460264444351196, "learning_rate": 6.982649048304057e-05, "loss": 0.0092, "step": 4527 }, { "epoch": 0.5982098622716914, "grad_norm": 0.3020939230918884, "learning_rate": 6.978686921544218e-05, "loss": 0.0218, "step": 4528 }, { "epoch": 0.5983419757571754, "grad_norm": 0.15641634166240692, "learning_rate": 6.974725316659725e-05, "loss": 0.0086, "step": 4529 }, { "epoch": 0.5984740892426594, "grad_norm": 0.17395232617855072, "learning_rate": 6.970764234334868e-05, "loss": 0.0134, "step": 4530 }, { "epoch": 0.5986062027281435, "grad_norm": 0.14680452644824982, "learning_rate": 6.966803675253848e-05, "loss": 0.0134, "step": 4531 }, { "epoch": 0.5987383162136275, "grad_norm": 0.1943461298942566, "learning_rate": 6.962843640100785e-05, "loss": 0.0233, "step": 4532 }, { "epoch": 0.5988704296991115, "grad_norm": 0.10282100737094879, "learning_rate": 6.958884129559693e-05, "loss": 0.0118, "step": 4533 }, { "epoch": 0.5990025431845956, "grad_norm": 0.1541905701160431, "learning_rate": 6.954925144314511e-05, "loss": 0.014, "step": 4534 }, { "epoch": 0.5991346566700796, "grad_norm": 0.12435435503721237, "learning_rate": 6.950966685049071e-05, "loss": 0.0178, "step": 4535 }, { "epoch": 0.5992667701555636, "grad_norm": 0.17765118181705475, "learning_rate": 6.947008752447131e-05, "loss": 0.0207, "step": 4536 }, { "epoch": 0.5993988836410477, "grad_norm": 0.13467107713222504, "learning_rate": 6.943051347192346e-05, "loss": 0.0131, "step": 4537 }, { "epoch": 0.5995309971265317, "grad_norm": 0.14471574127674103, "learning_rate": 6.939094469968282e-05, "loss": 0.0162, "step": 4538 }, { "epoch": 0.5996631106120157, "grad_norm": 0.176444873213768, "learning_rate": 6.93513812145842e-05, "loss": 0.0252, "step": 4539 }, { "epoch": 0.5997952240974997, "grad_norm": 0.18236540257930756, "learning_rate": 6.931182302346142e-05, "loss": 0.0155, "step": 4540 }, { "epoch": 0.5999273375829838, "grad_norm": 0.19694674015045166, "learning_rate": 6.927227013314743e-05, "loss": 0.0128, "step": 4541 }, { "epoch": 0.6000594510684678, "grad_norm": 0.12531161308288574, "learning_rate": 6.923272255047424e-05, "loss": 0.0088, "step": 4542 }, { "epoch": 0.6001915645539518, "grad_norm": 0.18968115746974945, "learning_rate": 6.919318028227298e-05, "loss": 0.029, "step": 4543 }, { "epoch": 0.6003236780394359, "grad_norm": 0.15968884527683258, "learning_rate": 6.915364333537383e-05, "loss": 0.011, "step": 4544 }, { "epoch": 0.6004557915249199, "grad_norm": 0.18767327070236206, "learning_rate": 6.911411171660602e-05, "loss": 0.0233, "step": 4545 }, { "epoch": 0.6005879050104039, "grad_norm": 0.19067466259002686, "learning_rate": 6.907458543279797e-05, "loss": 0.0175, "step": 4546 }, { "epoch": 0.600720018495888, "grad_norm": 0.7749344706535339, "learning_rate": 6.903506449077704e-05, "loss": 0.0203, "step": 4547 }, { "epoch": 0.600852131981372, "grad_norm": 0.19207334518432617, "learning_rate": 6.899554889736976e-05, "loss": 0.0201, "step": 4548 }, { "epoch": 0.600984245466856, "grad_norm": 0.12247084081172943, "learning_rate": 6.89560386594017e-05, "loss": 0.0156, "step": 4549 }, { "epoch": 0.6011163589523401, "grad_norm": 0.27735868096351624, "learning_rate": 6.891653378369754e-05, "loss": 0.0202, "step": 4550 }, { "epoch": 0.6012484724378241, "grad_norm": 0.14426004886627197, "learning_rate": 6.887703427708101e-05, "loss": 0.0132, "step": 4551 }, { "epoch": 0.6013805859233081, "grad_norm": 0.1699414700269699, "learning_rate": 6.883754014637483e-05, "loss": 0.0187, "step": 4552 }, { "epoch": 0.6015126994087922, "grad_norm": 0.136097714304924, "learning_rate": 6.879805139840096e-05, "loss": 0.0138, "step": 4553 }, { "epoch": 0.6016448128942762, "grad_norm": 0.14943480491638184, "learning_rate": 6.875856803998035e-05, "loss": 0.0143, "step": 4554 }, { "epoch": 0.6017769263797602, "grad_norm": 0.23178283870220184, "learning_rate": 6.871909007793296e-05, "loss": 0.0186, "step": 4555 }, { "epoch": 0.6019090398652442, "grad_norm": 0.1980462372303009, "learning_rate": 6.867961751907792e-05, "loss": 0.024, "step": 4556 }, { "epoch": 0.6020411533507283, "grad_norm": 0.1882098913192749, "learning_rate": 6.864015037023332e-05, "loss": 0.0195, "step": 4557 }, { "epoch": 0.6021732668362123, "grad_norm": 0.18741685152053833, "learning_rate": 6.860068863821641e-05, "loss": 0.0138, "step": 4558 }, { "epoch": 0.6023053803216963, "grad_norm": 0.1524559110403061, "learning_rate": 6.856123232984347e-05, "loss": 0.0221, "step": 4559 }, { "epoch": 0.6024374938071804, "grad_norm": 0.1414857655763626, "learning_rate": 6.852178145192981e-05, "loss": 0.0177, "step": 4560 }, { "epoch": 0.6025696072926644, "grad_norm": 0.16209979355335236, "learning_rate": 6.848233601128985e-05, "loss": 0.017, "step": 4561 }, { "epoch": 0.6027017207781484, "grad_norm": 0.23777304589748383, "learning_rate": 6.844289601473704e-05, "loss": 0.0207, "step": 4562 }, { "epoch": 0.6028338342636325, "grad_norm": 0.2142631858587265, "learning_rate": 6.840346146908394e-05, "loss": 0.013, "step": 4563 }, { "epoch": 0.6029659477491165, "grad_norm": 0.22291328012943268, "learning_rate": 6.836403238114206e-05, "loss": 0.017, "step": 4564 }, { "epoch": 0.6030980612346005, "grad_norm": 0.14466069638729095, "learning_rate": 6.83246087577221e-05, "loss": 0.0229, "step": 4565 }, { "epoch": 0.6032301747200846, "grad_norm": 0.1637120097875595, "learning_rate": 6.828519060563376e-05, "loss": 0.0164, "step": 4566 }, { "epoch": 0.6033622882055686, "grad_norm": 0.1512189507484436, "learning_rate": 6.824577793168573e-05, "loss": 0.0135, "step": 4567 }, { "epoch": 0.6034944016910526, "grad_norm": 0.19535644352436066, "learning_rate": 6.820637074268585e-05, "loss": 0.0205, "step": 4568 }, { "epoch": 0.6036265151765366, "grad_norm": 0.20127522945404053, "learning_rate": 6.816696904544097e-05, "loss": 0.0326, "step": 4569 }, { "epoch": 0.6037586286620207, "grad_norm": 0.15570081770420074, "learning_rate": 6.812757284675702e-05, "loss": 0.0157, "step": 4570 }, { "epoch": 0.6038907421475047, "grad_norm": 0.22564396262168884, "learning_rate": 6.80881821534389e-05, "loss": 0.0256, "step": 4571 }, { "epoch": 0.6040228556329887, "grad_norm": 0.1477121114730835, "learning_rate": 6.804879697229068e-05, "loss": 0.0139, "step": 4572 }, { "epoch": 0.6041549691184728, "grad_norm": 0.19079719483852386, "learning_rate": 6.800941731011537e-05, "loss": 0.031, "step": 4573 }, { "epoch": 0.6042870826039568, "grad_norm": 0.21819524466991425, "learning_rate": 6.797004317371507e-05, "loss": 0.0214, "step": 4574 }, { "epoch": 0.6044191960894408, "grad_norm": 0.16312861442565918, "learning_rate": 6.793067456989095e-05, "loss": 0.0167, "step": 4575 }, { "epoch": 0.6045513095749249, "grad_norm": 0.19186748564243317, "learning_rate": 6.789131150544316e-05, "loss": 0.0228, "step": 4576 }, { "epoch": 0.6046834230604089, "grad_norm": 0.1798516809940338, "learning_rate": 6.785195398717101e-05, "loss": 0.0261, "step": 4577 }, { "epoch": 0.6048155365458929, "grad_norm": 0.11643780767917633, "learning_rate": 6.78126020218727e-05, "loss": 0.0066, "step": 4578 }, { "epoch": 0.604947650031377, "grad_norm": 0.13909032940864563, "learning_rate": 6.777325561634557e-05, "loss": 0.0152, "step": 4579 }, { "epoch": 0.605079763516861, "grad_norm": 0.18303309381008148, "learning_rate": 6.773391477738603e-05, "loss": 0.0195, "step": 4580 }, { "epoch": 0.605211877002345, "grad_norm": 0.12904851138591766, "learning_rate": 6.769457951178935e-05, "loss": 0.0178, "step": 4581 }, { "epoch": 0.605343990487829, "grad_norm": 0.1660364419221878, "learning_rate": 6.765524982635009e-05, "loss": 0.0214, "step": 4582 }, { "epoch": 0.6054761039733131, "grad_norm": 0.18555185198783875, "learning_rate": 6.761592572786164e-05, "loss": 0.0145, "step": 4583 }, { "epoch": 0.6056082174587971, "grad_norm": 0.1879284679889679, "learning_rate": 6.757660722311653e-05, "loss": 0.0214, "step": 4584 }, { "epoch": 0.6057403309442811, "grad_norm": 0.18958178162574768, "learning_rate": 6.75372943189063e-05, "loss": 0.0199, "step": 4585 }, { "epoch": 0.6058724444297652, "grad_norm": 0.17727269232273102, "learning_rate": 6.749798702202151e-05, "loss": 0.0221, "step": 4586 }, { "epoch": 0.6060045579152492, "grad_norm": 0.24471880495548248, "learning_rate": 6.745868533925177e-05, "loss": 0.0229, "step": 4587 }, { "epoch": 0.6061366714007332, "grad_norm": 0.27247729897499084, "learning_rate": 6.741938927738568e-05, "loss": 0.0215, "step": 4588 }, { "epoch": 0.6062687848862173, "grad_norm": 0.137033611536026, "learning_rate": 6.738009884321094e-05, "loss": 0.009, "step": 4589 }, { "epoch": 0.6064008983717013, "grad_norm": 0.20412221550941467, "learning_rate": 6.734081404351423e-05, "loss": 0.0179, "step": 4590 }, { "epoch": 0.6065330118571853, "grad_norm": 0.17425969243049622, "learning_rate": 6.730153488508124e-05, "loss": 0.0108, "step": 4591 }, { "epoch": 0.6066651253426694, "grad_norm": 0.2222953587770462, "learning_rate": 6.726226137469673e-05, "loss": 0.0222, "step": 4592 }, { "epoch": 0.6067972388281534, "grad_norm": 0.16521556675434113, "learning_rate": 6.722299351914448e-05, "loss": 0.0138, "step": 4593 }, { "epoch": 0.6069293523136374, "grad_norm": 0.2272307425737381, "learning_rate": 6.718373132520724e-05, "loss": 0.0239, "step": 4594 }, { "epoch": 0.6070614657991215, "grad_norm": 0.12938404083251953, "learning_rate": 6.714447479966683e-05, "loss": 0.0124, "step": 4595 }, { "epoch": 0.6071935792846055, "grad_norm": 0.1868760585784912, "learning_rate": 6.710522394930412e-05, "loss": 0.0228, "step": 4596 }, { "epoch": 0.6073256927700895, "grad_norm": 0.18667960166931152, "learning_rate": 6.706597878089888e-05, "loss": 0.0148, "step": 4597 }, { "epoch": 0.6074578062555736, "grad_norm": 0.2412571907043457, "learning_rate": 6.702673930123009e-05, "loss": 0.0295, "step": 4598 }, { "epoch": 0.6075899197410576, "grad_norm": 0.1540968120098114, "learning_rate": 6.698750551707553e-05, "loss": 0.0183, "step": 4599 }, { "epoch": 0.6077220332265416, "grad_norm": 0.1171368882060051, "learning_rate": 6.694827743521217e-05, "loss": 0.0096, "step": 4600 }, { "epoch": 0.6078541467120256, "grad_norm": 0.1463003009557724, "learning_rate": 6.690905506241591e-05, "loss": 0.0151, "step": 4601 }, { "epoch": 0.6079862601975097, "grad_norm": 0.12538869678974152, "learning_rate": 6.686983840546166e-05, "loss": 0.0207, "step": 4602 }, { "epoch": 0.6081183736829937, "grad_norm": 0.5921977162361145, "learning_rate": 6.683062747112341e-05, "loss": 0.038, "step": 4603 }, { "epoch": 0.6082504871684777, "grad_norm": 0.14486072957515717, "learning_rate": 6.679142226617406e-05, "loss": 0.0168, "step": 4604 }, { "epoch": 0.6083826006539618, "grad_norm": 0.1616198569536209, "learning_rate": 6.675222279738562e-05, "loss": 0.0208, "step": 4605 }, { "epoch": 0.6085147141394458, "grad_norm": 0.15971048176288605, "learning_rate": 6.671302907152902e-05, "loss": 0.019, "step": 4606 }, { "epoch": 0.6086468276249298, "grad_norm": 0.17092269659042358, "learning_rate": 6.66738410953743e-05, "loss": 0.0195, "step": 4607 }, { "epoch": 0.6087789411104139, "grad_norm": 0.1348509043455124, "learning_rate": 6.663465887569043e-05, "loss": 0.0153, "step": 4608 }, { "epoch": 0.6089110545958979, "grad_norm": 0.12598492205142975, "learning_rate": 6.659548241924537e-05, "loss": 0.0111, "step": 4609 }, { "epoch": 0.6090431680813819, "grad_norm": 0.17370101809501648, "learning_rate": 6.655631173280613e-05, "loss": 0.0173, "step": 4610 }, { "epoch": 0.609175281566866, "grad_norm": 0.18581904470920563, "learning_rate": 6.651714682313877e-05, "loss": 0.0131, "step": 4611 }, { "epoch": 0.60930739505235, "grad_norm": 0.14752104878425598, "learning_rate": 6.647798769700824e-05, "loss": 0.0159, "step": 4612 }, { "epoch": 0.609439508537834, "grad_norm": 0.3449000418186188, "learning_rate": 6.64388343611786e-05, "loss": 0.0429, "step": 4613 }, { "epoch": 0.609571622023318, "grad_norm": 0.12423645704984665, "learning_rate": 6.639968682241277e-05, "loss": 0.0111, "step": 4614 }, { "epoch": 0.6097037355088021, "grad_norm": 0.25737959146499634, "learning_rate": 6.636054508747286e-05, "loss": 0.0232, "step": 4615 }, { "epoch": 0.6098358489942861, "grad_norm": 0.20272405445575714, "learning_rate": 6.632140916311981e-05, "loss": 0.0195, "step": 4616 }, { "epoch": 0.6099679624797701, "grad_norm": 0.3123978078365326, "learning_rate": 6.62822790561136e-05, "loss": 0.0341, "step": 4617 }, { "epoch": 0.6101000759652542, "grad_norm": 0.15190282464027405, "learning_rate": 6.624315477321328e-05, "loss": 0.017, "step": 4618 }, { "epoch": 0.6102321894507382, "grad_norm": 0.1630333811044693, "learning_rate": 6.62040363211768e-05, "loss": 0.0185, "step": 4619 }, { "epoch": 0.6103643029362222, "grad_norm": 0.16900700330734253, "learning_rate": 6.616492370676114e-05, "loss": 0.0183, "step": 4620 }, { "epoch": 0.6104964164217063, "grad_norm": 0.1657828986644745, "learning_rate": 6.612581693672231e-05, "loss": 0.0291, "step": 4621 }, { "epoch": 0.6106285299071903, "grad_norm": 0.1700577735900879, "learning_rate": 6.608671601781525e-05, "loss": 0.0158, "step": 4622 }, { "epoch": 0.6107606433926743, "grad_norm": 0.18941977620124817, "learning_rate": 6.60476209567939e-05, "loss": 0.0286, "step": 4623 }, { "epoch": 0.6108927568781584, "grad_norm": 0.21406427025794983, "learning_rate": 6.600853176041121e-05, "loss": 0.0257, "step": 4624 }, { "epoch": 0.6110248703636424, "grad_norm": 0.36508557200431824, "learning_rate": 6.596944843541913e-05, "loss": 0.0158, "step": 4625 }, { "epoch": 0.6111569838491264, "grad_norm": 0.31032535433769226, "learning_rate": 6.593037098856853e-05, "loss": 0.0232, "step": 4626 }, { "epoch": 0.6112890973346105, "grad_norm": 0.30940091609954834, "learning_rate": 6.589129942660936e-05, "loss": 0.0227, "step": 4627 }, { "epoch": 0.6114212108200945, "grad_norm": 0.19503773748874664, "learning_rate": 6.585223375629044e-05, "loss": 0.0217, "step": 4628 }, { "epoch": 0.6115533243055785, "grad_norm": 0.19577208161354065, "learning_rate": 6.58131739843597e-05, "loss": 0.026, "step": 4629 }, { "epoch": 0.6116854377910625, "grad_norm": 0.0925767570734024, "learning_rate": 6.577412011756394e-05, "loss": 0.0078, "step": 4630 }, { "epoch": 0.6118175512765466, "grad_norm": 0.6488147974014282, "learning_rate": 6.5735072162649e-05, "loss": 0.0224, "step": 4631 }, { "epoch": 0.6119496647620306, "grad_norm": 0.16737666726112366, "learning_rate": 6.569603012635969e-05, "loss": 0.0208, "step": 4632 }, { "epoch": 0.6120817782475146, "grad_norm": 0.12599046528339386, "learning_rate": 6.565699401543977e-05, "loss": 0.0113, "step": 4633 }, { "epoch": 0.6122138917329987, "grad_norm": 0.19616693258285522, "learning_rate": 6.561796383663203e-05, "loss": 0.0108, "step": 4634 }, { "epoch": 0.6123460052184827, "grad_norm": 0.1805287003517151, "learning_rate": 6.557893959667817e-05, "loss": 0.0133, "step": 4635 }, { "epoch": 0.6124781187039667, "grad_norm": 0.16672852635383606, "learning_rate": 6.553992130231892e-05, "loss": 0.0189, "step": 4636 }, { "epoch": 0.6126102321894508, "grad_norm": 0.10782662779092789, "learning_rate": 6.550090896029397e-05, "loss": 0.0119, "step": 4637 }, { "epoch": 0.6127423456749348, "grad_norm": 0.21042101085186005, "learning_rate": 6.546190257734194e-05, "loss": 0.0238, "step": 4638 }, { "epoch": 0.6128744591604188, "grad_norm": 0.16405358910560608, "learning_rate": 6.542290216020048e-05, "loss": 0.017, "step": 4639 }, { "epoch": 0.6130065726459029, "grad_norm": 0.22960861027240753, "learning_rate": 6.538390771560616e-05, "loss": 0.0229, "step": 4640 }, { "epoch": 0.6131386861313869, "grad_norm": 0.1733035296201706, "learning_rate": 6.534491925029458e-05, "loss": 0.0073, "step": 4641 }, { "epoch": 0.6132707996168709, "grad_norm": 0.1495857983827591, "learning_rate": 6.530593677100025e-05, "loss": 0.0172, "step": 4642 }, { "epoch": 0.613402913102355, "grad_norm": 0.18175730109214783, "learning_rate": 6.526696028445663e-05, "loss": 0.0201, "step": 4643 }, { "epoch": 0.613535026587839, "grad_norm": 0.1300545483827591, "learning_rate": 6.522798979739622e-05, "loss": 0.0121, "step": 4644 }, { "epoch": 0.613667140073323, "grad_norm": 0.16516652703285217, "learning_rate": 6.518902531655043e-05, "loss": 0.0181, "step": 4645 }, { "epoch": 0.613799253558807, "grad_norm": 0.19759850203990936, "learning_rate": 6.515006684864963e-05, "loss": 0.0216, "step": 4646 }, { "epoch": 0.6139313670442911, "grad_norm": 0.22777485847473145, "learning_rate": 6.51111144004232e-05, "loss": 0.0171, "step": 4647 }, { "epoch": 0.6140634805297751, "grad_norm": 0.20385077595710754, "learning_rate": 6.507216797859944e-05, "loss": 0.0115, "step": 4648 }, { "epoch": 0.6141955940152591, "grad_norm": 0.13005998730659485, "learning_rate": 6.503322758990559e-05, "loss": 0.0119, "step": 4649 }, { "epoch": 0.6143277075007432, "grad_norm": 0.16478468477725983, "learning_rate": 6.49942932410679e-05, "loss": 0.0166, "step": 4650 }, { "epoch": 0.6144598209862272, "grad_norm": 0.1881481558084488, "learning_rate": 6.495536493881154e-05, "loss": 0.0305, "step": 4651 }, { "epoch": 0.6145919344717112, "grad_norm": 0.15473182499408722, "learning_rate": 6.491644268986064e-05, "loss": 0.0238, "step": 4652 }, { "epoch": 0.6147240479571953, "grad_norm": 0.13667252659797668, "learning_rate": 6.487752650093832e-05, "loss": 0.0111, "step": 4653 }, { "epoch": 0.6148561614426793, "grad_norm": 0.29269251227378845, "learning_rate": 6.483861637876657e-05, "loss": 0.0307, "step": 4654 }, { "epoch": 0.6149882749281633, "grad_norm": 0.22216090559959412, "learning_rate": 6.479971233006645e-05, "loss": 0.0186, "step": 4655 }, { "epoch": 0.6151203884136474, "grad_norm": 0.1534397006034851, "learning_rate": 6.476081436155787e-05, "loss": 0.008, "step": 4656 }, { "epoch": 0.6152525018991314, "grad_norm": 0.19428527355194092, "learning_rate": 6.472192247995971e-05, "loss": 0.0124, "step": 4657 }, { "epoch": 0.6153846153846154, "grad_norm": 0.16967034339904785, "learning_rate": 6.468303669198985e-05, "loss": 0.0167, "step": 4658 }, { "epoch": 0.6155167288700994, "grad_norm": 0.17743651568889618, "learning_rate": 6.464415700436506e-05, "loss": 0.0156, "step": 4659 }, { "epoch": 0.6156488423555835, "grad_norm": 0.19646964967250824, "learning_rate": 6.460528342380112e-05, "loss": 0.0197, "step": 4660 }, { "epoch": 0.6157809558410675, "grad_norm": 0.5039405822753906, "learning_rate": 6.456641595701265e-05, "loss": 0.0292, "step": 4661 }, { "epoch": 0.6159130693265515, "grad_norm": 0.17759563028812408, "learning_rate": 6.452755461071334e-05, "loss": 0.0154, "step": 4662 }, { "epoch": 0.6160451828120356, "grad_norm": 0.19328869879245758, "learning_rate": 6.448869939161573e-05, "loss": 0.0101, "step": 4663 }, { "epoch": 0.6161772962975196, "grad_norm": 0.17122767865657806, "learning_rate": 6.444985030643131e-05, "loss": 0.0163, "step": 4664 }, { "epoch": 0.6163094097830036, "grad_norm": 0.1447526514530182, "learning_rate": 6.441100736187058e-05, "loss": 0.0128, "step": 4665 }, { "epoch": 0.6164415232684877, "grad_norm": 0.19974549114704132, "learning_rate": 6.43721705646429e-05, "loss": 0.0261, "step": 4666 }, { "epoch": 0.6165736367539717, "grad_norm": 0.19671718776226044, "learning_rate": 6.433333992145662e-05, "loss": 0.0304, "step": 4667 }, { "epoch": 0.6167057502394557, "grad_norm": 0.1849774569272995, "learning_rate": 6.429451543901899e-05, "loss": 0.0241, "step": 4668 }, { "epoch": 0.6168378637249398, "grad_norm": 0.20412255823612213, "learning_rate": 6.425569712403619e-05, "loss": 0.0193, "step": 4669 }, { "epoch": 0.6169699772104238, "grad_norm": 0.6266623735427856, "learning_rate": 6.421688498321344e-05, "loss": 0.0203, "step": 4670 }, { "epoch": 0.6171020906959078, "grad_norm": 0.1785677969455719, "learning_rate": 6.417807902325477e-05, "loss": 0.0174, "step": 4671 }, { "epoch": 0.6172342041813919, "grad_norm": 0.0982740968465805, "learning_rate": 6.413927925086313e-05, "loss": 0.01, "step": 4672 }, { "epoch": 0.6173663176668759, "grad_norm": 0.12710632383823395, "learning_rate": 6.410048567274056e-05, "loss": 0.0121, "step": 4673 }, { "epoch": 0.6174984311523599, "grad_norm": 0.30231770873069763, "learning_rate": 6.406169829558781e-05, "loss": 0.0206, "step": 4674 }, { "epoch": 0.617630544637844, "grad_norm": 0.1966603398323059, "learning_rate": 6.402291712610477e-05, "loss": 0.0164, "step": 4675 }, { "epoch": 0.617762658123328, "grad_norm": 0.23650838434696198, "learning_rate": 6.398414217099011e-05, "loss": 0.0312, "step": 4676 }, { "epoch": 0.617894771608812, "grad_norm": 0.19775453209877014, "learning_rate": 6.39453734369415e-05, "loss": 0.0226, "step": 4677 }, { "epoch": 0.618026885094296, "grad_norm": 0.14034952223300934, "learning_rate": 6.39066109306555e-05, "loss": 0.0179, "step": 4678 }, { "epoch": 0.6181589985797801, "grad_norm": 0.21675540506839752, "learning_rate": 6.386785465882761e-05, "loss": 0.0164, "step": 4679 }, { "epoch": 0.6182911120652641, "grad_norm": 0.18577510118484497, "learning_rate": 6.382910462815228e-05, "loss": 0.0171, "step": 4680 }, { "epoch": 0.6184232255507481, "grad_norm": 0.19053871929645538, "learning_rate": 6.379036084532279e-05, "loss": 0.0139, "step": 4681 }, { "epoch": 0.6185553390362322, "grad_norm": 0.2634298801422119, "learning_rate": 6.375162331703146e-05, "loss": 0.0209, "step": 4682 }, { "epoch": 0.6186874525217162, "grad_norm": 0.1316649615764618, "learning_rate": 6.371289204996946e-05, "loss": 0.0082, "step": 4683 }, { "epoch": 0.6188195660072002, "grad_norm": 0.17446738481521606, "learning_rate": 6.36741670508269e-05, "loss": 0.0209, "step": 4684 }, { "epoch": 0.6189516794926843, "grad_norm": 0.24308434128761292, "learning_rate": 6.363544832629277e-05, "loss": 0.0125, "step": 4685 }, { "epoch": 0.6190837929781683, "grad_norm": 0.15134310722351074, "learning_rate": 6.359673588305502e-05, "loss": 0.0147, "step": 4686 }, { "epoch": 0.6192159064636523, "grad_norm": 0.24589499831199646, "learning_rate": 6.355802972780052e-05, "loss": 0.0296, "step": 4687 }, { "epoch": 0.6193480199491364, "grad_norm": 0.16589580476284027, "learning_rate": 6.351932986721498e-05, "loss": 0.0143, "step": 4688 }, { "epoch": 0.6194801334346204, "grad_norm": 0.16544070839881897, "learning_rate": 6.348063630798316e-05, "loss": 0.0222, "step": 4689 }, { "epoch": 0.6196122469201044, "grad_norm": 0.21558035910129547, "learning_rate": 6.344194905678858e-05, "loss": 0.0191, "step": 4690 }, { "epoch": 0.6197443604055884, "grad_norm": 0.12022353708744049, "learning_rate": 6.340326812031378e-05, "loss": 0.0156, "step": 4691 }, { "epoch": 0.6198764738910725, "grad_norm": 0.15502390265464783, "learning_rate": 6.336459350524016e-05, "loss": 0.0116, "step": 4692 }, { "epoch": 0.6200085873765565, "grad_norm": 0.10256446897983551, "learning_rate": 6.332592521824801e-05, "loss": 0.0061, "step": 4693 }, { "epoch": 0.6201407008620405, "grad_norm": 0.40712037682533264, "learning_rate": 6.328726326601659e-05, "loss": 0.0313, "step": 4694 }, { "epoch": 0.6202728143475246, "grad_norm": 0.23398032784461975, "learning_rate": 6.3248607655224e-05, "loss": 0.0134, "step": 4695 }, { "epoch": 0.6204049278330086, "grad_norm": 0.14428845047950745, "learning_rate": 6.320995839254731e-05, "loss": 0.0139, "step": 4696 }, { "epoch": 0.6205370413184926, "grad_norm": 0.14608009159564972, "learning_rate": 6.31713154846624e-05, "loss": 0.0141, "step": 4697 }, { "epoch": 0.6206691548039767, "grad_norm": 0.23591239750385284, "learning_rate": 6.313267893824417e-05, "loss": 0.0179, "step": 4698 }, { "epoch": 0.6208012682894607, "grad_norm": 0.18376502394676208, "learning_rate": 6.309404875996632e-05, "loss": 0.0214, "step": 4699 }, { "epoch": 0.6209333817749447, "grad_norm": 0.20980310440063477, "learning_rate": 6.30554249565015e-05, "loss": 0.0261, "step": 4700 }, { "epoch": 0.6210654952604288, "grad_norm": 0.2826692759990692, "learning_rate": 6.301680753452129e-05, "loss": 0.0372, "step": 4701 }, { "epoch": 0.6211976087459128, "grad_norm": 0.2074815183877945, "learning_rate": 6.297819650069605e-05, "loss": 0.0118, "step": 4702 }, { "epoch": 0.6213297222313968, "grad_norm": 0.16915234923362732, "learning_rate": 6.293959186169521e-05, "loss": 0.0185, "step": 4703 }, { "epoch": 0.6214618357168809, "grad_norm": 0.14982269704341888, "learning_rate": 6.290099362418689e-05, "loss": 0.0247, "step": 4704 }, { "epoch": 0.6215939492023649, "grad_norm": 0.16975736618041992, "learning_rate": 6.286240179483831e-05, "loss": 0.0156, "step": 4705 }, { "epoch": 0.6217260626878489, "grad_norm": 0.1671624630689621, "learning_rate": 6.282381638031545e-05, "loss": 0.0152, "step": 4706 }, { "epoch": 0.6218581761733329, "grad_norm": 0.16445261240005493, "learning_rate": 6.278523738728317e-05, "loss": 0.0126, "step": 4707 }, { "epoch": 0.621990289658817, "grad_norm": 0.18265871703624725, "learning_rate": 6.274666482240536e-05, "loss": 0.0112, "step": 4708 }, { "epoch": 0.622122403144301, "grad_norm": 0.17513689398765564, "learning_rate": 6.270809869234466e-05, "loss": 0.0254, "step": 4709 }, { "epoch": 0.622254516629785, "grad_norm": 0.19515810906887054, "learning_rate": 6.266953900376265e-05, "loss": 0.0146, "step": 4710 }, { "epoch": 0.6223866301152691, "grad_norm": 0.17156194150447845, "learning_rate": 6.263098576331978e-05, "loss": 0.0222, "step": 4711 }, { "epoch": 0.6225187436007531, "grad_norm": 0.23880699276924133, "learning_rate": 6.259243897767546e-05, "loss": 0.0183, "step": 4712 }, { "epoch": 0.6226508570862371, "grad_norm": 0.14619049429893494, "learning_rate": 6.255389865348787e-05, "loss": 0.0136, "step": 4713 }, { "epoch": 0.6227829705717212, "grad_norm": 0.19037936627864838, "learning_rate": 6.251536479741414e-05, "loss": 0.0096, "step": 4714 }, { "epoch": 0.6229150840572052, "grad_norm": 0.15618152916431427, "learning_rate": 6.24768374161103e-05, "loss": 0.0232, "step": 4715 }, { "epoch": 0.6230471975426892, "grad_norm": 0.14753462374210358, "learning_rate": 6.243831651623118e-05, "loss": 0.0215, "step": 4716 }, { "epoch": 0.6231793110281733, "grad_norm": 0.14098712801933289, "learning_rate": 6.239980210443061e-05, "loss": 0.019, "step": 4717 }, { "epoch": 0.6233114245136573, "grad_norm": 0.24728527665138245, "learning_rate": 6.236129418736118e-05, "loss": 0.0247, "step": 4718 }, { "epoch": 0.6234435379991413, "grad_norm": 0.20924845337867737, "learning_rate": 6.232279277167448e-05, "loss": 0.0266, "step": 4719 }, { "epoch": 0.6235756514846253, "grad_norm": 0.18728812038898468, "learning_rate": 6.228429786402084e-05, "loss": 0.0139, "step": 4720 }, { "epoch": 0.6237077649701093, "grad_norm": 0.17054276168346405, "learning_rate": 6.224580947104957e-05, "loss": 0.0142, "step": 4721 }, { "epoch": 0.6238398784555933, "grad_norm": 0.14980489015579224, "learning_rate": 6.220732759940882e-05, "loss": 0.0168, "step": 4722 }, { "epoch": 0.6239719919410773, "grad_norm": 0.10852868854999542, "learning_rate": 6.216885225574558e-05, "loss": 0.0107, "step": 4723 }, { "epoch": 0.6241041054265614, "grad_norm": 0.14671917259693146, "learning_rate": 6.213038344670579e-05, "loss": 0.0185, "step": 4724 }, { "epoch": 0.6242362189120454, "grad_norm": 0.1404392421245575, "learning_rate": 6.209192117893418e-05, "loss": 0.0118, "step": 4725 }, { "epoch": 0.6243683323975294, "grad_norm": 0.17956751585006714, "learning_rate": 6.205346545907445e-05, "loss": 0.0196, "step": 4726 }, { "epoch": 0.6245004458830135, "grad_norm": 0.14269684255123138, "learning_rate": 6.201501629376906e-05, "loss": 0.0148, "step": 4727 }, { "epoch": 0.6246325593684975, "grad_norm": 0.11928752064704895, "learning_rate": 6.197657368965934e-05, "loss": 0.0058, "step": 4728 }, { "epoch": 0.6247646728539815, "grad_norm": 0.27102896571159363, "learning_rate": 6.193813765338561e-05, "loss": 0.0393, "step": 4729 }, { "epoch": 0.6248967863394655, "grad_norm": 0.1431749016046524, "learning_rate": 6.189970819158696e-05, "loss": 0.0079, "step": 4730 }, { "epoch": 0.6250288998249496, "grad_norm": 0.12428173422813416, "learning_rate": 6.186128531090132e-05, "loss": 0.0142, "step": 4731 }, { "epoch": 0.6251610133104336, "grad_norm": 0.12851598858833313, "learning_rate": 6.182286901796558e-05, "loss": 0.0165, "step": 4732 }, { "epoch": 0.6252931267959176, "grad_norm": 0.14972122013568878, "learning_rate": 6.17844593194154e-05, "loss": 0.0221, "step": 4733 }, { "epoch": 0.6254252402814017, "grad_norm": 0.1200818419456482, "learning_rate": 6.174605622188536e-05, "loss": 0.0173, "step": 4734 }, { "epoch": 0.6255573537668857, "grad_norm": 0.10524090379476547, "learning_rate": 6.170765973200887e-05, "loss": 0.0198, "step": 4735 }, { "epoch": 0.6256894672523697, "grad_norm": 0.07244350016117096, "learning_rate": 6.166926985641817e-05, "loss": 0.0084, "step": 4736 }, { "epoch": 0.6258215807378538, "grad_norm": 0.1618015170097351, "learning_rate": 6.163088660174443e-05, "loss": 0.0174, "step": 4737 }, { "epoch": 0.6259536942233378, "grad_norm": 0.3594222664833069, "learning_rate": 6.159250997461763e-05, "loss": 0.0215, "step": 4738 }, { "epoch": 0.6260858077088218, "grad_norm": 0.163102388381958, "learning_rate": 6.155413998166664e-05, "loss": 0.0136, "step": 4739 }, { "epoch": 0.6262179211943059, "grad_norm": 0.1344098001718521, "learning_rate": 6.15157766295191e-05, "loss": 0.0156, "step": 4740 }, { "epoch": 0.6263500346797899, "grad_norm": 0.12539459764957428, "learning_rate": 6.147741992480163e-05, "loss": 0.0197, "step": 4741 }, { "epoch": 0.6264821481652739, "grad_norm": 0.1360604166984558, "learning_rate": 6.143906987413959e-05, "loss": 0.0184, "step": 4742 }, { "epoch": 0.626614261650758, "grad_norm": 0.12114249914884567, "learning_rate": 6.140072648415722e-05, "loss": 0.0107, "step": 4743 }, { "epoch": 0.626746375136242, "grad_norm": 0.19674722850322723, "learning_rate": 6.136238976147768e-05, "loss": 0.0165, "step": 4744 }, { "epoch": 0.626878488621726, "grad_norm": 0.23598308861255646, "learning_rate": 6.132405971272288e-05, "loss": 0.0221, "step": 4745 }, { "epoch": 0.62701060210721, "grad_norm": 0.12908229231834412, "learning_rate": 6.128573634451364e-05, "loss": 0.0131, "step": 4746 }, { "epoch": 0.6271427155926941, "grad_norm": 0.32704001665115356, "learning_rate": 6.124741966346956e-05, "loss": 0.0318, "step": 4747 }, { "epoch": 0.6272748290781781, "grad_norm": 0.1676252782344818, "learning_rate": 6.120910967620921e-05, "loss": 0.0216, "step": 4748 }, { "epoch": 0.6274069425636621, "grad_norm": 0.15792980790138245, "learning_rate": 6.117080638934987e-05, "loss": 0.0179, "step": 4749 }, { "epoch": 0.6275390560491462, "grad_norm": 0.12319156527519226, "learning_rate": 6.113250980950772e-05, "loss": 0.0142, "step": 4750 }, { "epoch": 0.6276711695346302, "grad_norm": 0.17394210398197174, "learning_rate": 6.109421994329778e-05, "loss": 0.0188, "step": 4751 }, { "epoch": 0.6278032830201142, "grad_norm": 0.19802264869213104, "learning_rate": 6.10559367973339e-05, "loss": 0.017, "step": 4752 }, { "epoch": 0.6279353965055983, "grad_norm": 0.14336781203746796, "learning_rate": 6.1017660378228814e-05, "loss": 0.0161, "step": 4753 }, { "epoch": 0.6280675099910823, "grad_norm": 0.19559355080127716, "learning_rate": 6.097939069259402e-05, "loss": 0.0112, "step": 4754 }, { "epoch": 0.6281996234765663, "grad_norm": 0.11833098530769348, "learning_rate": 6.0941127747039914e-05, "loss": 0.0076, "step": 4755 }, { "epoch": 0.6283317369620504, "grad_norm": 0.2858635187149048, "learning_rate": 6.09028715481757e-05, "loss": 0.028, "step": 4756 }, { "epoch": 0.6284638504475344, "grad_norm": 0.2030288726091385, "learning_rate": 6.0864622102609395e-05, "loss": 0.0207, "step": 4757 }, { "epoch": 0.6285959639330184, "grad_norm": 0.1523391306400299, "learning_rate": 6.082637941694792e-05, "loss": 0.0165, "step": 4758 }, { "epoch": 0.6287280774185025, "grad_norm": 0.17553837597370148, "learning_rate": 6.078814349779693e-05, "loss": 0.018, "step": 4759 }, { "epoch": 0.6288601909039865, "grad_norm": 0.214276984333992, "learning_rate": 6.074991435176103e-05, "loss": 0.0291, "step": 4760 }, { "epoch": 0.6289923043894705, "grad_norm": 0.14726516604423523, "learning_rate": 6.071169198544353e-05, "loss": 0.022, "step": 4761 }, { "epoch": 0.6291244178749545, "grad_norm": 0.13246333599090576, "learning_rate": 6.067347640544668e-05, "loss": 0.0102, "step": 4762 }, { "epoch": 0.6292565313604386, "grad_norm": 0.204450324177742, "learning_rate": 6.0635267618371485e-05, "loss": 0.027, "step": 4763 }, { "epoch": 0.6293886448459226, "grad_norm": 0.14314386248588562, "learning_rate": 6.059706563081777e-05, "loss": 0.0129, "step": 4764 }, { "epoch": 0.6295207583314066, "grad_norm": 0.3385053873062134, "learning_rate": 6.055887044938426e-05, "loss": 0.018, "step": 4765 }, { "epoch": 0.6296528718168907, "grad_norm": 0.14576059579849243, "learning_rate": 6.052068208066841e-05, "loss": 0.0083, "step": 4766 }, { "epoch": 0.6297849853023747, "grad_norm": 0.22583547234535217, "learning_rate": 6.048250053126661e-05, "loss": 0.0127, "step": 4767 }, { "epoch": 0.6299170987878587, "grad_norm": 0.26217544078826904, "learning_rate": 6.044432580777395e-05, "loss": 0.0125, "step": 4768 }, { "epoch": 0.6300492122733428, "grad_norm": 0.12332071363925934, "learning_rate": 6.040615791678444e-05, "loss": 0.0104, "step": 4769 }, { "epoch": 0.6301813257588268, "grad_norm": 0.2592184543609619, "learning_rate": 6.036799686489085e-05, "loss": 0.0328, "step": 4770 }, { "epoch": 0.6303134392443108, "grad_norm": 0.10757103562355042, "learning_rate": 6.032984265868478e-05, "loss": 0.0145, "step": 4771 }, { "epoch": 0.6304455527297949, "grad_norm": 0.13460460305213928, "learning_rate": 6.029169530475668e-05, "loss": 0.0179, "step": 4772 }, { "epoch": 0.6305776662152789, "grad_norm": 0.1447414755821228, "learning_rate": 6.0253554809695765e-05, "loss": 0.0176, "step": 4773 }, { "epoch": 0.6307097797007629, "grad_norm": 0.2827383875846863, "learning_rate": 6.021542118009012e-05, "loss": 0.0216, "step": 4774 }, { "epoch": 0.630841893186247, "grad_norm": 0.13897140324115753, "learning_rate": 6.0177294422526584e-05, "loss": 0.0149, "step": 4775 }, { "epoch": 0.630974006671731, "grad_norm": 0.22664396464824677, "learning_rate": 6.013917454359088e-05, "loss": 0.0114, "step": 4776 }, { "epoch": 0.631106120157215, "grad_norm": 0.19409899413585663, "learning_rate": 6.01010615498675e-05, "loss": 0.0221, "step": 4777 }, { "epoch": 0.631238233642699, "grad_norm": 0.21891489624977112, "learning_rate": 6.0062955447939694e-05, "loss": 0.031, "step": 4778 }, { "epoch": 0.6313703471281831, "grad_norm": 0.1206694096326828, "learning_rate": 6.002485624438965e-05, "loss": 0.0104, "step": 4779 }, { "epoch": 0.6315024606136671, "grad_norm": 0.23604559898376465, "learning_rate": 5.998676394579824e-05, "loss": 0.023, "step": 4780 }, { "epoch": 0.6316345740991511, "grad_norm": 0.22032544016838074, "learning_rate": 5.994867855874524e-05, "loss": 0.016, "step": 4781 }, { "epoch": 0.6317666875846352, "grad_norm": 0.15425023436546326, "learning_rate": 5.991060008980916e-05, "loss": 0.0132, "step": 4782 }, { "epoch": 0.6318988010701192, "grad_norm": 0.1916843056678772, "learning_rate": 5.9872528545567365e-05, "loss": 0.0273, "step": 4783 }, { "epoch": 0.6320309145556032, "grad_norm": 0.22243209183216095, "learning_rate": 5.9834463932595984e-05, "loss": 0.0239, "step": 4784 }, { "epoch": 0.6321630280410873, "grad_norm": 0.16001495718955994, "learning_rate": 5.979640625746996e-05, "loss": 0.0135, "step": 4785 }, { "epoch": 0.6322951415265713, "grad_norm": 0.20562537014484406, "learning_rate": 5.975835552676303e-05, "loss": 0.0169, "step": 4786 }, { "epoch": 0.6324272550120553, "grad_norm": 0.19932161271572113, "learning_rate": 5.972031174704782e-05, "loss": 0.0173, "step": 4787 }, { "epoch": 0.6325593684975394, "grad_norm": 0.17273645102977753, "learning_rate": 5.968227492489562e-05, "loss": 0.0225, "step": 4788 }, { "epoch": 0.6326914819830234, "grad_norm": 0.20375372469425201, "learning_rate": 5.96442450668766e-05, "loss": 0.0197, "step": 4789 }, { "epoch": 0.6328235954685074, "grad_norm": 0.189348503947258, "learning_rate": 5.960622217955969e-05, "loss": 0.0156, "step": 4790 }, { "epoch": 0.6329557089539914, "grad_norm": 0.1797255575656891, "learning_rate": 5.956820626951267e-05, "loss": 0.0157, "step": 4791 }, { "epoch": 0.6330878224394755, "grad_norm": 0.178375706076622, "learning_rate": 5.9530197343302054e-05, "loss": 0.0136, "step": 4792 }, { "epoch": 0.6332199359249595, "grad_norm": 0.15374751389026642, "learning_rate": 5.9492195407493144e-05, "loss": 0.0177, "step": 4793 }, { "epoch": 0.6333520494104435, "grad_norm": 0.17693348228931427, "learning_rate": 5.945420046865011e-05, "loss": 0.0204, "step": 4794 }, { "epoch": 0.6334841628959276, "grad_norm": 0.236887127161026, "learning_rate": 5.941621253333585e-05, "loss": 0.0131, "step": 4795 }, { "epoch": 0.6336162763814116, "grad_norm": 0.13650371134281158, "learning_rate": 5.937823160811207e-05, "loss": 0.0148, "step": 4796 }, { "epoch": 0.6337483898668956, "grad_norm": 0.17048349976539612, "learning_rate": 5.9340257699539236e-05, "loss": 0.0119, "step": 4797 }, { "epoch": 0.6338805033523797, "grad_norm": 0.11208688467741013, "learning_rate": 5.9302290814176684e-05, "loss": 0.0083, "step": 4798 }, { "epoch": 0.6340126168378637, "grad_norm": 0.1634460985660553, "learning_rate": 5.926433095858247e-05, "loss": 0.0128, "step": 4799 }, { "epoch": 0.6341447303233477, "grad_norm": 0.16812430322170258, "learning_rate": 5.922637813931341e-05, "loss": 0.0181, "step": 4800 }, { "epoch": 0.6342768438088318, "grad_norm": 0.13605019450187683, "learning_rate": 5.9188432362925196e-05, "loss": 0.014, "step": 4801 }, { "epoch": 0.6344089572943158, "grad_norm": 0.11597549915313721, "learning_rate": 5.9150493635972194e-05, "loss": 0.0168, "step": 4802 }, { "epoch": 0.6345410707797998, "grad_norm": 0.2631230056285858, "learning_rate": 5.9112561965007676e-05, "loss": 0.0202, "step": 4803 }, { "epoch": 0.6346731842652839, "grad_norm": 0.1404346376657486, "learning_rate": 5.9074637356583564e-05, "loss": 0.0124, "step": 4804 }, { "epoch": 0.6348052977507679, "grad_norm": 0.13995184004306793, "learning_rate": 5.9036719817250675e-05, "loss": 0.0122, "step": 4805 }, { "epoch": 0.6349374112362519, "grad_norm": 0.12033804506063461, "learning_rate": 5.899880935355854e-05, "loss": 0.0094, "step": 4806 }, { "epoch": 0.635069524721736, "grad_norm": 0.19234386086463928, "learning_rate": 5.896090597205546e-05, "loss": 0.0171, "step": 4807 }, { "epoch": 0.63520163820722, "grad_norm": 0.1520492434501648, "learning_rate": 5.8923009679288565e-05, "loss": 0.0184, "step": 4808 }, { "epoch": 0.635333751692704, "grad_norm": 0.15152016282081604, "learning_rate": 5.8885120481803715e-05, "loss": 0.0178, "step": 4809 }, { "epoch": 0.635465865178188, "grad_norm": 0.15282461047172546, "learning_rate": 5.884723838614559e-05, "loss": 0.0156, "step": 4810 }, { "epoch": 0.6355979786636721, "grad_norm": 0.2525296211242676, "learning_rate": 5.880936339885754e-05, "loss": 0.0305, "step": 4811 }, { "epoch": 0.6357300921491561, "grad_norm": 0.19699762761592865, "learning_rate": 5.877149552648186e-05, "loss": 0.0171, "step": 4812 }, { "epoch": 0.6358622056346401, "grad_norm": 0.10816171765327454, "learning_rate": 5.8733634775559456e-05, "loss": 0.0124, "step": 4813 }, { "epoch": 0.6359943191201242, "grad_norm": 0.34312406182289124, "learning_rate": 5.869578115263006e-05, "loss": 0.0277, "step": 4814 }, { "epoch": 0.6361264326056082, "grad_norm": 0.1626928299665451, "learning_rate": 5.8657934664232205e-05, "loss": 0.0242, "step": 4815 }, { "epoch": 0.6362585460910922, "grad_norm": 0.12160996347665787, "learning_rate": 5.862009531690313e-05, "loss": 0.014, "step": 4816 }, { "epoch": 0.6363906595765763, "grad_norm": 0.2601321339607239, "learning_rate": 5.858226311717894e-05, "loss": 0.0173, "step": 4817 }, { "epoch": 0.6365227730620603, "grad_norm": 0.1506567746400833, "learning_rate": 5.8544438071594354e-05, "loss": 0.0216, "step": 4818 }, { "epoch": 0.6366548865475443, "grad_norm": 0.20273055136203766, "learning_rate": 5.8506620186683014e-05, "loss": 0.0238, "step": 4819 }, { "epoch": 0.6367870000330283, "grad_norm": 0.1697157919406891, "learning_rate": 5.846880946897722e-05, "loss": 0.0164, "step": 4820 }, { "epoch": 0.6369191135185124, "grad_norm": 0.10192979127168655, "learning_rate": 5.843100592500805e-05, "loss": 0.0113, "step": 4821 }, { "epoch": 0.6370512270039964, "grad_norm": 0.1476384401321411, "learning_rate": 5.839320956130542e-05, "loss": 0.0095, "step": 4822 }, { "epoch": 0.6371833404894804, "grad_norm": 0.16391359269618988, "learning_rate": 5.8355420384397865e-05, "loss": 0.0176, "step": 4823 }, { "epoch": 0.6373154539749645, "grad_norm": 0.124471016228199, "learning_rate": 5.83176384008128e-05, "loss": 0.0102, "step": 4824 }, { "epoch": 0.6374475674604485, "grad_norm": 0.17791259288787842, "learning_rate": 5.82798636170764e-05, "loss": 0.0136, "step": 4825 }, { "epoch": 0.6375796809459325, "grad_norm": 0.2600245475769043, "learning_rate": 5.824209603971347e-05, "loss": 0.0181, "step": 4826 }, { "epoch": 0.6377117944314166, "grad_norm": 0.2228402942419052, "learning_rate": 5.8204335675247676e-05, "loss": 0.0164, "step": 4827 }, { "epoch": 0.6378439079169006, "grad_norm": 0.1939469873905182, "learning_rate": 5.8166582530201483e-05, "loss": 0.0214, "step": 4828 }, { "epoch": 0.6379760214023846, "grad_norm": 0.38613161444664, "learning_rate": 5.812883661109594e-05, "loss": 0.0219, "step": 4829 }, { "epoch": 0.6381081348878687, "grad_norm": 0.1431112140417099, "learning_rate": 5.8091097924451e-05, "loss": 0.0173, "step": 4830 }, { "epoch": 0.6382402483733527, "grad_norm": 0.17689582705497742, "learning_rate": 5.805336647678531e-05, "loss": 0.0213, "step": 4831 }, { "epoch": 0.6383723618588367, "grad_norm": 0.12897051870822906, "learning_rate": 5.801564227461631e-05, "loss": 0.0157, "step": 4832 }, { "epoch": 0.6385044753443208, "grad_norm": 0.1856435388326645, "learning_rate": 5.797792532446008e-05, "loss": 0.0159, "step": 4833 }, { "epoch": 0.6386365888298048, "grad_norm": 0.21794654428958893, "learning_rate": 5.794021563283154e-05, "loss": 0.0111, "step": 4834 }, { "epoch": 0.6387687023152888, "grad_norm": 0.21521590650081635, "learning_rate": 5.7902513206244404e-05, "loss": 0.0244, "step": 4835 }, { "epoch": 0.6389008158007728, "grad_norm": 0.18167127668857574, "learning_rate": 5.786481805121096e-05, "loss": 0.0131, "step": 4836 }, { "epoch": 0.6390329292862569, "grad_norm": 0.1692265123128891, "learning_rate": 5.782713017424237e-05, "loss": 0.0151, "step": 4837 }, { "epoch": 0.6391650427717409, "grad_norm": 0.25949326157569885, "learning_rate": 5.7789449581848534e-05, "loss": 0.0149, "step": 4838 }, { "epoch": 0.6392971562572249, "grad_norm": 0.09822472929954529, "learning_rate": 5.7751776280538104e-05, "loss": 0.0088, "step": 4839 }, { "epoch": 0.639429269742709, "grad_norm": 0.16368919610977173, "learning_rate": 5.7714110276818354e-05, "loss": 0.0161, "step": 4840 }, { "epoch": 0.639561383228193, "grad_norm": 0.12360408902168274, "learning_rate": 5.7676451577195425e-05, "loss": 0.0135, "step": 4841 }, { "epoch": 0.639693496713677, "grad_norm": 0.17056262493133545, "learning_rate": 5.763880018817418e-05, "loss": 0.0191, "step": 4842 }, { "epoch": 0.6398256101991611, "grad_norm": 0.1338781863451004, "learning_rate": 5.760115611625814e-05, "loss": 0.0167, "step": 4843 }, { "epoch": 0.6399577236846451, "grad_norm": 0.30153313279151917, "learning_rate": 5.756351936794961e-05, "loss": 0.0173, "step": 4844 }, { "epoch": 0.6400898371701291, "grad_norm": 0.1391494870185852, "learning_rate": 5.75258899497497e-05, "loss": 0.0094, "step": 4845 }, { "epoch": 0.6402219506556132, "grad_norm": 0.20900259912014008, "learning_rate": 5.748826786815813e-05, "loss": 0.0297, "step": 4846 }, { "epoch": 0.6403540641410972, "grad_norm": 0.16079430282115936, "learning_rate": 5.745065312967344e-05, "loss": 0.0143, "step": 4847 }, { "epoch": 0.6404861776265812, "grad_norm": 0.19876495003700256, "learning_rate": 5.741304574079285e-05, "loss": 0.0127, "step": 4848 }, { "epoch": 0.6406182911120653, "grad_norm": 0.13469110429286957, "learning_rate": 5.73754457080124e-05, "loss": 0.0098, "step": 4849 }, { "epoch": 0.6407504045975493, "grad_norm": 0.15722264349460602, "learning_rate": 5.7337853037826706e-05, "loss": 0.018, "step": 4850 }, { "epoch": 0.6408825180830333, "grad_norm": 0.1624094843864441, "learning_rate": 5.730026773672923e-05, "loss": 0.0168, "step": 4851 }, { "epoch": 0.6410146315685173, "grad_norm": 0.35877251625061035, "learning_rate": 5.726268981121217e-05, "loss": 0.0136, "step": 4852 }, { "epoch": 0.6411467450540014, "grad_norm": 0.15343332290649414, "learning_rate": 5.7225119267766326e-05, "loss": 0.0125, "step": 4853 }, { "epoch": 0.6412788585394854, "grad_norm": 0.21046748757362366, "learning_rate": 5.718755611288137e-05, "loss": 0.021, "step": 4854 }, { "epoch": 0.6414109720249694, "grad_norm": 0.13725142180919647, "learning_rate": 5.715000035304561e-05, "loss": 0.011, "step": 4855 }, { "epoch": 0.6415430855104535, "grad_norm": 0.17193253338336945, "learning_rate": 5.7112451994746154e-05, "loss": 0.0238, "step": 4856 }, { "epoch": 0.6416751989959375, "grad_norm": 0.1537133902311325, "learning_rate": 5.7074911044468696e-05, "loss": 0.0146, "step": 4857 }, { "epoch": 0.6418073124814215, "grad_norm": 0.12296643853187561, "learning_rate": 5.7037377508697774e-05, "loss": 0.0086, "step": 4858 }, { "epoch": 0.6419394259669056, "grad_norm": 0.19924704730510712, "learning_rate": 5.6999851393916645e-05, "loss": 0.0096, "step": 4859 }, { "epoch": 0.6420715394523896, "grad_norm": 0.15559297800064087, "learning_rate": 5.696233270660716e-05, "loss": 0.0148, "step": 4860 }, { "epoch": 0.6422036529378736, "grad_norm": 0.2160009890794754, "learning_rate": 5.692482145325002e-05, "loss": 0.0216, "step": 4861 }, { "epoch": 0.6423357664233577, "grad_norm": 0.2480064034461975, "learning_rate": 5.688731764032458e-05, "loss": 0.0223, "step": 4862 }, { "epoch": 0.6424678799088417, "grad_norm": 0.1594134420156479, "learning_rate": 5.684982127430895e-05, "loss": 0.0184, "step": 4863 }, { "epoch": 0.6425999933943257, "grad_norm": 0.25493767857551575, "learning_rate": 5.681233236167989e-05, "loss": 0.0167, "step": 4864 }, { "epoch": 0.6427321068798098, "grad_norm": 0.1988016664981842, "learning_rate": 5.6774850908912926e-05, "loss": 0.0168, "step": 4865 }, { "epoch": 0.6428642203652938, "grad_norm": 0.19576209783554077, "learning_rate": 5.6737376922482296e-05, "loss": 0.0172, "step": 4866 }, { "epoch": 0.6429963338507778, "grad_norm": 0.16945160925388336, "learning_rate": 5.669991040886088e-05, "loss": 0.0156, "step": 4867 }, { "epoch": 0.6431284473362618, "grad_norm": 0.147054985165596, "learning_rate": 5.666245137452034e-05, "loss": 0.0107, "step": 4868 }, { "epoch": 0.6432605608217459, "grad_norm": 0.1534264087677002, "learning_rate": 5.662499982593104e-05, "loss": 0.0215, "step": 4869 }, { "epoch": 0.6433926743072299, "grad_norm": 0.15351586043834686, "learning_rate": 5.6587555769562064e-05, "loss": 0.0116, "step": 4870 }, { "epoch": 0.6435247877927139, "grad_norm": 0.10792558640241623, "learning_rate": 5.6550119211881095e-05, "loss": 0.0099, "step": 4871 }, { "epoch": 0.643656901278198, "grad_norm": 0.20718395709991455, "learning_rate": 5.651269015935463e-05, "loss": 0.022, "step": 4872 }, { "epoch": 0.643789014763682, "grad_norm": 0.15308907628059387, "learning_rate": 5.6475268618447896e-05, "loss": 0.0166, "step": 4873 }, { "epoch": 0.643921128249166, "grad_norm": 0.13652434945106506, "learning_rate": 5.643785459562466e-05, "loss": 0.0148, "step": 4874 }, { "epoch": 0.6440532417346501, "grad_norm": 0.14849905669689178, "learning_rate": 5.640044809734756e-05, "loss": 0.0182, "step": 4875 }, { "epoch": 0.6441853552201341, "grad_norm": 0.14648254215717316, "learning_rate": 5.636304913007786e-05, "loss": 0.0115, "step": 4876 }, { "epoch": 0.6443174687056181, "grad_norm": 0.21516041457653046, "learning_rate": 5.6325657700275555e-05, "loss": 0.0135, "step": 4877 }, { "epoch": 0.6444495821911022, "grad_norm": 0.1507827490568161, "learning_rate": 5.6288273814399276e-05, "loss": 0.0209, "step": 4878 }, { "epoch": 0.6445816956765862, "grad_norm": 0.17382963001728058, "learning_rate": 5.6250897478906396e-05, "loss": 0.0056, "step": 4879 }, { "epoch": 0.6447138091620702, "grad_norm": 0.19124169647693634, "learning_rate": 5.621352870025302e-05, "loss": 0.021, "step": 4880 }, { "epoch": 0.6448459226475542, "grad_norm": 0.1883854866027832, "learning_rate": 5.617616748489384e-05, "loss": 0.0276, "step": 4881 }, { "epoch": 0.6449780361330383, "grad_norm": 0.15622587502002716, "learning_rate": 5.6138813839282346e-05, "loss": 0.0125, "step": 4882 }, { "epoch": 0.6451101496185223, "grad_norm": 0.23187081515789032, "learning_rate": 5.6101467769870666e-05, "loss": 0.0176, "step": 4883 }, { "epoch": 0.6452422631040063, "grad_norm": 0.25262168049812317, "learning_rate": 5.606412928310969e-05, "loss": 0.0224, "step": 4884 }, { "epoch": 0.6453743765894904, "grad_norm": 0.1560918241739273, "learning_rate": 5.6026798385448866e-05, "loss": 0.023, "step": 4885 }, { "epoch": 0.6455064900749744, "grad_norm": 0.16323047876358032, "learning_rate": 5.598947508333643e-05, "loss": 0.0235, "step": 4886 }, { "epoch": 0.6456386035604584, "grad_norm": 0.16999828815460205, "learning_rate": 5.595215938321934e-05, "loss": 0.0088, "step": 4887 }, { "epoch": 0.6457707170459425, "grad_norm": 0.12709566950798035, "learning_rate": 5.5914851291543104e-05, "loss": 0.0105, "step": 4888 }, { "epoch": 0.6459028305314265, "grad_norm": 0.17557081580162048, "learning_rate": 5.587755081475203e-05, "loss": 0.0171, "step": 4889 }, { "epoch": 0.6460349440169105, "grad_norm": 0.14582808315753937, "learning_rate": 5.584025795928909e-05, "loss": 0.0136, "step": 4890 }, { "epoch": 0.6461670575023946, "grad_norm": 0.20256875455379486, "learning_rate": 5.580297273159596e-05, "loss": 0.0205, "step": 4891 }, { "epoch": 0.6462991709878786, "grad_norm": 0.2181100994348526, "learning_rate": 5.5765695138112896e-05, "loss": 0.0252, "step": 4892 }, { "epoch": 0.6464312844733626, "grad_norm": 0.17208850383758545, "learning_rate": 5.572842518527892e-05, "loss": 0.0211, "step": 4893 }, { "epoch": 0.6465633979588467, "grad_norm": 0.1517520546913147, "learning_rate": 5.5691162879531774e-05, "loss": 0.0148, "step": 4894 }, { "epoch": 0.6466955114443307, "grad_norm": 0.13908398151397705, "learning_rate": 5.5653908227307764e-05, "loss": 0.0136, "step": 4895 }, { "epoch": 0.6468276249298147, "grad_norm": 0.20299845933914185, "learning_rate": 5.5616661235041945e-05, "loss": 0.0157, "step": 4896 }, { "epoch": 0.6469597384152987, "grad_norm": 0.12722185254096985, "learning_rate": 5.557942190916805e-05, "loss": 0.0079, "step": 4897 }, { "epoch": 0.6470918519007828, "grad_norm": 0.27305638790130615, "learning_rate": 5.554219025611853e-05, "loss": 0.0303, "step": 4898 }, { "epoch": 0.6472239653862668, "grad_norm": 0.19381427764892578, "learning_rate": 5.550496628232435e-05, "loss": 0.022, "step": 4899 }, { "epoch": 0.6473560788717508, "grad_norm": 0.17927169799804688, "learning_rate": 5.5467749994215315e-05, "loss": 0.0175, "step": 4900 }, { "epoch": 0.6474881923572349, "grad_norm": 0.10161993652582169, "learning_rate": 5.543054139821986e-05, "loss": 0.0061, "step": 4901 }, { "epoch": 0.6476203058427189, "grad_norm": 0.14280255138874054, "learning_rate": 5.539334050076503e-05, "loss": 0.0125, "step": 4902 }, { "epoch": 0.6477524193282029, "grad_norm": 0.22020979225635529, "learning_rate": 5.535614730827656e-05, "loss": 0.0193, "step": 4903 }, { "epoch": 0.647884532813687, "grad_norm": 0.17913341522216797, "learning_rate": 5.531896182717901e-05, "loss": 0.0127, "step": 4904 }, { "epoch": 0.648016646299171, "grad_norm": 0.10980281233787537, "learning_rate": 5.528178406389535e-05, "loss": 0.0091, "step": 4905 }, { "epoch": 0.648148759784655, "grad_norm": 0.1731892079114914, "learning_rate": 5.5244614024847374e-05, "loss": 0.0151, "step": 4906 }, { "epoch": 0.6482808732701391, "grad_norm": 0.15978483855724335, "learning_rate": 5.520745171645556e-05, "loss": 0.0192, "step": 4907 }, { "epoch": 0.6484129867556231, "grad_norm": 0.15524441003799438, "learning_rate": 5.517029714513893e-05, "loss": 0.0138, "step": 4908 }, { "epoch": 0.6485451002411071, "grad_norm": 0.11588185280561447, "learning_rate": 5.513315031731527e-05, "loss": 0.0107, "step": 4909 }, { "epoch": 0.6486772137265912, "grad_norm": 0.13747727870941162, "learning_rate": 5.509601123940103e-05, "loss": 0.0171, "step": 4910 }, { "epoch": 0.6488093272120752, "grad_norm": 0.14048703014850616, "learning_rate": 5.5058879917811276e-05, "loss": 0.0212, "step": 4911 }, { "epoch": 0.6489414406975592, "grad_norm": 0.17134098708629608, "learning_rate": 5.502175635895972e-05, "loss": 0.017, "step": 4912 }, { "epoch": 0.6490735541830432, "grad_norm": 0.1490405946969986, "learning_rate": 5.498464056925876e-05, "loss": 0.0183, "step": 4913 }, { "epoch": 0.6492056676685273, "grad_norm": 0.2269468903541565, "learning_rate": 5.494753255511953e-05, "loss": 0.0236, "step": 4914 }, { "epoch": 0.6493377811540113, "grad_norm": 0.16695119440555573, "learning_rate": 5.4910432322951656e-05, "loss": 0.0218, "step": 4915 }, { "epoch": 0.6494698946394953, "grad_norm": 0.14104008674621582, "learning_rate": 5.4873339879163545e-05, "loss": 0.0114, "step": 4916 }, { "epoch": 0.6496020081249794, "grad_norm": 0.17982810735702515, "learning_rate": 5.483625523016223e-05, "loss": 0.0179, "step": 4917 }, { "epoch": 0.6497341216104634, "grad_norm": 0.18194417655467987, "learning_rate": 5.4799178382353425e-05, "loss": 0.018, "step": 4918 }, { "epoch": 0.6498662350959474, "grad_norm": 0.22586305439472198, "learning_rate": 5.476210934214137e-05, "loss": 0.017, "step": 4919 }, { "epoch": 0.6499983485814315, "grad_norm": 0.4077160060405731, "learning_rate": 5.47250481159291e-05, "loss": 0.0202, "step": 4920 }, { "epoch": 0.6501304620669155, "grad_norm": 0.1317765861749649, "learning_rate": 5.46879947101183e-05, "loss": 0.0161, "step": 4921 }, { "epoch": 0.6502625755523995, "grad_norm": 0.12023100256919861, "learning_rate": 5.465094913110915e-05, "loss": 0.0144, "step": 4922 }, { "epoch": 0.6503946890378836, "grad_norm": 0.3004123866558075, "learning_rate": 5.461391138530065e-05, "loss": 0.0153, "step": 4923 }, { "epoch": 0.6505268025233676, "grad_norm": 0.11634991317987442, "learning_rate": 5.457688147909036e-05, "loss": 0.0116, "step": 4924 }, { "epoch": 0.6506589160088516, "grad_norm": 0.12899671494960785, "learning_rate": 5.453985941887454e-05, "loss": 0.0112, "step": 4925 }, { "epoch": 0.6507910294943356, "grad_norm": 0.2067309319972992, "learning_rate": 5.450284521104798e-05, "loss": 0.0135, "step": 4926 }, { "epoch": 0.6509231429798197, "grad_norm": 0.15902547538280487, "learning_rate": 5.446583886200425e-05, "loss": 0.0145, "step": 4927 }, { "epoch": 0.6510552564653037, "grad_norm": 0.17149271070957184, "learning_rate": 5.4428840378135524e-05, "loss": 0.0241, "step": 4928 }, { "epoch": 0.6511873699507877, "grad_norm": 0.14578677713871002, "learning_rate": 5.439184976583254e-05, "loss": 0.0195, "step": 4929 }, { "epoch": 0.6513194834362718, "grad_norm": 0.1228649690747261, "learning_rate": 5.4354867031484736e-05, "loss": 0.0105, "step": 4930 }, { "epoch": 0.6514515969217558, "grad_norm": 0.210816890001297, "learning_rate": 5.4317892181480226e-05, "loss": 0.0296, "step": 4931 }, { "epoch": 0.6515837104072398, "grad_norm": 0.18472038209438324, "learning_rate": 5.428092522220576e-05, "loss": 0.0265, "step": 4932 }, { "epoch": 0.6517158238927239, "grad_norm": 0.22026722133159637, "learning_rate": 5.424396616004659e-05, "loss": 0.0312, "step": 4933 }, { "epoch": 0.6518479373782079, "grad_norm": 0.17818962037563324, "learning_rate": 5.420701500138674e-05, "loss": 0.0176, "step": 4934 }, { "epoch": 0.6519800508636919, "grad_norm": 0.15920813381671906, "learning_rate": 5.417007175260891e-05, "loss": 0.0188, "step": 4935 }, { "epoch": 0.652112164349176, "grad_norm": 0.19341981410980225, "learning_rate": 5.4133136420094234e-05, "loss": 0.015, "step": 4936 }, { "epoch": 0.65224427783466, "grad_norm": 0.24290108680725098, "learning_rate": 5.4096209010222654e-05, "loss": 0.0303, "step": 4937 }, { "epoch": 0.652376391320144, "grad_norm": 0.13119028508663177, "learning_rate": 5.4059289529372704e-05, "loss": 0.0143, "step": 4938 }, { "epoch": 0.652508504805628, "grad_norm": 0.1338653862476349, "learning_rate": 5.402237798392156e-05, "loss": 0.0141, "step": 4939 }, { "epoch": 0.6526406182911121, "grad_norm": 0.18375612795352936, "learning_rate": 5.398547438024492e-05, "loss": 0.0281, "step": 4940 }, { "epoch": 0.6527727317765961, "grad_norm": 0.17538763582706451, "learning_rate": 5.3948578724717236e-05, "loss": 0.0113, "step": 4941 }, { "epoch": 0.6529048452620801, "grad_norm": 0.12488379329442978, "learning_rate": 5.3911691023711565e-05, "loss": 0.0129, "step": 4942 }, { "epoch": 0.6530369587475642, "grad_norm": 0.22224709391593933, "learning_rate": 5.3874811283599524e-05, "loss": 0.0118, "step": 4943 }, { "epoch": 0.6531690722330482, "grad_norm": 0.10177693516016006, "learning_rate": 5.383793951075141e-05, "loss": 0.0087, "step": 4944 }, { "epoch": 0.6533011857185322, "grad_norm": 0.15108650922775269, "learning_rate": 5.380107571153614e-05, "loss": 0.0117, "step": 4945 }, { "epoch": 0.6534332992040163, "grad_norm": 0.15009194612503052, "learning_rate": 5.37642198923213e-05, "loss": 0.0132, "step": 4946 }, { "epoch": 0.6535654126895003, "grad_norm": 0.35668736696243286, "learning_rate": 5.3727372059472934e-05, "loss": 0.0265, "step": 4947 }, { "epoch": 0.6536975261749843, "grad_norm": 0.16157856583595276, "learning_rate": 5.369053221935588e-05, "loss": 0.0146, "step": 4948 }, { "epoch": 0.6538296396604684, "grad_norm": 0.15010638535022736, "learning_rate": 5.365370037833357e-05, "loss": 0.0113, "step": 4949 }, { "epoch": 0.6539617531459524, "grad_norm": 0.1262449473142624, "learning_rate": 5.3616876542767924e-05, "loss": 0.0156, "step": 4950 }, { "epoch": 0.6540938666314364, "grad_norm": 0.10484588891267776, "learning_rate": 5.358006071901962e-05, "loss": 0.0168, "step": 4951 }, { "epoch": 0.6542259801169205, "grad_norm": 0.15599007904529572, "learning_rate": 5.3543252913447894e-05, "loss": 0.0198, "step": 4952 }, { "epoch": 0.6543580936024045, "grad_norm": 0.24297498166561127, "learning_rate": 5.350645313241066e-05, "loss": 0.0242, "step": 4953 }, { "epoch": 0.6544902070878885, "grad_norm": 0.20962654054164886, "learning_rate": 5.34696613822643e-05, "loss": 0.0198, "step": 4954 }, { "epoch": 0.6546223205733726, "grad_norm": 0.13655038177967072, "learning_rate": 5.3432877669363956e-05, "loss": 0.0158, "step": 4955 }, { "epoch": 0.6547544340588566, "grad_norm": 0.12744346261024475, "learning_rate": 5.339610200006334e-05, "loss": 0.0131, "step": 4956 }, { "epoch": 0.6548865475443406, "grad_norm": 0.09227416664361954, "learning_rate": 5.335933438071471e-05, "loss": 0.0038, "step": 4957 }, { "epoch": 0.6550186610298246, "grad_norm": 0.29073062539100647, "learning_rate": 5.3322574817669004e-05, "loss": 0.0217, "step": 4958 }, { "epoch": 0.6551507745153087, "grad_norm": 0.11703348159790039, "learning_rate": 5.328582331727576e-05, "loss": 0.0094, "step": 4959 }, { "epoch": 0.6552828880007927, "grad_norm": 0.17218007147312164, "learning_rate": 5.324907988588316e-05, "loss": 0.0183, "step": 4960 }, { "epoch": 0.6554150014862767, "grad_norm": 0.16797365248203278, "learning_rate": 5.321234452983786e-05, "loss": 0.0199, "step": 4961 }, { "epoch": 0.6555471149717608, "grad_norm": 0.117110975086689, "learning_rate": 5.317561725548518e-05, "loss": 0.0116, "step": 4962 }, { "epoch": 0.6556792284572448, "grad_norm": 0.23052513599395752, "learning_rate": 5.313889806916921e-05, "loss": 0.0239, "step": 4963 }, { "epoch": 0.6558113419427288, "grad_norm": 0.11556703597307205, "learning_rate": 5.310218697723239e-05, "loss": 0.009, "step": 4964 }, { "epoch": 0.6559434554282129, "grad_norm": 0.22753220796585083, "learning_rate": 5.306548398601592e-05, "loss": 0.0213, "step": 4965 }, { "epoch": 0.6560755689136969, "grad_norm": 0.2390792965888977, "learning_rate": 5.302878910185958e-05, "loss": 0.0215, "step": 4966 }, { "epoch": 0.6562076823991809, "grad_norm": 0.22379370033740997, "learning_rate": 5.299210233110163e-05, "loss": 0.0228, "step": 4967 }, { "epoch": 0.656339795884665, "grad_norm": 0.18010953068733215, "learning_rate": 5.295542368007911e-05, "loss": 0.0236, "step": 4968 }, { "epoch": 0.656471909370149, "grad_norm": 0.13127058744430542, "learning_rate": 5.291875315512753e-05, "loss": 0.0139, "step": 4969 }, { "epoch": 0.656604022855633, "grad_norm": 0.15722498297691345, "learning_rate": 5.288209076258109e-05, "loss": 0.0167, "step": 4970 }, { "epoch": 0.656736136341117, "grad_norm": 0.16607514023780823, "learning_rate": 5.284543650877246e-05, "loss": 0.0175, "step": 4971 }, { "epoch": 0.6568682498266011, "grad_norm": 0.17862720787525177, "learning_rate": 5.2808790400033015e-05, "loss": 0.0176, "step": 4972 }, { "epoch": 0.6570003633120851, "grad_norm": 0.198783740401268, "learning_rate": 5.2772152442692715e-05, "loss": 0.0146, "step": 4973 }, { "epoch": 0.6571324767975691, "grad_norm": 0.1337892860174179, "learning_rate": 5.2735522643080014e-05, "loss": 0.0181, "step": 4974 }, { "epoch": 0.6572645902830532, "grad_norm": 0.11915634572505951, "learning_rate": 5.269890100752205e-05, "loss": 0.0139, "step": 4975 }, { "epoch": 0.6573967037685372, "grad_norm": 0.1657743602991104, "learning_rate": 5.266228754234455e-05, "loss": 0.0183, "step": 4976 }, { "epoch": 0.6575288172540212, "grad_norm": 0.16712002456188202, "learning_rate": 5.262568225387181e-05, "loss": 0.0168, "step": 4977 }, { "epoch": 0.6576609307395053, "grad_norm": 0.19823117554187775, "learning_rate": 5.258908514842667e-05, "loss": 0.0264, "step": 4978 }, { "epoch": 0.6577930442249893, "grad_norm": 0.2605159282684326, "learning_rate": 5.2552496232330605e-05, "loss": 0.0134, "step": 4979 }, { "epoch": 0.6579251577104733, "grad_norm": 0.188632994890213, "learning_rate": 5.2515915511903715e-05, "loss": 0.0214, "step": 4980 }, { "epoch": 0.6580572711959574, "grad_norm": 0.1341782957315445, "learning_rate": 5.247934299346455e-05, "loss": 0.0196, "step": 4981 }, { "epoch": 0.6581893846814414, "grad_norm": 0.27716678380966187, "learning_rate": 5.2442778683330405e-05, "loss": 0.0185, "step": 4982 }, { "epoch": 0.6583214981669254, "grad_norm": 0.3105461597442627, "learning_rate": 5.240622258781702e-05, "loss": 0.0315, "step": 4983 }, { "epoch": 0.6584536116524095, "grad_norm": 0.3102239668369293, "learning_rate": 5.236967471323888e-05, "loss": 0.0194, "step": 4984 }, { "epoch": 0.6585857251378935, "grad_norm": 0.14362885057926178, "learning_rate": 5.233313506590881e-05, "loss": 0.0098, "step": 4985 }, { "epoch": 0.6587178386233775, "grad_norm": 0.21245473623275757, "learning_rate": 5.229660365213843e-05, "loss": 0.0147, "step": 4986 }, { "epoch": 0.6588499521088615, "grad_norm": 0.10704601556062698, "learning_rate": 5.226008047823788e-05, "loss": 0.0086, "step": 4987 }, { "epoch": 0.6589820655943456, "grad_norm": 0.26376500725746155, "learning_rate": 5.222356555051579e-05, "loss": 0.0234, "step": 4988 }, { "epoch": 0.6591141790798296, "grad_norm": 0.15192173421382904, "learning_rate": 5.218705887527946e-05, "loss": 0.0186, "step": 4989 }, { "epoch": 0.6592462925653136, "grad_norm": 0.1726227104663849, "learning_rate": 5.215056045883473e-05, "loss": 0.0267, "step": 4990 }, { "epoch": 0.6593784060507977, "grad_norm": 0.1516808718442917, "learning_rate": 5.211407030748607e-05, "loss": 0.0137, "step": 4991 }, { "epoch": 0.6595105195362817, "grad_norm": 0.07529601454734802, "learning_rate": 5.207758842753638e-05, "loss": 0.0056, "step": 4992 }, { "epoch": 0.6596426330217657, "grad_norm": 0.140614315867424, "learning_rate": 5.2041114825287284e-05, "loss": 0.0118, "step": 4993 }, { "epoch": 0.6597747465072498, "grad_norm": 0.22331038117408752, "learning_rate": 5.200464950703894e-05, "loss": 0.0119, "step": 4994 }, { "epoch": 0.6599068599927338, "grad_norm": 0.16355013847351074, "learning_rate": 5.196819247908997e-05, "loss": 0.0142, "step": 4995 }, { "epoch": 0.6600389734782178, "grad_norm": 0.1185201033949852, "learning_rate": 5.193174374773768e-05, "loss": 0.0069, "step": 4996 }, { "epoch": 0.6601710869637019, "grad_norm": 0.1419517993927002, "learning_rate": 5.189530331927792e-05, "loss": 0.017, "step": 4997 }, { "epoch": 0.6603032004491859, "grad_norm": 0.16416120529174805, "learning_rate": 5.185887120000512e-05, "loss": 0.02, "step": 4998 }, { "epoch": 0.6604353139346699, "grad_norm": 0.5249677896499634, "learning_rate": 5.182244739621218e-05, "loss": 0.0137, "step": 4999 }, { "epoch": 0.660567427420154, "grad_norm": 0.11340130120515823, "learning_rate": 5.178603191419066e-05, "loss": 0.0063, "step": 5000 }, { "epoch": 0.660699540905638, "grad_norm": 0.17886826395988464, "learning_rate": 5.174962476023069e-05, "loss": 0.0144, "step": 5001 }, { "epoch": 0.660831654391122, "grad_norm": 0.1607665717601776, "learning_rate": 5.171322594062086e-05, "loss": 0.0179, "step": 5002 }, { "epoch": 0.660963767876606, "grad_norm": 0.20411817729473114, "learning_rate": 5.167683546164841e-05, "loss": 0.0207, "step": 5003 }, { "epoch": 0.6610958813620901, "grad_norm": 0.14540739357471466, "learning_rate": 5.164045332959913e-05, "loss": 0.0102, "step": 5004 }, { "epoch": 0.6612279948475741, "grad_norm": 0.17851538956165314, "learning_rate": 5.160407955075739e-05, "loss": 0.0205, "step": 5005 }, { "epoch": 0.6613601083330581, "grad_norm": 0.1295069456100464, "learning_rate": 5.1567714131405984e-05, "loss": 0.016, "step": 5006 }, { "epoch": 0.6614922218185422, "grad_norm": 0.15553872287273407, "learning_rate": 5.153135707782641e-05, "loss": 0.0183, "step": 5007 }, { "epoch": 0.6616243353040262, "grad_norm": 0.0835447609424591, "learning_rate": 5.1495008396298726e-05, "loss": 0.0067, "step": 5008 }, { "epoch": 0.6617564487895102, "grad_norm": 0.09932634234428406, "learning_rate": 5.1458668093101384e-05, "loss": 0.0086, "step": 5009 }, { "epoch": 0.6618885622749943, "grad_norm": 0.09297791868448257, "learning_rate": 5.142233617451153e-05, "loss": 0.0104, "step": 5010 }, { "epoch": 0.6620206757604783, "grad_norm": 0.16000889241695404, "learning_rate": 5.1386012646804826e-05, "loss": 0.0157, "step": 5011 }, { "epoch": 0.6621527892459623, "grad_norm": 0.1270449310541153, "learning_rate": 5.1349697516255535e-05, "loss": 0.0186, "step": 5012 }, { "epoch": 0.6622849027314464, "grad_norm": 0.18740740418434143, "learning_rate": 5.131339078913634e-05, "loss": 0.0265, "step": 5013 }, { "epoch": 0.6624170162169304, "grad_norm": 0.13335908949375153, "learning_rate": 5.1277092471718566e-05, "loss": 0.014, "step": 5014 }, { "epoch": 0.6625491297024144, "grad_norm": 0.2011447548866272, "learning_rate": 5.1240802570272126e-05, "loss": 0.0221, "step": 5015 }, { "epoch": 0.6626812431878984, "grad_norm": 0.07277203351259232, "learning_rate": 5.120452109106535e-05, "loss": 0.0081, "step": 5016 }, { "epoch": 0.6628133566733825, "grad_norm": 0.172392338514328, "learning_rate": 5.11682480403652e-05, "loss": 0.0159, "step": 5017 }, { "epoch": 0.6629454701588665, "grad_norm": 0.2753337323665619, "learning_rate": 5.113198342443719e-05, "loss": 0.0284, "step": 5018 }, { "epoch": 0.6630775836443505, "grad_norm": 0.16935433447360992, "learning_rate": 5.109572724954538e-05, "loss": 0.0175, "step": 5019 }, { "epoch": 0.6632096971298346, "grad_norm": 0.1387585699558258, "learning_rate": 5.105947952195227e-05, "loss": 0.013, "step": 5020 }, { "epoch": 0.6633418106153186, "grad_norm": 0.23182858526706696, "learning_rate": 5.102324024791902e-05, "loss": 0.0269, "step": 5021 }, { "epoch": 0.6634739241008026, "grad_norm": 0.2846618890762329, "learning_rate": 5.098700943370528e-05, "loss": 0.0264, "step": 5022 }, { "epoch": 0.6636060375862867, "grad_norm": 0.21124884486198425, "learning_rate": 5.0950787085569265e-05, "loss": 0.0186, "step": 5023 }, { "epoch": 0.6637381510717707, "grad_norm": 0.21159759163856506, "learning_rate": 5.091457320976767e-05, "loss": 0.0335, "step": 5024 }, { "epoch": 0.6638702645572547, "grad_norm": 0.2589932084083557, "learning_rate": 5.087836781255585e-05, "loss": 0.0141, "step": 5025 }, { "epoch": 0.6640023780427388, "grad_norm": 0.2727756202220917, "learning_rate": 5.08421709001875e-05, "loss": 0.0202, "step": 5026 }, { "epoch": 0.6641344915282228, "grad_norm": 0.09600035101175308, "learning_rate": 5.0805982478915015e-05, "loss": 0.009, "step": 5027 }, { "epoch": 0.6642666050137068, "grad_norm": 0.18655623495578766, "learning_rate": 5.076980255498931e-05, "loss": 0.0134, "step": 5028 }, { "epoch": 0.6643987184991909, "grad_norm": 0.13871648907661438, "learning_rate": 5.073363113465969e-05, "loss": 0.0108, "step": 5029 }, { "epoch": 0.6645308319846749, "grad_norm": 0.1454835832118988, "learning_rate": 5.069746822417415e-05, "loss": 0.0173, "step": 5030 }, { "epoch": 0.6646629454701589, "grad_norm": 0.18189920485019684, "learning_rate": 5.066131382977914e-05, "loss": 0.014, "step": 5031 }, { "epoch": 0.664795058955643, "grad_norm": 0.19891028106212616, "learning_rate": 5.0625167957719724e-05, "loss": 0.0279, "step": 5032 }, { "epoch": 0.664927172441127, "grad_norm": 0.1992846429347992, "learning_rate": 5.058903061423932e-05, "loss": 0.0164, "step": 5033 }, { "epoch": 0.665059285926611, "grad_norm": 0.13385896384716034, "learning_rate": 5.0552901805580034e-05, "loss": 0.0189, "step": 5034 }, { "epoch": 0.665191399412095, "grad_norm": 0.5015331506729126, "learning_rate": 5.051678153798247e-05, "loss": 0.0159, "step": 5035 }, { "epoch": 0.6653235128975791, "grad_norm": 0.21186839044094086, "learning_rate": 5.0480669817685656e-05, "loss": 0.0191, "step": 5036 }, { "epoch": 0.6654556263830631, "grad_norm": 0.10766605287790298, "learning_rate": 5.044456665092725e-05, "loss": 0.01, "step": 5037 }, { "epoch": 0.6655877398685471, "grad_norm": 0.11912591010332108, "learning_rate": 5.040847204394341e-05, "loss": 0.0079, "step": 5038 }, { "epoch": 0.6657198533540312, "grad_norm": 0.15789036452770233, "learning_rate": 5.037238600296883e-05, "loss": 0.0084, "step": 5039 }, { "epoch": 0.6658519668395152, "grad_norm": 0.14228802919387817, "learning_rate": 5.033630853423663e-05, "loss": 0.0163, "step": 5040 }, { "epoch": 0.6659840803249992, "grad_norm": 0.19446247816085815, "learning_rate": 5.030023964397857e-05, "loss": 0.0208, "step": 5041 }, { "epoch": 0.6661161938104833, "grad_norm": 0.15082289278507233, "learning_rate": 5.026417933842489e-05, "loss": 0.0172, "step": 5042 }, { "epoch": 0.6662483072959673, "grad_norm": 0.24349327385425568, "learning_rate": 5.0228127623804266e-05, "loss": 0.0171, "step": 5043 }, { "epoch": 0.6663804207814513, "grad_norm": 0.1227944940328598, "learning_rate": 5.019208450634398e-05, "loss": 0.0157, "step": 5044 }, { "epoch": 0.6665125342669354, "grad_norm": 0.16644077003002167, "learning_rate": 5.015604999226985e-05, "loss": 0.0138, "step": 5045 }, { "epoch": 0.6666446477524194, "grad_norm": 0.15668226778507233, "learning_rate": 5.012002408780616e-05, "loss": 0.014, "step": 5046 }, { "epoch": 0.6667767612379033, "grad_norm": 0.18227237462997437, "learning_rate": 5.008400679917567e-05, "loss": 0.0228, "step": 5047 }, { "epoch": 0.6669088747233873, "grad_norm": 0.18179632723331451, "learning_rate": 5.004799813259968e-05, "loss": 0.0189, "step": 5048 }, { "epoch": 0.6670409882088714, "grad_norm": 0.20042505860328674, "learning_rate": 5.001199809429811e-05, "loss": 0.0206, "step": 5049 }, { "epoch": 0.6671731016943554, "grad_norm": 0.11901211738586426, "learning_rate": 4.9976006690489184e-05, "loss": 0.0165, "step": 5050 }, { "epoch": 0.6673052151798394, "grad_norm": 0.1777932196855545, "learning_rate": 4.9940023927389786e-05, "loss": 0.0181, "step": 5051 }, { "epoch": 0.6674373286653235, "grad_norm": 0.16570496559143066, "learning_rate": 4.990404981121528e-05, "loss": 0.0219, "step": 5052 }, { "epoch": 0.6675694421508075, "grad_norm": 0.18598343431949615, "learning_rate": 4.986808434817954e-05, "loss": 0.0238, "step": 5053 }, { "epoch": 0.6677015556362915, "grad_norm": 0.17573808133602142, "learning_rate": 4.983212754449487e-05, "loss": 0.0141, "step": 5054 }, { "epoch": 0.6678336691217756, "grad_norm": 0.1362745463848114, "learning_rate": 4.979617940637216e-05, "loss": 0.021, "step": 5055 }, { "epoch": 0.6679657826072596, "grad_norm": 0.26245835423469543, "learning_rate": 4.976023994002081e-05, "loss": 0.0208, "step": 5056 }, { "epoch": 0.6680978960927436, "grad_norm": 0.15304391086101532, "learning_rate": 4.972430915164864e-05, "loss": 0.0126, "step": 5057 }, { "epoch": 0.6682300095782276, "grad_norm": 0.1346382051706314, "learning_rate": 4.968838704746205e-05, "loss": 0.0115, "step": 5058 }, { "epoch": 0.6683621230637117, "grad_norm": 0.12355558574199677, "learning_rate": 4.9652473633665896e-05, "loss": 0.0085, "step": 5059 }, { "epoch": 0.6684942365491957, "grad_norm": 0.19657886028289795, "learning_rate": 4.96165689164636e-05, "loss": 0.018, "step": 5060 }, { "epoch": 0.6686263500346797, "grad_norm": 0.13189183175563812, "learning_rate": 4.9580672902056954e-05, "loss": 0.0146, "step": 5061 }, { "epoch": 0.6687584635201638, "grad_norm": 0.15036815404891968, "learning_rate": 4.954478559664636e-05, "loss": 0.0117, "step": 5062 }, { "epoch": 0.6688905770056478, "grad_norm": 0.11169622093439102, "learning_rate": 4.9508907006430724e-05, "loss": 0.0142, "step": 5063 }, { "epoch": 0.6690226904911318, "grad_norm": 0.21585702896118164, "learning_rate": 4.947303713760731e-05, "loss": 0.0162, "step": 5064 }, { "epoch": 0.6691548039766159, "grad_norm": 0.13905102014541626, "learning_rate": 4.943717599637202e-05, "loss": 0.014, "step": 5065 }, { "epoch": 0.6692869174620999, "grad_norm": 0.16844500601291656, "learning_rate": 4.940132358891919e-05, "loss": 0.0233, "step": 5066 }, { "epoch": 0.6694190309475839, "grad_norm": 0.18952059745788574, "learning_rate": 4.9365479921441684e-05, "loss": 0.0144, "step": 5067 }, { "epoch": 0.669551144433068, "grad_norm": 0.2647591233253479, "learning_rate": 4.932964500013077e-05, "loss": 0.0269, "step": 5068 }, { "epoch": 0.669683257918552, "grad_norm": 0.13148827850818634, "learning_rate": 4.929381883117626e-05, "loss": 0.0242, "step": 5069 }, { "epoch": 0.669815371404036, "grad_norm": 0.19894959032535553, "learning_rate": 4.925800142076654e-05, "loss": 0.0215, "step": 5070 }, { "epoch": 0.66994748488952, "grad_norm": 0.15500599145889282, "learning_rate": 4.9222192775088296e-05, "loss": 0.0138, "step": 5071 }, { "epoch": 0.6700795983750041, "grad_norm": 0.22023585438728333, "learning_rate": 4.9186392900326836e-05, "loss": 0.0249, "step": 5072 }, { "epoch": 0.6702117118604881, "grad_norm": 0.14289738237857819, "learning_rate": 4.915060180266593e-05, "loss": 0.0114, "step": 5073 }, { "epoch": 0.6703438253459721, "grad_norm": 0.15781345963478088, "learning_rate": 4.9114819488287855e-05, "loss": 0.0175, "step": 5074 }, { "epoch": 0.6704759388314562, "grad_norm": 0.2148284912109375, "learning_rate": 4.907904596337326e-05, "loss": 0.0161, "step": 5075 }, { "epoch": 0.6706080523169402, "grad_norm": 0.2217796891927719, "learning_rate": 4.90432812341014e-05, "loss": 0.0141, "step": 5076 }, { "epoch": 0.6707401658024242, "grad_norm": 0.4486676752567291, "learning_rate": 4.900752530664998e-05, "loss": 0.0325, "step": 5077 }, { "epoch": 0.6708722792879083, "grad_norm": 0.15495853126049042, "learning_rate": 4.897177818719512e-05, "loss": 0.0078, "step": 5078 }, { "epoch": 0.6710043927733923, "grad_norm": 0.22847874462604523, "learning_rate": 4.893603988191145e-05, "loss": 0.0205, "step": 5079 }, { "epoch": 0.6711365062588763, "grad_norm": 0.12668964266777039, "learning_rate": 4.890031039697219e-05, "loss": 0.0138, "step": 5080 }, { "epoch": 0.6712686197443604, "grad_norm": 0.16602325439453125, "learning_rate": 4.886458973854886e-05, "loss": 0.0205, "step": 5081 }, { "epoch": 0.6714007332298444, "grad_norm": 0.18135517835617065, "learning_rate": 4.882887791281157e-05, "loss": 0.0346, "step": 5082 }, { "epoch": 0.6715328467153284, "grad_norm": 0.17789128422737122, "learning_rate": 4.8793174925928884e-05, "loss": 0.0207, "step": 5083 }, { "epoch": 0.6716649602008125, "grad_norm": 0.2250068634748459, "learning_rate": 4.8757480784067764e-05, "loss": 0.0225, "step": 5084 }, { "epoch": 0.6717970736862965, "grad_norm": 0.2617173194885254, "learning_rate": 4.872179549339375e-05, "loss": 0.0336, "step": 5085 }, { "epoch": 0.6719291871717805, "grad_norm": 0.14353157579898834, "learning_rate": 4.86861190600708e-05, "loss": 0.0164, "step": 5086 }, { "epoch": 0.6720613006572645, "grad_norm": 0.17198024690151215, "learning_rate": 4.8650451490261386e-05, "loss": 0.0229, "step": 5087 }, { "epoch": 0.6721934141427486, "grad_norm": 0.15205325186252594, "learning_rate": 4.861479279012635e-05, "loss": 0.0128, "step": 5088 }, { "epoch": 0.6723255276282326, "grad_norm": 0.09790311753749847, "learning_rate": 4.857914296582509e-05, "loss": 0.0117, "step": 5089 }, { "epoch": 0.6724576411137166, "grad_norm": 0.21584492921829224, "learning_rate": 4.85435020235155e-05, "loss": 0.0204, "step": 5090 }, { "epoch": 0.6725897545992007, "grad_norm": 0.10409726947546005, "learning_rate": 4.8507869969353794e-05, "loss": 0.0148, "step": 5091 }, { "epoch": 0.6727218680846847, "grad_norm": 0.14929348230361938, "learning_rate": 4.8472246809494784e-05, "loss": 0.0151, "step": 5092 }, { "epoch": 0.6728539815701687, "grad_norm": 0.17571406066417694, "learning_rate": 4.843663255009171e-05, "loss": 0.025, "step": 5093 }, { "epoch": 0.6729860950556528, "grad_norm": 0.13798776268959045, "learning_rate": 4.840102719729631e-05, "loss": 0.0161, "step": 5094 }, { "epoch": 0.6731182085411368, "grad_norm": 0.25582829117774963, "learning_rate": 4.836543075725867e-05, "loss": 0.0255, "step": 5095 }, { "epoch": 0.6732503220266208, "grad_norm": 0.14417283236980438, "learning_rate": 4.832984323612744e-05, "loss": 0.02, "step": 5096 }, { "epoch": 0.6733824355121049, "grad_norm": 0.23102858662605286, "learning_rate": 4.829426464004974e-05, "loss": 0.0158, "step": 5097 }, { "epoch": 0.6735145489975889, "grad_norm": 0.16588328778743744, "learning_rate": 4.825869497517102e-05, "loss": 0.0107, "step": 5098 }, { "epoch": 0.6736466624830729, "grad_norm": 0.19536004960536957, "learning_rate": 4.8223134247635316e-05, "loss": 0.0163, "step": 5099 }, { "epoch": 0.673778775968557, "grad_norm": 0.15462182462215424, "learning_rate": 4.818758246358509e-05, "loss": 0.0133, "step": 5100 }, { "epoch": 0.673910889454041, "grad_norm": 0.6555613875389099, "learning_rate": 4.81520396291613e-05, "loss": 0.0187, "step": 5101 }, { "epoch": 0.674043002939525, "grad_norm": 0.1688404679298401, "learning_rate": 4.811650575050318e-05, "loss": 0.0192, "step": 5102 }, { "epoch": 0.674175116425009, "grad_norm": 0.16610851883888245, "learning_rate": 4.808098083374863e-05, "loss": 0.0117, "step": 5103 }, { "epoch": 0.6743072299104931, "grad_norm": 0.1498517096042633, "learning_rate": 4.804546488503393e-05, "loss": 0.0183, "step": 5104 }, { "epoch": 0.6744393433959771, "grad_norm": 0.0877426490187645, "learning_rate": 4.800995791049373e-05, "loss": 0.0105, "step": 5105 }, { "epoch": 0.6745714568814611, "grad_norm": 0.2841498851776123, "learning_rate": 4.797445991626123e-05, "loss": 0.0165, "step": 5106 }, { "epoch": 0.6747035703669452, "grad_norm": 0.11339423060417175, "learning_rate": 4.793897090846803e-05, "loss": 0.0161, "step": 5107 }, { "epoch": 0.6748356838524292, "grad_norm": 0.23054392635822296, "learning_rate": 4.790349089324425e-05, "loss": 0.0185, "step": 5108 }, { "epoch": 0.6749677973379132, "grad_norm": 0.10517584532499313, "learning_rate": 4.786801987671833e-05, "loss": 0.0126, "step": 5109 }, { "epoch": 0.6750999108233973, "grad_norm": 0.19798646867275238, "learning_rate": 4.7832557865017235e-05, "loss": 0.0161, "step": 5110 }, { "epoch": 0.6752320243088813, "grad_norm": 0.2040533870458603, "learning_rate": 4.779710486426643e-05, "loss": 0.0119, "step": 5111 }, { "epoch": 0.6753641377943653, "grad_norm": 0.13550186157226562, "learning_rate": 4.7761660880589666e-05, "loss": 0.0119, "step": 5112 }, { "epoch": 0.6754962512798494, "grad_norm": 0.15335050225257874, "learning_rate": 4.772622592010927e-05, "loss": 0.0151, "step": 5113 }, { "epoch": 0.6756283647653334, "grad_norm": 0.19715610146522522, "learning_rate": 4.7690799988945963e-05, "loss": 0.0148, "step": 5114 }, { "epoch": 0.6757604782508174, "grad_norm": 0.17787174880504608, "learning_rate": 4.765538309321896e-05, "loss": 0.0271, "step": 5115 }, { "epoch": 0.6758925917363015, "grad_norm": 0.22029972076416016, "learning_rate": 4.761997523904579e-05, "loss": 0.0211, "step": 5116 }, { "epoch": 0.6760247052217855, "grad_norm": 0.14438408613204956, "learning_rate": 4.758457643254254e-05, "loss": 0.0102, "step": 5117 }, { "epoch": 0.6761568187072695, "grad_norm": 0.15860889852046967, "learning_rate": 4.754918667982371e-05, "loss": 0.0188, "step": 5118 }, { "epoch": 0.6762889321927535, "grad_norm": 0.17438390851020813, "learning_rate": 4.7513805987002166e-05, "loss": 0.0177, "step": 5119 }, { "epoch": 0.6764210456782376, "grad_norm": 0.1780308187007904, "learning_rate": 4.7478434360189284e-05, "loss": 0.0188, "step": 5120 }, { "epoch": 0.6765531591637216, "grad_norm": 0.16482895612716675, "learning_rate": 4.7443071805494865e-05, "loss": 0.0128, "step": 5121 }, { "epoch": 0.6766852726492056, "grad_norm": 0.1242159903049469, "learning_rate": 4.740771832902715e-05, "loss": 0.0114, "step": 5122 }, { "epoch": 0.6768173861346897, "grad_norm": 0.1562412679195404, "learning_rate": 4.737237393689272e-05, "loss": 0.0166, "step": 5123 }, { "epoch": 0.6769494996201737, "grad_norm": 0.16549445688724518, "learning_rate": 4.7337038635196704e-05, "loss": 0.0183, "step": 5124 }, { "epoch": 0.6770816131056577, "grad_norm": 0.25368818640708923, "learning_rate": 4.730171243004265e-05, "loss": 0.0186, "step": 5125 }, { "epoch": 0.6772137265911418, "grad_norm": 0.10777774453163147, "learning_rate": 4.726639532753243e-05, "loss": 0.0094, "step": 5126 }, { "epoch": 0.6773458400766258, "grad_norm": 0.1838887333869934, "learning_rate": 4.7231087333766435e-05, "loss": 0.024, "step": 5127 }, { "epoch": 0.6774779535621098, "grad_norm": 0.25211283564567566, "learning_rate": 4.719578845484346e-05, "loss": 0.0191, "step": 5128 }, { "epoch": 0.6776100670475939, "grad_norm": 0.09369178861379623, "learning_rate": 4.716049869686078e-05, "loss": 0.0086, "step": 5129 }, { "epoch": 0.6777421805330779, "grad_norm": 0.11376402527093887, "learning_rate": 4.712521806591396e-05, "loss": 0.0086, "step": 5130 }, { "epoch": 0.6778742940185619, "grad_norm": 0.1679966002702713, "learning_rate": 4.70899465680971e-05, "loss": 0.0183, "step": 5131 }, { "epoch": 0.678006407504046, "grad_norm": 0.1598343551158905, "learning_rate": 4.705468420950273e-05, "loss": 0.0149, "step": 5132 }, { "epoch": 0.67813852098953, "grad_norm": 0.23596306145191193, "learning_rate": 4.70194309962217e-05, "loss": 0.0242, "step": 5133 }, { "epoch": 0.678270634475014, "grad_norm": 0.21659186482429504, "learning_rate": 4.698418693434338e-05, "loss": 0.0207, "step": 5134 }, { "epoch": 0.678402747960498, "grad_norm": 0.15793080627918243, "learning_rate": 4.6948952029955506e-05, "loss": 0.0198, "step": 5135 }, { "epoch": 0.6785348614459821, "grad_norm": 0.1283729374408722, "learning_rate": 4.69137262891443e-05, "loss": 0.016, "step": 5136 }, { "epoch": 0.6786669749314661, "grad_norm": 0.11209773272275925, "learning_rate": 4.687850971799427e-05, "loss": 0.0038, "step": 5137 }, { "epoch": 0.6787990884169501, "grad_norm": 0.1797674298286438, "learning_rate": 4.6843302322588423e-05, "loss": 0.0104, "step": 5138 }, { "epoch": 0.6789312019024342, "grad_norm": 0.10780422389507294, "learning_rate": 4.680810410900829e-05, "loss": 0.0146, "step": 5139 }, { "epoch": 0.6790633153879182, "grad_norm": 0.10967176407575607, "learning_rate": 4.6772915083333576e-05, "loss": 0.0128, "step": 5140 }, { "epoch": 0.6791954288734022, "grad_norm": 0.20822446048259735, "learning_rate": 4.67377352516426e-05, "loss": 0.0174, "step": 5141 }, { "epoch": 0.6793275423588863, "grad_norm": 0.13655947148799896, "learning_rate": 4.6702564620012035e-05, "loss": 0.0192, "step": 5142 }, { "epoch": 0.6794596558443703, "grad_norm": 0.12891127169132233, "learning_rate": 4.666740319451687e-05, "loss": 0.0196, "step": 5143 }, { "epoch": 0.6795917693298543, "grad_norm": 0.2711154818534851, "learning_rate": 4.663225098123063e-05, "loss": 0.0046, "step": 5144 }, { "epoch": 0.6797238828153384, "grad_norm": 0.1241656094789505, "learning_rate": 4.659710798622521e-05, "loss": 0.0144, "step": 5145 }, { "epoch": 0.6798559963008224, "grad_norm": 0.12367275357246399, "learning_rate": 4.656197421557092e-05, "loss": 0.0149, "step": 5146 }, { "epoch": 0.6799881097863064, "grad_norm": 0.13578025996685028, "learning_rate": 4.652684967533641e-05, "loss": 0.0122, "step": 5147 }, { "epoch": 0.6801202232717904, "grad_norm": 0.18295130133628845, "learning_rate": 4.649173437158882e-05, "loss": 0.0217, "step": 5148 }, { "epoch": 0.6802523367572745, "grad_norm": 0.0850638747215271, "learning_rate": 4.64566283103937e-05, "loss": 0.0101, "step": 5149 }, { "epoch": 0.6803844502427585, "grad_norm": 0.20700609683990479, "learning_rate": 4.642153149781488e-05, "loss": 0.0186, "step": 5150 }, { "epoch": 0.6805165637282425, "grad_norm": 0.0652320459485054, "learning_rate": 4.638644393991472e-05, "loss": 0.0056, "step": 5151 }, { "epoch": 0.6806486772137266, "grad_norm": 0.12985356152057648, "learning_rate": 4.635136564275395e-05, "loss": 0.0111, "step": 5152 }, { "epoch": 0.6807807906992106, "grad_norm": 0.3793286979198456, "learning_rate": 4.631629661239171e-05, "loss": 0.0326, "step": 5153 }, { "epoch": 0.6809129041846946, "grad_norm": 0.15389122068881989, "learning_rate": 4.6281236854885456e-05, "loss": 0.0194, "step": 5154 }, { "epoch": 0.6810450176701787, "grad_norm": 0.1676003634929657, "learning_rate": 4.624618637629115e-05, "loss": 0.024, "step": 5155 }, { "epoch": 0.6811771311556627, "grad_norm": 0.23593369126319885, "learning_rate": 4.621114518266313e-05, "loss": 0.0212, "step": 5156 }, { "epoch": 0.6813092446411467, "grad_norm": 0.340415894985199, "learning_rate": 4.617611328005403e-05, "loss": 0.0186, "step": 5157 }, { "epoch": 0.6814413581266308, "grad_norm": 0.15453080832958221, "learning_rate": 4.6141090674515006e-05, "loss": 0.0121, "step": 5158 }, { "epoch": 0.6815734716121148, "grad_norm": 0.1813836246728897, "learning_rate": 4.6106077372095556e-05, "loss": 0.0219, "step": 5159 }, { "epoch": 0.6817055850975988, "grad_norm": 0.2668374478816986, "learning_rate": 4.607107337884361e-05, "loss": 0.0169, "step": 5160 }, { "epoch": 0.6818376985830829, "grad_norm": 0.41349849104881287, "learning_rate": 4.603607870080537e-05, "loss": 0.02, "step": 5161 }, { "epoch": 0.6819698120685669, "grad_norm": 0.15275467932224274, "learning_rate": 4.600109334402556e-05, "loss": 0.0151, "step": 5162 }, { "epoch": 0.6821019255540509, "grad_norm": 0.1274024397134781, "learning_rate": 4.596611731454728e-05, "loss": 0.0094, "step": 5163 }, { "epoch": 0.682234039039535, "grad_norm": 0.17443904280662537, "learning_rate": 4.593115061841191e-05, "loss": 0.0151, "step": 5164 }, { "epoch": 0.682366152525019, "grad_norm": 0.1481851190328598, "learning_rate": 4.589619326165932e-05, "loss": 0.014, "step": 5165 }, { "epoch": 0.682498266010503, "grad_norm": 0.15162219107151031, "learning_rate": 4.5861245250327764e-05, "loss": 0.0146, "step": 5166 }, { "epoch": 0.682630379495987, "grad_norm": 0.19407233595848083, "learning_rate": 4.582630659045388e-05, "loss": 0.0161, "step": 5167 }, { "epoch": 0.6827624929814711, "grad_norm": 0.1700238138437271, "learning_rate": 4.57913772880726e-05, "loss": 0.0182, "step": 5168 }, { "epoch": 0.6828946064669551, "grad_norm": 0.12905320525169373, "learning_rate": 4.575645734921733e-05, "loss": 0.0116, "step": 5169 }, { "epoch": 0.6830267199524391, "grad_norm": 0.10565055906772614, "learning_rate": 4.572154677991989e-05, "loss": 0.0068, "step": 5170 }, { "epoch": 0.6831588334379232, "grad_norm": 0.13061608374118805, "learning_rate": 4.568664558621034e-05, "loss": 0.0172, "step": 5171 }, { "epoch": 0.6832909469234072, "grad_norm": 0.11616747826337814, "learning_rate": 4.5651753774117255e-05, "loss": 0.0093, "step": 5172 }, { "epoch": 0.6834230604088912, "grad_norm": 0.16339434683322906, "learning_rate": 4.561687134966755e-05, "loss": 0.0152, "step": 5173 }, { "epoch": 0.6835551738943753, "grad_norm": 0.09892938286066055, "learning_rate": 4.558199831888653e-05, "loss": 0.0128, "step": 5174 }, { "epoch": 0.6836872873798593, "grad_norm": 0.15133710205554962, "learning_rate": 4.554713468779781e-05, "loss": 0.0112, "step": 5175 }, { "epoch": 0.6838194008653433, "grad_norm": 0.16055816411972046, "learning_rate": 4.551228046242344e-05, "loss": 0.0104, "step": 5176 }, { "epoch": 0.6839515143508273, "grad_norm": 0.24436479806900024, "learning_rate": 4.5477435648783885e-05, "loss": 0.0191, "step": 5177 }, { "epoch": 0.6840836278363114, "grad_norm": 0.16935010254383087, "learning_rate": 4.544260025289787e-05, "loss": 0.0136, "step": 5178 }, { "epoch": 0.6842157413217954, "grad_norm": 0.11614146828651428, "learning_rate": 4.540777428078258e-05, "loss": 0.0123, "step": 5179 }, { "epoch": 0.6843478548072794, "grad_norm": 0.32731178402900696, "learning_rate": 4.537295773845356e-05, "loss": 0.0189, "step": 5180 }, { "epoch": 0.6844799682927635, "grad_norm": 0.21909061074256897, "learning_rate": 4.5338150631924745e-05, "loss": 0.024, "step": 5181 }, { "epoch": 0.6846120817782475, "grad_norm": 0.4557572901248932, "learning_rate": 4.530335296720835e-05, "loss": 0.0205, "step": 5182 }, { "epoch": 0.6847441952637315, "grad_norm": 0.16844063997268677, "learning_rate": 4.526856475031504e-05, "loss": 0.0215, "step": 5183 }, { "epoch": 0.6848763087492156, "grad_norm": 0.30879849195480347, "learning_rate": 4.52337859872539e-05, "loss": 0.0209, "step": 5184 }, { "epoch": 0.6850084222346996, "grad_norm": 0.23125775158405304, "learning_rate": 4.51990166840322e-05, "loss": 0.02, "step": 5185 }, { "epoch": 0.6851405357201836, "grad_norm": 0.1605955809354782, "learning_rate": 4.5164256846655737e-05, "loss": 0.0112, "step": 5186 }, { "epoch": 0.6852726492056677, "grad_norm": 0.16270045936107635, "learning_rate": 4.512950648112864e-05, "loss": 0.0171, "step": 5187 }, { "epoch": 0.6854047626911517, "grad_norm": 0.09362021088600159, "learning_rate": 4.509476559345339e-05, "loss": 0.0092, "step": 5188 }, { "epoch": 0.6855368761766357, "grad_norm": 0.13662898540496826, "learning_rate": 4.5060034189630774e-05, "loss": 0.0202, "step": 5189 }, { "epoch": 0.6856689896621198, "grad_norm": 0.19320911169052124, "learning_rate": 4.5025312275660025e-05, "loss": 0.0268, "step": 5190 }, { "epoch": 0.6858011031476038, "grad_norm": 0.1628652662038803, "learning_rate": 4.499059985753874e-05, "loss": 0.0181, "step": 5191 }, { "epoch": 0.6859332166330878, "grad_norm": 0.16020523011684418, "learning_rate": 4.495589694126278e-05, "loss": 0.0118, "step": 5192 }, { "epoch": 0.6860653301185718, "grad_norm": 0.15857800841331482, "learning_rate": 4.492120353282643e-05, "loss": 0.0173, "step": 5193 }, { "epoch": 0.6861974436040559, "grad_norm": 0.13342146575450897, "learning_rate": 4.4886519638222355e-05, "loss": 0.0111, "step": 5194 }, { "epoch": 0.6863295570895399, "grad_norm": 0.35289087891578674, "learning_rate": 4.485184526344157e-05, "loss": 0.0127, "step": 5195 }, { "epoch": 0.6864616705750239, "grad_norm": 0.17435653507709503, "learning_rate": 4.4817180414473333e-05, "loss": 0.0211, "step": 5196 }, { "epoch": 0.686593784060508, "grad_norm": 0.21706287562847137, "learning_rate": 4.478252509730548e-05, "loss": 0.0201, "step": 5197 }, { "epoch": 0.686725897545992, "grad_norm": 0.14080168306827545, "learning_rate": 4.4747879317923966e-05, "loss": 0.016, "step": 5198 }, { "epoch": 0.686858011031476, "grad_norm": 0.11725561320781708, "learning_rate": 4.471324308231323e-05, "loss": 0.0105, "step": 5199 }, { "epoch": 0.6869901245169601, "grad_norm": 0.1532520353794098, "learning_rate": 4.467861639645604e-05, "loss": 0.0124, "step": 5200 }, { "epoch": 0.6871222380024441, "grad_norm": 0.16441814601421356, "learning_rate": 4.4643999266333544e-05, "loss": 0.0163, "step": 5201 }, { "epoch": 0.6872543514879281, "grad_norm": 0.188445582985878, "learning_rate": 4.460939169792514e-05, "loss": 0.0261, "step": 5202 }, { "epoch": 0.6873864649734122, "grad_norm": 0.15494723618030548, "learning_rate": 4.4574793697208675e-05, "loss": 0.0173, "step": 5203 }, { "epoch": 0.6875185784588962, "grad_norm": 0.214800164103508, "learning_rate": 4.4540205270160316e-05, "loss": 0.0314, "step": 5204 }, { "epoch": 0.6876506919443802, "grad_norm": 0.2373284250497818, "learning_rate": 4.450562642275452e-05, "loss": 0.0359, "step": 5205 }, { "epoch": 0.6877828054298643, "grad_norm": 0.12864165008068085, "learning_rate": 4.447105716096417e-05, "loss": 0.0145, "step": 5206 }, { "epoch": 0.6879149189153483, "grad_norm": 0.2078774869441986, "learning_rate": 4.443649749076045e-05, "loss": 0.0318, "step": 5207 }, { "epoch": 0.6880470324008323, "grad_norm": 0.13967713713645935, "learning_rate": 4.440194741811295e-05, "loss": 0.021, "step": 5208 }, { "epoch": 0.6881791458863163, "grad_norm": 0.16248448193073273, "learning_rate": 4.436740694898946e-05, "loss": 0.0159, "step": 5209 }, { "epoch": 0.6883112593718004, "grad_norm": 0.19797800481319427, "learning_rate": 4.433287608935622e-05, "loss": 0.0158, "step": 5210 }, { "epoch": 0.6884433728572844, "grad_norm": 0.21106760203838348, "learning_rate": 4.429835484517788e-05, "loss": 0.0139, "step": 5211 }, { "epoch": 0.6885754863427684, "grad_norm": 0.14479297399520874, "learning_rate": 4.4263843222417224e-05, "loss": 0.0154, "step": 5212 }, { "epoch": 0.6887075998282525, "grad_norm": 0.1643540859222412, "learning_rate": 4.4229341227035525e-05, "loss": 0.0211, "step": 5213 }, { "epoch": 0.6888397133137365, "grad_norm": 0.14425019919872284, "learning_rate": 4.419484886499239e-05, "loss": 0.0179, "step": 5214 }, { "epoch": 0.6889718267992205, "grad_norm": 0.1826268434524536, "learning_rate": 4.416036614224574e-05, "loss": 0.019, "step": 5215 }, { "epoch": 0.6891039402847046, "grad_norm": 0.23135367035865784, "learning_rate": 4.412589306475174e-05, "loss": 0.0303, "step": 5216 }, { "epoch": 0.6892360537701886, "grad_norm": 0.13859330117702484, "learning_rate": 4.409142963846503e-05, "loss": 0.0133, "step": 5217 }, { "epoch": 0.6893681672556726, "grad_norm": 0.08792946487665176, "learning_rate": 4.4056975869338544e-05, "loss": 0.0092, "step": 5218 }, { "epoch": 0.6895002807411567, "grad_norm": 0.24918882548809052, "learning_rate": 4.402253176332347e-05, "loss": 0.0126, "step": 5219 }, { "epoch": 0.6896323942266407, "grad_norm": 0.25261256098747253, "learning_rate": 4.3988097326369396e-05, "loss": 0.0207, "step": 5220 }, { "epoch": 0.6897645077121247, "grad_norm": 0.23432409763336182, "learning_rate": 4.395367256442424e-05, "loss": 0.0194, "step": 5221 }, { "epoch": 0.6898966211976088, "grad_norm": 0.1472122222185135, "learning_rate": 4.3919257483434284e-05, "loss": 0.0163, "step": 5222 }, { "epoch": 0.6900287346830928, "grad_norm": 0.23036111891269684, "learning_rate": 4.3884852089344e-05, "loss": 0.0234, "step": 5223 }, { "epoch": 0.6901608481685768, "grad_norm": 0.21664488315582275, "learning_rate": 4.38504563880963e-05, "loss": 0.0308, "step": 5224 }, { "epoch": 0.6902929616540608, "grad_norm": 0.18760326504707336, "learning_rate": 4.381607038563247e-05, "loss": 0.0185, "step": 5225 }, { "epoch": 0.6904250751395449, "grad_norm": 0.13710762560367584, "learning_rate": 4.378169408789196e-05, "loss": 0.0118, "step": 5226 }, { "epoch": 0.6905571886250289, "grad_norm": 0.19669315218925476, "learning_rate": 4.374732750081265e-05, "loss": 0.0348, "step": 5227 }, { "epoch": 0.6906893021105129, "grad_norm": 0.18398793041706085, "learning_rate": 4.371297063033075e-05, "loss": 0.0155, "step": 5228 }, { "epoch": 0.690821415595997, "grad_norm": 0.194778174161911, "learning_rate": 4.3678623482380806e-05, "loss": 0.025, "step": 5229 }, { "epoch": 0.690953529081481, "grad_norm": 0.12789078056812286, "learning_rate": 4.364428606289556e-05, "loss": 0.0218, "step": 5230 }, { "epoch": 0.691085642566965, "grad_norm": 0.1261407881975174, "learning_rate": 4.3609958377806194e-05, "loss": 0.0122, "step": 5231 }, { "epoch": 0.6912177560524491, "grad_norm": 0.2067817747592926, "learning_rate": 4.3575640433042206e-05, "loss": 0.0179, "step": 5232 }, { "epoch": 0.6913498695379331, "grad_norm": 0.19638954102993011, "learning_rate": 4.354133223453133e-05, "loss": 0.0259, "step": 5233 }, { "epoch": 0.6914819830234171, "grad_norm": 0.16034339368343353, "learning_rate": 4.350703378819968e-05, "loss": 0.0142, "step": 5234 }, { "epoch": 0.6916140965089012, "grad_norm": 0.16898213326931, "learning_rate": 4.347274509997169e-05, "loss": 0.0204, "step": 5235 }, { "epoch": 0.6917462099943852, "grad_norm": 0.4070425033569336, "learning_rate": 4.34384661757701e-05, "loss": 0.0292, "step": 5236 }, { "epoch": 0.6918783234798692, "grad_norm": 0.15171203017234802, "learning_rate": 4.34041970215159e-05, "loss": 0.0198, "step": 5237 }, { "epoch": 0.6920104369653532, "grad_norm": 0.35594651103019714, "learning_rate": 4.3369937643128475e-05, "loss": 0.0221, "step": 5238 }, { "epoch": 0.6921425504508373, "grad_norm": 0.18406252562999725, "learning_rate": 4.3335688046525534e-05, "loss": 0.0299, "step": 5239 }, { "epoch": 0.6922746639363213, "grad_norm": 0.1940668672323227, "learning_rate": 4.330144823762299e-05, "loss": 0.0205, "step": 5240 }, { "epoch": 0.6924067774218053, "grad_norm": 0.1684083491563797, "learning_rate": 4.326721822233514e-05, "loss": 0.0295, "step": 5241 }, { "epoch": 0.6925388909072894, "grad_norm": 0.16625122725963593, "learning_rate": 4.32329980065746e-05, "loss": 0.0125, "step": 5242 }, { "epoch": 0.6926710043927734, "grad_norm": 0.1683010458946228, "learning_rate": 4.31987875962523e-05, "loss": 0.0192, "step": 5243 }, { "epoch": 0.6928031178782574, "grad_norm": 0.16990509629249573, "learning_rate": 4.316458699727738e-05, "loss": 0.016, "step": 5244 }, { "epoch": 0.6929352313637415, "grad_norm": 0.13994839787483215, "learning_rate": 4.313039621555738e-05, "loss": 0.0173, "step": 5245 }, { "epoch": 0.6930673448492255, "grad_norm": 0.2251029908657074, "learning_rate": 4.3096215256998175e-05, "loss": 0.0141, "step": 5246 }, { "epoch": 0.6931994583347095, "grad_norm": 0.14937463402748108, "learning_rate": 4.30620441275038e-05, "loss": 0.0118, "step": 5247 }, { "epoch": 0.6933315718201936, "grad_norm": 0.20369993150234222, "learning_rate": 4.302788283297672e-05, "loss": 0.017, "step": 5248 }, { "epoch": 0.6934636853056776, "grad_norm": 0.10524363815784454, "learning_rate": 4.299373137931765e-05, "loss": 0.0103, "step": 5249 }, { "epoch": 0.6935957987911616, "grad_norm": 0.0838766023516655, "learning_rate": 4.295958977242566e-05, "loss": 0.0066, "step": 5250 }, { "epoch": 0.6937279122766457, "grad_norm": 0.16826088726520538, "learning_rate": 4.292545801819801e-05, "loss": 0.0118, "step": 5251 }, { "epoch": 0.6938600257621297, "grad_norm": 0.16063901782035828, "learning_rate": 4.2891336122530335e-05, "loss": 0.0204, "step": 5252 }, { "epoch": 0.6939921392476137, "grad_norm": 0.1232612207531929, "learning_rate": 4.2857224091316615e-05, "loss": 0.0184, "step": 5253 }, { "epoch": 0.6941242527330977, "grad_norm": 0.16036061942577362, "learning_rate": 4.282312193044897e-05, "loss": 0.0139, "step": 5254 }, { "epoch": 0.6942563662185818, "grad_norm": 0.1817048043012619, "learning_rate": 4.2789029645817945e-05, "loss": 0.013, "step": 5255 }, { "epoch": 0.6943884797040658, "grad_norm": 0.16562654078006744, "learning_rate": 4.275494724331242e-05, "loss": 0.0105, "step": 5256 }, { "epoch": 0.6945205931895498, "grad_norm": 0.19718943536281586, "learning_rate": 4.272087472881939e-05, "loss": 0.0182, "step": 5257 }, { "epoch": 0.6946527066750339, "grad_norm": 0.10483067482709885, "learning_rate": 4.2686812108224294e-05, "loss": 0.0104, "step": 5258 }, { "epoch": 0.6947848201605179, "grad_norm": 0.14471067488193512, "learning_rate": 4.2652759387410814e-05, "loss": 0.0075, "step": 5259 }, { "epoch": 0.6949169336460019, "grad_norm": 0.07532824575901031, "learning_rate": 4.2618716572260944e-05, "loss": 0.0059, "step": 5260 }, { "epoch": 0.695049047131486, "grad_norm": 0.16629336774349213, "learning_rate": 4.258468366865487e-05, "loss": 0.0123, "step": 5261 }, { "epoch": 0.69518116061697, "grad_norm": 0.19355374574661255, "learning_rate": 4.255066068247118e-05, "loss": 0.0158, "step": 5262 }, { "epoch": 0.695313274102454, "grad_norm": 0.20441442728042603, "learning_rate": 4.251664761958676e-05, "loss": 0.0259, "step": 5263 }, { "epoch": 0.6954453875879381, "grad_norm": 0.16446572542190552, "learning_rate": 4.248264448587663e-05, "loss": 0.0142, "step": 5264 }, { "epoch": 0.6955775010734221, "grad_norm": 0.13875527679920197, "learning_rate": 4.244865128721426e-05, "loss": 0.0141, "step": 5265 }, { "epoch": 0.6957096145589061, "grad_norm": 0.08903460204601288, "learning_rate": 4.241466802947133e-05, "loss": 0.0071, "step": 5266 }, { "epoch": 0.6958417280443902, "grad_norm": 0.13762259483337402, "learning_rate": 4.238069471851783e-05, "loss": 0.0079, "step": 5267 }, { "epoch": 0.6959738415298742, "grad_norm": 0.1628267914056778, "learning_rate": 4.234673136022197e-05, "loss": 0.0199, "step": 5268 }, { "epoch": 0.6961059550153582, "grad_norm": 0.12279197573661804, "learning_rate": 4.23127779604503e-05, "loss": 0.0078, "step": 5269 }, { "epoch": 0.6962380685008422, "grad_norm": 0.19203080236911774, "learning_rate": 4.227883452506769e-05, "loss": 0.0177, "step": 5270 }, { "epoch": 0.6963701819863263, "grad_norm": 0.1306513249874115, "learning_rate": 4.2244901059937144e-05, "loss": 0.0085, "step": 5271 }, { "epoch": 0.6965022954718103, "grad_norm": 0.15973524749279022, "learning_rate": 4.2210977570920085e-05, "loss": 0.0173, "step": 5272 }, { "epoch": 0.6966344089572943, "grad_norm": 0.17098453640937805, "learning_rate": 4.2177064063876145e-05, "loss": 0.0211, "step": 5273 }, { "epoch": 0.6967665224427784, "grad_norm": 0.11793388426303864, "learning_rate": 4.21431605446633e-05, "loss": 0.0151, "step": 5274 }, { "epoch": 0.6968986359282624, "grad_norm": 0.11402009427547455, "learning_rate": 4.2109267019137656e-05, "loss": 0.0117, "step": 5275 }, { "epoch": 0.6970307494137464, "grad_norm": 0.09435441344976425, "learning_rate": 4.207538349315375e-05, "loss": 0.0127, "step": 5276 }, { "epoch": 0.6971628628992305, "grad_norm": 0.2840203642845154, "learning_rate": 4.204150997256434e-05, "loss": 0.019, "step": 5277 }, { "epoch": 0.6972949763847145, "grad_norm": 0.306834876537323, "learning_rate": 4.2007646463220384e-05, "loss": 0.0146, "step": 5278 }, { "epoch": 0.6974270898701985, "grad_norm": 0.24466602504253387, "learning_rate": 4.197379297097121e-05, "loss": 0.0231, "step": 5279 }, { "epoch": 0.6975592033556826, "grad_norm": 0.19821403920650482, "learning_rate": 4.193994950166435e-05, "loss": 0.0143, "step": 5280 }, { "epoch": 0.6976913168411666, "grad_norm": 0.1387878656387329, "learning_rate": 4.190611606114571e-05, "loss": 0.0114, "step": 5281 }, { "epoch": 0.6978234303266506, "grad_norm": 0.14503693580627441, "learning_rate": 4.1872292655259274e-05, "loss": 0.0223, "step": 5282 }, { "epoch": 0.6979555438121346, "grad_norm": 0.14114725589752197, "learning_rate": 4.1838479289847456e-05, "loss": 0.0177, "step": 5283 }, { "epoch": 0.6980876572976187, "grad_norm": 0.18414288759231567, "learning_rate": 4.1804675970750906e-05, "loss": 0.0148, "step": 5284 }, { "epoch": 0.6982197707831027, "grad_norm": 0.14067934453487396, "learning_rate": 4.177088270380846e-05, "loss": 0.0129, "step": 5285 }, { "epoch": 0.6983518842685867, "grad_norm": 0.16954699158668518, "learning_rate": 4.17370994948573e-05, "loss": 0.0192, "step": 5286 }, { "epoch": 0.6984839977540708, "grad_norm": 0.18649275600910187, "learning_rate": 4.170332634973284e-05, "loss": 0.0164, "step": 5287 }, { "epoch": 0.6986161112395548, "grad_norm": 0.11534375697374344, "learning_rate": 4.166956327426881e-05, "loss": 0.0062, "step": 5288 }, { "epoch": 0.6987482247250388, "grad_norm": 0.1631186604499817, "learning_rate": 4.163581027429706e-05, "loss": 0.0111, "step": 5289 }, { "epoch": 0.6988803382105229, "grad_norm": 0.21578098833560944, "learning_rate": 4.160206735564783e-05, "loss": 0.0155, "step": 5290 }, { "epoch": 0.6990124516960069, "grad_norm": 0.1441599726676941, "learning_rate": 4.156833452414963e-05, "loss": 0.011, "step": 5291 }, { "epoch": 0.6991445651814909, "grad_norm": 0.18236543238162994, "learning_rate": 4.1534611785629087e-05, "loss": 0.028, "step": 5292 }, { "epoch": 0.699276678666975, "grad_norm": 0.18258638679981232, "learning_rate": 4.150089914591121e-05, "loss": 0.0169, "step": 5293 }, { "epoch": 0.699408792152459, "grad_norm": 0.12462858110666275, "learning_rate": 4.1467196610819234e-05, "loss": 0.0082, "step": 5294 }, { "epoch": 0.699540905637943, "grad_norm": 0.14412254095077515, "learning_rate": 4.143350418617469e-05, "loss": 0.0104, "step": 5295 }, { "epoch": 0.699673019123427, "grad_norm": 0.1670442372560501, "learning_rate": 4.1399821877797205e-05, "loss": 0.0183, "step": 5296 }, { "epoch": 0.6998051326089111, "grad_norm": 0.1631360799074173, "learning_rate": 4.136614969150484e-05, "loss": 0.0176, "step": 5297 }, { "epoch": 0.6999372460943951, "grad_norm": 0.2254563421010971, "learning_rate": 4.133248763311386e-05, "loss": 0.0146, "step": 5298 }, { "epoch": 0.7000693595798791, "grad_norm": 0.1660764068365097, "learning_rate": 4.129883570843868e-05, "loss": 0.0184, "step": 5299 }, { "epoch": 0.7002014730653632, "grad_norm": 0.14396461844444275, "learning_rate": 4.1265193923292076e-05, "loss": 0.0147, "step": 5300 }, { "epoch": 0.7003335865508472, "grad_norm": 0.11012925952672958, "learning_rate": 4.123156228348505e-05, "loss": 0.0109, "step": 5301 }, { "epoch": 0.7004657000363312, "grad_norm": 0.14452813565731049, "learning_rate": 4.119794079482686e-05, "loss": 0.0208, "step": 5302 }, { "epoch": 0.7005978135218153, "grad_norm": 0.2126447707414627, "learning_rate": 4.116432946312493e-05, "loss": 0.027, "step": 5303 }, { "epoch": 0.7007299270072993, "grad_norm": 0.18720543384552002, "learning_rate": 4.113072829418502e-05, "loss": 0.021, "step": 5304 }, { "epoch": 0.7008620404927833, "grad_norm": 0.2129470854997635, "learning_rate": 4.109713729381113e-05, "loss": 0.0254, "step": 5305 }, { "epoch": 0.7009941539782674, "grad_norm": 0.20480234920978546, "learning_rate": 4.106355646780541e-05, "loss": 0.0193, "step": 5306 }, { "epoch": 0.7011262674637514, "grad_norm": 0.11257723718881607, "learning_rate": 4.1029985821968366e-05, "loss": 0.0092, "step": 5307 }, { "epoch": 0.7012583809492354, "grad_norm": 0.13825350999832153, "learning_rate": 4.099642536209869e-05, "loss": 0.0151, "step": 5308 }, { "epoch": 0.7013904944347195, "grad_norm": 0.150064617395401, "learning_rate": 4.096287509399337e-05, "loss": 0.0206, "step": 5309 }, { "epoch": 0.7015226079202035, "grad_norm": 0.14601023495197296, "learning_rate": 4.09293350234475e-05, "loss": 0.0147, "step": 5310 }, { "epoch": 0.7016547214056875, "grad_norm": 0.14445830881595612, "learning_rate": 4.089580515625454e-05, "loss": 0.0185, "step": 5311 }, { "epoch": 0.7017868348911716, "grad_norm": 0.15438856184482574, "learning_rate": 4.08622854982062e-05, "loss": 0.0146, "step": 5312 }, { "epoch": 0.7019189483766556, "grad_norm": 0.1915358155965805, "learning_rate": 4.082877605509229e-05, "loss": 0.016, "step": 5313 }, { "epoch": 0.7020510618621396, "grad_norm": 0.25274017453193665, "learning_rate": 4.079527683270093e-05, "loss": 0.0155, "step": 5314 }, { "epoch": 0.7021831753476236, "grad_norm": 0.17730306088924408, "learning_rate": 4.076178783681861e-05, "loss": 0.0232, "step": 5315 }, { "epoch": 0.7023152888331077, "grad_norm": 0.1441715657711029, "learning_rate": 4.072830907322981e-05, "loss": 0.013, "step": 5316 }, { "epoch": 0.7024474023185917, "grad_norm": 0.23619748651981354, "learning_rate": 4.0694840547717394e-05, "loss": 0.0233, "step": 5317 }, { "epoch": 0.7025795158040757, "grad_norm": 0.16689759492874146, "learning_rate": 4.0661382266062475e-05, "loss": 0.0125, "step": 5318 }, { "epoch": 0.7027116292895598, "grad_norm": 0.14752112329006195, "learning_rate": 4.062793423404426e-05, "loss": 0.0101, "step": 5319 }, { "epoch": 0.7028437427750438, "grad_norm": 0.14175213873386383, "learning_rate": 4.0594496457440314e-05, "loss": 0.0162, "step": 5320 }, { "epoch": 0.7029758562605278, "grad_norm": 0.16413505375385284, "learning_rate": 4.056106894202637e-05, "loss": 0.0122, "step": 5321 }, { "epoch": 0.7031079697460119, "grad_norm": 0.18232764303684235, "learning_rate": 4.0527651693576463e-05, "loss": 0.0204, "step": 5322 }, { "epoch": 0.7032400832314959, "grad_norm": 0.13424061238765717, "learning_rate": 4.049424471786273e-05, "loss": 0.0193, "step": 5323 }, { "epoch": 0.7033721967169799, "grad_norm": 0.13029293715953827, "learning_rate": 4.046084802065562e-05, "loss": 0.0097, "step": 5324 }, { "epoch": 0.703504310202464, "grad_norm": 0.11119290441274643, "learning_rate": 4.042746160772382e-05, "loss": 0.0161, "step": 5325 }, { "epoch": 0.703636423687948, "grad_norm": 0.17957177758216858, "learning_rate": 4.039408548483416e-05, "loss": 0.0161, "step": 5326 }, { "epoch": 0.703768537173432, "grad_norm": 0.23708337545394897, "learning_rate": 4.036071965775175e-05, "loss": 0.0258, "step": 5327 }, { "epoch": 0.703900650658916, "grad_norm": 0.18829962611198425, "learning_rate": 4.032736413223994e-05, "loss": 0.0153, "step": 5328 }, { "epoch": 0.7040327641444001, "grad_norm": 0.27105703949928284, "learning_rate": 4.02940189140603e-05, "loss": 0.0318, "step": 5329 }, { "epoch": 0.7041648776298841, "grad_norm": 0.11548773944377899, "learning_rate": 4.026068400897251e-05, "loss": 0.0109, "step": 5330 }, { "epoch": 0.7042969911153681, "grad_norm": 0.10774447023868561, "learning_rate": 4.02273594227346e-05, "loss": 0.0143, "step": 5331 }, { "epoch": 0.7044291046008522, "grad_norm": 0.1054147332906723, "learning_rate": 4.019404516110279e-05, "loss": 0.0151, "step": 5332 }, { "epoch": 0.7045612180863362, "grad_norm": 0.12237270921468735, "learning_rate": 4.016074122983144e-05, "loss": 0.0151, "step": 5333 }, { "epoch": 0.7046933315718202, "grad_norm": 0.1550246775150299, "learning_rate": 4.012744763467322e-05, "loss": 0.0152, "step": 5334 }, { "epoch": 0.7048254450573043, "grad_norm": 0.1900588423013687, "learning_rate": 4.0094164381378964e-05, "loss": 0.0169, "step": 5335 }, { "epoch": 0.7049575585427883, "grad_norm": 0.12014942616224289, "learning_rate": 4.006089147569776e-05, "loss": 0.0096, "step": 5336 }, { "epoch": 0.7050896720282723, "grad_norm": 0.13331158459186554, "learning_rate": 4.002762892337684e-05, "loss": 0.0114, "step": 5337 }, { "epoch": 0.7052217855137564, "grad_norm": 0.1908334344625473, "learning_rate": 3.9994376730161685e-05, "loss": 0.0168, "step": 5338 }, { "epoch": 0.7053538989992404, "grad_norm": 0.11662087589502335, "learning_rate": 3.996113490179605e-05, "loss": 0.014, "step": 5339 }, { "epoch": 0.7054860124847244, "grad_norm": 0.17463147640228271, "learning_rate": 3.992790344402176e-05, "loss": 0.0148, "step": 5340 }, { "epoch": 0.7056181259702085, "grad_norm": 0.1223292127251625, "learning_rate": 3.989468236257897e-05, "loss": 0.0084, "step": 5341 }, { "epoch": 0.7057502394556925, "grad_norm": 0.2078678011894226, "learning_rate": 3.986147166320599e-05, "loss": 0.0237, "step": 5342 }, { "epoch": 0.7058823529411765, "grad_norm": 0.145186647772789, "learning_rate": 3.98282713516394e-05, "loss": 0.0053, "step": 5343 }, { "epoch": 0.7060144664266605, "grad_norm": 0.1675570160150528, "learning_rate": 3.979508143361385e-05, "loss": 0.0118, "step": 5344 }, { "epoch": 0.7061465799121446, "grad_norm": 0.14878936111927032, "learning_rate": 3.976190191486231e-05, "loss": 0.0197, "step": 5345 }, { "epoch": 0.7062786933976286, "grad_norm": 0.1630447953939438, "learning_rate": 3.972873280111597e-05, "loss": 0.0181, "step": 5346 }, { "epoch": 0.7064108068831126, "grad_norm": 0.28922614455223083, "learning_rate": 3.969557409810408e-05, "loss": 0.0206, "step": 5347 }, { "epoch": 0.7065429203685967, "grad_norm": 0.12345405668020248, "learning_rate": 3.966242581155424e-05, "loss": 0.008, "step": 5348 }, { "epoch": 0.7066750338540807, "grad_norm": 0.16507402062416077, "learning_rate": 3.9629287947192196e-05, "loss": 0.0221, "step": 5349 }, { "epoch": 0.7068071473395647, "grad_norm": 0.23153814673423767, "learning_rate": 3.959616051074193e-05, "loss": 0.0161, "step": 5350 }, { "epoch": 0.7069392608250488, "grad_norm": 0.20211441814899445, "learning_rate": 3.9563043507925514e-05, "loss": 0.0312, "step": 5351 }, { "epoch": 0.7070713743105328, "grad_norm": 0.14686128497123718, "learning_rate": 3.952993694446332e-05, "loss": 0.0174, "step": 5352 }, { "epoch": 0.7072034877960168, "grad_norm": 0.19460754096508026, "learning_rate": 3.949684082607393e-05, "loss": 0.0259, "step": 5353 }, { "epoch": 0.7073356012815009, "grad_norm": 0.24404078722000122, "learning_rate": 3.9463755158474e-05, "loss": 0.017, "step": 5354 }, { "epoch": 0.7074677147669849, "grad_norm": 0.13295841217041016, "learning_rate": 3.9430679947378526e-05, "loss": 0.0108, "step": 5355 }, { "epoch": 0.7075998282524689, "grad_norm": 0.11946427822113037, "learning_rate": 3.939761519850059e-05, "loss": 0.0098, "step": 5356 }, { "epoch": 0.707731941737953, "grad_norm": 0.1620432436466217, "learning_rate": 3.9364560917551574e-05, "loss": 0.0173, "step": 5357 }, { "epoch": 0.707864055223437, "grad_norm": 0.17194488644599915, "learning_rate": 3.93315171102409e-05, "loss": 0.0117, "step": 5358 }, { "epoch": 0.707996168708921, "grad_norm": 0.19578425586223602, "learning_rate": 3.929848378227632e-05, "loss": 0.0237, "step": 5359 }, { "epoch": 0.708128282194405, "grad_norm": 0.11690971255302429, "learning_rate": 3.926546093936374e-05, "loss": 0.0138, "step": 5360 }, { "epoch": 0.7082603956798891, "grad_norm": 0.12806108593940735, "learning_rate": 3.923244858720718e-05, "loss": 0.0144, "step": 5361 }, { "epoch": 0.7083925091653731, "grad_norm": 0.2078218311071396, "learning_rate": 3.919944673150894e-05, "loss": 0.0222, "step": 5362 }, { "epoch": 0.7085246226508571, "grad_norm": 0.1328975111246109, "learning_rate": 3.916645537796947e-05, "loss": 0.0121, "step": 5363 }, { "epoch": 0.7086567361363412, "grad_norm": 0.12743492424488068, "learning_rate": 3.9133474532287453e-05, "loss": 0.0207, "step": 5364 }, { "epoch": 0.7087888496218252, "grad_norm": 0.11305283010005951, "learning_rate": 3.910050420015964e-05, "loss": 0.0083, "step": 5365 }, { "epoch": 0.7089209631073092, "grad_norm": 0.20388872921466827, "learning_rate": 3.906754438728106e-05, "loss": 0.0219, "step": 5366 }, { "epoch": 0.7090530765927933, "grad_norm": 0.17367129027843475, "learning_rate": 3.9034595099344964e-05, "loss": 0.0194, "step": 5367 }, { "epoch": 0.7091851900782773, "grad_norm": 0.18559734523296356, "learning_rate": 3.900165634204263e-05, "loss": 0.0219, "step": 5368 }, { "epoch": 0.7093173035637613, "grad_norm": 0.14173002541065216, "learning_rate": 3.896872812106367e-05, "loss": 0.0099, "step": 5369 }, { "epoch": 0.7094494170492454, "grad_norm": 0.20723380148410797, "learning_rate": 3.89358104420958e-05, "loss": 0.0108, "step": 5370 }, { "epoch": 0.7095815305347294, "grad_norm": 0.6518858075141907, "learning_rate": 3.890290331082499e-05, "loss": 0.0205, "step": 5371 }, { "epoch": 0.7097136440202133, "grad_norm": 0.15502440929412842, "learning_rate": 3.8870006732935206e-05, "loss": 0.0129, "step": 5372 }, { "epoch": 0.7098457575056973, "grad_norm": 0.13822178542613983, "learning_rate": 3.883712071410882e-05, "loss": 0.0171, "step": 5373 }, { "epoch": 0.7099778709911814, "grad_norm": 0.1479300558567047, "learning_rate": 3.880424526002631e-05, "loss": 0.0137, "step": 5374 }, { "epoch": 0.7101099844766654, "grad_norm": 0.21749626100063324, "learning_rate": 3.8771380376366186e-05, "loss": 0.0216, "step": 5375 }, { "epoch": 0.7102420979621494, "grad_norm": 0.20055606961250305, "learning_rate": 3.873852606880529e-05, "loss": 0.0194, "step": 5376 }, { "epoch": 0.7103742114476335, "grad_norm": 0.13387270271778107, "learning_rate": 3.8705682343018645e-05, "loss": 0.0103, "step": 5377 }, { "epoch": 0.7105063249331175, "grad_norm": 0.15972426533699036, "learning_rate": 3.86728492046793e-05, "loss": 0.0201, "step": 5378 }, { "epoch": 0.7106384384186015, "grad_norm": 0.07348188757896423, "learning_rate": 3.864002665945859e-05, "loss": 0.0059, "step": 5379 }, { "epoch": 0.7107705519040856, "grad_norm": 0.07760182023048401, "learning_rate": 3.8607214713026016e-05, "loss": 0.007, "step": 5380 }, { "epoch": 0.7109026653895696, "grad_norm": 0.1682509332895279, "learning_rate": 3.8574413371049264e-05, "loss": 0.0116, "step": 5381 }, { "epoch": 0.7110347788750536, "grad_norm": 0.2266969084739685, "learning_rate": 3.854162263919408e-05, "loss": 0.015, "step": 5382 }, { "epoch": 0.7111668923605377, "grad_norm": 0.14937283098697662, "learning_rate": 3.8508842523124466e-05, "loss": 0.0089, "step": 5383 }, { "epoch": 0.7112990058460217, "grad_norm": 0.1911482810974121, "learning_rate": 3.8476073028502634e-05, "loss": 0.016, "step": 5384 }, { "epoch": 0.7114311193315057, "grad_norm": 0.13798931241035461, "learning_rate": 3.844331416098882e-05, "loss": 0.0214, "step": 5385 }, { "epoch": 0.7115632328169897, "grad_norm": 0.12851540744304657, "learning_rate": 3.841056592624155e-05, "loss": 0.0175, "step": 5386 }, { "epoch": 0.7116953463024738, "grad_norm": 0.18764302134513855, "learning_rate": 3.8377828329917456e-05, "loss": 0.0166, "step": 5387 }, { "epoch": 0.7118274597879578, "grad_norm": 0.1515864133834839, "learning_rate": 3.834510137767138e-05, "loss": 0.0184, "step": 5388 }, { "epoch": 0.7119595732734418, "grad_norm": 0.19872577488422394, "learning_rate": 3.831238507515623e-05, "loss": 0.0197, "step": 5389 }, { "epoch": 0.7120916867589259, "grad_norm": 0.27276068925857544, "learning_rate": 3.827967942802317e-05, "loss": 0.0258, "step": 5390 }, { "epoch": 0.7122238002444099, "grad_norm": 0.15714675188064575, "learning_rate": 3.824698444192153e-05, "loss": 0.0137, "step": 5391 }, { "epoch": 0.7123559137298939, "grad_norm": 0.1761281043291092, "learning_rate": 3.821430012249867e-05, "loss": 0.0226, "step": 5392 }, { "epoch": 0.712488027215378, "grad_norm": 0.14952731132507324, "learning_rate": 3.818162647540024e-05, "loss": 0.0155, "step": 5393 }, { "epoch": 0.712620140700862, "grad_norm": 0.16243423521518707, "learning_rate": 3.814896350627001e-05, "loss": 0.015, "step": 5394 }, { "epoch": 0.712752254186346, "grad_norm": 0.13705137372016907, "learning_rate": 3.8116311220749915e-05, "loss": 0.0151, "step": 5395 }, { "epoch": 0.71288436767183, "grad_norm": 0.13219165802001953, "learning_rate": 3.8083669624479964e-05, "loss": 0.0138, "step": 5396 }, { "epoch": 0.7130164811573141, "grad_norm": 0.2142573893070221, "learning_rate": 3.805103872309843e-05, "loss": 0.0163, "step": 5397 }, { "epoch": 0.7131485946427981, "grad_norm": 0.12394694983959198, "learning_rate": 3.8018418522241705e-05, "loss": 0.015, "step": 5398 }, { "epoch": 0.7132807081282821, "grad_norm": 0.17286397516727448, "learning_rate": 3.798580902754426e-05, "loss": 0.015, "step": 5399 }, { "epoch": 0.7134128216137662, "grad_norm": 0.7974072694778442, "learning_rate": 3.795321024463882e-05, "loss": 0.0205, "step": 5400 }, { "epoch": 0.7135449350992502, "grad_norm": 0.16915547847747803, "learning_rate": 3.7920622179156194e-05, "loss": 0.0171, "step": 5401 }, { "epoch": 0.7136770485847342, "grad_norm": 0.1622791886329651, "learning_rate": 3.78880448367254e-05, "loss": 0.0162, "step": 5402 }, { "epoch": 0.7138091620702183, "grad_norm": 0.2695022523403168, "learning_rate": 3.785547822297352e-05, "loss": 0.0136, "step": 5403 }, { "epoch": 0.7139412755557023, "grad_norm": 0.15922003984451294, "learning_rate": 3.7822922343525826e-05, "loss": 0.0136, "step": 5404 }, { "epoch": 0.7140733890411863, "grad_norm": 0.1592172086238861, "learning_rate": 3.77903772040058e-05, "loss": 0.0179, "step": 5405 }, { "epoch": 0.7142055025266704, "grad_norm": 0.2182742953300476, "learning_rate": 3.775784281003493e-05, "loss": 0.0189, "step": 5406 }, { "epoch": 0.7143376160121544, "grad_norm": 0.16970114409923553, "learning_rate": 3.772531916723294e-05, "loss": 0.0261, "step": 5407 }, { "epoch": 0.7144697294976384, "grad_norm": 0.21392595767974854, "learning_rate": 3.769280628121772e-05, "loss": 0.0196, "step": 5408 }, { "epoch": 0.7146018429831225, "grad_norm": 0.09747704863548279, "learning_rate": 3.766030415760525e-05, "loss": 0.0069, "step": 5409 }, { "epoch": 0.7147339564686065, "grad_norm": 0.16599935293197632, "learning_rate": 3.762781280200964e-05, "loss": 0.0174, "step": 5410 }, { "epoch": 0.7148660699540905, "grad_norm": 0.25641050934791565, "learning_rate": 3.759533222004318e-05, "loss": 0.0117, "step": 5411 }, { "epoch": 0.7149981834395746, "grad_norm": 0.1257232427597046, "learning_rate": 3.75628624173163e-05, "loss": 0.0109, "step": 5412 }, { "epoch": 0.7151302969250586, "grad_norm": 0.0978141725063324, "learning_rate": 3.7530403399437506e-05, "loss": 0.0048, "step": 5413 }, { "epoch": 0.7152624104105426, "grad_norm": 0.12040681391954422, "learning_rate": 3.749795517201352e-05, "loss": 0.0074, "step": 5414 }, { "epoch": 0.7153945238960266, "grad_norm": 0.20707263052463531, "learning_rate": 3.746551774064915e-05, "loss": 0.0269, "step": 5415 }, { "epoch": 0.7155266373815107, "grad_norm": 0.10615110397338867, "learning_rate": 3.7433091110947406e-05, "loss": 0.0084, "step": 5416 }, { "epoch": 0.7156587508669947, "grad_norm": 0.14646649360656738, "learning_rate": 3.740067528850931e-05, "loss": 0.0182, "step": 5417 }, { "epoch": 0.7157908643524787, "grad_norm": 0.16973420977592468, "learning_rate": 3.736827027893411e-05, "loss": 0.0195, "step": 5418 }, { "epoch": 0.7159229778379628, "grad_norm": 0.2045861929655075, "learning_rate": 3.733587608781922e-05, "loss": 0.025, "step": 5419 }, { "epoch": 0.7160550913234468, "grad_norm": 0.12251792848110199, "learning_rate": 3.730349272076006e-05, "loss": 0.0182, "step": 5420 }, { "epoch": 0.7161872048089308, "grad_norm": 0.14665895700454712, "learning_rate": 3.7271120183350274e-05, "loss": 0.0127, "step": 5421 }, { "epoch": 0.7163193182944149, "grad_norm": 0.15170572698116302, "learning_rate": 3.7238758481181614e-05, "loss": 0.0128, "step": 5422 }, { "epoch": 0.7164514317798989, "grad_norm": 0.20112718641757965, "learning_rate": 3.7206407619843995e-05, "loss": 0.0158, "step": 5423 }, { "epoch": 0.7165835452653829, "grad_norm": 0.13899517059326172, "learning_rate": 3.7174067604925354e-05, "loss": 0.0133, "step": 5424 }, { "epoch": 0.716715658750867, "grad_norm": 0.17532852292060852, "learning_rate": 3.714173844201187e-05, "loss": 0.0194, "step": 5425 }, { "epoch": 0.716847772236351, "grad_norm": 0.1427544355392456, "learning_rate": 3.710942013668782e-05, "loss": 0.0109, "step": 5426 }, { "epoch": 0.716979885721835, "grad_norm": 0.158977672457695, "learning_rate": 3.707711269453553e-05, "loss": 0.0181, "step": 5427 }, { "epoch": 0.717111999207319, "grad_norm": 0.10790096968412399, "learning_rate": 3.704481612113554e-05, "loss": 0.0104, "step": 5428 }, { "epoch": 0.7172441126928031, "grad_norm": 0.12931567430496216, "learning_rate": 3.701253042206646e-05, "loss": 0.0142, "step": 5429 }, { "epoch": 0.7173762261782871, "grad_norm": 0.09444590657949448, "learning_rate": 3.69802556029051e-05, "loss": 0.0117, "step": 5430 }, { "epoch": 0.7175083396637711, "grad_norm": 0.22288116812705994, "learning_rate": 3.6947991669226225e-05, "loss": 0.024, "step": 5431 }, { "epoch": 0.7176404531492552, "grad_norm": 0.1592830866575241, "learning_rate": 3.6915738626602936e-05, "loss": 0.0085, "step": 5432 }, { "epoch": 0.7177725666347392, "grad_norm": 0.1483837217092514, "learning_rate": 3.688349648060628e-05, "loss": 0.0118, "step": 5433 }, { "epoch": 0.7179046801202232, "grad_norm": 0.11007316410541534, "learning_rate": 3.68512652368055e-05, "loss": 0.0132, "step": 5434 }, { "epoch": 0.7180367936057073, "grad_norm": 0.20218022167682648, "learning_rate": 3.681904490076793e-05, "loss": 0.0206, "step": 5435 }, { "epoch": 0.7181689070911913, "grad_norm": 0.16886425018310547, "learning_rate": 3.678683547805908e-05, "loss": 0.0221, "step": 5436 }, { "epoch": 0.7183010205766753, "grad_norm": 0.1661931276321411, "learning_rate": 3.675463697424246e-05, "loss": 0.0176, "step": 5437 }, { "epoch": 0.7184331340621594, "grad_norm": 0.13714224100112915, "learning_rate": 3.6722449394879774e-05, "loss": 0.0113, "step": 5438 }, { "epoch": 0.7185652475476434, "grad_norm": 0.13536618649959564, "learning_rate": 3.669027274553088e-05, "loss": 0.0171, "step": 5439 }, { "epoch": 0.7186973610331274, "grad_norm": 0.2858543395996094, "learning_rate": 3.665810703175362e-05, "loss": 0.0207, "step": 5440 }, { "epoch": 0.7188294745186115, "grad_norm": 0.16099917888641357, "learning_rate": 3.6625952259104045e-05, "loss": 0.0187, "step": 5441 }, { "epoch": 0.7189615880040955, "grad_norm": 0.21166282892227173, "learning_rate": 3.65938084331363e-05, "loss": 0.0206, "step": 5442 }, { "epoch": 0.7190937014895795, "grad_norm": 0.17520025372505188, "learning_rate": 3.656167555940265e-05, "loss": 0.0256, "step": 5443 }, { "epoch": 0.7192258149750635, "grad_norm": 0.15425994992256165, "learning_rate": 3.65295536434534e-05, "loss": 0.014, "step": 5444 }, { "epoch": 0.7193579284605476, "grad_norm": 0.14609721302986145, "learning_rate": 3.6497442690837025e-05, "loss": 0.0142, "step": 5445 }, { "epoch": 0.7194900419460316, "grad_norm": 0.4000420570373535, "learning_rate": 3.6465342707100136e-05, "loss": 0.015, "step": 5446 }, { "epoch": 0.7196221554315156, "grad_norm": 0.21911656856536865, "learning_rate": 3.643325369778734e-05, "loss": 0.0137, "step": 5447 }, { "epoch": 0.7197542689169997, "grad_norm": 0.19847404956817627, "learning_rate": 3.640117566844144e-05, "loss": 0.0275, "step": 5448 }, { "epoch": 0.7198863824024837, "grad_norm": 0.22252239286899567, "learning_rate": 3.636910862460332e-05, "loss": 0.0295, "step": 5449 }, { "epoch": 0.7200184958879677, "grad_norm": 0.15440602600574493, "learning_rate": 3.633705257181199e-05, "loss": 0.0093, "step": 5450 }, { "epoch": 0.7201506093734518, "grad_norm": 0.13582760095596313, "learning_rate": 3.6305007515604484e-05, "loss": 0.012, "step": 5451 }, { "epoch": 0.7202827228589358, "grad_norm": 0.26740747690200806, "learning_rate": 3.6272973461515994e-05, "loss": 0.0175, "step": 5452 }, { "epoch": 0.7204148363444198, "grad_norm": 0.11057921499013901, "learning_rate": 3.624095041507985e-05, "loss": 0.0145, "step": 5453 }, { "epoch": 0.7205469498299039, "grad_norm": 0.17326849699020386, "learning_rate": 3.620893838182737e-05, "loss": 0.0138, "step": 5454 }, { "epoch": 0.7206790633153879, "grad_norm": 0.13421256840229034, "learning_rate": 3.617693736728808e-05, "loss": 0.0125, "step": 5455 }, { "epoch": 0.7208111768008719, "grad_norm": 0.22888900339603424, "learning_rate": 3.6144947376989525e-05, "loss": 0.0186, "step": 5456 }, { "epoch": 0.720943290286356, "grad_norm": 0.10150676965713501, "learning_rate": 3.611296841645744e-05, "loss": 0.0069, "step": 5457 }, { "epoch": 0.72107540377184, "grad_norm": 0.15139426290988922, "learning_rate": 3.608100049121551e-05, "loss": 0.0148, "step": 5458 }, { "epoch": 0.721207517257324, "grad_norm": 0.25483521819114685, "learning_rate": 3.604904360678563e-05, "loss": 0.0203, "step": 5459 }, { "epoch": 0.721339630742808, "grad_norm": 0.23597757518291473, "learning_rate": 3.601709776868779e-05, "loss": 0.0205, "step": 5460 }, { "epoch": 0.7214717442282921, "grad_norm": 0.14819002151489258, "learning_rate": 3.598516298243998e-05, "loss": 0.0155, "step": 5461 }, { "epoch": 0.7216038577137761, "grad_norm": 0.15574060380458832, "learning_rate": 3.595323925355836e-05, "loss": 0.0156, "step": 5462 }, { "epoch": 0.7217359711992601, "grad_norm": 0.21066302061080933, "learning_rate": 3.592132658755716e-05, "loss": 0.0146, "step": 5463 }, { "epoch": 0.7218680846847442, "grad_norm": 0.13125142455101013, "learning_rate": 3.5889424989948725e-05, "loss": 0.0105, "step": 5464 }, { "epoch": 0.7220001981702282, "grad_norm": 0.13324397802352905, "learning_rate": 3.5857534466243404e-05, "loss": 0.0133, "step": 5465 }, { "epoch": 0.7221323116557122, "grad_norm": 0.27421751618385315, "learning_rate": 3.58256550219497e-05, "loss": 0.0137, "step": 5466 }, { "epoch": 0.7222644251411963, "grad_norm": 0.14124605059623718, "learning_rate": 3.5793786662574255e-05, "loss": 0.0144, "step": 5467 }, { "epoch": 0.7223965386266803, "grad_norm": 0.14751692116260529, "learning_rate": 3.576192939362164e-05, "loss": 0.0188, "step": 5468 }, { "epoch": 0.7225286521121643, "grad_norm": 0.123799629509449, "learning_rate": 3.5730083220594646e-05, "loss": 0.0082, "step": 5469 }, { "epoch": 0.7226607655976484, "grad_norm": 0.12402285635471344, "learning_rate": 3.5698248148994106e-05, "loss": 0.0167, "step": 5470 }, { "epoch": 0.7227928790831324, "grad_norm": 0.18801485002040863, "learning_rate": 3.566642418431897e-05, "loss": 0.0184, "step": 5471 }, { "epoch": 0.7229249925686164, "grad_norm": 0.15218046307563782, "learning_rate": 3.563461133206616e-05, "loss": 0.0215, "step": 5472 }, { "epoch": 0.7230571060541005, "grad_norm": 0.1386367380619049, "learning_rate": 3.560280959773078e-05, "loss": 0.0168, "step": 5473 }, { "epoch": 0.7231892195395845, "grad_norm": 0.1280830353498459, "learning_rate": 3.557101898680601e-05, "loss": 0.013, "step": 5474 }, { "epoch": 0.7233213330250685, "grad_norm": 0.1812014877796173, "learning_rate": 3.553923950478305e-05, "loss": 0.0152, "step": 5475 }, { "epoch": 0.7234534465105525, "grad_norm": 0.12511831521987915, "learning_rate": 3.5507471157151214e-05, "loss": 0.0136, "step": 5476 }, { "epoch": 0.7235855599960366, "grad_norm": 0.1868468075990677, "learning_rate": 3.5475713949397914e-05, "loss": 0.0083, "step": 5477 }, { "epoch": 0.7237176734815206, "grad_norm": 0.17483773827552795, "learning_rate": 3.544396788700863e-05, "loss": 0.0154, "step": 5478 }, { "epoch": 0.7238497869670046, "grad_norm": 0.0938841700553894, "learning_rate": 3.541223297546683e-05, "loss": 0.0086, "step": 5479 }, { "epoch": 0.7239819004524887, "grad_norm": 0.13073642551898956, "learning_rate": 3.5380509220254176e-05, "loss": 0.0126, "step": 5480 }, { "epoch": 0.7241140139379727, "grad_norm": 0.14963223040103912, "learning_rate": 3.534879662685038e-05, "loss": 0.0183, "step": 5481 }, { "epoch": 0.7242461274234567, "grad_norm": 0.11111201345920563, "learning_rate": 3.531709520073313e-05, "loss": 0.0104, "step": 5482 }, { "epoch": 0.7243782409089408, "grad_norm": 0.22355780005455017, "learning_rate": 3.528540494737829e-05, "loss": 0.0241, "step": 5483 }, { "epoch": 0.7245103543944248, "grad_norm": 0.22933557629585266, "learning_rate": 3.5253725872259756e-05, "loss": 0.015, "step": 5484 }, { "epoch": 0.7246424678799088, "grad_norm": 0.19833749532699585, "learning_rate": 3.522205798084954e-05, "loss": 0.0243, "step": 5485 }, { "epoch": 0.7247745813653929, "grad_norm": 0.18630573153495789, "learning_rate": 3.51904012786176e-05, "loss": 0.0221, "step": 5486 }, { "epoch": 0.7249066948508769, "grad_norm": 0.2737390100955963, "learning_rate": 3.515875577103207e-05, "loss": 0.0285, "step": 5487 }, { "epoch": 0.7250388083363609, "grad_norm": 0.16898681223392487, "learning_rate": 3.5127121463559165e-05, "loss": 0.0163, "step": 5488 }, { "epoch": 0.725170921821845, "grad_norm": 0.14862626791000366, "learning_rate": 3.5095498361663015e-05, "loss": 0.0104, "step": 5489 }, { "epoch": 0.725303035307329, "grad_norm": 0.2469290941953659, "learning_rate": 3.5063886470806015e-05, "loss": 0.0314, "step": 5490 }, { "epoch": 0.725435148792813, "grad_norm": 0.116226427257061, "learning_rate": 3.503228579644854e-05, "loss": 0.0112, "step": 5491 }, { "epoch": 0.725567262278297, "grad_norm": 0.08892618864774704, "learning_rate": 3.5000696344048934e-05, "loss": 0.0101, "step": 5492 }, { "epoch": 0.7256993757637811, "grad_norm": 0.16128192842006683, "learning_rate": 3.496911811906373e-05, "loss": 0.0202, "step": 5493 }, { "epoch": 0.7258314892492651, "grad_norm": 0.20747168362140656, "learning_rate": 3.4937551126947465e-05, "loss": 0.0136, "step": 5494 }, { "epoch": 0.7259636027347491, "grad_norm": 0.16624540090560913, "learning_rate": 3.490599537315279e-05, "loss": 0.0151, "step": 5495 }, { "epoch": 0.7260957162202332, "grad_norm": 0.1490861475467682, "learning_rate": 3.48744508631303e-05, "loss": 0.0149, "step": 5496 }, { "epoch": 0.7262278297057172, "grad_norm": 0.1925140619277954, "learning_rate": 3.484291760232876e-05, "loss": 0.0133, "step": 5497 }, { "epoch": 0.7263599431912012, "grad_norm": 0.18869644403457642, "learning_rate": 3.481139559619497e-05, "loss": 0.0177, "step": 5498 }, { "epoch": 0.7264920566766853, "grad_norm": 0.16262781620025635, "learning_rate": 3.47798848501737e-05, "loss": 0.0103, "step": 5499 }, { "epoch": 0.7266241701621693, "grad_norm": 0.15225239098072052, "learning_rate": 3.4748385369707906e-05, "loss": 0.0161, "step": 5500 }, { "epoch": 0.7267562836476533, "grad_norm": 0.1811465173959732, "learning_rate": 3.471689716023849e-05, "loss": 0.0109, "step": 5501 }, { "epoch": 0.7268883971331374, "grad_norm": 0.1548740416765213, "learning_rate": 3.4685420227204526e-05, "loss": 0.0169, "step": 5502 }, { "epoch": 0.7270205106186214, "grad_norm": 0.11394735425710678, "learning_rate": 3.465395457604297e-05, "loss": 0.0148, "step": 5503 }, { "epoch": 0.7271526241041054, "grad_norm": 0.1491413563489914, "learning_rate": 3.4622500212188966e-05, "loss": 0.0135, "step": 5504 }, { "epoch": 0.7272847375895894, "grad_norm": 0.23304897546768188, "learning_rate": 3.459105714107571e-05, "loss": 0.016, "step": 5505 }, { "epoch": 0.7274168510750735, "grad_norm": 0.318596214056015, "learning_rate": 3.455962536813432e-05, "loss": 0.0168, "step": 5506 }, { "epoch": 0.7275489645605575, "grad_norm": 0.2458907663822174, "learning_rate": 3.4528204898794104e-05, "loss": 0.0184, "step": 5507 }, { "epoch": 0.7276810780460415, "grad_norm": 0.2839372456073761, "learning_rate": 3.449679573848233e-05, "loss": 0.0173, "step": 5508 }, { "epoch": 0.7278131915315256, "grad_norm": 0.19794289767742157, "learning_rate": 3.4465397892624417e-05, "loss": 0.0155, "step": 5509 }, { "epoch": 0.7279453050170096, "grad_norm": 0.17745184898376465, "learning_rate": 3.4434011366643645e-05, "loss": 0.022, "step": 5510 }, { "epoch": 0.7280774185024936, "grad_norm": 0.16284947097301483, "learning_rate": 3.4402636165961524e-05, "loss": 0.0171, "step": 5511 }, { "epoch": 0.7282095319879777, "grad_norm": 0.1310679018497467, "learning_rate": 3.437127229599754e-05, "loss": 0.0112, "step": 5512 }, { "epoch": 0.7283416454734617, "grad_norm": 0.21692624688148499, "learning_rate": 3.433991976216915e-05, "loss": 0.0348, "step": 5513 }, { "epoch": 0.7284737589589457, "grad_norm": 0.16403941810131073, "learning_rate": 3.430857856989196e-05, "loss": 0.0196, "step": 5514 }, { "epoch": 0.7286058724444298, "grad_norm": 0.19069243967533112, "learning_rate": 3.427724872457957e-05, "loss": 0.0215, "step": 5515 }, { "epoch": 0.7287379859299138, "grad_norm": 0.17088550329208374, "learning_rate": 3.424593023164366e-05, "loss": 0.0116, "step": 5516 }, { "epoch": 0.7288700994153978, "grad_norm": 0.19054432213306427, "learning_rate": 3.421462309649385e-05, "loss": 0.0175, "step": 5517 }, { "epoch": 0.7290022129008819, "grad_norm": 0.17557857930660248, "learning_rate": 3.41833273245379e-05, "loss": 0.0155, "step": 5518 }, { "epoch": 0.7291343263863659, "grad_norm": 0.13773734867572784, "learning_rate": 3.4152042921181584e-05, "loss": 0.0141, "step": 5519 }, { "epoch": 0.7292664398718499, "grad_norm": 0.20974580943584442, "learning_rate": 3.412076989182864e-05, "loss": 0.029, "step": 5520 }, { "epoch": 0.729398553357334, "grad_norm": 0.17927774786949158, "learning_rate": 3.408950824188094e-05, "loss": 0.024, "step": 5521 }, { "epoch": 0.729530666842818, "grad_norm": 0.20614896714687347, "learning_rate": 3.405825797673835e-05, "loss": 0.0305, "step": 5522 }, { "epoch": 0.729662780328302, "grad_norm": 0.2965283691883087, "learning_rate": 3.402701910179879e-05, "loss": 0.026, "step": 5523 }, { "epoch": 0.729794893813786, "grad_norm": 0.1992853283882141, "learning_rate": 3.399579162245814e-05, "loss": 0.0199, "step": 5524 }, { "epoch": 0.7299270072992701, "grad_norm": 0.24798119068145752, "learning_rate": 3.396457554411038e-05, "loss": 0.0276, "step": 5525 }, { "epoch": 0.7300591207847541, "grad_norm": 0.07950809597969055, "learning_rate": 3.393337087214755e-05, "loss": 0.0069, "step": 5526 }, { "epoch": 0.7301912342702381, "grad_norm": 0.1755135953426361, "learning_rate": 3.3902177611959606e-05, "loss": 0.0148, "step": 5527 }, { "epoch": 0.7303233477557222, "grad_norm": 0.13987764716148376, "learning_rate": 3.387099576893462e-05, "loss": 0.0178, "step": 5528 }, { "epoch": 0.7304554612412062, "grad_norm": 0.15794652700424194, "learning_rate": 3.38398253484587e-05, "loss": 0.0214, "step": 5529 }, { "epoch": 0.7305875747266902, "grad_norm": 0.1341640204191208, "learning_rate": 3.3808666355915954e-05, "loss": 0.0144, "step": 5530 }, { "epoch": 0.7307196882121743, "grad_norm": 0.152041494846344, "learning_rate": 3.377751879668847e-05, "loss": 0.0153, "step": 5531 }, { "epoch": 0.7308518016976583, "grad_norm": 0.10529367625713348, "learning_rate": 3.374638267615643e-05, "loss": 0.0043, "step": 5532 }, { "epoch": 0.7309839151831423, "grad_norm": 0.13288097083568573, "learning_rate": 3.371525799969806e-05, "loss": 0.0159, "step": 5533 }, { "epoch": 0.7311160286686263, "grad_norm": 0.1831805258989334, "learning_rate": 3.3684144772689494e-05, "loss": 0.0204, "step": 5534 }, { "epoch": 0.7312481421541104, "grad_norm": 0.21946914494037628, "learning_rate": 3.365304300050499e-05, "loss": 0.0278, "step": 5535 }, { "epoch": 0.7313802556395944, "grad_norm": 0.17427095770835876, "learning_rate": 3.36219526885168e-05, "loss": 0.0222, "step": 5536 }, { "epoch": 0.7315123691250784, "grad_norm": 0.10152660310268402, "learning_rate": 3.359087384209523e-05, "loss": 0.0108, "step": 5537 }, { "epoch": 0.7316444826105625, "grad_norm": 0.21317848563194275, "learning_rate": 3.35598064666085e-05, "loss": 0.0276, "step": 5538 }, { "epoch": 0.7317765960960465, "grad_norm": 0.10813872516155243, "learning_rate": 3.352875056742295e-05, "loss": 0.0065, "step": 5539 }, { "epoch": 0.7319087095815305, "grad_norm": 0.13802549242973328, "learning_rate": 3.3497706149902944e-05, "loss": 0.0146, "step": 5540 }, { "epoch": 0.7320408230670146, "grad_norm": 0.1580958366394043, "learning_rate": 3.346667321941076e-05, "loss": 0.0158, "step": 5541 }, { "epoch": 0.7321729365524986, "grad_norm": 0.1649957150220871, "learning_rate": 3.343565178130678e-05, "loss": 0.0171, "step": 5542 }, { "epoch": 0.7323050500379826, "grad_norm": 0.1353890597820282, "learning_rate": 3.340464184094938e-05, "loss": 0.0075, "step": 5543 }, { "epoch": 0.7324371635234667, "grad_norm": 0.14383232593536377, "learning_rate": 3.337364340369499e-05, "loss": 0.0205, "step": 5544 }, { "epoch": 0.7325692770089507, "grad_norm": 0.12641078233718872, "learning_rate": 3.334265647489794e-05, "loss": 0.0116, "step": 5545 }, { "epoch": 0.7327013904944347, "grad_norm": 0.1792270541191101, "learning_rate": 3.331168105991067e-05, "loss": 0.0248, "step": 5546 }, { "epoch": 0.7328335039799188, "grad_norm": 0.14551857113838196, "learning_rate": 3.328071716408364e-05, "loss": 0.0102, "step": 5547 }, { "epoch": 0.7329656174654028, "grad_norm": 0.15047802031040192, "learning_rate": 3.324976479276518e-05, "loss": 0.0043, "step": 5548 }, { "epoch": 0.7330977309508868, "grad_norm": 0.20460718870162964, "learning_rate": 3.321882395130185e-05, "loss": 0.0155, "step": 5549 }, { "epoch": 0.7332298444363708, "grad_norm": 0.09014448523521423, "learning_rate": 3.318789464503808e-05, "loss": 0.0095, "step": 5550 }, { "epoch": 0.7333619579218549, "grad_norm": 0.145356684923172, "learning_rate": 3.315697687931627e-05, "loss": 0.0129, "step": 5551 }, { "epoch": 0.7334940714073389, "grad_norm": 0.29295435547828674, "learning_rate": 3.312607065947693e-05, "loss": 0.0171, "step": 5552 }, { "epoch": 0.7336261848928229, "grad_norm": 0.2355080246925354, "learning_rate": 3.309517599085855e-05, "loss": 0.0095, "step": 5553 }, { "epoch": 0.733758298378307, "grad_norm": 0.17795896530151367, "learning_rate": 3.3064292878797556e-05, "loss": 0.014, "step": 5554 }, { "epoch": 0.733890411863791, "grad_norm": 0.1291665881872177, "learning_rate": 3.3033421328628447e-05, "loss": 0.0163, "step": 5555 }, { "epoch": 0.734022525349275, "grad_norm": 0.27356666326522827, "learning_rate": 3.3002561345683715e-05, "loss": 0.0275, "step": 5556 }, { "epoch": 0.7341546388347591, "grad_norm": 0.22058287262916565, "learning_rate": 3.297171293529386e-05, "loss": 0.0217, "step": 5557 }, { "epoch": 0.7342867523202431, "grad_norm": 0.12218068540096283, "learning_rate": 3.2940876102787336e-05, "loss": 0.016, "step": 5558 }, { "epoch": 0.7344188658057271, "grad_norm": 0.07197923213243484, "learning_rate": 3.291005085349062e-05, "loss": 0.0077, "step": 5559 }, { "epoch": 0.7345509792912112, "grad_norm": 0.1551055759191513, "learning_rate": 3.2879237192728276e-05, "loss": 0.024, "step": 5560 }, { "epoch": 0.7346830927766952, "grad_norm": 0.24776233732700348, "learning_rate": 3.284843512582268e-05, "loss": 0.0305, "step": 5561 }, { "epoch": 0.7348152062621792, "grad_norm": 0.20823752880096436, "learning_rate": 3.2817644658094384e-05, "loss": 0.0215, "step": 5562 }, { "epoch": 0.7349473197476633, "grad_norm": 0.17506052553653717, "learning_rate": 3.278686579486183e-05, "loss": 0.0101, "step": 5563 }, { "epoch": 0.7350794332331473, "grad_norm": 0.14158056676387787, "learning_rate": 3.275609854144155e-05, "loss": 0.0124, "step": 5564 }, { "epoch": 0.7352115467186313, "grad_norm": 0.14460735023021698, "learning_rate": 3.2725342903147936e-05, "loss": 0.0201, "step": 5565 }, { "epoch": 0.7353436602041153, "grad_norm": 0.2015376091003418, "learning_rate": 3.2694598885293485e-05, "loss": 0.0197, "step": 5566 }, { "epoch": 0.7354757736895994, "grad_norm": 0.1528630405664444, "learning_rate": 3.266386649318868e-05, "loss": 0.0114, "step": 5567 }, { "epoch": 0.7356078871750834, "grad_norm": 0.2069101631641388, "learning_rate": 3.263314573214189e-05, "loss": 0.0239, "step": 5568 }, { "epoch": 0.7357400006605674, "grad_norm": 0.1767331212759018, "learning_rate": 3.260243660745961e-05, "loss": 0.0151, "step": 5569 }, { "epoch": 0.7358721141460515, "grad_norm": 0.13966885209083557, "learning_rate": 3.2571739124446255e-05, "loss": 0.0186, "step": 5570 }, { "epoch": 0.7360042276315355, "grad_norm": 0.16529060900211334, "learning_rate": 3.254105328840428e-05, "loss": 0.0178, "step": 5571 }, { "epoch": 0.7361363411170195, "grad_norm": 0.11949215084314346, "learning_rate": 3.2510379104634e-05, "loss": 0.0082, "step": 5572 }, { "epoch": 0.7362684546025036, "grad_norm": 0.1410934329032898, "learning_rate": 3.2479716578433884e-05, "loss": 0.017, "step": 5573 }, { "epoch": 0.7364005680879876, "grad_norm": 0.1328480988740921, "learning_rate": 3.24490657151003e-05, "loss": 0.0179, "step": 5574 }, { "epoch": 0.7365326815734716, "grad_norm": 0.23808583617210388, "learning_rate": 3.241842651992757e-05, "loss": 0.0139, "step": 5575 }, { "epoch": 0.7366647950589557, "grad_norm": 0.7010180354118347, "learning_rate": 3.2387798998208064e-05, "loss": 0.014, "step": 5576 }, { "epoch": 0.7367969085444397, "grad_norm": 0.16123968362808228, "learning_rate": 3.2357183155232106e-05, "loss": 0.0109, "step": 5577 }, { "epoch": 0.7369290220299237, "grad_norm": 0.2551591992378235, "learning_rate": 3.232657899628807e-05, "loss": 0.0156, "step": 5578 }, { "epoch": 0.7370611355154078, "grad_norm": 0.2279757261276245, "learning_rate": 3.229598652666217e-05, "loss": 0.0254, "step": 5579 }, { "epoch": 0.7371932490008918, "grad_norm": 0.21289393305778503, "learning_rate": 3.226540575163871e-05, "loss": 0.0187, "step": 5580 }, { "epoch": 0.7373253624863758, "grad_norm": 0.11979267001152039, "learning_rate": 3.223483667649999e-05, "loss": 0.0192, "step": 5581 }, { "epoch": 0.7374574759718598, "grad_norm": 0.21272334456443787, "learning_rate": 3.2204279306526175e-05, "loss": 0.0211, "step": 5582 }, { "epoch": 0.7375895894573439, "grad_norm": 0.1978456676006317, "learning_rate": 3.2173733646995516e-05, "loss": 0.0165, "step": 5583 }, { "epoch": 0.7377217029428279, "grad_norm": 0.13364800810813904, "learning_rate": 3.214319970318421e-05, "loss": 0.0133, "step": 5584 }, { "epoch": 0.7378538164283119, "grad_norm": 0.15159595012664795, "learning_rate": 3.211267748036645e-05, "loss": 0.0245, "step": 5585 }, { "epoch": 0.737985929913796, "grad_norm": 0.4125491976737976, "learning_rate": 3.208216698381431e-05, "loss": 0.046, "step": 5586 }, { "epoch": 0.73811804339928, "grad_norm": 0.2198030799627304, "learning_rate": 3.205166821879795e-05, "loss": 0.0164, "step": 5587 }, { "epoch": 0.738250156884764, "grad_norm": 0.13166871666908264, "learning_rate": 3.202118119058548e-05, "loss": 0.0155, "step": 5588 }, { "epoch": 0.7383822703702481, "grad_norm": 0.17269164323806763, "learning_rate": 3.199070590444292e-05, "loss": 0.0135, "step": 5589 }, { "epoch": 0.7385143838557321, "grad_norm": 0.13313475251197815, "learning_rate": 3.1960242365634316e-05, "loss": 0.0118, "step": 5590 }, { "epoch": 0.7386464973412161, "grad_norm": 0.15216787159442902, "learning_rate": 3.192979057942169e-05, "loss": 0.0144, "step": 5591 }, { "epoch": 0.7387786108267002, "grad_norm": 0.17991912364959717, "learning_rate": 3.189935055106506e-05, "loss": 0.0167, "step": 5592 }, { "epoch": 0.7389107243121842, "grad_norm": 0.20824044942855835, "learning_rate": 3.1868922285822265e-05, "loss": 0.0209, "step": 5593 }, { "epoch": 0.7390428377976682, "grad_norm": 0.15515249967575073, "learning_rate": 3.18385057889493e-05, "loss": 0.0092, "step": 5594 }, { "epoch": 0.7391749512831522, "grad_norm": 0.15328307449817657, "learning_rate": 3.180810106570006e-05, "loss": 0.0176, "step": 5595 }, { "epoch": 0.7393070647686363, "grad_norm": 0.15170076489448547, "learning_rate": 3.1777708121326324e-05, "loss": 0.0118, "step": 5596 }, { "epoch": 0.7394391782541203, "grad_norm": 0.12343598157167435, "learning_rate": 3.174732696107793e-05, "loss": 0.0148, "step": 5597 }, { "epoch": 0.7395712917396043, "grad_norm": 0.16363227367401123, "learning_rate": 3.171695759020267e-05, "loss": 0.0238, "step": 5598 }, { "epoch": 0.7397034052250884, "grad_norm": 0.15410636365413666, "learning_rate": 3.168660001394631e-05, "loss": 0.019, "step": 5599 }, { "epoch": 0.7398355187105724, "grad_norm": 0.1777234524488449, "learning_rate": 3.1656254237552495e-05, "loss": 0.0189, "step": 5600 }, { "epoch": 0.7399676321960564, "grad_norm": 0.1454850733280182, "learning_rate": 3.162592026626291e-05, "loss": 0.0137, "step": 5601 }, { "epoch": 0.7400997456815405, "grad_norm": 0.31561899185180664, "learning_rate": 3.159559810531724e-05, "loss": 0.012, "step": 5602 }, { "epoch": 0.7402318591670245, "grad_norm": 0.13987892866134644, "learning_rate": 3.156528775995298e-05, "loss": 0.0097, "step": 5603 }, { "epoch": 0.7403639726525085, "grad_norm": 0.17761053144931793, "learning_rate": 3.153498923540571e-05, "loss": 0.0116, "step": 5604 }, { "epoch": 0.7404960861379926, "grad_norm": 0.1544499695301056, "learning_rate": 3.1504702536908946e-05, "loss": 0.0168, "step": 5605 }, { "epoch": 0.7406281996234766, "grad_norm": 0.15382182598114014, "learning_rate": 3.147442766969417e-05, "loss": 0.0166, "step": 5606 }, { "epoch": 0.7407603131089606, "grad_norm": 0.1814822107553482, "learning_rate": 3.144416463899071e-05, "loss": 0.0158, "step": 5607 }, { "epoch": 0.7408924265944447, "grad_norm": 0.13609567284584045, "learning_rate": 3.1413913450026047e-05, "loss": 0.0164, "step": 5608 }, { "epoch": 0.7410245400799287, "grad_norm": 0.175263911485672, "learning_rate": 3.1383674108025484e-05, "loss": 0.0176, "step": 5609 }, { "epoch": 0.7411566535654127, "grad_norm": 0.15628427267074585, "learning_rate": 3.135344661821226e-05, "loss": 0.0175, "step": 5610 }, { "epoch": 0.7412887670508967, "grad_norm": 0.22614404559135437, "learning_rate": 3.1323230985807614e-05, "loss": 0.0382, "step": 5611 }, { "epoch": 0.7414208805363808, "grad_norm": 0.1333487331867218, "learning_rate": 3.129302721603078e-05, "loss": 0.0138, "step": 5612 }, { "epoch": 0.7415529940218648, "grad_norm": 0.25842511653900146, "learning_rate": 3.1262835314098835e-05, "loss": 0.0177, "step": 5613 }, { "epoch": 0.7416851075073488, "grad_norm": 0.17177551984786987, "learning_rate": 3.12326552852269e-05, "loss": 0.0138, "step": 5614 }, { "epoch": 0.7418172209928329, "grad_norm": 0.21476882696151733, "learning_rate": 3.120248713462799e-05, "loss": 0.017, "step": 5615 }, { "epoch": 0.7419493344783169, "grad_norm": 0.11701980978250504, "learning_rate": 3.1172330867513135e-05, "loss": 0.0123, "step": 5616 }, { "epoch": 0.7420814479638009, "grad_norm": 0.17461326718330383, "learning_rate": 3.1142186489091206e-05, "loss": 0.0193, "step": 5617 }, { "epoch": 0.742213561449285, "grad_norm": 0.1957211196422577, "learning_rate": 3.11120540045691e-05, "loss": 0.0195, "step": 5618 }, { "epoch": 0.742345674934769, "grad_norm": 0.15636511147022247, "learning_rate": 3.108193341915169e-05, "loss": 0.0189, "step": 5619 }, { "epoch": 0.742477788420253, "grad_norm": 0.12543295323848724, "learning_rate": 3.1051824738041666e-05, "loss": 0.0142, "step": 5620 }, { "epoch": 0.7426099019057371, "grad_norm": 0.19471098482608795, "learning_rate": 3.1021727966439773e-05, "loss": 0.0189, "step": 5621 }, { "epoch": 0.7427420153912211, "grad_norm": 0.12781931459903717, "learning_rate": 3.099164310954468e-05, "loss": 0.0107, "step": 5622 }, { "epoch": 0.7428741288767051, "grad_norm": 0.2116575390100479, "learning_rate": 3.096157017255299e-05, "loss": 0.0194, "step": 5623 }, { "epoch": 0.7430062423621892, "grad_norm": 0.16661399602890015, "learning_rate": 3.09315091606592e-05, "loss": 0.0139, "step": 5624 }, { "epoch": 0.7431383558476732, "grad_norm": 0.13406845927238464, "learning_rate": 3.09014600790558e-05, "loss": 0.0107, "step": 5625 }, { "epoch": 0.7432704693331572, "grad_norm": 0.16295015811920166, "learning_rate": 3.087142293293326e-05, "loss": 0.0158, "step": 5626 }, { "epoch": 0.7434025828186412, "grad_norm": 0.15222692489624023, "learning_rate": 3.084139772747985e-05, "loss": 0.0111, "step": 5627 }, { "epoch": 0.7435346963041253, "grad_norm": 0.14454548060894012, "learning_rate": 3.081138446788191e-05, "loss": 0.0115, "step": 5628 }, { "epoch": 0.7436668097896093, "grad_norm": 0.14269648492336273, "learning_rate": 3.078138315932366e-05, "loss": 0.0125, "step": 5629 }, { "epoch": 0.7437989232750933, "grad_norm": 0.30650556087493896, "learning_rate": 3.07513938069873e-05, "loss": 0.0176, "step": 5630 }, { "epoch": 0.7439310367605774, "grad_norm": 0.12681737542152405, "learning_rate": 3.0721416416052884e-05, "loss": 0.0093, "step": 5631 }, { "epoch": 0.7440631502460614, "grad_norm": 0.19957475364208221, "learning_rate": 3.0691450991698456e-05, "loss": 0.0139, "step": 5632 }, { "epoch": 0.7441952637315454, "grad_norm": 0.15414023399353027, "learning_rate": 3.066149753910002e-05, "loss": 0.0081, "step": 5633 }, { "epoch": 0.7443273772170295, "grad_norm": 0.12280531227588654, "learning_rate": 3.06315560634314e-05, "loss": 0.0138, "step": 5634 }, { "epoch": 0.7444594907025135, "grad_norm": 0.14327071607112885, "learning_rate": 3.060162656986448e-05, "loss": 0.0107, "step": 5635 }, { "epoch": 0.7445916041879975, "grad_norm": 0.13788668811321259, "learning_rate": 3.057170906356901e-05, "loss": 0.018, "step": 5636 }, { "epoch": 0.7447237176734816, "grad_norm": 0.1548607051372528, "learning_rate": 3.05418035497127e-05, "loss": 0.0116, "step": 5637 }, { "epoch": 0.7448558311589656, "grad_norm": 0.363374799489975, "learning_rate": 3.0511910033461134e-05, "loss": 0.0199, "step": 5638 }, { "epoch": 0.7449879446444496, "grad_norm": 0.1580091416835785, "learning_rate": 3.0482028519977857e-05, "loss": 0.0158, "step": 5639 }, { "epoch": 0.7451200581299336, "grad_norm": 0.18859657645225525, "learning_rate": 3.0452159014424396e-05, "loss": 0.0164, "step": 5640 }, { "epoch": 0.7452521716154177, "grad_norm": 0.16260674595832825, "learning_rate": 3.0422301521960074e-05, "loss": 0.0138, "step": 5641 }, { "epoch": 0.7453842851009017, "grad_norm": 0.14317110180854797, "learning_rate": 3.0392456047742257e-05, "loss": 0.0134, "step": 5642 }, { "epoch": 0.7455163985863857, "grad_norm": 0.20928920805454254, "learning_rate": 3.036262259692618e-05, "loss": 0.0195, "step": 5643 }, { "epoch": 0.7456485120718698, "grad_norm": 0.2817302644252777, "learning_rate": 3.033280117466506e-05, "loss": 0.0303, "step": 5644 }, { "epoch": 0.7457806255573538, "grad_norm": 0.13203193247318268, "learning_rate": 3.0302991786109913e-05, "loss": 0.0183, "step": 5645 }, { "epoch": 0.7459127390428378, "grad_norm": 0.4010050594806671, "learning_rate": 3.027319443640979e-05, "loss": 0.0273, "step": 5646 }, { "epoch": 0.7460448525283219, "grad_norm": 0.2017243355512619, "learning_rate": 3.0243409130711665e-05, "loss": 0.0275, "step": 5647 }, { "epoch": 0.7461769660138059, "grad_norm": 0.18311437964439392, "learning_rate": 3.0213635874160316e-05, "loss": 0.0144, "step": 5648 }, { "epoch": 0.7463090794992899, "grad_norm": 0.11720288544893265, "learning_rate": 3.018387467189856e-05, "loss": 0.0105, "step": 5649 }, { "epoch": 0.746441192984774, "grad_norm": 0.11851353943347931, "learning_rate": 3.015412552906708e-05, "loss": 0.009, "step": 5650 }, { "epoch": 0.746573306470258, "grad_norm": 0.10665460675954819, "learning_rate": 3.012438845080452e-05, "loss": 0.0088, "step": 5651 }, { "epoch": 0.746705419955742, "grad_norm": 0.11494076997041702, "learning_rate": 3.009466344224734e-05, "loss": 0.0131, "step": 5652 }, { "epoch": 0.746837533441226, "grad_norm": 0.12239422649145126, "learning_rate": 3.006495050853001e-05, "loss": 0.0084, "step": 5653 }, { "epoch": 0.7469696469267101, "grad_norm": 0.12373550236225128, "learning_rate": 3.0035249654784926e-05, "loss": 0.0072, "step": 5654 }, { "epoch": 0.7471017604121941, "grad_norm": 0.17365232110023499, "learning_rate": 3.000556088614227e-05, "loss": 0.0136, "step": 5655 }, { "epoch": 0.7472338738976781, "grad_norm": 0.17435801029205322, "learning_rate": 2.9975884207730275e-05, "loss": 0.0148, "step": 5656 }, { "epoch": 0.7473659873831622, "grad_norm": 0.17454983294010162, "learning_rate": 2.994621962467502e-05, "loss": 0.0249, "step": 5657 }, { "epoch": 0.7474981008686462, "grad_norm": 0.31457021832466125, "learning_rate": 2.9916567142100538e-05, "loss": 0.0167, "step": 5658 }, { "epoch": 0.7476302143541302, "grad_norm": 0.24514144659042358, "learning_rate": 2.9886926765128688e-05, "loss": 0.0192, "step": 5659 }, { "epoch": 0.7477623278396143, "grad_norm": 0.18715476989746094, "learning_rate": 2.9857298498879306e-05, "loss": 0.0193, "step": 5660 }, { "epoch": 0.7478944413250983, "grad_norm": 0.15529604256153107, "learning_rate": 2.9827682348470178e-05, "loss": 0.0119, "step": 5661 }, { "epoch": 0.7480265548105823, "grad_norm": 0.18206310272216797, "learning_rate": 2.979807831901684e-05, "loss": 0.0168, "step": 5662 }, { "epoch": 0.7481586682960664, "grad_norm": 0.14686575531959534, "learning_rate": 2.9768486415632914e-05, "loss": 0.0174, "step": 5663 }, { "epoch": 0.7482907817815504, "grad_norm": 0.14675019681453705, "learning_rate": 2.973890664342981e-05, "loss": 0.0185, "step": 5664 }, { "epoch": 0.7484228952670344, "grad_norm": 0.1723177284002304, "learning_rate": 2.970933900751689e-05, "loss": 0.0167, "step": 5665 }, { "epoch": 0.7485550087525185, "grad_norm": 0.1251523345708847, "learning_rate": 2.9679783513001412e-05, "loss": 0.0125, "step": 5666 }, { "epoch": 0.7486871222380025, "grad_norm": 0.1644349992275238, "learning_rate": 2.9650240164988563e-05, "loss": 0.0217, "step": 5667 }, { "epoch": 0.7488192357234865, "grad_norm": 0.17225313186645508, "learning_rate": 2.9620708968581356e-05, "loss": 0.018, "step": 5668 }, { "epoch": 0.7489513492089706, "grad_norm": 0.19602759182453156, "learning_rate": 2.959118992888077e-05, "loss": 0.0189, "step": 5669 }, { "epoch": 0.7490834626944546, "grad_norm": 0.14376594126224518, "learning_rate": 2.9561683050985677e-05, "loss": 0.0159, "step": 5670 }, { "epoch": 0.7492155761799386, "grad_norm": 0.1839967966079712, "learning_rate": 2.953218833999285e-05, "loss": 0.0209, "step": 5671 }, { "epoch": 0.7493476896654226, "grad_norm": 0.20183296501636505, "learning_rate": 2.950270580099691e-05, "loss": 0.0253, "step": 5672 }, { "epoch": 0.7494798031509067, "grad_norm": 0.24092161655426025, "learning_rate": 2.947323543909044e-05, "loss": 0.0162, "step": 5673 }, { "epoch": 0.7496119166363907, "grad_norm": 0.3723273277282715, "learning_rate": 2.9443777259363912e-05, "loss": 0.0208, "step": 5674 }, { "epoch": 0.7497440301218747, "grad_norm": 0.19728074967861176, "learning_rate": 2.9414331266905627e-05, "loss": 0.0167, "step": 5675 }, { "epoch": 0.7498761436073588, "grad_norm": 0.12510357797145844, "learning_rate": 2.9384897466801852e-05, "loss": 0.0131, "step": 5676 }, { "epoch": 0.7500082570928428, "grad_norm": 0.22277474403381348, "learning_rate": 2.935547586413674e-05, "loss": 0.0152, "step": 5677 }, { "epoch": 0.7501403705783268, "grad_norm": 0.1679277867078781, "learning_rate": 2.932606646399233e-05, "loss": 0.0209, "step": 5678 }, { "epoch": 0.7502724840638109, "grad_norm": 0.1746317446231842, "learning_rate": 2.929666927144851e-05, "loss": 0.0092, "step": 5679 }, { "epoch": 0.7504045975492949, "grad_norm": 0.14948616921901703, "learning_rate": 2.926728429158311e-05, "loss": 0.0181, "step": 5680 }, { "epoch": 0.7505367110347789, "grad_norm": 0.13942132890224457, "learning_rate": 2.9237911529471862e-05, "loss": 0.0109, "step": 5681 }, { "epoch": 0.750668824520263, "grad_norm": 0.1250900775194168, "learning_rate": 2.9208550990188312e-05, "loss": 0.0168, "step": 5682 }, { "epoch": 0.750800938005747, "grad_norm": 0.1930038183927536, "learning_rate": 2.9179202678803973e-05, "loss": 0.0216, "step": 5683 }, { "epoch": 0.750933051491231, "grad_norm": 0.14589211344718933, "learning_rate": 2.914986660038822e-05, "loss": 0.0134, "step": 5684 }, { "epoch": 0.751065164976715, "grad_norm": 0.19304388761520386, "learning_rate": 2.912054276000834e-05, "loss": 0.0155, "step": 5685 }, { "epoch": 0.7511972784621991, "grad_norm": 0.2510669231414795, "learning_rate": 2.9091231162729403e-05, "loss": 0.0218, "step": 5686 }, { "epoch": 0.7513293919476831, "grad_norm": 0.13263855874538422, "learning_rate": 2.9061931813614497e-05, "loss": 0.0118, "step": 5687 }, { "epoch": 0.7514615054331671, "grad_norm": 0.1308731585741043, "learning_rate": 2.9032644717724543e-05, "loss": 0.0129, "step": 5688 }, { "epoch": 0.7515936189186512, "grad_norm": 0.453821063041687, "learning_rate": 2.900336988011829e-05, "loss": 0.0257, "step": 5689 }, { "epoch": 0.7517257324041352, "grad_norm": 0.14083927869796753, "learning_rate": 2.897410730585245e-05, "loss": 0.0118, "step": 5690 }, { "epoch": 0.7518578458896192, "grad_norm": 0.15414288640022278, "learning_rate": 2.8944856999981572e-05, "loss": 0.0137, "step": 5691 }, { "epoch": 0.7519899593751033, "grad_norm": 0.2339172065258026, "learning_rate": 2.8915618967558144e-05, "loss": 0.024, "step": 5692 }, { "epoch": 0.7521220728605873, "grad_norm": 0.17207179963588715, "learning_rate": 2.8886393213632435e-05, "loss": 0.0183, "step": 5693 }, { "epoch": 0.7522541863460713, "grad_norm": 0.08931787312030792, "learning_rate": 2.885717974325266e-05, "loss": 0.0093, "step": 5694 }, { "epoch": 0.7523862998315554, "grad_norm": 0.1811581701040268, "learning_rate": 2.8827978561464943e-05, "loss": 0.0285, "step": 5695 }, { "epoch": 0.7525184133170394, "grad_norm": 0.14569365978240967, "learning_rate": 2.8798789673313164e-05, "loss": 0.0099, "step": 5696 }, { "epoch": 0.7526505268025234, "grad_norm": 0.3001488149166107, "learning_rate": 2.8769613083839208e-05, "loss": 0.0276, "step": 5697 }, { "epoch": 0.7527826402880073, "grad_norm": 0.15972566604614258, "learning_rate": 2.8740448798082786e-05, "loss": 0.0101, "step": 5698 }, { "epoch": 0.7529147537734914, "grad_norm": 0.1297338753938675, "learning_rate": 2.871129682108149e-05, "loss": 0.0174, "step": 5699 }, { "epoch": 0.7530468672589754, "grad_norm": 0.19060315191745758, "learning_rate": 2.868215715787075e-05, "loss": 0.0276, "step": 5700 }, { "epoch": 0.7531789807444594, "grad_norm": 0.11152473092079163, "learning_rate": 2.86530298134839e-05, "loss": 0.0102, "step": 5701 }, { "epoch": 0.7533110942299435, "grad_norm": 0.2610558271408081, "learning_rate": 2.8623914792952188e-05, "loss": 0.0196, "step": 5702 }, { "epoch": 0.7534432077154275, "grad_norm": 0.1260378360748291, "learning_rate": 2.8594812101304624e-05, "loss": 0.0125, "step": 5703 }, { "epoch": 0.7535753212009115, "grad_norm": 0.14703889191150665, "learning_rate": 2.8565721743568195e-05, "loss": 0.0132, "step": 5704 }, { "epoch": 0.7537074346863956, "grad_norm": 0.27930232882499695, "learning_rate": 2.853664372476771e-05, "loss": 0.0163, "step": 5705 }, { "epoch": 0.7538395481718796, "grad_norm": 0.17410196363925934, "learning_rate": 2.8507578049925875e-05, "loss": 0.0173, "step": 5706 }, { "epoch": 0.7539716616573636, "grad_norm": 0.1297542154788971, "learning_rate": 2.8478524724063195e-05, "loss": 0.0127, "step": 5707 }, { "epoch": 0.7541037751428477, "grad_norm": 0.18584750592708588, "learning_rate": 2.84494837521981e-05, "loss": 0.0179, "step": 5708 }, { "epoch": 0.7542358886283317, "grad_norm": 0.1973746418952942, "learning_rate": 2.8420455139346935e-05, "loss": 0.0144, "step": 5709 }, { "epoch": 0.7543680021138157, "grad_norm": 0.18765583634376526, "learning_rate": 2.8391438890523757e-05, "loss": 0.0161, "step": 5710 }, { "epoch": 0.7545001155992997, "grad_norm": 0.11874744296073914, "learning_rate": 2.836243501074064e-05, "loss": 0.0082, "step": 5711 }, { "epoch": 0.7546322290847838, "grad_norm": 0.17037716507911682, "learning_rate": 2.833344350500744e-05, "loss": 0.0158, "step": 5712 }, { "epoch": 0.7547643425702678, "grad_norm": 0.13802199065685272, "learning_rate": 2.830446437833193e-05, "loss": 0.0136, "step": 5713 }, { "epoch": 0.7548964560557518, "grad_norm": 0.15014030039310455, "learning_rate": 2.8275497635719663e-05, "loss": 0.0152, "step": 5714 }, { "epoch": 0.7550285695412359, "grad_norm": 0.30440592765808105, "learning_rate": 2.824654328217413e-05, "loss": 0.0265, "step": 5715 }, { "epoch": 0.7551606830267199, "grad_norm": 0.1533273160457611, "learning_rate": 2.8217601322696675e-05, "loss": 0.0176, "step": 5716 }, { "epoch": 0.7552927965122039, "grad_norm": 0.09308493882417679, "learning_rate": 2.8188671762286434e-05, "loss": 0.0033, "step": 5717 }, { "epoch": 0.755424909997688, "grad_norm": 0.10201088339090347, "learning_rate": 2.815975460594047e-05, "loss": 0.0094, "step": 5718 }, { "epoch": 0.755557023483172, "grad_norm": 0.18083249032497406, "learning_rate": 2.8130849858653673e-05, "loss": 0.0194, "step": 5719 }, { "epoch": 0.755689136968656, "grad_norm": 0.14451417326927185, "learning_rate": 2.8101957525418842e-05, "loss": 0.0125, "step": 5720 }, { "epoch": 0.7558212504541401, "grad_norm": 0.10200035572052002, "learning_rate": 2.8073077611226518e-05, "loss": 0.0086, "step": 5721 }, { "epoch": 0.7559533639396241, "grad_norm": 0.1436675488948822, "learning_rate": 2.8044210121065195e-05, "loss": 0.011, "step": 5722 }, { "epoch": 0.7560854774251081, "grad_norm": 0.2443426251411438, "learning_rate": 2.8015355059921235e-05, "loss": 0.0392, "step": 5723 }, { "epoch": 0.7562175909105922, "grad_norm": 0.18814495205879211, "learning_rate": 2.798651243277871e-05, "loss": 0.026, "step": 5724 }, { "epoch": 0.7563497043960762, "grad_norm": 0.20542608201503754, "learning_rate": 2.7957682244619733e-05, "loss": 0.0227, "step": 5725 }, { "epoch": 0.7564818178815602, "grad_norm": 0.1900263875722885, "learning_rate": 2.792886450042419e-05, "loss": 0.0185, "step": 5726 }, { "epoch": 0.7566139313670442, "grad_norm": 0.12506645917892456, "learning_rate": 2.790005920516974e-05, "loss": 0.0144, "step": 5727 }, { "epoch": 0.7567460448525283, "grad_norm": 0.14222222566604614, "learning_rate": 2.7871266363831983e-05, "loss": 0.0195, "step": 5728 }, { "epoch": 0.7568781583380123, "grad_norm": 0.13620705902576447, "learning_rate": 2.784248598138435e-05, "loss": 0.0264, "step": 5729 }, { "epoch": 0.7570102718234963, "grad_norm": 0.17226412892341614, "learning_rate": 2.7813718062798156e-05, "loss": 0.0133, "step": 5730 }, { "epoch": 0.7571423853089804, "grad_norm": 0.14862510561943054, "learning_rate": 2.7784962613042442e-05, "loss": 0.0164, "step": 5731 }, { "epoch": 0.7572744987944644, "grad_norm": 0.18596234917640686, "learning_rate": 2.7756219637084212e-05, "loss": 0.0186, "step": 5732 }, { "epoch": 0.7574066122799484, "grad_norm": 0.1989210993051529, "learning_rate": 2.772748913988832e-05, "loss": 0.0146, "step": 5733 }, { "epoch": 0.7575387257654325, "grad_norm": 0.18087489902973175, "learning_rate": 2.7698771126417333e-05, "loss": 0.0189, "step": 5734 }, { "epoch": 0.7576708392509165, "grad_norm": 0.10741331428289413, "learning_rate": 2.767006560163181e-05, "loss": 0.0104, "step": 5735 }, { "epoch": 0.7578029527364005, "grad_norm": 0.14782588183879852, "learning_rate": 2.7641372570490076e-05, "loss": 0.0197, "step": 5736 }, { "epoch": 0.7579350662218846, "grad_norm": 0.16277161240577698, "learning_rate": 2.7612692037948352e-05, "loss": 0.0123, "step": 5737 }, { "epoch": 0.7580671797073686, "grad_norm": 0.17703665792942047, "learning_rate": 2.7584024008960607e-05, "loss": 0.0117, "step": 5738 }, { "epoch": 0.7581992931928526, "grad_norm": 0.3099222779273987, "learning_rate": 2.7555368488478727e-05, "loss": 0.0265, "step": 5739 }, { "epoch": 0.7583314066783367, "grad_norm": 0.18668995797634125, "learning_rate": 2.7526725481452464e-05, "loss": 0.0232, "step": 5740 }, { "epoch": 0.7584635201638207, "grad_norm": 0.213165283203125, "learning_rate": 2.7498094992829283e-05, "loss": 0.0227, "step": 5741 }, { "epoch": 0.7585956336493047, "grad_norm": 0.1654159426689148, "learning_rate": 2.74694770275546e-05, "loss": 0.017, "step": 5742 }, { "epoch": 0.7587277471347887, "grad_norm": 0.12483178824186325, "learning_rate": 2.744087159057165e-05, "loss": 0.0155, "step": 5743 }, { "epoch": 0.7588598606202728, "grad_norm": 0.3348318040370941, "learning_rate": 2.7412278686821502e-05, "loss": 0.0245, "step": 5744 }, { "epoch": 0.7589919741057568, "grad_norm": 0.16964954137802124, "learning_rate": 2.738369832124298e-05, "loss": 0.022, "step": 5745 }, { "epoch": 0.7591240875912408, "grad_norm": 0.21486179530620575, "learning_rate": 2.735513049877285e-05, "loss": 0.0209, "step": 5746 }, { "epoch": 0.7592562010767249, "grad_norm": 0.15283586084842682, "learning_rate": 2.7326575224345697e-05, "loss": 0.0169, "step": 5747 }, { "epoch": 0.7593883145622089, "grad_norm": 0.15681065618991852, "learning_rate": 2.7298032502893855e-05, "loss": 0.0141, "step": 5748 }, { "epoch": 0.7595204280476929, "grad_norm": 0.26059630513191223, "learning_rate": 2.7269502339347564e-05, "loss": 0.0222, "step": 5749 }, { "epoch": 0.759652541533177, "grad_norm": 0.15516114234924316, "learning_rate": 2.7240984738634877e-05, "loss": 0.0161, "step": 5750 }, { "epoch": 0.759784655018661, "grad_norm": 0.16622744500637054, "learning_rate": 2.7212479705681715e-05, "loss": 0.02, "step": 5751 }, { "epoch": 0.759916768504145, "grad_norm": 0.17459112405776978, "learning_rate": 2.7183987245411724e-05, "loss": 0.0199, "step": 5752 }, { "epoch": 0.760048881989629, "grad_norm": 0.17584876716136932, "learning_rate": 2.7155507362746478e-05, "loss": 0.0197, "step": 5753 }, { "epoch": 0.7601809954751131, "grad_norm": 0.1639474630355835, "learning_rate": 2.712704006260538e-05, "loss": 0.0188, "step": 5754 }, { "epoch": 0.7603131089605971, "grad_norm": 0.15318681299686432, "learning_rate": 2.7098585349905547e-05, "loss": 0.0157, "step": 5755 }, { "epoch": 0.7604452224460811, "grad_norm": 0.15242336690425873, "learning_rate": 2.707014322956204e-05, "loss": 0.0154, "step": 5756 }, { "epoch": 0.7605773359315652, "grad_norm": 0.17166076600551605, "learning_rate": 2.7041713706487692e-05, "loss": 0.0214, "step": 5757 }, { "epoch": 0.7607094494170492, "grad_norm": 0.3508407771587372, "learning_rate": 2.7013296785593223e-05, "loss": 0.0325, "step": 5758 }, { "epoch": 0.7608415629025332, "grad_norm": 0.22720810770988464, "learning_rate": 2.698489247178705e-05, "loss": 0.0274, "step": 5759 }, { "epoch": 0.7609736763880173, "grad_norm": 0.16163846850395203, "learning_rate": 2.6956500769975512e-05, "loss": 0.0179, "step": 5760 }, { "epoch": 0.7611057898735013, "grad_norm": 0.12275891751050949, "learning_rate": 2.692812168506278e-05, "loss": 0.0135, "step": 5761 }, { "epoch": 0.7612379033589853, "grad_norm": 0.22080405056476593, "learning_rate": 2.6899755221950764e-05, "loss": 0.0192, "step": 5762 }, { "epoch": 0.7613700168444694, "grad_norm": 0.1974334418773651, "learning_rate": 2.687140138553925e-05, "loss": 0.0113, "step": 5763 }, { "epoch": 0.7615021303299534, "grad_norm": 0.18567024171352386, "learning_rate": 2.6843060180725844e-05, "loss": 0.014, "step": 5764 }, { "epoch": 0.7616342438154374, "grad_norm": 0.25421595573425293, "learning_rate": 2.6814731612405987e-05, "loss": 0.0178, "step": 5765 }, { "epoch": 0.7617663573009215, "grad_norm": 0.09751256555318832, "learning_rate": 2.6786415685472843e-05, "loss": 0.0132, "step": 5766 }, { "epoch": 0.7618984707864055, "grad_norm": 0.26204532384872437, "learning_rate": 2.6758112404817503e-05, "loss": 0.0276, "step": 5767 }, { "epoch": 0.7620305842718895, "grad_norm": 0.15766333043575287, "learning_rate": 2.6729821775328844e-05, "loss": 0.0114, "step": 5768 }, { "epoch": 0.7621626977573736, "grad_norm": 0.15987655520439148, "learning_rate": 2.670154380189349e-05, "loss": 0.0189, "step": 5769 }, { "epoch": 0.7622948112428576, "grad_norm": 0.11225584894418716, "learning_rate": 2.667327848939597e-05, "loss": 0.0091, "step": 5770 }, { "epoch": 0.7624269247283416, "grad_norm": 0.12827295064926147, "learning_rate": 2.6645025842718587e-05, "loss": 0.0116, "step": 5771 }, { "epoch": 0.7625590382138256, "grad_norm": 0.11164995282888412, "learning_rate": 2.6616785866741467e-05, "loss": 0.0133, "step": 5772 }, { "epoch": 0.7626911516993097, "grad_norm": 0.10456161201000214, "learning_rate": 2.65885585663425e-05, "loss": 0.0082, "step": 5773 }, { "epoch": 0.7628232651847937, "grad_norm": 0.11802589893341064, "learning_rate": 2.656034394639745e-05, "loss": 0.0093, "step": 5774 }, { "epoch": 0.7629553786702777, "grad_norm": 0.24839834868907928, "learning_rate": 2.653214201177988e-05, "loss": 0.0264, "step": 5775 }, { "epoch": 0.7630874921557618, "grad_norm": 0.14445337653160095, "learning_rate": 2.6503952767361117e-05, "loss": 0.0189, "step": 5776 }, { "epoch": 0.7632196056412458, "grad_norm": 0.14437027275562286, "learning_rate": 2.647577621801033e-05, "loss": 0.0137, "step": 5777 }, { "epoch": 0.7633517191267298, "grad_norm": 0.14201775193214417, "learning_rate": 2.6447612368594488e-05, "loss": 0.0208, "step": 5778 }, { "epoch": 0.7634838326122139, "grad_norm": 0.21351680159568787, "learning_rate": 2.6419461223978425e-05, "loss": 0.0189, "step": 5779 }, { "epoch": 0.7636159460976979, "grad_norm": 0.1488891988992691, "learning_rate": 2.639132278902464e-05, "loss": 0.0168, "step": 5780 }, { "epoch": 0.7637480595831819, "grad_norm": 0.1703590601682663, "learning_rate": 2.636319706859357e-05, "loss": 0.0144, "step": 5781 }, { "epoch": 0.763880173068666, "grad_norm": 0.2704126536846161, "learning_rate": 2.633508406754339e-05, "loss": 0.0216, "step": 5782 }, { "epoch": 0.76401228655415, "grad_norm": 0.15560157597064972, "learning_rate": 2.6306983790730112e-05, "loss": 0.0152, "step": 5783 }, { "epoch": 0.764144400039634, "grad_norm": 0.15597420930862427, "learning_rate": 2.627889624300752e-05, "loss": 0.0171, "step": 5784 }, { "epoch": 0.764276513525118, "grad_norm": 0.1779235601425171, "learning_rate": 2.6250821429227258e-05, "loss": 0.0118, "step": 5785 }, { "epoch": 0.7644086270106021, "grad_norm": 0.14752568304538727, "learning_rate": 2.6222759354238645e-05, "loss": 0.0182, "step": 5786 }, { "epoch": 0.7645407404960861, "grad_norm": 0.1394970864057541, "learning_rate": 2.6194710022888937e-05, "loss": 0.0212, "step": 5787 }, { "epoch": 0.7646728539815701, "grad_norm": 0.09506494551897049, "learning_rate": 2.6166673440023127e-05, "loss": 0.0101, "step": 5788 }, { "epoch": 0.7648049674670542, "grad_norm": 0.12080918252468109, "learning_rate": 2.613864961048398e-05, "loss": 0.0119, "step": 5789 }, { "epoch": 0.7649370809525382, "grad_norm": 0.17146851122379303, "learning_rate": 2.6110638539112098e-05, "loss": 0.022, "step": 5790 }, { "epoch": 0.7650691944380222, "grad_norm": 0.1401028335094452, "learning_rate": 2.608264023074588e-05, "loss": 0.0162, "step": 5791 }, { "epoch": 0.7652013079235063, "grad_norm": 0.20553302764892578, "learning_rate": 2.605465469022155e-05, "loss": 0.025, "step": 5792 }, { "epoch": 0.7653334214089903, "grad_norm": 0.20330609381198883, "learning_rate": 2.6026681922372998e-05, "loss": 0.016, "step": 5793 }, { "epoch": 0.7654655348944743, "grad_norm": 0.12444961071014404, "learning_rate": 2.5998721932032056e-05, "loss": 0.0132, "step": 5794 }, { "epoch": 0.7655976483799584, "grad_norm": 0.14549531042575836, "learning_rate": 2.5970774724028314e-05, "loss": 0.0066, "step": 5795 }, { "epoch": 0.7657297618654424, "grad_norm": 0.16223883628845215, "learning_rate": 2.5942840303189055e-05, "loss": 0.0266, "step": 5796 }, { "epoch": 0.7658618753509264, "grad_norm": 0.219892218708992, "learning_rate": 2.591491867433946e-05, "loss": 0.0123, "step": 5797 }, { "epoch": 0.7659939888364105, "grad_norm": 0.14039765298366547, "learning_rate": 2.588700984230249e-05, "loss": 0.0223, "step": 5798 }, { "epoch": 0.7661261023218945, "grad_norm": 0.14343780279159546, "learning_rate": 2.5859113811898885e-05, "loss": 0.0137, "step": 5799 }, { "epoch": 0.7662582158073785, "grad_norm": 0.11566592007875443, "learning_rate": 2.5831230587947097e-05, "loss": 0.013, "step": 5800 }, { "epoch": 0.7663903292928625, "grad_norm": 0.5028908252716064, "learning_rate": 2.580336017526348e-05, "loss": 0.0214, "step": 5801 }, { "epoch": 0.7665224427783466, "grad_norm": 0.17451821267604828, "learning_rate": 2.5775502578662148e-05, "loss": 0.0249, "step": 5802 }, { "epoch": 0.7666545562638306, "grad_norm": 0.1618116796016693, "learning_rate": 2.5747657802954918e-05, "loss": 0.0244, "step": 5803 }, { "epoch": 0.7667866697493146, "grad_norm": 0.17778781056404114, "learning_rate": 2.5719825852951484e-05, "loss": 0.0226, "step": 5804 }, { "epoch": 0.7669187832347987, "grad_norm": 0.09891565144062042, "learning_rate": 2.5692006733459294e-05, "loss": 0.0066, "step": 5805 }, { "epoch": 0.7670508967202827, "grad_norm": 0.15232637524604797, "learning_rate": 2.5664200449283627e-05, "loss": 0.0155, "step": 5806 }, { "epoch": 0.7671830102057667, "grad_norm": 0.24721577763557434, "learning_rate": 2.5636407005227413e-05, "loss": 0.0165, "step": 5807 }, { "epoch": 0.7673151236912508, "grad_norm": 0.08634576201438904, "learning_rate": 2.5608626406091507e-05, "loss": 0.0049, "step": 5808 }, { "epoch": 0.7674472371767348, "grad_norm": 0.15494407713413239, "learning_rate": 2.558085865667449e-05, "loss": 0.0153, "step": 5809 }, { "epoch": 0.7675793506622188, "grad_norm": 0.2543340027332306, "learning_rate": 2.555310376177268e-05, "loss": 0.0208, "step": 5810 }, { "epoch": 0.7677114641477029, "grad_norm": 0.1335250288248062, "learning_rate": 2.5525361726180243e-05, "loss": 0.0151, "step": 5811 }, { "epoch": 0.7678435776331869, "grad_norm": 0.1976231038570404, "learning_rate": 2.549763255468909e-05, "loss": 0.0156, "step": 5812 }, { "epoch": 0.7679756911186709, "grad_norm": 0.1935354620218277, "learning_rate": 2.5469916252088954e-05, "loss": 0.024, "step": 5813 }, { "epoch": 0.768107804604155, "grad_norm": 0.1447988748550415, "learning_rate": 2.5442212823167243e-05, "loss": 0.0168, "step": 5814 }, { "epoch": 0.768239918089639, "grad_norm": 0.10164899379014969, "learning_rate": 2.5414522272709253e-05, "loss": 0.015, "step": 5815 }, { "epoch": 0.768372031575123, "grad_norm": 0.07989451289176941, "learning_rate": 2.5386844605498015e-05, "loss": 0.0061, "step": 5816 }, { "epoch": 0.768504145060607, "grad_norm": 0.13421711325645447, "learning_rate": 2.5359179826314283e-05, "loss": 0.0075, "step": 5817 }, { "epoch": 0.7686362585460911, "grad_norm": 0.06816146522760391, "learning_rate": 2.533152793993665e-05, "loss": 0.0044, "step": 5818 }, { "epoch": 0.7687683720315751, "grad_norm": 0.13832999765872955, "learning_rate": 2.5303888951141476e-05, "loss": 0.0162, "step": 5819 }, { "epoch": 0.7689004855170591, "grad_norm": 0.17114047706127167, "learning_rate": 2.5276262864702895e-05, "loss": 0.0185, "step": 5820 }, { "epoch": 0.7690325990025432, "grad_norm": 0.12861768901348114, "learning_rate": 2.5248649685392743e-05, "loss": 0.0118, "step": 5821 }, { "epoch": 0.7691647124880272, "grad_norm": 0.16697217524051666, "learning_rate": 2.5221049417980726e-05, "loss": 0.0116, "step": 5822 }, { "epoch": 0.7692968259735112, "grad_norm": 0.10849463194608688, "learning_rate": 2.5193462067234275e-05, "loss": 0.0052, "step": 5823 }, { "epoch": 0.7694289394589953, "grad_norm": 0.2121121883392334, "learning_rate": 2.516588763791855e-05, "loss": 0.0218, "step": 5824 }, { "epoch": 0.7695610529444793, "grad_norm": 0.23473268747329712, "learning_rate": 2.5138326134796543e-05, "loss": 0.0253, "step": 5825 }, { "epoch": 0.7696931664299633, "grad_norm": 0.1477525532245636, "learning_rate": 2.5110777562628985e-05, "loss": 0.011, "step": 5826 }, { "epoch": 0.7698252799154474, "grad_norm": 0.16129234433174133, "learning_rate": 2.5083241926174406e-05, "loss": 0.0226, "step": 5827 }, { "epoch": 0.7699573934009314, "grad_norm": 0.17213378846645355, "learning_rate": 2.5055719230189013e-05, "loss": 0.0161, "step": 5828 }, { "epoch": 0.7700895068864154, "grad_norm": 0.16342443227767944, "learning_rate": 2.502820947942688e-05, "loss": 0.0181, "step": 5829 }, { "epoch": 0.7702216203718995, "grad_norm": 0.16742447018623352, "learning_rate": 2.5000712678639815e-05, "loss": 0.0129, "step": 5830 }, { "epoch": 0.7703537338573835, "grad_norm": 0.08959437161684036, "learning_rate": 2.4973228832577324e-05, "loss": 0.0053, "step": 5831 }, { "epoch": 0.7704858473428675, "grad_norm": 0.07583253085613251, "learning_rate": 2.4945757945986748e-05, "loss": 0.0067, "step": 5832 }, { "epoch": 0.7706179608283515, "grad_norm": 0.14746922254562378, "learning_rate": 2.4918300023613183e-05, "loss": 0.0137, "step": 5833 }, { "epoch": 0.7707500743138356, "grad_norm": 0.18567243218421936, "learning_rate": 2.4890855070199505e-05, "loss": 0.0147, "step": 5834 }, { "epoch": 0.7708821877993196, "grad_norm": 0.3144448697566986, "learning_rate": 2.486342309048624e-05, "loss": 0.023, "step": 5835 }, { "epoch": 0.7710143012848036, "grad_norm": 0.23152709007263184, "learning_rate": 2.4836004089211785e-05, "loss": 0.016, "step": 5836 }, { "epoch": 0.7711464147702877, "grad_norm": 0.11881105601787567, "learning_rate": 2.4808598071112288e-05, "loss": 0.0066, "step": 5837 }, { "epoch": 0.7712785282557717, "grad_norm": 0.17560985684394836, "learning_rate": 2.4781205040921584e-05, "loss": 0.0123, "step": 5838 }, { "epoch": 0.7714106417412557, "grad_norm": 0.15621699392795563, "learning_rate": 2.475382500337131e-05, "loss": 0.0204, "step": 5839 }, { "epoch": 0.7715427552267398, "grad_norm": 0.12854041159152985, "learning_rate": 2.4726457963190875e-05, "loss": 0.0117, "step": 5840 }, { "epoch": 0.7716748687122238, "grad_norm": 0.2113855928182602, "learning_rate": 2.4699103925107413e-05, "loss": 0.0167, "step": 5841 }, { "epoch": 0.7718069821977078, "grad_norm": 0.14856669306755066, "learning_rate": 2.4671762893845828e-05, "loss": 0.0112, "step": 5842 }, { "epoch": 0.7719390956831919, "grad_norm": 0.12639237940311432, "learning_rate": 2.4644434874128776e-05, "loss": 0.0069, "step": 5843 }, { "epoch": 0.7720712091686759, "grad_norm": 0.18509209156036377, "learning_rate": 2.4617119870676676e-05, "loss": 0.0155, "step": 5844 }, { "epoch": 0.7722033226541599, "grad_norm": 0.11860229820013046, "learning_rate": 2.4589817888207645e-05, "loss": 0.0097, "step": 5845 }, { "epoch": 0.772335436139644, "grad_norm": 0.12120820581912994, "learning_rate": 2.45625289314376e-05, "loss": 0.0145, "step": 5846 }, { "epoch": 0.772467549625128, "grad_norm": 0.14185993373394012, "learning_rate": 2.453525300508024e-05, "loss": 0.0127, "step": 5847 }, { "epoch": 0.772599663110612, "grad_norm": 0.14186576008796692, "learning_rate": 2.4507990113846913e-05, "loss": 0.0093, "step": 5848 }, { "epoch": 0.772731776596096, "grad_norm": 0.13555972278118134, "learning_rate": 2.44807402624468e-05, "loss": 0.0184, "step": 5849 }, { "epoch": 0.7728638900815801, "grad_norm": 0.14067058265209198, "learning_rate": 2.445350345558679e-05, "loss": 0.0166, "step": 5850 }, { "epoch": 0.7729960035670641, "grad_norm": 0.1019555851817131, "learning_rate": 2.4426279697971587e-05, "loss": 0.0103, "step": 5851 }, { "epoch": 0.7731281170525481, "grad_norm": 0.08230996876955032, "learning_rate": 2.439906899430351e-05, "loss": 0.0074, "step": 5852 }, { "epoch": 0.7732602305380322, "grad_norm": 0.21562132239341736, "learning_rate": 2.4371871349282727e-05, "loss": 0.0113, "step": 5853 }, { "epoch": 0.7733923440235162, "grad_norm": 0.34086093306541443, "learning_rate": 2.4344686767607172e-05, "loss": 0.0284, "step": 5854 }, { "epoch": 0.7735244575090002, "grad_norm": 0.1579699069261551, "learning_rate": 2.431751525397239e-05, "loss": 0.0206, "step": 5855 }, { "epoch": 0.7736565709944843, "grad_norm": 0.2254895567893982, "learning_rate": 2.429035681307179e-05, "loss": 0.0329, "step": 5856 }, { "epoch": 0.7737886844799683, "grad_norm": 0.22535626590251923, "learning_rate": 2.426321144959649e-05, "loss": 0.0262, "step": 5857 }, { "epoch": 0.7739207979654523, "grad_norm": 0.14168086647987366, "learning_rate": 2.423607916823537e-05, "loss": 0.0116, "step": 5858 }, { "epoch": 0.7740529114509364, "grad_norm": 0.1516113579273224, "learning_rate": 2.420895997367497e-05, "loss": 0.0193, "step": 5859 }, { "epoch": 0.7741850249364204, "grad_norm": 0.19735190272331238, "learning_rate": 2.4181853870599648e-05, "loss": 0.0131, "step": 5860 }, { "epoch": 0.7743171384219044, "grad_norm": 0.256398469209671, "learning_rate": 2.4154760863691505e-05, "loss": 0.0066, "step": 5861 }, { "epoch": 0.7744492519073884, "grad_norm": 0.13530072569847107, "learning_rate": 2.4127680957630295e-05, "loss": 0.0137, "step": 5862 }, { "epoch": 0.7745813653928725, "grad_norm": 0.2853071391582489, "learning_rate": 2.4100614157093593e-05, "loss": 0.0287, "step": 5863 }, { "epoch": 0.7747134788783565, "grad_norm": 0.10476741939783096, "learning_rate": 2.4073560466756682e-05, "loss": 0.011, "step": 5864 }, { "epoch": 0.7748455923638405, "grad_norm": 0.15531739592552185, "learning_rate": 2.4046519891292607e-05, "loss": 0.0169, "step": 5865 }, { "epoch": 0.7749777058493246, "grad_norm": 0.19582904875278473, "learning_rate": 2.4019492435372083e-05, "loss": 0.0132, "step": 5866 }, { "epoch": 0.7751098193348086, "grad_norm": 0.1912730634212494, "learning_rate": 2.3992478103663606e-05, "loss": 0.0181, "step": 5867 }, { "epoch": 0.7752419328202926, "grad_norm": 0.1268959492444992, "learning_rate": 2.3965476900833428e-05, "loss": 0.0147, "step": 5868 }, { "epoch": 0.7753740463057767, "grad_norm": 0.13870106637477875, "learning_rate": 2.3938488831545446e-05, "loss": 0.0166, "step": 5869 }, { "epoch": 0.7755061597912607, "grad_norm": 0.13729718327522278, "learning_rate": 2.3911513900461392e-05, "loss": 0.0172, "step": 5870 }, { "epoch": 0.7756382732767447, "grad_norm": 0.08225198090076447, "learning_rate": 2.3884552112240655e-05, "loss": 0.0065, "step": 5871 }, { "epoch": 0.7757703867622288, "grad_norm": 0.1484571397304535, "learning_rate": 2.3857603471540414e-05, "loss": 0.0066, "step": 5872 }, { "epoch": 0.7759025002477128, "grad_norm": 0.21782946586608887, "learning_rate": 2.3830667983015486e-05, "loss": 0.01, "step": 5873 }, { "epoch": 0.7760346137331968, "grad_norm": 0.1634368747472763, "learning_rate": 2.380374565131852e-05, "loss": 0.0196, "step": 5874 }, { "epoch": 0.7761667272186809, "grad_norm": 0.21178163588047028, "learning_rate": 2.377683648109985e-05, "loss": 0.0185, "step": 5875 }, { "epoch": 0.7762988407041649, "grad_norm": 0.09029104560613632, "learning_rate": 2.3749940477007482e-05, "loss": 0.0063, "step": 5876 }, { "epoch": 0.7764309541896489, "grad_norm": 0.1842057704925537, "learning_rate": 2.372305764368723e-05, "loss": 0.0142, "step": 5877 }, { "epoch": 0.776563067675133, "grad_norm": 0.16264231503009796, "learning_rate": 2.3696187985782602e-05, "loss": 0.0152, "step": 5878 }, { "epoch": 0.776695181160617, "grad_norm": 0.17312869429588318, "learning_rate": 2.3669331507934856e-05, "loss": 0.0193, "step": 5879 }, { "epoch": 0.776827294646101, "grad_norm": 0.21124525368213654, "learning_rate": 2.3642488214782886e-05, "loss": 0.0283, "step": 5880 }, { "epoch": 0.776959408131585, "grad_norm": 0.1384699046611786, "learning_rate": 2.36156581109634e-05, "loss": 0.0098, "step": 5881 }, { "epoch": 0.7770915216170691, "grad_norm": 0.1894114464521408, "learning_rate": 2.358884120111082e-05, "loss": 0.0225, "step": 5882 }, { "epoch": 0.7772236351025531, "grad_norm": 0.0946621522307396, "learning_rate": 2.3562037489857226e-05, "loss": 0.0087, "step": 5883 }, { "epoch": 0.7773557485880371, "grad_norm": 0.2099320888519287, "learning_rate": 2.353524698183246e-05, "loss": 0.022, "step": 5884 }, { "epoch": 0.7774878620735212, "grad_norm": 0.1482798159122467, "learning_rate": 2.3508469681664102e-05, "loss": 0.0209, "step": 5885 }, { "epoch": 0.7776199755590052, "grad_norm": 0.15039187669754028, "learning_rate": 2.3481705593977456e-05, "loss": 0.0137, "step": 5886 }, { "epoch": 0.7777520890444892, "grad_norm": 0.17164143919944763, "learning_rate": 2.345495472339545e-05, "loss": 0.0128, "step": 5887 }, { "epoch": 0.7778842025299733, "grad_norm": 0.2455880045890808, "learning_rate": 2.342821707453884e-05, "loss": 0.0136, "step": 5888 }, { "epoch": 0.7780163160154573, "grad_norm": 0.15861807763576508, "learning_rate": 2.340149265202607e-05, "loss": 0.0209, "step": 5889 }, { "epoch": 0.7781484295009413, "grad_norm": 0.1428367793560028, "learning_rate": 2.3374781460473226e-05, "loss": 0.0166, "step": 5890 }, { "epoch": 0.7782805429864253, "grad_norm": 0.1530083864927292, "learning_rate": 2.334808350449421e-05, "loss": 0.0218, "step": 5891 }, { "epoch": 0.7784126564719094, "grad_norm": 0.11520033329725266, "learning_rate": 2.3321398788700622e-05, "loss": 0.0133, "step": 5892 }, { "epoch": 0.7785447699573934, "grad_norm": 0.23249691724777222, "learning_rate": 2.3294727317701672e-05, "loss": 0.0089, "step": 5893 }, { "epoch": 0.7786768834428774, "grad_norm": 0.1689661145210266, "learning_rate": 2.3268069096104406e-05, "loss": 0.0178, "step": 5894 }, { "epoch": 0.7788089969283615, "grad_norm": 0.1357138752937317, "learning_rate": 2.3241424128513522e-05, "loss": 0.0124, "step": 5895 }, { "epoch": 0.7789411104138455, "grad_norm": 0.142770916223526, "learning_rate": 2.3214792419531473e-05, "loss": 0.0139, "step": 5896 }, { "epoch": 0.7790732238993295, "grad_norm": 0.14981845021247864, "learning_rate": 2.318817397375833e-05, "loss": 0.0122, "step": 5897 }, { "epoch": 0.7792053373848136, "grad_norm": 0.21843738853931427, "learning_rate": 2.3161568795791965e-05, "loss": 0.017, "step": 5898 }, { "epoch": 0.7793374508702976, "grad_norm": 0.12665678560733795, "learning_rate": 2.3134976890227923e-05, "loss": 0.0128, "step": 5899 }, { "epoch": 0.7794695643557816, "grad_norm": 0.20801448822021484, "learning_rate": 2.3108398261659447e-05, "loss": 0.0213, "step": 5900 }, { "epoch": 0.7796016778412657, "grad_norm": 0.1517508625984192, "learning_rate": 2.3081832914677514e-05, "loss": 0.0255, "step": 5901 }, { "epoch": 0.7797337913267497, "grad_norm": 0.1105872094631195, "learning_rate": 2.305528085387082e-05, "loss": 0.015, "step": 5902 }, { "epoch": 0.7798659048122337, "grad_norm": 0.3150453567504883, "learning_rate": 2.302874208382567e-05, "loss": 0.0267, "step": 5903 }, { "epoch": 0.7799980182977178, "grad_norm": 0.1409350335597992, "learning_rate": 2.300221660912617e-05, "loss": 0.0104, "step": 5904 }, { "epoch": 0.7801301317832018, "grad_norm": 0.1132245883345604, "learning_rate": 2.2975704434354096e-05, "loss": 0.0147, "step": 5905 }, { "epoch": 0.7802622452686858, "grad_norm": 0.24876229465007782, "learning_rate": 2.294920556408897e-05, "loss": 0.0213, "step": 5906 }, { "epoch": 0.7803943587541698, "grad_norm": 0.1730552315711975, "learning_rate": 2.2922720002907926e-05, "loss": 0.0088, "step": 5907 }, { "epoch": 0.7805264722396539, "grad_norm": 0.18458811938762665, "learning_rate": 2.2896247755385857e-05, "loss": 0.0177, "step": 5908 }, { "epoch": 0.7806585857251379, "grad_norm": 0.15846845507621765, "learning_rate": 2.2869788826095383e-05, "loss": 0.0143, "step": 5909 }, { "epoch": 0.7807906992106219, "grad_norm": 0.141799196600914, "learning_rate": 2.284334321960674e-05, "loss": 0.0083, "step": 5910 }, { "epoch": 0.780922812696106, "grad_norm": 0.19804981350898743, "learning_rate": 2.2816910940487935e-05, "loss": 0.0234, "step": 5911 }, { "epoch": 0.78105492618159, "grad_norm": 0.16551578044891357, "learning_rate": 2.279049199330465e-05, "loss": 0.02, "step": 5912 }, { "epoch": 0.781187039667074, "grad_norm": 0.12346825003623962, "learning_rate": 2.276408638262031e-05, "loss": 0.0141, "step": 5913 }, { "epoch": 0.7813191531525581, "grad_norm": 0.12936897575855255, "learning_rate": 2.27376941129959e-05, "loss": 0.0098, "step": 5914 }, { "epoch": 0.7814512666380421, "grad_norm": 0.22945274412631989, "learning_rate": 2.2711315188990247e-05, "loss": 0.0152, "step": 5915 }, { "epoch": 0.7815833801235261, "grad_norm": 0.17532186210155487, "learning_rate": 2.2684949615159834e-05, "loss": 0.017, "step": 5916 }, { "epoch": 0.7817154936090102, "grad_norm": 0.27837732434272766, "learning_rate": 2.2658597396058768e-05, "loss": 0.013, "step": 5917 }, { "epoch": 0.7818476070944942, "grad_norm": 0.1335458606481552, "learning_rate": 2.2632258536238915e-05, "loss": 0.0172, "step": 5918 }, { "epoch": 0.7819797205799782, "grad_norm": 0.17067228257656097, "learning_rate": 2.260593304024985e-05, "loss": 0.013, "step": 5919 }, { "epoch": 0.7821118340654623, "grad_norm": 0.19821752607822418, "learning_rate": 2.257962091263882e-05, "loss": 0.0227, "step": 5920 }, { "epoch": 0.7822439475509463, "grad_norm": 0.19090385735034943, "learning_rate": 2.2553322157950696e-05, "loss": 0.0163, "step": 5921 }, { "epoch": 0.7823760610364303, "grad_norm": 0.16109652817249298, "learning_rate": 2.2527036780728128e-05, "loss": 0.0252, "step": 5922 }, { "epoch": 0.7825081745219143, "grad_norm": 0.1403699666261673, "learning_rate": 2.250076478551145e-05, "loss": 0.0079, "step": 5923 }, { "epoch": 0.7826402880073984, "grad_norm": 0.14240224659442902, "learning_rate": 2.2474506176838605e-05, "loss": 0.0105, "step": 5924 }, { "epoch": 0.7827724014928824, "grad_norm": 0.19920337200164795, "learning_rate": 2.2448260959245304e-05, "loss": 0.0234, "step": 5925 }, { "epoch": 0.7829045149783664, "grad_norm": 0.1772182583808899, "learning_rate": 2.242202913726491e-05, "loss": 0.0208, "step": 5926 }, { "epoch": 0.7830366284638505, "grad_norm": 0.2252548635005951, "learning_rate": 2.239581071542852e-05, "loss": 0.0219, "step": 5927 }, { "epoch": 0.7831687419493345, "grad_norm": 0.13585056364536285, "learning_rate": 2.2369605698264817e-05, "loss": 0.0079, "step": 5928 }, { "epoch": 0.7833008554348185, "grad_norm": 0.22028230130672455, "learning_rate": 2.234341409030024e-05, "loss": 0.026, "step": 5929 }, { "epoch": 0.7834329689203026, "grad_norm": 0.11685667932033539, "learning_rate": 2.2317235896058953e-05, "loss": 0.0183, "step": 5930 }, { "epoch": 0.7835650824057866, "grad_norm": 0.14003141224384308, "learning_rate": 2.229107112006268e-05, "loss": 0.0205, "step": 5931 }, { "epoch": 0.7836971958912706, "grad_norm": 0.19484570622444153, "learning_rate": 2.2264919766830927e-05, "loss": 0.0204, "step": 5932 }, { "epoch": 0.7838293093767547, "grad_norm": 0.19756214320659637, "learning_rate": 2.223878184088084e-05, "loss": 0.0194, "step": 5933 }, { "epoch": 0.7839614228622387, "grad_norm": 0.11648136377334595, "learning_rate": 2.2212657346727307e-05, "loss": 0.0039, "step": 5934 }, { "epoch": 0.7840935363477227, "grad_norm": 0.21022813022136688, "learning_rate": 2.218654628888277e-05, "loss": 0.0224, "step": 5935 }, { "epoch": 0.7842256498332068, "grad_norm": 0.15801659226417542, "learning_rate": 2.216044867185747e-05, "loss": 0.0151, "step": 5936 }, { "epoch": 0.7843577633186908, "grad_norm": 0.2231561243534088, "learning_rate": 2.21343645001593e-05, "loss": 0.017, "step": 5937 }, { "epoch": 0.7844898768041748, "grad_norm": 0.16394644975662231, "learning_rate": 2.2108293778293754e-05, "loss": 0.0187, "step": 5938 }, { "epoch": 0.7846219902896588, "grad_norm": 0.1428672969341278, "learning_rate": 2.2082236510764098e-05, "loss": 0.0147, "step": 5939 }, { "epoch": 0.7847541037751429, "grad_norm": 0.22260761260986328, "learning_rate": 2.2056192702071233e-05, "loss": 0.0148, "step": 5940 }, { "epoch": 0.7848862172606269, "grad_norm": 0.25145766139030457, "learning_rate": 2.203016235671378e-05, "loss": 0.0246, "step": 5941 }, { "epoch": 0.7850183307461109, "grad_norm": 0.4117206633090973, "learning_rate": 2.2004145479187922e-05, "loss": 0.0179, "step": 5942 }, { "epoch": 0.785150444231595, "grad_norm": 0.17742082476615906, "learning_rate": 2.1978142073987617e-05, "loss": 0.0236, "step": 5943 }, { "epoch": 0.785282557717079, "grad_norm": 0.10295175760984421, "learning_rate": 2.195215214560451e-05, "loss": 0.0066, "step": 5944 }, { "epoch": 0.785414671202563, "grad_norm": 0.149460569024086, "learning_rate": 2.1926175698527806e-05, "loss": 0.0173, "step": 5945 }, { "epoch": 0.7855467846880471, "grad_norm": 0.15568538010120392, "learning_rate": 2.1900212737244484e-05, "loss": 0.0147, "step": 5946 }, { "epoch": 0.7856788981735311, "grad_norm": 0.18868260085582733, "learning_rate": 2.187426326623916e-05, "loss": 0.0186, "step": 5947 }, { "epoch": 0.7858110116590151, "grad_norm": 0.19524319469928741, "learning_rate": 2.1848327289994143e-05, "loss": 0.017, "step": 5948 }, { "epoch": 0.7859431251444992, "grad_norm": 0.29335448145866394, "learning_rate": 2.182240481298934e-05, "loss": 0.0109, "step": 5949 }, { "epoch": 0.7860752386299832, "grad_norm": 0.16811603307724, "learning_rate": 2.1796495839702392e-05, "loss": 0.0157, "step": 5950 }, { "epoch": 0.7862073521154672, "grad_norm": 0.1527618020772934, "learning_rate": 2.177060037460863e-05, "loss": 0.0135, "step": 5951 }, { "epoch": 0.7863394656009512, "grad_norm": 0.11982507258653641, "learning_rate": 2.1744718422180945e-05, "loss": 0.0087, "step": 5952 }, { "epoch": 0.7864715790864353, "grad_norm": 0.2619565427303314, "learning_rate": 2.171884998688999e-05, "loss": 0.019, "step": 5953 }, { "epoch": 0.7866036925719193, "grad_norm": 0.1242903470993042, "learning_rate": 2.169299507320406e-05, "loss": 0.0148, "step": 5954 }, { "epoch": 0.7867358060574033, "grad_norm": 0.150680810213089, "learning_rate": 2.1667153685589124e-05, "loss": 0.013, "step": 5955 }, { "epoch": 0.7868679195428874, "grad_norm": 0.14262771606445312, "learning_rate": 2.1641325828508764e-05, "loss": 0.0171, "step": 5956 }, { "epoch": 0.7870000330283714, "grad_norm": 0.2812264561653137, "learning_rate": 2.161551150642427e-05, "loss": 0.025, "step": 5957 }, { "epoch": 0.7871321465138554, "grad_norm": 0.1431320160627365, "learning_rate": 2.1589710723794575e-05, "loss": 0.011, "step": 5958 }, { "epoch": 0.7872642599993395, "grad_norm": 0.20700080692768097, "learning_rate": 2.156392348507631e-05, "loss": 0.0219, "step": 5959 }, { "epoch": 0.7873963734848235, "grad_norm": 0.10715476423501968, "learning_rate": 2.1538149794723706e-05, "loss": 0.0109, "step": 5960 }, { "epoch": 0.7875284869703075, "grad_norm": 0.10400570929050446, "learning_rate": 2.1512389657188748e-05, "loss": 0.0084, "step": 5961 }, { "epoch": 0.7876606004557916, "grad_norm": 0.1712016463279724, "learning_rate": 2.1486643076920932e-05, "loss": 0.0197, "step": 5962 }, { "epoch": 0.7877927139412756, "grad_norm": 0.0920313149690628, "learning_rate": 2.1460910058367543e-05, "loss": 0.0099, "step": 5963 }, { "epoch": 0.7879248274267596, "grad_norm": 0.10525844246149063, "learning_rate": 2.14351906059735e-05, "loss": 0.0073, "step": 5964 }, { "epoch": 0.7880569409122437, "grad_norm": 0.30008089542388916, "learning_rate": 2.1409484724181306e-05, "loss": 0.0242, "step": 5965 }, { "epoch": 0.7881890543977277, "grad_norm": 0.12237905710935593, "learning_rate": 2.138379241743119e-05, "loss": 0.0181, "step": 5966 }, { "epoch": 0.7883211678832117, "grad_norm": 0.13109365105628967, "learning_rate": 2.135811369016104e-05, "loss": 0.0111, "step": 5967 }, { "epoch": 0.7884532813686957, "grad_norm": 0.12001150846481323, "learning_rate": 2.1332448546806382e-05, "loss": 0.0079, "step": 5968 }, { "epoch": 0.7885853948541798, "grad_norm": 0.11735592782497406, "learning_rate": 2.1306796991800337e-05, "loss": 0.0167, "step": 5969 }, { "epoch": 0.7887175083396638, "grad_norm": 0.1726117581129074, "learning_rate": 2.1281159029573772e-05, "loss": 0.0137, "step": 5970 }, { "epoch": 0.7888496218251478, "grad_norm": 0.20775265991687775, "learning_rate": 2.1255534664555175e-05, "loss": 0.0211, "step": 5971 }, { "epoch": 0.7889817353106319, "grad_norm": 0.15340012311935425, "learning_rate": 2.1229923901170646e-05, "loss": 0.0099, "step": 5972 }, { "epoch": 0.7891138487961159, "grad_norm": 0.1911078691482544, "learning_rate": 2.1204326743843962e-05, "loss": 0.0236, "step": 5973 }, { "epoch": 0.7892459622815999, "grad_norm": 0.1343451589345932, "learning_rate": 2.1178743196996576e-05, "loss": 0.012, "step": 5974 }, { "epoch": 0.789378075767084, "grad_norm": 0.29213130474090576, "learning_rate": 2.115317326504759e-05, "loss": 0.0219, "step": 5975 }, { "epoch": 0.789510189252568, "grad_norm": 0.30969685316085815, "learning_rate": 2.1127616952413666e-05, "loss": 0.0286, "step": 5976 }, { "epoch": 0.789642302738052, "grad_norm": 0.12944519519805908, "learning_rate": 2.110207426350922e-05, "loss": 0.0105, "step": 5977 }, { "epoch": 0.7897744162235361, "grad_norm": 0.14682377874851227, "learning_rate": 2.10765452027463e-05, "loss": 0.0167, "step": 5978 }, { "epoch": 0.7899065297090201, "grad_norm": 0.19225028157234192, "learning_rate": 2.1051029774534504e-05, "loss": 0.0151, "step": 5979 }, { "epoch": 0.7900386431945041, "grad_norm": 0.18472789227962494, "learning_rate": 2.102552798328119e-05, "loss": 0.0204, "step": 5980 }, { "epoch": 0.7901707566799882, "grad_norm": 0.11308620870113373, "learning_rate": 2.1000039833391318e-05, "loss": 0.0139, "step": 5981 }, { "epoch": 0.7903028701654722, "grad_norm": 0.1950245052576065, "learning_rate": 2.0974565329267502e-05, "loss": 0.0196, "step": 5982 }, { "epoch": 0.7904349836509562, "grad_norm": 0.21679812669754028, "learning_rate": 2.0949104475309933e-05, "loss": 0.0198, "step": 5983 }, { "epoch": 0.7905670971364402, "grad_norm": 0.12117374688386917, "learning_rate": 2.092365727591654e-05, "loss": 0.015, "step": 5984 }, { "epoch": 0.7906992106219243, "grad_norm": 0.09717853367328644, "learning_rate": 2.0898223735482857e-05, "loss": 0.0106, "step": 5985 }, { "epoch": 0.7908313241074083, "grad_norm": 0.16154758632183075, "learning_rate": 2.0872803858402013e-05, "loss": 0.0216, "step": 5986 }, { "epoch": 0.7909634375928923, "grad_norm": 0.3653877079486847, "learning_rate": 2.084739764906485e-05, "loss": 0.0397, "step": 5987 }, { "epoch": 0.7910955510783764, "grad_norm": 0.12396520376205444, "learning_rate": 2.082200511185979e-05, "loss": 0.0165, "step": 5988 }, { "epoch": 0.7912276645638604, "grad_norm": 0.1662580817937851, "learning_rate": 2.0796626251172968e-05, "loss": 0.0197, "step": 5989 }, { "epoch": 0.7913597780493444, "grad_norm": 0.13176557421684265, "learning_rate": 2.0771261071388047e-05, "loss": 0.0181, "step": 5990 }, { "epoch": 0.7914918915348285, "grad_norm": 0.14741529524326324, "learning_rate": 2.0745909576886414e-05, "loss": 0.0116, "step": 5991 }, { "epoch": 0.7916240050203125, "grad_norm": 0.20206743478775024, "learning_rate": 2.0720571772047092e-05, "loss": 0.0165, "step": 5992 }, { "epoch": 0.7917561185057965, "grad_norm": 0.11618878692388535, "learning_rate": 2.0695247661246665e-05, "loss": 0.0089, "step": 5993 }, { "epoch": 0.7918882319912806, "grad_norm": 0.16395948827266693, "learning_rate": 2.0669937248859416e-05, "loss": 0.0167, "step": 5994 }, { "epoch": 0.7920203454767646, "grad_norm": 0.17397412657737732, "learning_rate": 2.0644640539257266e-05, "loss": 0.0181, "step": 5995 }, { "epoch": 0.7921524589622486, "grad_norm": 0.12989552319049835, "learning_rate": 2.0619357536809746e-05, "loss": 0.0155, "step": 5996 }, { "epoch": 0.7922845724477326, "grad_norm": 0.140245720744133, "learning_rate": 2.0594088245883982e-05, "loss": 0.0132, "step": 5997 }, { "epoch": 0.7924166859332167, "grad_norm": 0.13462164998054504, "learning_rate": 2.0568832670844805e-05, "loss": 0.0104, "step": 5998 }, { "epoch": 0.7925487994187007, "grad_norm": 0.10564357042312622, "learning_rate": 2.054359081605467e-05, "loss": 0.0082, "step": 5999 }, { "epoch": 0.7926809129041847, "grad_norm": 0.13238169252872467, "learning_rate": 2.051836268587357e-05, "loss": 0.0127, "step": 6000 }, { "epoch": 0.7928130263896688, "grad_norm": 0.1388818621635437, "learning_rate": 2.0493148284659225e-05, "loss": 0.0165, "step": 6001 }, { "epoch": 0.7929451398751528, "grad_norm": 0.13400214910507202, "learning_rate": 2.046794761676696e-05, "loss": 0.0108, "step": 6002 }, { "epoch": 0.7930772533606368, "grad_norm": 0.27306875586509705, "learning_rate": 2.0442760686549732e-05, "loss": 0.023, "step": 6003 }, { "epoch": 0.7932093668461209, "grad_norm": 0.26368728280067444, "learning_rate": 2.041758749835806e-05, "loss": 0.019, "step": 6004 }, { "epoch": 0.7933414803316049, "grad_norm": 0.13244707882404327, "learning_rate": 2.039242805654018e-05, "loss": 0.0165, "step": 6005 }, { "epoch": 0.7934735938170889, "grad_norm": 0.13773098587989807, "learning_rate": 2.036728236544194e-05, "loss": 0.013, "step": 6006 }, { "epoch": 0.793605707302573, "grad_norm": 0.18311674892902374, "learning_rate": 2.0342150429406727e-05, "loss": 0.0141, "step": 6007 }, { "epoch": 0.793737820788057, "grad_norm": 0.20516015589237213, "learning_rate": 2.0317032252775638e-05, "loss": 0.0289, "step": 6008 }, { "epoch": 0.793869934273541, "grad_norm": 0.16869759559631348, "learning_rate": 2.0291927839887383e-05, "loss": 0.017, "step": 6009 }, { "epoch": 0.794002047759025, "grad_norm": 0.24092960357666016, "learning_rate": 2.026683719507828e-05, "loss": 0.0159, "step": 6010 }, { "epoch": 0.7941341612445091, "grad_norm": 0.14799204468727112, "learning_rate": 2.0241760322682247e-05, "loss": 0.0196, "step": 6011 }, { "epoch": 0.7942662747299931, "grad_norm": 0.18595071136951447, "learning_rate": 2.0216697227030855e-05, "loss": 0.0109, "step": 6012 }, { "epoch": 0.7943983882154771, "grad_norm": 0.112423837184906, "learning_rate": 2.0191647912453317e-05, "loss": 0.0057, "step": 6013 }, { "epoch": 0.7945305017009612, "grad_norm": 0.1787814199924469, "learning_rate": 2.016661238327636e-05, "loss": 0.0096, "step": 6014 }, { "epoch": 0.7946626151864452, "grad_norm": 0.14639584720134735, "learning_rate": 2.014159064382446e-05, "loss": 0.0119, "step": 6015 }, { "epoch": 0.7947947286719292, "grad_norm": 0.1412990540266037, "learning_rate": 2.0116582698419638e-05, "loss": 0.016, "step": 6016 }, { "epoch": 0.7949268421574133, "grad_norm": 0.1513022631406784, "learning_rate": 2.009158855138156e-05, "loss": 0.012, "step": 6017 }, { "epoch": 0.7950589556428973, "grad_norm": 0.37479570508003235, "learning_rate": 2.006660820702748e-05, "loss": 0.0187, "step": 6018 }, { "epoch": 0.7951910691283813, "grad_norm": 0.13417895138263702, "learning_rate": 2.0041641669672305e-05, "loss": 0.0104, "step": 6019 }, { "epoch": 0.7953231826138654, "grad_norm": 0.18156084418296814, "learning_rate": 2.001668894362856e-05, "loss": 0.0228, "step": 6020 }, { "epoch": 0.7954552960993494, "grad_norm": 0.14486129581928253, "learning_rate": 1.999175003320629e-05, "loss": 0.0091, "step": 6021 }, { "epoch": 0.7955874095848334, "grad_norm": 0.16329576075077057, "learning_rate": 1.996682494271327e-05, "loss": 0.0131, "step": 6022 }, { "epoch": 0.7957195230703173, "grad_norm": 0.13377881050109863, "learning_rate": 1.9941913676454872e-05, "loss": 0.0121, "step": 6023 }, { "epoch": 0.7958516365558014, "grad_norm": 0.21078723669052124, "learning_rate": 1.9917016238733976e-05, "loss": 0.009, "step": 6024 }, { "epoch": 0.7959837500412854, "grad_norm": 0.16063301265239716, "learning_rate": 1.9892132633851214e-05, "loss": 0.0152, "step": 6025 }, { "epoch": 0.7961158635267694, "grad_norm": 0.22228194773197174, "learning_rate": 1.986726286610472e-05, "loss": 0.014, "step": 6026 }, { "epoch": 0.7962479770122535, "grad_norm": 0.12263708561658859, "learning_rate": 1.9842406939790337e-05, "loss": 0.0171, "step": 6027 }, { "epoch": 0.7963800904977375, "grad_norm": 0.08849442005157471, "learning_rate": 1.981756485920141e-05, "loss": 0.0051, "step": 6028 }, { "epoch": 0.7965122039832215, "grad_norm": 0.1484423279762268, "learning_rate": 1.979273662862895e-05, "loss": 0.0111, "step": 6029 }, { "epoch": 0.7966443174687056, "grad_norm": 0.20627662539482117, "learning_rate": 1.9767922252361603e-05, "loss": 0.0295, "step": 6030 }, { "epoch": 0.7967764309541896, "grad_norm": 0.18533170223236084, "learning_rate": 1.9743121734685545e-05, "loss": 0.0181, "step": 6031 }, { "epoch": 0.7969085444396736, "grad_norm": 0.13025982677936554, "learning_rate": 1.971833507988462e-05, "loss": 0.0132, "step": 6032 }, { "epoch": 0.7970406579251577, "grad_norm": 0.13095568120479584, "learning_rate": 1.9693562292240265e-05, "loss": 0.0093, "step": 6033 }, { "epoch": 0.7971727714106417, "grad_norm": 0.2232745885848999, "learning_rate": 1.966880337603154e-05, "loss": 0.0243, "step": 6034 }, { "epoch": 0.7973048848961257, "grad_norm": 0.11646081507205963, "learning_rate": 1.964405833553503e-05, "loss": 0.0077, "step": 6035 }, { "epoch": 0.7974369983816098, "grad_norm": 0.14113196730613708, "learning_rate": 1.9619327175025004e-05, "loss": 0.0137, "step": 6036 }, { "epoch": 0.7975691118670938, "grad_norm": 0.12095404416322708, "learning_rate": 1.9594609898773343e-05, "loss": 0.0103, "step": 6037 }, { "epoch": 0.7977012253525778, "grad_norm": 0.45921650528907776, "learning_rate": 1.956990651104943e-05, "loss": 0.0236, "step": 6038 }, { "epoch": 0.7978333388380618, "grad_norm": 0.23896144330501556, "learning_rate": 1.9545217016120342e-05, "loss": 0.036, "step": 6039 }, { "epoch": 0.7979654523235459, "grad_norm": 0.1391197144985199, "learning_rate": 1.9520541418250727e-05, "loss": 0.0115, "step": 6040 }, { "epoch": 0.7980975658090299, "grad_norm": 0.2012115865945816, "learning_rate": 1.949587972170286e-05, "loss": 0.0149, "step": 6041 }, { "epoch": 0.7982296792945139, "grad_norm": 0.1714034527540207, "learning_rate": 1.9471231930736546e-05, "loss": 0.0211, "step": 6042 }, { "epoch": 0.798361792779998, "grad_norm": 0.12861862778663635, "learning_rate": 1.9446598049609245e-05, "loss": 0.0146, "step": 6043 }, { "epoch": 0.798493906265482, "grad_norm": 0.1933763325214386, "learning_rate": 1.942197808257602e-05, "loss": 0.0105, "step": 6044 }, { "epoch": 0.798626019750966, "grad_norm": 0.12401843816041946, "learning_rate": 1.939737203388948e-05, "loss": 0.0163, "step": 6045 }, { "epoch": 0.7987581332364501, "grad_norm": 0.16275176405906677, "learning_rate": 1.9372779907799865e-05, "loss": 0.0142, "step": 6046 }, { "epoch": 0.7988902467219341, "grad_norm": 0.1322012096643448, "learning_rate": 1.9348201708555015e-05, "loss": 0.0102, "step": 6047 }, { "epoch": 0.7990223602074181, "grad_norm": 0.1378653198480606, "learning_rate": 1.9323637440400365e-05, "loss": 0.0242, "step": 6048 }, { "epoch": 0.7991544736929022, "grad_norm": 0.18538536131381989, "learning_rate": 1.9299087107578908e-05, "loss": 0.0123, "step": 6049 }, { "epoch": 0.7992865871783862, "grad_norm": 0.1685674786567688, "learning_rate": 1.9274550714331253e-05, "loss": 0.0172, "step": 6050 }, { "epoch": 0.7994187006638702, "grad_norm": 0.16482692956924438, "learning_rate": 1.925002826489566e-05, "loss": 0.0157, "step": 6051 }, { "epoch": 0.7995508141493542, "grad_norm": 0.11981571465730667, "learning_rate": 1.9225519763507838e-05, "loss": 0.0063, "step": 6052 }, { "epoch": 0.7996829276348383, "grad_norm": 0.17827653884887695, "learning_rate": 1.9201025214401223e-05, "loss": 0.0119, "step": 6053 }, { "epoch": 0.7998150411203223, "grad_norm": 0.238278329372406, "learning_rate": 1.917654462180678e-05, "loss": 0.0319, "step": 6054 }, { "epoch": 0.7999471546058063, "grad_norm": 0.18031644821166992, "learning_rate": 1.9152077989953097e-05, "loss": 0.0164, "step": 6055 }, { "epoch": 0.8000792680912904, "grad_norm": 0.21356049180030823, "learning_rate": 1.912762532306628e-05, "loss": 0.0147, "step": 6056 }, { "epoch": 0.8002113815767744, "grad_norm": 0.1678241789340973, "learning_rate": 1.9103186625370095e-05, "loss": 0.0128, "step": 6057 }, { "epoch": 0.8003434950622584, "grad_norm": 0.2096024453639984, "learning_rate": 1.9078761901085905e-05, "loss": 0.0211, "step": 6058 }, { "epoch": 0.8004756085477425, "grad_norm": 0.15075698494911194, "learning_rate": 1.905435115443256e-05, "loss": 0.0161, "step": 6059 }, { "epoch": 0.8006077220332265, "grad_norm": 0.11386120319366455, "learning_rate": 1.902995438962659e-05, "loss": 0.0135, "step": 6060 }, { "epoch": 0.8007398355187105, "grad_norm": 0.22624462842941284, "learning_rate": 1.900557161088208e-05, "loss": 0.0127, "step": 6061 }, { "epoch": 0.8008719490041946, "grad_norm": 0.16365881264209747, "learning_rate": 1.8981202822410725e-05, "loss": 0.0203, "step": 6062 }, { "epoch": 0.8010040624896786, "grad_norm": 0.2588220536708832, "learning_rate": 1.8956848028421725e-05, "loss": 0.0195, "step": 6063 }, { "epoch": 0.8011361759751626, "grad_norm": 0.16350001096725464, "learning_rate": 1.8932507233121944e-05, "loss": 0.0221, "step": 6064 }, { "epoch": 0.8012682894606467, "grad_norm": 0.14093947410583496, "learning_rate": 1.8908180440715828e-05, "loss": 0.0114, "step": 6065 }, { "epoch": 0.8014004029461307, "grad_norm": 0.12100745737552643, "learning_rate": 1.8883867655405306e-05, "loss": 0.0122, "step": 6066 }, { "epoch": 0.8015325164316147, "grad_norm": 0.15186220407485962, "learning_rate": 1.885956888139001e-05, "loss": 0.0183, "step": 6067 }, { "epoch": 0.8016646299170987, "grad_norm": 0.17774933576583862, "learning_rate": 1.883528412286708e-05, "loss": 0.0196, "step": 6068 }, { "epoch": 0.8017967434025828, "grad_norm": 0.16041073203086853, "learning_rate": 1.8811013384031283e-05, "loss": 0.0257, "step": 6069 }, { "epoch": 0.8019288568880668, "grad_norm": 0.15739186108112335, "learning_rate": 1.878675666907489e-05, "loss": 0.0088, "step": 6070 }, { "epoch": 0.8020609703735508, "grad_norm": 0.18149971961975098, "learning_rate": 1.8762513982187812e-05, "loss": 0.0202, "step": 6071 }, { "epoch": 0.8021930838590349, "grad_norm": 0.13623569905757904, "learning_rate": 1.8738285327557546e-05, "loss": 0.0121, "step": 6072 }, { "epoch": 0.8023251973445189, "grad_norm": 0.2696712613105774, "learning_rate": 1.8714070709369103e-05, "loss": 0.0127, "step": 6073 }, { "epoch": 0.8024573108300029, "grad_norm": 0.1282675415277481, "learning_rate": 1.868987013180511e-05, "loss": 0.0056, "step": 6074 }, { "epoch": 0.802589424315487, "grad_norm": 0.1721215546131134, "learning_rate": 1.866568359904578e-05, "loss": 0.017, "step": 6075 }, { "epoch": 0.802721537800971, "grad_norm": 0.31074950098991394, "learning_rate": 1.8641511115268873e-05, "loss": 0.0204, "step": 6076 }, { "epoch": 0.802853651286455, "grad_norm": 0.13287027180194855, "learning_rate": 1.8617352684649736e-05, "loss": 0.009, "step": 6077 }, { "epoch": 0.8029857647719391, "grad_norm": 0.17623546719551086, "learning_rate": 1.859320831136132e-05, "loss": 0.0133, "step": 6078 }, { "epoch": 0.8031178782574231, "grad_norm": 0.309251993894577, "learning_rate": 1.856907799957406e-05, "loss": 0.0276, "step": 6079 }, { "epoch": 0.8032499917429071, "grad_norm": 0.12996280193328857, "learning_rate": 1.8544961753456037e-05, "loss": 0.0123, "step": 6080 }, { "epoch": 0.8033821052283912, "grad_norm": 0.2583175301551819, "learning_rate": 1.8520859577172887e-05, "loss": 0.0095, "step": 6081 }, { "epoch": 0.8035142187138752, "grad_norm": 0.21694119274616241, "learning_rate": 1.8496771474887832e-05, "loss": 0.0205, "step": 6082 }, { "epoch": 0.8036463321993592, "grad_norm": 0.1103861853480339, "learning_rate": 1.847269745076159e-05, "loss": 0.0113, "step": 6083 }, { "epoch": 0.8037784456848432, "grad_norm": 0.14845329523086548, "learning_rate": 1.8448637508952526e-05, "loss": 0.0129, "step": 6084 }, { "epoch": 0.8039105591703273, "grad_norm": 0.2458660900592804, "learning_rate": 1.8424591653616564e-05, "loss": 0.0214, "step": 6085 }, { "epoch": 0.8040426726558113, "grad_norm": 0.2118985801935196, "learning_rate": 1.840055988890714e-05, "loss": 0.0189, "step": 6086 }, { "epoch": 0.8041747861412953, "grad_norm": 0.1412656158208847, "learning_rate": 1.83765422189753e-05, "loss": 0.0138, "step": 6087 }, { "epoch": 0.8043068996267794, "grad_norm": 0.24748028814792633, "learning_rate": 1.8352538647969662e-05, "loss": 0.0264, "step": 6088 }, { "epoch": 0.8044390131122634, "grad_norm": 0.41792353987693787, "learning_rate": 1.8328549180036412e-05, "loss": 0.0254, "step": 6089 }, { "epoch": 0.8045711265977474, "grad_norm": 0.23838505148887634, "learning_rate": 1.8304573819319226e-05, "loss": 0.0168, "step": 6090 }, { "epoch": 0.8047032400832315, "grad_norm": 0.15929056704044342, "learning_rate": 1.8280612569959443e-05, "loss": 0.0143, "step": 6091 }, { "epoch": 0.8048353535687155, "grad_norm": 0.21665242314338684, "learning_rate": 1.825666543609592e-05, "loss": 0.0143, "step": 6092 }, { "epoch": 0.8049674670541995, "grad_norm": 0.20312796533107758, "learning_rate": 1.823273242186505e-05, "loss": 0.0189, "step": 6093 }, { "epoch": 0.8050995805396836, "grad_norm": 0.11221221834421158, "learning_rate": 1.8208813531400827e-05, "loss": 0.0082, "step": 6094 }, { "epoch": 0.8052316940251676, "grad_norm": 0.23713520169258118, "learning_rate": 1.8184908768834796e-05, "loss": 0.026, "step": 6095 }, { "epoch": 0.8053638075106516, "grad_norm": 0.18237431347370148, "learning_rate": 1.8161018138296083e-05, "loss": 0.0206, "step": 6096 }, { "epoch": 0.8054959209961357, "grad_norm": 0.13127483427524567, "learning_rate": 1.8137141643911294e-05, "loss": 0.0091, "step": 6097 }, { "epoch": 0.8056280344816197, "grad_norm": 0.13977746665477753, "learning_rate": 1.8113279289804673e-05, "loss": 0.0158, "step": 6098 }, { "epoch": 0.8057601479671037, "grad_norm": 0.15803252160549164, "learning_rate": 1.808943108009802e-05, "loss": 0.0169, "step": 6099 }, { "epoch": 0.8058922614525877, "grad_norm": 0.240422323346138, "learning_rate": 1.8065597018910617e-05, "loss": 0.0096, "step": 6100 }, { "epoch": 0.8060243749380718, "grad_norm": 0.17421869933605194, "learning_rate": 1.804177711035938e-05, "loss": 0.0182, "step": 6101 }, { "epoch": 0.8061564884235558, "grad_norm": 0.27411147952079773, "learning_rate": 1.801797135855876e-05, "loss": 0.0278, "step": 6102 }, { "epoch": 0.8062886019090398, "grad_norm": 0.0999927967786789, "learning_rate": 1.7994179767620766e-05, "loss": 0.0094, "step": 6103 }, { "epoch": 0.8064207153945239, "grad_norm": 0.37366190552711487, "learning_rate": 1.7970402341654902e-05, "loss": 0.0269, "step": 6104 }, { "epoch": 0.8065528288800079, "grad_norm": 0.17533250153064728, "learning_rate": 1.794663908476831e-05, "loss": 0.0107, "step": 6105 }, { "epoch": 0.8066849423654919, "grad_norm": 0.13562843203544617, "learning_rate": 1.7922890001065673e-05, "loss": 0.0139, "step": 6106 }, { "epoch": 0.806817055850976, "grad_norm": 0.24872201681137085, "learning_rate": 1.7899155094649135e-05, "loss": 0.0257, "step": 6107 }, { "epoch": 0.80694916933646, "grad_norm": 0.15552151203155518, "learning_rate": 1.7875434369618495e-05, "loss": 0.0146, "step": 6108 }, { "epoch": 0.807081282821944, "grad_norm": 0.11690077930688858, "learning_rate": 1.7851727830071063e-05, "loss": 0.0094, "step": 6109 }, { "epoch": 0.807213396307428, "grad_norm": 0.17192292213439941, "learning_rate": 1.7828035480101722e-05, "loss": 0.0187, "step": 6110 }, { "epoch": 0.8073455097929121, "grad_norm": 0.24925702810287476, "learning_rate": 1.7804357323802845e-05, "loss": 0.0325, "step": 6111 }, { "epoch": 0.8074776232783961, "grad_norm": 0.10455185174942017, "learning_rate": 1.778069336526439e-05, "loss": 0.0135, "step": 6112 }, { "epoch": 0.8076097367638801, "grad_norm": 0.09632913768291473, "learning_rate": 1.775704360857392e-05, "loss": 0.0063, "step": 6113 }, { "epoch": 0.8077418502493642, "grad_norm": 0.15340298414230347, "learning_rate": 1.7733408057816413e-05, "loss": 0.0227, "step": 6114 }, { "epoch": 0.8078739637348482, "grad_norm": 0.10595980286598206, "learning_rate": 1.7709786717074504e-05, "loss": 0.0089, "step": 6115 }, { "epoch": 0.8080060772203322, "grad_norm": 0.11453713476657867, "learning_rate": 1.7686179590428344e-05, "loss": 0.0111, "step": 6116 }, { "epoch": 0.8081381907058163, "grad_norm": 0.23642563819885254, "learning_rate": 1.766258668195564e-05, "loss": 0.0243, "step": 6117 }, { "epoch": 0.8082703041913003, "grad_norm": 0.23531033098697662, "learning_rate": 1.763900799573157e-05, "loss": 0.0161, "step": 6118 }, { "epoch": 0.8084024176767843, "grad_norm": 0.1391163170337677, "learning_rate": 1.7615443535828945e-05, "loss": 0.013, "step": 6119 }, { "epoch": 0.8085345311622684, "grad_norm": 0.14361713826656342, "learning_rate": 1.759189330631811e-05, "loss": 0.0183, "step": 6120 }, { "epoch": 0.8086666446477524, "grad_norm": 0.2131107896566391, "learning_rate": 1.756835731126687e-05, "loss": 0.0212, "step": 6121 }, { "epoch": 0.8087987581332364, "grad_norm": 0.14085067808628082, "learning_rate": 1.754483555474067e-05, "loss": 0.0153, "step": 6122 }, { "epoch": 0.8089308716187205, "grad_norm": 0.14828112721443176, "learning_rate": 1.7521328040802422e-05, "loss": 0.0151, "step": 6123 }, { "epoch": 0.8090629851042045, "grad_norm": 0.1502753645181656, "learning_rate": 1.7497834773512666e-05, "loss": 0.0118, "step": 6124 }, { "epoch": 0.8091950985896885, "grad_norm": 0.16962772607803345, "learning_rate": 1.747435575692936e-05, "loss": 0.0202, "step": 6125 }, { "epoch": 0.8093272120751726, "grad_norm": 0.1703941524028778, "learning_rate": 1.7450890995108095e-05, "loss": 0.0202, "step": 6126 }, { "epoch": 0.8094593255606566, "grad_norm": 0.17077529430389404, "learning_rate": 1.7427440492101986e-05, "loss": 0.0162, "step": 6127 }, { "epoch": 0.8095914390461406, "grad_norm": 0.3168460428714752, "learning_rate": 1.7404004251961635e-05, "loss": 0.0215, "step": 6128 }, { "epoch": 0.8097235525316246, "grad_norm": 0.17691540718078613, "learning_rate": 1.7380582278735224e-05, "loss": 0.0219, "step": 6129 }, { "epoch": 0.8098556660171087, "grad_norm": 0.1743171364068985, "learning_rate": 1.735717457646847e-05, "loss": 0.0123, "step": 6130 }, { "epoch": 0.8099877795025927, "grad_norm": 0.1374894380569458, "learning_rate": 1.733378114920463e-05, "loss": 0.0104, "step": 6131 }, { "epoch": 0.8101198929880767, "grad_norm": 0.18259602785110474, "learning_rate": 1.731040200098445e-05, "loss": 0.0143, "step": 6132 }, { "epoch": 0.8102520064735608, "grad_norm": 0.25777730345726013, "learning_rate": 1.728703713584624e-05, "loss": 0.0277, "step": 6133 }, { "epoch": 0.8103841199590448, "grad_norm": 0.12829157710075378, "learning_rate": 1.7263686557825864e-05, "loss": 0.0136, "step": 6134 }, { "epoch": 0.8105162334445288, "grad_norm": 0.15571993589401245, "learning_rate": 1.7240350270956697e-05, "loss": 0.0167, "step": 6135 }, { "epoch": 0.8106483469300129, "grad_norm": 0.1591111421585083, "learning_rate": 1.7217028279269644e-05, "loss": 0.0136, "step": 6136 }, { "epoch": 0.8107804604154969, "grad_norm": 0.14531998336315155, "learning_rate": 1.719372058679315e-05, "loss": 0.0117, "step": 6137 }, { "epoch": 0.8109125739009809, "grad_norm": 0.17569659650325775, "learning_rate": 1.7170427197553164e-05, "loss": 0.0137, "step": 6138 }, { "epoch": 0.811044687386465, "grad_norm": 0.18718554079532623, "learning_rate": 1.7147148115573175e-05, "loss": 0.0124, "step": 6139 }, { "epoch": 0.811176800871949, "grad_norm": 0.14613309502601624, "learning_rate": 1.712388334487425e-05, "loss": 0.016, "step": 6140 }, { "epoch": 0.811308914357433, "grad_norm": 0.14430725574493408, "learning_rate": 1.710063288947492e-05, "loss": 0.0133, "step": 6141 }, { "epoch": 0.811441027842917, "grad_norm": 0.16222628951072693, "learning_rate": 1.7077396753391262e-05, "loss": 0.0118, "step": 6142 }, { "epoch": 0.8115731413284011, "grad_norm": 0.1703341156244278, "learning_rate": 1.705417494063687e-05, "loss": 0.0191, "step": 6143 }, { "epoch": 0.8117052548138851, "grad_norm": 0.11394409835338593, "learning_rate": 1.7030967455222936e-05, "loss": 0.0143, "step": 6144 }, { "epoch": 0.8118373682993691, "grad_norm": 0.1658678501844406, "learning_rate": 1.7007774301158054e-05, "loss": 0.0216, "step": 6145 }, { "epoch": 0.8119694817848532, "grad_norm": 0.14283902943134308, "learning_rate": 1.6984595482448418e-05, "loss": 0.0114, "step": 6146 }, { "epoch": 0.8121015952703372, "grad_norm": 0.1382722705602646, "learning_rate": 1.696143100309776e-05, "loss": 0.0184, "step": 6147 }, { "epoch": 0.8122337087558212, "grad_norm": 0.13045085966587067, "learning_rate": 1.6938280867107335e-05, "loss": 0.0076, "step": 6148 }, { "epoch": 0.8123658222413053, "grad_norm": 0.16077639162540436, "learning_rate": 1.6915145078475824e-05, "loss": 0.0157, "step": 6149 }, { "epoch": 0.8124979357267893, "grad_norm": 0.12093255668878555, "learning_rate": 1.689202364119955e-05, "loss": 0.0137, "step": 6150 }, { "epoch": 0.8126300492122733, "grad_norm": 0.14236758649349213, "learning_rate": 1.686891655927232e-05, "loss": 0.0103, "step": 6151 }, { "epoch": 0.8127621626977574, "grad_norm": 0.1466006338596344, "learning_rate": 1.6845823836685413e-05, "loss": 0.0109, "step": 6152 }, { "epoch": 0.8128942761832414, "grad_norm": 0.12988708913326263, "learning_rate": 1.682274547742767e-05, "loss": 0.0148, "step": 6153 }, { "epoch": 0.8130263896687254, "grad_norm": 0.14889812469482422, "learning_rate": 1.6799681485485464e-05, "loss": 0.0078, "step": 6154 }, { "epoch": 0.8131585031542095, "grad_norm": 0.1435319036245346, "learning_rate": 1.6776631864842685e-05, "loss": 0.014, "step": 6155 }, { "epoch": 0.8132906166396935, "grad_norm": 0.13213448226451874, "learning_rate": 1.6753596619480684e-05, "loss": 0.0082, "step": 6156 }, { "epoch": 0.8134227301251775, "grad_norm": 0.16954585909843445, "learning_rate": 1.6730575753378375e-05, "loss": 0.0153, "step": 6157 }, { "epoch": 0.8135548436106615, "grad_norm": 0.18840613961219788, "learning_rate": 1.6707569270512224e-05, "loss": 0.0178, "step": 6158 }, { "epoch": 0.8136869570961456, "grad_norm": 0.07202202081680298, "learning_rate": 1.6684577174856118e-05, "loss": 0.0041, "step": 6159 }, { "epoch": 0.8138190705816296, "grad_norm": 0.18953919410705566, "learning_rate": 1.666159947038153e-05, "loss": 0.0237, "step": 6160 }, { "epoch": 0.8139511840671136, "grad_norm": 0.1994379609823227, "learning_rate": 1.6638636161057442e-05, "loss": 0.0253, "step": 6161 }, { "epoch": 0.8140832975525977, "grad_norm": 0.1680680215358734, "learning_rate": 1.6615687250850344e-05, "loss": 0.0133, "step": 6162 }, { "epoch": 0.8142154110380817, "grad_norm": 0.1264996975660324, "learning_rate": 1.659275274372418e-05, "loss": 0.0087, "step": 6163 }, { "epoch": 0.8143475245235657, "grad_norm": 0.11757846176624298, "learning_rate": 1.6569832643640505e-05, "loss": 0.0119, "step": 6164 }, { "epoch": 0.8144796380090498, "grad_norm": 0.17562219500541687, "learning_rate": 1.654692695455835e-05, "loss": 0.015, "step": 6165 }, { "epoch": 0.8146117514945338, "grad_norm": 0.14509950578212738, "learning_rate": 1.652403568043418e-05, "loss": 0.0173, "step": 6166 }, { "epoch": 0.8147438649800178, "grad_norm": 0.17648886144161224, "learning_rate": 1.6501158825222085e-05, "loss": 0.0191, "step": 6167 }, { "epoch": 0.8148759784655019, "grad_norm": 0.17454563081264496, "learning_rate": 1.6478296392873603e-05, "loss": 0.0137, "step": 6168 }, { "epoch": 0.8150080919509859, "grad_norm": 0.13573813438415527, "learning_rate": 1.6455448387337812e-05, "loss": 0.0142, "step": 6169 }, { "epoch": 0.8151402054364699, "grad_norm": 0.12894220650196075, "learning_rate": 1.643261481256123e-05, "loss": 0.0152, "step": 6170 }, { "epoch": 0.815272318921954, "grad_norm": 0.15514323115348816, "learning_rate": 1.640979567248796e-05, "loss": 0.0148, "step": 6171 }, { "epoch": 0.815404432407438, "grad_norm": 0.2614951729774475, "learning_rate": 1.63869909710596e-05, "loss": 0.0127, "step": 6172 }, { "epoch": 0.815536545892922, "grad_norm": 0.26733624935150146, "learning_rate": 1.6364200712215194e-05, "loss": 0.0322, "step": 6173 }, { "epoch": 0.815668659378406, "grad_norm": 0.13763479888439178, "learning_rate": 1.6341424899891355e-05, "loss": 0.0155, "step": 6174 }, { "epoch": 0.8158007728638901, "grad_norm": 0.3485512137413025, "learning_rate": 1.631866353802217e-05, "loss": 0.0181, "step": 6175 }, { "epoch": 0.8159328863493741, "grad_norm": 0.13298067450523376, "learning_rate": 1.6295916630539286e-05, "loss": 0.0218, "step": 6176 }, { "epoch": 0.8160649998348581, "grad_norm": 0.11390813440084457, "learning_rate": 1.6273184181371724e-05, "loss": 0.0089, "step": 6177 }, { "epoch": 0.8161971133203422, "grad_norm": 0.19629289209842682, "learning_rate": 1.6250466194446147e-05, "loss": 0.0156, "step": 6178 }, { "epoch": 0.8163292268058262, "grad_norm": 0.1922062635421753, "learning_rate": 1.6227762673686665e-05, "loss": 0.0193, "step": 6179 }, { "epoch": 0.8164613402913102, "grad_norm": 0.225606307387352, "learning_rate": 1.620507362301483e-05, "loss": 0.0152, "step": 6180 }, { "epoch": 0.8165934537767943, "grad_norm": 0.16128596663475037, "learning_rate": 1.6182399046349806e-05, "loss": 0.0147, "step": 6181 }, { "epoch": 0.8167255672622783, "grad_norm": 0.09733172506093979, "learning_rate": 1.6159738947608184e-05, "loss": 0.0127, "step": 6182 }, { "epoch": 0.8168576807477623, "grad_norm": 0.17848917841911316, "learning_rate": 1.61370933307041e-05, "loss": 0.0119, "step": 6183 }, { "epoch": 0.8169897942332464, "grad_norm": 0.19904151558876038, "learning_rate": 1.6114462199549106e-05, "loss": 0.0181, "step": 6184 }, { "epoch": 0.8171219077187304, "grad_norm": 0.07581532746553421, "learning_rate": 1.6091845558052343e-05, "loss": 0.0073, "step": 6185 }, { "epoch": 0.8172540212042144, "grad_norm": 0.3024727702140808, "learning_rate": 1.6069243410120427e-05, "loss": 0.0215, "step": 6186 }, { "epoch": 0.8173861346896985, "grad_norm": 0.08493215590715408, "learning_rate": 1.6046655759657413e-05, "loss": 0.0071, "step": 6187 }, { "epoch": 0.8175182481751825, "grad_norm": 0.1411338597536087, "learning_rate": 1.6024082610564918e-05, "loss": 0.0125, "step": 6188 }, { "epoch": 0.8176503616606665, "grad_norm": 0.11337349563837051, "learning_rate": 1.6001523966742025e-05, "loss": 0.0118, "step": 6189 }, { "epoch": 0.8177824751461505, "grad_norm": 0.2526368200778961, "learning_rate": 1.597897983208536e-05, "loss": 0.0281, "step": 6190 }, { "epoch": 0.8179145886316346, "grad_norm": 0.22702518105506897, "learning_rate": 1.5956450210488936e-05, "loss": 0.0265, "step": 6191 }, { "epoch": 0.8180467021171186, "grad_norm": 0.17712631821632385, "learning_rate": 1.5933935105844345e-05, "loss": 0.0167, "step": 6192 }, { "epoch": 0.8181788156026026, "grad_norm": 0.22530877590179443, "learning_rate": 1.591143452204067e-05, "loss": 0.0147, "step": 6193 }, { "epoch": 0.8183109290880867, "grad_norm": 0.13503044843673706, "learning_rate": 1.588894846296445e-05, "loss": 0.0118, "step": 6194 }, { "epoch": 0.8184430425735707, "grad_norm": 0.22267352044582367, "learning_rate": 1.586647693249973e-05, "loss": 0.0241, "step": 6195 }, { "epoch": 0.8185751560590547, "grad_norm": 0.1505124866962433, "learning_rate": 1.5844019934528088e-05, "loss": 0.016, "step": 6196 }, { "epoch": 0.8187072695445388, "grad_norm": 0.1663101613521576, "learning_rate": 1.5821577472928484e-05, "loss": 0.016, "step": 6197 }, { "epoch": 0.8188393830300228, "grad_norm": 0.135949045419693, "learning_rate": 1.5799149551577464e-05, "loss": 0.0138, "step": 6198 }, { "epoch": 0.8189714965155068, "grad_norm": 0.15780101716518402, "learning_rate": 1.577673617434906e-05, "loss": 0.0093, "step": 6199 }, { "epoch": 0.8191036100009909, "grad_norm": 0.2328743040561676, "learning_rate": 1.575433734511471e-05, "loss": 0.0176, "step": 6200 }, { "epoch": 0.8192357234864749, "grad_norm": 0.3225279748439789, "learning_rate": 1.573195306774342e-05, "loss": 0.0189, "step": 6201 }, { "epoch": 0.8193678369719589, "grad_norm": 0.2229243665933609, "learning_rate": 1.5709583346101653e-05, "loss": 0.0219, "step": 6202 }, { "epoch": 0.819499950457443, "grad_norm": 0.17149214446544647, "learning_rate": 1.5687228184053393e-05, "loss": 0.0154, "step": 6203 }, { "epoch": 0.819632063942927, "grad_norm": 0.18319302797317505, "learning_rate": 1.566488758546002e-05, "loss": 0.0204, "step": 6204 }, { "epoch": 0.819764177428411, "grad_norm": 0.1378282606601715, "learning_rate": 1.564256155418047e-05, "loss": 0.0158, "step": 6205 }, { "epoch": 0.819896290913895, "grad_norm": 0.17166048288345337, "learning_rate": 1.5620250094071188e-05, "loss": 0.0112, "step": 6206 }, { "epoch": 0.8200284043993791, "grad_norm": 0.17572304606437683, "learning_rate": 1.5597953208986017e-05, "loss": 0.0151, "step": 6207 }, { "epoch": 0.8201605178848631, "grad_norm": 0.1491813212633133, "learning_rate": 1.557567090277633e-05, "loss": 0.0105, "step": 6208 }, { "epoch": 0.8202926313703471, "grad_norm": 0.18040210008621216, "learning_rate": 1.5553403179290994e-05, "loss": 0.0159, "step": 6209 }, { "epoch": 0.8204247448558312, "grad_norm": 0.18628931045532227, "learning_rate": 1.553115004237635e-05, "loss": 0.0187, "step": 6210 }, { "epoch": 0.8205568583413152, "grad_norm": 0.24394503235816956, "learning_rate": 1.5508911495876188e-05, "loss": 0.0226, "step": 6211 }, { "epoch": 0.8206889718267992, "grad_norm": 0.14266592264175415, "learning_rate": 1.5486687543631807e-05, "loss": 0.0207, "step": 6212 }, { "epoch": 0.8208210853122833, "grad_norm": 0.11844737082719803, "learning_rate": 1.5464478189482013e-05, "loss": 0.0126, "step": 6213 }, { "epoch": 0.8209531987977673, "grad_norm": 0.1422780603170395, "learning_rate": 1.5442283437263005e-05, "loss": 0.0178, "step": 6214 }, { "epoch": 0.8210853122832513, "grad_norm": 0.1076541319489479, "learning_rate": 1.542010329080853e-05, "loss": 0.0056, "step": 6215 }, { "epoch": 0.8212174257687354, "grad_norm": 0.13329507410526276, "learning_rate": 1.5397937753949798e-05, "loss": 0.015, "step": 6216 }, { "epoch": 0.8213495392542194, "grad_norm": 0.14316102862358093, "learning_rate": 1.5375786830515515e-05, "loss": 0.0104, "step": 6217 }, { "epoch": 0.8214816527397034, "grad_norm": 0.13540340960025787, "learning_rate": 1.5353650524331787e-05, "loss": 0.0106, "step": 6218 }, { "epoch": 0.8216137662251874, "grad_norm": 0.1679118573665619, "learning_rate": 1.5331528839222287e-05, "loss": 0.0145, "step": 6219 }, { "epoch": 0.8217458797106715, "grad_norm": 0.21163515746593475, "learning_rate": 1.5309421779008125e-05, "loss": 0.0152, "step": 6220 }, { "epoch": 0.8218779931961555, "grad_norm": 0.3536343574523926, "learning_rate": 1.528732934750785e-05, "loss": 0.0184, "step": 6221 }, { "epoch": 0.8220101066816395, "grad_norm": 0.16983529925346375, "learning_rate": 1.526525154853753e-05, "loss": 0.0181, "step": 6222 }, { "epoch": 0.8221422201671236, "grad_norm": 0.20116670429706573, "learning_rate": 1.52431883859107e-05, "loss": 0.0223, "step": 6223 }, { "epoch": 0.8222743336526076, "grad_norm": 0.20575542747974396, "learning_rate": 1.5221139863438372e-05, "loss": 0.0225, "step": 6224 }, { "epoch": 0.8224064471380916, "grad_norm": 0.12439869344234467, "learning_rate": 1.5199105984928985e-05, "loss": 0.0139, "step": 6225 }, { "epoch": 0.8225385606235757, "grad_norm": 0.10762098431587219, "learning_rate": 1.5177086754188486e-05, "loss": 0.007, "step": 6226 }, { "epoch": 0.8226706741090597, "grad_norm": 0.20593222975730896, "learning_rate": 1.5155082175020318e-05, "loss": 0.0219, "step": 6227 }, { "epoch": 0.8228027875945437, "grad_norm": 0.1083982065320015, "learning_rate": 1.513309225122531e-05, "loss": 0.0094, "step": 6228 }, { "epoch": 0.8229349010800278, "grad_norm": 0.12976078689098358, "learning_rate": 1.5111116986601848e-05, "loss": 0.009, "step": 6229 }, { "epoch": 0.8230670145655118, "grad_norm": 0.17364388704299927, "learning_rate": 1.5089156384945725e-05, "loss": 0.0152, "step": 6230 }, { "epoch": 0.8231991280509958, "grad_norm": 0.11856499314308167, "learning_rate": 1.5067210450050261e-05, "loss": 0.0128, "step": 6231 }, { "epoch": 0.8233312415364799, "grad_norm": 0.2314266711473465, "learning_rate": 1.5045279185706162e-05, "loss": 0.0183, "step": 6232 }, { "epoch": 0.8234633550219639, "grad_norm": 0.11939840018749237, "learning_rate": 1.5023362595701652e-05, "loss": 0.0127, "step": 6233 }, { "epoch": 0.8235954685074479, "grad_norm": 0.10636747628450394, "learning_rate": 1.5001460683822456e-05, "loss": 0.0103, "step": 6234 }, { "epoch": 0.823727581992932, "grad_norm": 0.14871914684772491, "learning_rate": 1.4979573453851658e-05, "loss": 0.0128, "step": 6235 }, { "epoch": 0.823859695478416, "grad_norm": 0.16616952419281006, "learning_rate": 1.4957700909569894e-05, "loss": 0.0159, "step": 6236 }, { "epoch": 0.8239918089639, "grad_norm": 0.167495995759964, "learning_rate": 1.4935843054755238e-05, "loss": 0.015, "step": 6237 }, { "epoch": 0.824123922449384, "grad_norm": 0.15769924223423004, "learning_rate": 1.491399989318325e-05, "loss": 0.0135, "step": 6238 }, { "epoch": 0.8242560359348681, "grad_norm": 0.1334027200937271, "learning_rate": 1.489217142862689e-05, "loss": 0.0134, "step": 6239 }, { "epoch": 0.8243881494203521, "grad_norm": 0.2270067185163498, "learning_rate": 1.4870357664856626e-05, "loss": 0.0108, "step": 6240 }, { "epoch": 0.8245202629058361, "grad_norm": 0.12010292708873749, "learning_rate": 1.4848558605640406e-05, "loss": 0.0064, "step": 6241 }, { "epoch": 0.8246523763913202, "grad_norm": 0.23596997559070587, "learning_rate": 1.4826774254743559e-05, "loss": 0.0175, "step": 6242 }, { "epoch": 0.8247844898768042, "grad_norm": 0.24235551059246063, "learning_rate": 1.4805004615928953e-05, "loss": 0.0251, "step": 6243 }, { "epoch": 0.8249166033622882, "grad_norm": 0.1819847673177719, "learning_rate": 1.478324969295689e-05, "loss": 0.0122, "step": 6244 }, { "epoch": 0.8250487168477723, "grad_norm": 0.23536427319049835, "learning_rate": 1.4761509489585146e-05, "loss": 0.0103, "step": 6245 }, { "epoch": 0.8251808303332563, "grad_norm": 0.19968615472316742, "learning_rate": 1.473978400956889e-05, "loss": 0.0244, "step": 6246 }, { "epoch": 0.8253129438187403, "grad_norm": 0.2093326598405838, "learning_rate": 1.4718073256660802e-05, "loss": 0.0135, "step": 6247 }, { "epoch": 0.8254450573042243, "grad_norm": 0.1305864006280899, "learning_rate": 1.4696377234611058e-05, "loss": 0.0141, "step": 6248 }, { "epoch": 0.8255771707897084, "grad_norm": 0.2174091637134552, "learning_rate": 1.4674695947167171e-05, "loss": 0.0229, "step": 6249 }, { "epoch": 0.8257092842751924, "grad_norm": 0.2369208186864853, "learning_rate": 1.4653029398074202e-05, "loss": 0.015, "step": 6250 }, { "epoch": 0.8258413977606764, "grad_norm": 0.13431859016418457, "learning_rate": 1.4631377591074658e-05, "loss": 0.0108, "step": 6251 }, { "epoch": 0.8259735112461605, "grad_norm": 0.12542381882667542, "learning_rate": 1.4609740529908467e-05, "loss": 0.0139, "step": 6252 }, { "epoch": 0.8261056247316445, "grad_norm": 0.22405870258808136, "learning_rate": 1.4588118218313041e-05, "loss": 0.022, "step": 6253 }, { "epoch": 0.8262377382171285, "grad_norm": 0.10808577388525009, "learning_rate": 1.4566510660023225e-05, "loss": 0.0134, "step": 6254 }, { "epoch": 0.8263698517026126, "grad_norm": 0.14455221593379974, "learning_rate": 1.4544917858771335e-05, "loss": 0.0167, "step": 6255 }, { "epoch": 0.8265019651880966, "grad_norm": 0.15318076312541962, "learning_rate": 1.4523339818287085e-05, "loss": 0.0102, "step": 6256 }, { "epoch": 0.8266340786735806, "grad_norm": 0.21173270046710968, "learning_rate": 1.4501776542297706e-05, "loss": 0.0208, "step": 6257 }, { "epoch": 0.8267661921590647, "grad_norm": 0.1443268209695816, "learning_rate": 1.4480228034527876e-05, "loss": 0.0137, "step": 6258 }, { "epoch": 0.8268983056445487, "grad_norm": 0.10800027847290039, "learning_rate": 1.4458694298699626e-05, "loss": 0.0145, "step": 6259 }, { "epoch": 0.8270304191300327, "grad_norm": 0.1543063521385193, "learning_rate": 1.443717533853256e-05, "loss": 0.0174, "step": 6260 }, { "epoch": 0.8271625326155168, "grad_norm": 0.1753726601600647, "learning_rate": 1.441567115774366e-05, "loss": 0.0208, "step": 6261 }, { "epoch": 0.8272946461010008, "grad_norm": 0.1982455551624298, "learning_rate": 1.4394181760047398e-05, "loss": 0.0106, "step": 6262 }, { "epoch": 0.8274267595864848, "grad_norm": 0.25313007831573486, "learning_rate": 1.4372707149155617e-05, "loss": 0.0231, "step": 6263 }, { "epoch": 0.8275588730719688, "grad_norm": 0.15842032432556152, "learning_rate": 1.4351247328777672e-05, "loss": 0.0231, "step": 6264 }, { "epoch": 0.8276909865574529, "grad_norm": 0.22092995047569275, "learning_rate": 1.4329802302620388e-05, "loss": 0.0193, "step": 6265 }, { "epoch": 0.8278231000429369, "grad_norm": 0.1875273585319519, "learning_rate": 1.4308372074387933e-05, "loss": 0.0187, "step": 6266 }, { "epoch": 0.8279552135284209, "grad_norm": 0.17721475660800934, "learning_rate": 1.428695664778199e-05, "loss": 0.0119, "step": 6267 }, { "epoch": 0.828087327013905, "grad_norm": 0.11427704244852066, "learning_rate": 1.4265556026501703e-05, "loss": 0.011, "step": 6268 }, { "epoch": 0.828219440499389, "grad_norm": 0.16599473357200623, "learning_rate": 1.4244170214243624e-05, "loss": 0.0098, "step": 6269 }, { "epoch": 0.828351553984873, "grad_norm": 0.1077287420630455, "learning_rate": 1.4222799214701721e-05, "loss": 0.0094, "step": 6270 }, { "epoch": 0.8284836674703571, "grad_norm": 0.16686657071113586, "learning_rate": 1.4201443031567451e-05, "loss": 0.0113, "step": 6271 }, { "epoch": 0.8286157809558411, "grad_norm": 0.17287173867225647, "learning_rate": 1.4180101668529721e-05, "loss": 0.0046, "step": 6272 }, { "epoch": 0.8287478944413251, "grad_norm": 0.10685086995363235, "learning_rate": 1.4158775129274815e-05, "loss": 0.0078, "step": 6273 }, { "epoch": 0.8288800079268092, "grad_norm": 0.23552167415618896, "learning_rate": 1.4137463417486495e-05, "loss": 0.0187, "step": 6274 }, { "epoch": 0.8290121214122932, "grad_norm": 0.24776104092597961, "learning_rate": 1.4116166536845988e-05, "loss": 0.0167, "step": 6275 }, { "epoch": 0.8291442348977772, "grad_norm": 0.15167926251888275, "learning_rate": 1.4094884491031934e-05, "loss": 0.02, "step": 6276 }, { "epoch": 0.8292763483832613, "grad_norm": 0.22547586262226105, "learning_rate": 1.4073617283720376e-05, "loss": 0.0244, "step": 6277 }, { "epoch": 0.8294084618687453, "grad_norm": 0.13497163355350494, "learning_rate": 1.4052364918584837e-05, "loss": 0.0127, "step": 6278 }, { "epoch": 0.8295405753542293, "grad_norm": 0.14811024069786072, "learning_rate": 1.4031127399296296e-05, "loss": 0.0194, "step": 6279 }, { "epoch": 0.8296726888397133, "grad_norm": 0.1362769901752472, "learning_rate": 1.4009904729523083e-05, "loss": 0.0157, "step": 6280 }, { "epoch": 0.8298048023251974, "grad_norm": 0.16739948093891144, "learning_rate": 1.3988696912931065e-05, "loss": 0.0076, "step": 6281 }, { "epoch": 0.8299369158106814, "grad_norm": 0.1778571456670761, "learning_rate": 1.396750395318347e-05, "loss": 0.0116, "step": 6282 }, { "epoch": 0.8300690292961654, "grad_norm": 0.18066781759262085, "learning_rate": 1.3946325853941012e-05, "loss": 0.0104, "step": 6283 }, { "epoch": 0.8302011427816495, "grad_norm": 0.15547846257686615, "learning_rate": 1.3925162618861776e-05, "loss": 0.0076, "step": 6284 }, { "epoch": 0.8303332562671335, "grad_norm": 0.2607351541519165, "learning_rate": 1.3904014251601328e-05, "loss": 0.0104, "step": 6285 }, { "epoch": 0.8304653697526175, "grad_norm": 0.14374835789203644, "learning_rate": 1.3882880755812689e-05, "loss": 0.0135, "step": 6286 }, { "epoch": 0.8305974832381016, "grad_norm": 0.2493097335100174, "learning_rate": 1.3861762135146217e-05, "loss": 0.0216, "step": 6287 }, { "epoch": 0.8307295967235856, "grad_norm": 0.15661141276359558, "learning_rate": 1.3840658393249784e-05, "loss": 0.0151, "step": 6288 }, { "epoch": 0.8308617102090696, "grad_norm": 0.1942441165447235, "learning_rate": 1.3819569533768673e-05, "loss": 0.0124, "step": 6289 }, { "epoch": 0.8309938236945537, "grad_norm": 0.10488870739936829, "learning_rate": 1.3798495560345603e-05, "loss": 0.0078, "step": 6290 }, { "epoch": 0.8311259371800377, "grad_norm": 0.1622200906276703, "learning_rate": 1.3777436476620675e-05, "loss": 0.0115, "step": 6291 }, { "epoch": 0.8312580506655217, "grad_norm": 0.22020554542541504, "learning_rate": 1.3756392286231468e-05, "loss": 0.0153, "step": 6292 }, { "epoch": 0.8313901641510058, "grad_norm": 0.11919867992401123, "learning_rate": 1.3735362992812994e-05, "loss": 0.0146, "step": 6293 }, { "epoch": 0.8315222776364898, "grad_norm": 0.1653178334236145, "learning_rate": 1.3714348599997628e-05, "loss": 0.0156, "step": 6294 }, { "epoch": 0.8316543911219738, "grad_norm": 0.09803801029920578, "learning_rate": 1.3693349111415243e-05, "loss": 0.0113, "step": 6295 }, { "epoch": 0.8317865046074578, "grad_norm": 0.20118437707424164, "learning_rate": 1.3672364530693094e-05, "loss": 0.0268, "step": 6296 }, { "epoch": 0.8319186180929419, "grad_norm": 0.12770311534404755, "learning_rate": 1.3651394861455902e-05, "loss": 0.0113, "step": 6297 }, { "epoch": 0.8320507315784259, "grad_norm": 0.12034901976585388, "learning_rate": 1.3630440107325737e-05, "loss": 0.014, "step": 6298 }, { "epoch": 0.8321828450639099, "grad_norm": 0.16769957542419434, "learning_rate": 1.3609500271922171e-05, "loss": 0.0135, "step": 6299 }, { "epoch": 0.832314958549394, "grad_norm": 0.16110952198505402, "learning_rate": 1.3588575358862188e-05, "loss": 0.0189, "step": 6300 }, { "epoch": 0.832447072034878, "grad_norm": 0.09111898392438889, "learning_rate": 1.3567665371760141e-05, "loss": 0.0064, "step": 6301 }, { "epoch": 0.832579185520362, "grad_norm": 0.1689785271883011, "learning_rate": 1.3546770314227841e-05, "loss": 0.0126, "step": 6302 }, { "epoch": 0.8327112990058461, "grad_norm": 0.19347065687179565, "learning_rate": 1.3525890189874536e-05, "loss": 0.0109, "step": 6303 }, { "epoch": 0.8328434124913301, "grad_norm": 0.18107086420059204, "learning_rate": 1.3505025002306893e-05, "loss": 0.0156, "step": 6304 }, { "epoch": 0.8329755259768141, "grad_norm": 0.16515274345874786, "learning_rate": 1.3484174755128932e-05, "loss": 0.0211, "step": 6305 }, { "epoch": 0.8331076394622982, "grad_norm": 0.10837220400571823, "learning_rate": 1.3463339451942181e-05, "loss": 0.0082, "step": 6306 }, { "epoch": 0.8332397529477822, "grad_norm": 0.18501004576683044, "learning_rate": 1.3442519096345563e-05, "loss": 0.0301, "step": 6307 }, { "epoch": 0.8333718664332662, "grad_norm": 0.12566491961479187, "learning_rate": 1.3421713691935356e-05, "loss": 0.0104, "step": 6308 }, { "epoch": 0.8335039799187502, "grad_norm": 0.14533939957618713, "learning_rate": 1.340092324230533e-05, "loss": 0.0126, "step": 6309 }, { "epoch": 0.8336360934042343, "grad_norm": 0.11882299929857254, "learning_rate": 1.3380147751046646e-05, "loss": 0.0096, "step": 6310 }, { "epoch": 0.8337682068897183, "grad_norm": 0.19842758774757385, "learning_rate": 1.3359387221747876e-05, "loss": 0.015, "step": 6311 }, { "epoch": 0.8339003203752023, "grad_norm": 0.1555911749601364, "learning_rate": 1.3338641657995033e-05, "loss": 0.0254, "step": 6312 }, { "epoch": 0.8340324338606864, "grad_norm": 0.12339769303798676, "learning_rate": 1.3317911063371536e-05, "loss": 0.0098, "step": 6313 }, { "epoch": 0.8341645473461704, "grad_norm": 0.15423816442489624, "learning_rate": 1.3297195441458154e-05, "loss": 0.0155, "step": 6314 }, { "epoch": 0.8342966608316544, "grad_norm": 0.2922482192516327, "learning_rate": 1.327649479583316e-05, "loss": 0.0221, "step": 6315 }, { "epoch": 0.8344287743171385, "grad_norm": 0.19567056000232697, "learning_rate": 1.3255809130072194e-05, "loss": 0.0133, "step": 6316 }, { "epoch": 0.8345608878026225, "grad_norm": 0.18917563557624817, "learning_rate": 1.3235138447748342e-05, "loss": 0.0175, "step": 6317 }, { "epoch": 0.8346930012881065, "grad_norm": 0.1895277500152588, "learning_rate": 1.3214482752432033e-05, "loss": 0.0193, "step": 6318 }, { "epoch": 0.8348251147735906, "grad_norm": 0.10119043290615082, "learning_rate": 1.3193842047691174e-05, "loss": 0.0102, "step": 6319 }, { "epoch": 0.8349572282590746, "grad_norm": 0.21565113961696625, "learning_rate": 1.3173216337091098e-05, "loss": 0.0242, "step": 6320 }, { "epoch": 0.8350893417445586, "grad_norm": 0.20141518115997314, "learning_rate": 1.3152605624194436e-05, "loss": 0.0182, "step": 6321 }, { "epoch": 0.8352214552300427, "grad_norm": 0.08046968281269073, "learning_rate": 1.3132009912561361e-05, "loss": 0.0064, "step": 6322 }, { "epoch": 0.8353535687155267, "grad_norm": 0.13898640871047974, "learning_rate": 1.311142920574937e-05, "loss": 0.0135, "step": 6323 }, { "epoch": 0.8354856822010107, "grad_norm": 0.12394464761018753, "learning_rate": 1.3090863507313422e-05, "loss": 0.0107, "step": 6324 }, { "epoch": 0.8356177956864947, "grad_norm": 0.23427864909172058, "learning_rate": 1.307031282080582e-05, "loss": 0.0229, "step": 6325 }, { "epoch": 0.8357499091719788, "grad_norm": 0.13156038522720337, "learning_rate": 1.3049777149776332e-05, "loss": 0.0175, "step": 6326 }, { "epoch": 0.8358820226574628, "grad_norm": 0.17762839794158936, "learning_rate": 1.3029256497772136e-05, "loss": 0.0177, "step": 6327 }, { "epoch": 0.8360141361429468, "grad_norm": 0.10986623913049698, "learning_rate": 1.3008750868337738e-05, "loss": 0.0153, "step": 6328 }, { "epoch": 0.8361462496284309, "grad_norm": 0.23786404728889465, "learning_rate": 1.2988260265015128e-05, "loss": 0.0173, "step": 6329 }, { "epoch": 0.8362783631139149, "grad_norm": 0.13705205917358398, "learning_rate": 1.2967784691343676e-05, "loss": 0.0129, "step": 6330 }, { "epoch": 0.8364104765993989, "grad_norm": 0.1301295906305313, "learning_rate": 1.2947324150860174e-05, "loss": 0.0174, "step": 6331 }, { "epoch": 0.836542590084883, "grad_norm": 0.17995145916938782, "learning_rate": 1.2926878647098762e-05, "loss": 0.0178, "step": 6332 }, { "epoch": 0.836674703570367, "grad_norm": 0.11307818442583084, "learning_rate": 1.2906448183591024e-05, "loss": 0.0083, "step": 6333 }, { "epoch": 0.836806817055851, "grad_norm": 0.2079186737537384, "learning_rate": 1.2886032763865975e-05, "loss": 0.0111, "step": 6334 }, { "epoch": 0.8369389305413351, "grad_norm": 0.15454719960689545, "learning_rate": 1.2865632391449956e-05, "loss": 0.0187, "step": 6335 }, { "epoch": 0.8370710440268191, "grad_norm": 0.1949504017829895, "learning_rate": 1.2845247069866761e-05, "loss": 0.0207, "step": 6336 }, { "epoch": 0.8372031575123031, "grad_norm": 0.11500924825668335, "learning_rate": 1.2824876802637586e-05, "loss": 0.0128, "step": 6337 }, { "epoch": 0.8373352709977872, "grad_norm": 0.1913449466228485, "learning_rate": 1.2804521593281016e-05, "loss": 0.0188, "step": 6338 }, { "epoch": 0.8374673844832712, "grad_norm": 0.19859804213047028, "learning_rate": 1.2784181445313015e-05, "loss": 0.0271, "step": 6339 }, { "epoch": 0.8375994979687552, "grad_norm": 0.17501254379749298, "learning_rate": 1.2763856362246962e-05, "loss": 0.02, "step": 6340 }, { "epoch": 0.8377316114542392, "grad_norm": 0.15442462265491486, "learning_rate": 1.2743546347593672e-05, "loss": 0.0149, "step": 6341 }, { "epoch": 0.8378637249397233, "grad_norm": 0.17348526418209076, "learning_rate": 1.2723251404861259e-05, "loss": 0.0178, "step": 6342 }, { "epoch": 0.8379958384252073, "grad_norm": 0.1573207974433899, "learning_rate": 1.270297153755534e-05, "loss": 0.027, "step": 6343 }, { "epoch": 0.8381279519106913, "grad_norm": 0.9048620462417603, "learning_rate": 1.2682706749178874e-05, "loss": 0.0215, "step": 6344 }, { "epoch": 0.8382600653961754, "grad_norm": 0.12792988121509552, "learning_rate": 1.2662457043232235e-05, "loss": 0.0092, "step": 6345 }, { "epoch": 0.8383921788816594, "grad_norm": 0.1024986281991005, "learning_rate": 1.2642222423213145e-05, "loss": 0.0105, "step": 6346 }, { "epoch": 0.8385242923671434, "grad_norm": 0.16506196558475494, "learning_rate": 1.262200289261679e-05, "loss": 0.0158, "step": 6347 }, { "epoch": 0.8386564058526275, "grad_norm": 0.11527720093727112, "learning_rate": 1.260179845493572e-05, "loss": 0.0124, "step": 6348 }, { "epoch": 0.8387885193381114, "grad_norm": 0.09696735441684723, "learning_rate": 1.2581609113659842e-05, "loss": 0.0161, "step": 6349 }, { "epoch": 0.8389206328235954, "grad_norm": 0.16589656472206116, "learning_rate": 1.25614348722765e-05, "loss": 0.0157, "step": 6350 }, { "epoch": 0.8390527463090794, "grad_norm": 0.16736134886741638, "learning_rate": 1.2541275734270419e-05, "loss": 0.0159, "step": 6351 }, { "epoch": 0.8391848597945635, "grad_norm": 0.13019654154777527, "learning_rate": 1.2521131703123745e-05, "loss": 0.0089, "step": 6352 }, { "epoch": 0.8393169732800475, "grad_norm": 0.15413515269756317, "learning_rate": 1.2501002782315918e-05, "loss": 0.0146, "step": 6353 }, { "epoch": 0.8394490867655315, "grad_norm": 0.14028388261795044, "learning_rate": 1.248088897532388e-05, "loss": 0.0148, "step": 6354 }, { "epoch": 0.8395812002510156, "grad_norm": 0.12065458297729492, "learning_rate": 1.2460790285621916e-05, "loss": 0.0117, "step": 6355 }, { "epoch": 0.8397133137364996, "grad_norm": 0.19000723958015442, "learning_rate": 1.2440706716681672e-05, "loss": 0.0273, "step": 6356 }, { "epoch": 0.8398454272219836, "grad_norm": 0.2658844590187073, "learning_rate": 1.2420638271972218e-05, "loss": 0.0329, "step": 6357 }, { "epoch": 0.8399775407074677, "grad_norm": 0.22127166390419006, "learning_rate": 1.2400584954960016e-05, "loss": 0.0139, "step": 6358 }, { "epoch": 0.8401096541929517, "grad_norm": 0.13642321527004242, "learning_rate": 1.2380546769108903e-05, "loss": 0.0162, "step": 6359 }, { "epoch": 0.8402417676784357, "grad_norm": 0.24587512016296387, "learning_rate": 1.2360523717880068e-05, "loss": 0.031, "step": 6360 }, { "epoch": 0.8403738811639198, "grad_norm": 0.17175354063510895, "learning_rate": 1.2340515804732155e-05, "loss": 0.0194, "step": 6361 }, { "epoch": 0.8405059946494038, "grad_norm": 0.1904156357049942, "learning_rate": 1.2320523033121156e-05, "loss": 0.015, "step": 6362 }, { "epoch": 0.8406381081348878, "grad_norm": 0.12668251991271973, "learning_rate": 1.2300545406500408e-05, "loss": 0.011, "step": 6363 }, { "epoch": 0.8407702216203718, "grad_norm": 0.17923864722251892, "learning_rate": 1.2280582928320717e-05, "loss": 0.0188, "step": 6364 }, { "epoch": 0.8409023351058559, "grad_norm": 0.154831200838089, "learning_rate": 1.2260635602030202e-05, "loss": 0.0175, "step": 6365 }, { "epoch": 0.8410344485913399, "grad_norm": 0.13334999978542328, "learning_rate": 1.2240703431074418e-05, "loss": 0.013, "step": 6366 }, { "epoch": 0.8411665620768239, "grad_norm": 0.17862273752689362, "learning_rate": 1.2220786418896236e-05, "loss": 0.0161, "step": 6367 }, { "epoch": 0.841298675562308, "grad_norm": 0.2127719670534134, "learning_rate": 1.2200884568935956e-05, "loss": 0.0221, "step": 6368 }, { "epoch": 0.841430789047792, "grad_norm": 0.09783175587654114, "learning_rate": 1.2180997884631296e-05, "loss": 0.0088, "step": 6369 }, { "epoch": 0.841562902533276, "grad_norm": 0.15384633839130402, "learning_rate": 1.2161126369417252e-05, "loss": 0.0143, "step": 6370 }, { "epoch": 0.8416950160187601, "grad_norm": 0.14649969339370728, "learning_rate": 1.2141270026726293e-05, "loss": 0.0163, "step": 6371 }, { "epoch": 0.8418271295042441, "grad_norm": 0.1692427098751068, "learning_rate": 1.2121428859988227e-05, "loss": 0.0216, "step": 6372 }, { "epoch": 0.8419592429897281, "grad_norm": 0.2643669545650482, "learning_rate": 1.2101602872630224e-05, "loss": 0.0154, "step": 6373 }, { "epoch": 0.8420913564752122, "grad_norm": 0.0852733626961708, "learning_rate": 1.2081792068076858e-05, "loss": 0.0067, "step": 6374 }, { "epoch": 0.8422234699606962, "grad_norm": 0.13838987052440643, "learning_rate": 1.206199644975009e-05, "loss": 0.0146, "step": 6375 }, { "epoch": 0.8423555834461802, "grad_norm": 0.1700979620218277, "learning_rate": 1.2042216021069252e-05, "loss": 0.0219, "step": 6376 }, { "epoch": 0.8424876969316643, "grad_norm": 0.14072169363498688, "learning_rate": 1.2022450785451001e-05, "loss": 0.0125, "step": 6377 }, { "epoch": 0.8426198104171483, "grad_norm": 0.15077058970928192, "learning_rate": 1.2002700746309437e-05, "loss": 0.0109, "step": 6378 }, { "epoch": 0.8427519239026323, "grad_norm": 0.26554206013679504, "learning_rate": 1.1982965907056032e-05, "loss": 0.0241, "step": 6379 }, { "epoch": 0.8428840373881163, "grad_norm": 0.10113240778446198, "learning_rate": 1.1963246271099571e-05, "loss": 0.0041, "step": 6380 }, { "epoch": 0.8430161508736004, "grad_norm": 0.3097294569015503, "learning_rate": 1.1943541841846262e-05, "loss": 0.0248, "step": 6381 }, { "epoch": 0.8431482643590844, "grad_norm": 0.13991662859916687, "learning_rate": 1.1923852622699693e-05, "loss": 0.0058, "step": 6382 }, { "epoch": 0.8432803778445684, "grad_norm": 0.13752928376197815, "learning_rate": 1.1904178617060812e-05, "loss": 0.0102, "step": 6383 }, { "epoch": 0.8434124913300525, "grad_norm": 0.16692815721035004, "learning_rate": 1.1884519828327912e-05, "loss": 0.0153, "step": 6384 }, { "epoch": 0.8435446048155365, "grad_norm": 0.12915128469467163, "learning_rate": 1.1864876259896684e-05, "loss": 0.012, "step": 6385 }, { "epoch": 0.8436767183010205, "grad_norm": 0.12745681405067444, "learning_rate": 1.1845247915160219e-05, "loss": 0.0116, "step": 6386 }, { "epoch": 0.8438088317865046, "grad_norm": 0.13691522181034088, "learning_rate": 1.18256347975089e-05, "loss": 0.0193, "step": 6387 }, { "epoch": 0.8439409452719886, "grad_norm": 0.24836905300617218, "learning_rate": 1.1806036910330554e-05, "loss": 0.0289, "step": 6388 }, { "epoch": 0.8440730587574726, "grad_norm": 0.12612557411193848, "learning_rate": 1.1786454257010337e-05, "loss": 0.0081, "step": 6389 }, { "epoch": 0.8442051722429567, "grad_norm": 0.1398400068283081, "learning_rate": 1.1766886840930824e-05, "loss": 0.0217, "step": 6390 }, { "epoch": 0.8443372857284407, "grad_norm": 0.19320712983608246, "learning_rate": 1.1747334665471865e-05, "loss": 0.0297, "step": 6391 }, { "epoch": 0.8444693992139247, "grad_norm": 0.12965945899486542, "learning_rate": 1.1727797734010771e-05, "loss": 0.0153, "step": 6392 }, { "epoch": 0.8446015126994088, "grad_norm": 0.08290174603462219, "learning_rate": 1.1708276049922174e-05, "loss": 0.008, "step": 6393 }, { "epoch": 0.8447336261848928, "grad_norm": 0.28244665265083313, "learning_rate": 1.1688769616578067e-05, "loss": 0.019, "step": 6394 }, { "epoch": 0.8448657396703768, "grad_norm": 0.23474980890750885, "learning_rate": 1.1669278437347819e-05, "loss": 0.0266, "step": 6395 }, { "epoch": 0.8449978531558608, "grad_norm": 0.11438631266355515, "learning_rate": 1.1649802515598185e-05, "loss": 0.0194, "step": 6396 }, { "epoch": 0.8451299666413449, "grad_norm": 0.11677414923906326, "learning_rate": 1.1630341854693273e-05, "loss": 0.0106, "step": 6397 }, { "epoch": 0.8452620801268289, "grad_norm": 0.1963653713464737, "learning_rate": 1.1610896457994513e-05, "loss": 0.0202, "step": 6398 }, { "epoch": 0.8453941936123129, "grad_norm": 0.13961468636989594, "learning_rate": 1.1591466328860756e-05, "loss": 0.0181, "step": 6399 }, { "epoch": 0.845526307097797, "grad_norm": 0.1912894994020462, "learning_rate": 1.1572051470648216e-05, "loss": 0.0173, "step": 6400 }, { "epoch": 0.845658420583281, "grad_norm": 0.11899048835039139, "learning_rate": 1.1552651886710398e-05, "loss": 0.0149, "step": 6401 }, { "epoch": 0.845790534068765, "grad_norm": 0.22597384452819824, "learning_rate": 1.1533267580398254e-05, "loss": 0.0179, "step": 6402 }, { "epoch": 0.8459226475542491, "grad_norm": 0.1366269737482071, "learning_rate": 1.1513898555060033e-05, "loss": 0.0067, "step": 6403 }, { "epoch": 0.8460547610397331, "grad_norm": 0.22207403182983398, "learning_rate": 1.1494544814041419e-05, "loss": 0.0179, "step": 6404 }, { "epoch": 0.8461868745252171, "grad_norm": 0.17005328834056854, "learning_rate": 1.1475206360685353e-05, "loss": 0.0162, "step": 6405 }, { "epoch": 0.8463189880107012, "grad_norm": 0.0824325904250145, "learning_rate": 1.1455883198332217e-05, "loss": 0.0036, "step": 6406 }, { "epoch": 0.8464511014961852, "grad_norm": 0.22979408502578735, "learning_rate": 1.1436575330319744e-05, "loss": 0.0226, "step": 6407 }, { "epoch": 0.8465832149816692, "grad_norm": 0.08795081824064255, "learning_rate": 1.1417282759982972e-05, "loss": 0.009, "step": 6408 }, { "epoch": 0.8467153284671532, "grad_norm": 0.30979159474372864, "learning_rate": 1.1398005490654352e-05, "loss": 0.0271, "step": 6409 }, { "epoch": 0.8468474419526373, "grad_norm": 0.1569080799818039, "learning_rate": 1.1378743525663659e-05, "loss": 0.0145, "step": 6410 }, { "epoch": 0.8469795554381213, "grad_norm": 0.08720047026872635, "learning_rate": 1.1359496868338072e-05, "loss": 0.0099, "step": 6411 }, { "epoch": 0.8471116689236053, "grad_norm": 0.2238951474428177, "learning_rate": 1.1340265522002036e-05, "loss": 0.0138, "step": 6412 }, { "epoch": 0.8472437824090894, "grad_norm": 0.17409300804138184, "learning_rate": 1.1321049489977443e-05, "loss": 0.0181, "step": 6413 }, { "epoch": 0.8473758958945734, "grad_norm": 0.23001131415367126, "learning_rate": 1.1301848775583513e-05, "loss": 0.0222, "step": 6414 }, { "epoch": 0.8475080093800574, "grad_norm": 0.15086467564105988, "learning_rate": 1.1282663382136783e-05, "loss": 0.0132, "step": 6415 }, { "epoch": 0.8476401228655415, "grad_norm": 0.08834061026573181, "learning_rate": 1.1263493312951168e-05, "loss": 0.0057, "step": 6416 }, { "epoch": 0.8477722363510255, "grad_norm": 0.12019912898540497, "learning_rate": 1.1244338571337964e-05, "loss": 0.0165, "step": 6417 }, { "epoch": 0.8479043498365095, "grad_norm": 0.29346486926078796, "learning_rate": 1.1225199160605793e-05, "loss": 0.0147, "step": 6418 }, { "epoch": 0.8480364633219936, "grad_norm": 0.13543280959129333, "learning_rate": 1.1206075084060608e-05, "loss": 0.0154, "step": 6419 }, { "epoch": 0.8481685768074776, "grad_norm": 0.1138705313205719, "learning_rate": 1.1186966345005745e-05, "loss": 0.0128, "step": 6420 }, { "epoch": 0.8483006902929616, "grad_norm": 0.1780928373336792, "learning_rate": 1.1167872946741909e-05, "loss": 0.0218, "step": 6421 }, { "epoch": 0.8484328037784457, "grad_norm": 0.16972623765468597, "learning_rate": 1.1148794892567071e-05, "loss": 0.0147, "step": 6422 }, { "epoch": 0.8485649172639297, "grad_norm": 0.14040710031986237, "learning_rate": 1.1129732185776654e-05, "loss": 0.0171, "step": 6423 }, { "epoch": 0.8486970307494137, "grad_norm": 0.1257903128862381, "learning_rate": 1.1110684829663364e-05, "loss": 0.0168, "step": 6424 }, { "epoch": 0.8488291442348977, "grad_norm": 0.12174368649721146, "learning_rate": 1.1091652827517296e-05, "loss": 0.011, "step": 6425 }, { "epoch": 0.8489612577203818, "grad_norm": 0.19201435148715973, "learning_rate": 1.1072636182625851e-05, "loss": 0.0216, "step": 6426 }, { "epoch": 0.8490933712058658, "grad_norm": 0.1880090832710266, "learning_rate": 1.1053634898273802e-05, "loss": 0.0252, "step": 6427 }, { "epoch": 0.8492254846913498, "grad_norm": 0.15559418499469757, "learning_rate": 1.1034648977743267e-05, "loss": 0.0176, "step": 6428 }, { "epoch": 0.8493575981768339, "grad_norm": 0.16924457252025604, "learning_rate": 1.1015678424313713e-05, "loss": 0.015, "step": 6429 }, { "epoch": 0.8494897116623179, "grad_norm": 0.11257937550544739, "learning_rate": 1.0996723241261942e-05, "loss": 0.0124, "step": 6430 }, { "epoch": 0.8496218251478019, "grad_norm": 0.19125103950500488, "learning_rate": 1.0977783431862144e-05, "loss": 0.0184, "step": 6431 }, { "epoch": 0.849753938633286, "grad_norm": 0.28629064559936523, "learning_rate": 1.0958858999385758e-05, "loss": 0.0297, "step": 6432 }, { "epoch": 0.84988605211877, "grad_norm": 0.17692437767982483, "learning_rate": 1.0939949947101646e-05, "loss": 0.0095, "step": 6433 }, { "epoch": 0.850018165604254, "grad_norm": 0.12471568584442139, "learning_rate": 1.0921056278276031e-05, "loss": 0.0106, "step": 6434 }, { "epoch": 0.8501502790897381, "grad_norm": 0.1254560351371765, "learning_rate": 1.0902177996172392e-05, "loss": 0.0136, "step": 6435 }, { "epoch": 0.8502823925752221, "grad_norm": 0.0843840092420578, "learning_rate": 1.0883315104051617e-05, "loss": 0.0068, "step": 6436 }, { "epoch": 0.8504145060607061, "grad_norm": 0.14639145135879517, "learning_rate": 1.0864467605171912e-05, "loss": 0.0176, "step": 6437 }, { "epoch": 0.8505466195461902, "grad_norm": 0.204713374376297, "learning_rate": 1.0845635502788865e-05, "loss": 0.0178, "step": 6438 }, { "epoch": 0.8506787330316742, "grad_norm": 0.15403705835342407, "learning_rate": 1.0826818800155313e-05, "loss": 0.0244, "step": 6439 }, { "epoch": 0.8508108465171582, "grad_norm": 0.16191795468330383, "learning_rate": 1.0808017500521528e-05, "loss": 0.0118, "step": 6440 }, { "epoch": 0.8509429600026422, "grad_norm": 0.1546401083469391, "learning_rate": 1.0789231607135086e-05, "loss": 0.0149, "step": 6441 }, { "epoch": 0.8510750734881263, "grad_norm": 0.09436122328042984, "learning_rate": 1.0770461123240871e-05, "loss": 0.0087, "step": 6442 }, { "epoch": 0.8512071869736103, "grad_norm": 0.31698280572891235, "learning_rate": 1.0751706052081146e-05, "loss": 0.0299, "step": 6443 }, { "epoch": 0.8513393004590943, "grad_norm": 0.20286083221435547, "learning_rate": 1.0732966396895494e-05, "loss": 0.0232, "step": 6444 }, { "epoch": 0.8514714139445784, "grad_norm": 0.20891332626342773, "learning_rate": 1.0714242160920873e-05, "loss": 0.0158, "step": 6445 }, { "epoch": 0.8516035274300624, "grad_norm": 0.25830474495887756, "learning_rate": 1.0695533347391507e-05, "loss": 0.0451, "step": 6446 }, { "epoch": 0.8517356409155464, "grad_norm": 0.15606266260147095, "learning_rate": 1.0676839959538986e-05, "loss": 0.0165, "step": 6447 }, { "epoch": 0.8518677544010305, "grad_norm": 0.1563085913658142, "learning_rate": 1.0658162000592298e-05, "loss": 0.0202, "step": 6448 }, { "epoch": 0.8519998678865145, "grad_norm": 0.13570018112659454, "learning_rate": 1.0639499473777648e-05, "loss": 0.0083, "step": 6449 }, { "epoch": 0.8521319813719985, "grad_norm": 0.11977182328701019, "learning_rate": 1.0620852382318669e-05, "loss": 0.0142, "step": 6450 }, { "epoch": 0.8522640948574826, "grad_norm": 0.15313532948493958, "learning_rate": 1.0602220729436297e-05, "loss": 0.0192, "step": 6451 }, { "epoch": 0.8523962083429666, "grad_norm": 0.12061281502246857, "learning_rate": 1.0583604518348821e-05, "loss": 0.0162, "step": 6452 }, { "epoch": 0.8525283218284506, "grad_norm": 0.1367337554693222, "learning_rate": 1.0565003752271796e-05, "loss": 0.0148, "step": 6453 }, { "epoch": 0.8526604353139347, "grad_norm": 0.14763393998146057, "learning_rate": 1.0546418434418181e-05, "loss": 0.0186, "step": 6454 }, { "epoch": 0.8527925487994187, "grad_norm": 0.19914959371089935, "learning_rate": 1.0527848567998266e-05, "loss": 0.014, "step": 6455 }, { "epoch": 0.8529246622849027, "grad_norm": 0.2014371007680893, "learning_rate": 1.0509294156219608e-05, "loss": 0.0295, "step": 6456 }, { "epoch": 0.8530567757703867, "grad_norm": 0.1509523242712021, "learning_rate": 1.049075520228715e-05, "loss": 0.017, "step": 6457 }, { "epoch": 0.8531888892558708, "grad_norm": 0.17880138754844666, "learning_rate": 1.0472231709403157e-05, "loss": 0.0091, "step": 6458 }, { "epoch": 0.8533210027413548, "grad_norm": 0.09584607183933258, "learning_rate": 1.0453723680767225e-05, "loss": 0.0125, "step": 6459 }, { "epoch": 0.8534531162268388, "grad_norm": 0.20094123482704163, "learning_rate": 1.0435231119576239e-05, "loss": 0.0164, "step": 6460 }, { "epoch": 0.8535852297123229, "grad_norm": 0.1413847953081131, "learning_rate": 1.0416754029024467e-05, "loss": 0.014, "step": 6461 }, { "epoch": 0.8537173431978069, "grad_norm": 0.14215205609798431, "learning_rate": 1.0398292412303478e-05, "loss": 0.0179, "step": 6462 }, { "epoch": 0.8538494566832909, "grad_norm": 0.1754753738641739, "learning_rate": 1.0379846272602156e-05, "loss": 0.0107, "step": 6463 }, { "epoch": 0.853981570168775, "grad_norm": 0.14488495886325836, "learning_rate": 1.036141561310674e-05, "loss": 0.0108, "step": 6464 }, { "epoch": 0.854113683654259, "grad_norm": 0.15773916244506836, "learning_rate": 1.0343000437000783e-05, "loss": 0.0137, "step": 6465 }, { "epoch": 0.854245797139743, "grad_norm": 0.16901792585849762, "learning_rate": 1.0324600747465174e-05, "loss": 0.0159, "step": 6466 }, { "epoch": 0.854377910625227, "grad_norm": 0.1686817854642868, "learning_rate": 1.0306216547678082e-05, "loss": 0.0144, "step": 6467 }, { "epoch": 0.8545100241107111, "grad_norm": 0.1480901539325714, "learning_rate": 1.0287847840815046e-05, "loss": 0.022, "step": 6468 }, { "epoch": 0.8546421375961951, "grad_norm": 0.1318725347518921, "learning_rate": 1.0269494630048948e-05, "loss": 0.0152, "step": 6469 }, { "epoch": 0.8547742510816791, "grad_norm": 0.10686702281236649, "learning_rate": 1.025115691854992e-05, "loss": 0.0097, "step": 6470 }, { "epoch": 0.8549063645671632, "grad_norm": 0.15249253809452057, "learning_rate": 1.0232834709485472e-05, "loss": 0.0153, "step": 6471 }, { "epoch": 0.8550384780526472, "grad_norm": 0.10670942068099976, "learning_rate": 1.0214528006020429e-05, "loss": 0.0148, "step": 6472 }, { "epoch": 0.8551705915381312, "grad_norm": 0.16914334893226624, "learning_rate": 1.019623681131695e-05, "loss": 0.0242, "step": 6473 }, { "epoch": 0.8553027050236153, "grad_norm": 0.13639064133167267, "learning_rate": 1.0177961128534453e-05, "loss": 0.0144, "step": 6474 }, { "epoch": 0.8554348185090993, "grad_norm": 0.1852792203426361, "learning_rate": 1.0159700960829744e-05, "loss": 0.0135, "step": 6475 }, { "epoch": 0.8555669319945833, "grad_norm": 0.15428900718688965, "learning_rate": 1.0141456311356945e-05, "loss": 0.0148, "step": 6476 }, { "epoch": 0.8556990454800674, "grad_norm": 0.11808072775602341, "learning_rate": 1.0123227183267437e-05, "loss": 0.0152, "step": 6477 }, { "epoch": 0.8558311589655514, "grad_norm": 0.11304613947868347, "learning_rate": 1.0105013579709987e-05, "loss": 0.0093, "step": 6478 }, { "epoch": 0.8559632724510354, "grad_norm": 0.1460811197757721, "learning_rate": 1.008681550383065e-05, "loss": 0.0137, "step": 6479 }, { "epoch": 0.8560953859365195, "grad_norm": 0.15243810415267944, "learning_rate": 1.0068632958772829e-05, "loss": 0.0169, "step": 6480 }, { "epoch": 0.8562274994220035, "grad_norm": 0.17395997047424316, "learning_rate": 1.0050465947677167e-05, "loss": 0.0165, "step": 6481 }, { "epoch": 0.8563596129074875, "grad_norm": 0.14886704087257385, "learning_rate": 1.0032314473681692e-05, "loss": 0.0143, "step": 6482 }, { "epoch": 0.8564917263929716, "grad_norm": 0.1678609549999237, "learning_rate": 1.0014178539921782e-05, "loss": 0.019, "step": 6483 }, { "epoch": 0.8566238398784556, "grad_norm": 0.13445799052715302, "learning_rate": 9.996058149530008e-06, "loss": 0.0108, "step": 6484 }, { "epoch": 0.8567559533639396, "grad_norm": 0.09424842149019241, "learning_rate": 9.977953305636335e-06, "loss": 0.0099, "step": 6485 }, { "epoch": 0.8568880668494236, "grad_norm": 0.22159160673618317, "learning_rate": 9.959864011368115e-06, "loss": 0.0179, "step": 6486 }, { "epoch": 0.8570201803349077, "grad_norm": 0.17116421461105347, "learning_rate": 9.941790269849848e-06, "loss": 0.0147, "step": 6487 }, { "epoch": 0.8571522938203917, "grad_norm": 0.1429840326309204, "learning_rate": 9.923732084203475e-06, "loss": 0.0106, "step": 6488 }, { "epoch": 0.8572844073058757, "grad_norm": 0.17208239436149597, "learning_rate": 9.905689457548207e-06, "loss": 0.0227, "step": 6489 }, { "epoch": 0.8574165207913598, "grad_norm": 0.18521353602409363, "learning_rate": 9.887662393000585e-06, "loss": 0.0181, "step": 6490 }, { "epoch": 0.8575486342768438, "grad_norm": 0.21980801224708557, "learning_rate": 9.86965089367441e-06, "loss": 0.0096, "step": 6491 }, { "epoch": 0.8576807477623278, "grad_norm": 0.16842244565486908, "learning_rate": 9.851654962680856e-06, "loss": 0.012, "step": 6492 }, { "epoch": 0.8578128612478119, "grad_norm": 0.12133488804101944, "learning_rate": 9.833674603128395e-06, "loss": 0.0128, "step": 6493 }, { "epoch": 0.8579449747332959, "grad_norm": 0.13182833790779114, "learning_rate": 9.815709818122753e-06, "loss": 0.0104, "step": 6494 }, { "epoch": 0.8580770882187799, "grad_norm": 0.1352997124195099, "learning_rate": 9.797760610767049e-06, "loss": 0.0132, "step": 6495 }, { "epoch": 0.858209201704264, "grad_norm": 0.1745452880859375, "learning_rate": 9.779826984161666e-06, "loss": 0.0174, "step": 6496 }, { "epoch": 0.858341315189748, "grad_norm": 0.1221051961183548, "learning_rate": 9.761908941404319e-06, "loss": 0.0153, "step": 6497 }, { "epoch": 0.858473428675232, "grad_norm": 0.191941037774086, "learning_rate": 9.744006485589974e-06, "loss": 0.018, "step": 6498 }, { "epoch": 0.858605542160716, "grad_norm": 0.1384330838918686, "learning_rate": 9.726119619810969e-06, "loss": 0.0136, "step": 6499 }, { "epoch": 0.8587376556462001, "grad_norm": 0.19022780656814575, "learning_rate": 9.708248347156946e-06, "loss": 0.016, "step": 6500 }, { "epoch": 0.8588697691316841, "grad_norm": 0.16361016035079956, "learning_rate": 9.690392670714787e-06, "loss": 0.0149, "step": 6501 }, { "epoch": 0.8590018826171681, "grad_norm": 0.14385753870010376, "learning_rate": 9.672552593568751e-06, "loss": 0.0088, "step": 6502 }, { "epoch": 0.8591339961026522, "grad_norm": 0.1275266408920288, "learning_rate": 9.654728118800383e-06, "loss": 0.01, "step": 6503 }, { "epoch": 0.8592661095881362, "grad_norm": 0.18232011795043945, "learning_rate": 9.636919249488541e-06, "loss": 0.0139, "step": 6504 }, { "epoch": 0.8593982230736202, "grad_norm": 0.1661888062953949, "learning_rate": 9.619125988709332e-06, "loss": 0.0137, "step": 6505 }, { "epoch": 0.8595303365591043, "grad_norm": 0.18684124946594238, "learning_rate": 9.601348339536232e-06, "loss": 0.0142, "step": 6506 }, { "epoch": 0.8596624500445883, "grad_norm": 0.1515057533979416, "learning_rate": 9.583586305040016e-06, "loss": 0.0094, "step": 6507 }, { "epoch": 0.8597945635300723, "grad_norm": 0.12896044552326202, "learning_rate": 9.56583988828872e-06, "loss": 0.0169, "step": 6508 }, { "epoch": 0.8599266770155564, "grad_norm": 0.15606920421123505, "learning_rate": 9.548109092347702e-06, "loss": 0.0205, "step": 6509 }, { "epoch": 0.8600587905010404, "grad_norm": 0.09972091764211655, "learning_rate": 9.530393920279624e-06, "loss": 0.0102, "step": 6510 }, { "epoch": 0.8601909039865244, "grad_norm": 0.18018166720867157, "learning_rate": 9.512694375144494e-06, "loss": 0.017, "step": 6511 }, { "epoch": 0.8603230174720085, "grad_norm": 0.2273482084274292, "learning_rate": 9.49501045999952e-06, "loss": 0.0141, "step": 6512 }, { "epoch": 0.8604551309574925, "grad_norm": 0.19603601098060608, "learning_rate": 9.477342177899296e-06, "loss": 0.0202, "step": 6513 }, { "epoch": 0.8605872444429765, "grad_norm": 0.0983457863330841, "learning_rate": 9.4596895318957e-06, "loss": 0.0124, "step": 6514 }, { "epoch": 0.8607193579284605, "grad_norm": 0.23922207951545715, "learning_rate": 9.44205252503787e-06, "loss": 0.0164, "step": 6515 }, { "epoch": 0.8608514714139446, "grad_norm": 0.2205938994884491, "learning_rate": 9.42443116037227e-06, "loss": 0.0194, "step": 6516 }, { "epoch": 0.8609835848994286, "grad_norm": 0.17030929028987885, "learning_rate": 9.406825440942678e-06, "loss": 0.0208, "step": 6517 }, { "epoch": 0.8611156983849126, "grad_norm": 0.26909691095352173, "learning_rate": 9.389235369790162e-06, "loss": 0.0169, "step": 6518 }, { "epoch": 0.8612478118703967, "grad_norm": 0.24096190929412842, "learning_rate": 9.371660949953043e-06, "loss": 0.0194, "step": 6519 }, { "epoch": 0.8613799253558807, "grad_norm": 0.21997550129890442, "learning_rate": 9.354102184466984e-06, "loss": 0.0173, "step": 6520 }, { "epoch": 0.8615120388413647, "grad_norm": 0.20401428639888763, "learning_rate": 9.33655907636497e-06, "loss": 0.016, "step": 6521 }, { "epoch": 0.8616441523268488, "grad_norm": 0.14194954931735992, "learning_rate": 9.3190316286772e-06, "loss": 0.0216, "step": 6522 }, { "epoch": 0.8617762658123328, "grad_norm": 0.1661507934331894, "learning_rate": 9.301519844431217e-06, "loss": 0.0245, "step": 6523 }, { "epoch": 0.8619083792978168, "grad_norm": 0.11384832113981247, "learning_rate": 9.28402372665188e-06, "loss": 0.0102, "step": 6524 }, { "epoch": 0.8620404927833009, "grad_norm": 0.12378716468811035, "learning_rate": 9.266543278361318e-06, "loss": 0.0124, "step": 6525 }, { "epoch": 0.8621726062687849, "grad_norm": 0.2553863823413849, "learning_rate": 9.249078502578913e-06, "loss": 0.0125, "step": 6526 }, { "epoch": 0.8623047197542689, "grad_norm": 0.16368991136550903, "learning_rate": 9.231629402321406e-06, "loss": 0.0205, "step": 6527 }, { "epoch": 0.862436833239753, "grad_norm": 0.2544208765029907, "learning_rate": 9.214195980602813e-06, "loss": 0.0244, "step": 6528 }, { "epoch": 0.862568946725237, "grad_norm": 0.1652582883834839, "learning_rate": 9.196778240434401e-06, "loss": 0.0096, "step": 6529 }, { "epoch": 0.862701060210721, "grad_norm": 0.18706029653549194, "learning_rate": 9.179376184824785e-06, "loss": 0.0118, "step": 6530 }, { "epoch": 0.862833173696205, "grad_norm": 0.12401444464921951, "learning_rate": 9.161989816779825e-06, "loss": 0.0128, "step": 6531 }, { "epoch": 0.8629652871816891, "grad_norm": 0.1702900379896164, "learning_rate": 9.144619139302712e-06, "loss": 0.0218, "step": 6532 }, { "epoch": 0.8630974006671731, "grad_norm": 0.16278304159641266, "learning_rate": 9.12726415539389e-06, "loss": 0.0271, "step": 6533 }, { "epoch": 0.8632295141526571, "grad_norm": 0.14746056497097015, "learning_rate": 9.109924868051112e-06, "loss": 0.0119, "step": 6534 }, { "epoch": 0.8633616276381412, "grad_norm": 0.20100457966327667, "learning_rate": 9.09260128026943e-06, "loss": 0.027, "step": 6535 }, { "epoch": 0.8634937411236252, "grad_norm": 0.16931913793087006, "learning_rate": 9.07529339504114e-06, "loss": 0.0189, "step": 6536 }, { "epoch": 0.8636258546091092, "grad_norm": 0.16593056917190552, "learning_rate": 9.058001215355872e-06, "loss": 0.0229, "step": 6537 }, { "epoch": 0.8637579680945933, "grad_norm": 0.2710971534252167, "learning_rate": 9.040724744200524e-06, "loss": 0.0207, "step": 6538 }, { "epoch": 0.8638900815800773, "grad_norm": 0.13531526923179626, "learning_rate": 9.02346398455931e-06, "loss": 0.0135, "step": 6539 }, { "epoch": 0.8640221950655613, "grad_norm": 0.1501166820526123, "learning_rate": 9.006218939413658e-06, "loss": 0.0159, "step": 6540 }, { "epoch": 0.8641543085510454, "grad_norm": 0.16745105385780334, "learning_rate": 8.98898961174235e-06, "loss": 0.0154, "step": 6541 }, { "epoch": 0.8642864220365294, "grad_norm": 0.20952169597148895, "learning_rate": 8.971776004521449e-06, "loss": 0.0147, "step": 6542 }, { "epoch": 0.8644185355220134, "grad_norm": 0.17372874915599823, "learning_rate": 8.954578120724244e-06, "loss": 0.0174, "step": 6543 }, { "epoch": 0.8645506490074975, "grad_norm": 0.193229541182518, "learning_rate": 8.937395963321338e-06, "loss": 0.0183, "step": 6544 }, { "epoch": 0.8646827624929815, "grad_norm": 0.1361188441514969, "learning_rate": 8.920229535280712e-06, "loss": 0.0135, "step": 6545 }, { "epoch": 0.8648148759784655, "grad_norm": 0.16256161034107208, "learning_rate": 8.903078839567457e-06, "loss": 0.0131, "step": 6546 }, { "epoch": 0.8649469894639495, "grad_norm": 0.1579897105693817, "learning_rate": 8.885943879144076e-06, "loss": 0.0241, "step": 6547 }, { "epoch": 0.8650791029494336, "grad_norm": 0.14809980988502502, "learning_rate": 8.868824656970332e-06, "loss": 0.011, "step": 6548 }, { "epoch": 0.8652112164349176, "grad_norm": 0.17065958678722382, "learning_rate": 8.851721176003192e-06, "loss": 0.0158, "step": 6549 }, { "epoch": 0.8653433299204016, "grad_norm": 0.13681527972221375, "learning_rate": 8.834633439197004e-06, "loss": 0.0125, "step": 6550 }, { "epoch": 0.8654754434058857, "grad_norm": 0.13763427734375, "learning_rate": 8.817561449503343e-06, "loss": 0.0106, "step": 6551 }, { "epoch": 0.8656075568913697, "grad_norm": 0.10495647042989731, "learning_rate": 8.80050520987109e-06, "loss": 0.0106, "step": 6552 }, { "epoch": 0.8657396703768537, "grad_norm": 0.11058825254440308, "learning_rate": 8.783464723246371e-06, "loss": 0.0094, "step": 6553 }, { "epoch": 0.8658717838623378, "grad_norm": 0.2615870237350464, "learning_rate": 8.766439992572618e-06, "loss": 0.0122, "step": 6554 }, { "epoch": 0.8660038973478218, "grad_norm": 0.17798537015914917, "learning_rate": 8.749431020790555e-06, "loss": 0.0162, "step": 6555 }, { "epoch": 0.8661360108333058, "grad_norm": 0.2039807289838791, "learning_rate": 8.732437810838124e-06, "loss": 0.0294, "step": 6556 }, { "epoch": 0.8662681243187899, "grad_norm": 0.11167783290147781, "learning_rate": 8.715460365650607e-06, "loss": 0.0065, "step": 6557 }, { "epoch": 0.8664002378042739, "grad_norm": 0.18214501440525055, "learning_rate": 8.69849868816055e-06, "loss": 0.0084, "step": 6558 }, { "epoch": 0.8665323512897579, "grad_norm": 0.17260056734085083, "learning_rate": 8.681552781297763e-06, "loss": 0.0162, "step": 6559 }, { "epoch": 0.866664464775242, "grad_norm": 0.16543985903263092, "learning_rate": 8.664622647989317e-06, "loss": 0.0102, "step": 6560 }, { "epoch": 0.866796578260726, "grad_norm": 0.23392952978610992, "learning_rate": 8.647708291159583e-06, "loss": 0.021, "step": 6561 }, { "epoch": 0.86692869174621, "grad_norm": 0.11497484147548676, "learning_rate": 8.630809713730226e-06, "loss": 0.0075, "step": 6562 }, { "epoch": 0.867060805231694, "grad_norm": 0.1542213410139084, "learning_rate": 8.613926918620108e-06, "loss": 0.0125, "step": 6563 }, { "epoch": 0.8671929187171781, "grad_norm": 0.16790266335010529, "learning_rate": 8.597059908745453e-06, "loss": 0.0198, "step": 6564 }, { "epoch": 0.8673250322026621, "grad_norm": 0.10018698126077652, "learning_rate": 8.580208687019709e-06, "loss": 0.0068, "step": 6565 }, { "epoch": 0.8674571456881461, "grad_norm": 0.13718554377555847, "learning_rate": 8.563373256353635e-06, "loss": 0.0105, "step": 6566 }, { "epoch": 0.8675892591736302, "grad_norm": 0.1705043762922287, "learning_rate": 8.546553619655196e-06, "loss": 0.017, "step": 6567 }, { "epoch": 0.8677213726591142, "grad_norm": 0.11266963928937912, "learning_rate": 8.529749779829688e-06, "loss": 0.0115, "step": 6568 }, { "epoch": 0.8678534861445982, "grad_norm": 0.1814977079629898, "learning_rate": 8.512961739779678e-06, "loss": 0.0192, "step": 6569 }, { "epoch": 0.8679855996300823, "grad_norm": 0.2693195343017578, "learning_rate": 8.49618950240495e-06, "loss": 0.0232, "step": 6570 }, { "epoch": 0.8681177131155663, "grad_norm": 0.14247596263885498, "learning_rate": 8.479433070602616e-06, "loss": 0.0167, "step": 6571 }, { "epoch": 0.8682498266010503, "grad_norm": 0.14830470085144043, "learning_rate": 8.46269244726704e-06, "loss": 0.0107, "step": 6572 }, { "epoch": 0.8683819400865344, "grad_norm": 0.14798453450202942, "learning_rate": 8.445967635289854e-06, "loss": 0.0204, "step": 6573 }, { "epoch": 0.8685140535720184, "grad_norm": 0.24593189358711243, "learning_rate": 8.429258637559933e-06, "loss": 0.0119, "step": 6574 }, { "epoch": 0.8686461670575024, "grad_norm": 0.15432271361351013, "learning_rate": 8.41256545696346e-06, "loss": 0.0092, "step": 6575 }, { "epoch": 0.8687782805429864, "grad_norm": 0.12995439767837524, "learning_rate": 8.395888096383897e-06, "loss": 0.0172, "step": 6576 }, { "epoch": 0.8689103940284705, "grad_norm": 0.10527817904949188, "learning_rate": 8.37922655870189e-06, "loss": 0.0091, "step": 6577 }, { "epoch": 0.8690425075139545, "grad_norm": 0.16179177165031433, "learning_rate": 8.362580846795443e-06, "loss": 0.0131, "step": 6578 }, { "epoch": 0.8691746209994385, "grad_norm": 0.27898624539375305, "learning_rate": 8.345950963539772e-06, "loss": 0.0204, "step": 6579 }, { "epoch": 0.8693067344849226, "grad_norm": 0.156561017036438, "learning_rate": 8.329336911807417e-06, "loss": 0.0145, "step": 6580 }, { "epoch": 0.8694388479704066, "grad_norm": 0.19675347208976746, "learning_rate": 8.312738694468103e-06, "loss": 0.0241, "step": 6581 }, { "epoch": 0.8695709614558906, "grad_norm": 0.1354101300239563, "learning_rate": 8.29615631438888e-06, "loss": 0.014, "step": 6582 }, { "epoch": 0.8697030749413747, "grad_norm": 0.11665624380111694, "learning_rate": 8.27958977443406e-06, "loss": 0.0136, "step": 6583 }, { "epoch": 0.8698351884268587, "grad_norm": 0.15645678341388702, "learning_rate": 8.263039077465163e-06, "loss": 0.0188, "step": 6584 }, { "epoch": 0.8699673019123427, "grad_norm": 0.19669926166534424, "learning_rate": 8.246504226341035e-06, "loss": 0.0206, "step": 6585 }, { "epoch": 0.8700994153978268, "grad_norm": 0.09903514385223389, "learning_rate": 8.229985223917757e-06, "loss": 0.009, "step": 6586 }, { "epoch": 0.8702315288833108, "grad_norm": 0.35547196865081787, "learning_rate": 8.213482073048707e-06, "loss": 0.0254, "step": 6587 }, { "epoch": 0.8703636423687948, "grad_norm": 0.22408826649188995, "learning_rate": 8.196994776584455e-06, "loss": 0.0136, "step": 6588 }, { "epoch": 0.8704957558542789, "grad_norm": 0.16860933601856232, "learning_rate": 8.180523337372881e-06, "loss": 0.0177, "step": 6589 }, { "epoch": 0.8706278693397629, "grad_norm": 0.14488452672958374, "learning_rate": 8.164067758259153e-06, "loss": 0.0146, "step": 6590 }, { "epoch": 0.8707599828252469, "grad_norm": 0.1677248477935791, "learning_rate": 8.14762804208562e-06, "loss": 0.0119, "step": 6591 }, { "epoch": 0.870892096310731, "grad_norm": 0.13122430443763733, "learning_rate": 8.131204191691954e-06, "loss": 0.0136, "step": 6592 }, { "epoch": 0.871024209796215, "grad_norm": 0.23603014647960663, "learning_rate": 8.114796209915066e-06, "loss": 0.0074, "step": 6593 }, { "epoch": 0.871156323281699, "grad_norm": 0.10830773413181305, "learning_rate": 8.098404099589141e-06, "loss": 0.0148, "step": 6594 }, { "epoch": 0.871288436767183, "grad_norm": 0.12386782467365265, "learning_rate": 8.082027863545594e-06, "loss": 0.0145, "step": 6595 }, { "epoch": 0.8714205502526671, "grad_norm": 0.0879635289311409, "learning_rate": 8.065667504613107e-06, "loss": 0.0085, "step": 6596 }, { "epoch": 0.8715526637381511, "grad_norm": 0.13344678282737732, "learning_rate": 8.049323025617662e-06, "loss": 0.0155, "step": 6597 }, { "epoch": 0.8716847772236351, "grad_norm": 0.1500246524810791, "learning_rate": 8.032994429382412e-06, "loss": 0.0184, "step": 6598 }, { "epoch": 0.8718168907091192, "grad_norm": 0.16355468332767487, "learning_rate": 8.016681718727848e-06, "loss": 0.0156, "step": 6599 }, { "epoch": 0.8719490041946032, "grad_norm": 0.17966286838054657, "learning_rate": 8.00038489647168e-06, "loss": 0.0198, "step": 6600 }, { "epoch": 0.8720811176800872, "grad_norm": 0.18434107303619385, "learning_rate": 7.984103965428902e-06, "loss": 0.0268, "step": 6601 }, { "epoch": 0.8722132311655713, "grad_norm": 0.15422309935092926, "learning_rate": 7.967838928411698e-06, "loss": 0.0159, "step": 6602 }, { "epoch": 0.8723453446510553, "grad_norm": 0.11695985496044159, "learning_rate": 7.951589788229542e-06, "loss": 0.0225, "step": 6603 }, { "epoch": 0.8724774581365393, "grad_norm": 0.11366904526948929, "learning_rate": 7.935356547689244e-06, "loss": 0.0086, "step": 6604 }, { "epoch": 0.8726095716220234, "grad_norm": 0.08670374006032944, "learning_rate": 7.919139209594717e-06, "loss": 0.0085, "step": 6605 }, { "epoch": 0.8727416851075074, "grad_norm": 0.1729203164577484, "learning_rate": 7.902937776747232e-06, "loss": 0.013, "step": 6606 }, { "epoch": 0.8728737985929914, "grad_norm": 0.12578381597995758, "learning_rate": 7.886752251945306e-06, "loss": 0.0067, "step": 6607 }, { "epoch": 0.8730059120784754, "grad_norm": 0.11218776553869247, "learning_rate": 7.870582637984636e-06, "loss": 0.0127, "step": 6608 }, { "epoch": 0.8731380255639595, "grad_norm": 0.2189989537000656, "learning_rate": 7.854428937658253e-06, "loss": 0.0173, "step": 6609 }, { "epoch": 0.8732701390494435, "grad_norm": 0.15799090266227722, "learning_rate": 7.838291153756395e-06, "loss": 0.0095, "step": 6610 }, { "epoch": 0.8734022525349275, "grad_norm": 0.1464925855398178, "learning_rate": 7.822169289066583e-06, "loss": 0.0181, "step": 6611 }, { "epoch": 0.8735343660204116, "grad_norm": 0.17121390998363495, "learning_rate": 7.80606334637355e-06, "loss": 0.0152, "step": 6612 }, { "epoch": 0.8736664795058956, "grad_norm": 0.13004009425640106, "learning_rate": 7.789973328459288e-06, "loss": 0.0174, "step": 6613 }, { "epoch": 0.8737985929913796, "grad_norm": 0.08923876285552979, "learning_rate": 7.773899238103066e-06, "loss": 0.011, "step": 6614 }, { "epoch": 0.8739307064768637, "grad_norm": 0.21297034621238708, "learning_rate": 7.757841078081373e-06, "loss": 0.0254, "step": 6615 }, { "epoch": 0.8740628199623477, "grad_norm": 0.11785417795181274, "learning_rate": 7.741798851167947e-06, "loss": 0.0124, "step": 6616 }, { "epoch": 0.8741949334478317, "grad_norm": 0.2768336534500122, "learning_rate": 7.725772560133792e-06, "loss": 0.0245, "step": 6617 }, { "epoch": 0.8743270469333158, "grad_norm": 0.15279079973697662, "learning_rate": 7.709762207747173e-06, "loss": 0.0181, "step": 6618 }, { "epoch": 0.8744591604187998, "grad_norm": 0.15320299565792084, "learning_rate": 7.693767796773543e-06, "loss": 0.0192, "step": 6619 }, { "epoch": 0.8745912739042838, "grad_norm": 0.1632274091243744, "learning_rate": 7.677789329975648e-06, "loss": 0.0165, "step": 6620 }, { "epoch": 0.8747233873897678, "grad_norm": 0.16090667247772217, "learning_rate": 7.661826810113493e-06, "loss": 0.0141, "step": 6621 }, { "epoch": 0.8748555008752519, "grad_norm": 0.16104121506214142, "learning_rate": 7.645880239944259e-06, "loss": 0.0167, "step": 6622 }, { "epoch": 0.8749876143607359, "grad_norm": 0.27421918511390686, "learning_rate": 7.629949622222443e-06, "loss": 0.0249, "step": 6623 }, { "epoch": 0.8751197278462199, "grad_norm": 0.17468570172786713, "learning_rate": 7.6140349596997675e-06, "loss": 0.0156, "step": 6624 }, { "epoch": 0.875251841331704, "grad_norm": 0.11296243220567703, "learning_rate": 7.59813625512521e-06, "loss": 0.0139, "step": 6625 }, { "epoch": 0.875383954817188, "grad_norm": 0.1640525609254837, "learning_rate": 7.58225351124493e-06, "loss": 0.0235, "step": 6626 }, { "epoch": 0.875516068302672, "grad_norm": 0.09715352207422256, "learning_rate": 7.566386730802388e-06, "loss": 0.0069, "step": 6627 }, { "epoch": 0.8756481817881561, "grad_norm": 0.136221244931221, "learning_rate": 7.550535916538304e-06, "loss": 0.0143, "step": 6628 }, { "epoch": 0.8757802952736401, "grad_norm": 0.17743642628192902, "learning_rate": 7.534701071190575e-06, "loss": 0.0116, "step": 6629 }, { "epoch": 0.8759124087591241, "grad_norm": 0.21691836416721344, "learning_rate": 7.518882197494382e-06, "loss": 0.0194, "step": 6630 }, { "epoch": 0.8760445222446082, "grad_norm": 0.1735524982213974, "learning_rate": 7.503079298182147e-06, "loss": 0.0136, "step": 6631 }, { "epoch": 0.8761766357300922, "grad_norm": 0.2355421930551529, "learning_rate": 7.487292375983545e-06, "loss": 0.018, "step": 6632 }, { "epoch": 0.8763087492155762, "grad_norm": 0.21247804164886475, "learning_rate": 7.471521433625428e-06, "loss": 0.0246, "step": 6633 }, { "epoch": 0.8764408627010603, "grad_norm": 0.15796038508415222, "learning_rate": 7.455766473831949e-06, "loss": 0.0128, "step": 6634 }, { "epoch": 0.8765729761865443, "grad_norm": 0.16143396496772766, "learning_rate": 7.440027499324509e-06, "loss": 0.0141, "step": 6635 }, { "epoch": 0.8767050896720283, "grad_norm": 0.2800748646259308, "learning_rate": 7.424304512821678e-06, "loss": 0.0109, "step": 6636 }, { "epoch": 0.8768372031575123, "grad_norm": 0.16707810759544373, "learning_rate": 7.4085975170393395e-06, "loss": 0.0178, "step": 6637 }, { "epoch": 0.8769693166429964, "grad_norm": 0.1927337348461151, "learning_rate": 7.392906514690567e-06, "loss": 0.0177, "step": 6638 }, { "epoch": 0.8771014301284804, "grad_norm": 0.10031986981630325, "learning_rate": 7.377231508485705e-06, "loss": 0.0071, "step": 6639 }, { "epoch": 0.8772335436139644, "grad_norm": 0.15559269487857819, "learning_rate": 7.3615725011322964e-06, "loss": 0.0198, "step": 6640 }, { "epoch": 0.8773656570994485, "grad_norm": 0.142380028963089, "learning_rate": 7.345929495335158e-06, "loss": 0.0195, "step": 6641 }, { "epoch": 0.8774977705849325, "grad_norm": 0.21019425988197327, "learning_rate": 7.330302493796326e-06, "loss": 0.0195, "step": 6642 }, { "epoch": 0.8776298840704165, "grad_norm": 0.09215234220027924, "learning_rate": 7.314691499215054e-06, "loss": 0.0076, "step": 6643 }, { "epoch": 0.8777619975559006, "grad_norm": 0.13140788674354553, "learning_rate": 7.299096514287862e-06, "loss": 0.009, "step": 6644 }, { "epoch": 0.8778941110413846, "grad_norm": 0.2494693100452423, "learning_rate": 7.2835175417084954e-06, "loss": 0.0176, "step": 6645 }, { "epoch": 0.8780262245268686, "grad_norm": 0.21612705290317535, "learning_rate": 7.2679545841679464e-06, "loss": 0.0178, "step": 6646 }, { "epoch": 0.8781583380123527, "grad_norm": 0.20610679686069489, "learning_rate": 7.252407644354397e-06, "loss": 0.0235, "step": 6647 }, { "epoch": 0.8782904514978367, "grad_norm": 0.19679805636405945, "learning_rate": 7.23687672495329e-06, "loss": 0.0362, "step": 6648 }, { "epoch": 0.8784225649833207, "grad_norm": 0.12437400966882706, "learning_rate": 7.221361828647333e-06, "loss": 0.0109, "step": 6649 }, { "epoch": 0.8785546784688048, "grad_norm": 0.11433594673871994, "learning_rate": 7.205862958116394e-06, "loss": 0.0098, "step": 6650 }, { "epoch": 0.8786867919542888, "grad_norm": 0.1080607920885086, "learning_rate": 7.190380116037631e-06, "loss": 0.0108, "step": 6651 }, { "epoch": 0.8788189054397728, "grad_norm": 0.14211799204349518, "learning_rate": 7.174913305085406e-06, "loss": 0.0149, "step": 6652 }, { "epoch": 0.8789510189252568, "grad_norm": 0.22237631678581238, "learning_rate": 7.15946252793136e-06, "loss": 0.0226, "step": 6653 }, { "epoch": 0.8790831324107409, "grad_norm": 0.18012912571430206, "learning_rate": 7.144027787244289e-06, "loss": 0.0103, "step": 6654 }, { "epoch": 0.8792152458962249, "grad_norm": 0.11783156543970108, "learning_rate": 7.128609085690252e-06, "loss": 0.0132, "step": 6655 }, { "epoch": 0.8793473593817089, "grad_norm": 0.12326275557279587, "learning_rate": 7.113206425932573e-06, "loss": 0.0118, "step": 6656 }, { "epoch": 0.879479472867193, "grad_norm": 0.16605113446712494, "learning_rate": 7.097819810631734e-06, "loss": 0.0138, "step": 6657 }, { "epoch": 0.879611586352677, "grad_norm": 0.13420824706554413, "learning_rate": 7.08244924244551e-06, "loss": 0.0188, "step": 6658 }, { "epoch": 0.879743699838161, "grad_norm": 0.16448046267032623, "learning_rate": 7.0670947240288775e-06, "loss": 0.0241, "step": 6659 }, { "epoch": 0.8798758133236451, "grad_norm": 0.1652592271566391, "learning_rate": 7.051756258034048e-06, "loss": 0.0233, "step": 6660 }, { "epoch": 0.8800079268091291, "grad_norm": 0.14599083364009857, "learning_rate": 7.036433847110424e-06, "loss": 0.0175, "step": 6661 }, { "epoch": 0.8801400402946131, "grad_norm": 0.14731009304523468, "learning_rate": 7.021127493904711e-06, "loss": 0.014, "step": 6662 }, { "epoch": 0.8802721537800972, "grad_norm": 0.18450333178043365, "learning_rate": 7.005837201060761e-06, "loss": 0.0166, "step": 6663 }, { "epoch": 0.8804042672655812, "grad_norm": 0.17317119240760803, "learning_rate": 6.990562971219694e-06, "loss": 0.0244, "step": 6664 }, { "epoch": 0.8805363807510652, "grad_norm": 0.13952180743217468, "learning_rate": 6.9753048070198554e-06, "loss": 0.0158, "step": 6665 }, { "epoch": 0.8806684942365492, "grad_norm": 0.09438351541757584, "learning_rate": 6.9600627110968155e-06, "loss": 0.011, "step": 6666 }, { "epoch": 0.8808006077220333, "grad_norm": 0.1911882907152176, "learning_rate": 6.944836686083334e-06, "loss": 0.023, "step": 6667 }, { "epoch": 0.8809327212075173, "grad_norm": 0.12660865485668182, "learning_rate": 6.9296267346094405e-06, "loss": 0.0169, "step": 6668 }, { "epoch": 0.8810648346930013, "grad_norm": 0.15999475121498108, "learning_rate": 6.914432859302377e-06, "loss": 0.0099, "step": 6669 }, { "epoch": 0.8811969481784854, "grad_norm": 0.2235376089811325, "learning_rate": 6.89925506278658e-06, "loss": 0.0259, "step": 6670 }, { "epoch": 0.8813290616639694, "grad_norm": 0.10345666855573654, "learning_rate": 6.88409334768374e-06, "loss": 0.0071, "step": 6671 }, { "epoch": 0.8814611751494534, "grad_norm": 0.1981154978275299, "learning_rate": 6.868947716612762e-06, "loss": 0.0148, "step": 6672 }, { "epoch": 0.8815932886349375, "grad_norm": 0.2624906897544861, "learning_rate": 6.853818172189774e-06, "loss": 0.0123, "step": 6673 }, { "epoch": 0.8817254021204214, "grad_norm": 0.1436055451631546, "learning_rate": 6.838704717028111e-06, "loss": 0.0123, "step": 6674 }, { "epoch": 0.8818575156059054, "grad_norm": 0.11317655444145203, "learning_rate": 6.8236073537383485e-06, "loss": 0.0087, "step": 6675 }, { "epoch": 0.8819896290913894, "grad_norm": 0.19594664871692657, "learning_rate": 6.80852608492828e-06, "loss": 0.0171, "step": 6676 }, { "epoch": 0.8821217425768735, "grad_norm": 0.16084632277488708, "learning_rate": 6.7934609132028985e-06, "loss": 0.0132, "step": 6677 }, { "epoch": 0.8822538560623575, "grad_norm": 0.15218578279018402, "learning_rate": 6.778411841164423e-06, "loss": 0.0145, "step": 6678 }, { "epoch": 0.8823859695478415, "grad_norm": 0.130471333861351, "learning_rate": 6.763378871412318e-06, "loss": 0.0075, "step": 6679 }, { "epoch": 0.8825180830333256, "grad_norm": 0.1638917773962021, "learning_rate": 6.748362006543263e-06, "loss": 0.0169, "step": 6680 }, { "epoch": 0.8826501965188096, "grad_norm": 0.18149948120117188, "learning_rate": 6.733361249151104e-06, "loss": 0.0142, "step": 6681 }, { "epoch": 0.8827823100042936, "grad_norm": 0.3009966313838959, "learning_rate": 6.718376601826948e-06, "loss": 0.021, "step": 6682 }, { "epoch": 0.8829144234897777, "grad_norm": 0.12758490443229675, "learning_rate": 6.7034080671591446e-06, "loss": 0.0109, "step": 6683 }, { "epoch": 0.8830465369752617, "grad_norm": 0.0941629558801651, "learning_rate": 6.6884556477331936e-06, "loss": 0.0083, "step": 6684 }, { "epoch": 0.8831786504607457, "grad_norm": 0.1017313301563263, "learning_rate": 6.673519346131851e-06, "loss": 0.0078, "step": 6685 }, { "epoch": 0.8833107639462298, "grad_norm": 0.16601882874965668, "learning_rate": 6.658599164935097e-06, "loss": 0.0175, "step": 6686 }, { "epoch": 0.8834428774317138, "grad_norm": 0.1425500214099884, "learning_rate": 6.6436951067201155e-06, "loss": 0.0144, "step": 6687 }, { "epoch": 0.8835749909171978, "grad_norm": 0.21121959388256073, "learning_rate": 6.628807174061291e-06, "loss": 0.0183, "step": 6688 }, { "epoch": 0.8837071044026819, "grad_norm": 0.15938931703567505, "learning_rate": 6.613935369530233e-06, "loss": 0.0129, "step": 6689 }, { "epoch": 0.8838392178881659, "grad_norm": 0.11776001751422882, "learning_rate": 6.5990796956957865e-06, "loss": 0.0142, "step": 6690 }, { "epoch": 0.8839713313736499, "grad_norm": 0.12473277002573013, "learning_rate": 6.584240155123977e-06, "loss": 0.0069, "step": 6691 }, { "epoch": 0.884103444859134, "grad_norm": 0.14708156883716583, "learning_rate": 6.569416750378055e-06, "loss": 0.0218, "step": 6692 }, { "epoch": 0.884235558344618, "grad_norm": 0.11134511977434158, "learning_rate": 6.554609484018492e-06, "loss": 0.0072, "step": 6693 }, { "epoch": 0.884367671830102, "grad_norm": 0.13585522770881653, "learning_rate": 6.5398183586029786e-06, "loss": 0.0174, "step": 6694 }, { "epoch": 0.884499785315586, "grad_norm": 0.16885550320148468, "learning_rate": 6.52504337668638e-06, "loss": 0.0117, "step": 6695 }, { "epoch": 0.8846318988010701, "grad_norm": 0.15060658752918243, "learning_rate": 6.51028454082081e-06, "loss": 0.0249, "step": 6696 }, { "epoch": 0.8847640122865541, "grad_norm": 0.14600324630737305, "learning_rate": 6.495541853555609e-06, "loss": 0.0257, "step": 6697 }, { "epoch": 0.8848961257720381, "grad_norm": 0.12542365491390228, "learning_rate": 6.48081531743725e-06, "loss": 0.0152, "step": 6698 }, { "epoch": 0.8850282392575222, "grad_norm": 0.1343553513288498, "learning_rate": 6.466104935009487e-06, "loss": 0.0168, "step": 6699 }, { "epoch": 0.8851603527430062, "grad_norm": 0.1018458679318428, "learning_rate": 6.451410708813277e-06, "loss": 0.0088, "step": 6700 }, { "epoch": 0.8852924662284902, "grad_norm": 0.15615858137607574, "learning_rate": 6.436732641386778e-06, "loss": 0.0205, "step": 6701 }, { "epoch": 0.8854245797139743, "grad_norm": 0.1561017632484436, "learning_rate": 6.422070735265318e-06, "loss": 0.0136, "step": 6702 }, { "epoch": 0.8855566931994583, "grad_norm": 0.2412395179271698, "learning_rate": 6.4074249929814815e-06, "loss": 0.0205, "step": 6703 }, { "epoch": 0.8856888066849423, "grad_norm": 0.17810501158237457, "learning_rate": 6.392795417065078e-06, "loss": 0.0147, "step": 6704 }, { "epoch": 0.8858209201704264, "grad_norm": 0.16065442562103271, "learning_rate": 6.378182010043044e-06, "loss": 0.0131, "step": 6705 }, { "epoch": 0.8859530336559104, "grad_norm": 0.12913085520267487, "learning_rate": 6.363584774439601e-06, "loss": 0.0123, "step": 6706 }, { "epoch": 0.8860851471413944, "grad_norm": 0.3279435336589813, "learning_rate": 6.349003712776136e-06, "loss": 0.0271, "step": 6707 }, { "epoch": 0.8862172606268784, "grad_norm": 0.14449019730091095, "learning_rate": 6.3344388275712875e-06, "loss": 0.0116, "step": 6708 }, { "epoch": 0.8863493741123625, "grad_norm": 0.16083396971225739, "learning_rate": 6.31989012134081e-06, "loss": 0.014, "step": 6709 }, { "epoch": 0.8864814875978465, "grad_norm": 0.25596585869789124, "learning_rate": 6.305357596597761e-06, "loss": 0.0225, "step": 6710 }, { "epoch": 0.8866136010833305, "grad_norm": 0.18523181974887848, "learning_rate": 6.290841255852375e-06, "loss": 0.0165, "step": 6711 }, { "epoch": 0.8867457145688146, "grad_norm": 0.19872015714645386, "learning_rate": 6.2763411016120265e-06, "loss": 0.0179, "step": 6712 }, { "epoch": 0.8868778280542986, "grad_norm": 0.10085300356149673, "learning_rate": 6.261857136381388e-06, "loss": 0.0103, "step": 6713 }, { "epoch": 0.8870099415397826, "grad_norm": 0.13313262164592743, "learning_rate": 6.24738936266227e-06, "loss": 0.0076, "step": 6714 }, { "epoch": 0.8871420550252667, "grad_norm": 0.1402639001607895, "learning_rate": 6.232937782953752e-06, "loss": 0.0135, "step": 6715 }, { "epoch": 0.8872741685107507, "grad_norm": 0.14339350163936615, "learning_rate": 6.218502399752013e-06, "loss": 0.0112, "step": 6716 }, { "epoch": 0.8874062819962347, "grad_norm": 0.1772618293762207, "learning_rate": 6.204083215550538e-06, "loss": 0.0209, "step": 6717 }, { "epoch": 0.8875383954817188, "grad_norm": 0.18298009037971497, "learning_rate": 6.1896802328399675e-06, "loss": 0.0197, "step": 6718 }, { "epoch": 0.8876705089672028, "grad_norm": 0.1741136759519577, "learning_rate": 6.175293454108122e-06, "loss": 0.0159, "step": 6719 }, { "epoch": 0.8878026224526868, "grad_norm": 0.1649044305086136, "learning_rate": 6.1609228818400585e-06, "loss": 0.0106, "step": 6720 }, { "epoch": 0.8879347359381708, "grad_norm": 0.10933619737625122, "learning_rate": 6.146568518518059e-06, "loss": 0.0154, "step": 6721 }, { "epoch": 0.8880668494236549, "grad_norm": 0.09617764502763748, "learning_rate": 6.132230366621527e-06, "loss": 0.0132, "step": 6722 }, { "epoch": 0.8881989629091389, "grad_norm": 0.15637919306755066, "learning_rate": 6.117908428627139e-06, "loss": 0.0156, "step": 6723 }, { "epoch": 0.8883310763946229, "grad_norm": 0.18323490023612976, "learning_rate": 6.103602707008726e-06, "loss": 0.0091, "step": 6724 }, { "epoch": 0.888463189880107, "grad_norm": 0.13832803070545197, "learning_rate": 6.089313204237346e-06, "loss": 0.0098, "step": 6725 }, { "epoch": 0.888595303365591, "grad_norm": 0.22622539103031158, "learning_rate": 6.075039922781234e-06, "loss": 0.0232, "step": 6726 }, { "epoch": 0.888727416851075, "grad_norm": 0.14251096546649933, "learning_rate": 6.060782865105819e-06, "loss": 0.0076, "step": 6727 }, { "epoch": 0.8888595303365591, "grad_norm": 0.11312871426343918, "learning_rate": 6.046542033673786e-06, "loss": 0.0122, "step": 6728 }, { "epoch": 0.8889916438220431, "grad_norm": 0.14361023902893066, "learning_rate": 6.032317430944923e-06, "loss": 0.0167, "step": 6729 }, { "epoch": 0.8891237573075271, "grad_norm": 0.18894532322883606, "learning_rate": 6.018109059376287e-06, "loss": 0.0246, "step": 6730 }, { "epoch": 0.8892558707930112, "grad_norm": 0.13724221289157867, "learning_rate": 6.003916921422115e-06, "loss": 0.0125, "step": 6731 }, { "epoch": 0.8893879842784952, "grad_norm": 0.42069005966186523, "learning_rate": 5.989741019533812e-06, "loss": 0.0158, "step": 6732 }, { "epoch": 0.8895200977639792, "grad_norm": 0.14127984642982483, "learning_rate": 5.975581356160009e-06, "loss": 0.0119, "step": 6733 }, { "epoch": 0.8896522112494633, "grad_norm": 0.1737101823091507, "learning_rate": 5.961437933746539e-06, "loss": 0.0127, "step": 6734 }, { "epoch": 0.8897843247349473, "grad_norm": 0.28091874718666077, "learning_rate": 5.947310754736402e-06, "loss": 0.0143, "step": 6735 }, { "epoch": 0.8899164382204313, "grad_norm": 0.16934643685817719, "learning_rate": 5.933199821569801e-06, "loss": 0.022, "step": 6736 }, { "epoch": 0.8900485517059153, "grad_norm": 0.13304175436496735, "learning_rate": 5.919105136684133e-06, "loss": 0.0113, "step": 6737 }, { "epoch": 0.8901806651913994, "grad_norm": 0.12925884127616882, "learning_rate": 5.905026702514005e-06, "loss": 0.0065, "step": 6738 }, { "epoch": 0.8903127786768834, "grad_norm": 0.15998664498329163, "learning_rate": 5.890964521491182e-06, "loss": 0.0179, "step": 6739 }, { "epoch": 0.8904448921623674, "grad_norm": 0.11898373067378998, "learning_rate": 5.876918596044667e-06, "loss": 0.0044, "step": 6740 }, { "epoch": 0.8905770056478515, "grad_norm": 0.152593195438385, "learning_rate": 5.862888928600607e-06, "loss": 0.0138, "step": 6741 }, { "epoch": 0.8907091191333355, "grad_norm": 0.19877228140830994, "learning_rate": 5.8488755215823975e-06, "loss": 0.0071, "step": 6742 }, { "epoch": 0.8908412326188195, "grad_norm": 0.20507656037807465, "learning_rate": 5.834878377410557e-06, "loss": 0.0191, "step": 6743 }, { "epoch": 0.8909733461043036, "grad_norm": 0.15045703947544098, "learning_rate": 5.8208974985028535e-06, "loss": 0.0116, "step": 6744 }, { "epoch": 0.8911054595897876, "grad_norm": 0.15195822715759277, "learning_rate": 5.806932887274219e-06, "loss": 0.0123, "step": 6745 }, { "epoch": 0.8912375730752716, "grad_norm": 0.17124532163143158, "learning_rate": 5.792984546136759e-06, "loss": 0.01, "step": 6746 }, { "epoch": 0.8913696865607557, "grad_norm": 0.21854707598686218, "learning_rate": 5.7790524774998136e-06, "loss": 0.025, "step": 6747 }, { "epoch": 0.8915018000462397, "grad_norm": 0.12468767166137695, "learning_rate": 5.765136683769868e-06, "loss": 0.0108, "step": 6748 }, { "epoch": 0.8916339135317237, "grad_norm": 0.12682819366455078, "learning_rate": 5.751237167350643e-06, "loss": 0.013, "step": 6749 }, { "epoch": 0.8917660270172078, "grad_norm": 0.15752992033958435, "learning_rate": 5.737353930642997e-06, "loss": 0.0182, "step": 6750 }, { "epoch": 0.8918981405026918, "grad_norm": 0.07519881427288055, "learning_rate": 5.723486976045001e-06, "loss": 0.0066, "step": 6751 }, { "epoch": 0.8920302539881758, "grad_norm": 0.2467702180147171, "learning_rate": 5.709636305951926e-06, "loss": 0.0228, "step": 6752 }, { "epoch": 0.8921623674736598, "grad_norm": 0.14889715611934662, "learning_rate": 5.695801922756194e-06, "loss": 0.0139, "step": 6753 }, { "epoch": 0.8922944809591439, "grad_norm": 0.39248546957969666, "learning_rate": 5.681983828847448e-06, "loss": 0.0226, "step": 6754 }, { "epoch": 0.8924265944446279, "grad_norm": 0.16626907885074615, "learning_rate": 5.6681820266125006e-06, "loss": 0.0312, "step": 6755 }, { "epoch": 0.8925587079301119, "grad_norm": 0.11991027742624283, "learning_rate": 5.65439651843539e-06, "loss": 0.0153, "step": 6756 }, { "epoch": 0.892690821415596, "grad_norm": 0.2725294530391693, "learning_rate": 5.640627306697244e-06, "loss": 0.0379, "step": 6757 }, { "epoch": 0.89282293490108, "grad_norm": 0.16560478508472443, "learning_rate": 5.626874393776482e-06, "loss": 0.0128, "step": 6758 }, { "epoch": 0.892955048386564, "grad_norm": 0.19976456463336945, "learning_rate": 5.61313778204865e-06, "loss": 0.0194, "step": 6759 }, { "epoch": 0.8930871618720481, "grad_norm": 0.20092976093292236, "learning_rate": 5.599417473886481e-06, "loss": 0.0192, "step": 6760 }, { "epoch": 0.8932192753575321, "grad_norm": 0.09612659364938736, "learning_rate": 5.585713471659915e-06, "loss": 0.0109, "step": 6761 }, { "epoch": 0.8933513888430161, "grad_norm": 0.11753436177968979, "learning_rate": 5.572025777736056e-06, "loss": 0.0164, "step": 6762 }, { "epoch": 0.8934835023285002, "grad_norm": 0.153545081615448, "learning_rate": 5.558354394479204e-06, "loss": 0.0164, "step": 6763 }, { "epoch": 0.8936156158139842, "grad_norm": 0.19137759506702423, "learning_rate": 5.5446993242508235e-06, "loss": 0.0229, "step": 6764 }, { "epoch": 0.8937477292994682, "grad_norm": 0.13186432421207428, "learning_rate": 5.531060569409574e-06, "loss": 0.0162, "step": 6765 }, { "epoch": 0.8938798427849522, "grad_norm": 0.23756030201911926, "learning_rate": 5.517438132311315e-06, "loss": 0.0189, "step": 6766 }, { "epoch": 0.8940119562704363, "grad_norm": 0.1321917027235031, "learning_rate": 5.50383201530904e-06, "loss": 0.0119, "step": 6767 }, { "epoch": 0.8941440697559203, "grad_norm": 0.1857648640871048, "learning_rate": 5.490242220752961e-06, "loss": 0.019, "step": 6768 }, { "epoch": 0.8942761832414043, "grad_norm": 0.19691239297389984, "learning_rate": 5.476668750990466e-06, "loss": 0.0182, "step": 6769 }, { "epoch": 0.8944082967268884, "grad_norm": 0.29870468378067017, "learning_rate": 5.463111608366122e-06, "loss": 0.013, "step": 6770 }, { "epoch": 0.8945404102123724, "grad_norm": 0.17927725613117218, "learning_rate": 5.449570795221659e-06, "loss": 0.0147, "step": 6771 }, { "epoch": 0.8946725236978564, "grad_norm": 0.12823109328746796, "learning_rate": 5.436046313896015e-06, "loss": 0.0152, "step": 6772 }, { "epoch": 0.8948046371833405, "grad_norm": 0.18331226706504822, "learning_rate": 5.422538166725277e-06, "loss": 0.0131, "step": 6773 }, { "epoch": 0.8949367506688245, "grad_norm": 0.17507404088974, "learning_rate": 5.409046356042735e-06, "loss": 0.0172, "step": 6774 }, { "epoch": 0.8950688641543085, "grad_norm": 0.08184914290904999, "learning_rate": 5.395570884178824e-06, "loss": 0.0079, "step": 6775 }, { "epoch": 0.8952009776397926, "grad_norm": 0.12260734289884567, "learning_rate": 5.3821117534612165e-06, "loss": 0.0141, "step": 6776 }, { "epoch": 0.8953330911252766, "grad_norm": 0.15787914395332336, "learning_rate": 5.368668966214707e-06, "loss": 0.014, "step": 6777 }, { "epoch": 0.8954652046107606, "grad_norm": 0.09953964501619339, "learning_rate": 5.355242524761261e-06, "loss": 0.0087, "step": 6778 }, { "epoch": 0.8955973180962447, "grad_norm": 0.16149206459522247, "learning_rate": 5.341832431420091e-06, "loss": 0.017, "step": 6779 }, { "epoch": 0.8957294315817287, "grad_norm": 0.22760704159736633, "learning_rate": 5.3284386885075195e-06, "loss": 0.0177, "step": 6780 }, { "epoch": 0.8958615450672127, "grad_norm": 0.09379708766937256, "learning_rate": 5.315061298337065e-06, "loss": 0.0111, "step": 6781 }, { "epoch": 0.8959936585526967, "grad_norm": 0.14150847494602203, "learning_rate": 5.301700263219411e-06, "loss": 0.022, "step": 6782 }, { "epoch": 0.8961257720381808, "grad_norm": 0.10962291061878204, "learning_rate": 5.2883555854624565e-06, "loss": 0.0128, "step": 6783 }, { "epoch": 0.8962578855236648, "grad_norm": 0.11838527023792267, "learning_rate": 5.275027267371213e-06, "loss": 0.0127, "step": 6784 }, { "epoch": 0.8963899990091488, "grad_norm": 0.15095725655555725, "learning_rate": 5.2617153112479055e-06, "loss": 0.0111, "step": 6785 }, { "epoch": 0.8965221124946329, "grad_norm": 0.15705175697803497, "learning_rate": 5.2484197193919286e-06, "loss": 0.0155, "step": 6786 }, { "epoch": 0.8966542259801169, "grad_norm": 0.12951944768428802, "learning_rate": 5.235140494099866e-06, "loss": 0.0164, "step": 6787 }, { "epoch": 0.8967863394656009, "grad_norm": 0.13726073503494263, "learning_rate": 5.221877637665429e-06, "loss": 0.0157, "step": 6788 }, { "epoch": 0.896918452951085, "grad_norm": 0.14434929192066193, "learning_rate": 5.208631152379528e-06, "loss": 0.0138, "step": 6789 }, { "epoch": 0.897050566436569, "grad_norm": 0.12067513167858124, "learning_rate": 5.195401040530279e-06, "loss": 0.0087, "step": 6790 }, { "epoch": 0.897182679922053, "grad_norm": 0.17936775088310242, "learning_rate": 5.182187304402897e-06, "loss": 0.0172, "step": 6791 }, { "epoch": 0.8973147934075371, "grad_norm": 0.2976245880126953, "learning_rate": 5.168989946279823e-06, "loss": 0.038, "step": 6792 }, { "epoch": 0.8974469068930211, "grad_norm": 0.2362768054008484, "learning_rate": 5.155808968440645e-06, "loss": 0.0188, "step": 6793 }, { "epoch": 0.8975790203785051, "grad_norm": 0.1400487720966339, "learning_rate": 5.142644373162164e-06, "loss": 0.0043, "step": 6794 }, { "epoch": 0.8977111338639892, "grad_norm": 0.1404900997877121, "learning_rate": 5.129496162718284e-06, "loss": 0.0172, "step": 6795 }, { "epoch": 0.8978432473494732, "grad_norm": 0.14347364008426666, "learning_rate": 5.11636433938012e-06, "loss": 0.0124, "step": 6796 }, { "epoch": 0.8979753608349572, "grad_norm": 0.17006570100784302, "learning_rate": 5.103248905415958e-06, "loss": 0.0183, "step": 6797 }, { "epoch": 0.8981074743204412, "grad_norm": 0.34829583764076233, "learning_rate": 5.09014986309122e-06, "loss": 0.0187, "step": 6798 }, { "epoch": 0.8982395878059253, "grad_norm": 0.13791634142398834, "learning_rate": 5.077067214668552e-06, "loss": 0.0118, "step": 6799 }, { "epoch": 0.8983717012914093, "grad_norm": 0.1171623021364212, "learning_rate": 5.064000962407711e-06, "loss": 0.0116, "step": 6800 }, { "epoch": 0.8985038147768933, "grad_norm": 0.21284325420856476, "learning_rate": 5.050951108565682e-06, "loss": 0.0194, "step": 6801 }, { "epoch": 0.8986359282623774, "grad_norm": 0.1664762794971466, "learning_rate": 5.03791765539654e-06, "loss": 0.0182, "step": 6802 }, { "epoch": 0.8987680417478614, "grad_norm": 0.21837081015110016, "learning_rate": 5.024900605151595e-06, "loss": 0.0258, "step": 6803 }, { "epoch": 0.8989001552333454, "grad_norm": 0.17388522624969482, "learning_rate": 5.011899960079303e-06, "loss": 0.0136, "step": 6804 }, { "epoch": 0.8990322687188295, "grad_norm": 0.10882629454135895, "learning_rate": 4.998915722425268e-06, "loss": 0.0033, "step": 6805 }, { "epoch": 0.8991643822043135, "grad_norm": 0.11351826041936874, "learning_rate": 4.985947894432286e-06, "loss": 0.0083, "step": 6806 }, { "epoch": 0.8992964956897975, "grad_norm": 0.16879047453403473, "learning_rate": 4.972996478340286e-06, "loss": 0.0143, "step": 6807 }, { "epoch": 0.8994286091752816, "grad_norm": 0.25198879837989807, "learning_rate": 4.960061476386424e-06, "loss": 0.0156, "step": 6808 }, { "epoch": 0.8995607226607656, "grad_norm": 0.1777067482471466, "learning_rate": 4.9471428908049345e-06, "loss": 0.0201, "step": 6809 }, { "epoch": 0.8996928361462496, "grad_norm": 0.14899660646915436, "learning_rate": 4.934240723827288e-06, "loss": 0.0106, "step": 6810 }, { "epoch": 0.8998249496317337, "grad_norm": 0.14987502992153168, "learning_rate": 4.921354977682091e-06, "loss": 0.011, "step": 6811 }, { "epoch": 0.8999570631172177, "grad_norm": 0.16233918070793152, "learning_rate": 4.908485654595107e-06, "loss": 0.0172, "step": 6812 }, { "epoch": 0.9000891766027017, "grad_norm": 0.13594211637973785, "learning_rate": 4.89563275678927e-06, "loss": 0.0132, "step": 6813 }, { "epoch": 0.9002212900881857, "grad_norm": 0.1069660559296608, "learning_rate": 4.882796286484681e-06, "loss": 0.0138, "step": 6814 }, { "epoch": 0.9003534035736698, "grad_norm": 0.15343253314495087, "learning_rate": 4.8699762458986106e-06, "loss": 0.0229, "step": 6815 }, { "epoch": 0.9004855170591538, "grad_norm": 0.15718504786491394, "learning_rate": 4.857172637245466e-06, "loss": 0.0102, "step": 6816 }, { "epoch": 0.9006176305446378, "grad_norm": 0.18040478229522705, "learning_rate": 4.844385462736834e-06, "loss": 0.0137, "step": 6817 }, { "epoch": 0.9007497440301219, "grad_norm": 0.15509812533855438, "learning_rate": 4.831614724581468e-06, "loss": 0.0196, "step": 6818 }, { "epoch": 0.9008818575156059, "grad_norm": 0.12873555719852448, "learning_rate": 4.818860424985272e-06, "loss": 0.0128, "step": 6819 }, { "epoch": 0.9010139710010899, "grad_norm": 0.16008254885673523, "learning_rate": 4.806122566151294e-06, "loss": 0.0167, "step": 6820 }, { "epoch": 0.901146084486574, "grad_norm": 0.11089048534631729, "learning_rate": 4.793401150279786e-06, "loss": 0.0092, "step": 6821 }, { "epoch": 0.901278197972058, "grad_norm": 0.1454131156206131, "learning_rate": 4.780696179568133e-06, "loss": 0.0137, "step": 6822 }, { "epoch": 0.901410311457542, "grad_norm": 0.3379257023334503, "learning_rate": 4.768007656210871e-06, "loss": 0.0206, "step": 6823 }, { "epoch": 0.901542424943026, "grad_norm": 0.17755140364170074, "learning_rate": 4.7553355823997e-06, "loss": 0.0139, "step": 6824 }, { "epoch": 0.9016745384285101, "grad_norm": 0.11262303590774536, "learning_rate": 4.7426799603235036e-06, "loss": 0.0112, "step": 6825 }, { "epoch": 0.9018066519139941, "grad_norm": 0.16604208946228027, "learning_rate": 4.730040792168289e-06, "loss": 0.0156, "step": 6826 }, { "epoch": 0.9019387653994781, "grad_norm": 0.09217502176761627, "learning_rate": 4.7174180801172305e-06, "loss": 0.0062, "step": 6827 }, { "epoch": 0.9020708788849622, "grad_norm": 0.10860409587621689, "learning_rate": 4.704811826350675e-06, "loss": 0.0109, "step": 6828 }, { "epoch": 0.9022029923704462, "grad_norm": 0.1001148670911789, "learning_rate": 4.692222033046134e-06, "loss": 0.0112, "step": 6829 }, { "epoch": 0.9023351058559302, "grad_norm": 0.13455744087696075, "learning_rate": 4.6796487023782145e-06, "loss": 0.0108, "step": 6830 }, { "epoch": 0.9024672193414143, "grad_norm": 0.13678915798664093, "learning_rate": 4.667091836518766e-06, "loss": 0.018, "step": 6831 }, { "epoch": 0.9025993328268983, "grad_norm": 0.15132828056812286, "learning_rate": 4.654551437636745e-06, "loss": 0.0111, "step": 6832 }, { "epoch": 0.9027314463123823, "grad_norm": 0.19527070224285126, "learning_rate": 4.64202750789825e-06, "loss": 0.0185, "step": 6833 }, { "epoch": 0.9028635597978664, "grad_norm": 0.11000338196754456, "learning_rate": 4.629520049466573e-06, "loss": 0.0089, "step": 6834 }, { "epoch": 0.9029956732833504, "grad_norm": 0.09668418020009995, "learning_rate": 4.617029064502132e-06, "loss": 0.0092, "step": 6835 }, { "epoch": 0.9031277867688344, "grad_norm": 0.16052260994911194, "learning_rate": 4.6045545551625434e-06, "loss": 0.0149, "step": 6836 }, { "epoch": 0.9032599002543185, "grad_norm": 0.3567523956298828, "learning_rate": 4.592096523602485e-06, "loss": 0.0233, "step": 6837 }, { "epoch": 0.9033920137398025, "grad_norm": 0.18221727013587952, "learning_rate": 4.579654971973912e-06, "loss": 0.0197, "step": 6838 }, { "epoch": 0.9035241272252865, "grad_norm": 0.12295905500650406, "learning_rate": 4.567229902425829e-06, "loss": 0.018, "step": 6839 }, { "epoch": 0.9036562407107706, "grad_norm": 0.15242114663124084, "learning_rate": 4.554821317104452e-06, "loss": 0.0128, "step": 6840 }, { "epoch": 0.9037883541962546, "grad_norm": 0.14071272313594818, "learning_rate": 4.542429218153121e-06, "loss": 0.0097, "step": 6841 }, { "epoch": 0.9039204676817386, "grad_norm": 0.11783215403556824, "learning_rate": 4.53005360771237e-06, "loss": 0.0104, "step": 6842 }, { "epoch": 0.9040525811672226, "grad_norm": 0.17464067041873932, "learning_rate": 4.51769448791981e-06, "loss": 0.0186, "step": 6843 }, { "epoch": 0.9041846946527067, "grad_norm": 0.1822936236858368, "learning_rate": 4.505351860910268e-06, "loss": 0.0166, "step": 6844 }, { "epoch": 0.9043168081381907, "grad_norm": 0.13986508548259735, "learning_rate": 4.493025728815725e-06, "loss": 0.0148, "step": 6845 }, { "epoch": 0.9044489216236747, "grad_norm": 0.24553631246089935, "learning_rate": 4.480716093765247e-06, "loss": 0.0196, "step": 6846 }, { "epoch": 0.9045810351091588, "grad_norm": 0.20718611776828766, "learning_rate": 4.46842295788511e-06, "loss": 0.0145, "step": 6847 }, { "epoch": 0.9047131485946428, "grad_norm": 0.3277038633823395, "learning_rate": 4.4561463232987265e-06, "loss": 0.0216, "step": 6848 }, { "epoch": 0.9048452620801268, "grad_norm": 0.12654957175254822, "learning_rate": 4.443886192126679e-06, "loss": 0.0125, "step": 6849 }, { "epoch": 0.9049773755656109, "grad_norm": 0.1359531730413437, "learning_rate": 4.431642566486638e-06, "loss": 0.0144, "step": 6850 }, { "epoch": 0.9051094890510949, "grad_norm": 0.18986926972866058, "learning_rate": 4.419415448493469e-06, "loss": 0.018, "step": 6851 }, { "epoch": 0.9052416025365789, "grad_norm": 0.4173817038536072, "learning_rate": 4.4072048402592045e-06, "loss": 0.0228, "step": 6852 }, { "epoch": 0.905373716022063, "grad_norm": 0.2071004956960678, "learning_rate": 4.395010743892957e-06, "loss": 0.0208, "step": 6853 }, { "epoch": 0.905505829507547, "grad_norm": 0.11187253892421722, "learning_rate": 4.382833161501065e-06, "loss": 0.0122, "step": 6854 }, { "epoch": 0.905637942993031, "grad_norm": 0.19523876905441284, "learning_rate": 4.370672095186956e-06, "loss": 0.0156, "step": 6855 }, { "epoch": 0.905770056478515, "grad_norm": 0.10688573122024536, "learning_rate": 4.35852754705125e-06, "loss": 0.0108, "step": 6856 }, { "epoch": 0.9059021699639991, "grad_norm": 0.08100339025259018, "learning_rate": 4.346399519191657e-06, "loss": 0.0078, "step": 6857 }, { "epoch": 0.9060342834494831, "grad_norm": 0.11563509702682495, "learning_rate": 4.334288013703091e-06, "loss": 0.0118, "step": 6858 }, { "epoch": 0.9061663969349671, "grad_norm": 0.20008042454719543, "learning_rate": 4.322193032677602e-06, "loss": 0.0232, "step": 6859 }, { "epoch": 0.9062985104204512, "grad_norm": 0.14969661831855774, "learning_rate": 4.310114578204327e-06, "loss": 0.011, "step": 6860 }, { "epoch": 0.9064306239059352, "grad_norm": 0.14968842267990112, "learning_rate": 4.298052652369633e-06, "loss": 0.0146, "step": 6861 }, { "epoch": 0.9065627373914192, "grad_norm": 0.25634080171585083, "learning_rate": 4.286007257256963e-06, "loss": 0.0267, "step": 6862 }, { "epoch": 0.9066948508769033, "grad_norm": 0.1451036036014557, "learning_rate": 4.2739783949469645e-06, "loss": 0.0189, "step": 6863 }, { "epoch": 0.9068269643623873, "grad_norm": 0.09582092612981796, "learning_rate": 4.261966067517375e-06, "loss": 0.0163, "step": 6864 }, { "epoch": 0.9069590778478713, "grad_norm": 0.1373281031847, "learning_rate": 4.249970277043114e-06, "loss": 0.0108, "step": 6865 }, { "epoch": 0.9070911913333554, "grad_norm": 0.14923548698425293, "learning_rate": 4.237991025596222e-06, "loss": 0.0207, "step": 6866 }, { "epoch": 0.9072233048188394, "grad_norm": 0.3042697608470917, "learning_rate": 4.226028315245889e-06, "loss": 0.0217, "step": 6867 }, { "epoch": 0.9073554183043234, "grad_norm": 0.11148915439844131, "learning_rate": 4.214082148058451e-06, "loss": 0.0065, "step": 6868 }, { "epoch": 0.9074875317898075, "grad_norm": 0.1816503405570984, "learning_rate": 4.20215252609738e-06, "loss": 0.0215, "step": 6869 }, { "epoch": 0.9076196452752915, "grad_norm": 0.24329888820648193, "learning_rate": 4.190239451423317e-06, "loss": 0.0175, "step": 6870 }, { "epoch": 0.9077517587607755, "grad_norm": 0.1107625663280487, "learning_rate": 4.178342926094003e-06, "loss": 0.0106, "step": 6871 }, { "epoch": 0.9078838722462595, "grad_norm": 0.12002073973417282, "learning_rate": 4.166462952164341e-06, "loss": 0.0136, "step": 6872 }, { "epoch": 0.9080159857317436, "grad_norm": 0.1341729611158371, "learning_rate": 4.154599531686387e-06, "loss": 0.0126, "step": 6873 }, { "epoch": 0.9081480992172276, "grad_norm": 0.14817306399345398, "learning_rate": 4.142752666709304e-06, "loss": 0.0132, "step": 6874 }, { "epoch": 0.9082802127027116, "grad_norm": 0.10765121132135391, "learning_rate": 4.130922359279432e-06, "loss": 0.0101, "step": 6875 }, { "epoch": 0.9084123261881957, "grad_norm": 0.11597680300474167, "learning_rate": 4.119108611440225e-06, "loss": 0.0052, "step": 6876 }, { "epoch": 0.9085444396736797, "grad_norm": 0.090935617685318, "learning_rate": 4.107311425232319e-06, "loss": 0.0067, "step": 6877 }, { "epoch": 0.9086765531591637, "grad_norm": 0.24119891226291656, "learning_rate": 4.095530802693404e-06, "loss": 0.0303, "step": 6878 }, { "epoch": 0.9088086666446478, "grad_norm": 0.11198988556861877, "learning_rate": 4.083766745858408e-06, "loss": 0.0134, "step": 6879 }, { "epoch": 0.9089407801301318, "grad_norm": 0.21285560727119446, "learning_rate": 4.07201925675933e-06, "loss": 0.0224, "step": 6880 }, { "epoch": 0.9090728936156158, "grad_norm": 0.2002650648355484, "learning_rate": 4.060288337425333e-06, "loss": 0.0212, "step": 6881 }, { "epoch": 0.9092050071010999, "grad_norm": 0.1997833251953125, "learning_rate": 4.04857398988272e-06, "loss": 0.0205, "step": 6882 }, { "epoch": 0.9093371205865839, "grad_norm": 0.13389846682548523, "learning_rate": 4.036876216154906e-06, "loss": 0.0117, "step": 6883 }, { "epoch": 0.9094692340720679, "grad_norm": 0.1802111119031906, "learning_rate": 4.025195018262495e-06, "loss": 0.0201, "step": 6884 }, { "epoch": 0.909601347557552, "grad_norm": 0.16740918159484863, "learning_rate": 4.0135303982231755e-06, "loss": 0.0227, "step": 6885 }, { "epoch": 0.909733461043036, "grad_norm": 0.1399318426847458, "learning_rate": 4.001882358051779e-06, "loss": 0.0142, "step": 6886 }, { "epoch": 0.90986557452852, "grad_norm": 0.16021879017353058, "learning_rate": 3.9902508997603175e-06, "loss": 0.0141, "step": 6887 }, { "epoch": 0.909997688014004, "grad_norm": 0.1452077478170395, "learning_rate": 3.978636025357885e-06, "loss": 0.0141, "step": 6888 }, { "epoch": 0.9101298014994881, "grad_norm": 0.1472679078578949, "learning_rate": 3.967037736850743e-06, "loss": 0.0097, "step": 6889 }, { "epoch": 0.9102619149849721, "grad_norm": 0.2741580903530121, "learning_rate": 3.9554560362422775e-06, "loss": 0.0192, "step": 6890 }, { "epoch": 0.9103940284704561, "grad_norm": 0.13638873398303986, "learning_rate": 3.943890925533022e-06, "loss": 0.016, "step": 6891 }, { "epoch": 0.9105261419559402, "grad_norm": 0.20388948917388916, "learning_rate": 3.932342406720602e-06, "loss": 0.0169, "step": 6892 }, { "epoch": 0.9106582554414242, "grad_norm": 0.11608298122882843, "learning_rate": 3.920810481799841e-06, "loss": 0.0094, "step": 6893 }, { "epoch": 0.9107903689269082, "grad_norm": 0.12942413985729218, "learning_rate": 3.909295152762648e-06, "loss": 0.01, "step": 6894 }, { "epoch": 0.9109224824123923, "grad_norm": 0.13718636333942413, "learning_rate": 3.897796421598088e-06, "loss": 0.0125, "step": 6895 }, { "epoch": 0.9110545958978763, "grad_norm": 0.19180552661418915, "learning_rate": 3.886314290292326e-06, "loss": 0.0184, "step": 6896 }, { "epoch": 0.9111867093833603, "grad_norm": 0.16199803352355957, "learning_rate": 3.874848760828731e-06, "loss": 0.0191, "step": 6897 }, { "epoch": 0.9113188228688444, "grad_norm": 0.10444027185440063, "learning_rate": 3.86339983518772e-06, "loss": 0.0086, "step": 6898 }, { "epoch": 0.9114509363543284, "grad_norm": 0.19840379059314728, "learning_rate": 3.8519675153468995e-06, "loss": 0.0168, "step": 6899 }, { "epoch": 0.9115830498398124, "grad_norm": 0.21695414185523987, "learning_rate": 3.8405518032809785e-06, "loss": 0.0187, "step": 6900 }, { "epoch": 0.9117151633252965, "grad_norm": 0.07511827349662781, "learning_rate": 3.829152700961835e-06, "loss": 0.0044, "step": 6901 }, { "epoch": 0.9118472768107805, "grad_norm": 0.15764155983924866, "learning_rate": 3.817770210358407e-06, "loss": 0.0243, "step": 6902 }, { "epoch": 0.9119793902962645, "grad_norm": 0.11205530911684036, "learning_rate": 3.8064043334368416e-06, "loss": 0.0112, "step": 6903 }, { "epoch": 0.9121115037817485, "grad_norm": 0.14387571811676025, "learning_rate": 3.7950550721603696e-06, "loss": 0.0109, "step": 6904 }, { "epoch": 0.9122436172672326, "grad_norm": 0.2204601764678955, "learning_rate": 3.783722428489367e-06, "loss": 0.0215, "step": 6905 }, { "epoch": 0.9123757307527166, "grad_norm": 0.10634907335042953, "learning_rate": 3.7724064043813232e-06, "loss": 0.0087, "step": 6906 }, { "epoch": 0.9125078442382006, "grad_norm": 0.19405195116996765, "learning_rate": 3.7611070017908757e-06, "loss": 0.0179, "step": 6907 }, { "epoch": 0.9126399577236847, "grad_norm": 0.1435571163892746, "learning_rate": 3.7498242226698066e-06, "loss": 0.0096, "step": 6908 }, { "epoch": 0.9127720712091687, "grad_norm": 0.159325510263443, "learning_rate": 3.7385580689669796e-06, "loss": 0.0147, "step": 6909 }, { "epoch": 0.9129041846946527, "grad_norm": 0.1752527505159378, "learning_rate": 3.727308542628416e-06, "loss": 0.0305, "step": 6910 }, { "epoch": 0.9130362981801368, "grad_norm": 0.17263196408748627, "learning_rate": 3.716075645597272e-06, "loss": 0.0214, "step": 6911 }, { "epoch": 0.9131684116656208, "grad_norm": 0.07739172875881195, "learning_rate": 3.7048593798138077e-06, "loss": 0.0052, "step": 6912 }, { "epoch": 0.9133005251511048, "grad_norm": 0.21998505294322968, "learning_rate": 3.6936597472154277e-06, "loss": 0.0325, "step": 6913 }, { "epoch": 0.9134326386365889, "grad_norm": 0.22086240351200104, "learning_rate": 3.682476749736663e-06, "loss": 0.0175, "step": 6914 }, { "epoch": 0.9135647521220729, "grad_norm": 0.1500852108001709, "learning_rate": 3.671310389309168e-06, "loss": 0.0132, "step": 6915 }, { "epoch": 0.9136968656075569, "grad_norm": 0.2943585515022278, "learning_rate": 3.660160667861712e-06, "loss": 0.0152, "step": 6916 }, { "epoch": 0.913828979093041, "grad_norm": 0.16644565761089325, "learning_rate": 3.6490275873202085e-06, "loss": 0.0124, "step": 6917 }, { "epoch": 0.913961092578525, "grad_norm": 0.2597222924232483, "learning_rate": 3.637911149607709e-06, "loss": 0.0221, "step": 6918 }, { "epoch": 0.914093206064009, "grad_norm": 0.13682372868061066, "learning_rate": 3.6268113566443327e-06, "loss": 0.0144, "step": 6919 }, { "epoch": 0.914225319549493, "grad_norm": 0.1653459668159485, "learning_rate": 3.615728210347369e-06, "loss": 0.021, "step": 6920 }, { "epoch": 0.9143574330349771, "grad_norm": 0.17368349432945251, "learning_rate": 3.604661712631241e-06, "loss": 0.0117, "step": 6921 }, { "epoch": 0.9144895465204611, "grad_norm": 0.14491091668605804, "learning_rate": 3.5936118654074867e-06, "loss": 0.0136, "step": 6922 }, { "epoch": 0.9146216600059451, "grad_norm": 0.17515389621257782, "learning_rate": 3.5825786705847354e-06, "loss": 0.0167, "step": 6923 }, { "epoch": 0.9147537734914292, "grad_norm": 0.1294727772474289, "learning_rate": 3.571562130068773e-06, "loss": 0.0113, "step": 6924 }, { "epoch": 0.9148858869769132, "grad_norm": 0.25797829031944275, "learning_rate": 3.560562245762522e-06, "loss": 0.0219, "step": 6925 }, { "epoch": 0.9150180004623972, "grad_norm": 0.11501938849687576, "learning_rate": 3.549579019565974e-06, "loss": 0.0094, "step": 6926 }, { "epoch": 0.9151501139478813, "grad_norm": 0.1509166806936264, "learning_rate": 3.5386124533762775e-06, "loss": 0.0133, "step": 6927 }, { "epoch": 0.9152822274333653, "grad_norm": 0.156303271651268, "learning_rate": 3.527662549087729e-06, "loss": 0.0104, "step": 6928 }, { "epoch": 0.9154143409188493, "grad_norm": 0.39509233832359314, "learning_rate": 3.5167293085917153e-06, "loss": 0.0125, "step": 6929 }, { "epoch": 0.9155464544043334, "grad_norm": 0.14178279042243958, "learning_rate": 3.5058127337767253e-06, "loss": 0.0188, "step": 6930 }, { "epoch": 0.9156785678898174, "grad_norm": 0.1454003006219864, "learning_rate": 3.494912826528407e-06, "loss": 0.0119, "step": 6931 }, { "epoch": 0.9158106813753014, "grad_norm": 0.13929800689220428, "learning_rate": 3.4840295887295315e-06, "loss": 0.0113, "step": 6932 }, { "epoch": 0.9159427948607854, "grad_norm": 0.1115756407380104, "learning_rate": 3.47316302225994e-06, "loss": 0.0138, "step": 6933 }, { "epoch": 0.9160749083462695, "grad_norm": 0.12636327743530273, "learning_rate": 3.4623131289966525e-06, "loss": 0.0113, "step": 6934 }, { "epoch": 0.9162070218317535, "grad_norm": 0.11508063971996307, "learning_rate": 3.451479910813782e-06, "loss": 0.0094, "step": 6935 }, { "epoch": 0.9163391353172375, "grad_norm": 0.14656183123588562, "learning_rate": 3.4406633695825752e-06, "loss": 0.0134, "step": 6936 }, { "epoch": 0.9164712488027216, "grad_norm": 0.14659330248832703, "learning_rate": 3.42986350717136e-06, "loss": 0.0118, "step": 6937 }, { "epoch": 0.9166033622882056, "grad_norm": 0.08480847626924515, "learning_rate": 3.419080325445634e-06, "loss": 0.0063, "step": 6938 }, { "epoch": 0.9167354757736896, "grad_norm": 0.1341126710176468, "learning_rate": 3.408313826267984e-06, "loss": 0.0132, "step": 6939 }, { "epoch": 0.9168675892591737, "grad_norm": 0.24358314275741577, "learning_rate": 3.397564011498111e-06, "loss": 0.02, "step": 6940 }, { "epoch": 0.9169997027446577, "grad_norm": 0.23924903571605682, "learning_rate": 3.386830882992853e-06, "loss": 0.0172, "step": 6941 }, { "epoch": 0.9171318162301417, "grad_norm": 0.16076403856277466, "learning_rate": 3.37611444260616e-06, "loss": 0.0154, "step": 6942 }, { "epoch": 0.9172639297156258, "grad_norm": 0.20377475023269653, "learning_rate": 3.365414692189106e-06, "loss": 0.0272, "step": 6943 }, { "epoch": 0.9173960432011098, "grad_norm": 0.13567671179771423, "learning_rate": 3.354731633589847e-06, "loss": 0.0096, "step": 6944 }, { "epoch": 0.9175281566865938, "grad_norm": 0.18160465359687805, "learning_rate": 3.3440652686536957e-06, "loss": 0.0213, "step": 6945 }, { "epoch": 0.9176602701720779, "grad_norm": 0.13435958325862885, "learning_rate": 3.3334155992230776e-06, "loss": 0.0133, "step": 6946 }, { "epoch": 0.9177923836575619, "grad_norm": 0.14226631820201874, "learning_rate": 3.3227826271374997e-06, "loss": 0.0083, "step": 6947 }, { "epoch": 0.9179244971430459, "grad_norm": 0.12110575288534164, "learning_rate": 3.3121663542336256e-06, "loss": 0.011, "step": 6948 }, { "epoch": 0.91805661062853, "grad_norm": 0.1794366091489792, "learning_rate": 3.301566782345211e-06, "loss": 0.0214, "step": 6949 }, { "epoch": 0.918188724114014, "grad_norm": 0.1318056434392929, "learning_rate": 3.290983913303147e-06, "loss": 0.0115, "step": 6950 }, { "epoch": 0.918320837599498, "grad_norm": 0.14880859851837158, "learning_rate": 3.280417748935416e-06, "loss": 0.0105, "step": 6951 }, { "epoch": 0.918452951084982, "grad_norm": 0.14884725213050842, "learning_rate": 3.269868291067124e-06, "loss": 0.0109, "step": 6952 }, { "epoch": 0.9185850645704661, "grad_norm": 0.19536083936691284, "learning_rate": 3.259335541520503e-06, "loss": 0.0113, "step": 6953 }, { "epoch": 0.9187171780559501, "grad_norm": 0.13253453373908997, "learning_rate": 3.2488195021148525e-06, "loss": 0.014, "step": 6954 }, { "epoch": 0.9188492915414341, "grad_norm": 0.11469663679599762, "learning_rate": 3.238320174666676e-06, "loss": 0.0182, "step": 6955 }, { "epoch": 0.9189814050269182, "grad_norm": 0.12924715876579285, "learning_rate": 3.227837560989511e-06, "loss": 0.0105, "step": 6956 }, { "epoch": 0.9191135185124022, "grad_norm": 0.13562564551830292, "learning_rate": 3.2173716628940198e-06, "loss": 0.0074, "step": 6957 }, { "epoch": 0.9192456319978862, "grad_norm": 0.1238885447382927, "learning_rate": 3.2069224821880127e-06, "loss": 0.0165, "step": 6958 }, { "epoch": 0.9193777454833703, "grad_norm": 0.1311292201280594, "learning_rate": 3.196490020676379e-06, "loss": 0.0113, "step": 6959 }, { "epoch": 0.9195098589688543, "grad_norm": 0.14047713577747345, "learning_rate": 3.186074280161122e-06, "loss": 0.0117, "step": 6960 }, { "epoch": 0.9196419724543383, "grad_norm": 0.1856159120798111, "learning_rate": 3.175675262441391e-06, "loss": 0.0202, "step": 6961 }, { "epoch": 0.9197740859398224, "grad_norm": 0.1386088728904724, "learning_rate": 3.1652929693133935e-06, "loss": 0.0133, "step": 6962 }, { "epoch": 0.9199061994253064, "grad_norm": 0.21501292288303375, "learning_rate": 3.1549274025705065e-06, "loss": 0.0237, "step": 6963 }, { "epoch": 0.9200383129107904, "grad_norm": 0.14579859375953674, "learning_rate": 3.1445785640031646e-06, "loss": 0.0169, "step": 6964 }, { "epoch": 0.9201704263962744, "grad_norm": 0.2133261114358902, "learning_rate": 3.134246455398937e-06, "loss": 0.0159, "step": 6965 }, { "epoch": 0.9203025398817585, "grad_norm": 0.21119467914104462, "learning_rate": 3.1239310785425192e-06, "loss": 0.0227, "step": 6966 }, { "epoch": 0.9204346533672425, "grad_norm": 0.13757382333278656, "learning_rate": 3.1136324352156852e-06, "loss": 0.0142, "step": 6967 }, { "epoch": 0.9205667668527265, "grad_norm": 0.18206921219825745, "learning_rate": 3.103350527197335e-06, "loss": 0.0138, "step": 6968 }, { "epoch": 0.9206988803382106, "grad_norm": 0.1949104219675064, "learning_rate": 3.093085356263481e-06, "loss": 0.0293, "step": 6969 }, { "epoch": 0.9208309938236946, "grad_norm": 0.14741899073123932, "learning_rate": 3.082836924187238e-06, "loss": 0.0149, "step": 6970 }, { "epoch": 0.9209631073091786, "grad_norm": 0.11937861144542694, "learning_rate": 3.072605232738823e-06, "loss": 0.0149, "step": 6971 }, { "epoch": 0.9210952207946627, "grad_norm": 0.09795008599758148, "learning_rate": 3.062390283685579e-06, "loss": 0.0074, "step": 6972 }, { "epoch": 0.9212273342801467, "grad_norm": 0.09507367014884949, "learning_rate": 3.05219207879196e-06, "loss": 0.0047, "step": 6973 }, { "epoch": 0.9213594477656307, "grad_norm": 0.1361870914697647, "learning_rate": 3.0420106198194797e-06, "loss": 0.019, "step": 6974 }, { "epoch": 0.9214915612511148, "grad_norm": 0.21445150673389435, "learning_rate": 3.03184590852682e-06, "loss": 0.0137, "step": 6975 }, { "epoch": 0.9216236747365988, "grad_norm": 0.18917971849441528, "learning_rate": 3.0216979466697436e-06, "loss": 0.0212, "step": 6976 }, { "epoch": 0.9217557882220828, "grad_norm": 0.20330829918384552, "learning_rate": 3.0115667360011144e-06, "loss": 0.0178, "step": 6977 }, { "epoch": 0.9218879017075668, "grad_norm": 0.14233195781707764, "learning_rate": 3.001452278270911e-06, "loss": 0.0116, "step": 6978 }, { "epoch": 0.9220200151930509, "grad_norm": 0.1598593294620514, "learning_rate": 2.991354575226224e-06, "loss": 0.0166, "step": 6979 }, { "epoch": 0.9221521286785349, "grad_norm": 0.24355551600456238, "learning_rate": 2.9812736286112365e-06, "loss": 0.034, "step": 6980 }, { "epoch": 0.9222842421640189, "grad_norm": 0.15401063859462738, "learning_rate": 2.971209440167244e-06, "loss": 0.0115, "step": 6981 }, { "epoch": 0.922416355649503, "grad_norm": 0.13340553641319275, "learning_rate": 2.9611620116326346e-06, "loss": 0.0092, "step": 6982 }, { "epoch": 0.922548469134987, "grad_norm": 0.1478549987077713, "learning_rate": 2.9511313447429303e-06, "loss": 0.0144, "step": 6983 }, { "epoch": 0.922680582620471, "grad_norm": 0.1342504918575287, "learning_rate": 2.941117441230756e-06, "loss": 0.0158, "step": 6984 }, { "epoch": 0.9228126961059551, "grad_norm": 0.1833777129650116, "learning_rate": 2.931120302825785e-06, "loss": 0.0132, "step": 6985 }, { "epoch": 0.9229448095914391, "grad_norm": 0.1599016785621643, "learning_rate": 2.9211399312548684e-06, "loss": 0.0163, "step": 6986 }, { "epoch": 0.9230769230769231, "grad_norm": 0.16944639384746552, "learning_rate": 2.911176328241927e-06, "loss": 0.0187, "step": 6987 }, { "epoch": 0.9232090365624072, "grad_norm": 0.30180472135543823, "learning_rate": 2.901229495507962e-06, "loss": 0.0431, "step": 6988 }, { "epoch": 0.9233411500478912, "grad_norm": 0.13991716504096985, "learning_rate": 2.891299434771133e-06, "loss": 0.0087, "step": 6989 }, { "epoch": 0.9234732635333752, "grad_norm": 0.09995570778846741, "learning_rate": 2.8813861477466455e-06, "loss": 0.0063, "step": 6990 }, { "epoch": 0.9236053770188593, "grad_norm": 0.17359285056591034, "learning_rate": 2.8714896361468624e-06, "loss": 0.0149, "step": 6991 }, { "epoch": 0.9237374905043433, "grad_norm": 0.1824413388967514, "learning_rate": 2.861609901681195e-06, "loss": 0.0159, "step": 6992 }, { "epoch": 0.9238696039898273, "grad_norm": 0.1265413910150528, "learning_rate": 2.8517469460561998e-06, "loss": 0.0136, "step": 6993 }, { "epoch": 0.9240017174753113, "grad_norm": 0.17025385797023773, "learning_rate": 2.841900770975514e-06, "loss": 0.0156, "step": 6994 }, { "epoch": 0.9241338309607954, "grad_norm": 0.1325269490480423, "learning_rate": 2.832071378139878e-06, "loss": 0.0145, "step": 6995 }, { "epoch": 0.9242659444462794, "grad_norm": 0.10881488770246506, "learning_rate": 2.8222587692471213e-06, "loss": 0.0095, "step": 6996 }, { "epoch": 0.9243980579317634, "grad_norm": 0.12182066589593887, "learning_rate": 2.8124629459922224e-06, "loss": 0.0161, "step": 6997 }, { "epoch": 0.9245301714172475, "grad_norm": 0.12361127883195877, "learning_rate": 2.8026839100672054e-06, "loss": 0.0143, "step": 6998 }, { "epoch": 0.9246622849027315, "grad_norm": 0.11965993791818619, "learning_rate": 2.7929216631612076e-06, "loss": 0.01, "step": 6999 }, { "epoch": 0.9247943983882154, "grad_norm": 0.2763892412185669, "learning_rate": 2.7831762069604915e-06, "loss": 0.0245, "step": 7000 }, { "epoch": 0.9249265118736995, "grad_norm": 0.29099127650260925, "learning_rate": 2.7734475431483995e-06, "loss": 0.017, "step": 7001 }, { "epoch": 0.9250586253591835, "grad_norm": 0.13824976980686188, "learning_rate": 2.7637356734053764e-06, "loss": 0.0145, "step": 7002 }, { "epoch": 0.9251907388446675, "grad_norm": 0.11890945583581924, "learning_rate": 2.754040599408947e-06, "loss": 0.0117, "step": 7003 }, { "epoch": 0.9253228523301515, "grad_norm": 0.20134928822517395, "learning_rate": 2.7443623228337822e-06, "loss": 0.0276, "step": 7004 }, { "epoch": 0.9254549658156356, "grad_norm": 0.0993884950876236, "learning_rate": 2.734700845351612e-06, "loss": 0.008, "step": 7005 }, { "epoch": 0.9255870793011196, "grad_norm": 0.09369389712810516, "learning_rate": 2.725056168631268e-06, "loss": 0.0066, "step": 7006 }, { "epoch": 0.9257191927866036, "grad_norm": 0.20960983633995056, "learning_rate": 2.7154282943386957e-06, "loss": 0.0244, "step": 7007 }, { "epoch": 0.9258513062720877, "grad_norm": 0.15154051780700684, "learning_rate": 2.7058172241369417e-06, "loss": 0.0113, "step": 7008 }, { "epoch": 0.9259834197575717, "grad_norm": 0.09699692577123642, "learning_rate": 2.696222959686112e-06, "loss": 0.0081, "step": 7009 }, { "epoch": 0.9261155332430557, "grad_norm": 0.2003384381532669, "learning_rate": 2.6866455026434477e-06, "loss": 0.0159, "step": 7010 }, { "epoch": 0.9262476467285398, "grad_norm": 0.1282658725976944, "learning_rate": 2.677084854663292e-06, "loss": 0.0147, "step": 7011 }, { "epoch": 0.9263797602140238, "grad_norm": 0.2649095356464386, "learning_rate": 2.667541017397057e-06, "loss": 0.0205, "step": 7012 }, { "epoch": 0.9265118736995078, "grad_norm": 0.1102382019162178, "learning_rate": 2.6580139924932355e-06, "loss": 0.0092, "step": 7013 }, { "epoch": 0.9266439871849919, "grad_norm": 0.2010749727487564, "learning_rate": 2.6485037815974778e-06, "loss": 0.0156, "step": 7014 }, { "epoch": 0.9267761006704759, "grad_norm": 0.38905027508735657, "learning_rate": 2.6390103863525028e-06, "loss": 0.0141, "step": 7015 }, { "epoch": 0.9269082141559599, "grad_norm": 0.18634909391403198, "learning_rate": 2.629533808398077e-06, "loss": 0.0177, "step": 7016 }, { "epoch": 0.927040327641444, "grad_norm": 0.13378973305225372, "learning_rate": 2.620074049371135e-06, "loss": 0.0137, "step": 7017 }, { "epoch": 0.927172441126928, "grad_norm": 0.16449542343616486, "learning_rate": 2.610631110905659e-06, "loss": 0.013, "step": 7018 }, { "epoch": 0.927304554612412, "grad_norm": 0.10755720734596252, "learning_rate": 2.6012049946327445e-06, "loss": 0.0114, "step": 7019 }, { "epoch": 0.927436668097896, "grad_norm": 0.11967146396636963, "learning_rate": 2.591795702180577e-06, "loss": 0.016, "step": 7020 }, { "epoch": 0.9275687815833801, "grad_norm": 0.18971772491931915, "learning_rate": 2.5824032351744244e-06, "loss": 0.0113, "step": 7021 }, { "epoch": 0.9277008950688641, "grad_norm": 0.13666923344135284, "learning_rate": 2.5730275952366877e-06, "loss": 0.0145, "step": 7022 }, { "epoch": 0.9278330085543481, "grad_norm": 0.21552050113677979, "learning_rate": 2.5636687839867947e-06, "loss": 0.0193, "step": 7023 }, { "epoch": 0.9279651220398322, "grad_norm": 0.08234214037656784, "learning_rate": 2.5543268030413405e-06, "loss": 0.0082, "step": 7024 }, { "epoch": 0.9280972355253162, "grad_norm": 0.2322162389755249, "learning_rate": 2.5450016540139566e-06, "loss": 0.0286, "step": 7025 }, { "epoch": 0.9282293490108002, "grad_norm": 0.1154404729604721, "learning_rate": 2.535693338515399e-06, "loss": 0.0069, "step": 7026 }, { "epoch": 0.9283614624962843, "grad_norm": 0.10801186412572861, "learning_rate": 2.526401858153493e-06, "loss": 0.0096, "step": 7027 }, { "epoch": 0.9284935759817683, "grad_norm": 0.13217969238758087, "learning_rate": 2.517127214533177e-06, "loss": 0.0185, "step": 7028 }, { "epoch": 0.9286256894672523, "grad_norm": 0.11214915663003922, "learning_rate": 2.5078694092564804e-06, "loss": 0.0086, "step": 7029 }, { "epoch": 0.9287578029527364, "grad_norm": 0.133506640791893, "learning_rate": 2.4986284439225015e-06, "loss": 0.0137, "step": 7030 }, { "epoch": 0.9288899164382204, "grad_norm": 0.1064433827996254, "learning_rate": 2.489404320127442e-06, "loss": 0.0161, "step": 7031 }, { "epoch": 0.9290220299237044, "grad_norm": 0.14859448373317719, "learning_rate": 2.4801970394646157e-06, "loss": 0.0124, "step": 7032 }, { "epoch": 0.9291541434091884, "grad_norm": 0.160023033618927, "learning_rate": 2.4710066035243838e-06, "loss": 0.0103, "step": 7033 }, { "epoch": 0.9292862568946725, "grad_norm": 0.1463107019662857, "learning_rate": 2.4618330138942437e-06, "loss": 0.017, "step": 7034 }, { "epoch": 0.9294183703801565, "grad_norm": 0.12404201179742813, "learning_rate": 2.4526762721587494e-06, "loss": 0.0149, "step": 7035 }, { "epoch": 0.9295504838656405, "grad_norm": 0.19135360419750214, "learning_rate": 2.4435363798995692e-06, "loss": 0.0201, "step": 7036 }, { "epoch": 0.9296825973511246, "grad_norm": 0.11712557822465897, "learning_rate": 2.434413338695429e-06, "loss": 0.0146, "step": 7037 }, { "epoch": 0.9298147108366086, "grad_norm": 0.09654711931943893, "learning_rate": 2.425307150122169e-06, "loss": 0.0065, "step": 7038 }, { "epoch": 0.9299468243220926, "grad_norm": 0.13926123082637787, "learning_rate": 2.4162178157527304e-06, "loss": 0.0128, "step": 7039 }, { "epoch": 0.9300789378075767, "grad_norm": 0.10085835307836533, "learning_rate": 2.407145337157113e-06, "loss": 0.009, "step": 7040 }, { "epoch": 0.9302110512930607, "grad_norm": 0.09652812778949738, "learning_rate": 2.3980897159024073e-06, "loss": 0.0087, "step": 7041 }, { "epoch": 0.9303431647785447, "grad_norm": 0.1405116319656372, "learning_rate": 2.389050953552818e-06, "loss": 0.0146, "step": 7042 }, { "epoch": 0.9304752782640288, "grad_norm": 0.21289518475532532, "learning_rate": 2.38002905166963e-06, "loss": 0.0181, "step": 7043 }, { "epoch": 0.9306073917495128, "grad_norm": 0.2870105504989624, "learning_rate": 2.371024011811185e-06, "loss": 0.0197, "step": 7044 }, { "epoch": 0.9307395052349968, "grad_norm": 0.14215752482414246, "learning_rate": 2.36203583553295e-06, "loss": 0.0142, "step": 7045 }, { "epoch": 0.9308716187204809, "grad_norm": 0.0909138098359108, "learning_rate": 2.3530645243874604e-06, "loss": 0.0137, "step": 7046 }, { "epoch": 0.9310037322059649, "grad_norm": 0.18109942972660065, "learning_rate": 2.344110079924344e-06, "loss": 0.0152, "step": 7047 }, { "epoch": 0.9311358456914489, "grad_norm": 0.10473036766052246, "learning_rate": 2.335172503690308e-06, "loss": 0.0075, "step": 7048 }, { "epoch": 0.931267959176933, "grad_norm": 0.18801212310791016, "learning_rate": 2.326251797229162e-06, "loss": 0.0266, "step": 7049 }, { "epoch": 0.931400072662417, "grad_norm": 0.2324409782886505, "learning_rate": 2.317347962081784e-06, "loss": 0.0188, "step": 7050 }, { "epoch": 0.931532186147901, "grad_norm": 0.18094132840633392, "learning_rate": 2.308460999786144e-06, "loss": 0.0155, "step": 7051 }, { "epoch": 0.931664299633385, "grad_norm": 0.2537223696708679, "learning_rate": 2.299590911877303e-06, "loss": 0.0144, "step": 7052 }, { "epoch": 0.9317964131188691, "grad_norm": 0.16453498601913452, "learning_rate": 2.290737699887402e-06, "loss": 0.0284, "step": 7053 }, { "epoch": 0.9319285266043531, "grad_norm": 0.15937440097332, "learning_rate": 2.281901365345662e-06, "loss": 0.0226, "step": 7054 }, { "epoch": 0.9320606400898371, "grad_norm": 0.14340832829475403, "learning_rate": 2.2730819097783964e-06, "loss": 0.0124, "step": 7055 }, { "epoch": 0.9321927535753212, "grad_norm": 0.09002848714590073, "learning_rate": 2.2642793347090075e-06, "loss": 0.0067, "step": 7056 }, { "epoch": 0.9323248670608052, "grad_norm": 0.12743428349494934, "learning_rate": 2.25549364165798e-06, "loss": 0.0136, "step": 7057 }, { "epoch": 0.9324569805462892, "grad_norm": 0.15904666483402252, "learning_rate": 2.246724832142866e-06, "loss": 0.0163, "step": 7058 }, { "epoch": 0.9325890940317733, "grad_norm": 0.14966551959514618, "learning_rate": 2.2379729076783096e-06, "loss": 0.0178, "step": 7059 }, { "epoch": 0.9327212075172573, "grad_norm": 0.11228898167610168, "learning_rate": 2.2292378697760683e-06, "loss": 0.0124, "step": 7060 }, { "epoch": 0.9328533210027413, "grad_norm": 0.30276328325271606, "learning_rate": 2.2205197199449248e-06, "loss": 0.0145, "step": 7061 }, { "epoch": 0.9329854344882254, "grad_norm": 0.15492114424705505, "learning_rate": 2.2118184596907845e-06, "loss": 0.0154, "step": 7062 }, { "epoch": 0.9331175479737094, "grad_norm": 0.19936561584472656, "learning_rate": 2.203134090516634e-06, "loss": 0.0213, "step": 7063 }, { "epoch": 0.9332496614591934, "grad_norm": 0.19477428495883942, "learning_rate": 2.194466613922552e-06, "loss": 0.0182, "step": 7064 }, { "epoch": 0.9333817749446774, "grad_norm": 0.22519803047180176, "learning_rate": 2.185816031405652e-06, "loss": 0.0234, "step": 7065 }, { "epoch": 0.9335138884301615, "grad_norm": 0.10661128163337708, "learning_rate": 2.1771823444601714e-06, "loss": 0.0099, "step": 7066 }, { "epoch": 0.9336460019156455, "grad_norm": 0.21245616674423218, "learning_rate": 2.16856555457744e-06, "loss": 0.0136, "step": 7067 }, { "epoch": 0.9337781154011295, "grad_norm": 0.1352900117635727, "learning_rate": 2.159965663245811e-06, "loss": 0.0127, "step": 7068 }, { "epoch": 0.9339102288866136, "grad_norm": 0.2113463431596756, "learning_rate": 2.1513826719507748e-06, "loss": 0.0263, "step": 7069 }, { "epoch": 0.9340423423720976, "grad_norm": 0.14445137977600098, "learning_rate": 2.142816582174878e-06, "loss": 0.0154, "step": 7070 }, { "epoch": 0.9341744558575816, "grad_norm": 0.2524077892303467, "learning_rate": 2.134267395397749e-06, "loss": 0.0174, "step": 7071 }, { "epoch": 0.9343065693430657, "grad_norm": 0.12157204002141953, "learning_rate": 2.1257351130961167e-06, "loss": 0.0117, "step": 7072 }, { "epoch": 0.9344386828285497, "grad_norm": 0.20304414629936218, "learning_rate": 2.1172197367437587e-06, "loss": 0.0198, "step": 7073 }, { "epoch": 0.9345707963140337, "grad_norm": 0.20831957459449768, "learning_rate": 2.1087212678115533e-06, "loss": 0.0214, "step": 7074 }, { "epoch": 0.9347029097995178, "grad_norm": 0.11911209672689438, "learning_rate": 2.1002397077674372e-06, "loss": 0.0111, "step": 7075 }, { "epoch": 0.9348350232850018, "grad_norm": 0.16563266515731812, "learning_rate": 2.0917750580764616e-06, "loss": 0.0216, "step": 7076 }, { "epoch": 0.9349671367704858, "grad_norm": 0.1556750386953354, "learning_rate": 2.083327320200734e-06, "loss": 0.0092, "step": 7077 }, { "epoch": 0.9350992502559698, "grad_norm": 0.17585864663124084, "learning_rate": 2.074896495599432e-06, "loss": 0.0206, "step": 7078 }, { "epoch": 0.9352313637414539, "grad_norm": 0.13946789503097534, "learning_rate": 2.066482585728824e-06, "loss": 0.015, "step": 7079 }, { "epoch": 0.9353634772269379, "grad_norm": 0.1683516651391983, "learning_rate": 2.058085592042269e-06, "loss": 0.0232, "step": 7080 }, { "epoch": 0.9354955907124219, "grad_norm": 0.16836416721343994, "learning_rate": 2.0497055159901746e-06, "loss": 0.0152, "step": 7081 }, { "epoch": 0.935627704197906, "grad_norm": 0.18335430324077606, "learning_rate": 2.0413423590200377e-06, "loss": 0.0251, "step": 7082 }, { "epoch": 0.93575981768339, "grad_norm": 0.12575763463974, "learning_rate": 2.0329961225764584e-06, "loss": 0.0109, "step": 7083 }, { "epoch": 0.935891931168874, "grad_norm": 0.09663445502519608, "learning_rate": 2.0246668081010944e-06, "loss": 0.0057, "step": 7084 }, { "epoch": 0.9360240446543581, "grad_norm": 0.2766578197479248, "learning_rate": 2.016354417032662e-06, "loss": 0.0127, "step": 7085 }, { "epoch": 0.9361561581398421, "grad_norm": 0.20240576565265656, "learning_rate": 2.008058950806968e-06, "loss": 0.0253, "step": 7086 }, { "epoch": 0.9362882716253261, "grad_norm": 0.12679430842399597, "learning_rate": 1.999780410856922e-06, "loss": 0.0108, "step": 7087 }, { "epoch": 0.9364203851108102, "grad_norm": 0.16213077306747437, "learning_rate": 1.9915187986124575e-06, "loss": 0.0087, "step": 7088 }, { "epoch": 0.9365524985962942, "grad_norm": 0.1392461359500885, "learning_rate": 1.983274115500633e-06, "loss": 0.0136, "step": 7089 }, { "epoch": 0.9366846120817782, "grad_norm": 0.15210972726345062, "learning_rate": 1.9750463629455653e-06, "loss": 0.0228, "step": 7090 }, { "epoch": 0.9368167255672623, "grad_norm": 0.14509055018424988, "learning_rate": 1.9668355423684504e-06, "loss": 0.0117, "step": 7091 }, { "epoch": 0.9369488390527463, "grad_norm": 0.11387002468109131, "learning_rate": 1.958641655187521e-06, "loss": 0.0167, "step": 7092 }, { "epoch": 0.9370809525382303, "grad_norm": 0.20874197781085968, "learning_rate": 1.9504647028181443e-06, "loss": 0.0209, "step": 7093 }, { "epoch": 0.9372130660237143, "grad_norm": 0.2966103255748749, "learning_rate": 1.942304686672747e-06, "loss": 0.0172, "step": 7094 }, { "epoch": 0.9373451795091984, "grad_norm": 0.20808520913124084, "learning_rate": 1.9341616081607894e-06, "loss": 0.0231, "step": 7095 }, { "epoch": 0.9374772929946824, "grad_norm": 0.19017285108566284, "learning_rate": 1.926035468688847e-06, "loss": 0.0158, "step": 7096 }, { "epoch": 0.9376094064801664, "grad_norm": 0.2927365303039551, "learning_rate": 1.917926269660575e-06, "loss": 0.0185, "step": 7097 }, { "epoch": 0.9377415199656505, "grad_norm": 0.2043789178133011, "learning_rate": 1.909834012476663e-06, "loss": 0.0167, "step": 7098 }, { "epoch": 0.9378736334511345, "grad_norm": 0.09357764571905136, "learning_rate": 1.9017586985349168e-06, "loss": 0.0057, "step": 7099 }, { "epoch": 0.9380057469366185, "grad_norm": 0.1728450506925583, "learning_rate": 1.8937003292301746e-06, "loss": 0.0167, "step": 7100 }, { "epoch": 0.9381378604221026, "grad_norm": 0.11996671557426453, "learning_rate": 1.8856589059543905e-06, "loss": 0.0126, "step": 7101 }, { "epoch": 0.9382699739075866, "grad_norm": 0.13449335098266602, "learning_rate": 1.877634430096553e-06, "loss": 0.0149, "step": 7102 }, { "epoch": 0.9384020873930706, "grad_norm": 0.6450221538543701, "learning_rate": 1.8696269030427427e-06, "loss": 0.0181, "step": 7103 }, { "epoch": 0.9385342008785547, "grad_norm": 0.23661655187606812, "learning_rate": 1.8616363261761195e-06, "loss": 0.0138, "step": 7104 }, { "epoch": 0.9386663143640387, "grad_norm": 0.1648440659046173, "learning_rate": 1.8536627008769014e-06, "loss": 0.0172, "step": 7105 }, { "epoch": 0.9387984278495227, "grad_norm": 0.11088185757398605, "learning_rate": 1.845706028522387e-06, "loss": 0.0163, "step": 7106 }, { "epoch": 0.9389305413350068, "grad_norm": 0.3062111735343933, "learning_rate": 1.8377663104869325e-06, "loss": 0.0222, "step": 7107 }, { "epoch": 0.9390626548204908, "grad_norm": 0.13327403366565704, "learning_rate": 1.8298435481419852e-06, "loss": 0.0121, "step": 7108 }, { "epoch": 0.9391947683059748, "grad_norm": 0.22051618993282318, "learning_rate": 1.8219377428560502e-06, "loss": 0.0215, "step": 7109 }, { "epoch": 0.9393268817914588, "grad_norm": 0.15959028899669647, "learning_rate": 1.8140488959947023e-06, "loss": 0.0171, "step": 7110 }, { "epoch": 0.9394589952769429, "grad_norm": 0.12574240565299988, "learning_rate": 1.8061770089206064e-06, "loss": 0.0117, "step": 7111 }, { "epoch": 0.9395911087624269, "grad_norm": 0.15989792346954346, "learning_rate": 1.7983220829934755e-06, "loss": 0.0167, "step": 7112 }, { "epoch": 0.9397232222479109, "grad_norm": 0.12491068989038467, "learning_rate": 1.7904841195701016e-06, "loss": 0.0071, "step": 7113 }, { "epoch": 0.939855335733395, "grad_norm": 0.18081387877464294, "learning_rate": 1.7826631200043353e-06, "loss": 0.016, "step": 7114 }, { "epoch": 0.939987449218879, "grad_norm": 0.19697430729866028, "learning_rate": 1.7748590856471402e-06, "loss": 0.0145, "step": 7115 }, { "epoch": 0.940119562704363, "grad_norm": 0.18398360908031464, "learning_rate": 1.7670720178464716e-06, "loss": 0.0117, "step": 7116 }, { "epoch": 0.9402516761898471, "grad_norm": 0.12235773354768753, "learning_rate": 1.7593019179474312e-06, "loss": 0.0156, "step": 7117 }, { "epoch": 0.9403837896753311, "grad_norm": 0.2906453013420105, "learning_rate": 1.7515487872921566e-06, "loss": 0.0212, "step": 7118 }, { "epoch": 0.9405159031608151, "grad_norm": 0.17571309208869934, "learning_rate": 1.743812627219854e-06, "loss": 0.012, "step": 7119 }, { "epoch": 0.9406480166462992, "grad_norm": 0.08763639628887177, "learning_rate": 1.7360934390667881e-06, "loss": 0.0128, "step": 7120 }, { "epoch": 0.9407801301317832, "grad_norm": 0.1028781607747078, "learning_rate": 1.728391224166326e-06, "loss": 0.0083, "step": 7121 }, { "epoch": 0.9409122436172672, "grad_norm": 0.16966675221920013, "learning_rate": 1.7207059838488582e-06, "loss": 0.0172, "step": 7122 }, { "epoch": 0.9410443571027512, "grad_norm": 0.17335392534732819, "learning_rate": 1.713037719441879e-06, "loss": 0.0103, "step": 7123 }, { "epoch": 0.9411764705882353, "grad_norm": 0.13844364881515503, "learning_rate": 1.7053864322699398e-06, "loss": 0.0145, "step": 7124 }, { "epoch": 0.9413085840737193, "grad_norm": 0.1319582760334015, "learning_rate": 1.69775212365465e-06, "loss": 0.0076, "step": 7125 }, { "epoch": 0.9414406975592033, "grad_norm": 0.12818774580955505, "learning_rate": 1.6901347949146996e-06, "loss": 0.014, "step": 7126 }, { "epoch": 0.9415728110446874, "grad_norm": 0.20910701155662537, "learning_rate": 1.6825344473658355e-06, "loss": 0.0234, "step": 7127 }, { "epoch": 0.9417049245301714, "grad_norm": 0.17381620407104492, "learning_rate": 1.6749510823208748e-06, "loss": 0.0236, "step": 7128 }, { "epoch": 0.9418370380156554, "grad_norm": 0.1289311945438385, "learning_rate": 1.6673847010897025e-06, "loss": 0.0159, "step": 7129 }, { "epoch": 0.9419691515011395, "grad_norm": 0.09960731118917465, "learning_rate": 1.6598353049792736e-06, "loss": 0.0044, "step": 7130 }, { "epoch": 0.9421012649866235, "grad_norm": 0.12561599910259247, "learning_rate": 1.6523028952936003e-06, "loss": 0.0155, "step": 7131 }, { "epoch": 0.9422333784721075, "grad_norm": 0.15394242107868195, "learning_rate": 1.6447874733337753e-06, "loss": 0.0131, "step": 7132 }, { "epoch": 0.9423654919575916, "grad_norm": 0.1523938775062561, "learning_rate": 1.637289040397938e-06, "loss": 0.0175, "step": 7133 }, { "epoch": 0.9424976054430756, "grad_norm": 0.18024346232414246, "learning_rate": 1.6298075977812966e-06, "loss": 0.0217, "step": 7134 }, { "epoch": 0.9426297189285596, "grad_norm": 0.22924868762493134, "learning_rate": 1.6223431467761396e-06, "loss": 0.0294, "step": 7135 }, { "epoch": 0.9427618324140437, "grad_norm": 0.13143298029899597, "learning_rate": 1.6148956886718246e-06, "loss": 0.0139, "step": 7136 }, { "epoch": 0.9428939458995277, "grad_norm": 0.12749572098255157, "learning_rate": 1.607465224754734e-06, "loss": 0.0143, "step": 7137 }, { "epoch": 0.9430260593850117, "grad_norm": 0.18467861413955688, "learning_rate": 1.6000517563083628e-06, "loss": 0.0176, "step": 7138 }, { "epoch": 0.9431581728704957, "grad_norm": 0.25325989723205566, "learning_rate": 1.5926552846132315e-06, "loss": 0.015, "step": 7139 }, { "epoch": 0.9432902863559798, "grad_norm": 0.2042577713727951, "learning_rate": 1.5852758109469623e-06, "loss": 0.0126, "step": 7140 }, { "epoch": 0.9434223998414638, "grad_norm": 0.10952811688184738, "learning_rate": 1.5779133365841915e-06, "loss": 0.0081, "step": 7141 }, { "epoch": 0.9435545133269478, "grad_norm": 0.13215503096580505, "learning_rate": 1.570567862796679e-06, "loss": 0.012, "step": 7142 }, { "epoch": 0.9436866268124319, "grad_norm": 0.13652601838111877, "learning_rate": 1.5632393908532106e-06, "loss": 0.0116, "step": 7143 }, { "epoch": 0.9438187402979159, "grad_norm": 0.3904765546321869, "learning_rate": 1.5559279220196288e-06, "loss": 0.0062, "step": 7144 }, { "epoch": 0.9439508537833999, "grad_norm": 0.13622334599494934, "learning_rate": 1.548633457558868e-06, "loss": 0.0093, "step": 7145 }, { "epoch": 0.944082967268884, "grad_norm": 0.13196682929992676, "learning_rate": 1.541355998730909e-06, "loss": 0.0114, "step": 7146 }, { "epoch": 0.944215080754368, "grad_norm": 0.14724312722682953, "learning_rate": 1.5340955467927909e-06, "loss": 0.0174, "step": 7147 }, { "epoch": 0.944347194239852, "grad_norm": 0.15435270965099335, "learning_rate": 1.5268521029986104e-06, "loss": 0.0167, "step": 7148 }, { "epoch": 0.9444793077253361, "grad_norm": 0.15787175297737122, "learning_rate": 1.5196256685995557e-06, "loss": 0.011, "step": 7149 }, { "epoch": 0.9446114212108201, "grad_norm": 0.14526408910751343, "learning_rate": 1.5124162448438728e-06, "loss": 0.0161, "step": 7150 }, { "epoch": 0.9447435346963041, "grad_norm": 0.23147417604923248, "learning_rate": 1.5052238329768099e-06, "loss": 0.0166, "step": 7151 }, { "epoch": 0.9448756481817882, "grad_norm": 0.07740113884210587, "learning_rate": 1.4980484342407507e-06, "loss": 0.0056, "step": 7152 }, { "epoch": 0.9450077616672722, "grad_norm": 0.14792416989803314, "learning_rate": 1.4908900498751155e-06, "loss": 0.0118, "step": 7153 }, { "epoch": 0.9451398751527562, "grad_norm": 0.1700582653284073, "learning_rate": 1.4837486811163704e-06, "loss": 0.0117, "step": 7154 }, { "epoch": 0.9452719886382402, "grad_norm": 0.33778321743011475, "learning_rate": 1.4766243291980507e-06, "loss": 0.015, "step": 7155 }, { "epoch": 0.9454041021237243, "grad_norm": 0.30752959847450256, "learning_rate": 1.4695169953507614e-06, "loss": 0.012, "step": 7156 }, { "epoch": 0.9455362156092083, "grad_norm": 0.15177850425243378, "learning_rate": 1.4624266808021647e-06, "loss": 0.0217, "step": 7157 }, { "epoch": 0.9456683290946923, "grad_norm": 0.22389481961727142, "learning_rate": 1.4553533867769697e-06, "loss": 0.0226, "step": 7158 }, { "epoch": 0.9458004425801764, "grad_norm": 0.21998845040798187, "learning_rate": 1.4482971144969547e-06, "loss": 0.0143, "step": 7159 }, { "epoch": 0.9459325560656604, "grad_norm": 0.16826555132865906, "learning_rate": 1.4412578651809894e-06, "loss": 0.0191, "step": 7160 }, { "epoch": 0.9460646695511444, "grad_norm": 0.122434601187706, "learning_rate": 1.434235640044923e-06, "loss": 0.0108, "step": 7161 }, { "epoch": 0.9461967830366285, "grad_norm": 0.13651913404464722, "learning_rate": 1.4272304403017523e-06, "loss": 0.0163, "step": 7162 }, { "epoch": 0.9463288965221125, "grad_norm": 0.20436081290245056, "learning_rate": 1.4202422671614647e-06, "loss": 0.0211, "step": 7163 }, { "epoch": 0.9464610100075965, "grad_norm": 0.25356408953666687, "learning_rate": 1.4132711218311723e-06, "loss": 0.023, "step": 7164 }, { "epoch": 0.9465931234930806, "grad_norm": 0.23628415167331696, "learning_rate": 1.4063170055149788e-06, "loss": 0.0172, "step": 7165 }, { "epoch": 0.9467252369785646, "grad_norm": 0.16951411962509155, "learning_rate": 1.399379919414101e-06, "loss": 0.0148, "step": 7166 }, { "epoch": 0.9468573504640486, "grad_norm": 0.11124689131975174, "learning_rate": 1.3924598647267694e-06, "loss": 0.013, "step": 7167 }, { "epoch": 0.9469894639495327, "grad_norm": 0.1717250794172287, "learning_rate": 1.3855568426483057e-06, "loss": 0.0145, "step": 7168 }, { "epoch": 0.9471215774350167, "grad_norm": 0.16048182547092438, "learning_rate": 1.3786708543710781e-06, "loss": 0.0175, "step": 7169 }, { "epoch": 0.9472536909205007, "grad_norm": 0.09600821882486343, "learning_rate": 1.3718019010845129e-06, "loss": 0.0054, "step": 7170 }, { "epoch": 0.9473858044059847, "grad_norm": 0.09572196006774902, "learning_rate": 1.3649499839750946e-06, "loss": 0.0147, "step": 7171 }, { "epoch": 0.9475179178914688, "grad_norm": 0.08443471044301987, "learning_rate": 1.358115104226343e-06, "loss": 0.0039, "step": 7172 }, { "epoch": 0.9476500313769528, "grad_norm": 0.15685054659843445, "learning_rate": 1.3512972630188914e-06, "loss": 0.0158, "step": 7173 }, { "epoch": 0.9477821448624368, "grad_norm": 0.16958138346672058, "learning_rate": 1.3444964615303646e-06, "loss": 0.0189, "step": 7174 }, { "epoch": 0.9479142583479209, "grad_norm": 0.08407772332429886, "learning_rate": 1.3377127009354895e-06, "loss": 0.0086, "step": 7175 }, { "epoch": 0.9480463718334049, "grad_norm": 0.16646800935268402, "learning_rate": 1.3309459824060288e-06, "loss": 0.012, "step": 7176 }, { "epoch": 0.9481784853188889, "grad_norm": 0.08961289376020432, "learning_rate": 1.3241963071108031e-06, "loss": 0.0064, "step": 7177 }, { "epoch": 0.948310598804373, "grad_norm": 0.16918633878231049, "learning_rate": 1.3174636762157133e-06, "loss": 0.0162, "step": 7178 }, { "epoch": 0.948442712289857, "grad_norm": 0.32409965991973877, "learning_rate": 1.3107480908836622e-06, "loss": 0.0322, "step": 7179 }, { "epoch": 0.948574825775341, "grad_norm": 0.15334896743297577, "learning_rate": 1.3040495522746664e-06, "loss": 0.0117, "step": 7180 }, { "epoch": 0.948706939260825, "grad_norm": 0.12855543196201324, "learning_rate": 1.2973680615457672e-06, "loss": 0.0073, "step": 7181 }, { "epoch": 0.9488390527463091, "grad_norm": 0.11828119307756424, "learning_rate": 1.2907036198510636e-06, "loss": 0.0116, "step": 7182 }, { "epoch": 0.9489711662317931, "grad_norm": 0.14522184431552887, "learning_rate": 1.2840562283417013e-06, "loss": 0.0156, "step": 7183 }, { "epoch": 0.9491032797172771, "grad_norm": 0.2941734194755554, "learning_rate": 1.2774258881659174e-06, "loss": 0.0224, "step": 7184 }, { "epoch": 0.9492353932027612, "grad_norm": 0.19937515258789062, "learning_rate": 1.2708126004689735e-06, "loss": 0.0257, "step": 7185 }, { "epoch": 0.9493675066882452, "grad_norm": 0.1287490576505661, "learning_rate": 1.2642163663931895e-06, "loss": 0.0152, "step": 7186 }, { "epoch": 0.9494996201737292, "grad_norm": 0.22921793162822723, "learning_rate": 1.2576371870779202e-06, "loss": 0.0188, "step": 7187 }, { "epoch": 0.9496317336592133, "grad_norm": 0.289598286151886, "learning_rate": 1.2510750636596346e-06, "loss": 0.0096, "step": 7188 }, { "epoch": 0.9497638471446973, "grad_norm": 0.12793676555156708, "learning_rate": 1.2445299972717817e-06, "loss": 0.0156, "step": 7189 }, { "epoch": 0.9498959606301813, "grad_norm": 0.10357484966516495, "learning_rate": 1.2380019890449124e-06, "loss": 0.0083, "step": 7190 }, { "epoch": 0.9500280741156654, "grad_norm": 0.18829475343227386, "learning_rate": 1.231491040106636e-06, "loss": 0.0146, "step": 7191 }, { "epoch": 0.9501601876011494, "grad_norm": 0.25576236844062805, "learning_rate": 1.224997151581564e-06, "loss": 0.0165, "step": 7192 }, { "epoch": 0.9502923010866334, "grad_norm": 0.15136855840682983, "learning_rate": 1.2185203245914212e-06, "loss": 0.0171, "step": 7193 }, { "epoch": 0.9504244145721175, "grad_norm": 0.13044820725917816, "learning_rate": 1.2120605602549462e-06, "loss": 0.0137, "step": 7194 }, { "epoch": 0.9505565280576015, "grad_norm": 0.1385304480791092, "learning_rate": 1.2056178596879352e-06, "loss": 0.0137, "step": 7195 }, { "epoch": 0.9506886415430855, "grad_norm": 0.13936810195446014, "learning_rate": 1.199192224003265e-06, "loss": 0.0106, "step": 7196 }, { "epoch": 0.9508207550285696, "grad_norm": 0.19669145345687866, "learning_rate": 1.1927836543108251e-06, "loss": 0.0145, "step": 7197 }, { "epoch": 0.9509528685140536, "grad_norm": 0.4017046391963959, "learning_rate": 1.1863921517175968e-06, "loss": 0.0211, "step": 7198 }, { "epoch": 0.9510849819995376, "grad_norm": 0.21765701472759247, "learning_rate": 1.1800177173275639e-06, "loss": 0.0172, "step": 7199 }, { "epoch": 0.9512170954850216, "grad_norm": 0.22565911710262299, "learning_rate": 1.173660352241812e-06, "loss": 0.0187, "step": 7200 }, { "epoch": 0.9513492089705057, "grad_norm": 0.33037158846855164, "learning_rate": 1.167320057558452e-06, "loss": 0.0255, "step": 7201 }, { "epoch": 0.9514813224559897, "grad_norm": 0.16202698647975922, "learning_rate": 1.1609968343726519e-06, "loss": 0.0137, "step": 7202 }, { "epoch": 0.9516134359414737, "grad_norm": 0.10075455904006958, "learning_rate": 1.1546906837766268e-06, "loss": 0.0079, "step": 7203 }, { "epoch": 0.9517455494269578, "grad_norm": 0.24736060202121735, "learning_rate": 1.1484016068596393e-06, "loss": 0.0266, "step": 7204 }, { "epoch": 0.9518776629124418, "grad_norm": 0.11703985184431076, "learning_rate": 1.1421296047080421e-06, "loss": 0.0125, "step": 7205 }, { "epoch": 0.9520097763979258, "grad_norm": 0.1756127029657364, "learning_rate": 1.1358746784051687e-06, "loss": 0.0136, "step": 7206 }, { "epoch": 0.9521418898834099, "grad_norm": 0.10668937116861343, "learning_rate": 1.129636829031444e-06, "loss": 0.013, "step": 7207 }, { "epoch": 0.9522740033688939, "grad_norm": 0.11571614444255829, "learning_rate": 1.1234160576643726e-06, "loss": 0.0141, "step": 7208 }, { "epoch": 0.9524061168543779, "grad_norm": 0.12110844999551773, "learning_rate": 1.1172123653784394e-06, "loss": 0.012, "step": 7209 }, { "epoch": 0.952538230339862, "grad_norm": 0.1357802003622055, "learning_rate": 1.111025753245243e-06, "loss": 0.0107, "step": 7210 }, { "epoch": 0.952670343825346, "grad_norm": 0.1572360098361969, "learning_rate": 1.1048562223333835e-06, "loss": 0.0132, "step": 7211 }, { "epoch": 0.95280245731083, "grad_norm": 0.28157058358192444, "learning_rate": 1.0987037737085536e-06, "loss": 0.0285, "step": 7212 }, { "epoch": 0.952934570796314, "grad_norm": 0.18760952353477478, "learning_rate": 1.0925684084334476e-06, "loss": 0.0232, "step": 7213 }, { "epoch": 0.9530666842817981, "grad_norm": 0.15009918808937073, "learning_rate": 1.0864501275678618e-06, "loss": 0.0149, "step": 7214 }, { "epoch": 0.9531987977672821, "grad_norm": 0.2483643740415573, "learning_rate": 1.0803489321685957e-06, "loss": 0.0253, "step": 7215 }, { "epoch": 0.9533309112527661, "grad_norm": 0.1848488599061966, "learning_rate": 1.074264823289528e-06, "loss": 0.0155, "step": 7216 }, { "epoch": 0.9534630247382502, "grad_norm": 0.14977069199085236, "learning_rate": 1.0681978019815741e-06, "loss": 0.0173, "step": 7217 }, { "epoch": 0.9535951382237342, "grad_norm": 0.1266462504863739, "learning_rate": 1.0621478692926845e-06, "loss": 0.0125, "step": 7218 }, { "epoch": 0.9537272517092182, "grad_norm": 0.20076677203178406, "learning_rate": 1.0561150262678899e-06, "loss": 0.0125, "step": 7219 }, { "epoch": 0.9538593651947023, "grad_norm": 0.13278064131736755, "learning_rate": 1.0500992739492454e-06, "loss": 0.0137, "step": 7220 }, { "epoch": 0.9539914786801863, "grad_norm": 0.10803290456533432, "learning_rate": 1.0441006133758536e-06, "loss": 0.0094, "step": 7221 }, { "epoch": 0.9541235921656703, "grad_norm": 0.11469370126724243, "learning_rate": 1.0381190455838852e-06, "loss": 0.0134, "step": 7222 }, { "epoch": 0.9542557056511544, "grad_norm": 0.13785609602928162, "learning_rate": 1.032154571606525e-06, "loss": 0.0166, "step": 7223 }, { "epoch": 0.9543878191366384, "grad_norm": 0.15190643072128296, "learning_rate": 1.0262071924740268e-06, "loss": 0.0152, "step": 7224 }, { "epoch": 0.9545199326221224, "grad_norm": 0.1632893681526184, "learning_rate": 1.0202769092137022e-06, "loss": 0.0179, "step": 7225 }, { "epoch": 0.9546520461076065, "grad_norm": 0.1495528668165207, "learning_rate": 1.0143637228498981e-06, "loss": 0.0126, "step": 7226 }, { "epoch": 0.9547841595930905, "grad_norm": 0.15436944365501404, "learning_rate": 1.0084676344039977e-06, "loss": 0.0095, "step": 7227 }, { "epoch": 0.9549162730785745, "grad_norm": 0.1181582435965538, "learning_rate": 1.002588644894431e-06, "loss": 0.0157, "step": 7228 }, { "epoch": 0.9550483865640585, "grad_norm": 0.09800876677036285, "learning_rate": 9.967267553367078e-07, "loss": 0.0116, "step": 7229 }, { "epoch": 0.9551805000495426, "grad_norm": 0.21567225456237793, "learning_rate": 9.9088196674334e-07, "loss": 0.0196, "step": 7230 }, { "epoch": 0.9553126135350266, "grad_norm": 0.13500306010246277, "learning_rate": 9.850542801239093e-07, "loss": 0.0104, "step": 7231 }, { "epoch": 0.9554447270205106, "grad_norm": 0.1280946284532547, "learning_rate": 9.792436964850439e-07, "loss": 0.0076, "step": 7232 }, { "epoch": 0.9555768405059947, "grad_norm": 0.11754422634840012, "learning_rate": 9.734502168304183e-07, "loss": 0.0148, "step": 7233 }, { "epoch": 0.9557089539914787, "grad_norm": 0.14567334949970245, "learning_rate": 9.676738421607434e-07, "loss": 0.0134, "step": 7234 }, { "epoch": 0.9558410674769627, "grad_norm": 0.11744000762701035, "learning_rate": 9.619145734737655e-07, "loss": 0.0132, "step": 7235 }, { "epoch": 0.9559731809624468, "grad_norm": 0.26282408833503723, "learning_rate": 9.56172411764311e-07, "loss": 0.0333, "step": 7236 }, { "epoch": 0.9561052944479308, "grad_norm": 0.11459115147590637, "learning_rate": 9.504473580242202e-07, "loss": 0.0089, "step": 7237 }, { "epoch": 0.9562374079334148, "grad_norm": 0.09463014453649521, "learning_rate": 9.447394132423903e-07, "loss": 0.0066, "step": 7238 }, { "epoch": 0.9563695214188989, "grad_norm": 0.14389050006866455, "learning_rate": 9.390485784047664e-07, "loss": 0.0131, "step": 7239 }, { "epoch": 0.9565016349043829, "grad_norm": 0.12472982704639435, "learning_rate": 9.333748544943288e-07, "loss": 0.0179, "step": 7240 }, { "epoch": 0.9566337483898669, "grad_norm": 0.170174241065979, "learning_rate": 9.277182424911158e-07, "loss": 0.0173, "step": 7241 }, { "epoch": 0.956765861875351, "grad_norm": 0.19937077164649963, "learning_rate": 9.2207874337219e-07, "loss": 0.0218, "step": 7242 }, { "epoch": 0.956897975360835, "grad_norm": 0.45541685819625854, "learning_rate": 9.164563581116725e-07, "loss": 0.0117, "step": 7243 }, { "epoch": 0.957030088846319, "grad_norm": 0.15659266710281372, "learning_rate": 9.108510876807308e-07, "loss": 0.0149, "step": 7244 }, { "epoch": 0.957162202331803, "grad_norm": 0.11466649919748306, "learning_rate": 9.052629330475681e-07, "loss": 0.0092, "step": 7245 }, { "epoch": 0.9572943158172871, "grad_norm": 0.1386268436908722, "learning_rate": 8.996918951774236e-07, "loss": 0.0136, "step": 7246 }, { "epoch": 0.9574264293027711, "grad_norm": 0.1966053545475006, "learning_rate": 8.941379750326051e-07, "loss": 0.0187, "step": 7247 }, { "epoch": 0.9575585427882551, "grad_norm": 0.15060442686080933, "learning_rate": 8.886011735724453e-07, "loss": 0.0153, "step": 7248 }, { "epoch": 0.9576906562737392, "grad_norm": 0.1787412166595459, "learning_rate": 8.830814917533125e-07, "loss": 0.0172, "step": 7249 }, { "epoch": 0.9578227697592232, "grad_norm": 0.14520373940467834, "learning_rate": 8.775789305286442e-07, "loss": 0.0098, "step": 7250 }, { "epoch": 0.9579548832447072, "grad_norm": 0.165212482213974, "learning_rate": 8.720934908488909e-07, "loss": 0.0159, "step": 7251 }, { "epoch": 0.9580869967301913, "grad_norm": 0.12482784688472748, "learning_rate": 8.666251736615616e-07, "loss": 0.0045, "step": 7252 }, { "epoch": 0.9582191102156753, "grad_norm": 0.1122523620724678, "learning_rate": 8.611739799112229e-07, "loss": 0.011, "step": 7253 }, { "epoch": 0.9583512237011593, "grad_norm": 0.10343725234270096, "learning_rate": 8.557399105394437e-07, "loss": 0.0084, "step": 7254 }, { "epoch": 0.9584833371866434, "grad_norm": 0.10364764928817749, "learning_rate": 8.503229664848733e-07, "loss": 0.0088, "step": 7255 }, { "epoch": 0.9586154506721274, "grad_norm": 0.19793720543384552, "learning_rate": 8.449231486831744e-07, "loss": 0.0213, "step": 7256 }, { "epoch": 0.9587475641576114, "grad_norm": 0.1351151317358017, "learning_rate": 8.395404580670785e-07, "loss": 0.0112, "step": 7257 }, { "epoch": 0.9588796776430955, "grad_norm": 0.16011805832386017, "learning_rate": 8.34174895566342e-07, "loss": 0.0127, "step": 7258 }, { "epoch": 0.9590117911285795, "grad_norm": 0.16073232889175415, "learning_rate": 8.288264621077457e-07, "loss": 0.0163, "step": 7259 }, { "epoch": 0.9591439046140635, "grad_norm": 0.14454664289951324, "learning_rate": 8.234951586151729e-07, "loss": 0.0123, "step": 7260 }, { "epoch": 0.9592760180995475, "grad_norm": 0.12177897244691849, "learning_rate": 8.181809860094647e-07, "loss": 0.0133, "step": 7261 }, { "epoch": 0.9594081315850316, "grad_norm": 0.13024061918258667, "learning_rate": 8.128839452085535e-07, "loss": 0.0152, "step": 7262 }, { "epoch": 0.9595402450705156, "grad_norm": 0.13621719181537628, "learning_rate": 8.076040371274296e-07, "loss": 0.0227, "step": 7263 }, { "epoch": 0.9596723585559996, "grad_norm": 0.13376359641551971, "learning_rate": 8.023412626780746e-07, "loss": 0.0107, "step": 7264 }, { "epoch": 0.9598044720414837, "grad_norm": 0.12808959186077118, "learning_rate": 7.970956227695392e-07, "loss": 0.0117, "step": 7265 }, { "epoch": 0.9599365855269677, "grad_norm": 0.2006939947605133, "learning_rate": 7.918671183079096e-07, "loss": 0.0215, "step": 7266 }, { "epoch": 0.9600686990124517, "grad_norm": 0.08638248592615128, "learning_rate": 7.86655750196319e-07, "loss": 0.0047, "step": 7267 }, { "epoch": 0.9602008124979358, "grad_norm": 0.10825937241315842, "learning_rate": 7.814615193349251e-07, "loss": 0.0102, "step": 7268 }, { "epoch": 0.9603329259834198, "grad_norm": 0.1509263962507248, "learning_rate": 7.762844266209435e-07, "loss": 0.0131, "step": 7269 }, { "epoch": 0.9604650394689038, "grad_norm": 0.0950889065861702, "learning_rate": 7.711244729486034e-07, "loss": 0.0109, "step": 7270 }, { "epoch": 0.9605971529543879, "grad_norm": 0.10586385428905487, "learning_rate": 7.65981659209214e-07, "loss": 0.0131, "step": 7271 }, { "epoch": 0.9607292664398719, "grad_norm": 0.13581392168998718, "learning_rate": 7.608559862910758e-07, "loss": 0.0134, "step": 7272 }, { "epoch": 0.9608613799253559, "grad_norm": 0.1681486815214157, "learning_rate": 7.557474550795695e-07, "loss": 0.0215, "step": 7273 }, { "epoch": 0.96099349341084, "grad_norm": 0.09689151495695114, "learning_rate": 7.506560664571005e-07, "loss": 0.006, "step": 7274 }, { "epoch": 0.961125606896324, "grad_norm": 0.11082078516483307, "learning_rate": 7.455818213030985e-07, "loss": 0.0089, "step": 7275 }, { "epoch": 0.961257720381808, "grad_norm": 0.18737493455410004, "learning_rate": 7.405247204940513e-07, "loss": 0.0209, "step": 7276 }, { "epoch": 0.961389833867292, "grad_norm": 0.18590763211250305, "learning_rate": 7.354847649034713e-07, "loss": 0.0171, "step": 7277 }, { "epoch": 0.9615219473527761, "grad_norm": 0.2514604926109314, "learning_rate": 7.304619554019288e-07, "loss": 0.0095, "step": 7278 }, { "epoch": 0.9616540608382601, "grad_norm": 0.10440555959939957, "learning_rate": 7.254562928570074e-07, "loss": 0.0129, "step": 7279 }, { "epoch": 0.9617861743237441, "grad_norm": 0.13339920341968536, "learning_rate": 7.204677781333602e-07, "loss": 0.0116, "step": 7280 }, { "epoch": 0.9619182878092282, "grad_norm": 0.19956082105636597, "learning_rate": 7.154964120926422e-07, "loss": 0.0078, "step": 7281 }, { "epoch": 0.9620504012947122, "grad_norm": 0.1739201694726944, "learning_rate": 7.105421955935665e-07, "loss": 0.0224, "step": 7282 }, { "epoch": 0.9621825147801962, "grad_norm": 0.25007346272468567, "learning_rate": 7.056051294918819e-07, "loss": 0.0255, "step": 7283 }, { "epoch": 0.9623146282656803, "grad_norm": 0.1323181390762329, "learning_rate": 7.006852146403842e-07, "loss": 0.0201, "step": 7284 }, { "epoch": 0.9624467417511643, "grad_norm": 0.13154102861881256, "learning_rate": 6.957824518888822e-07, "loss": 0.0172, "step": 7285 }, { "epoch": 0.9625788552366483, "grad_norm": 0.15256892144680023, "learning_rate": 6.908968420842433e-07, "loss": 0.0268, "step": 7286 }, { "epoch": 0.9627109687221324, "grad_norm": 0.17247919738292694, "learning_rate": 6.860283860703698e-07, "loss": 0.0081, "step": 7287 }, { "epoch": 0.9628430822076164, "grad_norm": 0.22937926650047302, "learning_rate": 6.811770846882004e-07, "loss": 0.0141, "step": 7288 }, { "epoch": 0.9629751956931004, "grad_norm": 0.14299964904785156, "learning_rate": 6.76342938775687e-07, "loss": 0.0107, "step": 7289 }, { "epoch": 0.9631073091785844, "grad_norm": 0.156651571393013, "learning_rate": 6.715259491678505e-07, "loss": 0.0064, "step": 7290 }, { "epoch": 0.9632394226640685, "grad_norm": 0.12845245003700256, "learning_rate": 6.667261166967365e-07, "loss": 0.0125, "step": 7291 }, { "epoch": 0.9633715361495525, "grad_norm": 0.13767504692077637, "learning_rate": 6.619434421914262e-07, "loss": 0.0107, "step": 7292 }, { "epoch": 0.9635036496350365, "grad_norm": 0.14422591030597687, "learning_rate": 6.571779264780364e-07, "loss": 0.0162, "step": 7293 }, { "epoch": 0.9636357631205206, "grad_norm": 0.178952157497406, "learning_rate": 6.524295703797201e-07, "loss": 0.0137, "step": 7294 }, { "epoch": 0.9637678766060046, "grad_norm": 0.1274910867214203, "learning_rate": 6.476983747166654e-07, "loss": 0.0126, "step": 7295 }, { "epoch": 0.9638999900914886, "grad_norm": 0.07097754627466202, "learning_rate": 6.429843403060964e-07, "loss": 0.0037, "step": 7296 }, { "epoch": 0.9640321035769727, "grad_norm": 0.12936192750930786, "learning_rate": 6.382874679622841e-07, "loss": 0.0225, "step": 7297 }, { "epoch": 0.9641642170624567, "grad_norm": 0.11343681067228317, "learning_rate": 6.336077584965128e-07, "loss": 0.0085, "step": 7298 }, { "epoch": 0.9642963305479407, "grad_norm": 0.15607260167598724, "learning_rate": 6.289452127171247e-07, "loss": 0.0163, "step": 7299 }, { "epoch": 0.9644284440334248, "grad_norm": 0.13873155415058136, "learning_rate": 6.242998314294757e-07, "loss": 0.0114, "step": 7300 }, { "epoch": 0.9645605575189088, "grad_norm": 0.14331649243831635, "learning_rate": 6.196716154359794e-07, "loss": 0.0181, "step": 7301 }, { "epoch": 0.9646926710043928, "grad_norm": 0.18304325640201569, "learning_rate": 6.150605655360853e-07, "loss": 0.0147, "step": 7302 }, { "epoch": 0.9648247844898769, "grad_norm": 0.137273371219635, "learning_rate": 6.10466682526234e-07, "loss": 0.014, "step": 7303 }, { "epoch": 0.9649568979753609, "grad_norm": 0.13832210004329681, "learning_rate": 6.058899671999574e-07, "loss": 0.0137, "step": 7304 }, { "epoch": 0.9650890114608449, "grad_norm": 0.19359838962554932, "learning_rate": 6.013304203477788e-07, "loss": 0.0155, "step": 7305 }, { "epoch": 0.965221124946329, "grad_norm": 0.2560751736164093, "learning_rate": 5.967880427573014e-07, "loss": 0.0218, "step": 7306 }, { "epoch": 0.965353238431813, "grad_norm": 0.33832573890686035, "learning_rate": 5.922628352131087e-07, "loss": 0.0129, "step": 7307 }, { "epoch": 0.965485351917297, "grad_norm": 0.213003471493721, "learning_rate": 5.877547984968646e-07, "loss": 0.0139, "step": 7308 }, { "epoch": 0.965617465402781, "grad_norm": 0.3071267008781433, "learning_rate": 5.832639333872458e-07, "loss": 0.0176, "step": 7309 }, { "epoch": 0.9657495788882651, "grad_norm": 0.2311631441116333, "learning_rate": 5.787902406599543e-07, "loss": 0.0189, "step": 7310 }, { "epoch": 0.9658816923737491, "grad_norm": 0.10062497109174728, "learning_rate": 5.743337210877386e-07, "loss": 0.0114, "step": 7311 }, { "epoch": 0.9660138058592331, "grad_norm": 0.18390169739723206, "learning_rate": 5.69894375440394e-07, "loss": 0.0147, "step": 7312 }, { "epoch": 0.9661459193447172, "grad_norm": 0.17044439911842346, "learning_rate": 5.654722044847183e-07, "loss": 0.0166, "step": 7313 }, { "epoch": 0.9662780328302012, "grad_norm": 0.0919048935174942, "learning_rate": 5.61067208984567e-07, "loss": 0.0084, "step": 7314 }, { "epoch": 0.9664101463156852, "grad_norm": 0.14148785173892975, "learning_rate": 5.566793897008204e-07, "loss": 0.0136, "step": 7315 }, { "epoch": 0.9665422598011693, "grad_norm": 0.2157713621854782, "learning_rate": 5.523087473913835e-07, "loss": 0.0137, "step": 7316 }, { "epoch": 0.9666743732866533, "grad_norm": 0.24128608405590057, "learning_rate": 5.479552828112189e-07, "loss": 0.0228, "step": 7317 }, { "epoch": 0.9668064867721373, "grad_norm": 0.12320316582918167, "learning_rate": 5.436189967122918e-07, "loss": 0.0117, "step": 7318 }, { "epoch": 0.9669386002576214, "grad_norm": 0.18456970155239105, "learning_rate": 5.392998898436252e-07, "loss": 0.0225, "step": 7319 }, { "epoch": 0.9670707137431054, "grad_norm": 0.14801354706287384, "learning_rate": 5.349979629512448e-07, "loss": 0.018, "step": 7320 }, { "epoch": 0.9672028272285894, "grad_norm": 0.13997134566307068, "learning_rate": 5.307132167782558e-07, "loss": 0.0202, "step": 7321 }, { "epoch": 0.9673349407140734, "grad_norm": 0.22053630650043488, "learning_rate": 5.264456520647554e-07, "loss": 0.0282, "step": 7322 }, { "epoch": 0.9674670541995575, "grad_norm": 0.1400284618139267, "learning_rate": 5.22195269547876e-07, "loss": 0.016, "step": 7323 }, { "epoch": 0.9675991676850415, "grad_norm": 0.18816642463207245, "learning_rate": 5.179620699617971e-07, "loss": 0.0157, "step": 7324 }, { "epoch": 0.9677312811705255, "grad_norm": 0.14045368134975433, "learning_rate": 5.137460540377337e-07, "loss": 0.0114, "step": 7325 }, { "epoch": 0.9678633946560095, "grad_norm": 0.1565784364938736, "learning_rate": 5.095472225039255e-07, "loss": 0.0185, "step": 7326 }, { "epoch": 0.9679955081414935, "grad_norm": 0.16238823533058167, "learning_rate": 5.053655760856257e-07, "loss": 0.0144, "step": 7327 }, { "epoch": 0.9681276216269775, "grad_norm": 0.17575566470623016, "learning_rate": 5.012011155051454e-07, "loss": 0.0154, "step": 7328 }, { "epoch": 0.9682597351124616, "grad_norm": 0.14301219582557678, "learning_rate": 4.970538414818204e-07, "loss": 0.0195, "step": 7329 }, { "epoch": 0.9683918485979456, "grad_norm": 0.07853206992149353, "learning_rate": 4.929237547320109e-07, "loss": 0.0056, "step": 7330 }, { "epoch": 0.9685239620834296, "grad_norm": 0.19317726790905, "learning_rate": 4.888108559691018e-07, "loss": 0.0185, "step": 7331 }, { "epoch": 0.9686560755689136, "grad_norm": 0.20411786437034607, "learning_rate": 4.84715145903536e-07, "loss": 0.0211, "step": 7332 }, { "epoch": 0.9687881890543977, "grad_norm": 0.17225292325019836, "learning_rate": 4.806366252427697e-07, "loss": 0.0199, "step": 7333 }, { "epoch": 0.9689203025398817, "grad_norm": 0.10041002184152603, "learning_rate": 4.7657529469128384e-07, "loss": 0.0094, "step": 7334 }, { "epoch": 0.9690524160253657, "grad_norm": 0.16155697405338287, "learning_rate": 4.725311549505951e-07, "loss": 0.0192, "step": 7335 }, { "epoch": 0.9691845295108498, "grad_norm": 0.10414627939462662, "learning_rate": 4.6850420671925575e-07, "loss": 0.013, "step": 7336 }, { "epoch": 0.9693166429963338, "grad_norm": 0.10860517621040344, "learning_rate": 4.644944506928539e-07, "loss": 0.0081, "step": 7337 }, { "epoch": 0.9694487564818178, "grad_norm": 0.12890401482582092, "learning_rate": 4.6050188756397994e-07, "loss": 0.0102, "step": 7338 }, { "epoch": 0.9695808699673019, "grad_norm": 0.21085602045059204, "learning_rate": 4.565265180223044e-07, "loss": 0.0153, "step": 7339 }, { "epoch": 0.9697129834527859, "grad_norm": 0.1475398987531662, "learning_rate": 4.525683427544669e-07, "loss": 0.0129, "step": 7340 }, { "epoch": 0.9698450969382699, "grad_norm": 0.184042826294899, "learning_rate": 4.4862736244419835e-07, "loss": 0.0088, "step": 7341 }, { "epoch": 0.969977210423754, "grad_norm": 0.10526137053966522, "learning_rate": 4.447035777721986e-07, "loss": 0.0112, "step": 7342 }, { "epoch": 0.970109323909238, "grad_norm": 0.12947724759578705, "learning_rate": 4.407969894162589e-07, "loss": 0.0129, "step": 7343 }, { "epoch": 0.970241437394722, "grad_norm": 0.1230868250131607, "learning_rate": 4.3690759805113944e-07, "loss": 0.0107, "step": 7344 }, { "epoch": 0.970373550880206, "grad_norm": 0.22553841769695282, "learning_rate": 4.330354043486806e-07, "loss": 0.0208, "step": 7345 }, { "epoch": 0.9705056643656901, "grad_norm": 0.15463051199913025, "learning_rate": 4.2918040897772513e-07, "loss": 0.0145, "step": 7346 }, { "epoch": 0.9706377778511741, "grad_norm": 0.18720471858978271, "learning_rate": 4.253426126041515e-07, "loss": 0.0074, "step": 7347 }, { "epoch": 0.9707698913366581, "grad_norm": 0.1039755642414093, "learning_rate": 4.215220158908628e-07, "loss": 0.0122, "step": 7348 }, { "epoch": 0.9709020048221422, "grad_norm": 0.14909744262695312, "learning_rate": 4.177186194977978e-07, "loss": 0.0144, "step": 7349 }, { "epoch": 0.9710341183076262, "grad_norm": 0.1407427191734314, "learning_rate": 4.139324240819309e-07, "loss": 0.0137, "step": 7350 }, { "epoch": 0.9711662317931102, "grad_norm": 0.16804887354373932, "learning_rate": 4.1016343029725014e-07, "loss": 0.0134, "step": 7351 }, { "epoch": 0.9712983452785943, "grad_norm": 0.14374983310699463, "learning_rate": 4.064116387947792e-07, "loss": 0.0158, "step": 7352 }, { "epoch": 0.9714304587640783, "grad_norm": 0.1527361273765564, "learning_rate": 4.026770502225663e-07, "loss": 0.016, "step": 7353 }, { "epoch": 0.9715625722495623, "grad_norm": 0.1665458083152771, "learning_rate": 3.989596652256955e-07, "loss": 0.0139, "step": 7354 }, { "epoch": 0.9716946857350464, "grad_norm": 0.12903252243995667, "learning_rate": 3.9525948444627534e-07, "loss": 0.0101, "step": 7355 }, { "epoch": 0.9718267992205304, "grad_norm": 0.12322012335062027, "learning_rate": 3.915765085234391e-07, "loss": 0.0081, "step": 7356 }, { "epoch": 0.9719589127060144, "grad_norm": 0.18705911934375763, "learning_rate": 3.8791073809336666e-07, "loss": 0.0124, "step": 7357 }, { "epoch": 0.9720910261914985, "grad_norm": 0.12552902102470398, "learning_rate": 3.842621737892294e-07, "loss": 0.0141, "step": 7358 }, { "epoch": 0.9722231396769825, "grad_norm": 0.17267630994319916, "learning_rate": 3.806308162412564e-07, "loss": 0.0108, "step": 7359 }, { "epoch": 0.9723552531624665, "grad_norm": 0.11716732382774353, "learning_rate": 3.770166660767016e-07, "loss": 0.0121, "step": 7360 }, { "epoch": 0.9724873666479505, "grad_norm": 0.2246181219816208, "learning_rate": 3.734197239198434e-07, "loss": 0.0234, "step": 7361 }, { "epoch": 0.9726194801334346, "grad_norm": 0.11996744573116302, "learning_rate": 3.698399903919847e-07, "loss": 0.0098, "step": 7362 }, { "epoch": 0.9727515936189186, "grad_norm": 0.1341700404882431, "learning_rate": 3.662774661114421e-07, "loss": 0.0111, "step": 7363 }, { "epoch": 0.9728837071044026, "grad_norm": 0.11980053037405014, "learning_rate": 3.6273215169360107e-07, "loss": 0.0102, "step": 7364 }, { "epoch": 0.9730158205898867, "grad_norm": 0.12592989206314087, "learning_rate": 3.5920404775082737e-07, "loss": 0.0093, "step": 7365 }, { "epoch": 0.9731479340753707, "grad_norm": 0.17192436754703522, "learning_rate": 3.556931548925557e-07, "loss": 0.0098, "step": 7366 }, { "epoch": 0.9732800475608547, "grad_norm": 0.09422129392623901, "learning_rate": 3.521994737252121e-07, "loss": 0.0081, "step": 7367 }, { "epoch": 0.9734121610463388, "grad_norm": 0.1644677072763443, "learning_rate": 3.487230048522583e-07, "loss": 0.0129, "step": 7368 }, { "epoch": 0.9735442745318228, "grad_norm": 0.13948781788349152, "learning_rate": 3.4526374887420275e-07, "loss": 0.0146, "step": 7369 }, { "epoch": 0.9736763880173068, "grad_norm": 0.12497429549694061, "learning_rate": 3.418217063885565e-07, "loss": 0.0148, "step": 7370 }, { "epoch": 0.9738085015027909, "grad_norm": 0.14855220913887024, "learning_rate": 3.383968779898883e-07, "loss": 0.0074, "step": 7371 }, { "epoch": 0.9739406149882749, "grad_norm": 0.154222771525383, "learning_rate": 3.349892642697472e-07, "loss": 0.0111, "step": 7372 }, { "epoch": 0.9740727284737589, "grad_norm": 0.13619877398014069, "learning_rate": 3.3159886581675124e-07, "loss": 0.0195, "step": 7373 }, { "epoch": 0.974204841959243, "grad_norm": 0.27135902643203735, "learning_rate": 3.2822568321653204e-07, "loss": 0.0187, "step": 7374 }, { "epoch": 0.974336955444727, "grad_norm": 0.17041723430156708, "learning_rate": 3.2486971705172343e-07, "loss": 0.0088, "step": 7375 }, { "epoch": 0.974469068930211, "grad_norm": 0.2805335819721222, "learning_rate": 3.215309679020284e-07, "loss": 0.0272, "step": 7376 }, { "epoch": 0.974601182415695, "grad_norm": 0.15251220762729645, "learning_rate": 3.182094363441301e-07, "loss": 0.0126, "step": 7377 }, { "epoch": 0.9747332959011791, "grad_norm": 0.23265893757343292, "learning_rate": 3.1490512295179186e-07, "loss": 0.0208, "step": 7378 }, { "epoch": 0.9748654093866631, "grad_norm": 0.1823471486568451, "learning_rate": 3.1161802829573486e-07, "loss": 0.0196, "step": 7379 }, { "epoch": 0.9749975228721471, "grad_norm": 0.21355217695236206, "learning_rate": 3.083481529437715e-07, "loss": 0.0199, "step": 7380 }, { "epoch": 0.9751296363576312, "grad_norm": 0.25127115845680237, "learning_rate": 3.0509549746070564e-07, "loss": 0.0244, "step": 7381 }, { "epoch": 0.9752617498431152, "grad_norm": 0.1271241009235382, "learning_rate": 3.018600624083767e-07, "loss": 0.0113, "step": 7382 }, { "epoch": 0.9753938633285992, "grad_norm": 0.20300577580928802, "learning_rate": 2.9864184834562657e-07, "loss": 0.0236, "step": 7383 }, { "epoch": 0.9755259768140833, "grad_norm": 0.12326376140117645, "learning_rate": 2.95440855828355e-07, "loss": 0.0094, "step": 7384 }, { "epoch": 0.9756580902995673, "grad_norm": 0.16099140048027039, "learning_rate": 2.9225708540947527e-07, "loss": 0.0117, "step": 7385 }, { "epoch": 0.9757902037850513, "grad_norm": 0.11374569684267044, "learning_rate": 2.8909053763891414e-07, "loss": 0.0131, "step": 7386 }, { "epoch": 0.9759223172705354, "grad_norm": 0.1675594598054886, "learning_rate": 2.859412130636452e-07, "loss": 0.0183, "step": 7387 }, { "epoch": 0.9760544307560194, "grad_norm": 0.22921884059906006, "learning_rate": 2.828091122276555e-07, "loss": 0.0191, "step": 7388 }, { "epoch": 0.9761865442415034, "grad_norm": 0.09144330769777298, "learning_rate": 2.7969423567195674e-07, "loss": 0.0056, "step": 7389 }, { "epoch": 0.9763186577269874, "grad_norm": 0.14772000908851624, "learning_rate": 2.76596583934563e-07, "loss": 0.0117, "step": 7390 }, { "epoch": 0.9764507712124715, "grad_norm": 0.1751900613307953, "learning_rate": 2.7351615755056846e-07, "loss": 0.018, "step": 7391 }, { "epoch": 0.9765828846979555, "grad_norm": 0.14852789044380188, "learning_rate": 2.7045295705203643e-07, "loss": 0.0155, "step": 7392 }, { "epoch": 0.9767149981834395, "grad_norm": 0.21934881806373596, "learning_rate": 2.674069829680881e-07, "loss": 0.014, "step": 7393 }, { "epoch": 0.9768471116689236, "grad_norm": 0.17583468556404114, "learning_rate": 2.643782358248581e-07, "loss": 0.0123, "step": 7394 }, { "epoch": 0.9769792251544076, "grad_norm": 0.12872935831546783, "learning_rate": 2.6136671614550577e-07, "loss": 0.0107, "step": 7395 }, { "epoch": 0.9771113386398916, "grad_norm": 0.15072745084762573, "learning_rate": 2.58372424450215e-07, "loss": 0.0161, "step": 7396 }, { "epoch": 0.9772434521253757, "grad_norm": 0.1387665718793869, "learning_rate": 2.5539536125618324e-07, "loss": 0.0127, "step": 7397 }, { "epoch": 0.9773755656108597, "grad_norm": 0.15194547176361084, "learning_rate": 2.5243552707765463e-07, "loss": 0.0097, "step": 7398 }, { "epoch": 0.9775076790963437, "grad_norm": 0.19213752448558807, "learning_rate": 2.4949292242587573e-07, "loss": 0.0199, "step": 7399 }, { "epoch": 0.9776397925818278, "grad_norm": 0.18676453828811646, "learning_rate": 2.4656754780914004e-07, "loss": 0.0139, "step": 7400 }, { "epoch": 0.9777719060673118, "grad_norm": 0.1353488266468048, "learning_rate": 2.436594037327433e-07, "loss": 0.0164, "step": 7401 }, { "epoch": 0.9779040195527958, "grad_norm": 0.08223433792591095, "learning_rate": 2.40768490699006e-07, "loss": 0.0096, "step": 7402 }, { "epoch": 0.9780361330382799, "grad_norm": 0.15485511720180511, "learning_rate": 2.3789480920729524e-07, "loss": 0.014, "step": 7403 }, { "epoch": 0.9781682465237639, "grad_norm": 0.15698592364788055, "learning_rate": 2.350383597539696e-07, "loss": 0.0128, "step": 7404 }, { "epoch": 0.9783003600092479, "grad_norm": 0.21309952437877655, "learning_rate": 2.3219914283243437e-07, "loss": 0.0365, "step": 7405 }, { "epoch": 0.978432473494732, "grad_norm": 0.09512881934642792, "learning_rate": 2.2937715893311952e-07, "loss": 0.011, "step": 7406 }, { "epoch": 0.978564586980216, "grad_norm": 0.1895619034767151, "learning_rate": 2.265724085434573e-07, "loss": 0.02, "step": 7407 }, { "epoch": 0.9786967004657, "grad_norm": 0.2142200469970703, "learning_rate": 2.2378489214791577e-07, "loss": 0.0251, "step": 7408 }, { "epoch": 0.978828813951184, "grad_norm": 0.13200215995311737, "learning_rate": 2.2101461022799862e-07, "loss": 0.0175, "step": 7409 }, { "epoch": 0.9789609274366681, "grad_norm": 0.18622782826423645, "learning_rate": 2.1826156326221202e-07, "loss": 0.0234, "step": 7410 }, { "epoch": 0.9790930409221521, "grad_norm": 0.34825804829597473, "learning_rate": 2.155257517260867e-07, "loss": 0.0239, "step": 7411 }, { "epoch": 0.9792251544076361, "grad_norm": 0.11274074018001556, "learning_rate": 2.1280717609220013e-07, "loss": 0.0086, "step": 7412 }, { "epoch": 0.9793572678931202, "grad_norm": 0.16548287868499756, "learning_rate": 2.101058368301212e-07, "loss": 0.0192, "step": 7413 }, { "epoch": 0.9794893813786042, "grad_norm": 0.1395992487668991, "learning_rate": 2.0742173440646552e-07, "loss": 0.0112, "step": 7414 }, { "epoch": 0.9796214948640882, "grad_norm": 0.1266821175813675, "learning_rate": 2.0475486928484e-07, "loss": 0.0115, "step": 7415 }, { "epoch": 0.9797536083495723, "grad_norm": 0.17278282344341278, "learning_rate": 2.0210524192593173e-07, "loss": 0.016, "step": 7416 }, { "epoch": 0.9798857218350563, "grad_norm": 0.15695269405841827, "learning_rate": 1.994728527873857e-07, "loss": 0.0088, "step": 7417 }, { "epoch": 0.9800178353205403, "grad_norm": 0.1245427280664444, "learning_rate": 1.9685770232390488e-07, "loss": 0.0125, "step": 7418 }, { "epoch": 0.9801499488060244, "grad_norm": 0.17886166274547577, "learning_rate": 1.942597909872057e-07, "loss": 0.0212, "step": 7419 }, { "epoch": 0.9802820622915084, "grad_norm": 0.118898406624794, "learning_rate": 1.916791192260403e-07, "loss": 0.0174, "step": 7420 }, { "epoch": 0.9804141757769924, "grad_norm": 0.13825471699237823, "learning_rate": 1.8911568748616326e-07, "loss": 0.0099, "step": 7421 }, { "epoch": 0.9805462892624764, "grad_norm": 0.0875687450170517, "learning_rate": 1.8656949621035368e-07, "loss": 0.0102, "step": 7422 }, { "epoch": 0.9806784027479605, "grad_norm": 0.19940754771232605, "learning_rate": 1.8404054583842645e-07, "loss": 0.0276, "step": 7423 }, { "epoch": 0.9808105162334445, "grad_norm": 0.1369803100824356, "learning_rate": 1.815288368072099e-07, "loss": 0.0128, "step": 7424 }, { "epoch": 0.9809426297189285, "grad_norm": 0.1722928136587143, "learning_rate": 1.7903436955055697e-07, "loss": 0.0142, "step": 7425 }, { "epoch": 0.9810747432044126, "grad_norm": 0.20842589437961578, "learning_rate": 1.7655714449933413e-07, "loss": 0.0254, "step": 7426 }, { "epoch": 0.9812068566898966, "grad_norm": 0.08559152483940125, "learning_rate": 1.7409716208144355e-07, "loss": 0.0085, "step": 7427 }, { "epoch": 0.9813389701753806, "grad_norm": 0.1631077080965042, "learning_rate": 1.716544227217898e-07, "loss": 0.0158, "step": 7428 }, { "epoch": 0.9814710836608647, "grad_norm": 0.09865134954452515, "learning_rate": 1.6922892684232417e-07, "loss": 0.0107, "step": 7429 }, { "epoch": 0.9816031971463487, "grad_norm": 0.1299627423286438, "learning_rate": 1.6682067486198937e-07, "loss": 0.0188, "step": 7430 }, { "epoch": 0.9817353106318327, "grad_norm": 0.21306796371936798, "learning_rate": 1.644296671967749e-07, "loss": 0.0155, "step": 7431 }, { "epoch": 0.9818674241173168, "grad_norm": 0.2713109254837036, "learning_rate": 1.6205590425969474e-07, "loss": 0.0084, "step": 7432 }, { "epoch": 0.9819995376028008, "grad_norm": 0.14543212950229645, "learning_rate": 1.5969938646075432e-07, "loss": 0.0202, "step": 7433 }, { "epoch": 0.9821316510882848, "grad_norm": 0.12514202296733856, "learning_rate": 1.573601142069947e-07, "loss": 0.0106, "step": 7434 }, { "epoch": 0.9822637645737688, "grad_norm": 0.2774215042591095, "learning_rate": 1.5503808790249263e-07, "loss": 0.0163, "step": 7435 }, { "epoch": 0.9823958780592529, "grad_norm": 0.08826129138469696, "learning_rate": 1.527333079483384e-07, "loss": 0.0079, "step": 7436 }, { "epoch": 0.9825279915447369, "grad_norm": 0.11560474336147308, "learning_rate": 1.5044577474263576e-07, "loss": 0.012, "step": 7437 }, { "epoch": 0.9826601050302209, "grad_norm": 0.19101516902446747, "learning_rate": 1.4817548868050202e-07, "loss": 0.0245, "step": 7438 }, { "epoch": 0.982792218515705, "grad_norm": 0.18168167769908905, "learning_rate": 1.4592245015410123e-07, "loss": 0.0234, "step": 7439 }, { "epoch": 0.982924332001189, "grad_norm": 0.1375359743833542, "learning_rate": 1.4368665955259986e-07, "loss": 0.0144, "step": 7440 }, { "epoch": 0.983056445486673, "grad_norm": 0.14244583249092102, "learning_rate": 1.41468117262189e-07, "loss": 0.0135, "step": 7441 }, { "epoch": 0.9831885589721571, "grad_norm": 0.1177082434296608, "learning_rate": 1.3926682366607324e-07, "loss": 0.0163, "step": 7442 }, { "epoch": 0.9833206724576411, "grad_norm": 0.13664676249027252, "learning_rate": 1.3708277914449287e-07, "loss": 0.0175, "step": 7443 }, { "epoch": 0.9834527859431251, "grad_norm": 0.06507763266563416, "learning_rate": 1.3491598407470162e-07, "loss": 0.0047, "step": 7444 }, { "epoch": 0.9835848994286092, "grad_norm": 0.17224647104740143, "learning_rate": 1.3276643883096684e-07, "loss": 0.0095, "step": 7445 }, { "epoch": 0.9837170129140932, "grad_norm": 0.13159163296222687, "learning_rate": 1.3063414378458038e-07, "loss": 0.0155, "step": 7446 }, { "epoch": 0.9838491263995772, "grad_norm": 0.18034628033638, "learning_rate": 1.2851909930386984e-07, "loss": 0.0228, "step": 7447 }, { "epoch": 0.9839812398850613, "grad_norm": 0.19768694043159485, "learning_rate": 1.2642130575415413e-07, "loss": 0.0184, "step": 7448 }, { "epoch": 0.9841133533705453, "grad_norm": 0.11957130581140518, "learning_rate": 1.243407634977878e-07, "loss": 0.0075, "step": 7449 }, { "epoch": 0.9842454668560293, "grad_norm": 0.10868565738201141, "learning_rate": 1.2227747289416114e-07, "loss": 0.0097, "step": 7450 }, { "epoch": 0.9843775803415133, "grad_norm": 0.11350014060735703, "learning_rate": 1.2023143429965577e-07, "loss": 0.012, "step": 7451 }, { "epoch": 0.9845096938269974, "grad_norm": 0.10784092545509338, "learning_rate": 1.182026480677001e-07, "loss": 0.012, "step": 7452 }, { "epoch": 0.9846418073124814, "grad_norm": 0.1857926845550537, "learning_rate": 1.1619111454871378e-07, "loss": 0.024, "step": 7453 }, { "epoch": 0.9847739207979654, "grad_norm": 0.19607584178447723, "learning_rate": 1.1419683409015225e-07, "loss": 0.0086, "step": 7454 }, { "epoch": 0.9849060342834495, "grad_norm": 0.1448315978050232, "learning_rate": 1.1221980703650659e-07, "loss": 0.0089, "step": 7455 }, { "epoch": 0.9850381477689335, "grad_norm": 0.158442422747612, "learning_rate": 1.1026003372924809e-07, "loss": 0.0132, "step": 7456 }, { "epoch": 0.9851702612544175, "grad_norm": 0.10124783962965012, "learning_rate": 1.0831751450691707e-07, "loss": 0.0064, "step": 7457 }, { "epoch": 0.9853023747399016, "grad_norm": 0.1521049290895462, "learning_rate": 1.0639224970502293e-07, "loss": 0.0164, "step": 7458 }, { "epoch": 0.9854344882253856, "grad_norm": 0.20458267629146576, "learning_rate": 1.0448423965613297e-07, "loss": 0.0144, "step": 7459 }, { "epoch": 0.9855666017108696, "grad_norm": 0.1937059909105301, "learning_rate": 1.0259348468981689e-07, "loss": 0.0174, "step": 7460 }, { "epoch": 0.9856987151963537, "grad_norm": 0.10319459438323975, "learning_rate": 1.00719985132669e-07, "loss": 0.0099, "step": 7461 }, { "epoch": 0.9858308286818377, "grad_norm": 0.16178636252880096, "learning_rate": 9.886374130829711e-08, "loss": 0.0177, "step": 7462 }, { "epoch": 0.9859629421673217, "grad_norm": 0.1864520162343979, "learning_rate": 9.702475353733364e-08, "loss": 0.025, "step": 7463 }, { "epoch": 0.9860950556528058, "grad_norm": 0.15063433349132538, "learning_rate": 9.520302213743559e-08, "loss": 0.0154, "step": 7464 }, { "epoch": 0.9862271691382898, "grad_norm": 0.10295557975769043, "learning_rate": 9.339854742326238e-08, "loss": 0.0127, "step": 7465 }, { "epoch": 0.9863592826237738, "grad_norm": 0.10709338635206223, "learning_rate": 9.16113297065202e-08, "loss": 0.006, "step": 7466 }, { "epoch": 0.9864913961092578, "grad_norm": 0.11552749574184418, "learning_rate": 8.984136929589548e-08, "loss": 0.0054, "step": 7467 }, { "epoch": 0.9866235095947419, "grad_norm": 0.1398506462574005, "learning_rate": 8.808866649713254e-08, "loss": 0.0067, "step": 7468 }, { "epoch": 0.9867556230802259, "grad_norm": 0.2075989991426468, "learning_rate": 8.6353221612967e-08, "loss": 0.0234, "step": 7469 }, { "epoch": 0.9868877365657099, "grad_norm": 0.16684336960315704, "learning_rate": 8.463503494317015e-08, "loss": 0.0233, "step": 7470 }, { "epoch": 0.987019850051194, "grad_norm": 0.15726900100708008, "learning_rate": 8.293410678452685e-08, "loss": 0.0125, "step": 7471 }, { "epoch": 0.987151963536678, "grad_norm": 0.20710338652133942, "learning_rate": 8.125043743084648e-08, "loss": 0.0105, "step": 7472 }, { "epoch": 0.987284077022162, "grad_norm": 0.14183491468429565, "learning_rate": 7.958402717294089e-08, "loss": 0.0151, "step": 7473 }, { "epoch": 0.9874161905076461, "grad_norm": 0.26822736859321594, "learning_rate": 7.793487629865759e-08, "loss": 0.0245, "step": 7474 }, { "epoch": 0.9875483039931301, "grad_norm": 0.1157698780298233, "learning_rate": 7.63029850928465e-08, "loss": 0.0082, "step": 7475 }, { "epoch": 0.9876804174786141, "grad_norm": 0.12137190997600555, "learning_rate": 7.468835383740436e-08, "loss": 0.0115, "step": 7476 }, { "epoch": 0.9878125309640982, "grad_norm": 0.1745070219039917, "learning_rate": 7.309098281120808e-08, "loss": 0.0104, "step": 7477 }, { "epoch": 0.9879446444495822, "grad_norm": 0.1688745766878128, "learning_rate": 7.151087229019249e-08, "loss": 0.0143, "step": 7478 }, { "epoch": 0.9880767579350662, "grad_norm": 0.1639687716960907, "learning_rate": 6.994802254728372e-08, "loss": 0.0112, "step": 7479 }, { "epoch": 0.9882088714205502, "grad_norm": 0.26630374789237976, "learning_rate": 6.84024338524325e-08, "loss": 0.0197, "step": 7480 }, { "epoch": 0.9883409849060343, "grad_norm": 0.41736364364624023, "learning_rate": 6.687410647260306e-08, "loss": 0.0486, "step": 7481 }, { "epoch": 0.9884730983915183, "grad_norm": 0.11081980168819427, "learning_rate": 6.536304067180643e-08, "loss": 0.0095, "step": 7482 }, { "epoch": 0.9886052118770023, "grad_norm": 0.12715992331504822, "learning_rate": 6.386923671103384e-08, "loss": 0.0074, "step": 7483 }, { "epoch": 0.9887373253624864, "grad_norm": 0.12597718834877014, "learning_rate": 6.239269484832333e-08, "loss": 0.014, "step": 7484 }, { "epoch": 0.9888694388479704, "grad_norm": 0.17149794101715088, "learning_rate": 6.093341533870422e-08, "loss": 0.0128, "step": 7485 }, { "epoch": 0.9890015523334544, "grad_norm": 0.10480506718158722, "learning_rate": 5.949139843426377e-08, "loss": 0.0168, "step": 7486 }, { "epoch": 0.9891336658189385, "grad_norm": 0.20579087734222412, "learning_rate": 5.806664438405829e-08, "loss": 0.0089, "step": 7487 }, { "epoch": 0.9892657793044225, "grad_norm": 0.10642731189727783, "learning_rate": 5.665915343420203e-08, "loss": 0.0041, "step": 7488 }, { "epoch": 0.9893978927899065, "grad_norm": 0.19486381113529205, "learning_rate": 5.526892582781162e-08, "loss": 0.0136, "step": 7489 }, { "epoch": 0.9895300062753906, "grad_norm": 0.15670417249202728, "learning_rate": 5.3895961805017214e-08, "loss": 0.013, "step": 7490 }, { "epoch": 0.9896621197608746, "grad_norm": 0.14204590022563934, "learning_rate": 5.254026160297354e-08, "loss": 0.0164, "step": 7491 }, { "epoch": 0.9897942332463586, "grad_norm": 0.10373309999704361, "learning_rate": 5.120182545585994e-08, "loss": 0.0087, "step": 7492 }, { "epoch": 0.9899263467318427, "grad_norm": 0.2570636570453644, "learning_rate": 4.988065359485816e-08, "loss": 0.0165, "step": 7493 }, { "epoch": 0.9900584602173267, "grad_norm": 0.23979882895946503, "learning_rate": 4.857674624818565e-08, "loss": 0.023, "step": 7494 }, { "epoch": 0.9901905737028107, "grad_norm": 0.12361893057823181, "learning_rate": 4.729010364105113e-08, "loss": 0.0128, "step": 7495 }, { "epoch": 0.9903226871882947, "grad_norm": 0.16618198156356812, "learning_rate": 4.6020725995710166e-08, "loss": 0.0086, "step": 7496 }, { "epoch": 0.9904548006737788, "grad_norm": 0.10690437257289886, "learning_rate": 4.47686135314318e-08, "loss": 0.0098, "step": 7497 }, { "epoch": 0.9905869141592628, "grad_norm": 0.1558026224374771, "learning_rate": 4.3533766464476376e-08, "loss": 0.0118, "step": 7498 }, { "epoch": 0.9907190276447468, "grad_norm": 0.22771494090557098, "learning_rate": 4.231618500815104e-08, "loss": 0.014, "step": 7499 }, { "epoch": 0.9908511411302309, "grad_norm": 0.24591323733329773, "learning_rate": 4.111586937276535e-08, "loss": 0.0182, "step": 7500 }, { "epoch": 0.9909832546157149, "grad_norm": 0.14383232593536377, "learning_rate": 3.993281976566454e-08, "loss": 0.0074, "step": 7501 }, { "epoch": 0.9911153681011989, "grad_norm": 0.2168605774641037, "learning_rate": 3.876703639117407e-08, "loss": 0.0164, "step": 7502 }, { "epoch": 0.991247481586683, "grad_norm": 0.1999928057193756, "learning_rate": 3.7618519450688394e-08, "loss": 0.0175, "step": 7503 }, { "epoch": 0.991379595072167, "grad_norm": 0.09430862218141556, "learning_rate": 3.6487269142571055e-08, "loss": 0.0096, "step": 7504 }, { "epoch": 0.991511708557651, "grad_norm": 0.1702381670475006, "learning_rate": 3.5373285662243515e-08, "loss": 0.0193, "step": 7505 }, { "epoch": 0.9916438220431351, "grad_norm": 0.08822032064199448, "learning_rate": 3.427656920210742e-08, "loss": 0.0082, "step": 7506 }, { "epoch": 0.9917759355286191, "grad_norm": 0.1684400737285614, "learning_rate": 3.319711995161123e-08, "loss": 0.0167, "step": 7507 }, { "epoch": 0.9919080490141031, "grad_norm": 0.1515914797782898, "learning_rate": 3.21349380971947e-08, "loss": 0.0108, "step": 7508 }, { "epoch": 0.9920401624995872, "grad_norm": 0.0880783349275589, "learning_rate": 3.109002382235548e-08, "loss": 0.0074, "step": 7509 }, { "epoch": 0.9921722759850712, "grad_norm": 0.15489071607589722, "learning_rate": 3.006237730756034e-08, "loss": 0.0123, "step": 7510 }, { "epoch": 0.9923043894705552, "grad_norm": 0.07852049171924591, "learning_rate": 2.905199873033393e-08, "loss": 0.009, "step": 7511 }, { "epoch": 0.9924365029560392, "grad_norm": 0.17368489503860474, "learning_rate": 2.8058888265181105e-08, "loss": 0.0199, "step": 7512 }, { "epoch": 0.9925686164415233, "grad_norm": 0.09940238296985626, "learning_rate": 2.708304608365353e-08, "loss": 0.0063, "step": 7513 }, { "epoch": 0.9927007299270073, "grad_norm": 0.1165870726108551, "learning_rate": 2.6124472354316364e-08, "loss": 0.0082, "step": 7514 }, { "epoch": 0.9928328434124913, "grad_norm": 0.2655845582485199, "learning_rate": 2.518316724272607e-08, "loss": 0.0209, "step": 7515 }, { "epoch": 0.9929649568979754, "grad_norm": 0.07835333049297333, "learning_rate": 2.425913091149701e-08, "loss": 0.0067, "step": 7516 }, { "epoch": 0.9930970703834594, "grad_norm": 0.15666654706001282, "learning_rate": 2.335236352022374e-08, "loss": 0.0079, "step": 7517 }, { "epoch": 0.9932291838689434, "grad_norm": 0.11625221371650696, "learning_rate": 2.2462865225547636e-08, "loss": 0.0092, "step": 7518 }, { "epoch": 0.9933612973544275, "grad_norm": 0.11382373422384262, "learning_rate": 2.1590636181090252e-08, "loss": 0.0136, "step": 7519 }, { "epoch": 0.9934934108399115, "grad_norm": 0.13483434915542603, "learning_rate": 2.073567653754216e-08, "loss": 0.0167, "step": 7520 }, { "epoch": 0.9936255243253955, "grad_norm": 0.25301748514175415, "learning_rate": 1.989798644255192e-08, "loss": 0.0122, "step": 7521 }, { "epoch": 0.9937576378108796, "grad_norm": 0.1368926465511322, "learning_rate": 1.9077566040837104e-08, "loss": 0.0115, "step": 7522 }, { "epoch": 0.9938897512963636, "grad_norm": 0.11503051221370697, "learning_rate": 1.8274415474106577e-08, "loss": 0.0114, "step": 7523 }, { "epoch": 0.9940218647818476, "grad_norm": 0.19685514271259308, "learning_rate": 1.7488534881082708e-08, "loss": 0.0163, "step": 7524 }, { "epoch": 0.9941539782673317, "grad_norm": 0.199909046292305, "learning_rate": 1.6719924397512465e-08, "loss": 0.0142, "step": 7525 }, { "epoch": 0.9942860917528157, "grad_norm": 0.10640694200992584, "learning_rate": 1.596858415615632e-08, "loss": 0.0097, "step": 7526 }, { "epoch": 0.9944182052382997, "grad_norm": 0.07238924503326416, "learning_rate": 1.5234514286810442e-08, "loss": 0.0044, "step": 7527 }, { "epoch": 0.9945503187237837, "grad_norm": 0.15096305310726166, "learning_rate": 1.4517714916251202e-08, "loss": 0.0108, "step": 7528 }, { "epoch": 0.9946824322092678, "grad_norm": 0.16533154249191284, "learning_rate": 1.3818186168301772e-08, "loss": 0.0149, "step": 7529 }, { "epoch": 0.9948145456947518, "grad_norm": 0.19295816123485565, "learning_rate": 1.3135928163787725e-08, "loss": 0.0393, "step": 7530 }, { "epoch": 0.9949466591802358, "grad_norm": 0.23089705407619476, "learning_rate": 1.2470941020570336e-08, "loss": 0.0238, "step": 7531 }, { "epoch": 0.9950787726657199, "grad_norm": 0.3397480547428131, "learning_rate": 1.1823224853491077e-08, "loss": 0.0283, "step": 7532 }, { "epoch": 0.9952108861512039, "grad_norm": 0.17770127952098846, "learning_rate": 1.1192779774449325e-08, "loss": 0.0163, "step": 7533 }, { "epoch": 0.9953429996366879, "grad_norm": 0.16790403425693512, "learning_rate": 1.0579605892346855e-08, "loss": 0.0209, "step": 7534 }, { "epoch": 0.995475113122172, "grad_norm": 0.18203333020210266, "learning_rate": 9.983703313076743e-09, "loss": 0.021, "step": 7535 }, { "epoch": 0.995607226607656, "grad_norm": 0.14092746376991272, "learning_rate": 9.405072139578864e-09, "loss": 0.0116, "step": 7536 }, { "epoch": 0.99573934009314, "grad_norm": 0.11230488121509552, "learning_rate": 8.843712471806598e-09, "loss": 0.0073, "step": 7537 }, { "epoch": 0.995871453578624, "grad_norm": 0.13066421449184418, "learning_rate": 8.299624406726824e-09, "loss": 0.0199, "step": 7538 }, { "epoch": 0.9960035670641081, "grad_norm": 0.1082078292965889, "learning_rate": 7.772808038308822e-09, "loss": 0.005, "step": 7539 }, { "epoch": 0.9961356805495921, "grad_norm": 0.3292711079120636, "learning_rate": 7.263263457557568e-09, "loss": 0.0082, "step": 7540 }, { "epoch": 0.9962677940350761, "grad_norm": 0.1794513761997223, "learning_rate": 6.770990752491546e-09, "loss": 0.0157, "step": 7541 }, { "epoch": 0.9963999075205602, "grad_norm": 0.07902058213949203, "learning_rate": 6.295990008131636e-09, "loss": 0.0041, "step": 7542 }, { "epoch": 0.9965320210060442, "grad_norm": 0.2639276385307312, "learning_rate": 5.8382613065344204e-09, "loss": 0.0269, "step": 7543 }, { "epoch": 0.9966641344915282, "grad_norm": 0.11883040517568588, "learning_rate": 5.39780472674778e-09, "loss": 0.0203, "step": 7544 }, { "epoch": 0.9967962479770123, "grad_norm": 0.1411103755235672, "learning_rate": 4.974620344877501e-09, "loss": 0.013, "step": 7545 }, { "epoch": 0.9969283614624963, "grad_norm": 0.10590819269418716, "learning_rate": 4.568708233998465e-09, "loss": 0.0061, "step": 7546 }, { "epoch": 0.9970604749479803, "grad_norm": 0.14374881982803345, "learning_rate": 4.180068464243458e-09, "loss": 0.0105, "step": 7547 }, { "epoch": 0.9971925884334644, "grad_norm": 0.13679523766040802, "learning_rate": 3.808701102725465e-09, "loss": 0.0181, "step": 7548 }, { "epoch": 0.9973247019189484, "grad_norm": 0.19184891879558563, "learning_rate": 3.4546062135931702e-09, "loss": 0.0256, "step": 7549 }, { "epoch": 0.9974568154044324, "grad_norm": 0.1876770406961441, "learning_rate": 3.1177838580198626e-09, "loss": 0.0209, "step": 7550 }, { "epoch": 0.9975889288899165, "grad_norm": 0.2564197778701782, "learning_rate": 2.7982340941812292e-09, "loss": 0.0245, "step": 7551 }, { "epoch": 0.9977210423754005, "grad_norm": 0.14303652942180634, "learning_rate": 2.4959569772775583e-09, "loss": 0.0105, "step": 7552 }, { "epoch": 0.9978531558608845, "grad_norm": 0.18371181190013885, "learning_rate": 2.2109525595115365e-09, "loss": 0.0211, "step": 7553 }, { "epoch": 0.9979852693463686, "grad_norm": 0.16563263535499573, "learning_rate": 1.9432208901104533e-09, "loss": 0.0168, "step": 7554 }, { "epoch": 0.9981173828318526, "grad_norm": 0.19452422857284546, "learning_rate": 1.6927620153373013e-09, "loss": 0.0116, "step": 7555 }, { "epoch": 0.9982494963173366, "grad_norm": 0.07965493947267532, "learning_rate": 1.4595759784463703e-09, "loss": 0.0047, "step": 7556 }, { "epoch": 0.9983816098028206, "grad_norm": 0.14059865474700928, "learning_rate": 1.243662819705449e-09, "loss": 0.0098, "step": 7557 }, { "epoch": 0.9985137232883047, "grad_norm": 0.11752263456583023, "learning_rate": 1.045022576418031e-09, "loss": 0.0104, "step": 7558 }, { "epoch": 0.9986458367737887, "grad_norm": 0.09485765546560287, "learning_rate": 8.636552829011102e-10, "loss": 0.0087, "step": 7559 }, { "epoch": 0.9987779502592727, "grad_norm": 0.10121173411607742, "learning_rate": 6.99560970474078e-10, "loss": 0.0118, "step": 7560 }, { "epoch": 0.9989100637447568, "grad_norm": 0.1641887128353119, "learning_rate": 5.527396674809282e-10, "loss": 0.013, "step": 7561 }, { "epoch": 0.9990421772302408, "grad_norm": 0.09525637328624725, "learning_rate": 4.2319139929025695e-10, "loss": 0.0092, "step": 7562 }, { "epoch": 0.9991742907157248, "grad_norm": 0.1413845717906952, "learning_rate": 3.1091618827305826e-10, "loss": 0.0127, "step": 7563 }, { "epoch": 0.9993064042012089, "grad_norm": 0.1747020184993744, "learning_rate": 2.1591405382492824e-10, "loss": 0.0154, "step": 7564 }, { "epoch": 0.9994385176866929, "grad_norm": 0.19588789343833923, "learning_rate": 1.3818501234386105e-10, "loss": 0.021, "step": 7565 }, { "epoch": 0.9995706311721769, "grad_norm": 0.14852724969387054, "learning_rate": 7.77290772746575e-11, "loss": 0.0124, "step": 7566 }, { "epoch": 0.999702744657661, "grad_norm": 0.17045824229717255, "learning_rate": 3.4546259053414023e-11, "loss": 0.0256, "step": 7567 }, { "epoch": 0.999834858143145, "grad_norm": 0.15149812400341034, "learning_rate": 8.636565140829333e-12, "loss": 0.0186, "step": 7568 }, { "epoch": 0.999966971628629, "grad_norm": 0.1452350914478302, "learning_rate": 0.0, "loss": 0.0206, "step": 7569 } ], "logging_steps": 1, "max_steps": 7569, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1252272381910057e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }