diff --git "a/checkpoint-7569/trainer_state.json" "b/checkpoint-7569/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-7569/trainer_state.json" @@ -0,0 +1,53016 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999966971628629, + "eval_steps": 500, + "global_step": 7569, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00013211348548403077, + "grad_norm": 1.223802924156189, + "learning_rate": 2e-05, + "loss": 0.1696, + "step": 1 + }, + { + "epoch": 0.00026422697096806154, + "grad_norm": 0.8252124190330505, + "learning_rate": 4e-05, + "loss": 0.1671, + "step": 2 + }, + { + "epoch": 0.00039634045645209234, + "grad_norm": 0.9697959423065186, + "learning_rate": 6e-05, + "loss": 0.1425, + "step": 3 + }, + { + "epoch": 0.0005284539419361231, + "grad_norm": 0.9205353856086731, + "learning_rate": 8e-05, + "loss": 0.1113, + "step": 4 + }, + { + "epoch": 0.0006605674274201539, + "grad_norm": 0.6761035323143005, + "learning_rate": 0.0001, + "loss": 0.1032, + "step": 5 + }, + { + "epoch": 0.0007926809129041847, + "grad_norm": 0.6447001695632935, + "learning_rate": 0.00012, + "loss": 0.1314, + "step": 6 + }, + { + "epoch": 0.0009247943983882155, + "grad_norm": 0.6059455871582031, + "learning_rate": 0.00014, + "loss": 0.1041, + "step": 7 + }, + { + "epoch": 0.0010569078838722462, + "grad_norm": 0.4858318567276001, + "learning_rate": 0.00016, + "loss": 0.067, + "step": 8 + }, + { + "epoch": 0.001189021369356277, + "grad_norm": 0.4148808717727661, + "learning_rate": 0.00018, + "loss": 0.0534, + "step": 9 + }, + { + "epoch": 0.0013211348548403078, + "grad_norm": 0.39029794931411743, + "learning_rate": 0.0002, + "loss": 0.0882, + "step": 10 + }, + { + "epoch": 0.0014532483403243387, + "grad_norm": 0.7170138955116272, + "learning_rate": 0.00019999999136343486, + "loss": 0.1022, + "step": 11 + }, + { + "epoch": 0.0015853618258083694, + "grad_norm": 0.6317317485809326, + "learning_rate": 0.00019999996545374095, + "loss": 0.1389, + "step": 12 + }, + { + "epoch": 0.0017174753112924003, + "grad_norm": 0.4452052116394043, + "learning_rate": 0.00019999992227092274, + "loss": 0.1099, + "step": 13 + }, + { + "epoch": 0.001849588796776431, + "grad_norm": 0.37861597537994385, + "learning_rate": 0.00019999986181498767, + "loss": 0.0748, + "step": 14 + }, + { + "epoch": 0.0019817022822604616, + "grad_norm": 0.48993363976478577, + "learning_rate": 0.00019999978408594618, + "loss": 0.0713, + "step": 15 + }, + { + "epoch": 0.0021138157677444923, + "grad_norm": 0.35126134753227234, + "learning_rate": 0.00019999968908381176, + "loss": 0.0684, + "step": 16 + }, + { + "epoch": 0.0022459292532285235, + "grad_norm": 0.6053355932235718, + "learning_rate": 0.00019999957680860071, + "loss": 0.0986, + "step": 17 + }, + { + "epoch": 0.002378042738712554, + "grad_norm": 0.44116076827049255, + "learning_rate": 0.00019999944726033252, + "loss": 0.1038, + "step": 18 + }, + { + "epoch": 0.002510156224196585, + "grad_norm": 0.3535824418067932, + "learning_rate": 0.00019999930043902952, + "loss": 0.0556, + "step": 19 + }, + { + "epoch": 0.0026422697096806155, + "grad_norm": 0.5412662029266357, + "learning_rate": 0.0001999991363447171, + "loss": 0.069, + "step": 20 + }, + { + "epoch": 0.0027743831951646462, + "grad_norm": 0.3860626220703125, + "learning_rate": 0.0001999989549774236, + "loss": 0.0875, + "step": 21 + }, + { + "epoch": 0.0029064966806486773, + "grad_norm": 0.4982723891735077, + "learning_rate": 0.0001999987563371803, + "loss": 0.057, + "step": 22 + }, + { + "epoch": 0.003038610166132708, + "grad_norm": 0.4545145630836487, + "learning_rate": 0.00019999854042402157, + "loss": 0.0415, + "step": 23 + }, + { + "epoch": 0.0031707236516167387, + "grad_norm": 0.5430541634559631, + "learning_rate": 0.00019999830723798468, + "loss": 0.0939, + "step": 24 + }, + { + "epoch": 0.0033028371371007694, + "grad_norm": 0.39696234464645386, + "learning_rate": 0.0001999980567791099, + "loss": 0.0624, + "step": 25 + }, + { + "epoch": 0.0034349506225848005, + "grad_norm": 0.6339076161384583, + "learning_rate": 0.0001999977890474405, + "loss": 0.0873, + "step": 26 + }, + { + "epoch": 0.0035670641080688312, + "grad_norm": 0.29136186838150024, + "learning_rate": 0.00019999750404302272, + "loss": 0.0502, + "step": 27 + }, + { + "epoch": 0.003699177593552862, + "grad_norm": 0.41455066204071045, + "learning_rate": 0.00019999720176590584, + "loss": 0.0801, + "step": 28 + }, + { + "epoch": 0.0038312910790368926, + "grad_norm": 0.40953338146209717, + "learning_rate": 0.000199996882216142, + "loss": 0.0763, + "step": 29 + }, + { + "epoch": 0.003963404564520923, + "grad_norm": 0.3564930856227875, + "learning_rate": 0.00019999654539378642, + "loss": 0.0512, + "step": 30 + }, + { + "epoch": 0.004095518050004954, + "grad_norm": 0.5764234066009521, + "learning_rate": 0.0001999961912988973, + "loss": 0.0918, + "step": 31 + }, + { + "epoch": 0.004227631535488985, + "grad_norm": 0.38000357151031494, + "learning_rate": 0.00019999581993153577, + "loss": 0.0433, + "step": 32 + }, + { + "epoch": 0.004359745020973016, + "grad_norm": 0.3675003945827484, + "learning_rate": 0.000199995431291766, + "loss": 0.0603, + "step": 33 + }, + { + "epoch": 0.004491858506457047, + "grad_norm": 0.2798750698566437, + "learning_rate": 0.00019999502537965512, + "loss": 0.0384, + "step": 34 + }, + { + "epoch": 0.004623971991941077, + "grad_norm": 0.38664910197257996, + "learning_rate": 0.00019999460219527327, + "loss": 0.0541, + "step": 35 + }, + { + "epoch": 0.004756085477425108, + "grad_norm": 0.505436897277832, + "learning_rate": 0.00019999416173869348, + "loss": 0.0639, + "step": 36 + }, + { + "epoch": 0.0048881989629091386, + "grad_norm": 0.3454754948616028, + "learning_rate": 0.00019999370400999186, + "loss": 0.0566, + "step": 37 + }, + { + "epoch": 0.00502031244839317, + "grad_norm": 0.3809961676597595, + "learning_rate": 0.00019999322900924753, + "loss": 0.0522, + "step": 38 + }, + { + "epoch": 0.005152425933877201, + "grad_norm": 1.07643723487854, + "learning_rate": 0.00019999273673654245, + "loss": 0.0714, + "step": 39 + }, + { + "epoch": 0.005284539419361231, + "grad_norm": 0.5807058811187744, + "learning_rate": 0.0001999922271919617, + "loss": 0.0534, + "step": 40 + }, + { + "epoch": 0.005416652904845262, + "grad_norm": 0.5059730410575867, + "learning_rate": 0.0001999917003755933, + "loss": 0.0627, + "step": 41 + }, + { + "epoch": 0.0055487663903292924, + "grad_norm": 0.5554924607276917, + "learning_rate": 0.0001999911562875282, + "loss": 0.063, + "step": 42 + }, + { + "epoch": 0.0056808798758133236, + "grad_norm": 0.4277496635913849, + "learning_rate": 0.00019999059492786044, + "loss": 0.0624, + "step": 43 + }, + { + "epoch": 0.005812993361297355, + "grad_norm": 0.38281354308128357, + "learning_rate": 0.00019999001629668692, + "loss": 0.0595, + "step": 44 + }, + { + "epoch": 0.005945106846781385, + "grad_norm": 0.39231449365615845, + "learning_rate": 0.00019998942039410765, + "loss": 0.0432, + "step": 45 + }, + { + "epoch": 0.006077220332265416, + "grad_norm": 0.3610371947288513, + "learning_rate": 0.00019998880722022557, + "loss": 0.0602, + "step": 46 + }, + { + "epoch": 0.006209333817749447, + "grad_norm": 0.4488823711872101, + "learning_rate": 0.0001999881767751465, + "loss": 0.0829, + "step": 47 + }, + { + "epoch": 0.0063414473032334774, + "grad_norm": 0.2863968014717102, + "learning_rate": 0.00019998752905897943, + "loss": 0.0631, + "step": 48 + }, + { + "epoch": 0.006473560788717509, + "grad_norm": 0.3344953954219818, + "learning_rate": 0.00019998686407183622, + "loss": 0.056, + "step": 49 + }, + { + "epoch": 0.006605674274201539, + "grad_norm": 0.6836997270584106, + "learning_rate": 0.0001999861818138317, + "loss": 0.0765, + "step": 50 + }, + { + "epoch": 0.00673778775968557, + "grad_norm": 0.283840149641037, + "learning_rate": 0.00019998548228508377, + "loss": 0.0491, + "step": 51 + }, + { + "epoch": 0.006869901245169601, + "grad_norm": 0.35097068548202515, + "learning_rate": 0.0001999847654857132, + "loss": 0.0498, + "step": 52 + }, + { + "epoch": 0.007002014730653631, + "grad_norm": 0.4589046537876129, + "learning_rate": 0.00019998403141584386, + "loss": 0.0507, + "step": 53 + }, + { + "epoch": 0.0071341282161376625, + "grad_norm": 0.3400501012802124, + "learning_rate": 0.0001999832800756025, + "loss": 0.0578, + "step": 54 + }, + { + "epoch": 0.007266241701621693, + "grad_norm": 0.3245796859264374, + "learning_rate": 0.00019998251146511893, + "loss": 0.0471, + "step": 55 + }, + { + "epoch": 0.007398355187105724, + "grad_norm": 0.4059467017650604, + "learning_rate": 0.0001999817255845259, + "loss": 0.0613, + "step": 56 + }, + { + "epoch": 0.007530468672589755, + "grad_norm": 0.32029154896736145, + "learning_rate": 0.00019998092243395918, + "loss": 0.0545, + "step": 57 + }, + { + "epoch": 0.007662582158073785, + "grad_norm": 0.29986247420310974, + "learning_rate": 0.00019998010201355745, + "loss": 0.0644, + "step": 58 + }, + { + "epoch": 0.007794695643557816, + "grad_norm": 0.3331080973148346, + "learning_rate": 0.00019997926432346245, + "loss": 0.0702, + "step": 59 + }, + { + "epoch": 0.007926809129041847, + "grad_norm": 0.5329291820526123, + "learning_rate": 0.00019997840936381893, + "loss": 0.0504, + "step": 60 + }, + { + "epoch": 0.008058922614525878, + "grad_norm": 0.32196447253227234, + "learning_rate": 0.00019997753713477448, + "loss": 0.0618, + "step": 61 + }, + { + "epoch": 0.008191036100009909, + "grad_norm": 0.4515661597251892, + "learning_rate": 0.00019997664763647977, + "loss": 0.0544, + "step": 62 + }, + { + "epoch": 0.00832314958549394, + "grad_norm": 0.31676095724105835, + "learning_rate": 0.0001999757408690885, + "loss": 0.0645, + "step": 63 + }, + { + "epoch": 0.00845526307097797, + "grad_norm": 0.29965102672576904, + "learning_rate": 0.00019997481683275728, + "loss": 0.0613, + "step": 64 + }, + { + "epoch": 0.008587376556462, + "grad_norm": 0.3969486355781555, + "learning_rate": 0.00019997387552764568, + "loss": 0.0481, + "step": 65 + }, + { + "epoch": 0.008719490041946032, + "grad_norm": 0.27435922622680664, + "learning_rate": 0.00019997291695391636, + "loss": 0.0383, + "step": 66 + }, + { + "epoch": 0.008851603527430063, + "grad_norm": 0.37260785698890686, + "learning_rate": 0.00019997194111173483, + "loss": 0.0459, + "step": 67 + }, + { + "epoch": 0.008983717012914094, + "grad_norm": 0.44555535912513733, + "learning_rate": 0.0001999709480012697, + "loss": 0.0576, + "step": 68 + }, + { + "epoch": 0.009115830498398123, + "grad_norm": 0.5709348917007446, + "learning_rate": 0.00019996993762269244, + "loss": 0.0774, + "step": 69 + }, + { + "epoch": 0.009247943983882154, + "grad_norm": 0.48640209436416626, + "learning_rate": 0.00019996890997617766, + "loss": 0.07, + "step": 70 + }, + { + "epoch": 0.009380057469366185, + "grad_norm": 0.4809843897819519, + "learning_rate": 0.0001999678650619028, + "loss": 0.0783, + "step": 71 + }, + { + "epoch": 0.009512170954850217, + "grad_norm": 0.3329864740371704, + "learning_rate": 0.0001999668028800484, + "loss": 0.0468, + "step": 72 + }, + { + "epoch": 0.009644284440334248, + "grad_norm": 0.2965797781944275, + "learning_rate": 0.00019996572343079788, + "loss": 0.0433, + "step": 73 + }, + { + "epoch": 0.009776397925818277, + "grad_norm": 0.31055697798728943, + "learning_rate": 0.00019996462671433775, + "loss": 0.075, + "step": 74 + }, + { + "epoch": 0.009908511411302308, + "grad_norm": 0.3642573952674866, + "learning_rate": 0.00019996351273085744, + "loss": 0.0517, + "step": 75 + }, + { + "epoch": 0.01004062489678634, + "grad_norm": 0.32922685146331787, + "learning_rate": 0.0001999623814805493, + "loss": 0.0499, + "step": 76 + }, + { + "epoch": 0.01017273838227037, + "grad_norm": 0.27644476294517517, + "learning_rate": 0.00019996123296360882, + "loss": 0.0513, + "step": 77 + }, + { + "epoch": 0.010304851867754402, + "grad_norm": 0.2898384630680084, + "learning_rate": 0.00019996006718023433, + "loss": 0.0516, + "step": 78 + }, + { + "epoch": 0.010436965353238431, + "grad_norm": 0.2735963761806488, + "learning_rate": 0.00019995888413062724, + "loss": 0.0389, + "step": 79 + }, + { + "epoch": 0.010569078838722462, + "grad_norm": 0.2912638783454895, + "learning_rate": 0.00019995768381499186, + "loss": 0.053, + "step": 80 + }, + { + "epoch": 0.010701192324206493, + "grad_norm": 0.42680662870407104, + "learning_rate": 0.00019995646623353555, + "loss": 0.0569, + "step": 81 + }, + { + "epoch": 0.010833305809690524, + "grad_norm": 0.43611371517181396, + "learning_rate": 0.00019995523138646858, + "loss": 0.0447, + "step": 82 + }, + { + "epoch": 0.010965419295174555, + "grad_norm": 0.419028103351593, + "learning_rate": 0.0001999539792740043, + "loss": 0.0694, + "step": 83 + }, + { + "epoch": 0.011097532780658585, + "grad_norm": 0.3377486765384674, + "learning_rate": 0.00019995270989635894, + "loss": 0.0568, + "step": 84 + }, + { + "epoch": 0.011229646266142616, + "grad_norm": 0.25879159569740295, + "learning_rate": 0.00019995142325375181, + "loss": 0.0418, + "step": 85 + }, + { + "epoch": 0.011361759751626647, + "grad_norm": 0.2732607126235962, + "learning_rate": 0.00019995011934640516, + "loss": 0.0446, + "step": 86 + }, + { + "epoch": 0.011493873237110678, + "grad_norm": 0.39738181233406067, + "learning_rate": 0.00019994879817454415, + "loss": 0.0588, + "step": 87 + }, + { + "epoch": 0.01162598672259471, + "grad_norm": 0.5449623465538025, + "learning_rate": 0.00019994745973839703, + "loss": 0.0697, + "step": 88 + }, + { + "epoch": 0.01175810020807874, + "grad_norm": 0.48834940791130066, + "learning_rate": 0.000199946104038195, + "loss": 0.0654, + "step": 89 + }, + { + "epoch": 0.01189021369356277, + "grad_norm": 0.3608017563819885, + "learning_rate": 0.0001999447310741722, + "loss": 0.0515, + "step": 90 + }, + { + "epoch": 0.012022327179046801, + "grad_norm": 0.28702256083488464, + "learning_rate": 0.0001999433408465658, + "loss": 0.05, + "step": 91 + }, + { + "epoch": 0.012154440664530832, + "grad_norm": 0.8507578372955322, + "learning_rate": 0.00019994193335561594, + "loss": 0.1108, + "step": 92 + }, + { + "epoch": 0.012286554150014863, + "grad_norm": 0.35049304366111755, + "learning_rate": 0.00019994050860156574, + "loss": 0.0513, + "step": 93 + }, + { + "epoch": 0.012418667635498894, + "grad_norm": 0.23222468793392181, + "learning_rate": 0.0001999390665846613, + "loss": 0.0363, + "step": 94 + }, + { + "epoch": 0.012550781120982924, + "grad_norm": 0.5731639862060547, + "learning_rate": 0.00019993760730515166, + "loss": 0.054, + "step": 95 + }, + { + "epoch": 0.012682894606466955, + "grad_norm": 0.26577410101890564, + "learning_rate": 0.00019993613076328898, + "loss": 0.042, + "step": 96 + }, + { + "epoch": 0.012815008091950986, + "grad_norm": 0.33413466811180115, + "learning_rate": 0.0001999346369593282, + "loss": 0.031, + "step": 97 + }, + { + "epoch": 0.012947121577435017, + "grad_norm": 0.311821848154068, + "learning_rate": 0.00019993312589352739, + "loss": 0.0518, + "step": 98 + }, + { + "epoch": 0.013079235062919048, + "grad_norm": 0.4056153893470764, + "learning_rate": 0.00019993159756614759, + "loss": 0.0582, + "step": 99 + }, + { + "epoch": 0.013211348548403078, + "grad_norm": 0.31561562418937683, + "learning_rate": 0.00019993005197745274, + "loss": 0.0554, + "step": 100 + }, + { + "epoch": 0.013343462033887109, + "grad_norm": 0.4322955906391144, + "learning_rate": 0.00019992848912770984, + "loss": 0.0516, + "step": 101 + }, + { + "epoch": 0.01347557551937114, + "grad_norm": 0.2842521369457245, + "learning_rate": 0.0001999269090171888, + "loss": 0.0443, + "step": 102 + }, + { + "epoch": 0.013607689004855171, + "grad_norm": 0.4293564558029175, + "learning_rate": 0.00019992531164616262, + "loss": 0.0679, + "step": 103 + }, + { + "epoch": 0.013739802490339202, + "grad_norm": 0.4246540069580078, + "learning_rate": 0.00019992369701490715, + "loss": 0.0428, + "step": 104 + }, + { + "epoch": 0.013871915975823232, + "grad_norm": 0.4045541286468506, + "learning_rate": 0.00019992206512370135, + "loss": 0.0616, + "step": 105 + }, + { + "epoch": 0.014004029461307263, + "grad_norm": 0.37344667315483093, + "learning_rate": 0.00019992041597282706, + "loss": 0.037, + "step": 106 + }, + { + "epoch": 0.014136142946791294, + "grad_norm": 0.19354045391082764, + "learning_rate": 0.00019991874956256918, + "loss": 0.0255, + "step": 107 + }, + { + "epoch": 0.014268256432275325, + "grad_norm": 0.2800353765487671, + "learning_rate": 0.00019991706589321548, + "loss": 0.0475, + "step": 108 + }, + { + "epoch": 0.014400369917759356, + "grad_norm": 0.29562219977378845, + "learning_rate": 0.00019991536496505682, + "loss": 0.0441, + "step": 109 + }, + { + "epoch": 0.014532483403243385, + "grad_norm": 0.35554012656211853, + "learning_rate": 0.00019991364677838705, + "loss": 0.0477, + "step": 110 + }, + { + "epoch": 0.014664596888727417, + "grad_norm": 0.2925634980201721, + "learning_rate": 0.00019991191133350287, + "loss": 0.031, + "step": 111 + }, + { + "epoch": 0.014796710374211448, + "grad_norm": 0.29444658756256104, + "learning_rate": 0.00019991015863070411, + "loss": 0.0532, + "step": 112 + }, + { + "epoch": 0.014928823859695479, + "grad_norm": 0.3253136873245239, + "learning_rate": 0.00019990838867029348, + "loss": 0.0579, + "step": 113 + }, + { + "epoch": 0.01506093734517951, + "grad_norm": 0.31317800283432007, + "learning_rate": 0.00019990660145257673, + "loss": 0.0486, + "step": 114 + }, + { + "epoch": 0.01519305083066354, + "grad_norm": 0.3204265236854553, + "learning_rate": 0.00019990479697786257, + "loss": 0.0497, + "step": 115 + }, + { + "epoch": 0.01532516431614757, + "grad_norm": 0.3222450911998749, + "learning_rate": 0.0001999029752464627, + "loss": 0.047, + "step": 116 + }, + { + "epoch": 0.015457277801631602, + "grad_norm": 0.4164290726184845, + "learning_rate": 0.00019990113625869172, + "loss": 0.0575, + "step": 117 + }, + { + "epoch": 0.015589391287115633, + "grad_norm": 0.32476532459259033, + "learning_rate": 0.00019989928001486735, + "loss": 0.0426, + "step": 118 + }, + { + "epoch": 0.015721504772599662, + "grad_norm": 0.32922977209091187, + "learning_rate": 0.0001998974065153102, + "loss": 0.0502, + "step": 119 + }, + { + "epoch": 0.015853618258083693, + "grad_norm": 0.5189375281333923, + "learning_rate": 0.0001998955157603439, + "loss": 0.0707, + "step": 120 + }, + { + "epoch": 0.015985731743567724, + "grad_norm": 0.2372635304927826, + "learning_rate": 0.000199893607750295, + "loss": 0.0362, + "step": 121 + }, + { + "epoch": 0.016117845229051755, + "grad_norm": 0.39730942249298096, + "learning_rate": 0.00019989168248549312, + "loss": 0.0472, + "step": 122 + }, + { + "epoch": 0.016249958714535787, + "grad_norm": 0.3153408467769623, + "learning_rate": 0.00019988973996627076, + "loss": 0.0335, + "step": 123 + }, + { + "epoch": 0.016382072200019818, + "grad_norm": 0.38365423679351807, + "learning_rate": 0.0001998877801929635, + "loss": 0.0574, + "step": 124 + }, + { + "epoch": 0.01651418568550385, + "grad_norm": 0.35128024220466614, + "learning_rate": 0.00019988580316590985, + "loss": 0.039, + "step": 125 + }, + { + "epoch": 0.01664629917098788, + "grad_norm": 0.23545370995998383, + "learning_rate": 0.00019988380888545128, + "loss": 0.0374, + "step": 126 + }, + { + "epoch": 0.01677841265647191, + "grad_norm": 0.35600200295448303, + "learning_rate": 0.00019988179735193232, + "loss": 0.0625, + "step": 127 + }, + { + "epoch": 0.01691052614195594, + "grad_norm": 0.3094804286956787, + "learning_rate": 0.00019987976856570034, + "loss": 0.0505, + "step": 128 + }, + { + "epoch": 0.01704263962743997, + "grad_norm": 0.41479748487472534, + "learning_rate": 0.00019987772252710582, + "loss": 0.0468, + "step": 129 + }, + { + "epoch": 0.017174753112924, + "grad_norm": 0.3505192995071411, + "learning_rate": 0.0001998756592365022, + "loss": 0.0562, + "step": 130 + }, + { + "epoch": 0.017306866598408032, + "grad_norm": 0.30042126774787903, + "learning_rate": 0.00019987357869424586, + "loss": 0.046, + "step": 131 + }, + { + "epoch": 0.017438980083892063, + "grad_norm": 0.27621573209762573, + "learning_rate": 0.00019987148090069617, + "loss": 0.0474, + "step": 132 + }, + { + "epoch": 0.017571093569376094, + "grad_norm": 0.28688856959342957, + "learning_rate": 0.00019986936585621542, + "loss": 0.0348, + "step": 133 + }, + { + "epoch": 0.017703207054860125, + "grad_norm": 0.314082533121109, + "learning_rate": 0.00019986723356116905, + "loss": 0.0699, + "step": 134 + }, + { + "epoch": 0.017835320540344157, + "grad_norm": 0.3481959104537964, + "learning_rate": 0.0001998650840159253, + "loss": 0.0376, + "step": 135 + }, + { + "epoch": 0.017967434025828188, + "grad_norm": 0.31359419226646423, + "learning_rate": 0.00019986291722085553, + "loss": 0.0456, + "step": 136 + }, + { + "epoch": 0.01809954751131222, + "grad_norm": 0.29597827792167664, + "learning_rate": 0.00019986073317633394, + "loss": 0.0481, + "step": 137 + }, + { + "epoch": 0.018231660996796246, + "grad_norm": 0.31836578249931335, + "learning_rate": 0.00019985853188273783, + "loss": 0.0566, + "step": 138 + }, + { + "epoch": 0.018363774482280278, + "grad_norm": 0.3767854869365692, + "learning_rate": 0.0001998563133404474, + "loss": 0.0456, + "step": 139 + }, + { + "epoch": 0.01849588796776431, + "grad_norm": 0.5224811434745789, + "learning_rate": 0.0001998540775498459, + "loss": 0.0605, + "step": 140 + }, + { + "epoch": 0.01862800145324834, + "grad_norm": 0.6283369660377502, + "learning_rate": 0.00019985182451131948, + "loss": 0.0737, + "step": 141 + }, + { + "epoch": 0.01876011493873237, + "grad_norm": 0.29250311851501465, + "learning_rate": 0.00019984955422525737, + "loss": 0.0393, + "step": 142 + }, + { + "epoch": 0.018892228424216402, + "grad_norm": 0.3254264295101166, + "learning_rate": 0.00019984726669205167, + "loss": 0.0511, + "step": 143 + }, + { + "epoch": 0.019024341909700433, + "grad_norm": 0.5066094398498535, + "learning_rate": 0.00019984496191209752, + "loss": 0.0519, + "step": 144 + }, + { + "epoch": 0.019156455395184464, + "grad_norm": 0.4091821312904358, + "learning_rate": 0.00019984263988579302, + "loss": 0.0602, + "step": 145 + }, + { + "epoch": 0.019288568880668495, + "grad_norm": 0.3435320556163788, + "learning_rate": 0.00019984030061353925, + "loss": 0.0508, + "step": 146 + }, + { + "epoch": 0.019420682366152527, + "grad_norm": 0.2907693386077881, + "learning_rate": 0.0001998379440957403, + "loss": 0.0469, + "step": 147 + }, + { + "epoch": 0.019552795851636554, + "grad_norm": 0.38646361231803894, + "learning_rate": 0.00019983557033280322, + "loss": 0.0435, + "step": 148 + }, + { + "epoch": 0.019684909337120585, + "grad_norm": 0.3644489347934723, + "learning_rate": 0.000199833179325138, + "loss": 0.0468, + "step": 149 + }, + { + "epoch": 0.019817022822604616, + "grad_norm": 0.3132040202617645, + "learning_rate": 0.00019983077107315768, + "loss": 0.0566, + "step": 150 + }, + { + "epoch": 0.019949136308088648, + "grad_norm": 0.41234928369522095, + "learning_rate": 0.0001998283455772782, + "loss": 0.0571, + "step": 151 + }, + { + "epoch": 0.02008124979357268, + "grad_norm": 0.31212666630744934, + "learning_rate": 0.00019982590283791857, + "loss": 0.0345, + "step": 152 + }, + { + "epoch": 0.02021336327905671, + "grad_norm": 0.27527937293052673, + "learning_rate": 0.00019982344285550068, + "loss": 0.0414, + "step": 153 + }, + { + "epoch": 0.02034547676454074, + "grad_norm": 0.40870988368988037, + "learning_rate": 0.00019982096563044946, + "loss": 0.0633, + "step": 154 + }, + { + "epoch": 0.020477590250024772, + "grad_norm": 0.3136458098888397, + "learning_rate": 0.0001998184711631928, + "loss": 0.0445, + "step": 155 + }, + { + "epoch": 0.020609703735508803, + "grad_norm": 0.26755937933921814, + "learning_rate": 0.00019981595945416157, + "loss": 0.0431, + "step": 156 + }, + { + "epoch": 0.020741817220992834, + "grad_norm": 0.5987271666526794, + "learning_rate": 0.00019981343050378967, + "loss": 0.0578, + "step": 157 + }, + { + "epoch": 0.020873930706476862, + "grad_norm": 1.4551541805267334, + "learning_rate": 0.00019981088431251384, + "loss": 0.0443, + "step": 158 + }, + { + "epoch": 0.021006044191960893, + "grad_norm": 0.3044228255748749, + "learning_rate": 0.00019980832088077396, + "loss": 0.0385, + "step": 159 + }, + { + "epoch": 0.021138157677444924, + "grad_norm": 0.4092259109020233, + "learning_rate": 0.00019980574020901282, + "loss": 0.0492, + "step": 160 + }, + { + "epoch": 0.021270271162928955, + "grad_norm": 0.3194064497947693, + "learning_rate": 0.00019980314229767608, + "loss": 0.0571, + "step": 161 + }, + { + "epoch": 0.021402384648412986, + "grad_norm": 0.27775710821151733, + "learning_rate": 0.00019980052714721263, + "loss": 0.0477, + "step": 162 + }, + { + "epoch": 0.021534498133897018, + "grad_norm": 0.3487389385700226, + "learning_rate": 0.0001997978947580741, + "loss": 0.0427, + "step": 163 + }, + { + "epoch": 0.02166661161938105, + "grad_norm": 0.3832155466079712, + "learning_rate": 0.00019979524513071516, + "loss": 0.045, + "step": 164 + }, + { + "epoch": 0.02179872510486508, + "grad_norm": 0.27927878499031067, + "learning_rate": 0.00019979257826559357, + "loss": 0.0371, + "step": 165 + }, + { + "epoch": 0.02193083859034911, + "grad_norm": 0.2599792182445526, + "learning_rate": 0.00019978989416316988, + "loss": 0.0436, + "step": 166 + }, + { + "epoch": 0.022062952075833142, + "grad_norm": 0.33800897002220154, + "learning_rate": 0.00019978719282390782, + "loss": 0.0624, + "step": 167 + }, + { + "epoch": 0.02219506556131717, + "grad_norm": 0.3244200646877289, + "learning_rate": 0.00019978447424827392, + "loss": 0.0305, + "step": 168 + }, + { + "epoch": 0.0223271790468012, + "grad_norm": 0.3526539206504822, + "learning_rate": 0.00019978173843673779, + "loss": 0.0428, + "step": 169 + }, + { + "epoch": 0.022459292532285232, + "grad_norm": 0.21195152401924133, + "learning_rate": 0.00019977898538977201, + "loss": 0.0326, + "step": 170 + }, + { + "epoch": 0.022591406017769263, + "grad_norm": 0.36155325174331665, + "learning_rate": 0.00019977621510785208, + "loss": 0.0716, + "step": 171 + }, + { + "epoch": 0.022723519503253294, + "grad_norm": 0.31337928771972656, + "learning_rate": 0.00019977342759145653, + "loss": 0.0436, + "step": 172 + }, + { + "epoch": 0.022855632988737325, + "grad_norm": 0.33871617913246155, + "learning_rate": 0.00019977062284106688, + "loss": 0.0496, + "step": 173 + }, + { + "epoch": 0.022987746474221357, + "grad_norm": 0.22005482017993927, + "learning_rate": 0.00019976780085716758, + "loss": 0.0246, + "step": 174 + }, + { + "epoch": 0.023119859959705388, + "grad_norm": 0.4200701117515564, + "learning_rate": 0.00019976496164024604, + "loss": 0.0575, + "step": 175 + }, + { + "epoch": 0.02325197344518942, + "grad_norm": 0.3284203112125397, + "learning_rate": 0.0001997621051907927, + "loss": 0.0565, + "step": 176 + }, + { + "epoch": 0.02338408693067345, + "grad_norm": 0.3615087568759918, + "learning_rate": 0.000199759231509301, + "loss": 0.0535, + "step": 177 + }, + { + "epoch": 0.02351620041615748, + "grad_norm": 0.2843573987483978, + "learning_rate": 0.00019975634059626727, + "loss": 0.0453, + "step": 178 + }, + { + "epoch": 0.02364831390164151, + "grad_norm": 0.3295792043209076, + "learning_rate": 0.00019975343245219086, + "loss": 0.0551, + "step": 179 + }, + { + "epoch": 0.02378042738712554, + "grad_norm": 0.3016244173049927, + "learning_rate": 0.00019975050707757413, + "loss": 0.054, + "step": 180 + }, + { + "epoch": 0.02391254087260957, + "grad_norm": 0.3135812282562256, + "learning_rate": 0.00019974756447292235, + "loss": 0.0415, + "step": 181 + }, + { + "epoch": 0.024044654358093602, + "grad_norm": 0.2880041301250458, + "learning_rate": 0.00019974460463874382, + "loss": 0.0417, + "step": 182 + }, + { + "epoch": 0.024176767843577633, + "grad_norm": 0.23650489747524261, + "learning_rate": 0.0001997416275755498, + "loss": 0.0382, + "step": 183 + }, + { + "epoch": 0.024308881329061664, + "grad_norm": 0.20587410032749176, + "learning_rate": 0.0001997386332838545, + "loss": 0.0285, + "step": 184 + }, + { + "epoch": 0.024440994814545695, + "grad_norm": 0.2216879278421402, + "learning_rate": 0.00019973562176417515, + "loss": 0.0293, + "step": 185 + }, + { + "epoch": 0.024573108300029727, + "grad_norm": 0.3186545968055725, + "learning_rate": 0.0001997325930170319, + "loss": 0.0408, + "step": 186 + }, + { + "epoch": 0.024705221785513758, + "grad_norm": 0.25373074412345886, + "learning_rate": 0.00019972954704294797, + "loss": 0.0323, + "step": 187 + }, + { + "epoch": 0.02483733527099779, + "grad_norm": 0.27224501967430115, + "learning_rate": 0.00019972648384244943, + "loss": 0.0389, + "step": 188 + }, + { + "epoch": 0.024969448756481816, + "grad_norm": 0.2983395755290985, + "learning_rate": 0.00019972340341606546, + "loss": 0.0511, + "step": 189 + }, + { + "epoch": 0.025101562241965848, + "grad_norm": 0.2117580771446228, + "learning_rate": 0.00019972030576432807, + "loss": 0.0279, + "step": 190 + }, + { + "epoch": 0.02523367572744988, + "grad_norm": 0.3268132507801056, + "learning_rate": 0.00019971719088777236, + "loss": 0.0526, + "step": 191 + }, + { + "epoch": 0.02536578921293391, + "grad_norm": 0.295146644115448, + "learning_rate": 0.00019971405878693637, + "loss": 0.0371, + "step": 192 + }, + { + "epoch": 0.02549790269841794, + "grad_norm": 0.3200048804283142, + "learning_rate": 0.00019971090946236108, + "loss": 0.0454, + "step": 193 + }, + { + "epoch": 0.025630016183901972, + "grad_norm": 0.30760833621025085, + "learning_rate": 0.00019970774291459053, + "loss": 0.0477, + "step": 194 + }, + { + "epoch": 0.025762129669386003, + "grad_norm": 0.28404486179351807, + "learning_rate": 0.00019970455914417165, + "loss": 0.0317, + "step": 195 + }, + { + "epoch": 0.025894243154870034, + "grad_norm": 0.271648645401001, + "learning_rate": 0.00019970135815165438, + "loss": 0.0354, + "step": 196 + }, + { + "epoch": 0.026026356640354065, + "grad_norm": 0.3435926139354706, + "learning_rate": 0.00019969813993759162, + "loss": 0.0443, + "step": 197 + }, + { + "epoch": 0.026158470125838097, + "grad_norm": 0.31753024458885193, + "learning_rate": 0.00019969490450253932, + "loss": 0.0439, + "step": 198 + }, + { + "epoch": 0.026290583611322124, + "grad_norm": 0.4796743094921112, + "learning_rate": 0.00019969165184705623, + "loss": 0.0335, + "step": 199 + }, + { + "epoch": 0.026422697096806155, + "grad_norm": 0.47100892663002014, + "learning_rate": 0.00019968838197170427, + "loss": 0.0516, + "step": 200 + }, + { + "epoch": 0.026554810582290186, + "grad_norm": 0.32624903321266174, + "learning_rate": 0.0001996850948770482, + "loss": 0.0431, + "step": 201 + }, + { + "epoch": 0.026686924067774218, + "grad_norm": 0.2607976198196411, + "learning_rate": 0.00019968179056365588, + "loss": 0.0306, + "step": 202 + }, + { + "epoch": 0.02681903755325825, + "grad_norm": 0.3531661331653595, + "learning_rate": 0.000199678469032098, + "loss": 0.0474, + "step": 203 + }, + { + "epoch": 0.02695115103874228, + "grad_norm": 0.3134405314922333, + "learning_rate": 0.0001996751302829483, + "loss": 0.0518, + "step": 204 + }, + { + "epoch": 0.02708326452422631, + "grad_norm": 0.26866933703422546, + "learning_rate": 0.00019967177431678347, + "loss": 0.0375, + "step": 205 + }, + { + "epoch": 0.027215378009710342, + "grad_norm": 0.30498382449150085, + "learning_rate": 0.00019966840113418326, + "loss": 0.033, + "step": 206 + }, + { + "epoch": 0.027347491495194373, + "grad_norm": 0.27717041969299316, + "learning_rate": 0.00019966501073573025, + "loss": 0.0387, + "step": 207 + }, + { + "epoch": 0.027479604980678404, + "grad_norm": 0.3919484615325928, + "learning_rate": 0.0001996616031220101, + "loss": 0.0434, + "step": 208 + }, + { + "epoch": 0.027611718466162432, + "grad_norm": 0.31150931119918823, + "learning_rate": 0.00019965817829361145, + "loss": 0.0352, + "step": 209 + }, + { + "epoch": 0.027743831951646463, + "grad_norm": 1.0243334770202637, + "learning_rate": 0.0001996547362511258, + "loss": 0.0719, + "step": 210 + }, + { + "epoch": 0.027875945437130494, + "grad_norm": 0.4097023904323578, + "learning_rate": 0.00019965127699514773, + "loss": 0.0503, + "step": 211 + }, + { + "epoch": 0.028008058922614525, + "grad_norm": 0.29546260833740234, + "learning_rate": 0.00019964780052627478, + "loss": 0.0484, + "step": 212 + }, + { + "epoch": 0.028140172408098556, + "grad_norm": 0.3095148801803589, + "learning_rate": 0.00019964430684510744, + "loss": 0.0548, + "step": 213 + }, + { + "epoch": 0.028272285893582588, + "grad_norm": 0.37686797976493835, + "learning_rate": 0.00019964079595224919, + "loss": 0.0559, + "step": 214 + }, + { + "epoch": 0.02840439937906662, + "grad_norm": 0.352685809135437, + "learning_rate": 0.0001996372678483064, + "loss": 0.0424, + "step": 215 + }, + { + "epoch": 0.02853651286455065, + "grad_norm": 0.30168718099594116, + "learning_rate": 0.00019963372253388857, + "loss": 0.0392, + "step": 216 + }, + { + "epoch": 0.02866862635003468, + "grad_norm": 0.3077027201652527, + "learning_rate": 0.00019963016000960803, + "loss": 0.043, + "step": 217 + }, + { + "epoch": 0.028800739835518712, + "grad_norm": 0.1910567730665207, + "learning_rate": 0.00019962658027608017, + "loss": 0.0266, + "step": 218 + }, + { + "epoch": 0.02893285332100274, + "grad_norm": 0.3267483115196228, + "learning_rate": 0.00019962298333392331, + "loss": 0.0466, + "step": 219 + }, + { + "epoch": 0.02906496680648677, + "grad_norm": 0.2665257155895233, + "learning_rate": 0.00019961936918375876, + "loss": 0.0373, + "step": 220 + }, + { + "epoch": 0.029197080291970802, + "grad_norm": 0.3253210484981537, + "learning_rate": 0.0001996157378262108, + "loss": 0.0416, + "step": 221 + }, + { + "epoch": 0.029329193777454833, + "grad_norm": 0.32253530621528625, + "learning_rate": 0.00019961208926190668, + "loss": 0.0369, + "step": 222 + }, + { + "epoch": 0.029461307262938864, + "grad_norm": 0.3137173652648926, + "learning_rate": 0.00019960842349147658, + "loss": 0.028, + "step": 223 + }, + { + "epoch": 0.029593420748422895, + "grad_norm": 0.3122856020927429, + "learning_rate": 0.00019960474051555372, + "loss": 0.0547, + "step": 224 + }, + { + "epoch": 0.029725534233906926, + "grad_norm": 0.3124530613422394, + "learning_rate": 0.00019960104033477433, + "loss": 0.0529, + "step": 225 + }, + { + "epoch": 0.029857647719390958, + "grad_norm": 0.3381507992744446, + "learning_rate": 0.00019959732294977744, + "loss": 0.0457, + "step": 226 + }, + { + "epoch": 0.02998976120487499, + "grad_norm": 0.25204360485076904, + "learning_rate": 0.00019959358836120524, + "loss": 0.0262, + "step": 227 + }, + { + "epoch": 0.03012187469035902, + "grad_norm": 0.21586604416370392, + "learning_rate": 0.00019958983656970277, + "loss": 0.0328, + "step": 228 + }, + { + "epoch": 0.030253988175843047, + "grad_norm": 0.32628804445266724, + "learning_rate": 0.00019958606757591806, + "loss": 0.0332, + "step": 229 + }, + { + "epoch": 0.03038610166132708, + "grad_norm": 0.31612610816955566, + "learning_rate": 0.00019958228138050222, + "loss": 0.0358, + "step": 230 + }, + { + "epoch": 0.03051821514681111, + "grad_norm": 0.2597368061542511, + "learning_rate": 0.00019957847798410914, + "loss": 0.0321, + "step": 231 + }, + { + "epoch": 0.03065032863229514, + "grad_norm": 0.2700968384742737, + "learning_rate": 0.00019957465738739587, + "loss": 0.0507, + "step": 232 + }, + { + "epoch": 0.030782442117779172, + "grad_norm": 0.3140323758125305, + "learning_rate": 0.0001995708195910223, + "loss": 0.06, + "step": 233 + }, + { + "epoch": 0.030914555603263203, + "grad_norm": 0.2122896909713745, + "learning_rate": 0.00019956696459565133, + "loss": 0.0308, + "step": 234 + }, + { + "epoch": 0.031046669088747234, + "grad_norm": 0.30587294697761536, + "learning_rate": 0.00019956309240194887, + "loss": 0.0398, + "step": 235 + }, + { + "epoch": 0.031178782574231265, + "grad_norm": 0.3328923285007477, + "learning_rate": 0.00019955920301058377, + "loss": 0.0703, + "step": 236 + }, + { + "epoch": 0.03131089605971529, + "grad_norm": 0.25456702709198, + "learning_rate": 0.00019955529642222782, + "loss": 0.0384, + "step": 237 + }, + { + "epoch": 0.031443009545199324, + "grad_norm": 0.4323252737522125, + "learning_rate": 0.00019955137263755584, + "loss": 0.0473, + "step": 238 + }, + { + "epoch": 0.031575123030683355, + "grad_norm": 0.2853761911392212, + "learning_rate": 0.00019954743165724554, + "loss": 0.0463, + "step": 239 + }, + { + "epoch": 0.031707236516167386, + "grad_norm": 0.24068021774291992, + "learning_rate": 0.00019954347348197772, + "loss": 0.0422, + "step": 240 + }, + { + "epoch": 0.03183935000165142, + "grad_norm": 0.24711669981479645, + "learning_rate": 0.00019953949811243602, + "loss": 0.048, + "step": 241 + }, + { + "epoch": 0.03197146348713545, + "grad_norm": 0.3120286464691162, + "learning_rate": 0.00019953550554930715, + "loss": 0.0297, + "step": 242 + }, + { + "epoch": 0.03210357697261948, + "grad_norm": 0.35250943899154663, + "learning_rate": 0.00019953149579328075, + "loss": 0.028, + "step": 243 + }, + { + "epoch": 0.03223569045810351, + "grad_norm": 0.26348239183425903, + "learning_rate": 0.00019952746884504942, + "loss": 0.037, + "step": 244 + }, + { + "epoch": 0.03236780394358754, + "grad_norm": 0.23028039932250977, + "learning_rate": 0.00019952342470530874, + "loss": 0.0279, + "step": 245 + }, + { + "epoch": 0.03249991742907157, + "grad_norm": 0.3633415102958679, + "learning_rate": 0.00019951936337475723, + "loss": 0.0627, + "step": 246 + }, + { + "epoch": 0.032632030914555604, + "grad_norm": 0.582521915435791, + "learning_rate": 0.00019951528485409646, + "loss": 0.0819, + "step": 247 + }, + { + "epoch": 0.032764144400039635, + "grad_norm": 0.28156954050064087, + "learning_rate": 0.0001995111891440309, + "loss": 0.044, + "step": 248 + }, + { + "epoch": 0.032896257885523666, + "grad_norm": 0.4584990441799164, + "learning_rate": 0.000199507076245268, + "loss": 0.0534, + "step": 249 + }, + { + "epoch": 0.0330283713710077, + "grad_norm": 0.45770400762557983, + "learning_rate": 0.00019950294615851818, + "loss": 0.0524, + "step": 250 + }, + { + "epoch": 0.03316048485649173, + "grad_norm": 0.23415185511112213, + "learning_rate": 0.00019949879888449487, + "loss": 0.0278, + "step": 251 + }, + { + "epoch": 0.03329259834197576, + "grad_norm": 0.23550723493099213, + "learning_rate": 0.00019949463442391437, + "loss": 0.0466, + "step": 252 + }, + { + "epoch": 0.03342471182745979, + "grad_norm": 0.21547628939151764, + "learning_rate": 0.00019949045277749608, + "loss": 0.0346, + "step": 253 + }, + { + "epoch": 0.03355682531294382, + "grad_norm": 0.28360670804977417, + "learning_rate": 0.0001994862539459623, + "loss": 0.0447, + "step": 254 + }, + { + "epoch": 0.033688938798427846, + "grad_norm": 0.22443735599517822, + "learning_rate": 0.00019948203793003822, + "loss": 0.0446, + "step": 255 + }, + { + "epoch": 0.03382105228391188, + "grad_norm": 0.46867096424102783, + "learning_rate": 0.00019947780473045216, + "loss": 0.0703, + "step": 256 + }, + { + "epoch": 0.03395316576939591, + "grad_norm": 0.24217675626277924, + "learning_rate": 0.00019947355434793526, + "loss": 0.0486, + "step": 257 + }, + { + "epoch": 0.03408527925487994, + "grad_norm": 0.2256115823984146, + "learning_rate": 0.00019946928678322173, + "loss": 0.0383, + "step": 258 + }, + { + "epoch": 0.03421739274036397, + "grad_norm": 0.2267841249704361, + "learning_rate": 0.00019946500203704877, + "loss": 0.0366, + "step": 259 + }, + { + "epoch": 0.034349506225848, + "grad_norm": 0.330691397190094, + "learning_rate": 0.00019946070011015642, + "loss": 0.0473, + "step": 260 + }, + { + "epoch": 0.03448161971133203, + "grad_norm": 0.3271361291408539, + "learning_rate": 0.0001994563810032877, + "loss": 0.0391, + "step": 261 + }, + { + "epoch": 0.034613733196816064, + "grad_norm": 0.23788490891456604, + "learning_rate": 0.0001994520447171888, + "loss": 0.0251, + "step": 262 + }, + { + "epoch": 0.034745846682300095, + "grad_norm": 0.37657663226127625, + "learning_rate": 0.00019944769125260862, + "loss": 0.0442, + "step": 263 + }, + { + "epoch": 0.034877960167784126, + "grad_norm": 0.37910810112953186, + "learning_rate": 0.0001994433206102992, + "loss": 0.0411, + "step": 264 + }, + { + "epoch": 0.03501007365326816, + "grad_norm": 0.4243963956832886, + "learning_rate": 0.00019943893279101543, + "loss": 0.0621, + "step": 265 + }, + { + "epoch": 0.03514218713875219, + "grad_norm": 0.3931756019592285, + "learning_rate": 0.0001994345277955153, + "loss": 0.0544, + "step": 266 + }, + { + "epoch": 0.03527430062423622, + "grad_norm": 0.5544042587280273, + "learning_rate": 0.00019943010562455962, + "loss": 0.046, + "step": 267 + }, + { + "epoch": 0.03540641410972025, + "grad_norm": 0.3359571695327759, + "learning_rate": 0.0001994256662789123, + "loss": 0.0476, + "step": 268 + }, + { + "epoch": 0.03553852759520428, + "grad_norm": 0.26182490587234497, + "learning_rate": 0.00019942120975934008, + "loss": 0.0301, + "step": 269 + }, + { + "epoch": 0.03567064108068831, + "grad_norm": 0.30689841508865356, + "learning_rate": 0.00019941673606661277, + "loss": 0.0414, + "step": 270 + }, + { + "epoch": 0.035802754566172344, + "grad_norm": 1.3792670965194702, + "learning_rate": 0.00019941224520150314, + "loss": 0.0793, + "step": 271 + }, + { + "epoch": 0.035934868051656375, + "grad_norm": 0.2693886160850525, + "learning_rate": 0.0001994077371647869, + "loss": 0.038, + "step": 272 + }, + { + "epoch": 0.036066981537140406, + "grad_norm": 0.26960980892181396, + "learning_rate": 0.0001994032119572427, + "loss": 0.0331, + "step": 273 + }, + { + "epoch": 0.03619909502262444, + "grad_norm": 0.3784126937389374, + "learning_rate": 0.00019939866957965224, + "loss": 0.0527, + "step": 274 + }, + { + "epoch": 0.03633120850810846, + "grad_norm": 0.308722585439682, + "learning_rate": 0.00019939411003280007, + "loss": 0.0576, + "step": 275 + }, + { + "epoch": 0.03646332199359249, + "grad_norm": 0.3805916905403137, + "learning_rate": 0.0001993895333174738, + "loss": 0.0417, + "step": 276 + }, + { + "epoch": 0.036595435479076524, + "grad_norm": 0.39375531673431396, + "learning_rate": 0.00019938493943446394, + "loss": 0.0544, + "step": 277 + }, + { + "epoch": 0.036727548964560555, + "grad_norm": 0.37296605110168457, + "learning_rate": 0.000199380328384564, + "loss": 0.0424, + "step": 278 + }, + { + "epoch": 0.036859662450044586, + "grad_norm": 0.5127437114715576, + "learning_rate": 0.00019937570016857054, + "loss": 0.0578, + "step": 279 + }, + { + "epoch": 0.03699177593552862, + "grad_norm": 0.20692554116249084, + "learning_rate": 0.00019937105478728292, + "loss": 0.0304, + "step": 280 + }, + { + "epoch": 0.03712388942101265, + "grad_norm": 0.2890003025531769, + "learning_rate": 0.0001993663922415035, + "loss": 0.0408, + "step": 281 + }, + { + "epoch": 0.03725600290649668, + "grad_norm": 0.3227050006389618, + "learning_rate": 0.00019936171253203772, + "loss": 0.0573, + "step": 282 + }, + { + "epoch": 0.03738811639198071, + "grad_norm": 0.3141216039657593, + "learning_rate": 0.00019935701565969391, + "loss": 0.0452, + "step": 283 + }, + { + "epoch": 0.03752022987746474, + "grad_norm": 0.3515707552433014, + "learning_rate": 0.00019935230162528334, + "loss": 0.0557, + "step": 284 + }, + { + "epoch": 0.03765234336294877, + "grad_norm": 0.24839921295642853, + "learning_rate": 0.0001993475704296203, + "loss": 0.041, + "step": 285 + }, + { + "epoch": 0.037784456848432804, + "grad_norm": 0.2833830714225769, + "learning_rate": 0.00019934282207352197, + "loss": 0.0503, + "step": 286 + }, + { + "epoch": 0.037916570333916835, + "grad_norm": 0.25666195154190063, + "learning_rate": 0.0001993380565578086, + "loss": 0.0413, + "step": 287 + }, + { + "epoch": 0.038048683819400866, + "grad_norm": 0.41307997703552246, + "learning_rate": 0.00019933327388330327, + "loss": 0.042, + "step": 288 + }, + { + "epoch": 0.0381807973048849, + "grad_norm": 0.28401657938957214, + "learning_rate": 0.00019932847405083214, + "loss": 0.0391, + "step": 289 + }, + { + "epoch": 0.03831291079036893, + "grad_norm": 0.2572495937347412, + "learning_rate": 0.00019932365706122433, + "loss": 0.0392, + "step": 290 + }, + { + "epoch": 0.03844502427585296, + "grad_norm": 0.2577889561653137, + "learning_rate": 0.00019931882291531183, + "loss": 0.0304, + "step": 291 + }, + { + "epoch": 0.03857713776133699, + "grad_norm": 0.25664687156677246, + "learning_rate": 0.00019931397161392965, + "loss": 0.0364, + "step": 292 + }, + { + "epoch": 0.03870925124682102, + "grad_norm": 0.3059316873550415, + "learning_rate": 0.00019930910315791577, + "loss": 0.0412, + "step": 293 + }, + { + "epoch": 0.03884136473230505, + "grad_norm": 0.20961342751979828, + "learning_rate": 0.00019930421754811112, + "loss": 0.0317, + "step": 294 + }, + { + "epoch": 0.038973478217789084, + "grad_norm": 0.2327074110507965, + "learning_rate": 0.00019929931478535965, + "loss": 0.0391, + "step": 295 + }, + { + "epoch": 0.03910559170327311, + "grad_norm": 0.2746254503726959, + "learning_rate": 0.00019929439487050812, + "loss": 0.017, + "step": 296 + }, + { + "epoch": 0.03923770518875714, + "grad_norm": 0.7959591150283813, + "learning_rate": 0.00019928945780440645, + "loss": 0.0459, + "step": 297 + }, + { + "epoch": 0.03936981867424117, + "grad_norm": 0.2347254902124405, + "learning_rate": 0.0001992845035879074, + "loss": 0.0416, + "step": 298 + }, + { + "epoch": 0.0395019321597252, + "grad_norm": 0.34481900930404663, + "learning_rate": 0.00019927953222186666, + "loss": 0.0609, + "step": 299 + }, + { + "epoch": 0.03963404564520923, + "grad_norm": 0.46709200739860535, + "learning_rate": 0.000199274543707143, + "loss": 0.0428, + "step": 300 + }, + { + "epoch": 0.039766159130693264, + "grad_norm": 0.29546868801116943, + "learning_rate": 0.0001992695380445981, + "loss": 0.0362, + "step": 301 + }, + { + "epoch": 0.039898272616177295, + "grad_norm": 0.45198675990104675, + "learning_rate": 0.00019926451523509653, + "loss": 0.0766, + "step": 302 + }, + { + "epoch": 0.040030386101661326, + "grad_norm": 0.25279825925827026, + "learning_rate": 0.00019925947527950596, + "loss": 0.051, + "step": 303 + }, + { + "epoch": 0.04016249958714536, + "grad_norm": 0.22193700075149536, + "learning_rate": 0.0001992544181786969, + "loss": 0.0299, + "step": 304 + }, + { + "epoch": 0.04029461307262939, + "grad_norm": 0.3298232853412628, + "learning_rate": 0.00019924934393354292, + "loss": 0.0468, + "step": 305 + }, + { + "epoch": 0.04042672655811342, + "grad_norm": 0.22312577068805695, + "learning_rate": 0.00019924425254492042, + "loss": 0.0385, + "step": 306 + }, + { + "epoch": 0.04055884004359745, + "grad_norm": 0.27001601457595825, + "learning_rate": 0.00019923914401370893, + "loss": 0.0292, + "step": 307 + }, + { + "epoch": 0.04069095352908148, + "grad_norm": 0.2545408010482788, + "learning_rate": 0.0001992340183407908, + "loss": 0.043, + "step": 308 + }, + { + "epoch": 0.04082306701456551, + "grad_norm": 0.22949855029582977, + "learning_rate": 0.0001992288755270514, + "loss": 0.0321, + "step": 309 + }, + { + "epoch": 0.040955180500049544, + "grad_norm": 0.44335100054740906, + "learning_rate": 0.00019922371557337906, + "loss": 0.0524, + "step": 310 + }, + { + "epoch": 0.041087293985533575, + "grad_norm": 0.46752268075942993, + "learning_rate": 0.00019921853848066506, + "loss": 0.0432, + "step": 311 + }, + { + "epoch": 0.041219407471017606, + "grad_norm": 0.4074217677116394, + "learning_rate": 0.0001992133442498037, + "loss": 0.0359, + "step": 312 + }, + { + "epoch": 0.04135152095650164, + "grad_norm": 0.33330097794532776, + "learning_rate": 0.00019920813288169212, + "loss": 0.0736, + "step": 313 + }, + { + "epoch": 0.04148363444198567, + "grad_norm": 0.352234810590744, + "learning_rate": 0.00019920290437723046, + "loss": 0.0338, + "step": 314 + }, + { + "epoch": 0.0416157479274697, + "grad_norm": 0.27098697423934937, + "learning_rate": 0.00019919765873732193, + "loss": 0.0482, + "step": 315 + }, + { + "epoch": 0.041747861412953724, + "grad_norm": 0.29560789465904236, + "learning_rate": 0.00019919239596287257, + "loss": 0.0426, + "step": 316 + }, + { + "epoch": 0.041879974898437755, + "grad_norm": 0.22003328800201416, + "learning_rate": 0.00019918711605479146, + "loss": 0.0299, + "step": 317 + }, + { + "epoch": 0.042012088383921786, + "grad_norm": 0.20728716254234314, + "learning_rate": 0.00019918181901399057, + "loss": 0.0335, + "step": 318 + }, + { + "epoch": 0.04214420186940582, + "grad_norm": 0.31240910291671753, + "learning_rate": 0.00019917650484138486, + "loss": 0.042, + "step": 319 + }, + { + "epoch": 0.04227631535488985, + "grad_norm": 0.3703945279121399, + "learning_rate": 0.00019917117353789225, + "loss": 0.0403, + "step": 320 + }, + { + "epoch": 0.04240842884037388, + "grad_norm": 0.2775070071220398, + "learning_rate": 0.00019916582510443368, + "loss": 0.0405, + "step": 321 + }, + { + "epoch": 0.04254054232585791, + "grad_norm": 0.2771599292755127, + "learning_rate": 0.00019916045954193292, + "loss": 0.0455, + "step": 322 + }, + { + "epoch": 0.04267265581134194, + "grad_norm": 0.4107271432876587, + "learning_rate": 0.00019915507685131685, + "loss": 0.0458, + "step": 323 + }, + { + "epoch": 0.04280476929682597, + "grad_norm": 0.38203802704811096, + "learning_rate": 0.00019914967703351513, + "loss": 0.0495, + "step": 324 + }, + { + "epoch": 0.042936882782310004, + "grad_norm": 0.26967763900756836, + "learning_rate": 0.00019914426008946058, + "loss": 0.0364, + "step": 325 + }, + { + "epoch": 0.043068996267794035, + "grad_norm": 0.39296478033065796, + "learning_rate": 0.00019913882602008877, + "loss": 0.045, + "step": 326 + }, + { + "epoch": 0.043201109753278066, + "grad_norm": 0.2375117838382721, + "learning_rate": 0.00019913337482633844, + "loss": 0.0359, + "step": 327 + }, + { + "epoch": 0.0433332232387621, + "grad_norm": 0.3346708118915558, + "learning_rate": 0.00019912790650915112, + "loss": 0.0376, + "step": 328 + }, + { + "epoch": 0.04346533672424613, + "grad_norm": 0.26708805561065674, + "learning_rate": 0.00019912242106947137, + "loss": 0.0325, + "step": 329 + }, + { + "epoch": 0.04359745020973016, + "grad_norm": 0.3279438316822052, + "learning_rate": 0.0001991169185082467, + "loss": 0.0437, + "step": 330 + }, + { + "epoch": 0.04372956369521419, + "grad_norm": 0.12221206724643707, + "learning_rate": 0.00019911139882642758, + "loss": 0.0154, + "step": 331 + }, + { + "epoch": 0.04386167718069822, + "grad_norm": 0.2495148479938507, + "learning_rate": 0.00019910586202496742, + "loss": 0.0464, + "step": 332 + }, + { + "epoch": 0.04399379066618225, + "grad_norm": 0.4149893820285797, + "learning_rate": 0.0001991003081048226, + "loss": 0.0423, + "step": 333 + }, + { + "epoch": 0.044125904151666284, + "grad_norm": 0.33313971757888794, + "learning_rate": 0.00019909473706695245, + "loss": 0.0556, + "step": 334 + }, + { + "epoch": 0.044258017637150315, + "grad_norm": 0.2713533341884613, + "learning_rate": 0.00019908914891231927, + "loss": 0.0279, + "step": 335 + }, + { + "epoch": 0.04439013112263434, + "grad_norm": 0.2693321704864502, + "learning_rate": 0.00019908354364188836, + "loss": 0.0571, + "step": 336 + }, + { + "epoch": 0.04452224460811837, + "grad_norm": 0.2995421290397644, + "learning_rate": 0.00019907792125662782, + "loss": 0.0404, + "step": 337 + }, + { + "epoch": 0.0446543580936024, + "grad_norm": 0.2042950689792633, + "learning_rate": 0.0001990722817575089, + "loss": 0.0411, + "step": 338 + }, + { + "epoch": 0.04478647157908643, + "grad_norm": 0.21087051928043365, + "learning_rate": 0.0001990666251455057, + "loss": 0.0291, + "step": 339 + }, + { + "epoch": 0.044918585064570464, + "grad_norm": 0.24781399965286255, + "learning_rate": 0.00019906095142159524, + "loss": 0.0322, + "step": 340 + }, + { + "epoch": 0.045050698550054495, + "grad_norm": 0.25378474593162537, + "learning_rate": 0.00019905526058675764, + "loss": 0.0461, + "step": 341 + }, + { + "epoch": 0.045182812035538526, + "grad_norm": 0.25275951623916626, + "learning_rate": 0.00019904955264197577, + "loss": 0.0397, + "step": 342 + }, + { + "epoch": 0.04531492552102256, + "grad_norm": 0.2260797619819641, + "learning_rate": 0.0001990438275882357, + "loss": 0.0413, + "step": 343 + }, + { + "epoch": 0.04544703900650659, + "grad_norm": 0.18803325295448303, + "learning_rate": 0.00019903808542652625, + "loss": 0.0256, + "step": 344 + }, + { + "epoch": 0.04557915249199062, + "grad_norm": 0.33592838048934937, + "learning_rate": 0.0001990323261578393, + "loss": 0.0469, + "step": 345 + }, + { + "epoch": 0.04571126597747465, + "grad_norm": 0.20926974713802338, + "learning_rate": 0.00019902654978316958, + "loss": 0.0349, + "step": 346 + }, + { + "epoch": 0.04584337946295868, + "grad_norm": 0.22873865067958832, + "learning_rate": 0.00019902075630351496, + "loss": 0.032, + "step": 347 + }, + { + "epoch": 0.04597549294844271, + "grad_norm": 0.3676367998123169, + "learning_rate": 0.0001990149457198761, + "loss": 0.0507, + "step": 348 + }, + { + "epoch": 0.046107606433926744, + "grad_norm": 0.23917162418365479, + "learning_rate": 0.0001990091180332567, + "loss": 0.0455, + "step": 349 + }, + { + "epoch": 0.046239719919410775, + "grad_norm": 0.2903183102607727, + "learning_rate": 0.0001990032732446633, + "loss": 0.0466, + "step": 350 + }, + { + "epoch": 0.046371833404894806, + "grad_norm": 0.19862964749336243, + "learning_rate": 0.0001989974113551056, + "loss": 0.0243, + "step": 351 + }, + { + "epoch": 0.04650394689037884, + "grad_norm": 0.4329935312271118, + "learning_rate": 0.00019899153236559603, + "loss": 0.0513, + "step": 352 + }, + { + "epoch": 0.04663606037586287, + "grad_norm": 0.21616338193416595, + "learning_rate": 0.0001989856362771501, + "loss": 0.0337, + "step": 353 + }, + { + "epoch": 0.0467681738613469, + "grad_norm": 0.3290994465351105, + "learning_rate": 0.00019897972309078628, + "loss": 0.0327, + "step": 354 + }, + { + "epoch": 0.04690028734683093, + "grad_norm": 0.6223381757736206, + "learning_rate": 0.00019897379280752598, + "loss": 0.06, + "step": 355 + }, + { + "epoch": 0.04703240083231496, + "grad_norm": 0.2321864515542984, + "learning_rate": 0.0001989678454283935, + "loss": 0.0413, + "step": 356 + }, + { + "epoch": 0.047164514317798986, + "grad_norm": 0.2947792708873749, + "learning_rate": 0.00019896188095441613, + "loss": 0.0541, + "step": 357 + }, + { + "epoch": 0.04729662780328302, + "grad_norm": 0.307486355304718, + "learning_rate": 0.00019895589938662416, + "loss": 0.0485, + "step": 358 + }, + { + "epoch": 0.04742874128876705, + "grad_norm": 0.2538960576057434, + "learning_rate": 0.0001989499007260508, + "loss": 0.0393, + "step": 359 + }, + { + "epoch": 0.04756085477425108, + "grad_norm": 0.21938273310661316, + "learning_rate": 0.00019894388497373214, + "loss": 0.0272, + "step": 360 + }, + { + "epoch": 0.04769296825973511, + "grad_norm": 0.2789864242076874, + "learning_rate": 0.00019893785213070733, + "loss": 0.0318, + "step": 361 + }, + { + "epoch": 0.04782508174521914, + "grad_norm": 0.23993203043937683, + "learning_rate": 0.00019893180219801844, + "loss": 0.0355, + "step": 362 + }, + { + "epoch": 0.04795719523070317, + "grad_norm": 0.3103472590446472, + "learning_rate": 0.00019892573517671047, + "loss": 0.0384, + "step": 363 + }, + { + "epoch": 0.048089308716187204, + "grad_norm": 0.31523704528808594, + "learning_rate": 0.0001989196510678314, + "loss": 0.0326, + "step": 364 + }, + { + "epoch": 0.048221422201671235, + "grad_norm": 0.33698925375938416, + "learning_rate": 0.00019891354987243217, + "loss": 0.0527, + "step": 365 + }, + { + "epoch": 0.048353535687155266, + "grad_norm": 0.22326672077178955, + "learning_rate": 0.00019890743159156656, + "loss": 0.0345, + "step": 366 + }, + { + "epoch": 0.0484856491726393, + "grad_norm": 0.32215416431427, + "learning_rate": 0.00019890129622629146, + "loss": 0.0706, + "step": 367 + }, + { + "epoch": 0.04861776265812333, + "grad_norm": 0.31839343905448914, + "learning_rate": 0.00019889514377766662, + "loss": 0.038, + "step": 368 + }, + { + "epoch": 0.04874987614360736, + "grad_norm": 0.23498553037643433, + "learning_rate": 0.00019888897424675476, + "loss": 0.0279, + "step": 369 + }, + { + "epoch": 0.04888198962909139, + "grad_norm": 0.29461991786956787, + "learning_rate": 0.00019888278763462158, + "loss": 0.0339, + "step": 370 + }, + { + "epoch": 0.04901410311457542, + "grad_norm": 0.27036118507385254, + "learning_rate": 0.00019887658394233563, + "loss": 0.0353, + "step": 371 + }, + { + "epoch": 0.04914621660005945, + "grad_norm": 0.24991001188755035, + "learning_rate": 0.00019887036317096856, + "loss": 0.03, + "step": 372 + }, + { + "epoch": 0.049278330085543484, + "grad_norm": 0.22536452114582062, + "learning_rate": 0.00019886412532159486, + "loss": 0.035, + "step": 373 + }, + { + "epoch": 0.049410443571027515, + "grad_norm": 0.3040442168712616, + "learning_rate": 0.00019885787039529198, + "loss": 0.0324, + "step": 374 + }, + { + "epoch": 0.049542557056511546, + "grad_norm": 0.32448676228523254, + "learning_rate": 0.00019885159839314035, + "loss": 0.0289, + "step": 375 + }, + { + "epoch": 0.04967467054199558, + "grad_norm": 0.3627474904060364, + "learning_rate": 0.0001988453093162234, + "loss": 0.0639, + "step": 376 + }, + { + "epoch": 0.0498067840274796, + "grad_norm": 0.30182796716690063, + "learning_rate": 0.00019883900316562735, + "loss": 0.037, + "step": 377 + }, + { + "epoch": 0.04993889751296363, + "grad_norm": 0.21517710387706757, + "learning_rate": 0.00019883267994244154, + "loss": 0.0351, + "step": 378 + }, + { + "epoch": 0.050071010998447664, + "grad_norm": 0.22155945003032684, + "learning_rate": 0.0001988263396477582, + "loss": 0.0228, + "step": 379 + }, + { + "epoch": 0.050203124483931695, + "grad_norm": 0.2640284299850464, + "learning_rate": 0.00019881998228267245, + "loss": 0.0475, + "step": 380 + }, + { + "epoch": 0.050335237969415726, + "grad_norm": 0.38280588388442993, + "learning_rate": 0.00019881360784828242, + "loss": 0.0468, + "step": 381 + }, + { + "epoch": 0.05046735145489976, + "grad_norm": 0.23366643488407135, + "learning_rate": 0.0001988072163456892, + "loss": 0.0317, + "step": 382 + }, + { + "epoch": 0.05059946494038379, + "grad_norm": 0.2938922643661499, + "learning_rate": 0.00019880080777599673, + "loss": 0.0419, + "step": 383 + }, + { + "epoch": 0.05073157842586782, + "grad_norm": 0.23098677396774292, + "learning_rate": 0.00019879438214031206, + "loss": 0.0291, + "step": 384 + }, + { + "epoch": 0.05086369191135185, + "grad_norm": 0.32484182715415955, + "learning_rate": 0.00019878793943974506, + "loss": 0.0349, + "step": 385 + }, + { + "epoch": 0.05099580539683588, + "grad_norm": 0.356143593788147, + "learning_rate": 0.00019878147967540859, + "loss": 0.0435, + "step": 386 + }, + { + "epoch": 0.05112791888231991, + "grad_norm": 0.24577073752880096, + "learning_rate": 0.00019877500284841846, + "loss": 0.0459, + "step": 387 + }, + { + "epoch": 0.051260032367803944, + "grad_norm": 0.32349520921707153, + "learning_rate": 0.00019876850895989337, + "loss": 0.0282, + "step": 388 + }, + { + "epoch": 0.051392145853287975, + "grad_norm": 0.2956535816192627, + "learning_rate": 0.0001987619980109551, + "loss": 0.033, + "step": 389 + }, + { + "epoch": 0.051524259338772006, + "grad_norm": 0.26525723934173584, + "learning_rate": 0.00019875547000272823, + "loss": 0.0487, + "step": 390 + }, + { + "epoch": 0.05165637282425604, + "grad_norm": 0.23867838084697723, + "learning_rate": 0.00019874892493634038, + "loss": 0.0371, + "step": 391 + }, + { + "epoch": 0.05178848630974007, + "grad_norm": 0.281076580286026, + "learning_rate": 0.00019874236281292208, + "loss": 0.0373, + "step": 392 + }, + { + "epoch": 0.0519205997952241, + "grad_norm": 0.2519184648990631, + "learning_rate": 0.00019873578363360683, + "loss": 0.0384, + "step": 393 + }, + { + "epoch": 0.05205271328070813, + "grad_norm": 0.4297747015953064, + "learning_rate": 0.00019872918739953103, + "loss": 0.0717, + "step": 394 + }, + { + "epoch": 0.05218482676619216, + "grad_norm": 0.203557550907135, + "learning_rate": 0.0001987225741118341, + "loss": 0.0274, + "step": 395 + }, + { + "epoch": 0.05231694025167619, + "grad_norm": 0.24573923647403717, + "learning_rate": 0.00019871594377165831, + "loss": 0.0412, + "step": 396 + }, + { + "epoch": 0.05244905373716022, + "grad_norm": 0.34401312470436096, + "learning_rate": 0.00019870929638014895, + "loss": 0.0288, + "step": 397 + }, + { + "epoch": 0.05258116722264425, + "grad_norm": 0.35726824402809143, + "learning_rate": 0.00019870263193845427, + "loss": 0.0594, + "step": 398 + }, + { + "epoch": 0.05271328070812828, + "grad_norm": 0.23975130915641785, + "learning_rate": 0.00019869595044772536, + "loss": 0.0359, + "step": 399 + }, + { + "epoch": 0.05284539419361231, + "grad_norm": 0.271397203207016, + "learning_rate": 0.00019868925190911636, + "loss": 0.0447, + "step": 400 + }, + { + "epoch": 0.05297750767909634, + "grad_norm": 0.28331097960472107, + "learning_rate": 0.0001986825363237843, + "loss": 0.0464, + "step": 401 + }, + { + "epoch": 0.05310962116458037, + "grad_norm": 0.24692410230636597, + "learning_rate": 0.0001986758036928892, + "loss": 0.0362, + "step": 402 + }, + { + "epoch": 0.053241734650064404, + "grad_norm": 0.6335957646369934, + "learning_rate": 0.000198669054017594, + "loss": 0.0383, + "step": 403 + }, + { + "epoch": 0.053373848135548435, + "grad_norm": 0.26764601469039917, + "learning_rate": 0.00019866228729906453, + "loss": 0.032, + "step": 404 + }, + { + "epoch": 0.053505961621032466, + "grad_norm": 0.3402880132198334, + "learning_rate": 0.00019865550353846966, + "loss": 0.0294, + "step": 405 + }, + { + "epoch": 0.0536380751065165, + "grad_norm": 0.2578073740005493, + "learning_rate": 0.00019864870273698113, + "loss": 0.0468, + "step": 406 + }, + { + "epoch": 0.05377018859200053, + "grad_norm": 0.2212483137845993, + "learning_rate": 0.00019864188489577368, + "loss": 0.037, + "step": 407 + }, + { + "epoch": 0.05390230207748456, + "grad_norm": 0.2527289390563965, + "learning_rate": 0.00019863505001602492, + "loss": 0.0306, + "step": 408 + }, + { + "epoch": 0.05403441556296859, + "grad_norm": 0.3121415674686432, + "learning_rate": 0.00019862819809891548, + "loss": 0.0359, + "step": 409 + }, + { + "epoch": 0.05416652904845262, + "grad_norm": 0.3989887833595276, + "learning_rate": 0.00019862132914562892, + "loss": 0.0494, + "step": 410 + }, + { + "epoch": 0.05429864253393665, + "grad_norm": 0.310231477022171, + "learning_rate": 0.0001986144431573517, + "loss": 0.0343, + "step": 411 + }, + { + "epoch": 0.054430756019420684, + "grad_norm": 0.30125224590301514, + "learning_rate": 0.00019860754013527326, + "loss": 0.0351, + "step": 412 + }, + { + "epoch": 0.054562869504904715, + "grad_norm": 0.28822439908981323, + "learning_rate": 0.00019860062008058592, + "loss": 0.0505, + "step": 413 + }, + { + "epoch": 0.054694982990388746, + "grad_norm": 0.3362140357494354, + "learning_rate": 0.00019859368299448505, + "loss": 0.0489, + "step": 414 + }, + { + "epoch": 0.05482709647587278, + "grad_norm": 0.22514215111732483, + "learning_rate": 0.00019858672887816884, + "loss": 0.0393, + "step": 415 + }, + { + "epoch": 0.05495920996135681, + "grad_norm": 0.3549448251724243, + "learning_rate": 0.00019857975773283855, + "loss": 0.0313, + "step": 416 + }, + { + "epoch": 0.05509132344684084, + "grad_norm": 0.238682359457016, + "learning_rate": 0.00019857276955969827, + "loss": 0.0437, + "step": 417 + }, + { + "epoch": 0.055223436932324864, + "grad_norm": 0.24690861999988556, + "learning_rate": 0.0001985657643599551, + "loss": 0.0398, + "step": 418 + }, + { + "epoch": 0.055355550417808895, + "grad_norm": 0.17593775689601898, + "learning_rate": 0.00019855874213481903, + "loss": 0.0232, + "step": 419 + }, + { + "epoch": 0.055487663903292926, + "grad_norm": 0.22385098040103912, + "learning_rate": 0.00019855170288550305, + "loss": 0.035, + "step": 420 + }, + { + "epoch": 0.05561977738877696, + "grad_norm": 0.27371707558631897, + "learning_rate": 0.00019854464661322302, + "loss": 0.038, + "step": 421 + }, + { + "epoch": 0.05575189087426099, + "grad_norm": 0.2543024718761444, + "learning_rate": 0.00019853757331919785, + "loss": 0.0441, + "step": 422 + }, + { + "epoch": 0.05588400435974502, + "grad_norm": 0.18014635145664215, + "learning_rate": 0.00019853048300464925, + "loss": 0.0296, + "step": 423 + }, + { + "epoch": 0.05601611784522905, + "grad_norm": 0.2874230742454529, + "learning_rate": 0.00019852337567080196, + "loss": 0.0396, + "step": 424 + }, + { + "epoch": 0.05614823133071308, + "grad_norm": 0.24500614404678345, + "learning_rate": 0.00019851625131888363, + "loss": 0.0405, + "step": 425 + }, + { + "epoch": 0.05628034481619711, + "grad_norm": 0.277548611164093, + "learning_rate": 0.00019850910995012488, + "loss": 0.0256, + "step": 426 + }, + { + "epoch": 0.056412458301681144, + "grad_norm": 0.5045779943466187, + "learning_rate": 0.00019850195156575926, + "loss": 0.0392, + "step": 427 + }, + { + "epoch": 0.056544571787165175, + "grad_norm": 0.2808299958705902, + "learning_rate": 0.0001984947761670232, + "loss": 0.0339, + "step": 428 + }, + { + "epoch": 0.056676685272649206, + "grad_norm": 0.2322196215391159, + "learning_rate": 0.00019848758375515615, + "loss": 0.0304, + "step": 429 + }, + { + "epoch": 0.05680879875813324, + "grad_norm": 0.8448551893234253, + "learning_rate": 0.00019848037433140044, + "loss": 0.0885, + "step": 430 + }, + { + "epoch": 0.05694091224361727, + "grad_norm": 0.24392534792423248, + "learning_rate": 0.0001984731478970014, + "loss": 0.0413, + "step": 431 + }, + { + "epoch": 0.0570730257291013, + "grad_norm": 0.33474117517471313, + "learning_rate": 0.00019846590445320723, + "loss": 0.0417, + "step": 432 + }, + { + "epoch": 0.05720513921458533, + "grad_norm": 0.251472532749176, + "learning_rate": 0.0001984586440012691, + "loss": 0.0291, + "step": 433 + }, + { + "epoch": 0.05733725270006936, + "grad_norm": 0.21113821864128113, + "learning_rate": 0.00019845136654244114, + "loss": 0.0307, + "step": 434 + }, + { + "epoch": 0.05746936618555339, + "grad_norm": 0.3923201858997345, + "learning_rate": 0.00019844407207798037, + "loss": 0.0482, + "step": 435 + }, + { + "epoch": 0.057601479671037424, + "grad_norm": 0.29165613651275635, + "learning_rate": 0.0001984367606091468, + "loss": 0.0459, + "step": 436 + }, + { + "epoch": 0.057733593156521455, + "grad_norm": 0.31841304898262024, + "learning_rate": 0.00019842943213720332, + "loss": 0.0452, + "step": 437 + }, + { + "epoch": 0.05786570664200548, + "grad_norm": 0.19356763362884521, + "learning_rate": 0.00019842208666341583, + "loss": 0.0292, + "step": 438 + }, + { + "epoch": 0.05799782012748951, + "grad_norm": 0.2828764021396637, + "learning_rate": 0.00019841472418905305, + "loss": 0.0418, + "step": 439 + }, + { + "epoch": 0.05812993361297354, + "grad_norm": 0.7014762163162231, + "learning_rate": 0.00019840734471538677, + "loss": 0.0375, + "step": 440 + }, + { + "epoch": 0.05826204709845757, + "grad_norm": 0.2510444223880768, + "learning_rate": 0.00019839994824369167, + "loss": 0.0371, + "step": 441 + }, + { + "epoch": 0.058394160583941604, + "grad_norm": 0.4541257917881012, + "learning_rate": 0.00019839253477524528, + "loss": 0.0425, + "step": 442 + }, + { + "epoch": 0.058526274069425635, + "grad_norm": 0.3985843062400818, + "learning_rate": 0.0001983851043113282, + "loss": 0.046, + "step": 443 + }, + { + "epoch": 0.058658387554909666, + "grad_norm": 0.2939070165157318, + "learning_rate": 0.00019837765685322385, + "loss": 0.0465, + "step": 444 + }, + { + "epoch": 0.0587905010403937, + "grad_norm": 0.2937294542789459, + "learning_rate": 0.00019837019240221874, + "loss": 0.0367, + "step": 445 + }, + { + "epoch": 0.05892261452587773, + "grad_norm": 0.24315603077411652, + "learning_rate": 0.00019836271095960206, + "loss": 0.0344, + "step": 446 + }, + { + "epoch": 0.05905472801136176, + "grad_norm": 0.3837631642818451, + "learning_rate": 0.00019835521252666624, + "loss": 0.0412, + "step": 447 + }, + { + "epoch": 0.05918684149684579, + "grad_norm": 0.2079976350069046, + "learning_rate": 0.00019834769710470643, + "loss": 0.0312, + "step": 448 + }, + { + "epoch": 0.05931895498232982, + "grad_norm": 0.35846373438835144, + "learning_rate": 0.00019834016469502075, + "loss": 0.0512, + "step": 449 + }, + { + "epoch": 0.05945106846781385, + "grad_norm": 0.2271134853363037, + "learning_rate": 0.00019833261529891033, + "loss": 0.0344, + "step": 450 + }, + { + "epoch": 0.059583181953297884, + "grad_norm": 0.26171258091926575, + "learning_rate": 0.00019832504891767916, + "loss": 0.0443, + "step": 451 + }, + { + "epoch": 0.059715295438781915, + "grad_norm": 0.32468751072883606, + "learning_rate": 0.00019831746555263417, + "loss": 0.0275, + "step": 452 + }, + { + "epoch": 0.059847408924265946, + "grad_norm": 0.3164912462234497, + "learning_rate": 0.0001983098652050853, + "loss": 0.0537, + "step": 453 + }, + { + "epoch": 0.05997952240974998, + "grad_norm": 0.29063880443573, + "learning_rate": 0.00019830224787634537, + "loss": 0.0426, + "step": 454 + }, + { + "epoch": 0.06011163589523401, + "grad_norm": 0.3254551589488983, + "learning_rate": 0.00019829461356773008, + "loss": 0.0668, + "step": 455 + }, + { + "epoch": 0.06024374938071804, + "grad_norm": 0.2961951494216919, + "learning_rate": 0.00019828696228055815, + "loss": 0.0445, + "step": 456 + }, + { + "epoch": 0.06037586286620207, + "grad_norm": 0.37507346272468567, + "learning_rate": 0.00019827929401615115, + "loss": 0.0359, + "step": 457 + }, + { + "epoch": 0.060507976351686095, + "grad_norm": 0.3069281578063965, + "learning_rate": 0.0001982716087758337, + "loss": 0.0351, + "step": 458 + }, + { + "epoch": 0.060640089837170126, + "grad_norm": 0.2815963923931122, + "learning_rate": 0.0001982639065609332, + "loss": 0.0609, + "step": 459 + }, + { + "epoch": 0.06077220332265416, + "grad_norm": 0.3931083381175995, + "learning_rate": 0.00019825618737278017, + "loss": 0.0491, + "step": 460 + }, + { + "epoch": 0.06090431680813819, + "grad_norm": 0.24349068105220795, + "learning_rate": 0.00019824845121270787, + "loss": 0.0398, + "step": 461 + }, + { + "epoch": 0.06103643029362222, + "grad_norm": 0.260797917842865, + "learning_rate": 0.00019824069808205259, + "loss": 0.0379, + "step": 462 + }, + { + "epoch": 0.06116854377910625, + "grad_norm": 0.22222347557544708, + "learning_rate": 0.00019823292798215353, + "loss": 0.0247, + "step": 463 + }, + { + "epoch": 0.06130065726459028, + "grad_norm": 0.342363566160202, + "learning_rate": 0.00019822514091435287, + "loss": 0.0509, + "step": 464 + }, + { + "epoch": 0.06143277075007431, + "grad_norm": 0.2042882740497589, + "learning_rate": 0.00019821733687999568, + "loss": 0.0351, + "step": 465 + }, + { + "epoch": 0.061564884235558344, + "grad_norm": 0.18748803436756134, + "learning_rate": 0.00019820951588042993, + "loss": 0.0222, + "step": 466 + }, + { + "epoch": 0.061696997721042375, + "grad_norm": 0.3864912688732147, + "learning_rate": 0.00019820167791700653, + "loss": 0.0499, + "step": 467 + }, + { + "epoch": 0.061829111206526406, + "grad_norm": 0.2836579978466034, + "learning_rate": 0.0001981938229910794, + "loss": 0.0469, + "step": 468 + }, + { + "epoch": 0.06196122469201044, + "grad_norm": 0.2501057982444763, + "learning_rate": 0.00019818595110400531, + "loss": 0.0336, + "step": 469 + }, + { + "epoch": 0.06209333817749447, + "grad_norm": 0.1960388571023941, + "learning_rate": 0.00019817806225714394, + "loss": 0.0267, + "step": 470 + }, + { + "epoch": 0.0622254516629785, + "grad_norm": 0.20142126083374023, + "learning_rate": 0.00019817015645185801, + "loss": 0.022, + "step": 471 + }, + { + "epoch": 0.06235756514846253, + "grad_norm": 0.26167020201683044, + "learning_rate": 0.00019816223368951307, + "loss": 0.0348, + "step": 472 + }, + { + "epoch": 0.06248967863394656, + "grad_norm": 0.313064306974411, + "learning_rate": 0.00019815429397147764, + "loss": 0.0393, + "step": 473 + }, + { + "epoch": 0.06262179211943059, + "grad_norm": 0.23127563297748566, + "learning_rate": 0.0001981463372991231, + "loss": 0.029, + "step": 474 + }, + { + "epoch": 0.06275390560491462, + "grad_norm": 0.2343924194574356, + "learning_rate": 0.00019813836367382388, + "loss": 0.04, + "step": 475 + }, + { + "epoch": 0.06288601909039865, + "grad_norm": 0.35813525319099426, + "learning_rate": 0.00019813037309695725, + "loss": 0.0429, + "step": 476 + }, + { + "epoch": 0.06301813257588268, + "grad_norm": 0.20709457993507385, + "learning_rate": 0.00019812236556990346, + "loss": 0.0315, + "step": 477 + }, + { + "epoch": 0.06315024606136671, + "grad_norm": 0.20351076126098633, + "learning_rate": 0.00019811434109404563, + "loss": 0.0271, + "step": 478 + }, + { + "epoch": 0.06328235954685074, + "grad_norm": 0.7306005358695984, + "learning_rate": 0.00019810629967076984, + "loss": 0.0406, + "step": 479 + }, + { + "epoch": 0.06341447303233477, + "grad_norm": 0.2870320975780487, + "learning_rate": 0.0001980982413014651, + "loss": 0.0367, + "step": 480 + }, + { + "epoch": 0.0635465865178188, + "grad_norm": 0.3635860085487366, + "learning_rate": 0.00019809016598752334, + "loss": 0.0385, + "step": 481 + }, + { + "epoch": 0.06367870000330283, + "grad_norm": 0.2811727523803711, + "learning_rate": 0.00019808207373033944, + "loss": 0.0407, + "step": 482 + }, + { + "epoch": 0.06381081348878687, + "grad_norm": 0.18366318941116333, + "learning_rate": 0.00019807396453131118, + "loss": 0.0329, + "step": 483 + }, + { + "epoch": 0.0639429269742709, + "grad_norm": 0.23548203706741333, + "learning_rate": 0.00019806583839183922, + "loss": 0.0273, + "step": 484 + }, + { + "epoch": 0.06407504045975493, + "grad_norm": 0.4355567395687103, + "learning_rate": 0.00019805769531332728, + "loss": 0.0531, + "step": 485 + }, + { + "epoch": 0.06420715394523896, + "grad_norm": 0.38908225297927856, + "learning_rate": 0.00019804953529718185, + "loss": 0.0445, + "step": 486 + }, + { + "epoch": 0.06433926743072299, + "grad_norm": 0.2681350111961365, + "learning_rate": 0.0001980413583448125, + "loss": 0.0324, + "step": 487 + }, + { + "epoch": 0.06447138091620702, + "grad_norm": 0.32850104570388794, + "learning_rate": 0.00019803316445763156, + "loss": 0.0547, + "step": 488 + }, + { + "epoch": 0.06460349440169105, + "grad_norm": 0.20159336924552917, + "learning_rate": 0.00019802495363705446, + "loss": 0.0348, + "step": 489 + }, + { + "epoch": 0.06473560788717508, + "grad_norm": 0.23026055097579956, + "learning_rate": 0.00019801672588449937, + "loss": 0.041, + "step": 490 + }, + { + "epoch": 0.06486772137265912, + "grad_norm": 0.23433181643486023, + "learning_rate": 0.00019800848120138755, + "loss": 0.0377, + "step": 491 + }, + { + "epoch": 0.06499983485814315, + "grad_norm": 0.26035311818122864, + "learning_rate": 0.0001980002195891431, + "loss": 0.0342, + "step": 492 + }, + { + "epoch": 0.06513194834362718, + "grad_norm": 0.23564809560775757, + "learning_rate": 0.00019799194104919306, + "loss": 0.0264, + "step": 493 + }, + { + "epoch": 0.06526406182911121, + "grad_norm": 0.23754185438156128, + "learning_rate": 0.00019798364558296737, + "loss": 0.0377, + "step": 494 + }, + { + "epoch": 0.06539617531459524, + "grad_norm": 0.23798996210098267, + "learning_rate": 0.0001979753331918989, + "loss": 0.039, + "step": 495 + }, + { + "epoch": 0.06552828880007927, + "grad_norm": 0.2444775253534317, + "learning_rate": 0.00019796700387742354, + "loss": 0.0228, + "step": 496 + }, + { + "epoch": 0.0656604022855633, + "grad_norm": 0.3243919909000397, + "learning_rate": 0.00019795865764097998, + "loss": 0.0501, + "step": 497 + }, + { + "epoch": 0.06579251577104733, + "grad_norm": 0.29826414585113525, + "learning_rate": 0.00019795029448400984, + "loss": 0.0343, + "step": 498 + }, + { + "epoch": 0.06592462925653136, + "grad_norm": 0.30492937564849854, + "learning_rate": 0.00019794191440795775, + "loss": 0.0368, + "step": 499 + }, + { + "epoch": 0.0660567427420154, + "grad_norm": 0.3303871750831604, + "learning_rate": 0.00019793351741427117, + "loss": 0.0397, + "step": 500 + }, + { + "epoch": 0.06618885622749943, + "grad_norm": 0.2616530656814575, + "learning_rate": 0.00019792510350440058, + "loss": 0.0295, + "step": 501 + }, + { + "epoch": 0.06632096971298346, + "grad_norm": 0.2011949121952057, + "learning_rate": 0.00019791667267979928, + "loss": 0.0206, + "step": 502 + }, + { + "epoch": 0.06645308319846749, + "grad_norm": 0.985835611820221, + "learning_rate": 0.00019790822494192357, + "loss": 0.0354, + "step": 503 + }, + { + "epoch": 0.06658519668395152, + "grad_norm": 0.26946449279785156, + "learning_rate": 0.00019789976029223257, + "loss": 0.0402, + "step": 504 + }, + { + "epoch": 0.06671731016943555, + "grad_norm": 0.2122262567281723, + "learning_rate": 0.00019789127873218843, + "loss": 0.0352, + "step": 505 + }, + { + "epoch": 0.06684942365491958, + "grad_norm": 0.2970362603664398, + "learning_rate": 0.00019788278026325627, + "loss": 0.0352, + "step": 506 + }, + { + "epoch": 0.06698153714040361, + "grad_norm": 0.34310245513916016, + "learning_rate": 0.0001978742648869039, + "loss": 0.0378, + "step": 507 + }, + { + "epoch": 0.06711365062588764, + "grad_norm": 0.3422040343284607, + "learning_rate": 0.00019786573260460226, + "loss": 0.0623, + "step": 508 + }, + { + "epoch": 0.06724576411137166, + "grad_norm": 0.23203279078006744, + "learning_rate": 0.00019785718341782516, + "loss": 0.0435, + "step": 509 + }, + { + "epoch": 0.06737787759685569, + "grad_norm": 0.3923529386520386, + "learning_rate": 0.00019784861732804926, + "loss": 0.0358, + "step": 510 + }, + { + "epoch": 0.06750999108233972, + "grad_norm": 0.32772544026374817, + "learning_rate": 0.00019784003433675421, + "loss": 0.0418, + "step": 511 + }, + { + "epoch": 0.06764210456782375, + "grad_norm": 0.2983352541923523, + "learning_rate": 0.00019783143444542257, + "loss": 0.0333, + "step": 512 + }, + { + "epoch": 0.06777421805330779, + "grad_norm": 0.3037568926811218, + "learning_rate": 0.00019782281765553985, + "loss": 0.045, + "step": 513 + }, + { + "epoch": 0.06790633153879182, + "grad_norm": 0.2756015658378601, + "learning_rate": 0.00019781418396859436, + "loss": 0.0381, + "step": 514 + }, + { + "epoch": 0.06803844502427585, + "grad_norm": 0.2572304606437683, + "learning_rate": 0.00019780553338607745, + "loss": 0.0423, + "step": 515 + }, + { + "epoch": 0.06817055850975988, + "grad_norm": 0.40779799222946167, + "learning_rate": 0.00019779686590948336, + "loss": 0.0407, + "step": 516 + }, + { + "epoch": 0.06830267199524391, + "grad_norm": 0.32563602924346924, + "learning_rate": 0.00019778818154030922, + "loss": 0.058, + "step": 517 + }, + { + "epoch": 0.06843478548072794, + "grad_norm": 0.19639067351818085, + "learning_rate": 0.0001977794802800551, + "loss": 0.0232, + "step": 518 + }, + { + "epoch": 0.06856689896621197, + "grad_norm": 0.3198147118091583, + "learning_rate": 0.00019777076213022397, + "loss": 0.0426, + "step": 519 + }, + { + "epoch": 0.068699012451696, + "grad_norm": 0.34367501735687256, + "learning_rate": 0.0001977620270923217, + "loss": 0.0524, + "step": 520 + }, + { + "epoch": 0.06883112593718003, + "grad_norm": 0.2864331305027008, + "learning_rate": 0.00019775327516785714, + "loss": 0.0386, + "step": 521 + }, + { + "epoch": 0.06896323942266407, + "grad_norm": 0.2509300708770752, + "learning_rate": 0.00019774450635834203, + "loss": 0.0478, + "step": 522 + }, + { + "epoch": 0.0690953529081481, + "grad_norm": 0.3212604522705078, + "learning_rate": 0.000197735720665291, + "loss": 0.0399, + "step": 523 + }, + { + "epoch": 0.06922746639363213, + "grad_norm": 0.25791066884994507, + "learning_rate": 0.00019772691809022161, + "loss": 0.0352, + "step": 524 + }, + { + "epoch": 0.06935957987911616, + "grad_norm": 0.3202870190143585, + "learning_rate": 0.00019771809863465437, + "loss": 0.0273, + "step": 525 + }, + { + "epoch": 0.06949169336460019, + "grad_norm": 0.292076975107193, + "learning_rate": 0.0001977092623001126, + "loss": 0.047, + "step": 526 + }, + { + "epoch": 0.06962380685008422, + "grad_norm": 0.2345723956823349, + "learning_rate": 0.0001977004090881227, + "loss": 0.0356, + "step": 527 + }, + { + "epoch": 0.06975592033556825, + "grad_norm": 0.2687516212463379, + "learning_rate": 0.00019769153900021388, + "loss": 0.043, + "step": 528 + }, + { + "epoch": 0.06988803382105228, + "grad_norm": 0.24501895904541016, + "learning_rate": 0.00019768265203791826, + "loss": 0.0347, + "step": 529 + }, + { + "epoch": 0.07002014730653632, + "grad_norm": 0.29931098222732544, + "learning_rate": 0.00019767374820277086, + "loss": 0.0484, + "step": 530 + }, + { + "epoch": 0.07015226079202035, + "grad_norm": 0.19638241827487946, + "learning_rate": 0.0001976648274963097, + "loss": 0.0292, + "step": 531 + }, + { + "epoch": 0.07028437427750438, + "grad_norm": 0.22806896269321442, + "learning_rate": 0.00019765588992007568, + "loss": 0.0256, + "step": 532 + }, + { + "epoch": 0.07041648776298841, + "grad_norm": 0.24054238200187683, + "learning_rate": 0.00019764693547561255, + "loss": 0.025, + "step": 533 + }, + { + "epoch": 0.07054860124847244, + "grad_norm": 0.24966415762901306, + "learning_rate": 0.00019763796416446706, + "loss": 0.0322, + "step": 534 + }, + { + "epoch": 0.07068071473395647, + "grad_norm": 0.25949081778526306, + "learning_rate": 0.00019762897598818883, + "loss": 0.0268, + "step": 535 + }, + { + "epoch": 0.0708128282194405, + "grad_norm": 0.34949883818626404, + "learning_rate": 0.00019761997094833037, + "loss": 0.0434, + "step": 536 + }, + { + "epoch": 0.07094494170492453, + "grad_norm": 0.3490285873413086, + "learning_rate": 0.0001976109490464472, + "loss": 0.0231, + "step": 537 + }, + { + "epoch": 0.07107705519040856, + "grad_norm": 0.22599942982196808, + "learning_rate": 0.0001976019102840976, + "loss": 0.0242, + "step": 538 + }, + { + "epoch": 0.0712091686758926, + "grad_norm": 0.2737880051136017, + "learning_rate": 0.0001975928546628429, + "loss": 0.0341, + "step": 539 + }, + { + "epoch": 0.07134128216137663, + "grad_norm": 0.2882981598377228, + "learning_rate": 0.00019758378218424726, + "loss": 0.0378, + "step": 540 + }, + { + "epoch": 0.07147339564686066, + "grad_norm": 0.20302222669124603, + "learning_rate": 0.00019757469284987784, + "loss": 0.0371, + "step": 541 + }, + { + "epoch": 0.07160550913234469, + "grad_norm": 0.23300130665302277, + "learning_rate": 0.0001975655866613046, + "loss": 0.0258, + "step": 542 + }, + { + "epoch": 0.07173762261782872, + "grad_norm": 0.23061759769916534, + "learning_rate": 0.00019755646362010044, + "loss": 0.0262, + "step": 543 + }, + { + "epoch": 0.07186973610331275, + "grad_norm": 0.3055586516857147, + "learning_rate": 0.00019754732372784126, + "loss": 0.0406, + "step": 544 + }, + { + "epoch": 0.07200184958879678, + "grad_norm": 0.20383194088935852, + "learning_rate": 0.00019753816698610577, + "loss": 0.0263, + "step": 545 + }, + { + "epoch": 0.07213396307428081, + "grad_norm": 0.25024712085723877, + "learning_rate": 0.00019752899339647563, + "loss": 0.0299, + "step": 546 + }, + { + "epoch": 0.07226607655976484, + "grad_norm": 0.21452884376049042, + "learning_rate": 0.00019751980296053541, + "loss": 0.0293, + "step": 547 + }, + { + "epoch": 0.07239819004524888, + "grad_norm": 0.24472303688526154, + "learning_rate": 0.00019751059567987259, + "loss": 0.0251, + "step": 548 + }, + { + "epoch": 0.0725303035307329, + "grad_norm": 0.28603652119636536, + "learning_rate": 0.0001975013715560775, + "loss": 0.0439, + "step": 549 + }, + { + "epoch": 0.07266241701621692, + "grad_norm": 0.349922776222229, + "learning_rate": 0.00019749213059074353, + "loss": 0.0497, + "step": 550 + }, + { + "epoch": 0.07279453050170095, + "grad_norm": 0.2149500995874405, + "learning_rate": 0.00019748287278546683, + "loss": 0.0227, + "step": 551 + }, + { + "epoch": 0.07292664398718499, + "grad_norm": 0.2354866862297058, + "learning_rate": 0.00019747359814184653, + "loss": 0.0252, + "step": 552 + }, + { + "epoch": 0.07305875747266902, + "grad_norm": 0.3029223680496216, + "learning_rate": 0.00019746430666148462, + "loss": 0.0422, + "step": 553 + }, + { + "epoch": 0.07319087095815305, + "grad_norm": 0.364507257938385, + "learning_rate": 0.00019745499834598605, + "loss": 0.0459, + "step": 554 + }, + { + "epoch": 0.07332298444363708, + "grad_norm": 0.2667244076728821, + "learning_rate": 0.00019744567319695869, + "loss": 0.0362, + "step": 555 + }, + { + "epoch": 0.07345509792912111, + "grad_norm": 0.35405200719833374, + "learning_rate": 0.00019743633121601322, + "loss": 0.04, + "step": 556 + }, + { + "epoch": 0.07358721141460514, + "grad_norm": 0.21454021334648132, + "learning_rate": 0.00019742697240476332, + "loss": 0.0365, + "step": 557 + }, + { + "epoch": 0.07371932490008917, + "grad_norm": 0.29431477189064026, + "learning_rate": 0.0001974175967648256, + "loss": 0.0334, + "step": 558 + }, + { + "epoch": 0.0738514383855732, + "grad_norm": 0.22505509853363037, + "learning_rate": 0.00019740820429781943, + "loss": 0.0398, + "step": 559 + }, + { + "epoch": 0.07398355187105723, + "grad_norm": 0.22686269879341125, + "learning_rate": 0.00019739879500536725, + "loss": 0.0317, + "step": 560 + }, + { + "epoch": 0.07411566535654127, + "grad_norm": 0.2540476322174072, + "learning_rate": 0.00019738936888909434, + "loss": 0.029, + "step": 561 + }, + { + "epoch": 0.0742477788420253, + "grad_norm": 0.22462958097457886, + "learning_rate": 0.00019737992595062886, + "loss": 0.0239, + "step": 562 + }, + { + "epoch": 0.07437989232750933, + "grad_norm": 0.4206967353820801, + "learning_rate": 0.00019737046619160194, + "loss": 0.0497, + "step": 563 + }, + { + "epoch": 0.07451200581299336, + "grad_norm": 0.2678888738155365, + "learning_rate": 0.0001973609896136475, + "loss": 0.029, + "step": 564 + }, + { + "epoch": 0.07464411929847739, + "grad_norm": 0.19996492564678192, + "learning_rate": 0.0001973514962184025, + "loss": 0.023, + "step": 565 + }, + { + "epoch": 0.07477623278396142, + "grad_norm": 0.32264792919158936, + "learning_rate": 0.00019734198600750678, + "loss": 0.0366, + "step": 566 + }, + { + "epoch": 0.07490834626944545, + "grad_norm": 0.23401503264904022, + "learning_rate": 0.00019733245898260297, + "loss": 0.0279, + "step": 567 + }, + { + "epoch": 0.07504045975492948, + "grad_norm": 0.19676105678081512, + "learning_rate": 0.00019732291514533673, + "loss": 0.0406, + "step": 568 + }, + { + "epoch": 0.07517257324041351, + "grad_norm": 0.37109094858169556, + "learning_rate": 0.00019731335449735659, + "loss": 0.0452, + "step": 569 + }, + { + "epoch": 0.07530468672589755, + "grad_norm": 0.19893021881580353, + "learning_rate": 0.00019730377704031392, + "loss": 0.028, + "step": 570 + }, + { + "epoch": 0.07543680021138158, + "grad_norm": 0.24675323069095612, + "learning_rate": 0.00019729418277586306, + "loss": 0.032, + "step": 571 + }, + { + "epoch": 0.07556891369686561, + "grad_norm": 0.2152172029018402, + "learning_rate": 0.00019728457170566132, + "loss": 0.0341, + "step": 572 + }, + { + "epoch": 0.07570102718234964, + "grad_norm": 0.23518556356430054, + "learning_rate": 0.00019727494383136874, + "loss": 0.0295, + "step": 573 + }, + { + "epoch": 0.07583314066783367, + "grad_norm": 0.23954501748085022, + "learning_rate": 0.00019726529915464842, + "loss": 0.0296, + "step": 574 + }, + { + "epoch": 0.0759652541533177, + "grad_norm": 0.25000494718551636, + "learning_rate": 0.00019725563767716625, + "loss": 0.0353, + "step": 575 + }, + { + "epoch": 0.07609736763880173, + "grad_norm": 0.34247976541519165, + "learning_rate": 0.00019724595940059106, + "loss": 0.0314, + "step": 576 + }, + { + "epoch": 0.07622948112428576, + "grad_norm": 0.36853498220443726, + "learning_rate": 0.00019723626432659462, + "loss": 0.0478, + "step": 577 + }, + { + "epoch": 0.0763615946097698, + "grad_norm": 0.20818112790584564, + "learning_rate": 0.0001972265524568516, + "loss": 0.0357, + "step": 578 + }, + { + "epoch": 0.07649370809525383, + "grad_norm": 0.40607398748397827, + "learning_rate": 0.0001972168237930395, + "loss": 0.053, + "step": 579 + }, + { + "epoch": 0.07662582158073786, + "grad_norm": 0.2218083292245865, + "learning_rate": 0.0001972070783368388, + "loss": 0.0375, + "step": 580 + }, + { + "epoch": 0.07675793506622189, + "grad_norm": 0.27258095145225525, + "learning_rate": 0.00019719731608993282, + "loss": 0.037, + "step": 581 + }, + { + "epoch": 0.07689004855170592, + "grad_norm": 0.420881450176239, + "learning_rate": 0.0001971875370540078, + "loss": 0.0456, + "step": 582 + }, + { + "epoch": 0.07702216203718995, + "grad_norm": 0.27685117721557617, + "learning_rate": 0.0001971777412307529, + "loss": 0.0325, + "step": 583 + }, + { + "epoch": 0.07715427552267398, + "grad_norm": 0.28919875621795654, + "learning_rate": 0.00019716792862186014, + "loss": 0.0214, + "step": 584 + }, + { + "epoch": 0.07728638900815801, + "grad_norm": 0.23569947481155396, + "learning_rate": 0.0001971580992290245, + "loss": 0.0331, + "step": 585 + }, + { + "epoch": 0.07741850249364204, + "grad_norm": 0.2427627444267273, + "learning_rate": 0.0001971482530539438, + "loss": 0.0329, + "step": 586 + }, + { + "epoch": 0.07755061597912608, + "grad_norm": 0.19415737688541412, + "learning_rate": 0.0001971383900983188, + "loss": 0.0262, + "step": 587 + }, + { + "epoch": 0.0776827294646101, + "grad_norm": 0.46439921855926514, + "learning_rate": 0.00019712851036385315, + "loss": 0.0417, + "step": 588 + }, + { + "epoch": 0.07781484295009414, + "grad_norm": 0.18036745488643646, + "learning_rate": 0.00019711861385225338, + "loss": 0.0179, + "step": 589 + }, + { + "epoch": 0.07794695643557817, + "grad_norm": 0.22073324024677277, + "learning_rate": 0.00019710870056522889, + "loss": 0.0349, + "step": 590 + }, + { + "epoch": 0.07807906992106219, + "grad_norm": 0.2414667010307312, + "learning_rate": 0.00019709877050449204, + "loss": 0.0356, + "step": 591 + }, + { + "epoch": 0.07821118340654622, + "grad_norm": 0.19599366188049316, + "learning_rate": 0.0001970888236717581, + "loss": 0.0229, + "step": 592 + }, + { + "epoch": 0.07834329689203025, + "grad_norm": 0.23867139220237732, + "learning_rate": 0.00019707886006874515, + "loss": 0.0241, + "step": 593 + }, + { + "epoch": 0.07847541037751428, + "grad_norm": 0.3124195635318756, + "learning_rate": 0.0001970688796971742, + "loss": 0.0435, + "step": 594 + }, + { + "epoch": 0.07860752386299831, + "grad_norm": 0.27740946412086487, + "learning_rate": 0.00019705888255876927, + "loss": 0.0337, + "step": 595 + }, + { + "epoch": 0.07873963734848234, + "grad_norm": 0.20327328145503998, + "learning_rate": 0.00019704886865525706, + "loss": 0.0297, + "step": 596 + }, + { + "epoch": 0.07887175083396637, + "grad_norm": 0.37126624584198, + "learning_rate": 0.00019703883798836738, + "loss": 0.0553, + "step": 597 + }, + { + "epoch": 0.0790038643194504, + "grad_norm": 0.22105854749679565, + "learning_rate": 0.0001970287905598328, + "loss": 0.041, + "step": 598 + }, + { + "epoch": 0.07913597780493443, + "grad_norm": 0.22515787184238434, + "learning_rate": 0.0001970187263713888, + "loss": 0.0295, + "step": 599 + }, + { + "epoch": 0.07926809129041847, + "grad_norm": 0.47252383828163147, + "learning_rate": 0.0001970086454247738, + "loss": 0.048, + "step": 600 + }, + { + "epoch": 0.0794002047759025, + "grad_norm": 0.24759109318256378, + "learning_rate": 0.0001969985477217291, + "loss": 0.0274, + "step": 601 + }, + { + "epoch": 0.07953231826138653, + "grad_norm": 0.41675540804862976, + "learning_rate": 0.0001969884332639989, + "loss": 0.0379, + "step": 602 + }, + { + "epoch": 0.07966443174687056, + "grad_norm": 0.2635309398174286, + "learning_rate": 0.0001969783020533303, + "loss": 0.0449, + "step": 603 + }, + { + "epoch": 0.07979654523235459, + "grad_norm": 0.25403887033462524, + "learning_rate": 0.00019696815409147317, + "loss": 0.0387, + "step": 604 + }, + { + "epoch": 0.07992865871783862, + "grad_norm": 0.21932649612426758, + "learning_rate": 0.00019695798938018053, + "loss": 0.036, + "step": 605 + }, + { + "epoch": 0.08006077220332265, + "grad_norm": 0.23342837393283844, + "learning_rate": 0.00019694780792120807, + "loss": 0.0431, + "step": 606 + }, + { + "epoch": 0.08019288568880668, + "grad_norm": 0.5543757081031799, + "learning_rate": 0.00019693760971631444, + "loss": 0.0539, + "step": 607 + }, + { + "epoch": 0.08032499917429071, + "grad_norm": 0.29595449566841125, + "learning_rate": 0.00019692739476726118, + "loss": 0.021, + "step": 608 + }, + { + "epoch": 0.08045711265977475, + "grad_norm": 0.24539723992347717, + "learning_rate": 0.0001969171630758128, + "loss": 0.0386, + "step": 609 + }, + { + "epoch": 0.08058922614525878, + "grad_norm": 0.20920851826667786, + "learning_rate": 0.0001969069146437365, + "loss": 0.0207, + "step": 610 + }, + { + "epoch": 0.08072133963074281, + "grad_norm": 0.3386335074901581, + "learning_rate": 0.00019689664947280267, + "loss": 0.0408, + "step": 611 + }, + { + "epoch": 0.08085345311622684, + "grad_norm": 0.3359006643295288, + "learning_rate": 0.00019688636756478434, + "loss": 0.0548, + "step": 612 + }, + { + "epoch": 0.08098556660171087, + "grad_norm": 0.3495159149169922, + "learning_rate": 0.00019687606892145748, + "loss": 0.0353, + "step": 613 + }, + { + "epoch": 0.0811176800871949, + "grad_norm": 0.34049656987190247, + "learning_rate": 0.00019686575354460107, + "loss": 0.0547, + "step": 614 + }, + { + "epoch": 0.08124979357267893, + "grad_norm": 0.37891748547554016, + "learning_rate": 0.00019685542143599684, + "loss": 0.0415, + "step": 615 + }, + { + "epoch": 0.08138190705816296, + "grad_norm": 0.1867925226688385, + "learning_rate": 0.0001968450725974295, + "loss": 0.0279, + "step": 616 + }, + { + "epoch": 0.081514020543647, + "grad_norm": 0.2333107739686966, + "learning_rate": 0.00019683470703068664, + "loss": 0.0297, + "step": 617 + }, + { + "epoch": 0.08164613402913103, + "grad_norm": 0.2671178877353668, + "learning_rate": 0.0001968243247375586, + "loss": 0.0264, + "step": 618 + }, + { + "epoch": 0.08177824751461506, + "grad_norm": 0.2965892553329468, + "learning_rate": 0.00019681392571983887, + "loss": 0.0354, + "step": 619 + }, + { + "epoch": 0.08191036100009909, + "grad_norm": 0.23038817942142487, + "learning_rate": 0.00019680350997932364, + "loss": 0.0332, + "step": 620 + }, + { + "epoch": 0.08204247448558312, + "grad_norm": 0.3962733745574951, + "learning_rate": 0.000196793077517812, + "loss": 0.0359, + "step": 621 + }, + { + "epoch": 0.08217458797106715, + "grad_norm": 0.26966428756713867, + "learning_rate": 0.00019678262833710598, + "loss": 0.0279, + "step": 622 + }, + { + "epoch": 0.08230670145655118, + "grad_norm": 0.31382763385772705, + "learning_rate": 0.00019677216243901052, + "loss": 0.0317, + "step": 623 + }, + { + "epoch": 0.08243881494203521, + "grad_norm": 0.23322831094264984, + "learning_rate": 0.00019676167982533334, + "loss": 0.046, + "step": 624 + }, + { + "epoch": 0.08257092842751924, + "grad_norm": 0.2607961595058441, + "learning_rate": 0.00019675118049788514, + "loss": 0.03, + "step": 625 + }, + { + "epoch": 0.08270304191300328, + "grad_norm": 0.18060070276260376, + "learning_rate": 0.00019674066445847952, + "loss": 0.0272, + "step": 626 + }, + { + "epoch": 0.0828351553984873, + "grad_norm": 0.24973896145820618, + "learning_rate": 0.0001967301317089329, + "loss": 0.0412, + "step": 627 + }, + { + "epoch": 0.08296726888397134, + "grad_norm": 0.25361204147338867, + "learning_rate": 0.00019671958225106462, + "loss": 0.0294, + "step": 628 + }, + { + "epoch": 0.08309938236945537, + "grad_norm": 0.38416653871536255, + "learning_rate": 0.00019670901608669685, + "loss": 0.0396, + "step": 629 + }, + { + "epoch": 0.0832314958549394, + "grad_norm": 0.21381421387195587, + "learning_rate": 0.0001966984332176548, + "loss": 0.0391, + "step": 630 + }, + { + "epoch": 0.08336360934042342, + "grad_norm": 0.2730841636657715, + "learning_rate": 0.0001966878336457664, + "loss": 0.0288, + "step": 631 + }, + { + "epoch": 0.08349572282590745, + "grad_norm": 0.1917334944009781, + "learning_rate": 0.00019667721737286252, + "loss": 0.0222, + "step": 632 + }, + { + "epoch": 0.08362783631139148, + "grad_norm": 0.27792686223983765, + "learning_rate": 0.00019666658440077695, + "loss": 0.035, + "step": 633 + }, + { + "epoch": 0.08375994979687551, + "grad_norm": 0.4698370397090912, + "learning_rate": 0.00019665593473134631, + "loss": 0.0394, + "step": 634 + }, + { + "epoch": 0.08389206328235954, + "grad_norm": 0.38890373706817627, + "learning_rate": 0.00019664526836641016, + "loss": 0.0304, + "step": 635 + }, + { + "epoch": 0.08402417676784357, + "grad_norm": 0.25699278712272644, + "learning_rate": 0.00019663458530781093, + "loss": 0.0305, + "step": 636 + }, + { + "epoch": 0.0841562902533276, + "grad_norm": 0.3510453999042511, + "learning_rate": 0.00019662388555739387, + "loss": 0.0341, + "step": 637 + }, + { + "epoch": 0.08428840373881163, + "grad_norm": 0.21721839904785156, + "learning_rate": 0.00019661316911700715, + "loss": 0.0276, + "step": 638 + }, + { + "epoch": 0.08442051722429567, + "grad_norm": 0.3929044008255005, + "learning_rate": 0.0001966024359885019, + "loss": 0.0459, + "step": 639 + }, + { + "epoch": 0.0845526307097797, + "grad_norm": 0.3013642430305481, + "learning_rate": 0.000196591686173732, + "loss": 0.0428, + "step": 640 + }, + { + "epoch": 0.08468474419526373, + "grad_norm": 0.2829546630382538, + "learning_rate": 0.00019658091967455436, + "loss": 0.0221, + "step": 641 + }, + { + "epoch": 0.08481685768074776, + "grad_norm": 0.3992788791656494, + "learning_rate": 0.00019657013649282865, + "loss": 0.0298, + "step": 642 + }, + { + "epoch": 0.08494897116623179, + "grad_norm": 0.24115873873233795, + "learning_rate": 0.00019655933663041743, + "loss": 0.0273, + "step": 643 + }, + { + "epoch": 0.08508108465171582, + "grad_norm": 0.22903680801391602, + "learning_rate": 0.0001965485200891862, + "loss": 0.0306, + "step": 644 + }, + { + "epoch": 0.08521319813719985, + "grad_norm": 0.24627751111984253, + "learning_rate": 0.00019653768687100334, + "loss": 0.037, + "step": 645 + }, + { + "epoch": 0.08534531162268388, + "grad_norm": 0.23460929095745087, + "learning_rate": 0.00019652683697774008, + "loss": 0.0289, + "step": 646 + }, + { + "epoch": 0.08547742510816791, + "grad_norm": 0.4498720169067383, + "learning_rate": 0.0001965159704112705, + "loss": 0.0362, + "step": 647 + }, + { + "epoch": 0.08560953859365195, + "grad_norm": 0.32583916187286377, + "learning_rate": 0.0001965050871734716, + "loss": 0.038, + "step": 648 + }, + { + "epoch": 0.08574165207913598, + "grad_norm": 0.20444458723068237, + "learning_rate": 0.00019649418726622327, + "loss": 0.0233, + "step": 649 + }, + { + "epoch": 0.08587376556462001, + "grad_norm": 0.20803149044513702, + "learning_rate": 0.00019648327069140832, + "loss": 0.0187, + "step": 650 + }, + { + "epoch": 0.08600587905010404, + "grad_norm": 0.21215280890464783, + "learning_rate": 0.00019647233745091226, + "loss": 0.0331, + "step": 651 + }, + { + "epoch": 0.08613799253558807, + "grad_norm": 0.24561989307403564, + "learning_rate": 0.00019646138754662374, + "loss": 0.0301, + "step": 652 + }, + { + "epoch": 0.0862701060210721, + "grad_norm": 0.3120889961719513, + "learning_rate": 0.00019645042098043406, + "loss": 0.0407, + "step": 653 + }, + { + "epoch": 0.08640221950655613, + "grad_norm": 0.22205640375614166, + "learning_rate": 0.0001964394377542375, + "loss": 0.0402, + "step": 654 + }, + { + "epoch": 0.08653433299204016, + "grad_norm": 0.24051034450531006, + "learning_rate": 0.00019642843786993124, + "loss": 0.0182, + "step": 655 + }, + { + "epoch": 0.0866664464775242, + "grad_norm": 0.23244361579418182, + "learning_rate": 0.00019641742132941529, + "loss": 0.0268, + "step": 656 + }, + { + "epoch": 0.08679855996300823, + "grad_norm": 0.30195531249046326, + "learning_rate": 0.00019640638813459252, + "loss": 0.042, + "step": 657 + }, + { + "epoch": 0.08693067344849226, + "grad_norm": 0.2843739092350006, + "learning_rate": 0.00019639533828736875, + "loss": 0.0314, + "step": 658 + }, + { + "epoch": 0.08706278693397629, + "grad_norm": 0.33743587136268616, + "learning_rate": 0.00019638427178965263, + "loss": 0.0342, + "step": 659 + }, + { + "epoch": 0.08719490041946032, + "grad_norm": 0.2209853231906891, + "learning_rate": 0.0001963731886433557, + "loss": 0.0436, + "step": 660 + }, + { + "epoch": 0.08732701390494435, + "grad_norm": 0.20948272943496704, + "learning_rate": 0.00019636208885039232, + "loss": 0.0333, + "step": 661 + }, + { + "epoch": 0.08745912739042838, + "grad_norm": 0.27300870418548584, + "learning_rate": 0.00019635097241267979, + "loss": 0.0435, + "step": 662 + }, + { + "epoch": 0.08759124087591241, + "grad_norm": 0.21323652565479279, + "learning_rate": 0.0001963398393321383, + "loss": 0.023, + "step": 663 + }, + { + "epoch": 0.08772335436139644, + "grad_norm": 0.2643250823020935, + "learning_rate": 0.00019632868961069085, + "loss": 0.0451, + "step": 664 + }, + { + "epoch": 0.08785546784688048, + "grad_norm": 0.3321099579334259, + "learning_rate": 0.00019631752325026335, + "loss": 0.0338, + "step": 665 + }, + { + "epoch": 0.0879875813323645, + "grad_norm": 0.28824859857559204, + "learning_rate": 0.0001963063402527846, + "loss": 0.0366, + "step": 666 + }, + { + "epoch": 0.08811969481784854, + "grad_norm": 0.2807556092739105, + "learning_rate": 0.00019629514062018618, + "loss": 0.028, + "step": 667 + }, + { + "epoch": 0.08825180830333257, + "grad_norm": 0.25782066583633423, + "learning_rate": 0.00019628392435440276, + "loss": 0.0271, + "step": 668 + }, + { + "epoch": 0.0883839217888166, + "grad_norm": 0.31848353147506714, + "learning_rate": 0.0001962726914573716, + "loss": 0.0391, + "step": 669 + }, + { + "epoch": 0.08851603527430063, + "grad_norm": 0.20026235282421112, + "learning_rate": 0.00019626144193103304, + "loss": 0.0303, + "step": 670 + }, + { + "epoch": 0.08864814875978466, + "grad_norm": 0.2486668825149536, + "learning_rate": 0.0001962501757773302, + "loss": 0.035, + "step": 671 + }, + { + "epoch": 0.08878026224526868, + "grad_norm": 0.18474815785884857, + "learning_rate": 0.00019623889299820913, + "loss": 0.0263, + "step": 672 + }, + { + "epoch": 0.08891237573075271, + "grad_norm": 0.23487155139446259, + "learning_rate": 0.0001962275935956187, + "loss": 0.04, + "step": 673 + }, + { + "epoch": 0.08904448921623674, + "grad_norm": 0.25455838441848755, + "learning_rate": 0.00019621627757151065, + "loss": 0.0275, + "step": 674 + }, + { + "epoch": 0.08917660270172077, + "grad_norm": 0.2235615849494934, + "learning_rate": 0.00019620494492783962, + "loss": 0.0216, + "step": 675 + }, + { + "epoch": 0.0893087161872048, + "grad_norm": 0.26305070519447327, + "learning_rate": 0.00019619359566656316, + "loss": 0.0425, + "step": 676 + }, + { + "epoch": 0.08944082967268883, + "grad_norm": 0.24703732132911682, + "learning_rate": 0.0001961822297896416, + "loss": 0.0204, + "step": 677 + }, + { + "epoch": 0.08957294315817287, + "grad_norm": 0.2195281833410263, + "learning_rate": 0.00019617084729903818, + "loss": 0.0254, + "step": 678 + }, + { + "epoch": 0.0897050566436569, + "grad_norm": 0.20875030755996704, + "learning_rate": 0.000196159448196719, + "loss": 0.0398, + "step": 679 + }, + { + "epoch": 0.08983717012914093, + "grad_norm": 0.23726671934127808, + "learning_rate": 0.0001961480324846531, + "loss": 0.0346, + "step": 680 + }, + { + "epoch": 0.08996928361462496, + "grad_norm": 0.2222008854150772, + "learning_rate": 0.0001961366001648123, + "loss": 0.0246, + "step": 681 + }, + { + "epoch": 0.09010139710010899, + "grad_norm": 0.2726287245750427, + "learning_rate": 0.0001961251512391713, + "loss": 0.026, + "step": 682 + }, + { + "epoch": 0.09023351058559302, + "grad_norm": 0.2012794464826584, + "learning_rate": 0.00019611368570970767, + "loss": 0.0282, + "step": 683 + }, + { + "epoch": 0.09036562407107705, + "grad_norm": 0.16400226950645447, + "learning_rate": 0.0001961022035784019, + "loss": 0.0241, + "step": 684 + }, + { + "epoch": 0.09049773755656108, + "grad_norm": 0.30417323112487793, + "learning_rate": 0.00019609070484723738, + "loss": 0.035, + "step": 685 + }, + { + "epoch": 0.09062985104204511, + "grad_norm": 0.3207133412361145, + "learning_rate": 0.0001960791895182002, + "loss": 0.0469, + "step": 686 + }, + { + "epoch": 0.09076196452752915, + "grad_norm": 0.385721355676651, + "learning_rate": 0.00019606765759327944, + "loss": 0.036, + "step": 687 + }, + { + "epoch": 0.09089407801301318, + "grad_norm": 0.2257477343082428, + "learning_rate": 0.000196056109074467, + "loss": 0.033, + "step": 688 + }, + { + "epoch": 0.09102619149849721, + "grad_norm": 0.238136425614357, + "learning_rate": 0.00019604454396375773, + "loss": 0.0231, + "step": 689 + }, + { + "epoch": 0.09115830498398124, + "grad_norm": 0.2879732847213745, + "learning_rate": 0.00019603296226314927, + "loss": 0.0358, + "step": 690 + }, + { + "epoch": 0.09129041846946527, + "grad_norm": 0.20712998509407043, + "learning_rate": 0.00019602136397464212, + "loss": 0.0232, + "step": 691 + }, + { + "epoch": 0.0914225319549493, + "grad_norm": 0.19827014207839966, + "learning_rate": 0.0001960097491002397, + "loss": 0.0194, + "step": 692 + }, + { + "epoch": 0.09155464544043333, + "grad_norm": 0.23387299478054047, + "learning_rate": 0.00019599811764194823, + "loss": 0.0326, + "step": 693 + }, + { + "epoch": 0.09168675892591736, + "grad_norm": 0.2525857090950012, + "learning_rate": 0.00019598646960177683, + "loss": 0.0346, + "step": 694 + }, + { + "epoch": 0.0918188724114014, + "grad_norm": 0.22527404129505157, + "learning_rate": 0.00019597480498173754, + "loss": 0.0338, + "step": 695 + }, + { + "epoch": 0.09195098589688543, + "grad_norm": 0.2621941864490509, + "learning_rate": 0.0001959631237838451, + "loss": 0.0291, + "step": 696 + }, + { + "epoch": 0.09208309938236946, + "grad_norm": 0.19432783126831055, + "learning_rate": 0.0001959514260101173, + "loss": 0.0198, + "step": 697 + }, + { + "epoch": 0.09221521286785349, + "grad_norm": 0.42948463559150696, + "learning_rate": 0.00019593971166257466, + "loss": 0.0275, + "step": 698 + }, + { + "epoch": 0.09234732635333752, + "grad_norm": 0.3384920060634613, + "learning_rate": 0.00019592798074324067, + "loss": 0.0405, + "step": 699 + }, + { + "epoch": 0.09247943983882155, + "grad_norm": 0.2368774712085724, + "learning_rate": 0.00019591623325414161, + "loss": 0.0385, + "step": 700 + }, + { + "epoch": 0.09261155332430558, + "grad_norm": 0.31908854842185974, + "learning_rate": 0.0001959044691973066, + "loss": 0.0308, + "step": 701 + }, + { + "epoch": 0.09274366680978961, + "grad_norm": 0.24914051592350006, + "learning_rate": 0.0001958926885747677, + "loss": 0.0181, + "step": 702 + }, + { + "epoch": 0.09287578029527364, + "grad_norm": 0.33457881212234497, + "learning_rate": 0.00019588089138855978, + "loss": 0.0392, + "step": 703 + }, + { + "epoch": 0.09300789378075767, + "grad_norm": 0.3331588804721832, + "learning_rate": 0.0001958690776407206, + "loss": 0.0454, + "step": 704 + }, + { + "epoch": 0.0931400072662417, + "grad_norm": 0.30309465527534485, + "learning_rate": 0.00019585724733329072, + "loss": 0.0387, + "step": 705 + }, + { + "epoch": 0.09327212075172574, + "grad_norm": 0.1881301999092102, + "learning_rate": 0.00019584540046831364, + "loss": 0.0233, + "step": 706 + }, + { + "epoch": 0.09340423423720977, + "grad_norm": 0.19208797812461853, + "learning_rate": 0.0001958335370478357, + "loss": 0.0269, + "step": 707 + }, + { + "epoch": 0.0935363477226938, + "grad_norm": 0.3077560365200043, + "learning_rate": 0.00019582165707390602, + "loss": 0.0349, + "step": 708 + }, + { + "epoch": 0.09366846120817783, + "grad_norm": 0.19938530027866364, + "learning_rate": 0.0001958097605485767, + "loss": 0.023, + "step": 709 + }, + { + "epoch": 0.09380057469366186, + "grad_norm": 0.7101864218711853, + "learning_rate": 0.00019579784747390263, + "loss": 0.0621, + "step": 710 + }, + { + "epoch": 0.09393268817914589, + "grad_norm": 0.2244912087917328, + "learning_rate": 0.00019578591785194156, + "loss": 0.0371, + "step": 711 + }, + { + "epoch": 0.09406480166462992, + "grad_norm": 0.21191316843032837, + "learning_rate": 0.00019577397168475414, + "loss": 0.0227, + "step": 712 + }, + { + "epoch": 0.09419691515011394, + "grad_norm": 0.27440908551216125, + "learning_rate": 0.0001957620089744038, + "loss": 0.025, + "step": 713 + }, + { + "epoch": 0.09432902863559797, + "grad_norm": 0.3090989291667938, + "learning_rate": 0.0001957500297229569, + "loss": 0.0304, + "step": 714 + }, + { + "epoch": 0.094461142121082, + "grad_norm": 0.204986572265625, + "learning_rate": 0.00019573803393248263, + "loss": 0.0286, + "step": 715 + }, + { + "epoch": 0.09459325560656603, + "grad_norm": 0.21652153134346008, + "learning_rate": 0.00019572602160505305, + "loss": 0.0178, + "step": 716 + }, + { + "epoch": 0.09472536909205007, + "grad_norm": 0.3828398585319519, + "learning_rate": 0.00019571399274274305, + "loss": 0.0249, + "step": 717 + }, + { + "epoch": 0.0948574825775341, + "grad_norm": 0.24476584792137146, + "learning_rate": 0.00019570194734763038, + "loss": 0.0352, + "step": 718 + }, + { + "epoch": 0.09498959606301813, + "grad_norm": 0.2881144881248474, + "learning_rate": 0.00019568988542179567, + "loss": 0.0457, + "step": 719 + }, + { + "epoch": 0.09512170954850216, + "grad_norm": 0.19496379792690277, + "learning_rate": 0.0001956778069673224, + "loss": 0.0186, + "step": 720 + }, + { + "epoch": 0.09525382303398619, + "grad_norm": 0.21406958997249603, + "learning_rate": 0.00019566571198629694, + "loss": 0.0282, + "step": 721 + }, + { + "epoch": 0.09538593651947022, + "grad_norm": 0.1741507649421692, + "learning_rate": 0.00019565360048080837, + "loss": 0.0218, + "step": 722 + }, + { + "epoch": 0.09551805000495425, + "grad_norm": 0.2991659939289093, + "learning_rate": 0.00019564147245294876, + "loss": 0.0358, + "step": 723 + }, + { + "epoch": 0.09565016349043828, + "grad_norm": 0.361500084400177, + "learning_rate": 0.00019562932790481306, + "loss": 0.0533, + "step": 724 + }, + { + "epoch": 0.09578227697592231, + "grad_norm": 0.24400769174098969, + "learning_rate": 0.00019561716683849894, + "loss": 0.0336, + "step": 725 + }, + { + "epoch": 0.09591439046140635, + "grad_norm": 0.22594720125198364, + "learning_rate": 0.00019560498925610706, + "loss": 0.0296, + "step": 726 + }, + { + "epoch": 0.09604650394689038, + "grad_norm": 0.22000914812088013, + "learning_rate": 0.0001955927951597408, + "loss": 0.0327, + "step": 727 + }, + { + "epoch": 0.09617861743237441, + "grad_norm": 0.42935800552368164, + "learning_rate": 0.00019558058455150653, + "loss": 0.0432, + "step": 728 + }, + { + "epoch": 0.09631073091785844, + "grad_norm": 0.30375710129737854, + "learning_rate": 0.0001955683574335134, + "loss": 0.028, + "step": 729 + }, + { + "epoch": 0.09644284440334247, + "grad_norm": 0.33745771646499634, + "learning_rate": 0.00019555611380787333, + "loss": 0.0415, + "step": 730 + }, + { + "epoch": 0.0965749578888265, + "grad_norm": 0.25258249044418335, + "learning_rate": 0.00019554385367670128, + "loss": 0.0158, + "step": 731 + }, + { + "epoch": 0.09670707137431053, + "grad_norm": 0.20375578105449677, + "learning_rate": 0.0001955315770421149, + "loss": 0.0209, + "step": 732 + }, + { + "epoch": 0.09683918485979456, + "grad_norm": 0.2016744613647461, + "learning_rate": 0.00019551928390623477, + "loss": 0.0337, + "step": 733 + }, + { + "epoch": 0.0969712983452786, + "grad_norm": 0.26688241958618164, + "learning_rate": 0.00019550697427118429, + "loss": 0.0387, + "step": 734 + }, + { + "epoch": 0.09710341183076263, + "grad_norm": 0.21267996728420258, + "learning_rate": 0.00019549464813908973, + "loss": 0.0336, + "step": 735 + }, + { + "epoch": 0.09723552531624666, + "grad_norm": 0.3066573739051819, + "learning_rate": 0.0001954823055120802, + "loss": 0.0375, + "step": 736 + }, + { + "epoch": 0.09736763880173069, + "grad_norm": 0.22104468941688538, + "learning_rate": 0.00019546994639228765, + "loss": 0.0224, + "step": 737 + }, + { + "epoch": 0.09749975228721472, + "grad_norm": 0.20909041166305542, + "learning_rate": 0.00019545757078184687, + "loss": 0.0242, + "step": 738 + }, + { + "epoch": 0.09763186577269875, + "grad_norm": 0.2227659821510315, + "learning_rate": 0.00019544517868289556, + "loss": 0.0376, + "step": 739 + }, + { + "epoch": 0.09776397925818278, + "grad_norm": 0.23300065100193024, + "learning_rate": 0.00019543277009757417, + "loss": 0.0379, + "step": 740 + }, + { + "epoch": 0.09789609274366681, + "grad_norm": 0.320353239774704, + "learning_rate": 0.0001954203450280261, + "loss": 0.0414, + "step": 741 + }, + { + "epoch": 0.09802820622915084, + "grad_norm": 0.23459066450595856, + "learning_rate": 0.00019540790347639752, + "loss": 0.0339, + "step": 742 + }, + { + "epoch": 0.09816031971463487, + "grad_norm": 0.2674216032028198, + "learning_rate": 0.00019539544544483746, + "loss": 0.0255, + "step": 743 + }, + { + "epoch": 0.0982924332001189, + "grad_norm": 0.2853393256664276, + "learning_rate": 0.00019538297093549788, + "loss": 0.0349, + "step": 744 + }, + { + "epoch": 0.09842454668560294, + "grad_norm": 0.2072666585445404, + "learning_rate": 0.00019537047995053347, + "loss": 0.0254, + "step": 745 + }, + { + "epoch": 0.09855666017108697, + "grad_norm": 0.23279407620429993, + "learning_rate": 0.00019535797249210177, + "loss": 0.035, + "step": 746 + }, + { + "epoch": 0.098688773656571, + "grad_norm": 0.30693086981773376, + "learning_rate": 0.00019534544856236329, + "loss": 0.0384, + "step": 747 + }, + { + "epoch": 0.09882088714205503, + "grad_norm": 0.24419213831424713, + "learning_rate": 0.00019533290816348123, + "loss": 0.0403, + "step": 748 + }, + { + "epoch": 0.09895300062753906, + "grad_norm": 0.288097620010376, + "learning_rate": 0.0001953203512976218, + "loss": 0.034, + "step": 749 + }, + { + "epoch": 0.09908511411302309, + "grad_norm": 0.26100876927375793, + "learning_rate": 0.0001953077779669539, + "loss": 0.0331, + "step": 750 + }, + { + "epoch": 0.09921722759850712, + "grad_norm": 0.2615920305252075, + "learning_rate": 0.00019529518817364933, + "loss": 0.0389, + "step": 751 + }, + { + "epoch": 0.09934934108399116, + "grad_norm": 0.28125059604644775, + "learning_rate": 0.00019528258191988277, + "loss": 0.0344, + "step": 752 + }, + { + "epoch": 0.09948145456947517, + "grad_norm": 0.268867164850235, + "learning_rate": 0.00019526995920783174, + "loss": 0.0463, + "step": 753 + }, + { + "epoch": 0.0996135680549592, + "grad_norm": 0.303653359413147, + "learning_rate": 0.00019525732003967651, + "loss": 0.0454, + "step": 754 + }, + { + "epoch": 0.09974568154044323, + "grad_norm": 0.2101089507341385, + "learning_rate": 0.0001952446644176003, + "loss": 0.0332, + "step": 755 + }, + { + "epoch": 0.09987779502592727, + "grad_norm": 0.2549467980861664, + "learning_rate": 0.00019523199234378915, + "loss": 0.0284, + "step": 756 + }, + { + "epoch": 0.1000099085114113, + "grad_norm": 0.31274503469467163, + "learning_rate": 0.00019521930382043187, + "loss": 0.0491, + "step": 757 + }, + { + "epoch": 0.10014202199689533, + "grad_norm": 0.2537939250469208, + "learning_rate": 0.0001952065988497202, + "loss": 0.0315, + "step": 758 + }, + { + "epoch": 0.10027413548237936, + "grad_norm": 0.2864026129245758, + "learning_rate": 0.00019519387743384872, + "loss": 0.0367, + "step": 759 + }, + { + "epoch": 0.10040624896786339, + "grad_norm": 0.584783673286438, + "learning_rate": 0.00019518113957501477, + "loss": 0.0467, + "step": 760 + }, + { + "epoch": 0.10053836245334742, + "grad_norm": 0.14912083745002747, + "learning_rate": 0.00019516838527541857, + "loss": 0.0225, + "step": 761 + }, + { + "epoch": 0.10067047593883145, + "grad_norm": 0.19884039461612701, + "learning_rate": 0.0001951556145372632, + "loss": 0.0204, + "step": 762 + }, + { + "epoch": 0.10080258942431548, + "grad_norm": 0.20827095210552216, + "learning_rate": 0.00019514282736275454, + "loss": 0.0291, + "step": 763 + }, + { + "epoch": 0.10093470290979951, + "grad_norm": 0.23917272686958313, + "learning_rate": 0.0001951300237541014, + "loss": 0.0256, + "step": 764 + }, + { + "epoch": 0.10106681639528355, + "grad_norm": 0.23405112326145172, + "learning_rate": 0.00019511720371351534, + "loss": 0.04, + "step": 765 + }, + { + "epoch": 0.10119892988076758, + "grad_norm": 0.21369348466396332, + "learning_rate": 0.00019510436724321076, + "loss": 0.0386, + "step": 766 + }, + { + "epoch": 0.10133104336625161, + "grad_norm": 0.25339123606681824, + "learning_rate": 0.0001950915143454049, + "loss": 0.027, + "step": 767 + }, + { + "epoch": 0.10146315685173564, + "grad_norm": 0.2526932656764984, + "learning_rate": 0.00019507864502231792, + "loss": 0.0329, + "step": 768 + }, + { + "epoch": 0.10159527033721967, + "grad_norm": 0.223262757062912, + "learning_rate": 0.00019506575927617271, + "loss": 0.0298, + "step": 769 + }, + { + "epoch": 0.1017273838227037, + "grad_norm": 0.22292746603488922, + "learning_rate": 0.00019505285710919506, + "loss": 0.0378, + "step": 770 + }, + { + "epoch": 0.10185949730818773, + "grad_norm": 0.20766198635101318, + "learning_rate": 0.0001950399385236136, + "loss": 0.0321, + "step": 771 + }, + { + "epoch": 0.10199161079367176, + "grad_norm": 0.25311240553855896, + "learning_rate": 0.00019502700352165973, + "loss": 0.0397, + "step": 772 + }, + { + "epoch": 0.1021237242791558, + "grad_norm": 0.25872135162353516, + "learning_rate": 0.00019501405210556774, + "loss": 0.032, + "step": 773 + }, + { + "epoch": 0.10225583776463983, + "grad_norm": 0.14971302449703217, + "learning_rate": 0.00019500108427757473, + "loss": 0.0155, + "step": 774 + }, + { + "epoch": 0.10238795125012386, + "grad_norm": 0.22067025303840637, + "learning_rate": 0.0001949881000399207, + "loss": 0.0245, + "step": 775 + }, + { + "epoch": 0.10252006473560789, + "grad_norm": 0.2329145222902298, + "learning_rate": 0.00019497509939484843, + "loss": 0.0286, + "step": 776 + }, + { + "epoch": 0.10265217822109192, + "grad_norm": 0.15475256741046906, + "learning_rate": 0.00019496208234460346, + "loss": 0.0169, + "step": 777 + }, + { + "epoch": 0.10278429170657595, + "grad_norm": 0.2959286868572235, + "learning_rate": 0.00019494904889143434, + "loss": 0.0479, + "step": 778 + }, + { + "epoch": 0.10291640519205998, + "grad_norm": 0.3313973546028137, + "learning_rate": 0.0001949359990375923, + "loss": 0.0329, + "step": 779 + }, + { + "epoch": 0.10304851867754401, + "grad_norm": 0.3937395215034485, + "learning_rate": 0.00019492293278533147, + "loss": 0.0338, + "step": 780 + }, + { + "epoch": 0.10318063216302804, + "grad_norm": 0.2434310019016266, + "learning_rate": 0.0001949098501369088, + "loss": 0.0253, + "step": 781 + }, + { + "epoch": 0.10331274564851207, + "grad_norm": 0.3571442663669586, + "learning_rate": 0.00019489675109458406, + "loss": 0.0214, + "step": 782 + }, + { + "epoch": 0.1034448591339961, + "grad_norm": 0.20219728350639343, + "learning_rate": 0.0001948836356606199, + "loss": 0.0265, + "step": 783 + }, + { + "epoch": 0.10357697261948014, + "grad_norm": 0.277512788772583, + "learning_rate": 0.00019487050383728175, + "loss": 0.0469, + "step": 784 + }, + { + "epoch": 0.10370908610496417, + "grad_norm": 0.19468040764331818, + "learning_rate": 0.00019485735562683784, + "loss": 0.027, + "step": 785 + }, + { + "epoch": 0.1038411995904482, + "grad_norm": 0.23780375719070435, + "learning_rate": 0.00019484419103155937, + "loss": 0.0275, + "step": 786 + }, + { + "epoch": 0.10397331307593223, + "grad_norm": 0.17604751884937286, + "learning_rate": 0.0001948310100537202, + "loss": 0.0139, + "step": 787 + }, + { + "epoch": 0.10410542656141626, + "grad_norm": 0.25029489398002625, + "learning_rate": 0.0001948178126955971, + "loss": 0.0381, + "step": 788 + }, + { + "epoch": 0.10423754004690029, + "grad_norm": 0.20858237147331238, + "learning_rate": 0.00019480459895946975, + "loss": 0.0299, + "step": 789 + }, + { + "epoch": 0.10436965353238432, + "grad_norm": 0.28888922929763794, + "learning_rate": 0.00019479136884762048, + "loss": 0.0289, + "step": 790 + }, + { + "epoch": 0.10450176701786835, + "grad_norm": 0.2120070606470108, + "learning_rate": 0.00019477812236233456, + "loss": 0.0457, + "step": 791 + }, + { + "epoch": 0.10463388050335239, + "grad_norm": 0.248973086476326, + "learning_rate": 0.00019476485950590012, + "loss": 0.0411, + "step": 792 + }, + { + "epoch": 0.10476599398883642, + "grad_norm": 0.32294097542762756, + "learning_rate": 0.00019475158028060808, + "loss": 0.0381, + "step": 793 + }, + { + "epoch": 0.10489810747432043, + "grad_norm": 0.25859832763671875, + "learning_rate": 0.0001947382846887521, + "loss": 0.0276, + "step": 794 + }, + { + "epoch": 0.10503022095980447, + "grad_norm": 0.178748220205307, + "learning_rate": 0.0001947249727326288, + "loss": 0.0184, + "step": 795 + }, + { + "epoch": 0.1051623344452885, + "grad_norm": 0.21161046624183655, + "learning_rate": 0.00019471164441453755, + "loss": 0.0313, + "step": 796 + }, + { + "epoch": 0.10529444793077253, + "grad_norm": 0.36292991042137146, + "learning_rate": 0.0001946982997367806, + "loss": 0.0329, + "step": 797 + }, + { + "epoch": 0.10542656141625656, + "grad_norm": 0.4161984324455261, + "learning_rate": 0.00019468493870166293, + "loss": 0.0363, + "step": 798 + }, + { + "epoch": 0.10555867490174059, + "grad_norm": 0.21217572689056396, + "learning_rate": 0.00019467156131149248, + "loss": 0.0402, + "step": 799 + }, + { + "epoch": 0.10569078838722462, + "grad_norm": 0.24036584794521332, + "learning_rate": 0.00019465816756857992, + "loss": 0.0351, + "step": 800 + }, + { + "epoch": 0.10582290187270865, + "grad_norm": 0.18879947066307068, + "learning_rate": 0.00019464475747523876, + "loss": 0.0204, + "step": 801 + }, + { + "epoch": 0.10595501535819268, + "grad_norm": 0.26750242710113525, + "learning_rate": 0.00019463133103378533, + "loss": 0.0349, + "step": 802 + }, + { + "epoch": 0.10608712884367671, + "grad_norm": 0.2977040708065033, + "learning_rate": 0.0001946178882465388, + "loss": 0.0464, + "step": 803 + }, + { + "epoch": 0.10621924232916075, + "grad_norm": 0.1896338313817978, + "learning_rate": 0.0001946044291158212, + "loss": 0.0237, + "step": 804 + }, + { + "epoch": 0.10635135581464478, + "grad_norm": 0.23944416642189026, + "learning_rate": 0.00019459095364395728, + "loss": 0.0312, + "step": 805 + }, + { + "epoch": 0.10648346930012881, + "grad_norm": 0.23625792562961578, + "learning_rate": 0.00019457746183327475, + "loss": 0.0257, + "step": 806 + }, + { + "epoch": 0.10661558278561284, + "grad_norm": 0.22346678376197815, + "learning_rate": 0.000194563953686104, + "loss": 0.0202, + "step": 807 + }, + { + "epoch": 0.10674769627109687, + "grad_norm": 0.269264280796051, + "learning_rate": 0.00019455042920477834, + "loss": 0.0201, + "step": 808 + }, + { + "epoch": 0.1068798097565809, + "grad_norm": 0.27811092138290405, + "learning_rate": 0.00019453688839163392, + "loss": 0.0397, + "step": 809 + }, + { + "epoch": 0.10701192324206493, + "grad_norm": 0.3348802328109741, + "learning_rate": 0.00019452333124900955, + "loss": 0.0354, + "step": 810 + }, + { + "epoch": 0.10714403672754896, + "grad_norm": 0.4341381788253784, + "learning_rate": 0.00019450975777924706, + "loss": 0.0446, + "step": 811 + }, + { + "epoch": 0.107276150213033, + "grad_norm": 0.2206539660692215, + "learning_rate": 0.00019449616798469097, + "loss": 0.0257, + "step": 812 + }, + { + "epoch": 0.10740826369851703, + "grad_norm": 0.26142534613609314, + "learning_rate": 0.00019448256186768869, + "loss": 0.0351, + "step": 813 + }, + { + "epoch": 0.10754037718400106, + "grad_norm": 0.3535195589065552, + "learning_rate": 0.00019446893943059044, + "loss": 0.0217, + "step": 814 + }, + { + "epoch": 0.10767249066948509, + "grad_norm": 0.2100822776556015, + "learning_rate": 0.0001944553006757492, + "loss": 0.0333, + "step": 815 + }, + { + "epoch": 0.10780460415496912, + "grad_norm": 0.19954076409339905, + "learning_rate": 0.00019444164560552082, + "loss": 0.0242, + "step": 816 + }, + { + "epoch": 0.10793671764045315, + "grad_norm": 0.25790032744407654, + "learning_rate": 0.00019442797422226398, + "loss": 0.0282, + "step": 817 + }, + { + "epoch": 0.10806883112593718, + "grad_norm": 0.2589845359325409, + "learning_rate": 0.0001944142865283401, + "loss": 0.0233, + "step": 818 + }, + { + "epoch": 0.10820094461142121, + "grad_norm": 0.2854527235031128, + "learning_rate": 0.00019440058252611354, + "loss": 0.0378, + "step": 819 + }, + { + "epoch": 0.10833305809690524, + "grad_norm": 0.35180917382240295, + "learning_rate": 0.00019438686221795137, + "loss": 0.028, + "step": 820 + }, + { + "epoch": 0.10846517158238927, + "grad_norm": 0.22869399189949036, + "learning_rate": 0.00019437312560622355, + "loss": 0.0215, + "step": 821 + }, + { + "epoch": 0.1085972850678733, + "grad_norm": 0.25632917881011963, + "learning_rate": 0.00019435937269330275, + "loss": 0.0395, + "step": 822 + }, + { + "epoch": 0.10872939855335734, + "grad_norm": 0.22697928547859192, + "learning_rate": 0.00019434560348156464, + "loss": 0.0341, + "step": 823 + }, + { + "epoch": 0.10886151203884137, + "grad_norm": 0.285995215177536, + "learning_rate": 0.00019433181797338752, + "loss": 0.0377, + "step": 824 + }, + { + "epoch": 0.1089936255243254, + "grad_norm": 0.15859773755073547, + "learning_rate": 0.00019431801617115257, + "loss": 0.017, + "step": 825 + }, + { + "epoch": 0.10912573900980943, + "grad_norm": 0.2510858178138733, + "learning_rate": 0.00019430419807724383, + "loss": 0.0477, + "step": 826 + }, + { + "epoch": 0.10925785249529346, + "grad_norm": 0.17218337953090668, + "learning_rate": 0.0001942903636940481, + "loss": 0.0151, + "step": 827 + }, + { + "epoch": 0.10938996598077749, + "grad_norm": 0.2229837030172348, + "learning_rate": 0.000194276513023955, + "loss": 0.0365, + "step": 828 + }, + { + "epoch": 0.10952207946626152, + "grad_norm": 0.25345245003700256, + "learning_rate": 0.00019426264606935703, + "loss": 0.027, + "step": 829 + }, + { + "epoch": 0.10965419295174555, + "grad_norm": 0.2927243411540985, + "learning_rate": 0.00019424876283264937, + "loss": 0.0393, + "step": 830 + }, + { + "epoch": 0.10978630643722959, + "grad_norm": 0.17304779589176178, + "learning_rate": 0.00019423486331623013, + "loss": 0.0186, + "step": 831 + }, + { + "epoch": 0.10991841992271362, + "grad_norm": 0.16232873499393463, + "learning_rate": 0.0001942209475225002, + "loss": 0.0219, + "step": 832 + }, + { + "epoch": 0.11005053340819765, + "grad_norm": 0.2516721189022064, + "learning_rate": 0.00019420701545386327, + "loss": 0.0282, + "step": 833 + }, + { + "epoch": 0.11018264689368168, + "grad_norm": 0.3067221939563751, + "learning_rate": 0.0001941930671127258, + "loss": 0.0331, + "step": 834 + }, + { + "epoch": 0.1103147603791657, + "grad_norm": 0.28294551372528076, + "learning_rate": 0.00019417910250149714, + "loss": 0.043, + "step": 835 + }, + { + "epoch": 0.11044687386464973, + "grad_norm": 0.3205573856830597, + "learning_rate": 0.00019416512162258944, + "loss": 0.0402, + "step": 836 + }, + { + "epoch": 0.11057898735013376, + "grad_norm": 0.38353288173675537, + "learning_rate": 0.00019415112447841764, + "loss": 0.0293, + "step": 837 + }, + { + "epoch": 0.11071110083561779, + "grad_norm": 0.234690859913826, + "learning_rate": 0.0001941371110713994, + "loss": 0.0396, + "step": 838 + }, + { + "epoch": 0.11084321432110182, + "grad_norm": 0.292104035615921, + "learning_rate": 0.00019412308140395534, + "loss": 0.0368, + "step": 839 + }, + { + "epoch": 0.11097532780658585, + "grad_norm": 0.2527158856391907, + "learning_rate": 0.00019410903547850884, + "loss": 0.0473, + "step": 840 + }, + { + "epoch": 0.11110744129206988, + "grad_norm": 0.33742791414260864, + "learning_rate": 0.00019409497329748603, + "loss": 0.0165, + "step": 841 + }, + { + "epoch": 0.11123955477755391, + "grad_norm": 0.3314475119113922, + "learning_rate": 0.0001940808948633159, + "loss": 0.0257, + "step": 842 + }, + { + "epoch": 0.11137166826303795, + "grad_norm": 0.5029959678649902, + "learning_rate": 0.00019406680017843022, + "loss": 0.0449, + "step": 843 + }, + { + "epoch": 0.11150378174852198, + "grad_norm": 0.24230080842971802, + "learning_rate": 0.0001940526892452636, + "loss": 0.0196, + "step": 844 + }, + { + "epoch": 0.11163589523400601, + "grad_norm": 0.23190683126449585, + "learning_rate": 0.00019403856206625349, + "loss": 0.0252, + "step": 845 + }, + { + "epoch": 0.11176800871949004, + "grad_norm": 0.3320704400539398, + "learning_rate": 0.00019402441864384, + "loss": 0.0435, + "step": 846 + }, + { + "epoch": 0.11190012220497407, + "grad_norm": 0.2635941505432129, + "learning_rate": 0.00019401025898046622, + "loss": 0.0414, + "step": 847 + }, + { + "epoch": 0.1120322356904581, + "grad_norm": 0.40881574153900146, + "learning_rate": 0.00019399608307857792, + "loss": 0.0514, + "step": 848 + }, + { + "epoch": 0.11216434917594213, + "grad_norm": 0.21754924952983856, + "learning_rate": 0.00019398189094062374, + "loss": 0.0194, + "step": 849 + }, + { + "epoch": 0.11229646266142616, + "grad_norm": 0.24845930933952332, + "learning_rate": 0.0001939676825690551, + "loss": 0.0243, + "step": 850 + }, + { + "epoch": 0.1124285761469102, + "grad_norm": 0.30211737751960754, + "learning_rate": 0.00019395345796632626, + "loss": 0.034, + "step": 851 + }, + { + "epoch": 0.11256068963239423, + "grad_norm": 0.25241124629974365, + "learning_rate": 0.00019393921713489417, + "loss": 0.0238, + "step": 852 + }, + { + "epoch": 0.11269280311787826, + "grad_norm": 0.26138097047805786, + "learning_rate": 0.00019392496007721878, + "loss": 0.0257, + "step": 853 + }, + { + "epoch": 0.11282491660336229, + "grad_norm": 0.2760176956653595, + "learning_rate": 0.00019391068679576264, + "loss": 0.0304, + "step": 854 + }, + { + "epoch": 0.11295703008884632, + "grad_norm": 0.1539127230644226, + "learning_rate": 0.0001938963972929913, + "loss": 0.0276, + "step": 855 + }, + { + "epoch": 0.11308914357433035, + "grad_norm": 0.31126534938812256, + "learning_rate": 0.00019388209157137286, + "loss": 0.0529, + "step": 856 + }, + { + "epoch": 0.11322125705981438, + "grad_norm": 0.2580847442150116, + "learning_rate": 0.00019386776963337848, + "loss": 0.0368, + "step": 857 + }, + { + "epoch": 0.11335337054529841, + "grad_norm": 0.3393428921699524, + "learning_rate": 0.00019385343148148193, + "loss": 0.0195, + "step": 858 + }, + { + "epoch": 0.11348548403078244, + "grad_norm": 0.22064736485481262, + "learning_rate": 0.00019383907711815993, + "loss": 0.0238, + "step": 859 + }, + { + "epoch": 0.11361759751626647, + "grad_norm": 0.2550196945667267, + "learning_rate": 0.00019382470654589188, + "loss": 0.0234, + "step": 860 + }, + { + "epoch": 0.1137497110017505, + "grad_norm": 0.1911945790052414, + "learning_rate": 0.00019381031976716006, + "loss": 0.0199, + "step": 861 + }, + { + "epoch": 0.11388182448723454, + "grad_norm": 0.2527654469013214, + "learning_rate": 0.0001937959167844495, + "loss": 0.0298, + "step": 862 + }, + { + "epoch": 0.11401393797271857, + "grad_norm": 0.20978406071662903, + "learning_rate": 0.00019378149760024803, + "loss": 0.0205, + "step": 863 + }, + { + "epoch": 0.1141460514582026, + "grad_norm": 0.38254910707473755, + "learning_rate": 0.00019376706221704628, + "loss": 0.0315, + "step": 864 + }, + { + "epoch": 0.11427816494368663, + "grad_norm": 0.33304017782211304, + "learning_rate": 0.00019375261063733773, + "loss": 0.0209, + "step": 865 + }, + { + "epoch": 0.11441027842917066, + "grad_norm": 0.2691979706287384, + "learning_rate": 0.00019373814286361863, + "loss": 0.0342, + "step": 866 + }, + { + "epoch": 0.11454239191465469, + "grad_norm": 0.21065416932106018, + "learning_rate": 0.000193723658898388, + "loss": 0.029, + "step": 867 + }, + { + "epoch": 0.11467450540013872, + "grad_norm": 0.3999660015106201, + "learning_rate": 0.00019370915874414765, + "loss": 0.0348, + "step": 868 + }, + { + "epoch": 0.11480661888562275, + "grad_norm": 0.35480427742004395, + "learning_rate": 0.00019369464240340226, + "loss": 0.0295, + "step": 869 + }, + { + "epoch": 0.11493873237110679, + "grad_norm": 0.24580347537994385, + "learning_rate": 0.00019368010987865918, + "loss": 0.0205, + "step": 870 + }, + { + "epoch": 0.11507084585659082, + "grad_norm": 0.27900341153144836, + "learning_rate": 0.00019366556117242874, + "loss": 0.0271, + "step": 871 + }, + { + "epoch": 0.11520295934207485, + "grad_norm": 0.2475052922964096, + "learning_rate": 0.00019365099628722388, + "loss": 0.0364, + "step": 872 + }, + { + "epoch": 0.11533507282755888, + "grad_norm": 0.27171263098716736, + "learning_rate": 0.00019363641522556038, + "loss": 0.0516, + "step": 873 + }, + { + "epoch": 0.11546718631304291, + "grad_norm": 0.23279814422130585, + "learning_rate": 0.000193621817989957, + "loss": 0.0496, + "step": 874 + }, + { + "epoch": 0.11559929979852693, + "grad_norm": 0.26830148696899414, + "learning_rate": 0.00019360720458293495, + "loss": 0.025, + "step": 875 + }, + { + "epoch": 0.11573141328401096, + "grad_norm": 0.47811031341552734, + "learning_rate": 0.00019359257500701853, + "loss": 0.0368, + "step": 876 + }, + { + "epoch": 0.11586352676949499, + "grad_norm": 0.2844720780849457, + "learning_rate": 0.0001935779292647347, + "loss": 0.0272, + "step": 877 + }, + { + "epoch": 0.11599564025497902, + "grad_norm": 0.3734263479709625, + "learning_rate": 0.00019356326735861322, + "loss": 0.0274, + "step": 878 + }, + { + "epoch": 0.11612775374046305, + "grad_norm": 0.29889705777168274, + "learning_rate": 0.00019354858929118674, + "loss": 0.026, + "step": 879 + }, + { + "epoch": 0.11625986722594708, + "grad_norm": 0.37999603152275085, + "learning_rate": 0.00019353389506499054, + "loss": 0.0515, + "step": 880 + }, + { + "epoch": 0.11639198071143111, + "grad_norm": 0.2977294921875, + "learning_rate": 0.00019351918468256277, + "loss": 0.0431, + "step": 881 + }, + { + "epoch": 0.11652409419691515, + "grad_norm": 0.25868692994117737, + "learning_rate": 0.00019350445814644442, + "loss": 0.0348, + "step": 882 + }, + { + "epoch": 0.11665620768239918, + "grad_norm": 0.2617489993572235, + "learning_rate": 0.0001934897154591792, + "loss": 0.0292, + "step": 883 + }, + { + "epoch": 0.11678832116788321, + "grad_norm": 0.21163403987884521, + "learning_rate": 0.00019347495662331364, + "loss": 0.0355, + "step": 884 + }, + { + "epoch": 0.11692043465336724, + "grad_norm": 0.2167588174343109, + "learning_rate": 0.00019346018164139705, + "loss": 0.0316, + "step": 885 + }, + { + "epoch": 0.11705254813885127, + "grad_norm": 0.30651724338531494, + "learning_rate": 0.00019344539051598152, + "loss": 0.0546, + "step": 886 + }, + { + "epoch": 0.1171846616243353, + "grad_norm": 0.1882271021604538, + "learning_rate": 0.00019343058324962196, + "loss": 0.0294, + "step": 887 + }, + { + "epoch": 0.11731677510981933, + "grad_norm": 0.17964918911457062, + "learning_rate": 0.00019341575984487604, + "loss": 0.0254, + "step": 888 + }, + { + "epoch": 0.11744888859530336, + "grad_norm": 0.16270661354064941, + "learning_rate": 0.0001934009203043042, + "loss": 0.0252, + "step": 889 + }, + { + "epoch": 0.1175810020807874, + "grad_norm": 0.8287057876586914, + "learning_rate": 0.00019338606463046977, + "loss": 0.027, + "step": 890 + }, + { + "epoch": 0.11771311556627143, + "grad_norm": 0.2975148856639862, + "learning_rate": 0.00019337119282593874, + "loss": 0.0288, + "step": 891 + }, + { + "epoch": 0.11784522905175546, + "grad_norm": 0.2509402632713318, + "learning_rate": 0.0001933563048932799, + "loss": 0.0401, + "step": 892 + }, + { + "epoch": 0.11797734253723949, + "grad_norm": 0.18912091851234436, + "learning_rate": 0.0001933414008350649, + "loss": 0.0253, + "step": 893 + }, + { + "epoch": 0.11810945602272352, + "grad_norm": 0.22809197008609772, + "learning_rate": 0.00019332648065386815, + "loss": 0.0385, + "step": 894 + }, + { + "epoch": 0.11824156950820755, + "grad_norm": 0.41790077090263367, + "learning_rate": 0.00019331154435226684, + "loss": 0.0377, + "step": 895 + }, + { + "epoch": 0.11837368299369158, + "grad_norm": 0.2439369559288025, + "learning_rate": 0.00019329659193284088, + "loss": 0.026, + "step": 896 + }, + { + "epoch": 0.11850579647917561, + "grad_norm": 0.2535647749900818, + "learning_rate": 0.00019328162339817307, + "loss": 0.024, + "step": 897 + }, + { + "epoch": 0.11863790996465964, + "grad_norm": 0.36167967319488525, + "learning_rate": 0.00019326663875084891, + "loss": 0.0273, + "step": 898 + }, + { + "epoch": 0.11877002345014367, + "grad_norm": 0.19031481444835663, + "learning_rate": 0.00019325163799345675, + "loss": 0.0311, + "step": 899 + }, + { + "epoch": 0.1189021369356277, + "grad_norm": 0.31373831629753113, + "learning_rate": 0.0001932366211285877, + "loss": 0.0328, + "step": 900 + }, + { + "epoch": 0.11903425042111174, + "grad_norm": 0.30672964453697205, + "learning_rate": 0.00019322158815883558, + "loss": 0.0529, + "step": 901 + }, + { + "epoch": 0.11916636390659577, + "grad_norm": 0.20721125602722168, + "learning_rate": 0.00019320653908679711, + "loss": 0.0366, + "step": 902 + }, + { + "epoch": 0.1192984773920798, + "grad_norm": 0.19243820011615753, + "learning_rate": 0.00019319147391507174, + "loss": 0.0185, + "step": 903 + }, + { + "epoch": 0.11943059087756383, + "grad_norm": 0.2745702564716339, + "learning_rate": 0.00019317639264626165, + "loss": 0.0356, + "step": 904 + }, + { + "epoch": 0.11956270436304786, + "grad_norm": 0.3759895861148834, + "learning_rate": 0.00019316129528297192, + "loss": 0.0121, + "step": 905 + }, + { + "epoch": 0.11969481784853189, + "grad_norm": 0.2563433349132538, + "learning_rate": 0.00019314618182781024, + "loss": 0.0356, + "step": 906 + }, + { + "epoch": 0.11982693133401592, + "grad_norm": 0.2550245225429535, + "learning_rate": 0.00019313105228338726, + "loss": 0.0267, + "step": 907 + }, + { + "epoch": 0.11995904481949995, + "grad_norm": 0.27986523509025574, + "learning_rate": 0.00019311590665231626, + "loss": 0.0295, + "step": 908 + }, + { + "epoch": 0.12009115830498399, + "grad_norm": 0.24848835170269012, + "learning_rate": 0.00019310074493721343, + "loss": 0.0395, + "step": 909 + }, + { + "epoch": 0.12022327179046802, + "grad_norm": 0.2371983379125595, + "learning_rate": 0.00019308556714069764, + "loss": 0.0377, + "step": 910 + }, + { + "epoch": 0.12035538527595205, + "grad_norm": 0.26739948987960815, + "learning_rate": 0.00019307037326539057, + "loss": 0.0215, + "step": 911 + }, + { + "epoch": 0.12048749876143608, + "grad_norm": 0.19139261543750763, + "learning_rate": 0.0001930551633139167, + "loss": 0.028, + "step": 912 + }, + { + "epoch": 0.12061961224692011, + "grad_norm": 0.19188474118709564, + "learning_rate": 0.0001930399372889032, + "loss": 0.0202, + "step": 913 + }, + { + "epoch": 0.12075172573240414, + "grad_norm": 0.25728708505630493, + "learning_rate": 0.00019302469519298014, + "loss": 0.0417, + "step": 914 + }, + { + "epoch": 0.12088383921788817, + "grad_norm": 0.28429311513900757, + "learning_rate": 0.0001930094370287803, + "loss": 0.0273, + "step": 915 + }, + { + "epoch": 0.12101595270337219, + "grad_norm": 0.2718221843242645, + "learning_rate": 0.00019299416279893925, + "loss": 0.0349, + "step": 916 + }, + { + "epoch": 0.12114806618885622, + "grad_norm": 0.18933570384979248, + "learning_rate": 0.0001929788725060953, + "loss": 0.0248, + "step": 917 + }, + { + "epoch": 0.12128017967434025, + "grad_norm": 0.2467522919178009, + "learning_rate": 0.00019296356615288959, + "loss": 0.0216, + "step": 918 + }, + { + "epoch": 0.12141229315982428, + "grad_norm": 0.22754551470279694, + "learning_rate": 0.00019294824374196598, + "loss": 0.0266, + "step": 919 + }, + { + "epoch": 0.12154440664530831, + "grad_norm": 0.2605039179325104, + "learning_rate": 0.00019293290527597114, + "loss": 0.0315, + "step": 920 + }, + { + "epoch": 0.12167652013079235, + "grad_norm": 0.2127530425786972, + "learning_rate": 0.00019291755075755452, + "loss": 0.0351, + "step": 921 + }, + { + "epoch": 0.12180863361627638, + "grad_norm": 0.3158978521823883, + "learning_rate": 0.00019290218018936829, + "loss": 0.0342, + "step": 922 + }, + { + "epoch": 0.12194074710176041, + "grad_norm": 0.2638450860977173, + "learning_rate": 0.00019288679357406746, + "loss": 0.0407, + "step": 923 + }, + { + "epoch": 0.12207286058724444, + "grad_norm": 0.26631325483322144, + "learning_rate": 0.00019287139091430977, + "loss": 0.0339, + "step": 924 + }, + { + "epoch": 0.12220497407272847, + "grad_norm": 0.24103693664073944, + "learning_rate": 0.00019285597221275572, + "loss": 0.0252, + "step": 925 + }, + { + "epoch": 0.1223370875582125, + "grad_norm": 0.1970626562833786, + "learning_rate": 0.00019284053747206867, + "loss": 0.0211, + "step": 926 + }, + { + "epoch": 0.12246920104369653, + "grad_norm": 0.23220521211624146, + "learning_rate": 0.0001928250866949146, + "loss": 0.0363, + "step": 927 + }, + { + "epoch": 0.12260131452918056, + "grad_norm": 0.2567414343357086, + "learning_rate": 0.0001928096198839624, + "loss": 0.0429, + "step": 928 + }, + { + "epoch": 0.1227334280146646, + "grad_norm": 0.18246598541736603, + "learning_rate": 0.00019279413704188363, + "loss": 0.0319, + "step": 929 + }, + { + "epoch": 0.12286554150014863, + "grad_norm": 0.2592841684818268, + "learning_rate": 0.00019277863817135268, + "loss": 0.0284, + "step": 930 + }, + { + "epoch": 0.12299765498563266, + "grad_norm": 0.29718905687332153, + "learning_rate": 0.00019276312327504673, + "loss": 0.0195, + "step": 931 + }, + { + "epoch": 0.12312976847111669, + "grad_norm": 0.48700830340385437, + "learning_rate": 0.0001927475923556456, + "loss": 0.0263, + "step": 932 + }, + { + "epoch": 0.12326188195660072, + "grad_norm": 0.18568505346775055, + "learning_rate": 0.00019273204541583208, + "loss": 0.0238, + "step": 933 + }, + { + "epoch": 0.12339399544208475, + "grad_norm": 0.32222384214401245, + "learning_rate": 0.00019271648245829153, + "loss": 0.0279, + "step": 934 + }, + { + "epoch": 0.12352610892756878, + "grad_norm": 0.2924232482910156, + "learning_rate": 0.00019270090348571216, + "loss": 0.0282, + "step": 935 + }, + { + "epoch": 0.12365822241305281, + "grad_norm": 0.17929571866989136, + "learning_rate": 0.00019268530850078498, + "loss": 0.0228, + "step": 936 + }, + { + "epoch": 0.12379033589853684, + "grad_norm": 0.2127278745174408, + "learning_rate": 0.0001926696975062037, + "loss": 0.0388, + "step": 937 + }, + { + "epoch": 0.12392244938402087, + "grad_norm": 0.14027151465415955, + "learning_rate": 0.00019265407050466485, + "loss": 0.0159, + "step": 938 + }, + { + "epoch": 0.1240545628695049, + "grad_norm": 0.23516128957271576, + "learning_rate": 0.0001926384274988677, + "loss": 0.0367, + "step": 939 + }, + { + "epoch": 0.12418667635498894, + "grad_norm": 0.27905580401420593, + "learning_rate": 0.00019262276849151433, + "loss": 0.0373, + "step": 940 + }, + { + "epoch": 0.12431878984047297, + "grad_norm": 0.3128524124622345, + "learning_rate": 0.00019260709348530944, + "loss": 0.0312, + "step": 941 + }, + { + "epoch": 0.124450903325957, + "grad_norm": 0.3510471284389496, + "learning_rate": 0.00019259140248296068, + "loss": 0.0413, + "step": 942 + }, + { + "epoch": 0.12458301681144103, + "grad_norm": 0.2699235677719116, + "learning_rate": 0.00019257569548717832, + "loss": 0.0279, + "step": 943 + }, + { + "epoch": 0.12471513029692506, + "grad_norm": 0.2423953115940094, + "learning_rate": 0.00019255997250067553, + "loss": 0.0317, + "step": 944 + }, + { + "epoch": 0.12484724378240909, + "grad_norm": 0.3213600516319275, + "learning_rate": 0.00019254423352616805, + "loss": 0.0402, + "step": 945 + }, + { + "epoch": 0.12497935726789312, + "grad_norm": 0.26986581087112427, + "learning_rate": 0.00019252847856637457, + "loss": 0.0301, + "step": 946 + }, + { + "epoch": 0.12511147075337714, + "grad_norm": 0.2727492153644562, + "learning_rate": 0.00019251270762401647, + "loss": 0.0208, + "step": 947 + }, + { + "epoch": 0.12524358423886117, + "grad_norm": 0.2855754792690277, + "learning_rate": 0.00019249692070181785, + "loss": 0.0419, + "step": 948 + }, + { + "epoch": 0.1253756977243452, + "grad_norm": 0.278870165348053, + "learning_rate": 0.00019248111780250561, + "loss": 0.0214, + "step": 949 + }, + { + "epoch": 0.12550781120982923, + "grad_norm": 0.18916112184524536, + "learning_rate": 0.00019246529892880945, + "loss": 0.025, + "step": 950 + }, + { + "epoch": 0.12563992469531327, + "grad_norm": 0.2686702013015747, + "learning_rate": 0.0001924494640834617, + "loss": 0.0321, + "step": 951 + }, + { + "epoch": 0.1257720381807973, + "grad_norm": 0.23873859643936157, + "learning_rate": 0.0001924336132691976, + "loss": 0.0307, + "step": 952 + }, + { + "epoch": 0.12590415166628133, + "grad_norm": 0.26889604330062866, + "learning_rate": 0.0001924177464887551, + "loss": 0.0355, + "step": 953 + }, + { + "epoch": 0.12603626515176536, + "grad_norm": 0.2223420888185501, + "learning_rate": 0.0001924018637448748, + "loss": 0.0416, + "step": 954 + }, + { + "epoch": 0.1261683786372494, + "grad_norm": 0.24047642946243286, + "learning_rate": 0.00019238596504030024, + "loss": 0.0317, + "step": 955 + }, + { + "epoch": 0.12630049212273342, + "grad_norm": 0.3237987160682678, + "learning_rate": 0.00019237005037777755, + "loss": 0.0351, + "step": 956 + }, + { + "epoch": 0.12643260560821745, + "grad_norm": 0.30036526918411255, + "learning_rate": 0.00019235411976005576, + "loss": 0.0416, + "step": 957 + }, + { + "epoch": 0.12656471909370148, + "grad_norm": 0.2027190625667572, + "learning_rate": 0.00019233817318988652, + "loss": 0.0368, + "step": 958 + }, + { + "epoch": 0.12669683257918551, + "grad_norm": 0.23805420100688934, + "learning_rate": 0.00019232221067002437, + "loss": 0.02, + "step": 959 + }, + { + "epoch": 0.12682894606466955, + "grad_norm": 0.3873671293258667, + "learning_rate": 0.00019230623220322648, + "loss": 0.0294, + "step": 960 + }, + { + "epoch": 0.12696105955015358, + "grad_norm": 0.27125123143196106, + "learning_rate": 0.00019229023779225284, + "loss": 0.0299, + "step": 961 + }, + { + "epoch": 0.1270931730356376, + "grad_norm": 0.2969513535499573, + "learning_rate": 0.0001922742274398662, + "loss": 0.031, + "step": 962 + }, + { + "epoch": 0.12722528652112164, + "grad_norm": 0.244962677359581, + "learning_rate": 0.00019225820114883208, + "loss": 0.0275, + "step": 963 + }, + { + "epoch": 0.12735740000660567, + "grad_norm": 0.2579887807369232, + "learning_rate": 0.00019224215892191864, + "loss": 0.0332, + "step": 964 + }, + { + "epoch": 0.1274895134920897, + "grad_norm": 0.1939936876296997, + "learning_rate": 0.00019222610076189694, + "loss": 0.03, + "step": 965 + }, + { + "epoch": 0.12762162697757373, + "grad_norm": 0.24424876272678375, + "learning_rate": 0.0001922100266715407, + "loss": 0.0359, + "step": 966 + }, + { + "epoch": 0.12775374046305776, + "grad_norm": 0.17694316804409027, + "learning_rate": 0.00019219393665362647, + "loss": 0.0187, + "step": 967 + }, + { + "epoch": 0.1278858539485418, + "grad_norm": 0.27498042583465576, + "learning_rate": 0.00019217783071093342, + "loss": 0.0256, + "step": 968 + }, + { + "epoch": 0.12801796743402583, + "grad_norm": 0.24844923615455627, + "learning_rate": 0.0001921617088462436, + "loss": 0.0288, + "step": 969 + }, + { + "epoch": 0.12815008091950986, + "grad_norm": 0.19000475108623505, + "learning_rate": 0.00019214557106234174, + "loss": 0.0247, + "step": 970 + }, + { + "epoch": 0.1282821944049939, + "grad_norm": 0.3154200613498688, + "learning_rate": 0.00019212941736201537, + "loss": 0.031, + "step": 971 + }, + { + "epoch": 0.12841430789047792, + "grad_norm": 0.3130955398082733, + "learning_rate": 0.00019211324774805473, + "loss": 0.0349, + "step": 972 + }, + { + "epoch": 0.12854642137596195, + "grad_norm": 0.2864592671394348, + "learning_rate": 0.00019209706222325277, + "loss": 0.032, + "step": 973 + }, + { + "epoch": 0.12867853486144598, + "grad_norm": 0.22307147085666656, + "learning_rate": 0.0001920808607904053, + "loss": 0.037, + "step": 974 + }, + { + "epoch": 0.12881064834693, + "grad_norm": 0.3396158814430237, + "learning_rate": 0.00019206464345231078, + "loss": 0.0337, + "step": 975 + }, + { + "epoch": 0.12894276183241404, + "grad_norm": 0.24885155260562897, + "learning_rate": 0.00019204841021177048, + "loss": 0.0291, + "step": 976 + }, + { + "epoch": 0.12907487531789807, + "grad_norm": 0.1922139674425125, + "learning_rate": 0.00019203216107158833, + "loss": 0.02, + "step": 977 + }, + { + "epoch": 0.1292069888033821, + "grad_norm": 0.252600759267807, + "learning_rate": 0.00019201589603457114, + "loss": 0.0259, + "step": 978 + }, + { + "epoch": 0.12933910228886614, + "grad_norm": 0.24059033393859863, + "learning_rate": 0.00019199961510352832, + "loss": 0.0281, + "step": 979 + }, + { + "epoch": 0.12947121577435017, + "grad_norm": 0.21730482578277588, + "learning_rate": 0.00019198331828127217, + "loss": 0.034, + "step": 980 + }, + { + "epoch": 0.1296033292598342, + "grad_norm": 0.3147624433040619, + "learning_rate": 0.00019196700557061762, + "loss": 0.0323, + "step": 981 + }, + { + "epoch": 0.12973544274531823, + "grad_norm": 0.21451252698898315, + "learning_rate": 0.00019195067697438237, + "loss": 0.0254, + "step": 982 + }, + { + "epoch": 0.12986755623080226, + "grad_norm": 0.230705127120018, + "learning_rate": 0.0001919343324953869, + "loss": 0.0398, + "step": 983 + }, + { + "epoch": 0.1299996697162863, + "grad_norm": 0.19270865619182587, + "learning_rate": 0.00019191797213645445, + "loss": 0.0274, + "step": 984 + }, + { + "epoch": 0.13013178320177032, + "grad_norm": 0.1896556168794632, + "learning_rate": 0.00019190159590041088, + "loss": 0.0197, + "step": 985 + }, + { + "epoch": 0.13026389668725435, + "grad_norm": 0.21830704808235168, + "learning_rate": 0.00019188520379008494, + "loss": 0.0191, + "step": 986 + }, + { + "epoch": 0.13039601017273839, + "grad_norm": 0.3246423304080963, + "learning_rate": 0.00019186879580830807, + "loss": 0.0294, + "step": 987 + }, + { + "epoch": 0.13052812365822242, + "grad_norm": 0.34824860095977783, + "learning_rate": 0.0001918523719579144, + "loss": 0.0418, + "step": 988 + }, + { + "epoch": 0.13066023714370645, + "grad_norm": 0.19474251568317413, + "learning_rate": 0.00019183593224174084, + "loss": 0.0265, + "step": 989 + }, + { + "epoch": 0.13079235062919048, + "grad_norm": 0.2300022393465042, + "learning_rate": 0.00019181947666262712, + "loss": 0.0312, + "step": 990 + }, + { + "epoch": 0.1309244641146745, + "grad_norm": 0.2592034339904785, + "learning_rate": 0.00019180300522341558, + "loss": 0.0434, + "step": 991 + }, + { + "epoch": 0.13105657760015854, + "grad_norm": 0.21826054155826569, + "learning_rate": 0.0001917865179269513, + "loss": 0.0233, + "step": 992 + }, + { + "epoch": 0.13118869108564257, + "grad_norm": 0.263738214969635, + "learning_rate": 0.00019177001477608226, + "loss": 0.0184, + "step": 993 + }, + { + "epoch": 0.1313208045711266, + "grad_norm": 0.34222137928009033, + "learning_rate": 0.000191753495773659, + "loss": 0.0295, + "step": 994 + }, + { + "epoch": 0.13145291805661063, + "grad_norm": 0.3267710208892822, + "learning_rate": 0.00019173696092253487, + "loss": 0.0307, + "step": 995 + }, + { + "epoch": 0.13158503154209467, + "grad_norm": 0.22792892158031464, + "learning_rate": 0.00019172041022556596, + "loss": 0.03, + "step": 996 + }, + { + "epoch": 0.1317171450275787, + "grad_norm": 0.20261724293231964, + "learning_rate": 0.0001917038436856111, + "loss": 0.027, + "step": 997 + }, + { + "epoch": 0.13184925851306273, + "grad_norm": 0.23422667384147644, + "learning_rate": 0.0001916872613055319, + "loss": 0.0249, + "step": 998 + }, + { + "epoch": 0.13198137199854676, + "grad_norm": 0.26018065214157104, + "learning_rate": 0.0001916706630881926, + "loss": 0.0335, + "step": 999 + }, + { + "epoch": 0.1321134854840308, + "grad_norm": 0.36841535568237305, + "learning_rate": 0.00019165404903646023, + "loss": 0.0239, + "step": 1000 + }, + { + "epoch": 0.13224559896951482, + "grad_norm": 0.1476823389530182, + "learning_rate": 0.00019163741915320456, + "loss": 0.013, + "step": 1001 + }, + { + "epoch": 0.13237771245499885, + "grad_norm": 0.2931428849697113, + "learning_rate": 0.0001916207734412981, + "loss": 0.0263, + "step": 1002 + }, + { + "epoch": 0.13250982594048288, + "grad_norm": 0.2295461893081665, + "learning_rate": 0.00019160411190361612, + "loss": 0.0243, + "step": 1003 + }, + { + "epoch": 0.13264193942596691, + "grad_norm": 0.22297543287277222, + "learning_rate": 0.00019158743454303654, + "loss": 0.0372, + "step": 1004 + }, + { + "epoch": 0.13277405291145095, + "grad_norm": 0.2877453863620758, + "learning_rate": 0.0001915707413624401, + "loss": 0.0319, + "step": 1005 + }, + { + "epoch": 0.13290616639693498, + "grad_norm": 0.21408823132514954, + "learning_rate": 0.00019155403236471017, + "loss": 0.0229, + "step": 1006 + }, + { + "epoch": 0.133038279882419, + "grad_norm": 0.29204505681991577, + "learning_rate": 0.00019153730755273296, + "loss": 0.0408, + "step": 1007 + }, + { + "epoch": 0.13317039336790304, + "grad_norm": 0.20328344404697418, + "learning_rate": 0.0001915205669293974, + "loss": 0.0249, + "step": 1008 + }, + { + "epoch": 0.13330250685338707, + "grad_norm": 0.2546907663345337, + "learning_rate": 0.00019150381049759508, + "loss": 0.0328, + "step": 1009 + }, + { + "epoch": 0.1334346203388711, + "grad_norm": 0.19306686520576477, + "learning_rate": 0.00019148703826022035, + "loss": 0.0287, + "step": 1010 + }, + { + "epoch": 0.13356673382435513, + "grad_norm": 0.6536433100700378, + "learning_rate": 0.0001914702502201703, + "loss": 0.0398, + "step": 1011 + }, + { + "epoch": 0.13369884730983916, + "grad_norm": 0.2630719542503357, + "learning_rate": 0.00019145344638034484, + "loss": 0.0271, + "step": 1012 + }, + { + "epoch": 0.1338309607953232, + "grad_norm": 0.2874428331851959, + "learning_rate": 0.0001914366267436464, + "loss": 0.046, + "step": 1013 + }, + { + "epoch": 0.13396307428080723, + "grad_norm": 0.25163084268569946, + "learning_rate": 0.0001914197913129803, + "loss": 0.0335, + "step": 1014 + }, + { + "epoch": 0.13409518776629126, + "grad_norm": 0.303242564201355, + "learning_rate": 0.00019140294009125457, + "loss": 0.0281, + "step": 1015 + }, + { + "epoch": 0.1342273012517753, + "grad_norm": 0.2756708264350891, + "learning_rate": 0.0001913860730813799, + "loss": 0.0503, + "step": 1016 + }, + { + "epoch": 0.13435941473725932, + "grad_norm": 0.3068932592868805, + "learning_rate": 0.0001913691902862698, + "loss": 0.0236, + "step": 1017 + }, + { + "epoch": 0.13449152822274332, + "grad_norm": 0.21432353556156158, + "learning_rate": 0.00019135229170884043, + "loss": 0.0367, + "step": 1018 + }, + { + "epoch": 0.13462364170822735, + "grad_norm": 0.39591842889785767, + "learning_rate": 0.0001913353773520107, + "loss": 0.0399, + "step": 1019 + }, + { + "epoch": 0.13475575519371139, + "grad_norm": 0.2623952329158783, + "learning_rate": 0.00019131844721870226, + "loss": 0.0285, + "step": 1020 + }, + { + "epoch": 0.13488786867919542, + "grad_norm": 0.4142909646034241, + "learning_rate": 0.00019130150131183946, + "loss": 0.0419, + "step": 1021 + }, + { + "epoch": 0.13501998216467945, + "grad_norm": 0.20737366378307343, + "learning_rate": 0.0001912845396343494, + "loss": 0.0293, + "step": 1022 + }, + { + "epoch": 0.13515209565016348, + "grad_norm": 0.22168460488319397, + "learning_rate": 0.00019126756218916188, + "loss": 0.0266, + "step": 1023 + }, + { + "epoch": 0.1352842091356475, + "grad_norm": 0.21502786874771118, + "learning_rate": 0.00019125056897920946, + "loss": 0.0326, + "step": 1024 + }, + { + "epoch": 0.13541632262113154, + "grad_norm": 0.17693819105625153, + "learning_rate": 0.0001912335600074274, + "loss": 0.0177, + "step": 1025 + }, + { + "epoch": 0.13554843610661557, + "grad_norm": 0.1689392477273941, + "learning_rate": 0.00019121653527675366, + "loss": 0.0201, + "step": 1026 + }, + { + "epoch": 0.1356805495920996, + "grad_norm": 0.3991635739803314, + "learning_rate": 0.00019119949479012892, + "loss": 0.0385, + "step": 1027 + }, + { + "epoch": 0.13581266307758363, + "grad_norm": 0.20152747631072998, + "learning_rate": 0.00019118243855049667, + "loss": 0.0407, + "step": 1028 + }, + { + "epoch": 0.13594477656306767, + "grad_norm": 0.16736957430839539, + "learning_rate": 0.00019116536656080298, + "loss": 0.0222, + "step": 1029 + }, + { + "epoch": 0.1360768900485517, + "grad_norm": 0.2872177064418793, + "learning_rate": 0.00019114827882399683, + "loss": 0.041, + "step": 1030 + }, + { + "epoch": 0.13620900353403573, + "grad_norm": 0.22250396013259888, + "learning_rate": 0.00019113117534302968, + "loss": 0.0381, + "step": 1031 + }, + { + "epoch": 0.13634111701951976, + "grad_norm": 0.24812956154346466, + "learning_rate": 0.00019111405612085594, + "loss": 0.0329, + "step": 1032 + }, + { + "epoch": 0.1364732305050038, + "grad_norm": 0.18522006273269653, + "learning_rate": 0.00019109692116043255, + "loss": 0.0203, + "step": 1033 + }, + { + "epoch": 0.13660534399048782, + "grad_norm": 0.17845605313777924, + "learning_rate": 0.0001910797704647193, + "loss": 0.0202, + "step": 1034 + }, + { + "epoch": 0.13673745747597185, + "grad_norm": 0.20754919946193695, + "learning_rate": 0.00019106260403667865, + "loss": 0.0295, + "step": 1035 + }, + { + "epoch": 0.13686957096145588, + "grad_norm": 0.2204241305589676, + "learning_rate": 0.00019104542187927577, + "loss": 0.0246, + "step": 1036 + }, + { + "epoch": 0.13700168444693991, + "grad_norm": 0.2664002776145935, + "learning_rate": 0.0001910282239954786, + "loss": 0.0258, + "step": 1037 + }, + { + "epoch": 0.13713379793242395, + "grad_norm": 0.26305004954338074, + "learning_rate": 0.00019101101038825766, + "loss": 0.0236, + "step": 1038 + }, + { + "epoch": 0.13726591141790798, + "grad_norm": 0.1791495531797409, + "learning_rate": 0.00019099378106058636, + "loss": 0.0229, + "step": 1039 + }, + { + "epoch": 0.137398024903392, + "grad_norm": 0.2402389943599701, + "learning_rate": 0.00019097653601544073, + "loss": 0.0237, + "step": 1040 + }, + { + "epoch": 0.13753013838887604, + "grad_norm": 0.24972432851791382, + "learning_rate": 0.00019095927525579948, + "loss": 0.0289, + "step": 1041 + }, + { + "epoch": 0.13766225187436007, + "grad_norm": 0.18227741122245789, + "learning_rate": 0.00019094199878464413, + "loss": 0.0183, + "step": 1042 + }, + { + "epoch": 0.1377943653598441, + "grad_norm": 0.2381259799003601, + "learning_rate": 0.00019092470660495887, + "loss": 0.0327, + "step": 1043 + }, + { + "epoch": 0.13792647884532813, + "grad_norm": 0.2130383551120758, + "learning_rate": 0.0001909073987197306, + "loss": 0.0251, + "step": 1044 + }, + { + "epoch": 0.13805859233081216, + "grad_norm": 0.2830989360809326, + "learning_rate": 0.0001908900751319489, + "loss": 0.0377, + "step": 1045 + }, + { + "epoch": 0.1381907058162962, + "grad_norm": 0.2945898175239563, + "learning_rate": 0.0001908727358446061, + "loss": 0.0294, + "step": 1046 + }, + { + "epoch": 0.13832281930178023, + "grad_norm": 0.23683592677116394, + "learning_rate": 0.00019085538086069728, + "loss": 0.0188, + "step": 1047 + }, + { + "epoch": 0.13845493278726426, + "grad_norm": 0.24602121114730835, + "learning_rate": 0.0001908380101832202, + "loss": 0.0376, + "step": 1048 + }, + { + "epoch": 0.1385870462727483, + "grad_norm": 0.4324703514575958, + "learning_rate": 0.00019082062381517524, + "loss": 0.0461, + "step": 1049 + }, + { + "epoch": 0.13871915975823232, + "grad_norm": 0.28953877091407776, + "learning_rate": 0.00019080322175956562, + "loss": 0.036, + "step": 1050 + }, + { + "epoch": 0.13885127324371635, + "grad_norm": 0.1999170482158661, + "learning_rate": 0.0001907858040193972, + "loss": 0.0326, + "step": 1051 + }, + { + "epoch": 0.13898338672920038, + "grad_norm": 0.15232303738594055, + "learning_rate": 0.0001907683705976786, + "loss": 0.0173, + "step": 1052 + }, + { + "epoch": 0.1391155002146844, + "grad_norm": 0.3097558319568634, + "learning_rate": 0.00019075092149742112, + "loss": 0.0311, + "step": 1053 + }, + { + "epoch": 0.13924761370016844, + "grad_norm": 0.1991742104291916, + "learning_rate": 0.0001907334567216387, + "loss": 0.0302, + "step": 1054 + }, + { + "epoch": 0.13937972718565247, + "grad_norm": 0.19482475519180298, + "learning_rate": 0.00019071597627334815, + "loss": 0.031, + "step": 1055 + }, + { + "epoch": 0.1395118406711365, + "grad_norm": 0.18483558297157288, + "learning_rate": 0.00019069848015556878, + "loss": 0.0259, + "step": 1056 + }, + { + "epoch": 0.13964395415662054, + "grad_norm": 0.3096649944782257, + "learning_rate": 0.00019068096837132284, + "loss": 0.0278, + "step": 1057 + }, + { + "epoch": 0.13977606764210457, + "grad_norm": 0.3025237023830414, + "learning_rate": 0.00019066344092363507, + "loss": 0.0363, + "step": 1058 + }, + { + "epoch": 0.1399081811275886, + "grad_norm": 0.2409946471452713, + "learning_rate": 0.00019064589781553305, + "loss": 0.0188, + "step": 1059 + }, + { + "epoch": 0.14004029461307263, + "grad_norm": 0.20228618383407593, + "learning_rate": 0.00019062833905004697, + "loss": 0.0317, + "step": 1060 + }, + { + "epoch": 0.14017240809855666, + "grad_norm": 0.17695695161819458, + "learning_rate": 0.00019061076463020986, + "loss": 0.0269, + "step": 1061 + }, + { + "epoch": 0.1403045215840407, + "grad_norm": 0.3224967420101166, + "learning_rate": 0.0001905931745590573, + "loss": 0.0245, + "step": 1062 + }, + { + "epoch": 0.14043663506952472, + "grad_norm": 0.130575031042099, + "learning_rate": 0.00019057556883962776, + "loss": 0.0163, + "step": 1063 + }, + { + "epoch": 0.14056874855500875, + "grad_norm": 0.272037535905838, + "learning_rate": 0.00019055794747496215, + "loss": 0.0361, + "step": 1064 + }, + { + "epoch": 0.14070086204049279, + "grad_norm": 0.29713672399520874, + "learning_rate": 0.00019054031046810433, + "loss": 0.041, + "step": 1065 + }, + { + "epoch": 0.14083297552597682, + "grad_norm": 0.2734525501728058, + "learning_rate": 0.0001905226578221007, + "loss": 0.0179, + "step": 1066 + }, + { + "epoch": 0.14096508901146085, + "grad_norm": 0.32613930106163025, + "learning_rate": 0.00019050498954000048, + "loss": 0.0436, + "step": 1067 + }, + { + "epoch": 0.14109720249694488, + "grad_norm": 0.168988898396492, + "learning_rate": 0.00019048730562485554, + "loss": 0.0198, + "step": 1068 + }, + { + "epoch": 0.1412293159824289, + "grad_norm": 0.21649277210235596, + "learning_rate": 0.00019046960607972037, + "loss": 0.0283, + "step": 1069 + }, + { + "epoch": 0.14136142946791294, + "grad_norm": 0.19166362285614014, + "learning_rate": 0.00019045189090765232, + "loss": 0.0256, + "step": 1070 + }, + { + "epoch": 0.14149354295339697, + "grad_norm": 0.23602889478206635, + "learning_rate": 0.0001904341601117113, + "loss": 0.0366, + "step": 1071 + }, + { + "epoch": 0.141625656438881, + "grad_norm": 0.2654229700565338, + "learning_rate": 0.00019041641369496, + "loss": 0.0415, + "step": 1072 + }, + { + "epoch": 0.14175776992436503, + "grad_norm": 0.2024417668581009, + "learning_rate": 0.00019039865166046378, + "loss": 0.0247, + "step": 1073 + }, + { + "epoch": 0.14188988340984907, + "grad_norm": 0.34355026483535767, + "learning_rate": 0.00019038087401129067, + "loss": 0.036, + "step": 1074 + }, + { + "epoch": 0.1420219968953331, + "grad_norm": 0.1743433028459549, + "learning_rate": 0.00019036308075051148, + "loss": 0.033, + "step": 1075 + }, + { + "epoch": 0.14215411038081713, + "grad_norm": 0.23234279453754425, + "learning_rate": 0.00019034527188119962, + "loss": 0.0418, + "step": 1076 + }, + { + "epoch": 0.14228622386630116, + "grad_norm": 0.1761244535446167, + "learning_rate": 0.00019032744740643125, + "loss": 0.0185, + "step": 1077 + }, + { + "epoch": 0.1424183373517852, + "grad_norm": 0.18785180151462555, + "learning_rate": 0.00019030960732928522, + "loss": 0.0242, + "step": 1078 + }, + { + "epoch": 0.14255045083726922, + "grad_norm": 0.23817138373851776, + "learning_rate": 0.0001902917516528431, + "loss": 0.0253, + "step": 1079 + }, + { + "epoch": 0.14268256432275325, + "grad_norm": 0.20377707481384277, + "learning_rate": 0.00019027388038018902, + "loss": 0.0314, + "step": 1080 + }, + { + "epoch": 0.14281467780823728, + "grad_norm": 0.21786224842071533, + "learning_rate": 0.00019025599351441002, + "loss": 0.0321, + "step": 1081 + }, + { + "epoch": 0.14294679129372131, + "grad_norm": 0.2601945400238037, + "learning_rate": 0.00019023809105859569, + "loss": 0.0333, + "step": 1082 + }, + { + "epoch": 0.14307890477920535, + "grad_norm": 0.2133285105228424, + "learning_rate": 0.00019022017301583834, + "loss": 0.0418, + "step": 1083 + }, + { + "epoch": 0.14321101826468938, + "grad_norm": 0.16249844431877136, + "learning_rate": 0.00019020223938923296, + "loss": 0.0197, + "step": 1084 + }, + { + "epoch": 0.1433431317501734, + "grad_norm": 0.21031540632247925, + "learning_rate": 0.00019018429018187723, + "loss": 0.0285, + "step": 1085 + }, + { + "epoch": 0.14347524523565744, + "grad_norm": 0.23109595477581024, + "learning_rate": 0.00019016632539687163, + "loss": 0.0222, + "step": 1086 + }, + { + "epoch": 0.14360735872114147, + "grad_norm": 0.2368910163640976, + "learning_rate": 0.00019014834503731915, + "loss": 0.0315, + "step": 1087 + }, + { + "epoch": 0.1437394722066255, + "grad_norm": 0.3795262277126312, + "learning_rate": 0.00019013034910632558, + "loss": 0.039, + "step": 1088 + }, + { + "epoch": 0.14387158569210953, + "grad_norm": 0.20181721448898315, + "learning_rate": 0.00019011233760699942, + "loss": 0.0228, + "step": 1089 + }, + { + "epoch": 0.14400369917759356, + "grad_norm": 0.26354295015335083, + "learning_rate": 0.00019009431054245178, + "loss": 0.0298, + "step": 1090 + }, + { + "epoch": 0.1441358126630776, + "grad_norm": 0.3916509449481964, + "learning_rate": 0.00019007626791579652, + "loss": 0.0325, + "step": 1091 + }, + { + "epoch": 0.14426792614856163, + "grad_norm": 0.29506227374076843, + "learning_rate": 0.00019005820973015016, + "loss": 0.0298, + "step": 1092 + }, + { + "epoch": 0.14440003963404566, + "grad_norm": 0.35943931341171265, + "learning_rate": 0.0001900401359886319, + "loss": 0.033, + "step": 1093 + }, + { + "epoch": 0.1445321531195297, + "grad_norm": 0.19208261370658875, + "learning_rate": 0.00019002204669436369, + "loss": 0.0207, + "step": 1094 + }, + { + "epoch": 0.14466426660501372, + "grad_norm": 0.22459660470485687, + "learning_rate": 0.00019000394185047004, + "loss": 0.0358, + "step": 1095 + }, + { + "epoch": 0.14479638009049775, + "grad_norm": 0.27884823083877563, + "learning_rate": 0.00018998582146007825, + "loss": 0.0403, + "step": 1096 + }, + { + "epoch": 0.14492849357598178, + "grad_norm": 0.45688390731811523, + "learning_rate": 0.0001899676855263183, + "loss": 0.0349, + "step": 1097 + }, + { + "epoch": 0.1450606070614658, + "grad_norm": 0.28939759731292725, + "learning_rate": 0.00018994953405232287, + "loss": 0.0439, + "step": 1098 + }, + { + "epoch": 0.14519272054694982, + "grad_norm": 0.3789207339286804, + "learning_rate": 0.0001899313670412272, + "loss": 0.0408, + "step": 1099 + }, + { + "epoch": 0.14532483403243385, + "grad_norm": 0.2404397577047348, + "learning_rate": 0.00018991318449616937, + "loss": 0.0255, + "step": 1100 + }, + { + "epoch": 0.14545694751791788, + "grad_norm": 0.2406584769487381, + "learning_rate": 0.00018989498642029004, + "loss": 0.0301, + "step": 1101 + }, + { + "epoch": 0.1455890610034019, + "grad_norm": 0.20736750960350037, + "learning_rate": 0.0001898767728167326, + "loss": 0.0161, + "step": 1102 + }, + { + "epoch": 0.14572117448888594, + "grad_norm": 0.24288155138492584, + "learning_rate": 0.0001898585436886431, + "loss": 0.0239, + "step": 1103 + }, + { + "epoch": 0.14585328797436997, + "grad_norm": 0.2473408728837967, + "learning_rate": 0.00018984029903917026, + "loss": 0.0272, + "step": 1104 + }, + { + "epoch": 0.145985401459854, + "grad_norm": 0.22904950380325317, + "learning_rate": 0.00018982203887146556, + "loss": 0.031, + "step": 1105 + }, + { + "epoch": 0.14611751494533803, + "grad_norm": 0.23573671281337738, + "learning_rate": 0.0001898037631886831, + "loss": 0.0296, + "step": 1106 + }, + { + "epoch": 0.14624962843082207, + "grad_norm": 0.2947499752044678, + "learning_rate": 0.00018978547199397959, + "loss": 0.0485, + "step": 1107 + }, + { + "epoch": 0.1463817419163061, + "grad_norm": 0.2720722556114197, + "learning_rate": 0.00018976716529051454, + "loss": 0.0326, + "step": 1108 + }, + { + "epoch": 0.14651385540179013, + "grad_norm": 0.20816238224506378, + "learning_rate": 0.0001897488430814501, + "loss": 0.0231, + "step": 1109 + }, + { + "epoch": 0.14664596888727416, + "grad_norm": 0.291048526763916, + "learning_rate": 0.0001897305053699511, + "loss": 0.0399, + "step": 1110 + }, + { + "epoch": 0.1467780823727582, + "grad_norm": 0.15667644143104553, + "learning_rate": 0.00018971215215918497, + "loss": 0.0122, + "step": 1111 + }, + { + "epoch": 0.14691019585824222, + "grad_norm": 0.23087377846240997, + "learning_rate": 0.00018969378345232193, + "loss": 0.0273, + "step": 1112 + }, + { + "epoch": 0.14704230934372625, + "grad_norm": 0.2081710398197174, + "learning_rate": 0.00018967539925253486, + "loss": 0.0376, + "step": 1113 + }, + { + "epoch": 0.14717442282921028, + "grad_norm": 0.2385694682598114, + "learning_rate": 0.00018965699956299923, + "loss": 0.0333, + "step": 1114 + }, + { + "epoch": 0.14730653631469431, + "grad_norm": 0.25243672728538513, + "learning_rate": 0.00018963858438689326, + "loss": 0.0304, + "step": 1115 + }, + { + "epoch": 0.14743864980017835, + "grad_norm": 0.1876525729894638, + "learning_rate": 0.00018962015372739788, + "loss": 0.0178, + "step": 1116 + }, + { + "epoch": 0.14757076328566238, + "grad_norm": 0.20586282014846802, + "learning_rate": 0.00018960170758769654, + "loss": 0.0342, + "step": 1117 + }, + { + "epoch": 0.1477028767711464, + "grad_norm": 0.17124402523040771, + "learning_rate": 0.00018958324597097555, + "loss": 0.0182, + "step": 1118 + }, + { + "epoch": 0.14783499025663044, + "grad_norm": 0.165056049823761, + "learning_rate": 0.00018956476888042377, + "loss": 0.0146, + "step": 1119 + }, + { + "epoch": 0.14796710374211447, + "grad_norm": 0.24566829204559326, + "learning_rate": 0.00018954627631923279, + "loss": 0.0296, + "step": 1120 + }, + { + "epoch": 0.1480992172275985, + "grad_norm": 0.2505076825618744, + "learning_rate": 0.00018952776829059685, + "loss": 0.033, + "step": 1121 + }, + { + "epoch": 0.14823133071308253, + "grad_norm": 0.22813905775547028, + "learning_rate": 0.00018950924479771287, + "loss": 0.0262, + "step": 1122 + }, + { + "epoch": 0.14836344419856656, + "grad_norm": 0.18311943113803864, + "learning_rate": 0.0001894907058437804, + "loss": 0.0256, + "step": 1123 + }, + { + "epoch": 0.1484955576840506, + "grad_norm": 0.2090534269809723, + "learning_rate": 0.00018947215143200175, + "loss": 0.0286, + "step": 1124 + }, + { + "epoch": 0.14862767116953463, + "grad_norm": 0.5938106775283813, + "learning_rate": 0.00018945358156558184, + "loss": 0.0517, + "step": 1125 + }, + { + "epoch": 0.14875978465501866, + "grad_norm": 0.371894896030426, + "learning_rate": 0.0001894349962477282, + "loss": 0.0224, + "step": 1126 + }, + { + "epoch": 0.1488918981405027, + "grad_norm": 0.22545740008354187, + "learning_rate": 0.0001894163954816512, + "loss": 0.032, + "step": 1127 + }, + { + "epoch": 0.14902401162598672, + "grad_norm": 0.23922865092754364, + "learning_rate": 0.00018939777927056372, + "loss": 0.0317, + "step": 1128 + }, + { + "epoch": 0.14915612511147075, + "grad_norm": 0.3410808742046356, + "learning_rate": 0.00018937914761768133, + "loss": 0.0278, + "step": 1129 + }, + { + "epoch": 0.14928823859695478, + "grad_norm": 0.2365533411502838, + "learning_rate": 0.00018936050052622237, + "loss": 0.0323, + "step": 1130 + }, + { + "epoch": 0.1494203520824388, + "grad_norm": 0.26630455255508423, + "learning_rate": 0.00018934183799940773, + "loss": 0.0247, + "step": 1131 + }, + { + "epoch": 0.14955246556792284, + "grad_norm": 0.2577216625213623, + "learning_rate": 0.00018932316004046103, + "loss": 0.0396, + "step": 1132 + }, + { + "epoch": 0.14968457905340687, + "grad_norm": 0.20923064649105072, + "learning_rate": 0.00018930446665260854, + "loss": 0.0277, + "step": 1133 + }, + { + "epoch": 0.1498166925388909, + "grad_norm": 0.24694538116455078, + "learning_rate": 0.00018928575783907914, + "loss": 0.0364, + "step": 1134 + }, + { + "epoch": 0.14994880602437494, + "grad_norm": 0.28028610348701477, + "learning_rate": 0.00018926703360310453, + "loss": 0.0367, + "step": 1135 + }, + { + "epoch": 0.15008091950985897, + "grad_norm": 0.4478507936000824, + "learning_rate": 0.00018924829394791886, + "loss": 0.0308, + "step": 1136 + }, + { + "epoch": 0.150213032995343, + "grad_norm": 0.3255590796470642, + "learning_rate": 0.00018922953887675915, + "loss": 0.0404, + "step": 1137 + }, + { + "epoch": 0.15034514648082703, + "grad_norm": 0.18644562363624573, + "learning_rate": 0.00018921076839286495, + "loss": 0.0197, + "step": 1138 + }, + { + "epoch": 0.15047725996631106, + "grad_norm": 0.2694709897041321, + "learning_rate": 0.00018919198249947846, + "loss": 0.015, + "step": 1139 + }, + { + "epoch": 0.1506093734517951, + "grad_norm": 0.23030641674995422, + "learning_rate": 0.00018917318119984468, + "loss": 0.0326, + "step": 1140 + }, + { + "epoch": 0.15074148693727912, + "grad_norm": 0.2648636996746063, + "learning_rate": 0.00018915436449721117, + "loss": 0.0452, + "step": 1141 + }, + { + "epoch": 0.15087360042276315, + "grad_norm": 0.28663501143455505, + "learning_rate": 0.0001891355323948281, + "loss": 0.0368, + "step": 1142 + }, + { + "epoch": 0.15100571390824719, + "grad_norm": 0.21681685745716095, + "learning_rate": 0.00018911668489594838, + "loss": 0.0222, + "step": 1143 + }, + { + "epoch": 0.15113782739373122, + "grad_norm": 0.3494594693183899, + "learning_rate": 0.00018909782200382763, + "loss": 0.0212, + "step": 1144 + }, + { + "epoch": 0.15126994087921525, + "grad_norm": 0.32084718346595764, + "learning_rate": 0.000189078943721724, + "loss": 0.0402, + "step": 1145 + }, + { + "epoch": 0.15140205436469928, + "grad_norm": 0.23676195740699768, + "learning_rate": 0.00018906005005289836, + "loss": 0.0324, + "step": 1146 + }, + { + "epoch": 0.1515341678501833, + "grad_norm": 0.16447904706001282, + "learning_rate": 0.00018904114100061424, + "loss": 0.0174, + "step": 1147 + }, + { + "epoch": 0.15166628133566734, + "grad_norm": 0.24322609603405, + "learning_rate": 0.0001890222165681379, + "loss": 0.0327, + "step": 1148 + }, + { + "epoch": 0.15179839482115137, + "grad_norm": 0.21443983912467957, + "learning_rate": 0.00018900327675873806, + "loss": 0.035, + "step": 1149 + }, + { + "epoch": 0.1519305083066354, + "grad_norm": 0.1838637739419937, + "learning_rate": 0.0001889843215756863, + "loss": 0.0267, + "step": 1150 + }, + { + "epoch": 0.15206262179211943, + "grad_norm": 0.2316933572292328, + "learning_rate": 0.00018896535102225673, + "loss": 0.0266, + "step": 1151 + }, + { + "epoch": 0.15219473527760347, + "grad_norm": 0.25021079182624817, + "learning_rate": 0.0001889463651017262, + "loss": 0.0207, + "step": 1152 + }, + { + "epoch": 0.1523268487630875, + "grad_norm": 0.28480058908462524, + "learning_rate": 0.00018892736381737418, + "loss": 0.0331, + "step": 1153 + }, + { + "epoch": 0.15245896224857153, + "grad_norm": 0.2881741523742676, + "learning_rate": 0.0001889083471724827, + "loss": 0.0202, + "step": 1154 + }, + { + "epoch": 0.15259107573405556, + "grad_norm": 0.31690138578414917, + "learning_rate": 0.00018888931517033663, + "loss": 0.0391, + "step": 1155 + }, + { + "epoch": 0.1527231892195396, + "grad_norm": 0.1786164790391922, + "learning_rate": 0.00018887026781422338, + "loss": 0.0304, + "step": 1156 + }, + { + "epoch": 0.15285530270502362, + "grad_norm": 0.15002429485321045, + "learning_rate": 0.00018885120510743296, + "loss": 0.0163, + "step": 1157 + }, + { + "epoch": 0.15298741619050765, + "grad_norm": 0.1841292530298233, + "learning_rate": 0.00018883212705325813, + "loss": 0.0276, + "step": 1158 + }, + { + "epoch": 0.15311952967599168, + "grad_norm": 0.3281814157962799, + "learning_rate": 0.00018881303365499426, + "loss": 0.0307, + "step": 1159 + }, + { + "epoch": 0.15325164316147571, + "grad_norm": 0.30686184763908386, + "learning_rate": 0.00018879392491593942, + "loss": 0.0349, + "step": 1160 + }, + { + "epoch": 0.15338375664695975, + "grad_norm": 0.19830575585365295, + "learning_rate": 0.00018877480083939424, + "loss": 0.0247, + "step": 1161 + }, + { + "epoch": 0.15351587013244378, + "grad_norm": 0.2402205914258957, + "learning_rate": 0.00018875566142866204, + "loss": 0.0235, + "step": 1162 + }, + { + "epoch": 0.1536479836179278, + "grad_norm": 0.29031792283058167, + "learning_rate": 0.00018873650668704882, + "loss": 0.035, + "step": 1163 + }, + { + "epoch": 0.15378009710341184, + "grad_norm": 0.24618452787399292, + "learning_rate": 0.00018871733661786325, + "loss": 0.0308, + "step": 1164 + }, + { + "epoch": 0.15391221058889587, + "grad_norm": 0.2671643793582916, + "learning_rate": 0.0001886981512244165, + "loss": 0.0331, + "step": 1165 + }, + { + "epoch": 0.1540443240743799, + "grad_norm": 0.2721196711063385, + "learning_rate": 0.00018867895051002256, + "loss": 0.037, + "step": 1166 + }, + { + "epoch": 0.15417643755986393, + "grad_norm": 0.32829850912094116, + "learning_rate": 0.000188659734477998, + "loss": 0.0387, + "step": 1167 + }, + { + "epoch": 0.15430855104534796, + "grad_norm": 0.14274796843528748, + "learning_rate": 0.00018864050313166194, + "loss": 0.0235, + "step": 1168 + }, + { + "epoch": 0.154440664530832, + "grad_norm": 0.25783032178878784, + "learning_rate": 0.00018862125647433635, + "loss": 0.03, + "step": 1169 + }, + { + "epoch": 0.15457277801631603, + "grad_norm": 0.1943584531545639, + "learning_rate": 0.00018860199450934566, + "loss": 0.035, + "step": 1170 + }, + { + "epoch": 0.15470489150180006, + "grad_norm": 0.2702155113220215, + "learning_rate": 0.00018858271724001707, + "loss": 0.0349, + "step": 1171 + }, + { + "epoch": 0.1548370049872841, + "grad_norm": 0.34101739525794983, + "learning_rate": 0.00018856342466968027, + "loss": 0.0453, + "step": 1172 + }, + { + "epoch": 0.15496911847276812, + "grad_norm": 0.23040156066417694, + "learning_rate": 0.0001885441168016678, + "loss": 0.0289, + "step": 1173 + }, + { + "epoch": 0.15510123195825215, + "grad_norm": 0.26681602001190186, + "learning_rate": 0.00018852479363931467, + "loss": 0.0434, + "step": 1174 + }, + { + "epoch": 0.15523334544373618, + "grad_norm": 0.1774115115404129, + "learning_rate": 0.00018850545518595859, + "loss": 0.0255, + "step": 1175 + }, + { + "epoch": 0.1553654589292202, + "grad_norm": 0.24246004223823547, + "learning_rate": 0.00018848610144493996, + "loss": 0.0279, + "step": 1176 + }, + { + "epoch": 0.15549757241470424, + "grad_norm": 0.365506112575531, + "learning_rate": 0.00018846673241960176, + "loss": 0.043, + "step": 1177 + }, + { + "epoch": 0.15562968590018827, + "grad_norm": 0.3292416036128998, + "learning_rate": 0.0001884473481132896, + "loss": 0.0359, + "step": 1178 + }, + { + "epoch": 0.1557617993856723, + "grad_norm": 0.28295496106147766, + "learning_rate": 0.00018842794852935181, + "loss": 0.0494, + "step": 1179 + }, + { + "epoch": 0.15589391287115634, + "grad_norm": 0.19957217574119568, + "learning_rate": 0.00018840853367113925, + "loss": 0.0199, + "step": 1180 + }, + { + "epoch": 0.15602602635664034, + "grad_norm": 0.251113623380661, + "learning_rate": 0.0001883891035420055, + "loss": 0.0367, + "step": 1181 + }, + { + "epoch": 0.15615813984212437, + "grad_norm": 0.31150946021080017, + "learning_rate": 0.00018836965814530675, + "loss": 0.0312, + "step": 1182 + }, + { + "epoch": 0.1562902533276084, + "grad_norm": 0.20670191943645477, + "learning_rate": 0.00018835019748440185, + "loss": 0.0236, + "step": 1183 + }, + { + "epoch": 0.15642236681309243, + "grad_norm": 0.24962063133716583, + "learning_rate": 0.0001883307215626522, + "loss": 0.037, + "step": 1184 + }, + { + "epoch": 0.15655448029857646, + "grad_norm": 0.24450640380382538, + "learning_rate": 0.00018831123038342195, + "loss": 0.0317, + "step": 1185 + }, + { + "epoch": 0.1566865937840605, + "grad_norm": 0.1726607382297516, + "learning_rate": 0.00018829172395007783, + "loss": 0.0251, + "step": 1186 + }, + { + "epoch": 0.15681870726954453, + "grad_norm": 0.22054699063301086, + "learning_rate": 0.00018827220226598924, + "loss": 0.0312, + "step": 1187 + }, + { + "epoch": 0.15695082075502856, + "grad_norm": 0.1949966996908188, + "learning_rate": 0.00018825266533452814, + "loss": 0.0213, + "step": 1188 + }, + { + "epoch": 0.1570829342405126, + "grad_norm": 0.27526164054870605, + "learning_rate": 0.0001882331131590692, + "loss": 0.0274, + "step": 1189 + }, + { + "epoch": 0.15721504772599662, + "grad_norm": 0.25454092025756836, + "learning_rate": 0.00018821354574298965, + "loss": 0.0216, + "step": 1190 + }, + { + "epoch": 0.15734716121148065, + "grad_norm": 0.18084175884723663, + "learning_rate": 0.00018819396308966945, + "loss": 0.02, + "step": 1191 + }, + { + "epoch": 0.15747927469696468, + "grad_norm": 0.19691844284534454, + "learning_rate": 0.00018817436520249112, + "loss": 0.0194, + "step": 1192 + }, + { + "epoch": 0.15761138818244871, + "grad_norm": 0.26010459661483765, + "learning_rate": 0.0001881547520848398, + "loss": 0.0266, + "step": 1193 + }, + { + "epoch": 0.15774350166793275, + "grad_norm": 0.17826038599014282, + "learning_rate": 0.00018813512374010332, + "loss": 0.0216, + "step": 1194 + }, + { + "epoch": 0.15787561515341678, + "grad_norm": 0.34981799125671387, + "learning_rate": 0.0001881154801716721, + "loss": 0.0398, + "step": 1195 + }, + { + "epoch": 0.1580077286389008, + "grad_norm": 0.23868107795715332, + "learning_rate": 0.00018809582138293922, + "loss": 0.034, + "step": 1196 + }, + { + "epoch": 0.15813984212438484, + "grad_norm": 0.23763948678970337, + "learning_rate": 0.00018807614737730033, + "loss": 0.0251, + "step": 1197 + }, + { + "epoch": 0.15827195560986887, + "grad_norm": 0.25793859362602234, + "learning_rate": 0.00018805645815815373, + "loss": 0.0305, + "step": 1198 + }, + { + "epoch": 0.1584040690953529, + "grad_norm": 0.1698640137910843, + "learning_rate": 0.00018803675372890046, + "loss": 0.0217, + "step": 1199 + }, + { + "epoch": 0.15853618258083693, + "grad_norm": 0.20204704999923706, + "learning_rate": 0.000188017034092944, + "loss": 0.021, + "step": 1200 + }, + { + "epoch": 0.15866829606632096, + "grad_norm": 0.22710919380187988, + "learning_rate": 0.00018799729925369056, + "loss": 0.0155, + "step": 1201 + }, + { + "epoch": 0.158800409551805, + "grad_norm": 0.4746352732181549, + "learning_rate": 0.000187977549214549, + "loss": 0.0268, + "step": 1202 + }, + { + "epoch": 0.15893252303728903, + "grad_norm": 0.3901737630367279, + "learning_rate": 0.00018795778397893078, + "loss": 0.0314, + "step": 1203 + }, + { + "epoch": 0.15906463652277306, + "grad_norm": 0.22661690413951874, + "learning_rate": 0.0001879380035502499, + "loss": 0.0375, + "step": 1204 + }, + { + "epoch": 0.1591967500082571, + "grad_norm": 0.25800031423568726, + "learning_rate": 0.00018791820793192314, + "loss": 0.0309, + "step": 1205 + }, + { + "epoch": 0.15932886349374112, + "grad_norm": 0.20021669566631317, + "learning_rate": 0.0001878983971273698, + "loss": 0.0191, + "step": 1206 + }, + { + "epoch": 0.15946097697922515, + "grad_norm": 0.2079436480998993, + "learning_rate": 0.00018787857114001177, + "loss": 0.018, + "step": 1207 + }, + { + "epoch": 0.15959309046470918, + "grad_norm": 0.22843880951404572, + "learning_rate": 0.0001878587299732737, + "loss": 0.0241, + "step": 1208 + }, + { + "epoch": 0.1597252039501932, + "grad_norm": 0.20756269991397858, + "learning_rate": 0.00018783887363058274, + "loss": 0.0337, + "step": 1209 + }, + { + "epoch": 0.15985731743567724, + "grad_norm": 0.2768935561180115, + "learning_rate": 0.0001878190021153687, + "loss": 0.0383, + "step": 1210 + }, + { + "epoch": 0.15998943092116127, + "grad_norm": 0.2520005702972412, + "learning_rate": 0.00018779911543106406, + "loss": 0.0373, + "step": 1211 + }, + { + "epoch": 0.1601215444066453, + "grad_norm": 0.2929930090904236, + "learning_rate": 0.0001877792135811038, + "loss": 0.0303, + "step": 1212 + }, + { + "epoch": 0.16025365789212934, + "grad_norm": 0.23010598123073578, + "learning_rate": 0.0001877592965689256, + "loss": 0.0313, + "step": 1213 + }, + { + "epoch": 0.16038577137761337, + "grad_norm": 0.21887648105621338, + "learning_rate": 0.0001877393643979698, + "loss": 0.0317, + "step": 1214 + }, + { + "epoch": 0.1605178848630974, + "grad_norm": 0.35093334317207336, + "learning_rate": 0.0001877194170716793, + "loss": 0.0377, + "step": 1215 + }, + { + "epoch": 0.16064999834858143, + "grad_norm": 0.20710135996341705, + "learning_rate": 0.00018769945459349964, + "loss": 0.034, + "step": 1216 + }, + { + "epoch": 0.16078211183406546, + "grad_norm": 0.1625404953956604, + "learning_rate": 0.00018767947696687887, + "loss": 0.0175, + "step": 1217 + }, + { + "epoch": 0.1609142253195495, + "grad_norm": 0.2056693285703659, + "learning_rate": 0.00018765948419526788, + "loss": 0.0203, + "step": 1218 + }, + { + "epoch": 0.16104633880503352, + "grad_norm": 0.15321901440620422, + "learning_rate": 0.00018763947628211994, + "loss": 0.0294, + "step": 1219 + }, + { + "epoch": 0.16117845229051755, + "grad_norm": 0.19081968069076538, + "learning_rate": 0.0001876194532308911, + "loss": 0.0153, + "step": 1220 + }, + { + "epoch": 0.16131056577600159, + "grad_norm": 0.19270607829093933, + "learning_rate": 0.00018759941504504002, + "loss": 0.0311, + "step": 1221 + }, + { + "epoch": 0.16144267926148562, + "grad_norm": 0.20099586248397827, + "learning_rate": 0.0001875793617280278, + "loss": 0.0359, + "step": 1222 + }, + { + "epoch": 0.16157479274696965, + "grad_norm": 0.20773924887180328, + "learning_rate": 0.00018755929328331835, + "loss": 0.0222, + "step": 1223 + }, + { + "epoch": 0.16170690623245368, + "grad_norm": 0.256315141916275, + "learning_rate": 0.00018753920971437813, + "loss": 0.0383, + "step": 1224 + }, + { + "epoch": 0.1618390197179377, + "grad_norm": 0.20451293885707855, + "learning_rate": 0.00018751911102467614, + "loss": 0.0281, + "step": 1225 + }, + { + "epoch": 0.16197113320342174, + "grad_norm": 0.20011462271213531, + "learning_rate": 0.0001874989972176841, + "loss": 0.0287, + "step": 1226 + }, + { + "epoch": 0.16210324668890577, + "grad_norm": 0.2945924997329712, + "learning_rate": 0.00018747886829687628, + "loss": 0.0389, + "step": 1227 + }, + { + "epoch": 0.1622353601743898, + "grad_norm": 0.1852794885635376, + "learning_rate": 0.00018745872426572958, + "loss": 0.0214, + "step": 1228 + }, + { + "epoch": 0.16236747365987383, + "grad_norm": 0.25254741311073303, + "learning_rate": 0.0001874385651277235, + "loss": 0.0278, + "step": 1229 + }, + { + "epoch": 0.16249958714535787, + "grad_norm": 0.22638756036758423, + "learning_rate": 0.00018741839088634018, + "loss": 0.0268, + "step": 1230 + }, + { + "epoch": 0.1626317006308419, + "grad_norm": 0.27086418867111206, + "learning_rate": 0.0001873982015450643, + "loss": 0.0394, + "step": 1231 + }, + { + "epoch": 0.16276381411632593, + "grad_norm": 0.1940222531557083, + "learning_rate": 0.00018737799710738325, + "loss": 0.0308, + "step": 1232 + }, + { + "epoch": 0.16289592760180996, + "grad_norm": 0.26498880982398987, + "learning_rate": 0.00018735777757678687, + "loss": 0.0164, + "step": 1233 + }, + { + "epoch": 0.163028041087294, + "grad_norm": 0.2891830503940582, + "learning_rate": 0.00018733754295676777, + "loss": 0.0367, + "step": 1234 + }, + { + "epoch": 0.16316015457277802, + "grad_norm": 0.32519957423210144, + "learning_rate": 0.00018731729325082114, + "loss": 0.0219, + "step": 1235 + }, + { + "epoch": 0.16329226805826205, + "grad_norm": 0.495128333568573, + "learning_rate": 0.00018729702846244467, + "loss": 0.03, + "step": 1236 + }, + { + "epoch": 0.16342438154374608, + "grad_norm": 0.16794808208942413, + "learning_rate": 0.00018727674859513879, + "loss": 0.0142, + "step": 1237 + }, + { + "epoch": 0.16355649502923011, + "grad_norm": 0.40994521975517273, + "learning_rate": 0.00018725645365240637, + "loss": 0.037, + "step": 1238 + }, + { + "epoch": 0.16368860851471415, + "grad_norm": 0.21588881313800812, + "learning_rate": 0.00018723614363775303, + "loss": 0.0215, + "step": 1239 + }, + { + "epoch": 0.16382072200019818, + "grad_norm": 0.21787334978580475, + "learning_rate": 0.00018721581855468702, + "loss": 0.0256, + "step": 1240 + }, + { + "epoch": 0.1639528354856822, + "grad_norm": 0.31683409214019775, + "learning_rate": 0.000187195478406719, + "loss": 0.0261, + "step": 1241 + }, + { + "epoch": 0.16408494897116624, + "grad_norm": 0.18830211460590363, + "learning_rate": 0.00018717512319736242, + "loss": 0.0153, + "step": 1242 + }, + { + "epoch": 0.16421706245665027, + "grad_norm": 0.231922909617424, + "learning_rate": 0.00018715475293013326, + "loss": 0.0333, + "step": 1243 + }, + { + "epoch": 0.1643491759421343, + "grad_norm": 0.28003785014152527, + "learning_rate": 0.00018713436760855006, + "loss": 0.0328, + "step": 1244 + }, + { + "epoch": 0.16448128942761833, + "grad_norm": 0.24703218042850494, + "learning_rate": 0.00018711396723613402, + "loss": 0.024, + "step": 1245 + }, + { + "epoch": 0.16461340291310236, + "grad_norm": 0.2914784550666809, + "learning_rate": 0.00018709355181640897, + "loss": 0.0313, + "step": 1246 + }, + { + "epoch": 0.1647455163985864, + "grad_norm": 0.20224568247795105, + "learning_rate": 0.00018707312135290125, + "loss": 0.0334, + "step": 1247 + }, + { + "epoch": 0.16487762988407043, + "grad_norm": 0.2872107923030853, + "learning_rate": 0.00018705267584913983, + "loss": 0.0312, + "step": 1248 + }, + { + "epoch": 0.16500974336955446, + "grad_norm": 0.3120299279689789, + "learning_rate": 0.00018703221530865633, + "loss": 0.0343, + "step": 1249 + }, + { + "epoch": 0.1651418568550385, + "grad_norm": 0.23731586337089539, + "learning_rate": 0.00018701173973498486, + "loss": 0.0277, + "step": 1250 + }, + { + "epoch": 0.16527397034052252, + "grad_norm": 0.3275063633918762, + "learning_rate": 0.00018699124913166228, + "loss": 0.0274, + "step": 1251 + }, + { + "epoch": 0.16540608382600655, + "grad_norm": 0.17656168341636658, + "learning_rate": 0.00018697074350222786, + "loss": 0.0246, + "step": 1252 + }, + { + "epoch": 0.16553819731149058, + "grad_norm": 0.27418434619903564, + "learning_rate": 0.00018695022285022364, + "loss": 0.0419, + "step": 1253 + }, + { + "epoch": 0.1656703107969746, + "grad_norm": 0.26168766617774963, + "learning_rate": 0.0001869296871791942, + "loss": 0.0262, + "step": 1254 + }, + { + "epoch": 0.16580242428245864, + "grad_norm": 0.21798458695411682, + "learning_rate": 0.0001869091364926866, + "loss": 0.0245, + "step": 1255 + }, + { + "epoch": 0.16593453776794267, + "grad_norm": 0.4269181191921234, + "learning_rate": 0.00018688857079425064, + "loss": 0.0287, + "step": 1256 + }, + { + "epoch": 0.1660666512534267, + "grad_norm": 0.35198020935058594, + "learning_rate": 0.00018686799008743864, + "loss": 0.0269, + "step": 1257 + }, + { + "epoch": 0.16619876473891074, + "grad_norm": 0.2965468168258667, + "learning_rate": 0.00018684739437580555, + "loss": 0.0391, + "step": 1258 + }, + { + "epoch": 0.16633087822439477, + "grad_norm": 0.2700027823448181, + "learning_rate": 0.00018682678366290894, + "loss": 0.0316, + "step": 1259 + }, + { + "epoch": 0.1664629917098788, + "grad_norm": 0.27826032042503357, + "learning_rate": 0.0001868061579523088, + "loss": 0.043, + "step": 1260 + }, + { + "epoch": 0.16659510519536283, + "grad_norm": 0.22224481403827667, + "learning_rate": 0.00018678551724756796, + "loss": 0.0281, + "step": 1261 + }, + { + "epoch": 0.16672721868084683, + "grad_norm": 0.23873165249824524, + "learning_rate": 0.00018676486155225168, + "loss": 0.0385, + "step": 1262 + }, + { + "epoch": 0.16685933216633086, + "grad_norm": 0.16569150984287262, + "learning_rate": 0.0001867441908699278, + "loss": 0.0256, + "step": 1263 + }, + { + "epoch": 0.1669914456518149, + "grad_norm": 0.29517295956611633, + "learning_rate": 0.00018672350520416683, + "loss": 0.0202, + "step": 1264 + }, + { + "epoch": 0.16712355913729893, + "grad_norm": 0.2139241248369217, + "learning_rate": 0.00018670280455854185, + "loss": 0.0368, + "step": 1265 + }, + { + "epoch": 0.16725567262278296, + "grad_norm": 0.25826069712638855, + "learning_rate": 0.0001866820889366285, + "loss": 0.0276, + "step": 1266 + }, + { + "epoch": 0.167387786108267, + "grad_norm": 0.205109640955925, + "learning_rate": 0.000186661358342005, + "loss": 0.0233, + "step": 1267 + }, + { + "epoch": 0.16751989959375102, + "grad_norm": 0.16500066220760345, + "learning_rate": 0.00018664061277825212, + "loss": 0.0273, + "step": 1268 + }, + { + "epoch": 0.16765201307923505, + "grad_norm": 0.38416817784309387, + "learning_rate": 0.00018661985224895339, + "loss": 0.0339, + "step": 1269 + }, + { + "epoch": 0.16778412656471908, + "grad_norm": 0.3248102366924286, + "learning_rate": 0.0001865990767576947, + "loss": 0.0314, + "step": 1270 + }, + { + "epoch": 0.1679162400502031, + "grad_norm": 0.3818456828594208, + "learning_rate": 0.00018657828630806467, + "loss": 0.0226, + "step": 1271 + }, + { + "epoch": 0.16804835353568714, + "grad_norm": 0.22842957079410553, + "learning_rate": 0.00018655748090365445, + "loss": 0.039, + "step": 1272 + }, + { + "epoch": 0.16818046702117118, + "grad_norm": 0.18327449262142181, + "learning_rate": 0.00018653666054805785, + "loss": 0.0211, + "step": 1273 + }, + { + "epoch": 0.1683125805066552, + "grad_norm": 0.31490403413772583, + "learning_rate": 0.0001865158252448711, + "loss": 0.0211, + "step": 1274 + }, + { + "epoch": 0.16844469399213924, + "grad_norm": 0.23804102838039398, + "learning_rate": 0.00018649497499769314, + "loss": 0.0231, + "step": 1275 + }, + { + "epoch": 0.16857680747762327, + "grad_norm": 0.17960631847381592, + "learning_rate": 0.0001864741098101255, + "loss": 0.0226, + "step": 1276 + }, + { + "epoch": 0.1687089209631073, + "grad_norm": 0.20831550657749176, + "learning_rate": 0.00018645322968577216, + "loss": 0.0265, + "step": 1277 + }, + { + "epoch": 0.16884103444859133, + "grad_norm": 0.19507895410060883, + "learning_rate": 0.00018643233462823988, + "loss": 0.0215, + "step": 1278 + }, + { + "epoch": 0.16897314793407536, + "grad_norm": 0.19099454581737518, + "learning_rate": 0.00018641142464113783, + "loss": 0.0239, + "step": 1279 + }, + { + "epoch": 0.1691052614195594, + "grad_norm": 0.32890570163726807, + "learning_rate": 0.00018639049972807783, + "loss": 0.0375, + "step": 1280 + }, + { + "epoch": 0.16923737490504343, + "grad_norm": 0.17816013097763062, + "learning_rate": 0.00018636955989267427, + "loss": 0.0131, + "step": 1281 + }, + { + "epoch": 0.16936948839052746, + "grad_norm": 0.2211841195821762, + "learning_rate": 0.00018634860513854412, + "loss": 0.0266, + "step": 1282 + }, + { + "epoch": 0.1695016018760115, + "grad_norm": 0.3327048420906067, + "learning_rate": 0.00018632763546930692, + "loss": 0.0431, + "step": 1283 + }, + { + "epoch": 0.16963371536149552, + "grad_norm": 0.3703952133655548, + "learning_rate": 0.00018630665088858477, + "loss": 0.0515, + "step": 1284 + }, + { + "epoch": 0.16976582884697955, + "grad_norm": 0.4615309536457062, + "learning_rate": 0.0001862856514000024, + "loss": 0.023, + "step": 1285 + }, + { + "epoch": 0.16989794233246358, + "grad_norm": 0.23466283082962036, + "learning_rate": 0.00018626463700718705, + "loss": 0.0348, + "step": 1286 + }, + { + "epoch": 0.1700300558179476, + "grad_norm": 0.17402033507823944, + "learning_rate": 0.00018624360771376855, + "loss": 0.0198, + "step": 1287 + }, + { + "epoch": 0.17016216930343164, + "grad_norm": 0.2167549729347229, + "learning_rate": 0.00018622256352337935, + "loss": 0.0186, + "step": 1288 + }, + { + "epoch": 0.17029428278891567, + "grad_norm": 0.20165550708770752, + "learning_rate": 0.00018620150443965442, + "loss": 0.0208, + "step": 1289 + }, + { + "epoch": 0.1704263962743997, + "grad_norm": 0.29167264699935913, + "learning_rate": 0.00018618043046623136, + "loss": 0.0407, + "step": 1290 + }, + { + "epoch": 0.17055850975988374, + "grad_norm": 0.23466479778289795, + "learning_rate": 0.00018615934160675024, + "loss": 0.0316, + "step": 1291 + }, + { + "epoch": 0.17069062324536777, + "grad_norm": 0.16358260810375214, + "learning_rate": 0.00018613823786485382, + "loss": 0.0139, + "step": 1292 + }, + { + "epoch": 0.1708227367308518, + "grad_norm": 0.228166401386261, + "learning_rate": 0.00018611711924418733, + "loss": 0.0328, + "step": 1293 + }, + { + "epoch": 0.17095485021633583, + "grad_norm": 0.18310043215751648, + "learning_rate": 0.00018609598574839868, + "loss": 0.022, + "step": 1294 + }, + { + "epoch": 0.17108696370181986, + "grad_norm": 0.27457547187805176, + "learning_rate": 0.00018607483738113825, + "loss": 0.0203, + "step": 1295 + }, + { + "epoch": 0.1712190771873039, + "grad_norm": 0.2539723217487335, + "learning_rate": 0.000186053674146059, + "loss": 0.0149, + "step": 1296 + }, + { + "epoch": 0.17135119067278792, + "grad_norm": 0.26341545581817627, + "learning_rate": 0.00018603249604681653, + "loss": 0.0463, + "step": 1297 + }, + { + "epoch": 0.17148330415827195, + "grad_norm": 0.9475441575050354, + "learning_rate": 0.00018601130308706896, + "loss": 0.0245, + "step": 1298 + }, + { + "epoch": 0.17161541764375599, + "grad_norm": 0.2520473599433899, + "learning_rate": 0.00018599009527047692, + "loss": 0.0324, + "step": 1299 + }, + { + "epoch": 0.17174753112924002, + "grad_norm": 0.201473668217659, + "learning_rate": 0.00018596887260070375, + "loss": 0.0163, + "step": 1300 + }, + { + "epoch": 0.17187964461472405, + "grad_norm": 0.3438372015953064, + "learning_rate": 0.00018594763508141516, + "loss": 0.0441, + "step": 1301 + }, + { + "epoch": 0.17201175810020808, + "grad_norm": 0.33211642503738403, + "learning_rate": 0.00018592638271627964, + "loss": 0.031, + "step": 1302 + }, + { + "epoch": 0.1721438715856921, + "grad_norm": 0.3771807551383972, + "learning_rate": 0.00018590511550896808, + "loss": 0.0374, + "step": 1303 + }, + { + "epoch": 0.17227598507117614, + "grad_norm": 0.5802851319313049, + "learning_rate": 0.00018588383346315404, + "loss": 0.0425, + "step": 1304 + }, + { + "epoch": 0.17240809855666017, + "grad_norm": 0.3841919004917145, + "learning_rate": 0.00018586253658251352, + "loss": 0.0443, + "step": 1305 + }, + { + "epoch": 0.1725402120421442, + "grad_norm": 0.25487831234931946, + "learning_rate": 0.00018584122487072522, + "loss": 0.0164, + "step": 1306 + }, + { + "epoch": 0.17267232552762823, + "grad_norm": 0.23374485969543457, + "learning_rate": 0.0001858198983314703, + "loss": 0.024, + "step": 1307 + }, + { + "epoch": 0.17280443901311227, + "grad_norm": 0.18719379603862762, + "learning_rate": 0.00018579855696843257, + "loss": 0.0159, + "step": 1308 + }, + { + "epoch": 0.1729365524985963, + "grad_norm": 0.2631910741329193, + "learning_rate": 0.0001857772007852983, + "loss": 0.0273, + "step": 1309 + }, + { + "epoch": 0.17306866598408033, + "grad_norm": 0.2309308648109436, + "learning_rate": 0.0001857558297857564, + "loss": 0.0201, + "step": 1310 + }, + { + "epoch": 0.17320077946956436, + "grad_norm": 0.2370709776878357, + "learning_rate": 0.0001857344439734983, + "loss": 0.0428, + "step": 1311 + }, + { + "epoch": 0.1733328929550484, + "grad_norm": 0.25953209400177, + "learning_rate": 0.00018571304335221803, + "loss": 0.0374, + "step": 1312 + }, + { + "epoch": 0.17346500644053242, + "grad_norm": 0.21717828512191772, + "learning_rate": 0.0001856916279256121, + "loss": 0.0279, + "step": 1313 + }, + { + "epoch": 0.17359711992601645, + "grad_norm": 0.1866343766450882, + "learning_rate": 0.00018567019769737963, + "loss": 0.0181, + "step": 1314 + }, + { + "epoch": 0.17372923341150048, + "grad_norm": 0.21271054446697235, + "learning_rate": 0.0001856487526712223, + "loss": 0.0246, + "step": 1315 + }, + { + "epoch": 0.17386134689698451, + "grad_norm": 0.2034507691860199, + "learning_rate": 0.00018562729285084438, + "loss": 0.0286, + "step": 1316 + }, + { + "epoch": 0.17399346038246855, + "grad_norm": 0.3184293508529663, + "learning_rate": 0.0001856058182399526, + "loss": 0.0478, + "step": 1317 + }, + { + "epoch": 0.17412557386795258, + "grad_norm": 0.2903006672859192, + "learning_rate": 0.00018558432884225633, + "loss": 0.0329, + "step": 1318 + }, + { + "epoch": 0.1742576873534366, + "grad_norm": 0.22487443685531616, + "learning_rate": 0.00018556282466146743, + "loss": 0.0217, + "step": 1319 + }, + { + "epoch": 0.17438980083892064, + "grad_norm": 0.2479228377342224, + "learning_rate": 0.00018554130570130038, + "loss": 0.0225, + "step": 1320 + }, + { + "epoch": 0.17452191432440467, + "grad_norm": 0.26292628049850464, + "learning_rate": 0.00018551977196547213, + "loss": 0.0361, + "step": 1321 + }, + { + "epoch": 0.1746540278098887, + "grad_norm": 0.2891198694705963, + "learning_rate": 0.0001854982234577023, + "loss": 0.0379, + "step": 1322 + }, + { + "epoch": 0.17478614129537273, + "grad_norm": 0.22966928780078888, + "learning_rate": 0.00018547666018171294, + "loss": 0.033, + "step": 1323 + }, + { + "epoch": 0.17491825478085676, + "grad_norm": 0.291325181722641, + "learning_rate": 0.0001854550821412287, + "loss": 0.0302, + "step": 1324 + }, + { + "epoch": 0.1750503682663408, + "grad_norm": 0.4915953576564789, + "learning_rate": 0.00018543348933997678, + "loss": 0.0372, + "step": 1325 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 0.3161768317222595, + "learning_rate": 0.00018541188178168696, + "loss": 0.0304, + "step": 1326 + }, + { + "epoch": 0.17531459523730886, + "grad_norm": 0.252323716878891, + "learning_rate": 0.00018539025947009153, + "loss": 0.0188, + "step": 1327 + }, + { + "epoch": 0.1754467087227929, + "grad_norm": 0.4096275866031647, + "learning_rate": 0.00018536862240892536, + "loss": 0.0511, + "step": 1328 + }, + { + "epoch": 0.17557882220827692, + "grad_norm": 0.22228872776031494, + "learning_rate": 0.00018534697060192584, + "loss": 0.032, + "step": 1329 + }, + { + "epoch": 0.17571093569376095, + "grad_norm": 0.20836091041564941, + "learning_rate": 0.00018532530405283287, + "loss": 0.0239, + "step": 1330 + }, + { + "epoch": 0.17584304917924498, + "grad_norm": 0.28031957149505615, + "learning_rate": 0.00018530362276538898, + "loss": 0.0292, + "step": 1331 + }, + { + "epoch": 0.175975162664729, + "grad_norm": 0.3088838756084442, + "learning_rate": 0.00018528192674333922, + "loss": 0.0408, + "step": 1332 + }, + { + "epoch": 0.17610727615021304, + "grad_norm": 0.28220516443252563, + "learning_rate": 0.00018526021599043113, + "loss": 0.0257, + "step": 1333 + }, + { + "epoch": 0.17623938963569707, + "grad_norm": 0.19157913327217102, + "learning_rate": 0.0001852384905104149, + "loss": 0.0297, + "step": 1334 + }, + { + "epoch": 0.1763715031211811, + "grad_norm": 0.3451235592365265, + "learning_rate": 0.00018521675030704312, + "loss": 0.027, + "step": 1335 + }, + { + "epoch": 0.17650361660666514, + "grad_norm": 0.3861960470676422, + "learning_rate": 0.00018519499538407105, + "loss": 0.041, + "step": 1336 + }, + { + "epoch": 0.17663573009214917, + "grad_norm": 0.2573603391647339, + "learning_rate": 0.00018517322574525648, + "loss": 0.0312, + "step": 1337 + }, + { + "epoch": 0.1767678435776332, + "grad_norm": 0.29054367542266846, + "learning_rate": 0.00018515144139435964, + "loss": 0.0243, + "step": 1338 + }, + { + "epoch": 0.17689995706311723, + "grad_norm": 0.24201183021068573, + "learning_rate": 0.0001851296423351434, + "loss": 0.0269, + "step": 1339 + }, + { + "epoch": 0.17703207054860126, + "grad_norm": 0.16764526069164276, + "learning_rate": 0.0001851078285713731, + "loss": 0.018, + "step": 1340 + }, + { + "epoch": 0.1771641840340853, + "grad_norm": 0.18756064772605896, + "learning_rate": 0.0001850860001068168, + "loss": 0.0224, + "step": 1341 + }, + { + "epoch": 0.17729629751956932, + "grad_norm": 0.27146032452583313, + "learning_rate": 0.00018506415694524478, + "loss": 0.0303, + "step": 1342 + }, + { + "epoch": 0.17742841100505333, + "grad_norm": 0.2190525382757187, + "learning_rate": 0.00018504229909043014, + "loss": 0.0218, + "step": 1343 + }, + { + "epoch": 0.17756052449053736, + "grad_norm": 0.30307960510253906, + "learning_rate": 0.00018502042654614838, + "loss": 0.0292, + "step": 1344 + }, + { + "epoch": 0.1776926379760214, + "grad_norm": 0.2004368156194687, + "learning_rate": 0.0001849985393161776, + "loss": 0.0298, + "step": 1345 + }, + { + "epoch": 0.17782475146150542, + "grad_norm": 0.22861526906490326, + "learning_rate": 0.00018497663740429837, + "loss": 0.0254, + "step": 1346 + }, + { + "epoch": 0.17795686494698945, + "grad_norm": 0.2857377529144287, + "learning_rate": 0.00018495472081429386, + "loss": 0.0302, + "step": 1347 + }, + { + "epoch": 0.17808897843247348, + "grad_norm": 0.275430291891098, + "learning_rate": 0.00018493278954994976, + "loss": 0.0387, + "step": 1348 + }, + { + "epoch": 0.1782210919179575, + "grad_norm": 0.23096689581871033, + "learning_rate": 0.0001849108436150543, + "loss": 0.0362, + "step": 1349 + }, + { + "epoch": 0.17835320540344154, + "grad_norm": 0.1883264034986496, + "learning_rate": 0.00018488888301339818, + "loss": 0.0239, + "step": 1350 + }, + { + "epoch": 0.17848531888892558, + "grad_norm": 0.3247344195842743, + "learning_rate": 0.00018486690774877472, + "loss": 0.04, + "step": 1351 + }, + { + "epoch": 0.1786174323744096, + "grad_norm": 0.15827877819538116, + "learning_rate": 0.00018484491782497974, + "loss": 0.0181, + "step": 1352 + }, + { + "epoch": 0.17874954585989364, + "grad_norm": 0.21765846014022827, + "learning_rate": 0.0001848229132458115, + "loss": 0.0374, + "step": 1353 + }, + { + "epoch": 0.17888165934537767, + "grad_norm": 0.29163116216659546, + "learning_rate": 0.00018480089401507103, + "loss": 0.03, + "step": 1354 + }, + { + "epoch": 0.1790137728308617, + "grad_norm": 0.1919863075017929, + "learning_rate": 0.00018477886013656164, + "loss": 0.0226, + "step": 1355 + }, + { + "epoch": 0.17914588631634573, + "grad_norm": 0.2065700888633728, + "learning_rate": 0.0001847568116140893, + "loss": 0.0203, + "step": 1356 + }, + { + "epoch": 0.17927799980182976, + "grad_norm": 0.32423102855682373, + "learning_rate": 0.0001847347484514625, + "loss": 0.0507, + "step": 1357 + }, + { + "epoch": 0.1794101132873138, + "grad_norm": 0.31353965401649475, + "learning_rate": 0.00018471267065249216, + "loss": 0.0267, + "step": 1358 + }, + { + "epoch": 0.17954222677279782, + "grad_norm": 0.29463842511177063, + "learning_rate": 0.00018469057822099192, + "loss": 0.0286, + "step": 1359 + }, + { + "epoch": 0.17967434025828186, + "grad_norm": 0.19099555909633636, + "learning_rate": 0.0001846684711607777, + "loss": 0.0199, + "step": 1360 + }, + { + "epoch": 0.1798064537437659, + "grad_norm": 0.3381175398826599, + "learning_rate": 0.00018464634947566825, + "loss": 0.0416, + "step": 1361 + }, + { + "epoch": 0.17993856722924992, + "grad_norm": 0.24425239861011505, + "learning_rate": 0.00018462421316948452, + "loss": 0.0322, + "step": 1362 + }, + { + "epoch": 0.18007068071473395, + "grad_norm": 0.20561014115810394, + "learning_rate": 0.0001846020622460502, + "loss": 0.0283, + "step": 1363 + }, + { + "epoch": 0.18020279420021798, + "grad_norm": 0.16568754613399506, + "learning_rate": 0.0001845798967091915, + "loss": 0.0153, + "step": 1364 + }, + { + "epoch": 0.180334907685702, + "grad_norm": 0.24680128693580627, + "learning_rate": 0.000184557716562737, + "loss": 0.0312, + "step": 1365 + }, + { + "epoch": 0.18046702117118604, + "grad_norm": 0.2902357876300812, + "learning_rate": 0.00018453552181051802, + "loss": 0.0385, + "step": 1366 + }, + { + "epoch": 0.18059913465667007, + "grad_norm": 0.20758208632469177, + "learning_rate": 0.00018451331245636818, + "loss": 0.0264, + "step": 1367 + }, + { + "epoch": 0.1807312481421541, + "grad_norm": 0.21378950774669647, + "learning_rate": 0.00018449108850412382, + "loss": 0.0326, + "step": 1368 + }, + { + "epoch": 0.18086336162763814, + "grad_norm": 0.32274001836776733, + "learning_rate": 0.00018446884995762365, + "loss": 0.0337, + "step": 1369 + }, + { + "epoch": 0.18099547511312217, + "grad_norm": 0.19334742426872253, + "learning_rate": 0.000184446596820709, + "loss": 0.0246, + "step": 1370 + }, + { + "epoch": 0.1811275885986062, + "grad_norm": 0.3434712588787079, + "learning_rate": 0.00018442432909722366, + "loss": 0.036, + "step": 1371 + }, + { + "epoch": 0.18125970208409023, + "grad_norm": 0.23470188677310944, + "learning_rate": 0.000184402046791014, + "loss": 0.0285, + "step": 1372 + }, + { + "epoch": 0.18139181556957426, + "grad_norm": 0.16895225644111633, + "learning_rate": 0.00018437974990592884, + "loss": 0.0179, + "step": 1373 + }, + { + "epoch": 0.1815239290550583, + "grad_norm": 0.26691296696662903, + "learning_rate": 0.00018435743844581954, + "loss": 0.025, + "step": 1374 + }, + { + "epoch": 0.18165604254054232, + "grad_norm": 0.21763736009597778, + "learning_rate": 0.00018433511241454001, + "loss": 0.0253, + "step": 1375 + }, + { + "epoch": 0.18178815602602635, + "grad_norm": 0.37234312295913696, + "learning_rate": 0.0001843127718159466, + "loss": 0.0342, + "step": 1376 + }, + { + "epoch": 0.18192026951151039, + "grad_norm": 0.22473712265491486, + "learning_rate": 0.00018429041665389835, + "loss": 0.0202, + "step": 1377 + }, + { + "epoch": 0.18205238299699442, + "grad_norm": 0.18173733353614807, + "learning_rate": 0.00018426804693225658, + "loss": 0.0259, + "step": 1378 + }, + { + "epoch": 0.18218449648247845, + "grad_norm": 0.1645773947238922, + "learning_rate": 0.00018424566265488532, + "loss": 0.012, + "step": 1379 + }, + { + "epoch": 0.18231660996796248, + "grad_norm": 0.3594745695590973, + "learning_rate": 0.00018422326382565096, + "loss": 0.0333, + "step": 1380 + }, + { + "epoch": 0.1824487234534465, + "grad_norm": 0.2075868397951126, + "learning_rate": 0.00018420085044842255, + "loss": 0.0246, + "step": 1381 + }, + { + "epoch": 0.18258083693893054, + "grad_norm": 0.26188474893569946, + "learning_rate": 0.0001841784225270715, + "loss": 0.0444, + "step": 1382 + }, + { + "epoch": 0.18271295042441457, + "grad_norm": 0.2381543070077896, + "learning_rate": 0.00018415598006547192, + "loss": 0.0156, + "step": 1383 + }, + { + "epoch": 0.1828450639098986, + "grad_norm": 0.2644082009792328, + "learning_rate": 0.00018413352306750026, + "loss": 0.0467, + "step": 1384 + }, + { + "epoch": 0.18297717739538263, + "grad_norm": 0.21643215417861938, + "learning_rate": 0.00018411105153703556, + "loss": 0.0344, + "step": 1385 + }, + { + "epoch": 0.18310929088086667, + "grad_norm": 0.2230250984430313, + "learning_rate": 0.00018408856547795933, + "loss": 0.0287, + "step": 1386 + }, + { + "epoch": 0.1832414043663507, + "grad_norm": 0.2237335443496704, + "learning_rate": 0.00018406606489415568, + "loss": 0.0341, + "step": 1387 + }, + { + "epoch": 0.18337351785183473, + "grad_norm": 0.2967820167541504, + "learning_rate": 0.0001840435497895111, + "loss": 0.0216, + "step": 1388 + }, + { + "epoch": 0.18350563133731876, + "grad_norm": 0.21675102412700653, + "learning_rate": 0.00018402102016791468, + "loss": 0.0246, + "step": 1389 + }, + { + "epoch": 0.1836377448228028, + "grad_norm": 0.22738157212734222, + "learning_rate": 0.000183998476033258, + "loss": 0.0367, + "step": 1390 + }, + { + "epoch": 0.18376985830828682, + "grad_norm": 0.23565442860126495, + "learning_rate": 0.0001839759173894351, + "loss": 0.0327, + "step": 1391 + }, + { + "epoch": 0.18390197179377085, + "grad_norm": 0.3125069737434387, + "learning_rate": 0.00018395334424034263, + "loss": 0.0252, + "step": 1392 + }, + { + "epoch": 0.18403408527925488, + "grad_norm": 0.2953563630580902, + "learning_rate": 0.00018393075658987962, + "loss": 0.0383, + "step": 1393 + }, + { + "epoch": 0.18416619876473891, + "grad_norm": 0.2701183259487152, + "learning_rate": 0.00018390815444194766, + "loss": 0.0235, + "step": 1394 + }, + { + "epoch": 0.18429831225022295, + "grad_norm": 0.26304230093955994, + "learning_rate": 0.00018388553780045093, + "loss": 0.0379, + "step": 1395 + }, + { + "epoch": 0.18443042573570698, + "grad_norm": 0.2280203104019165, + "learning_rate": 0.00018386290666929593, + "loss": 0.0302, + "step": 1396 + }, + { + "epoch": 0.184562539221191, + "grad_norm": 0.2568672001361847, + "learning_rate": 0.00018384026105239184, + "loss": 0.0227, + "step": 1397 + }, + { + "epoch": 0.18469465270667504, + "grad_norm": 0.556996762752533, + "learning_rate": 0.00018381760095365022, + "loss": 0.0346, + "step": 1398 + }, + { + "epoch": 0.18482676619215907, + "grad_norm": 0.3996849060058594, + "learning_rate": 0.0001837949263769852, + "loss": 0.0229, + "step": 1399 + }, + { + "epoch": 0.1849588796776431, + "grad_norm": 0.2925168573856354, + "learning_rate": 0.00018377223732631337, + "loss": 0.0385, + "step": 1400 + }, + { + "epoch": 0.18509099316312713, + "grad_norm": 0.26440808176994324, + "learning_rate": 0.00018374953380555388, + "loss": 0.0262, + "step": 1401 + }, + { + "epoch": 0.18522310664861116, + "grad_norm": 0.23454102873802185, + "learning_rate": 0.0001837268158186283, + "loss": 0.0251, + "step": 1402 + }, + { + "epoch": 0.1853552201340952, + "grad_norm": 0.6885169744491577, + "learning_rate": 0.00018370408336946075, + "loss": 0.0437, + "step": 1403 + }, + { + "epoch": 0.18548733361957923, + "grad_norm": 0.3095178008079529, + "learning_rate": 0.00018368133646197782, + "loss": 0.0287, + "step": 1404 + }, + { + "epoch": 0.18561944710506326, + "grad_norm": 0.24140727519989014, + "learning_rate": 0.00018365857510010866, + "loss": 0.0284, + "step": 1405 + }, + { + "epoch": 0.1857515605905473, + "grad_norm": 0.32140839099884033, + "learning_rate": 0.00018363579928778483, + "loss": 0.037, + "step": 1406 + }, + { + "epoch": 0.18588367407603132, + "grad_norm": 0.2660520374774933, + "learning_rate": 0.00018361300902894044, + "loss": 0.0512, + "step": 1407 + }, + { + "epoch": 0.18601578756151535, + "grad_norm": 0.2977862060070038, + "learning_rate": 0.00018359020432751205, + "loss": 0.0348, + "step": 1408 + }, + { + "epoch": 0.18614790104699938, + "grad_norm": 0.2596471607685089, + "learning_rate": 0.0001835673851874388, + "loss": 0.0302, + "step": 1409 + }, + { + "epoch": 0.1862800145324834, + "grad_norm": 0.20368549227714539, + "learning_rate": 0.0001835445516126622, + "loss": 0.0331, + "step": 1410 + }, + { + "epoch": 0.18641212801796744, + "grad_norm": 0.24589885771274567, + "learning_rate": 0.00018352170360712639, + "loss": 0.0294, + "step": 1411 + }, + { + "epoch": 0.18654424150345147, + "grad_norm": 0.33342570066452026, + "learning_rate": 0.0001834988411747779, + "loss": 0.0333, + "step": 1412 + }, + { + "epoch": 0.1866763549889355, + "grad_norm": 0.17387668788433075, + "learning_rate": 0.00018347596431956582, + "loss": 0.0204, + "step": 1413 + }, + { + "epoch": 0.18680846847441954, + "grad_norm": 0.25326329469680786, + "learning_rate": 0.0001834530730454417, + "loss": 0.0354, + "step": 1414 + }, + { + "epoch": 0.18694058195990357, + "grad_norm": 0.18185511231422424, + "learning_rate": 0.0001834301673563595, + "loss": 0.0292, + "step": 1415 + }, + { + "epoch": 0.1870726954453876, + "grad_norm": 0.16677400469779968, + "learning_rate": 0.00018340724725627583, + "loss": 0.0204, + "step": 1416 + }, + { + "epoch": 0.18720480893087163, + "grad_norm": 0.2312796413898468, + "learning_rate": 0.0001833843127491497, + "loss": 0.0294, + "step": 1417 + }, + { + "epoch": 0.18733692241635566, + "grad_norm": 0.21785087883472443, + "learning_rate": 0.00018336136383894256, + "loss": 0.033, + "step": 1418 + }, + { + "epoch": 0.1874690359018397, + "grad_norm": 0.22082215547561646, + "learning_rate": 0.0001833384005296185, + "loss": 0.0252, + "step": 1419 + }, + { + "epoch": 0.18760114938732372, + "grad_norm": 0.22463281452655792, + "learning_rate": 0.0001833154228251439, + "loss": 0.0298, + "step": 1420 + }, + { + "epoch": 0.18773326287280775, + "grad_norm": 0.20574955642223358, + "learning_rate": 0.0001832924307294878, + "loss": 0.0192, + "step": 1421 + }, + { + "epoch": 0.18786537635829179, + "grad_norm": 0.20774230360984802, + "learning_rate": 0.00018326942424662165, + "loss": 0.0211, + "step": 1422 + }, + { + "epoch": 0.18799748984377582, + "grad_norm": 0.2312815636396408, + "learning_rate": 0.00018324640338051934, + "loss": 0.0402, + "step": 1423 + }, + { + "epoch": 0.18812960332925985, + "grad_norm": 0.18057630956172943, + "learning_rate": 0.00018322336813515733, + "loss": 0.0284, + "step": 1424 + }, + { + "epoch": 0.18826171681474385, + "grad_norm": 0.20045863091945648, + "learning_rate": 0.00018320031851451452, + "loss": 0.0193, + "step": 1425 + }, + { + "epoch": 0.18839383030022788, + "grad_norm": 0.21414987742900848, + "learning_rate": 0.00018317725452257234, + "loss": 0.0313, + "step": 1426 + }, + { + "epoch": 0.1885259437857119, + "grad_norm": 0.2546866834163666, + "learning_rate": 0.0001831541761633146, + "loss": 0.0342, + "step": 1427 + }, + { + "epoch": 0.18865805727119594, + "grad_norm": 0.23323996365070343, + "learning_rate": 0.0001831310834407277, + "loss": 0.0256, + "step": 1428 + }, + { + "epoch": 0.18879017075667998, + "grad_norm": 0.1575809270143509, + "learning_rate": 0.00018310797635880043, + "loss": 0.0178, + "step": 1429 + }, + { + "epoch": 0.188922284242164, + "grad_norm": 0.2791820466518402, + "learning_rate": 0.0001830848549215242, + "loss": 0.0385, + "step": 1430 + }, + { + "epoch": 0.18905439772764804, + "grad_norm": 0.18286937475204468, + "learning_rate": 0.00018306171913289268, + "loss": 0.0152, + "step": 1431 + }, + { + "epoch": 0.18918651121313207, + "grad_norm": 0.2981870770454407, + "learning_rate": 0.00018303856899690223, + "loss": 0.0248, + "step": 1432 + }, + { + "epoch": 0.1893186246986161, + "grad_norm": 0.2948318123817444, + "learning_rate": 0.00018301540451755158, + "loss": 0.0206, + "step": 1433 + }, + { + "epoch": 0.18945073818410013, + "grad_norm": 0.25580301880836487, + "learning_rate": 0.00018299222569884198, + "loss": 0.0245, + "step": 1434 + }, + { + "epoch": 0.18958285166958416, + "grad_norm": 0.23851770162582397, + "learning_rate": 0.0001829690325447771, + "loss": 0.0293, + "step": 1435 + }, + { + "epoch": 0.1897149651550682, + "grad_norm": 0.35320428013801575, + "learning_rate": 0.00018294582505936312, + "loss": 0.0419, + "step": 1436 + }, + { + "epoch": 0.18984707864055222, + "grad_norm": 0.2724175453186035, + "learning_rate": 0.00018292260324660875, + "loss": 0.0336, + "step": 1437 + }, + { + "epoch": 0.18997919212603626, + "grad_norm": 0.1679764837026596, + "learning_rate": 0.0001828993671105251, + "loss": 0.0232, + "step": 1438 + }, + { + "epoch": 0.1901113056115203, + "grad_norm": 0.19844774901866913, + "learning_rate": 0.00018287611665512575, + "loss": 0.0216, + "step": 1439 + }, + { + "epoch": 0.19024341909700432, + "grad_norm": 0.26759544014930725, + "learning_rate": 0.00018285285188442683, + "loss": 0.0296, + "step": 1440 + }, + { + "epoch": 0.19037553258248835, + "grad_norm": 0.6173081994056702, + "learning_rate": 0.00018282957280244685, + "loss": 0.0172, + "step": 1441 + }, + { + "epoch": 0.19050764606797238, + "grad_norm": 0.2865166664123535, + "learning_rate": 0.00018280627941320688, + "loss": 0.0303, + "step": 1442 + }, + { + "epoch": 0.1906397595534564, + "grad_norm": 0.24494263529777527, + "learning_rate": 0.00018278297172073037, + "loss": 0.0247, + "step": 1443 + }, + { + "epoch": 0.19077187303894044, + "grad_norm": 0.1931535005569458, + "learning_rate": 0.0001827596497290433, + "loss": 0.0251, + "step": 1444 + }, + { + "epoch": 0.19090398652442447, + "grad_norm": 0.2376992106437683, + "learning_rate": 0.00018273631344217415, + "loss": 0.022, + "step": 1445 + }, + { + "epoch": 0.1910361000099085, + "grad_norm": 0.23395435512065887, + "learning_rate": 0.00018271296286415377, + "loss": 0.0311, + "step": 1446 + }, + { + "epoch": 0.19116821349539254, + "grad_norm": 0.2956295311450958, + "learning_rate": 0.00018268959799901558, + "loss": 0.0255, + "step": 1447 + }, + { + "epoch": 0.19130032698087657, + "grad_norm": 0.31732040643692017, + "learning_rate": 0.0001826662188507954, + "loss": 0.0342, + "step": 1448 + }, + { + "epoch": 0.1914324404663606, + "grad_norm": 0.20662644505500793, + "learning_rate": 0.00018264282542353156, + "loss": 0.0177, + "step": 1449 + }, + { + "epoch": 0.19156455395184463, + "grad_norm": 0.15632778406143188, + "learning_rate": 0.00018261941772126477, + "loss": 0.0147, + "step": 1450 + }, + { + "epoch": 0.19169666743732866, + "grad_norm": 0.18582721054553986, + "learning_rate": 0.0001825959957480384, + "loss": 0.0253, + "step": 1451 + }, + { + "epoch": 0.1918287809228127, + "grad_norm": 0.3005513846874237, + "learning_rate": 0.00018257255950789803, + "loss": 0.0497, + "step": 1452 + }, + { + "epoch": 0.19196089440829672, + "grad_norm": 0.20745569467544556, + "learning_rate": 0.0001825491090048919, + "loss": 0.0277, + "step": 1453 + }, + { + "epoch": 0.19209300789378075, + "grad_norm": 0.22888556122779846, + "learning_rate": 0.00018252564424307065, + "loss": 0.0304, + "step": 1454 + }, + { + "epoch": 0.19222512137926479, + "grad_norm": 0.2664090394973755, + "learning_rate": 0.00018250216522648738, + "loss": 0.0344, + "step": 1455 + }, + { + "epoch": 0.19235723486474882, + "grad_norm": 0.27466508746147156, + "learning_rate": 0.0001824786719591976, + "loss": 0.0235, + "step": 1456 + }, + { + "epoch": 0.19248934835023285, + "grad_norm": 0.21832376718521118, + "learning_rate": 0.00018245516444525937, + "loss": 0.0137, + "step": 1457 + }, + { + "epoch": 0.19262146183571688, + "grad_norm": 0.163347527384758, + "learning_rate": 0.00018243164268873317, + "loss": 0.0133, + "step": 1458 + }, + { + "epoch": 0.1927535753212009, + "grad_norm": 0.19356457889080048, + "learning_rate": 0.00018240810669368194, + "loss": 0.026, + "step": 1459 + }, + { + "epoch": 0.19288568880668494, + "grad_norm": 0.336757093667984, + "learning_rate": 0.00018238455646417108, + "loss": 0.034, + "step": 1460 + }, + { + "epoch": 0.19301780229216897, + "grad_norm": 0.197519913315773, + "learning_rate": 0.00018236099200426845, + "loss": 0.0303, + "step": 1461 + }, + { + "epoch": 0.193149915777653, + "grad_norm": 0.40165066719055176, + "learning_rate": 0.0001823374133180444, + "loss": 0.0409, + "step": 1462 + }, + { + "epoch": 0.19328202926313703, + "grad_norm": 0.167204812169075, + "learning_rate": 0.00018231382040957166, + "loss": 0.0193, + "step": 1463 + }, + { + "epoch": 0.19341414274862107, + "grad_norm": 0.1733560860157013, + "learning_rate": 0.0001822902132829255, + "loss": 0.0189, + "step": 1464 + }, + { + "epoch": 0.1935462562341051, + "grad_norm": 0.22970251739025116, + "learning_rate": 0.00018226659194218363, + "loss": 0.0337, + "step": 1465 + }, + { + "epoch": 0.19367836971958913, + "grad_norm": 0.5224602818489075, + "learning_rate": 0.00018224295639142612, + "loss": 0.0214, + "step": 1466 + }, + { + "epoch": 0.19381048320507316, + "grad_norm": 0.18081384897232056, + "learning_rate": 0.0001822193066347356, + "loss": 0.0196, + "step": 1467 + }, + { + "epoch": 0.1939425966905572, + "grad_norm": 0.2021227478981018, + "learning_rate": 0.00018219564267619719, + "loss": 0.0253, + "step": 1468 + }, + { + "epoch": 0.19407471017604122, + "grad_norm": 0.17635184526443481, + "learning_rate": 0.00018217196451989832, + "loss": 0.0187, + "step": 1469 + }, + { + "epoch": 0.19420682366152525, + "grad_norm": 0.36210498213768005, + "learning_rate": 0.00018214827216992893, + "loss": 0.0383, + "step": 1470 + }, + { + "epoch": 0.19433893714700928, + "grad_norm": 0.2603173553943634, + "learning_rate": 0.00018212456563038151, + "loss": 0.0212, + "step": 1471 + }, + { + "epoch": 0.19447105063249331, + "grad_norm": 0.1690969467163086, + "learning_rate": 0.00018210084490535088, + "loss": 0.0252, + "step": 1472 + }, + { + "epoch": 0.19460316411797735, + "grad_norm": 0.17981064319610596, + "learning_rate": 0.00018207710999893436, + "loss": 0.0257, + "step": 1473 + }, + { + "epoch": 0.19473527760346138, + "grad_norm": 0.2037472426891327, + "learning_rate": 0.00018205336091523167, + "loss": 0.0228, + "step": 1474 + }, + { + "epoch": 0.1948673910889454, + "grad_norm": 0.3084849715232849, + "learning_rate": 0.0001820295976583451, + "loss": 0.0321, + "step": 1475 + }, + { + "epoch": 0.19499950457442944, + "grad_norm": 0.24606458842754364, + "learning_rate": 0.00018200582023237925, + "loss": 0.0327, + "step": 1476 + }, + { + "epoch": 0.19513161805991347, + "grad_norm": 0.2386983186006546, + "learning_rate": 0.00018198202864144124, + "loss": 0.0245, + "step": 1477 + }, + { + "epoch": 0.1952637315453975, + "grad_norm": 0.19429266452789307, + "learning_rate": 0.00018195822288964063, + "loss": 0.0266, + "step": 1478 + }, + { + "epoch": 0.19539584503088153, + "grad_norm": 0.20982322096824646, + "learning_rate": 0.00018193440298108939, + "loss": 0.0227, + "step": 1479 + }, + { + "epoch": 0.19552795851636556, + "grad_norm": 0.20643344521522522, + "learning_rate": 0.00018191056891990202, + "loss": 0.0294, + "step": 1480 + }, + { + "epoch": 0.1956600720018496, + "grad_norm": 0.2400667667388916, + "learning_rate": 0.00018188672071019535, + "loss": 0.0242, + "step": 1481 + }, + { + "epoch": 0.19579218548733363, + "grad_norm": 0.3001692593097687, + "learning_rate": 0.0001818628583560887, + "loss": 0.0316, + "step": 1482 + }, + { + "epoch": 0.19592429897281766, + "grad_norm": 0.22269077599048615, + "learning_rate": 0.00018183898186170395, + "loss": 0.0338, + "step": 1483 + }, + { + "epoch": 0.1960564124583017, + "grad_norm": 0.26167190074920654, + "learning_rate": 0.0001818150912311652, + "loss": 0.0361, + "step": 1484 + }, + { + "epoch": 0.19618852594378572, + "grad_norm": 0.2652554512023926, + "learning_rate": 0.00018179118646859918, + "loss": 0.0194, + "step": 1485 + }, + { + "epoch": 0.19632063942926975, + "grad_norm": 0.26757028698921204, + "learning_rate": 0.00018176726757813497, + "loss": 0.0173, + "step": 1486 + }, + { + "epoch": 0.19645275291475378, + "grad_norm": 0.28542211651802063, + "learning_rate": 0.00018174333456390409, + "loss": 0.0436, + "step": 1487 + }, + { + "epoch": 0.1965848664002378, + "grad_norm": 0.25279921293258667, + "learning_rate": 0.00018171938743004055, + "loss": 0.0358, + "step": 1488 + }, + { + "epoch": 0.19671697988572184, + "grad_norm": 0.2647570073604584, + "learning_rate": 0.00018169542618068078, + "loss": 0.0168, + "step": 1489 + }, + { + "epoch": 0.19684909337120587, + "grad_norm": 0.20313741266727448, + "learning_rate": 0.00018167145081996358, + "loss": 0.0248, + "step": 1490 + }, + { + "epoch": 0.1969812068566899, + "grad_norm": 0.8464295864105225, + "learning_rate": 0.00018164746135203034, + "loss": 0.0314, + "step": 1491 + }, + { + "epoch": 0.19711332034217394, + "grad_norm": 0.15083986520767212, + "learning_rate": 0.0001816234577810247, + "loss": 0.0111, + "step": 1492 + }, + { + "epoch": 0.19724543382765797, + "grad_norm": 0.30844977498054504, + "learning_rate": 0.0001815994401110929, + "loss": 0.029, + "step": 1493 + }, + { + "epoch": 0.197377547313142, + "grad_norm": 0.2315860390663147, + "learning_rate": 0.00018157540834638346, + "loss": 0.0282, + "step": 1494 + }, + { + "epoch": 0.19750966079862603, + "grad_norm": 0.249103382229805, + "learning_rate": 0.00018155136249104747, + "loss": 0.0227, + "step": 1495 + }, + { + "epoch": 0.19764177428411006, + "grad_norm": 0.1728508025407791, + "learning_rate": 0.00018152730254923841, + "loss": 0.0255, + "step": 1496 + }, + { + "epoch": 0.1977738877695941, + "grad_norm": 0.35926568508148193, + "learning_rate": 0.00018150322852511218, + "loss": 0.021, + "step": 1497 + }, + { + "epoch": 0.19790600125507812, + "grad_norm": 0.22047758102416992, + "learning_rate": 0.0001814791404228271, + "loss": 0.0345, + "step": 1498 + }, + { + "epoch": 0.19803811474056215, + "grad_norm": 0.28702178597450256, + "learning_rate": 0.00018145503824654394, + "loss": 0.0246, + "step": 1499 + }, + { + "epoch": 0.19817022822604619, + "grad_norm": 0.24854514002799988, + "learning_rate": 0.00018143092200042596, + "loss": 0.0303, + "step": 1500 + }, + { + "epoch": 0.19830234171153022, + "grad_norm": 0.21608246862888336, + "learning_rate": 0.0001814067916886387, + "loss": 0.0155, + "step": 1501 + }, + { + "epoch": 0.19843445519701425, + "grad_norm": 0.21416838467121124, + "learning_rate": 0.00018138264731535025, + "loss": 0.0243, + "step": 1502 + }, + { + "epoch": 0.19856656868249828, + "grad_norm": 0.21499444544315338, + "learning_rate": 0.00018135848888473115, + "loss": 0.0268, + "step": 1503 + }, + { + "epoch": 0.1986986821679823, + "grad_norm": 0.2473350167274475, + "learning_rate": 0.00018133431640095425, + "loss": 0.0181, + "step": 1504 + }, + { + "epoch": 0.19883079565346634, + "grad_norm": 0.15252315998077393, + "learning_rate": 0.0001813101298681949, + "loss": 0.0165, + "step": 1505 + }, + { + "epoch": 0.19896290913895034, + "grad_norm": 0.2195468693971634, + "learning_rate": 0.00018128592929063093, + "loss": 0.0302, + "step": 1506 + }, + { + "epoch": 0.19909502262443438, + "grad_norm": 0.19480378925800323, + "learning_rate": 0.00018126171467244248, + "loss": 0.026, + "step": 1507 + }, + { + "epoch": 0.1992271361099184, + "grad_norm": 0.3477564752101898, + "learning_rate": 0.0001812374860178122, + "loss": 0.0353, + "step": 1508 + }, + { + "epoch": 0.19935924959540244, + "grad_norm": 0.19011445343494415, + "learning_rate": 0.00018121324333092513, + "loss": 0.0203, + "step": 1509 + }, + { + "epoch": 0.19949136308088647, + "grad_norm": 0.2625371515750885, + "learning_rate": 0.00018118898661596876, + "loss": 0.0191, + "step": 1510 + }, + { + "epoch": 0.1996234765663705, + "grad_norm": 0.18926642835140228, + "learning_rate": 0.00018116471587713293, + "loss": 0.0178, + "step": 1511 + }, + { + "epoch": 0.19975559005185453, + "grad_norm": 0.17896400392055511, + "learning_rate": 0.00018114043111861, + "loss": 0.025, + "step": 1512 + }, + { + "epoch": 0.19988770353733856, + "grad_norm": 0.2485428750514984, + "learning_rate": 0.00018111613234459472, + "loss": 0.0186, + "step": 1513 + }, + { + "epoch": 0.2000198170228226, + "grad_norm": 0.3549768030643463, + "learning_rate": 0.0001810918195592842, + "loss": 0.047, + "step": 1514 + }, + { + "epoch": 0.20015193050830662, + "grad_norm": 0.2386510819196701, + "learning_rate": 0.00018106749276687806, + "loss": 0.0269, + "step": 1515 + }, + { + "epoch": 0.20028404399379066, + "grad_norm": 0.21441778540611267, + "learning_rate": 0.0001810431519715783, + "loss": 0.0271, + "step": 1516 + }, + { + "epoch": 0.2004161574792747, + "grad_norm": 0.3943035900592804, + "learning_rate": 0.00018101879717758931, + "loss": 0.0253, + "step": 1517 + }, + { + "epoch": 0.20054827096475872, + "grad_norm": 0.2370191365480423, + "learning_rate": 0.00018099442838911793, + "loss": 0.0418, + "step": 1518 + }, + { + "epoch": 0.20068038445024275, + "grad_norm": 0.2918333113193512, + "learning_rate": 0.00018097004561037344, + "loss": 0.0371, + "step": 1519 + }, + { + "epoch": 0.20081249793572678, + "grad_norm": 0.23465055227279663, + "learning_rate": 0.00018094564884556745, + "loss": 0.0301, + "step": 1520 + }, + { + "epoch": 0.2009446114212108, + "grad_norm": 0.1612727791070938, + "learning_rate": 0.00018092123809891413, + "loss": 0.0219, + "step": 1521 + }, + { + "epoch": 0.20107672490669484, + "grad_norm": 0.21482056379318237, + "learning_rate": 0.0001808968133746299, + "loss": 0.0307, + "step": 1522 + }, + { + "epoch": 0.20120883839217887, + "grad_norm": 0.24555975198745728, + "learning_rate": 0.00018087237467693374, + "loss": 0.0233, + "step": 1523 + }, + { + "epoch": 0.2013409518776629, + "grad_norm": 0.1954948455095291, + "learning_rate": 0.0001808479220100469, + "loss": 0.0236, + "step": 1524 + }, + { + "epoch": 0.20147306536314694, + "grad_norm": 0.2907003164291382, + "learning_rate": 0.00018082345537819326, + "loss": 0.0356, + "step": 1525 + }, + { + "epoch": 0.20160517884863097, + "grad_norm": 0.17122356593608856, + "learning_rate": 0.00018079897478559878, + "loss": 0.026, + "step": 1526 + }, + { + "epoch": 0.201737292334115, + "grad_norm": 0.2171349972486496, + "learning_rate": 0.00018077448023649218, + "loss": 0.0284, + "step": 1527 + }, + { + "epoch": 0.20186940581959903, + "grad_norm": 0.18378032743930817, + "learning_rate": 0.00018074997173510437, + "loss": 0.0255, + "step": 1528 + }, + { + "epoch": 0.20200151930508306, + "grad_norm": 0.7698642611503601, + "learning_rate": 0.00018072544928566874, + "loss": 0.0277, + "step": 1529 + }, + { + "epoch": 0.2021336327905671, + "grad_norm": 0.23853057622909546, + "learning_rate": 0.00018070091289242114, + "loss": 0.0294, + "step": 1530 + }, + { + "epoch": 0.20226574627605112, + "grad_norm": 0.3057100176811218, + "learning_rate": 0.00018067636255959964, + "loss": 0.0363, + "step": 1531 + }, + { + "epoch": 0.20239785976153515, + "grad_norm": 0.28446629643440247, + "learning_rate": 0.00018065179829144498, + "loss": 0.034, + "step": 1532 + }, + { + "epoch": 0.20252997324701918, + "grad_norm": 0.4760570526123047, + "learning_rate": 0.00018062722009220015, + "loss": 0.0383, + "step": 1533 + }, + { + "epoch": 0.20266208673250322, + "grad_norm": 0.36639007925987244, + "learning_rate": 0.00018060262796611057, + "loss": 0.0224, + "step": 1534 + }, + { + "epoch": 0.20279420021798725, + "grad_norm": 0.23543117940425873, + "learning_rate": 0.00018057802191742402, + "loss": 0.0393, + "step": 1535 + }, + { + "epoch": 0.20292631370347128, + "grad_norm": 0.21014980971813202, + "learning_rate": 0.00018055340195039077, + "loss": 0.0271, + "step": 1536 + }, + { + "epoch": 0.2030584271889553, + "grad_norm": 0.2565693259239197, + "learning_rate": 0.00018052876806926347, + "loss": 0.0347, + "step": 1537 + }, + { + "epoch": 0.20319054067443934, + "grad_norm": 0.24108761548995972, + "learning_rate": 0.00018050412027829715, + "loss": 0.0182, + "step": 1538 + }, + { + "epoch": 0.20332265415992337, + "grad_norm": 0.2666507363319397, + "learning_rate": 0.00018047945858174925, + "loss": 0.0289, + "step": 1539 + }, + { + "epoch": 0.2034547676454074, + "grad_norm": 0.2039230316877365, + "learning_rate": 0.00018045478298387967, + "loss": 0.0244, + "step": 1540 + }, + { + "epoch": 0.20358688113089143, + "grad_norm": 0.26694339513778687, + "learning_rate": 0.00018043009348895058, + "loss": 0.0285, + "step": 1541 + }, + { + "epoch": 0.20371899461637547, + "grad_norm": 0.23415853083133698, + "learning_rate": 0.00018040539010122668, + "loss": 0.0325, + "step": 1542 + }, + { + "epoch": 0.2038511081018595, + "grad_norm": 0.23588255047798157, + "learning_rate": 0.000180380672824975, + "loss": 0.0282, + "step": 1543 + }, + { + "epoch": 0.20398322158734353, + "grad_norm": 0.2112555056810379, + "learning_rate": 0.00018035594166446498, + "loss": 0.0322, + "step": 1544 + }, + { + "epoch": 0.20411533507282756, + "grad_norm": 0.21087202429771423, + "learning_rate": 0.00018033119662396846, + "loss": 0.0237, + "step": 1545 + }, + { + "epoch": 0.2042474485583116, + "grad_norm": 0.24182988703250885, + "learning_rate": 0.00018030643770775972, + "loss": 0.0251, + "step": 1546 + }, + { + "epoch": 0.20437956204379562, + "grad_norm": 0.205749049782753, + "learning_rate": 0.00018028166492011538, + "loss": 0.0242, + "step": 1547 + }, + { + "epoch": 0.20451167552927965, + "grad_norm": 0.26967766880989075, + "learning_rate": 0.00018025687826531445, + "loss": 0.0312, + "step": 1548 + }, + { + "epoch": 0.20464378901476368, + "grad_norm": 0.3510602116584778, + "learning_rate": 0.0001802320777476384, + "loss": 0.0512, + "step": 1549 + }, + { + "epoch": 0.20477590250024771, + "grad_norm": 0.1554529070854187, + "learning_rate": 0.00018020726337137106, + "loss": 0.0141, + "step": 1550 + }, + { + "epoch": 0.20490801598573175, + "grad_norm": 0.22048431634902954, + "learning_rate": 0.00018018243514079861, + "loss": 0.0319, + "step": 1551 + }, + { + "epoch": 0.20504012947121578, + "grad_norm": 0.24141453206539154, + "learning_rate": 0.00018015759306020968, + "loss": 0.0299, + "step": 1552 + }, + { + "epoch": 0.2051722429566998, + "grad_norm": 0.2439783215522766, + "learning_rate": 0.00018013273713389527, + "loss": 0.035, + "step": 1553 + }, + { + "epoch": 0.20530435644218384, + "grad_norm": 0.2585470676422119, + "learning_rate": 0.0001801078673661488, + "loss": 0.0367, + "step": 1554 + }, + { + "epoch": 0.20543646992766787, + "grad_norm": 0.33572009205818176, + "learning_rate": 0.00018008298376126605, + "loss": 0.0409, + "step": 1555 + }, + { + "epoch": 0.2055685834131519, + "grad_norm": 0.2367447465658188, + "learning_rate": 0.00018005808632354516, + "loss": 0.0269, + "step": 1556 + }, + { + "epoch": 0.20570069689863593, + "grad_norm": 0.20082278549671173, + "learning_rate": 0.00018003317505728674, + "loss": 0.0285, + "step": 1557 + }, + { + "epoch": 0.20583281038411996, + "grad_norm": 0.1698249727487564, + "learning_rate": 0.0001800082499667937, + "loss": 0.0203, + "step": 1558 + }, + { + "epoch": 0.205964923869604, + "grad_norm": 0.1803530901670456, + "learning_rate": 0.00017998331105637148, + "loss": 0.0226, + "step": 1559 + }, + { + "epoch": 0.20609703735508803, + "grad_norm": 0.24770233035087585, + "learning_rate": 0.0001799583583303277, + "loss": 0.0187, + "step": 1560 + }, + { + "epoch": 0.20622915084057206, + "grad_norm": 0.09762029349803925, + "learning_rate": 0.0001799333917929725, + "loss": 0.0057, + "step": 1561 + }, + { + "epoch": 0.2063612643260561, + "grad_norm": 0.1758822500705719, + "learning_rate": 0.00017990841144861845, + "loss": 0.0139, + "step": 1562 + }, + { + "epoch": 0.20649337781154012, + "grad_norm": 0.24245113134384155, + "learning_rate": 0.00017988341730158037, + "loss": 0.0368, + "step": 1563 + }, + { + "epoch": 0.20662549129702415, + "grad_norm": 0.21039935946464539, + "learning_rate": 0.00017985840935617558, + "loss": 0.0267, + "step": 1564 + }, + { + "epoch": 0.20675760478250818, + "grad_norm": 0.12615539133548737, + "learning_rate": 0.00017983338761672367, + "loss": 0.0162, + "step": 1565 + }, + { + "epoch": 0.2068897182679922, + "grad_norm": 0.21313214302062988, + "learning_rate": 0.00017980835208754675, + "loss": 0.0256, + "step": 1566 + }, + { + "epoch": 0.20702183175347624, + "grad_norm": 0.18838942050933838, + "learning_rate": 0.00017978330277296917, + "loss": 0.0268, + "step": 1567 + }, + { + "epoch": 0.20715394523896027, + "grad_norm": 0.2906305491924286, + "learning_rate": 0.00017975823967731778, + "loss": 0.0321, + "step": 1568 + }, + { + "epoch": 0.2072860587244443, + "grad_norm": 0.2979860305786133, + "learning_rate": 0.00017973316280492173, + "loss": 0.0415, + "step": 1569 + }, + { + "epoch": 0.20741817220992834, + "grad_norm": 0.19764262437820435, + "learning_rate": 0.00017970807216011262, + "loss": 0.0201, + "step": 1570 + }, + { + "epoch": 0.20755028569541237, + "grad_norm": 0.22368620336055756, + "learning_rate": 0.00017968296774722436, + "loss": 0.0385, + "step": 1571 + }, + { + "epoch": 0.2076823991808964, + "grad_norm": 0.1834963858127594, + "learning_rate": 0.0001796578495705933, + "loss": 0.0175, + "step": 1572 + }, + { + "epoch": 0.20781451266638043, + "grad_norm": 0.20226474106311798, + "learning_rate": 0.0001796327176345581, + "loss": 0.0202, + "step": 1573 + }, + { + "epoch": 0.20794662615186446, + "grad_norm": 0.21934787929058075, + "learning_rate": 0.00017960757194345983, + "loss": 0.0244, + "step": 1574 + }, + { + "epoch": 0.2080787396373485, + "grad_norm": 0.21891097724437714, + "learning_rate": 0.00017958241250164196, + "loss": 0.0183, + "step": 1575 + }, + { + "epoch": 0.20821085312283252, + "grad_norm": 0.21612125635147095, + "learning_rate": 0.0001795572393134503, + "loss": 0.0305, + "step": 1576 + }, + { + "epoch": 0.20834296660831655, + "grad_norm": 0.18277354538440704, + "learning_rate": 0.00017953205238323305, + "loss": 0.021, + "step": 1577 + }, + { + "epoch": 0.20847508009380059, + "grad_norm": 0.1836683601140976, + "learning_rate": 0.0001795068517153408, + "loss": 0.0272, + "step": 1578 + }, + { + "epoch": 0.20860719357928462, + "grad_norm": 0.22064363956451416, + "learning_rate": 0.00017948163731412647, + "loss": 0.0257, + "step": 1579 + }, + { + "epoch": 0.20873930706476865, + "grad_norm": 0.24008288979530334, + "learning_rate": 0.00017945640918394536, + "loss": 0.0249, + "step": 1580 + }, + { + "epoch": 0.20887142055025268, + "grad_norm": 0.32040950655937195, + "learning_rate": 0.00017943116732915522, + "loss": 0.0238, + "step": 1581 + }, + { + "epoch": 0.2090035340357367, + "grad_norm": 0.2716364562511444, + "learning_rate": 0.00017940591175411602, + "loss": 0.0323, + "step": 1582 + }, + { + "epoch": 0.20913564752122074, + "grad_norm": 0.2920228838920593, + "learning_rate": 0.0001793806424631903, + "loss": 0.0272, + "step": 1583 + }, + { + "epoch": 0.20926776100670477, + "grad_norm": 0.30364248156547546, + "learning_rate": 0.00017935535946074277, + "loss": 0.028, + "step": 1584 + }, + { + "epoch": 0.2093998744921888, + "grad_norm": 0.19998852908611298, + "learning_rate": 0.00017933006275114058, + "loss": 0.026, + "step": 1585 + }, + { + "epoch": 0.20953198797767283, + "grad_norm": 0.23407015204429626, + "learning_rate": 0.00017930475233875334, + "loss": 0.0262, + "step": 1586 + }, + { + "epoch": 0.20966410146315687, + "grad_norm": 0.31622499227523804, + "learning_rate": 0.00017927942822795295, + "loss": 0.0443, + "step": 1587 + }, + { + "epoch": 0.20979621494864087, + "grad_norm": 0.2934754192829132, + "learning_rate": 0.0001792540904231136, + "loss": 0.0319, + "step": 1588 + }, + { + "epoch": 0.2099283284341249, + "grad_norm": 0.21690823137760162, + "learning_rate": 0.00017922873892861198, + "loss": 0.0333, + "step": 1589 + }, + { + "epoch": 0.21006044191960893, + "grad_norm": 0.20228062570095062, + "learning_rate": 0.00017920337374882707, + "loss": 0.0179, + "step": 1590 + }, + { + "epoch": 0.21019255540509296, + "grad_norm": 0.19345952570438385, + "learning_rate": 0.00017917799488814022, + "loss": 0.0169, + "step": 1591 + }, + { + "epoch": 0.210324668890577, + "grad_norm": 0.22918011248111725, + "learning_rate": 0.00017915260235093516, + "loss": 0.0198, + "step": 1592 + }, + { + "epoch": 0.21045678237606102, + "grad_norm": 0.26978784799575806, + "learning_rate": 0.000179127196141598, + "loss": 0.0354, + "step": 1593 + }, + { + "epoch": 0.21058889586154506, + "grad_norm": 0.19669629633426666, + "learning_rate": 0.00017910177626451716, + "loss": 0.0204, + "step": 1594 + }, + { + "epoch": 0.2107210093470291, + "grad_norm": 0.2608644962310791, + "learning_rate": 0.00017907634272408348, + "loss": 0.0236, + "step": 1595 + }, + { + "epoch": 0.21085312283251312, + "grad_norm": 0.3554353713989258, + "learning_rate": 0.00017905089552469006, + "loss": 0.0375, + "step": 1596 + }, + { + "epoch": 0.21098523631799715, + "grad_norm": 0.21859480440616608, + "learning_rate": 0.00017902543467073251, + "loss": 0.0349, + "step": 1597 + }, + { + "epoch": 0.21111734980348118, + "grad_norm": 0.21692459285259247, + "learning_rate": 0.00017899996016660868, + "loss": 0.0291, + "step": 1598 + }, + { + "epoch": 0.2112494632889652, + "grad_norm": 0.1959286630153656, + "learning_rate": 0.00017897447201671883, + "loss": 0.0299, + "step": 1599 + }, + { + "epoch": 0.21138157677444924, + "grad_norm": 0.21266604959964752, + "learning_rate": 0.00017894897022546552, + "loss": 0.0291, + "step": 1600 + }, + { + "epoch": 0.21151369025993327, + "grad_norm": 0.36550846695899963, + "learning_rate": 0.00017892345479725373, + "loss": 0.0302, + "step": 1601 + }, + { + "epoch": 0.2116458037454173, + "grad_norm": 0.25008660554885864, + "learning_rate": 0.0001788979257364908, + "loss": 0.0293, + "step": 1602 + }, + { + "epoch": 0.21177791723090134, + "grad_norm": 0.21736475825309753, + "learning_rate": 0.00017887238304758633, + "loss": 0.0166, + "step": 1603 + }, + { + "epoch": 0.21191003071638537, + "grad_norm": 0.19466808438301086, + "learning_rate": 0.00017884682673495244, + "loss": 0.025, + "step": 1604 + }, + { + "epoch": 0.2120421442018694, + "grad_norm": 0.22048614919185638, + "learning_rate": 0.00017882125680300344, + "loss": 0.0217, + "step": 1605 + }, + { + "epoch": 0.21217425768735343, + "grad_norm": 0.2242555469274521, + "learning_rate": 0.00017879567325615605, + "loss": 0.0216, + "step": 1606 + }, + { + "epoch": 0.21230637117283746, + "grad_norm": 0.17746785283088684, + "learning_rate": 0.00017877007609882938, + "loss": 0.0173, + "step": 1607 + }, + { + "epoch": 0.2124384846583215, + "grad_norm": 0.3593146502971649, + "learning_rate": 0.00017874446533544484, + "loss": 0.0235, + "step": 1608 + }, + { + "epoch": 0.21257059814380552, + "grad_norm": 0.22502164542675018, + "learning_rate": 0.0001787188409704262, + "loss": 0.0221, + "step": 1609 + }, + { + "epoch": 0.21270271162928955, + "grad_norm": 0.2735959589481354, + "learning_rate": 0.00017869320300819967, + "loss": 0.0367, + "step": 1610 + }, + { + "epoch": 0.21283482511477358, + "grad_norm": 0.4019820988178253, + "learning_rate": 0.00017866755145319366, + "loss": 0.0412, + "step": 1611 + }, + { + "epoch": 0.21296693860025762, + "grad_norm": 0.2791774868965149, + "learning_rate": 0.00017864188630983897, + "loss": 0.0387, + "step": 1612 + }, + { + "epoch": 0.21309905208574165, + "grad_norm": 0.28183433413505554, + "learning_rate": 0.0001786162075825688, + "loss": 0.0318, + "step": 1613 + }, + { + "epoch": 0.21323116557122568, + "grad_norm": 0.20986947417259216, + "learning_rate": 0.0001785905152758187, + "loss": 0.0197, + "step": 1614 + }, + { + "epoch": 0.2133632790567097, + "grad_norm": 0.26751548051834106, + "learning_rate": 0.0001785648093940265, + "loss": 0.0268, + "step": 1615 + }, + { + "epoch": 0.21349539254219374, + "grad_norm": 0.24319139122962952, + "learning_rate": 0.00017853908994163248, + "loss": 0.0338, + "step": 1616 + }, + { + "epoch": 0.21362750602767777, + "grad_norm": 0.18239794671535492, + "learning_rate": 0.00017851335692307905, + "loss": 0.0236, + "step": 1617 + }, + { + "epoch": 0.2137596195131618, + "grad_norm": 0.12741148471832275, + "learning_rate": 0.00017848761034281127, + "loss": 0.0139, + "step": 1618 + }, + { + "epoch": 0.21389173299864583, + "grad_norm": 0.28356799483299255, + "learning_rate": 0.00017846185020527628, + "loss": 0.0263, + "step": 1619 + }, + { + "epoch": 0.21402384648412986, + "grad_norm": 0.18482592701911926, + "learning_rate": 0.00017843607651492368, + "loss": 0.0223, + "step": 1620 + }, + { + "epoch": 0.2141559599696139, + "grad_norm": 0.21175946295261383, + "learning_rate": 0.00017841028927620544, + "loss": 0.0304, + "step": 1621 + }, + { + "epoch": 0.21428807345509793, + "grad_norm": 0.23961223661899567, + "learning_rate": 0.00017838448849357574, + "loss": 0.0377, + "step": 1622 + }, + { + "epoch": 0.21442018694058196, + "grad_norm": 0.2638351023197174, + "learning_rate": 0.00017835867417149127, + "loss": 0.0268, + "step": 1623 + }, + { + "epoch": 0.214552300426066, + "grad_norm": 0.16874513030052185, + "learning_rate": 0.0001783328463144109, + "loss": 0.0157, + "step": 1624 + }, + { + "epoch": 0.21468441391155002, + "grad_norm": 0.22855417430400848, + "learning_rate": 0.00017830700492679595, + "loss": 0.0269, + "step": 1625 + }, + { + "epoch": 0.21481652739703405, + "grad_norm": 0.23026318848133087, + "learning_rate": 0.00017828115001311003, + "loss": 0.0193, + "step": 1626 + }, + { + "epoch": 0.21494864088251808, + "grad_norm": 0.1460496187210083, + "learning_rate": 0.00017825528157781908, + "loss": 0.0164, + "step": 1627 + }, + { + "epoch": 0.21508075436800211, + "grad_norm": 0.15897943079471588, + "learning_rate": 0.00017822939962539142, + "loss": 0.0219, + "step": 1628 + }, + { + "epoch": 0.21521286785348615, + "grad_norm": 0.32363682985305786, + "learning_rate": 0.00017820350416029762, + "loss": 0.0342, + "step": 1629 + }, + { + "epoch": 0.21534498133897018, + "grad_norm": 0.2849627733230591, + "learning_rate": 0.0001781775951870107, + "loss": 0.0333, + "step": 1630 + }, + { + "epoch": 0.2154770948244542, + "grad_norm": 0.22426821291446686, + "learning_rate": 0.00017815167271000587, + "loss": 0.0349, + "step": 1631 + }, + { + "epoch": 0.21560920830993824, + "grad_norm": 0.18805508315563202, + "learning_rate": 0.00017812573673376086, + "loss": 0.0321, + "step": 1632 + }, + { + "epoch": 0.21574132179542227, + "grad_norm": 0.13759900629520416, + "learning_rate": 0.00017809978726275553, + "loss": 0.0159, + "step": 1633 + }, + { + "epoch": 0.2158734352809063, + "grad_norm": 0.17568650841712952, + "learning_rate": 0.00017807382430147221, + "loss": 0.0268, + "step": 1634 + }, + { + "epoch": 0.21600554876639033, + "grad_norm": 0.1804172843694687, + "learning_rate": 0.00017804784785439552, + "loss": 0.0237, + "step": 1635 + }, + { + "epoch": 0.21613766225187436, + "grad_norm": 0.18389666080474854, + "learning_rate": 0.0001780218579260124, + "loss": 0.019, + "step": 1636 + }, + { + "epoch": 0.2162697757373584, + "grad_norm": 0.3894127607345581, + "learning_rate": 0.00017799585452081212, + "loss": 0.0375, + "step": 1637 + }, + { + "epoch": 0.21640188922284243, + "grad_norm": 0.21016249060630798, + "learning_rate": 0.00017796983764328627, + "loss": 0.0217, + "step": 1638 + }, + { + "epoch": 0.21653400270832646, + "grad_norm": 0.22238297760486603, + "learning_rate": 0.0001779438072979288, + "loss": 0.0208, + "step": 1639 + }, + { + "epoch": 0.2166661161938105, + "grad_norm": 0.22341783344745636, + "learning_rate": 0.00017791776348923593, + "loss": 0.0332, + "step": 1640 + }, + { + "epoch": 0.21679822967929452, + "grad_norm": 0.19672025740146637, + "learning_rate": 0.00017789170622170626, + "loss": 0.0135, + "step": 1641 + }, + { + "epoch": 0.21693034316477855, + "grad_norm": 0.3600621223449707, + "learning_rate": 0.00017786563549984074, + "loss": 0.0325, + "step": 1642 + }, + { + "epoch": 0.21706245665026258, + "grad_norm": 0.1935514360666275, + "learning_rate": 0.00017783955132814257, + "loss": 0.0223, + "step": 1643 + }, + { + "epoch": 0.2171945701357466, + "grad_norm": 0.19862335920333862, + "learning_rate": 0.00017781345371111726, + "loss": 0.0241, + "step": 1644 + }, + { + "epoch": 0.21732668362123064, + "grad_norm": 0.2320193350315094, + "learning_rate": 0.0001777873426532727, + "loss": 0.0311, + "step": 1645 + }, + { + "epoch": 0.21745879710671467, + "grad_norm": 0.3330860435962677, + "learning_rate": 0.00017776121815911915, + "loss": 0.0334, + "step": 1646 + }, + { + "epoch": 0.2175909105921987, + "grad_norm": 0.23526433110237122, + "learning_rate": 0.00017773508023316909, + "loss": 0.0337, + "step": 1647 + }, + { + "epoch": 0.21772302407768274, + "grad_norm": 0.22359345853328705, + "learning_rate": 0.00017770892887993735, + "loss": 0.0213, + "step": 1648 + }, + { + "epoch": 0.21785513756316677, + "grad_norm": 0.18130530416965485, + "learning_rate": 0.0001776827641039411, + "loss": 0.0235, + "step": 1649 + }, + { + "epoch": 0.2179872510486508, + "grad_norm": 0.2357502579689026, + "learning_rate": 0.00017765658590969977, + "loss": 0.0275, + "step": 1650 + }, + { + "epoch": 0.21811936453413483, + "grad_norm": 0.21433457732200623, + "learning_rate": 0.00017763039430173522, + "loss": 0.0355, + "step": 1651 + }, + { + "epoch": 0.21825147801961886, + "grad_norm": 0.38811975717544556, + "learning_rate": 0.00017760418928457149, + "loss": 0.0403, + "step": 1652 + }, + { + "epoch": 0.2183835915051029, + "grad_norm": 0.22088828682899475, + "learning_rate": 0.0001775779708627351, + "loss": 0.0258, + "step": 1653 + }, + { + "epoch": 0.21851570499058692, + "grad_norm": 0.1678798496723175, + "learning_rate": 0.0001775517390407547, + "loss": 0.0196, + "step": 1654 + }, + { + "epoch": 0.21864781847607095, + "grad_norm": 0.1645585298538208, + "learning_rate": 0.00017752549382316142, + "loss": 0.0195, + "step": 1655 + }, + { + "epoch": 0.21877993196155499, + "grad_norm": 0.40038180351257324, + "learning_rate": 0.00017749923521448858, + "loss": 0.0281, + "step": 1656 + }, + { + "epoch": 0.21891204544703902, + "grad_norm": 0.24812577664852142, + "learning_rate": 0.0001774729632192719, + "loss": 0.0308, + "step": 1657 + }, + { + "epoch": 0.21904415893252305, + "grad_norm": 0.2897437810897827, + "learning_rate": 0.00017744667784204933, + "loss": 0.0366, + "step": 1658 + }, + { + "epoch": 0.21917627241800708, + "grad_norm": 0.32864317297935486, + "learning_rate": 0.0001774203790873612, + "loss": 0.0446, + "step": 1659 + }, + { + "epoch": 0.2193083859034911, + "grad_norm": 0.23558707535266876, + "learning_rate": 0.00017739406695975015, + "loss": 0.0205, + "step": 1660 + }, + { + "epoch": 0.21944049938897514, + "grad_norm": 0.1841057389974594, + "learning_rate": 0.0001773677414637611, + "loss": 0.0269, + "step": 1661 + }, + { + "epoch": 0.21957261287445917, + "grad_norm": 0.19524601101875305, + "learning_rate": 0.00017734140260394126, + "loss": 0.0277, + "step": 1662 + }, + { + "epoch": 0.2197047263599432, + "grad_norm": 0.14408576488494873, + "learning_rate": 0.0001773150503848402, + "loss": 0.0124, + "step": 1663 + }, + { + "epoch": 0.21983683984542723, + "grad_norm": 0.16591165959835052, + "learning_rate": 0.00017728868481100977, + "loss": 0.0221, + "step": 1664 + }, + { + "epoch": 0.21996895333091127, + "grad_norm": 0.15631432831287384, + "learning_rate": 0.00017726230588700412, + "loss": 0.0242, + "step": 1665 + }, + { + "epoch": 0.2201010668163953, + "grad_norm": 0.12172911316156387, + "learning_rate": 0.0001772359136173797, + "loss": 0.0191, + "step": 1666 + }, + { + "epoch": 0.22023318030187933, + "grad_norm": 0.14911390841007233, + "learning_rate": 0.00017720950800669533, + "loss": 0.0173, + "step": 1667 + }, + { + "epoch": 0.22036529378736336, + "grad_norm": 0.27882853150367737, + "learning_rate": 0.0001771830890595121, + "loss": 0.0553, + "step": 1668 + }, + { + "epoch": 0.22049740727284736, + "grad_norm": 0.2508869171142578, + "learning_rate": 0.0001771566567803933, + "loss": 0.032, + "step": 1669 + }, + { + "epoch": 0.2206295207583314, + "grad_norm": 0.1741028130054474, + "learning_rate": 0.00017713021117390465, + "loss": 0.0189, + "step": 1670 + }, + { + "epoch": 0.22076163424381542, + "grad_norm": 0.256998747587204, + "learning_rate": 0.00017710375224461416, + "loss": 0.023, + "step": 1671 + }, + { + "epoch": 0.22089374772929946, + "grad_norm": 0.18835949897766113, + "learning_rate": 0.00017707727999709207, + "loss": 0.0146, + "step": 1672 + }, + { + "epoch": 0.2210258612147835, + "grad_norm": 0.22253838181495667, + "learning_rate": 0.00017705079443591104, + "loss": 0.0212, + "step": 1673 + }, + { + "epoch": 0.22115797470026752, + "grad_norm": 0.15436488389968872, + "learning_rate": 0.0001770242955656459, + "loss": 0.0181, + "step": 1674 + }, + { + "epoch": 0.22129008818575155, + "grad_norm": 0.24255327880382538, + "learning_rate": 0.00017699778339087384, + "loss": 0.0284, + "step": 1675 + }, + { + "epoch": 0.22142220167123558, + "grad_norm": 0.23516564071178436, + "learning_rate": 0.00017697125791617434, + "loss": 0.0184, + "step": 1676 + }, + { + "epoch": 0.2215543151567196, + "grad_norm": 0.15337681770324707, + "learning_rate": 0.0001769447191461292, + "loss": 0.0159, + "step": 1677 + }, + { + "epoch": 0.22168642864220364, + "grad_norm": 0.16270548105239868, + "learning_rate": 0.00017691816708532247, + "loss": 0.0171, + "step": 1678 + }, + { + "epoch": 0.22181854212768767, + "grad_norm": 0.19286991655826569, + "learning_rate": 0.00017689160173834054, + "loss": 0.0166, + "step": 1679 + }, + { + "epoch": 0.2219506556131717, + "grad_norm": 0.1711857169866562, + "learning_rate": 0.0001768650231097721, + "loss": 0.0271, + "step": 1680 + }, + { + "epoch": 0.22208276909865574, + "grad_norm": 0.30671536922454834, + "learning_rate": 0.00017683843120420804, + "loss": 0.027, + "step": 1681 + }, + { + "epoch": 0.22221488258413977, + "grad_norm": 0.2321193814277649, + "learning_rate": 0.00017681182602624168, + "loss": 0.0084, + "step": 1682 + }, + { + "epoch": 0.2223469960696238, + "grad_norm": 0.1957147866487503, + "learning_rate": 0.00017678520758046857, + "loss": 0.0169, + "step": 1683 + }, + { + "epoch": 0.22247910955510783, + "grad_norm": 0.1827395260334015, + "learning_rate": 0.0001767585758714865, + "loss": 0.0147, + "step": 1684 + }, + { + "epoch": 0.22261122304059186, + "grad_norm": 0.1572861671447754, + "learning_rate": 0.00017673193090389562, + "loss": 0.0136, + "step": 1685 + }, + { + "epoch": 0.2227433365260759, + "grad_norm": 0.21025492250919342, + "learning_rate": 0.00017670527268229838, + "loss": 0.0221, + "step": 1686 + }, + { + "epoch": 0.22287545001155992, + "grad_norm": 0.2409404218196869, + "learning_rate": 0.00017667860121129943, + "loss": 0.0229, + "step": 1687 + }, + { + "epoch": 0.22300756349704395, + "grad_norm": 0.20603077113628387, + "learning_rate": 0.0001766519164955058, + "loss": 0.0223, + "step": 1688 + }, + { + "epoch": 0.22313967698252798, + "grad_norm": 0.2035643309354782, + "learning_rate": 0.00017662521853952678, + "loss": 0.0169, + "step": 1689 + }, + { + "epoch": 0.22327179046801202, + "grad_norm": 0.2209031730890274, + "learning_rate": 0.00017659850734797397, + "loss": 0.0341, + "step": 1690 + }, + { + "epoch": 0.22340390395349605, + "grad_norm": 0.25767406821250916, + "learning_rate": 0.00017657178292546118, + "loss": 0.026, + "step": 1691 + }, + { + "epoch": 0.22353601743898008, + "grad_norm": 0.2129616141319275, + "learning_rate": 0.00017654504527660455, + "loss": 0.0163, + "step": 1692 + }, + { + "epoch": 0.2236681309244641, + "grad_norm": 0.28341618180274963, + "learning_rate": 0.00017651829440602258, + "loss": 0.0237, + "step": 1693 + }, + { + "epoch": 0.22380024440994814, + "grad_norm": 0.2251022607088089, + "learning_rate": 0.00017649153031833593, + "loss": 0.025, + "step": 1694 + }, + { + "epoch": 0.22393235789543217, + "grad_norm": 0.2455792874097824, + "learning_rate": 0.00017646475301816755, + "loss": 0.032, + "step": 1695 + }, + { + "epoch": 0.2240644713809162, + "grad_norm": 0.18342851102352142, + "learning_rate": 0.0001764379625101428, + "loss": 0.0188, + "step": 1696 + }, + { + "epoch": 0.22419658486640023, + "grad_norm": 0.2210848480463028, + "learning_rate": 0.0001764111587988892, + "loss": 0.0233, + "step": 1697 + }, + { + "epoch": 0.22432869835188426, + "grad_norm": 0.2323235422372818, + "learning_rate": 0.0001763843418890366, + "loss": 0.0111, + "step": 1698 + }, + { + "epoch": 0.2244608118373683, + "grad_norm": 0.228493332862854, + "learning_rate": 0.00017635751178521716, + "loss": 0.0279, + "step": 1699 + }, + { + "epoch": 0.22459292532285233, + "grad_norm": 0.3134079575538635, + "learning_rate": 0.00017633066849206518, + "loss": 0.0272, + "step": 1700 + }, + { + "epoch": 0.22472503880833636, + "grad_norm": 0.1924765408039093, + "learning_rate": 0.0001763038120142174, + "loss": 0.0229, + "step": 1701 + }, + { + "epoch": 0.2248571522938204, + "grad_norm": 0.4358731806278229, + "learning_rate": 0.00017627694235631278, + "loss": 0.0174, + "step": 1702 + }, + { + "epoch": 0.22498926577930442, + "grad_norm": 0.24608589708805084, + "learning_rate": 0.00017625005952299255, + "loss": 0.0192, + "step": 1703 + }, + { + "epoch": 0.22512137926478845, + "grad_norm": 0.32105961441993713, + "learning_rate": 0.00017622316351890017, + "loss": 0.0254, + "step": 1704 + }, + { + "epoch": 0.22525349275027248, + "grad_norm": 0.33401522040367126, + "learning_rate": 0.0001761962543486815, + "loss": 0.0393, + "step": 1705 + }, + { + "epoch": 0.2253856062357565, + "grad_norm": 0.34425655007362366, + "learning_rate": 0.00017616933201698452, + "loss": 0.0305, + "step": 1706 + }, + { + "epoch": 0.22551771972124054, + "grad_norm": 0.26156890392303467, + "learning_rate": 0.0001761423965284596, + "loss": 0.0349, + "step": 1707 + }, + { + "epoch": 0.22564983320672458, + "grad_norm": 0.22440393269062042, + "learning_rate": 0.00017611544788775937, + "loss": 0.0278, + "step": 1708 + }, + { + "epoch": 0.2257819466922086, + "grad_norm": 0.4424617886543274, + "learning_rate": 0.0001760884860995386, + "loss": 0.0276, + "step": 1709 + }, + { + "epoch": 0.22591406017769264, + "grad_norm": 0.1829511821269989, + "learning_rate": 0.00017606151116845458, + "loss": 0.0251, + "step": 1710 + }, + { + "epoch": 0.22604617366317667, + "grad_norm": 0.21703237295150757, + "learning_rate": 0.0001760345230991666, + "loss": 0.0191, + "step": 1711 + }, + { + "epoch": 0.2261782871486607, + "grad_norm": 0.23436713218688965, + "learning_rate": 0.0001760075218963364, + "loss": 0.0269, + "step": 1712 + }, + { + "epoch": 0.22631040063414473, + "grad_norm": 0.5055571794509888, + "learning_rate": 0.00017598050756462795, + "loss": 0.0277, + "step": 1713 + }, + { + "epoch": 0.22644251411962876, + "grad_norm": 0.23705795407295227, + "learning_rate": 0.0001759534801087074, + "loss": 0.0209, + "step": 1714 + }, + { + "epoch": 0.2265746276051128, + "grad_norm": 0.2245938926935196, + "learning_rate": 0.00017592643953324332, + "loss": 0.0143, + "step": 1715 + }, + { + "epoch": 0.22670674109059682, + "grad_norm": 0.24844376742839813, + "learning_rate": 0.0001758993858429064, + "loss": 0.0324, + "step": 1716 + }, + { + "epoch": 0.22683885457608086, + "grad_norm": 0.26490744948387146, + "learning_rate": 0.0001758723190423697, + "loss": 0.0295, + "step": 1717 + }, + { + "epoch": 0.2269709680615649, + "grad_norm": 0.3163183033466339, + "learning_rate": 0.00017584523913630851, + "loss": 0.046, + "step": 1718 + }, + { + "epoch": 0.22710308154704892, + "grad_norm": 0.27821382880210876, + "learning_rate": 0.00017581814612940036, + "loss": 0.0419, + "step": 1719 + }, + { + "epoch": 0.22723519503253295, + "grad_norm": 0.16733022034168243, + "learning_rate": 0.00017579104002632504, + "loss": 0.0228, + "step": 1720 + }, + { + "epoch": 0.22736730851801698, + "grad_norm": 0.3289214074611664, + "learning_rate": 0.00017576392083176466, + "loss": 0.0416, + "step": 1721 + }, + { + "epoch": 0.227499422003501, + "grad_norm": 0.205794095993042, + "learning_rate": 0.0001757367885504035, + "loss": 0.016, + "step": 1722 + }, + { + "epoch": 0.22763153548898504, + "grad_norm": 0.2669104337692261, + "learning_rate": 0.0001757096431869282, + "loss": 0.0335, + "step": 1723 + }, + { + "epoch": 0.22776364897446907, + "grad_norm": 0.22329342365264893, + "learning_rate": 0.0001756824847460276, + "loss": 0.021, + "step": 1724 + }, + { + "epoch": 0.2278957624599531, + "grad_norm": 0.2545218765735626, + "learning_rate": 0.00017565531323239286, + "loss": 0.0306, + "step": 1725 + }, + { + "epoch": 0.22802787594543714, + "grad_norm": 0.2121114581823349, + "learning_rate": 0.00017562812865071727, + "loss": 0.0161, + "step": 1726 + }, + { + "epoch": 0.22815998943092117, + "grad_norm": 0.2927865982055664, + "learning_rate": 0.0001756009310056965, + "loss": 0.0574, + "step": 1727 + }, + { + "epoch": 0.2282921029164052, + "grad_norm": 0.18407411873340607, + "learning_rate": 0.00017557372030202844, + "loss": 0.0239, + "step": 1728 + }, + { + "epoch": 0.22842421640188923, + "grad_norm": 0.2130606770515442, + "learning_rate": 0.00017554649654441323, + "loss": 0.0303, + "step": 1729 + }, + { + "epoch": 0.22855632988737326, + "grad_norm": 0.3313738703727722, + "learning_rate": 0.0001755192597375532, + "loss": 0.0481, + "step": 1730 + }, + { + "epoch": 0.2286884433728573, + "grad_norm": 0.24702952802181244, + "learning_rate": 0.00017549200988615311, + "loss": 0.0234, + "step": 1731 + }, + { + "epoch": 0.22882055685834132, + "grad_norm": 0.19129116833209991, + "learning_rate": 0.00017546474699491976, + "loss": 0.0299, + "step": 1732 + }, + { + "epoch": 0.22895267034382535, + "grad_norm": 0.2197098731994629, + "learning_rate": 0.0001754374710685624, + "loss": 0.0359, + "step": 1733 + }, + { + "epoch": 0.22908478382930939, + "grad_norm": 0.1769767850637436, + "learning_rate": 0.00017541018211179236, + "loss": 0.0155, + "step": 1734 + }, + { + "epoch": 0.22921689731479342, + "grad_norm": 0.22637642920017242, + "learning_rate": 0.00017538288012932334, + "loss": 0.0276, + "step": 1735 + }, + { + "epoch": 0.22934901080027745, + "grad_norm": 0.23079346120357513, + "learning_rate": 0.0001753555651258712, + "loss": 0.0204, + "step": 1736 + }, + { + "epoch": 0.22948112428576148, + "grad_norm": 0.3650188148021698, + "learning_rate": 0.00017532823710615417, + "loss": 0.0407, + "step": 1737 + }, + { + "epoch": 0.2296132377712455, + "grad_norm": 0.30013394355773926, + "learning_rate": 0.0001753008960748926, + "loss": 0.0426, + "step": 1738 + }, + { + "epoch": 0.22974535125672954, + "grad_norm": 0.22095589339733124, + "learning_rate": 0.00017527354203680914, + "loss": 0.0312, + "step": 1739 + }, + { + "epoch": 0.22987746474221357, + "grad_norm": 0.29758599400520325, + "learning_rate": 0.00017524617499662873, + "loss": 0.038, + "step": 1740 + }, + { + "epoch": 0.2300095782276976, + "grad_norm": 0.19875991344451904, + "learning_rate": 0.00017521879495907845, + "loss": 0.022, + "step": 1741 + }, + { + "epoch": 0.23014169171318163, + "grad_norm": 0.318978875875473, + "learning_rate": 0.00017519140192888774, + "loss": 0.0352, + "step": 1742 + }, + { + "epoch": 0.23027380519866567, + "grad_norm": 0.2085612267255783, + "learning_rate": 0.00017516399591078823, + "loss": 0.0246, + "step": 1743 + }, + { + "epoch": 0.2304059186841497, + "grad_norm": 0.26876693964004517, + "learning_rate": 0.00017513657690951378, + "loss": 0.0247, + "step": 1744 + }, + { + "epoch": 0.23053803216963373, + "grad_norm": 0.1887684464454651, + "learning_rate": 0.00017510914492980054, + "loss": 0.0325, + "step": 1745 + }, + { + "epoch": 0.23067014565511776, + "grad_norm": 0.16390034556388855, + "learning_rate": 0.0001750816999763868, + "loss": 0.0208, + "step": 1746 + }, + { + "epoch": 0.2308022591406018, + "grad_norm": 0.15387281775474548, + "learning_rate": 0.0001750542420540133, + "loss": 0.0109, + "step": 1747 + }, + { + "epoch": 0.23093437262608582, + "grad_norm": 0.17003598809242249, + "learning_rate": 0.00017502677116742273, + "loss": 0.0165, + "step": 1748 + }, + { + "epoch": 0.23106648611156985, + "grad_norm": 0.19222618639469147, + "learning_rate": 0.0001749992873213602, + "loss": 0.0195, + "step": 1749 + }, + { + "epoch": 0.23119859959705386, + "grad_norm": 0.203046977519989, + "learning_rate": 0.00017497179052057313, + "loss": 0.0217, + "step": 1750 + }, + { + "epoch": 0.2313307130825379, + "grad_norm": 0.23541304469108582, + "learning_rate": 0.000174944280769811, + "loss": 0.0229, + "step": 1751 + }, + { + "epoch": 0.23146282656802192, + "grad_norm": 0.1804003268480301, + "learning_rate": 0.0001749167580738256, + "loss": 0.0274, + "step": 1752 + }, + { + "epoch": 0.23159494005350595, + "grad_norm": 0.2763790786266327, + "learning_rate": 0.00017488922243737103, + "loss": 0.0334, + "step": 1753 + }, + { + "epoch": 0.23172705353898998, + "grad_norm": 0.17673511803150177, + "learning_rate": 0.0001748616738652035, + "loss": 0.0243, + "step": 1754 + }, + { + "epoch": 0.231859167024474, + "grad_norm": 0.23072902858257294, + "learning_rate": 0.00017483411236208149, + "loss": 0.0263, + "step": 1755 + }, + { + "epoch": 0.23199128050995804, + "grad_norm": 0.1810344159603119, + "learning_rate": 0.00017480653793276578, + "loss": 0.0222, + "step": 1756 + }, + { + "epoch": 0.23212339399544207, + "grad_norm": 0.19396093487739563, + "learning_rate": 0.0001747789505820193, + "loss": 0.0265, + "step": 1757 + }, + { + "epoch": 0.2322555074809261, + "grad_norm": 0.2533824145793915, + "learning_rate": 0.00017475135031460727, + "loss": 0.0401, + "step": 1758 + }, + { + "epoch": 0.23238762096641014, + "grad_norm": 0.18334946036338806, + "learning_rate": 0.00017472373713529714, + "loss": 0.0204, + "step": 1759 + }, + { + "epoch": 0.23251973445189417, + "grad_norm": 0.23319047689437866, + "learning_rate": 0.0001746961110488585, + "loss": 0.0294, + "step": 1760 + }, + { + "epoch": 0.2326518479373782, + "grad_norm": 0.2602129876613617, + "learning_rate": 0.00017466847206006335, + "loss": 0.0276, + "step": 1761 + }, + { + "epoch": 0.23278396142286223, + "grad_norm": 0.1965998113155365, + "learning_rate": 0.00017464082017368574, + "loss": 0.0239, + "step": 1762 + }, + { + "epoch": 0.23291607490834626, + "grad_norm": 0.27252528071403503, + "learning_rate": 0.00017461315539450204, + "loss": 0.0349, + "step": 1763 + }, + { + "epoch": 0.2330481883938303, + "grad_norm": 0.20603559911251068, + "learning_rate": 0.00017458547772729075, + "loss": 0.0281, + "step": 1764 + }, + { + "epoch": 0.23318030187931432, + "grad_norm": 0.17887108027935028, + "learning_rate": 0.00017455778717683277, + "loss": 0.0163, + "step": 1765 + }, + { + "epoch": 0.23331241536479835, + "grad_norm": 0.30276936292648315, + "learning_rate": 0.00017453008374791106, + "loss": 0.0526, + "step": 1766 + }, + { + "epoch": 0.23344452885028238, + "grad_norm": 0.37227141857147217, + "learning_rate": 0.00017450236744531093, + "loss": 0.0261, + "step": 1767 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 0.24804899096488953, + "learning_rate": 0.00017447463827381977, + "loss": 0.0391, + "step": 1768 + }, + { + "epoch": 0.23370875582125045, + "grad_norm": 0.2147999256849289, + "learning_rate": 0.00017444689623822735, + "loss": 0.0264, + "step": 1769 + }, + { + "epoch": 0.23384086930673448, + "grad_norm": 0.64190274477005, + "learning_rate": 0.00017441914134332556, + "loss": 0.0392, + "step": 1770 + }, + { + "epoch": 0.2339729827922185, + "grad_norm": 0.24241581559181213, + "learning_rate": 0.0001743913735939085, + "loss": 0.0238, + "step": 1771 + }, + { + "epoch": 0.23410509627770254, + "grad_norm": 0.19723357260227203, + "learning_rate": 0.0001743635929947726, + "loss": 0.023, + "step": 1772 + }, + { + "epoch": 0.23423720976318657, + "grad_norm": 0.16124692559242249, + "learning_rate": 0.0001743357995507164, + "loss": 0.0176, + "step": 1773 + }, + { + "epoch": 0.2343693232486706, + "grad_norm": 0.16757133603096008, + "learning_rate": 0.0001743079932665407, + "loss": 0.0138, + "step": 1774 + }, + { + "epoch": 0.23450143673415463, + "grad_norm": 0.36387279629707336, + "learning_rate": 0.00017428017414704853, + "loss": 0.037, + "step": 1775 + }, + { + "epoch": 0.23463355021963866, + "grad_norm": 0.20576192438602448, + "learning_rate": 0.0001742523421970451, + "loss": 0.0151, + "step": 1776 + }, + { + "epoch": 0.2347656637051227, + "grad_norm": 0.25506216287612915, + "learning_rate": 0.00017422449742133787, + "loss": 0.0311, + "step": 1777 + }, + { + "epoch": 0.23489777719060673, + "grad_norm": 0.20788130164146423, + "learning_rate": 0.0001741966398247365, + "loss": 0.0229, + "step": 1778 + }, + { + "epoch": 0.23502989067609076, + "grad_norm": 0.2590482532978058, + "learning_rate": 0.0001741687694120529, + "loss": 0.0257, + "step": 1779 + }, + { + "epoch": 0.2351620041615748, + "grad_norm": 0.23437197506427765, + "learning_rate": 0.00017414088618810113, + "loss": 0.0246, + "step": 1780 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.23272240161895752, + "learning_rate": 0.00017411299015769754, + "loss": 0.0256, + "step": 1781 + }, + { + "epoch": 0.23542623113254285, + "grad_norm": 0.20408686995506287, + "learning_rate": 0.00017408508132566055, + "loss": 0.0257, + "step": 1782 + }, + { + "epoch": 0.23555834461802688, + "grad_norm": 0.2479366958141327, + "learning_rate": 0.00017405715969681098, + "loss": 0.0357, + "step": 1783 + }, + { + "epoch": 0.2356904581035109, + "grad_norm": 0.19560597836971283, + "learning_rate": 0.00017402922527597173, + "loss": 0.0191, + "step": 1784 + }, + { + "epoch": 0.23582257158899494, + "grad_norm": 0.2366217076778412, + "learning_rate": 0.00017400127806796792, + "loss": 0.0219, + "step": 1785 + }, + { + "epoch": 0.23595468507447898, + "grad_norm": 0.1941092610359192, + "learning_rate": 0.00017397331807762702, + "loss": 0.0298, + "step": 1786 + }, + { + "epoch": 0.236086798559963, + "grad_norm": 0.17476750910282135, + "learning_rate": 0.0001739453453097785, + "loss": 0.0165, + "step": 1787 + }, + { + "epoch": 0.23621891204544704, + "grad_norm": 0.17220233380794525, + "learning_rate": 0.00017391735976925412, + "loss": 0.0218, + "step": 1788 + }, + { + "epoch": 0.23635102553093107, + "grad_norm": 0.21363332867622375, + "learning_rate": 0.0001738893614608879, + "loss": 0.021, + "step": 1789 + }, + { + "epoch": 0.2364831390164151, + "grad_norm": 0.2519174814224243, + "learning_rate": 0.00017386135038951602, + "loss": 0.0352, + "step": 1790 + }, + { + "epoch": 0.23661525250189913, + "grad_norm": 0.23778265714645386, + "learning_rate": 0.0001738333265599769, + "loss": 0.0252, + "step": 1791 + }, + { + "epoch": 0.23674736598738316, + "grad_norm": 0.2772628366947174, + "learning_rate": 0.00017380528997711108, + "loss": 0.0264, + "step": 1792 + }, + { + "epoch": 0.2368794794728672, + "grad_norm": 0.20920205116271973, + "learning_rate": 0.00017377724064576136, + "loss": 0.0277, + "step": 1793 + }, + { + "epoch": 0.23701159295835122, + "grad_norm": 0.2609867453575134, + "learning_rate": 0.00017374917857077276, + "loss": 0.0273, + "step": 1794 + }, + { + "epoch": 0.23714370644383526, + "grad_norm": 0.34655800461769104, + "learning_rate": 0.00017372110375699247, + "loss": 0.0311, + "step": 1795 + }, + { + "epoch": 0.2372758199293193, + "grad_norm": 0.3078354001045227, + "learning_rate": 0.0001736930162092699, + "loss": 0.0495, + "step": 1796 + }, + { + "epoch": 0.23740793341480332, + "grad_norm": 0.3296651542186737, + "learning_rate": 0.0001736649159324566, + "loss": 0.0357, + "step": 1797 + }, + { + "epoch": 0.23754004690028735, + "grad_norm": 0.27013999223709106, + "learning_rate": 0.00017363680293140644, + "loss": 0.0315, + "step": 1798 + }, + { + "epoch": 0.23767216038577138, + "grad_norm": 0.35716310143470764, + "learning_rate": 0.00017360867721097538, + "loss": 0.018, + "step": 1799 + }, + { + "epoch": 0.2378042738712554, + "grad_norm": 0.22914241254329681, + "learning_rate": 0.0001735805387760216, + "loss": 0.0282, + "step": 1800 + }, + { + "epoch": 0.23793638735673944, + "grad_norm": 0.2120794951915741, + "learning_rate": 0.00017355238763140554, + "loss": 0.0293, + "step": 1801 + }, + { + "epoch": 0.23806850084222347, + "grad_norm": 0.24935783445835114, + "learning_rate": 0.00017352422378198973, + "loss": 0.0336, + "step": 1802 + }, + { + "epoch": 0.2382006143277075, + "grad_norm": 0.2915463447570801, + "learning_rate": 0.00017349604723263892, + "loss": 0.03, + "step": 1803 + }, + { + "epoch": 0.23833272781319154, + "grad_norm": 0.21356649696826935, + "learning_rate": 0.00017346785798822017, + "loss": 0.0258, + "step": 1804 + }, + { + "epoch": 0.23846484129867557, + "grad_norm": 0.3095000386238098, + "learning_rate": 0.00017343965605360256, + "loss": 0.0232, + "step": 1805 + }, + { + "epoch": 0.2385969547841596, + "grad_norm": 0.1942126750946045, + "learning_rate": 0.00017341144143365753, + "loss": 0.0218, + "step": 1806 + }, + { + "epoch": 0.23872906826964363, + "grad_norm": 0.4227634072303772, + "learning_rate": 0.00017338321413325859, + "loss": 0.0407, + "step": 1807 + }, + { + "epoch": 0.23886118175512766, + "grad_norm": 0.19542503356933594, + "learning_rate": 0.00017335497415728145, + "loss": 0.0193, + "step": 1808 + }, + { + "epoch": 0.2389932952406117, + "grad_norm": 0.18369914591312408, + "learning_rate": 0.00017332672151060404, + "loss": 0.0229, + "step": 1809 + }, + { + "epoch": 0.23912540872609572, + "grad_norm": 0.2526663839817047, + "learning_rate": 0.00017329845619810653, + "loss": 0.041, + "step": 1810 + }, + { + "epoch": 0.23925752221157975, + "grad_norm": 0.20717869699001312, + "learning_rate": 0.00017327017822467119, + "loss": 0.0187, + "step": 1811 + }, + { + "epoch": 0.23938963569706379, + "grad_norm": 0.23846372961997986, + "learning_rate": 0.00017324188759518252, + "loss": 0.0212, + "step": 1812 + }, + { + "epoch": 0.23952174918254782, + "grad_norm": 0.1714830845594406, + "learning_rate": 0.00017321358431452718, + "loss": 0.0239, + "step": 1813 + }, + { + "epoch": 0.23965386266803185, + "grad_norm": 0.18644586205482483, + "learning_rate": 0.00017318526838759404, + "loss": 0.0317, + "step": 1814 + }, + { + "epoch": 0.23978597615351588, + "grad_norm": 0.14925703406333923, + "learning_rate": 0.00017315693981927416, + "loss": 0.0154, + "step": 1815 + }, + { + "epoch": 0.2399180896389999, + "grad_norm": 0.16923202574253082, + "learning_rate": 0.00017312859861446075, + "loss": 0.0229, + "step": 1816 + }, + { + "epoch": 0.24005020312448394, + "grad_norm": 0.30296310782432556, + "learning_rate": 0.00017310024477804926, + "loss": 0.0346, + "step": 1817 + }, + { + "epoch": 0.24018231660996797, + "grad_norm": 0.2446068674325943, + "learning_rate": 0.00017307187831493726, + "loss": 0.0268, + "step": 1818 + }, + { + "epoch": 0.240314430095452, + "grad_norm": 0.26852089166641235, + "learning_rate": 0.0001730434992300245, + "loss": 0.0407, + "step": 1819 + }, + { + "epoch": 0.24044654358093603, + "grad_norm": 0.18075965344905853, + "learning_rate": 0.00017301510752821297, + "loss": 0.024, + "step": 1820 + }, + { + "epoch": 0.24057865706642007, + "grad_norm": 0.1839093118906021, + "learning_rate": 0.0001729867032144068, + "loss": 0.0278, + "step": 1821 + }, + { + "epoch": 0.2407107705519041, + "grad_norm": 0.17958417534828186, + "learning_rate": 0.00017295828629351233, + "loss": 0.0214, + "step": 1822 + }, + { + "epoch": 0.24084288403738813, + "grad_norm": 0.17468973994255066, + "learning_rate": 0.00017292985677043796, + "loss": 0.0236, + "step": 1823 + }, + { + "epoch": 0.24097499752287216, + "grad_norm": 0.27770498394966125, + "learning_rate": 0.00017290141465009447, + "loss": 0.032, + "step": 1824 + }, + { + "epoch": 0.2411071110083562, + "grad_norm": 0.16993235051631927, + "learning_rate": 0.00017287295993739465, + "loss": 0.0146, + "step": 1825 + }, + { + "epoch": 0.24123922449384022, + "grad_norm": 0.2006664276123047, + "learning_rate": 0.00017284449263725354, + "loss": 0.0316, + "step": 1826 + }, + { + "epoch": 0.24137133797932425, + "grad_norm": 0.33198291063308716, + "learning_rate": 0.00017281601275458827, + "loss": 0.0185, + "step": 1827 + }, + { + "epoch": 0.24150345146480828, + "grad_norm": 0.3680172562599182, + "learning_rate": 0.0001727875202943183, + "loss": 0.0266, + "step": 1828 + }, + { + "epoch": 0.24163556495029231, + "grad_norm": 0.18102778494358063, + "learning_rate": 0.00017275901526136512, + "loss": 0.0172, + "step": 1829 + }, + { + "epoch": 0.24176767843577635, + "grad_norm": 0.36317577958106995, + "learning_rate": 0.00017273049766065244, + "loss": 0.029, + "step": 1830 + }, + { + "epoch": 0.24189979192126038, + "grad_norm": 0.15369760990142822, + "learning_rate": 0.00017270196749710618, + "loss": 0.0192, + "step": 1831 + }, + { + "epoch": 0.24203190540674438, + "grad_norm": 0.16851592063903809, + "learning_rate": 0.00017267342477565433, + "loss": 0.0219, + "step": 1832 + }, + { + "epoch": 0.2421640188922284, + "grad_norm": 0.22588814795017242, + "learning_rate": 0.00017264486950122716, + "loss": 0.0261, + "step": 1833 + }, + { + "epoch": 0.24229613237771244, + "grad_norm": 0.20349568128585815, + "learning_rate": 0.000172616301678757, + "loss": 0.0323, + "step": 1834 + }, + { + "epoch": 0.24242824586319647, + "grad_norm": 0.17781049013137817, + "learning_rate": 0.00017258772131317852, + "loss": 0.0319, + "step": 1835 + }, + { + "epoch": 0.2425603593486805, + "grad_norm": 0.25952428579330444, + "learning_rate": 0.00017255912840942833, + "loss": 0.0293, + "step": 1836 + }, + { + "epoch": 0.24269247283416454, + "grad_norm": 0.23489899933338165, + "learning_rate": 0.00017253052297244538, + "loss": 0.0387, + "step": 1837 + }, + { + "epoch": 0.24282458631964857, + "grad_norm": 0.3075209856033325, + "learning_rate": 0.00017250190500717075, + "loss": 0.0252, + "step": 1838 + }, + { + "epoch": 0.2429566998051326, + "grad_norm": 0.3768205940723419, + "learning_rate": 0.00017247327451854757, + "loss": 0.0508, + "step": 1839 + }, + { + "epoch": 0.24308881329061663, + "grad_norm": 0.15840856730937958, + "learning_rate": 0.00017244463151152125, + "loss": 0.0137, + "step": 1840 + }, + { + "epoch": 0.24322092677610066, + "grad_norm": 0.31808412075042725, + "learning_rate": 0.0001724159759910394, + "loss": 0.0395, + "step": 1841 + }, + { + "epoch": 0.2433530402615847, + "grad_norm": 0.7312543392181396, + "learning_rate": 0.00017238730796205167, + "loss": 0.0197, + "step": 1842 + }, + { + "epoch": 0.24348515374706872, + "grad_norm": 0.15382172167301178, + "learning_rate": 0.0001723586274295099, + "loss": 0.017, + "step": 1843 + }, + { + "epoch": 0.24361726723255275, + "grad_norm": 0.25445330142974854, + "learning_rate": 0.0001723299343983682, + "loss": 0.0198, + "step": 1844 + }, + { + "epoch": 0.24374938071803678, + "grad_norm": 0.2107013463973999, + "learning_rate": 0.00017230122887358269, + "loss": 0.0278, + "step": 1845 + }, + { + "epoch": 0.24388149420352082, + "grad_norm": 0.25929173827171326, + "learning_rate": 0.0001722725108601117, + "loss": 0.0141, + "step": 1846 + }, + { + "epoch": 0.24401360768900485, + "grad_norm": 0.2503575384616852, + "learning_rate": 0.00017224378036291576, + "loss": 0.022, + "step": 1847 + }, + { + "epoch": 0.24414572117448888, + "grad_norm": 0.24697549641132355, + "learning_rate": 0.00017221503738695757, + "loss": 0.0301, + "step": 1848 + }, + { + "epoch": 0.2442778346599729, + "grad_norm": 0.25922176241874695, + "learning_rate": 0.00017218628193720186, + "loss": 0.0295, + "step": 1849 + }, + { + "epoch": 0.24440994814545694, + "grad_norm": 0.20222648978233337, + "learning_rate": 0.00017215751401861563, + "loss": 0.0134, + "step": 1850 + }, + { + "epoch": 0.24454206163094097, + "grad_norm": 0.19815732538700104, + "learning_rate": 0.00017212873363616803, + "loss": 0.0257, + "step": 1851 + }, + { + "epoch": 0.244674175116425, + "grad_norm": 0.16871704161167145, + "learning_rate": 0.00017209994079483027, + "loss": 0.016, + "step": 1852 + }, + { + "epoch": 0.24480628860190903, + "grad_norm": 0.21894827485084534, + "learning_rate": 0.00017207113549957582, + "loss": 0.0182, + "step": 1853 + }, + { + "epoch": 0.24493840208739306, + "grad_norm": 0.211959108710289, + "learning_rate": 0.00017204231775538027, + "loss": 0.0315, + "step": 1854 + }, + { + "epoch": 0.2450705155728771, + "grad_norm": 0.2773512899875641, + "learning_rate": 0.0001720134875672213, + "loss": 0.0371, + "step": 1855 + }, + { + "epoch": 0.24520262905836113, + "grad_norm": 0.23648853600025177, + "learning_rate": 0.00017198464494007883, + "loss": 0.0279, + "step": 1856 + }, + { + "epoch": 0.24533474254384516, + "grad_norm": 0.569492757320404, + "learning_rate": 0.00017195578987893483, + "loss": 0.0586, + "step": 1857 + }, + { + "epoch": 0.2454668560293292, + "grad_norm": 0.23297181725502014, + "learning_rate": 0.00017192692238877352, + "loss": 0.0214, + "step": 1858 + }, + { + "epoch": 0.24559896951481322, + "grad_norm": 0.28752514719963074, + "learning_rate": 0.0001718980424745812, + "loss": 0.0361, + "step": 1859 + }, + { + "epoch": 0.24573108300029725, + "grad_norm": 0.25073346495628357, + "learning_rate": 0.00017186915014134635, + "loss": 0.0255, + "step": 1860 + }, + { + "epoch": 0.24586319648578128, + "grad_norm": 0.2746346890926361, + "learning_rate": 0.00017184024539405957, + "loss": 0.0297, + "step": 1861 + }, + { + "epoch": 0.2459953099712653, + "grad_norm": 0.2706051170825958, + "learning_rate": 0.0001718113282377136, + "loss": 0.0269, + "step": 1862 + }, + { + "epoch": 0.24612742345674934, + "grad_norm": 0.27474790811538696, + "learning_rate": 0.00017178239867730337, + "loss": 0.0192, + "step": 1863 + }, + { + "epoch": 0.24625953694223338, + "grad_norm": 0.18939033150672913, + "learning_rate": 0.00017175345671782588, + "loss": 0.0306, + "step": 1864 + }, + { + "epoch": 0.2463916504277174, + "grad_norm": 0.16039957106113434, + "learning_rate": 0.00017172450236428035, + "loss": 0.0264, + "step": 1865 + }, + { + "epoch": 0.24652376391320144, + "grad_norm": 0.27674245834350586, + "learning_rate": 0.00017169553562166811, + "loss": 0.0292, + "step": 1866 + }, + { + "epoch": 0.24665587739868547, + "grad_norm": 0.26449286937713623, + "learning_rate": 0.00017166655649499256, + "loss": 0.0411, + "step": 1867 + }, + { + "epoch": 0.2467879908841695, + "grad_norm": 0.24633260071277618, + "learning_rate": 0.00017163756498925938, + "loss": 0.031, + "step": 1868 + }, + { + "epoch": 0.24692010436965353, + "grad_norm": 0.34218135476112366, + "learning_rate": 0.00017160856110947625, + "loss": 0.022, + "step": 1869 + }, + { + "epoch": 0.24705221785513756, + "grad_norm": 0.3518677055835724, + "learning_rate": 0.00017157954486065311, + "loss": 0.0481, + "step": 1870 + }, + { + "epoch": 0.2471843313406216, + "grad_norm": 0.1745559424161911, + "learning_rate": 0.0001715505162478019, + "loss": 0.023, + "step": 1871 + }, + { + "epoch": 0.24731644482610562, + "grad_norm": 0.19944533705711365, + "learning_rate": 0.00017152147527593682, + "loss": 0.0229, + "step": 1872 + }, + { + "epoch": 0.24744855831158966, + "grad_norm": 0.1665343940258026, + "learning_rate": 0.00017149242195007417, + "loss": 0.0192, + "step": 1873 + }, + { + "epoch": 0.2475806717970737, + "grad_norm": 0.19697602093219757, + "learning_rate": 0.0001714633562752323, + "loss": 0.0183, + "step": 1874 + }, + { + "epoch": 0.24771278528255772, + "grad_norm": 0.3007800877094269, + "learning_rate": 0.00017143427825643182, + "loss": 0.0288, + "step": 1875 + }, + { + "epoch": 0.24784489876804175, + "grad_norm": 0.194051593542099, + "learning_rate": 0.0001714051878986954, + "loss": 0.0279, + "step": 1876 + }, + { + "epoch": 0.24797701225352578, + "grad_norm": 0.18599578738212585, + "learning_rate": 0.00017137608520704785, + "loss": 0.0171, + "step": 1877 + }, + { + "epoch": 0.2481091257390098, + "grad_norm": 0.3460446000099182, + "learning_rate": 0.00017134697018651612, + "loss": 0.0382, + "step": 1878 + }, + { + "epoch": 0.24824123922449384, + "grad_norm": 0.23564647138118744, + "learning_rate": 0.00017131784284212927, + "loss": 0.038, + "step": 1879 + }, + { + "epoch": 0.24837335270997787, + "grad_norm": 0.2204003930091858, + "learning_rate": 0.00017128870317891854, + "loss": 0.0343, + "step": 1880 + }, + { + "epoch": 0.2485054661954619, + "grad_norm": 0.1897745579481125, + "learning_rate": 0.00017125955120191725, + "loss": 0.02, + "step": 1881 + }, + { + "epoch": 0.24863757968094594, + "grad_norm": 0.31301870942115784, + "learning_rate": 0.0001712303869161608, + "loss": 0.0233, + "step": 1882 + }, + { + "epoch": 0.24876969316642997, + "grad_norm": 0.2227194607257843, + "learning_rate": 0.00017120121032668687, + "loss": 0.038, + "step": 1883 + }, + { + "epoch": 0.248901806651914, + "grad_norm": 0.25485947728157043, + "learning_rate": 0.0001711720214385351, + "loss": 0.0435, + "step": 1884 + }, + { + "epoch": 0.24903392013739803, + "grad_norm": 0.2976076602935791, + "learning_rate": 0.00017114282025674734, + "loss": 0.0284, + "step": 1885 + }, + { + "epoch": 0.24916603362288206, + "grad_norm": 0.18369312584400177, + "learning_rate": 0.0001711136067863676, + "loss": 0.02, + "step": 1886 + }, + { + "epoch": 0.2492981471083661, + "grad_norm": 0.24397863447666168, + "learning_rate": 0.00017108438103244188, + "loss": 0.0365, + "step": 1887 + }, + { + "epoch": 0.24943026059385012, + "grad_norm": 0.1897364854812622, + "learning_rate": 0.0001710551430000184, + "loss": 0.0183, + "step": 1888 + }, + { + "epoch": 0.24956237407933415, + "grad_norm": 0.24037641286849976, + "learning_rate": 0.00017102589269414758, + "loss": 0.0392, + "step": 1889 + }, + { + "epoch": 0.24969448756481818, + "grad_norm": 0.20401312410831451, + "learning_rate": 0.00017099663011988173, + "loss": 0.0231, + "step": 1890 + }, + { + "epoch": 0.24982660105030222, + "grad_norm": 0.2197640836238861, + "learning_rate": 0.00017096735528227547, + "loss": 0.0214, + "step": 1891 + }, + { + "epoch": 0.24995871453578625, + "grad_norm": 0.4697629511356354, + "learning_rate": 0.0001709380681863855, + "loss": 0.0183, + "step": 1892 + }, + { + "epoch": 0.25009082802127025, + "grad_norm": 0.3376876413822174, + "learning_rate": 0.0001709087688372706, + "loss": 0.0324, + "step": 1893 + }, + { + "epoch": 0.2502229415067543, + "grad_norm": 0.3038407862186432, + "learning_rate": 0.0001708794572399917, + "loss": 0.034, + "step": 1894 + }, + { + "epoch": 0.2503550549922383, + "grad_norm": 0.2301395833492279, + "learning_rate": 0.00017085013339961178, + "loss": 0.0342, + "step": 1895 + }, + { + "epoch": 0.25048716847772234, + "grad_norm": 0.20109249651432037, + "learning_rate": 0.000170820797321196, + "loss": 0.029, + "step": 1896 + }, + { + "epoch": 0.2506192819632064, + "grad_norm": 0.16469945013523102, + "learning_rate": 0.0001707914490098117, + "loss": 0.0158, + "step": 1897 + }, + { + "epoch": 0.2507513954486904, + "grad_norm": 0.174901083111763, + "learning_rate": 0.00017076208847052816, + "loss": 0.0272, + "step": 1898 + }, + { + "epoch": 0.25088350893417444, + "grad_norm": 0.2955951988697052, + "learning_rate": 0.00017073271570841692, + "loss": 0.0432, + "step": 1899 + }, + { + "epoch": 0.25101562241965847, + "grad_norm": 0.47816672921180725, + "learning_rate": 0.0001707033307285515, + "loss": 0.0292, + "step": 1900 + }, + { + "epoch": 0.2511477359051425, + "grad_norm": 0.25690802931785583, + "learning_rate": 0.0001706739335360077, + "loss": 0.018, + "step": 1901 + }, + { + "epoch": 0.25127984939062653, + "grad_norm": 0.37199512124061584, + "learning_rate": 0.00017064452413586328, + "loss": 0.0442, + "step": 1902 + }, + { + "epoch": 0.25141196287611056, + "grad_norm": 0.21501943469047546, + "learning_rate": 0.00017061510253319815, + "loss": 0.0256, + "step": 1903 + }, + { + "epoch": 0.2515440763615946, + "grad_norm": 0.33642202615737915, + "learning_rate": 0.00017058566873309438, + "loss": 0.0381, + "step": 1904 + }, + { + "epoch": 0.2516761898470786, + "grad_norm": 0.1645767092704773, + "learning_rate": 0.0001705562227406361, + "loss": 0.0355, + "step": 1905 + }, + { + "epoch": 0.25180830333256266, + "grad_norm": 0.24579696357250214, + "learning_rate": 0.00017052676456090956, + "loss": 0.0257, + "step": 1906 + }, + { + "epoch": 0.2519404168180467, + "grad_norm": 0.2509557008743286, + "learning_rate": 0.0001704972941990031, + "loss": 0.0295, + "step": 1907 + }, + { + "epoch": 0.2520725303035307, + "grad_norm": 0.23168303072452545, + "learning_rate": 0.00017046781166000716, + "loss": 0.0193, + "step": 1908 + }, + { + "epoch": 0.25220464378901475, + "grad_norm": 0.15511181950569153, + "learning_rate": 0.00017043831694901434, + "loss": 0.024, + "step": 1909 + }, + { + "epoch": 0.2523367572744988, + "grad_norm": 0.24736307561397552, + "learning_rate": 0.00017040881007111925, + "loss": 0.0275, + "step": 1910 + }, + { + "epoch": 0.2524688707599828, + "grad_norm": 0.19190946221351624, + "learning_rate": 0.00017037929103141865, + "loss": 0.0241, + "step": 1911 + }, + { + "epoch": 0.25260098424546684, + "grad_norm": 0.14852671325206757, + "learning_rate": 0.00017034975983501146, + "loss": 0.0174, + "step": 1912 + }, + { + "epoch": 0.2527330977309509, + "grad_norm": 0.42580118775367737, + "learning_rate": 0.00017032021648699858, + "loss": 0.02, + "step": 1913 + }, + { + "epoch": 0.2528652112164349, + "grad_norm": 0.23213063180446625, + "learning_rate": 0.00017029066099248313, + "loss": 0.0464, + "step": 1914 + }, + { + "epoch": 0.25299732470191894, + "grad_norm": 0.40431395173072815, + "learning_rate": 0.00017026109335657022, + "loss": 0.0389, + "step": 1915 + }, + { + "epoch": 0.25312943818740297, + "grad_norm": 0.26642554998397827, + "learning_rate": 0.0001702315135843671, + "loss": 0.0217, + "step": 1916 + }, + { + "epoch": 0.253261551672887, + "grad_norm": 0.3669357895851135, + "learning_rate": 0.0001702019216809832, + "loss": 0.0404, + "step": 1917 + }, + { + "epoch": 0.25339366515837103, + "grad_norm": 0.14874513447284698, + "learning_rate": 0.00017017231765152987, + "loss": 0.0166, + "step": 1918 + }, + { + "epoch": 0.25352577864385506, + "grad_norm": 0.29495328664779663, + "learning_rate": 0.0001701427015011207, + "loss": 0.0228, + "step": 1919 + }, + { + "epoch": 0.2536578921293391, + "grad_norm": 0.3058466613292694, + "learning_rate": 0.00017011307323487132, + "loss": 0.023, + "step": 1920 + }, + { + "epoch": 0.2537900056148231, + "grad_norm": 0.2111206203699112, + "learning_rate": 0.00017008343285789953, + "loss": 0.0319, + "step": 1921 + }, + { + "epoch": 0.25392211910030715, + "grad_norm": 0.26292023062705994, + "learning_rate": 0.000170053780375325, + "loss": 0.0314, + "step": 1922 + }, + { + "epoch": 0.2540542325857912, + "grad_norm": 0.18287408351898193, + "learning_rate": 0.00017002411579226974, + "loss": 0.0282, + "step": 1923 + }, + { + "epoch": 0.2541863460712752, + "grad_norm": 0.21615292131900787, + "learning_rate": 0.00016999443911385774, + "loss": 0.0266, + "step": 1924 + }, + { + "epoch": 0.25431845955675925, + "grad_norm": 0.240324005484581, + "learning_rate": 0.00016996475034521512, + "loss": 0.0378, + "step": 1925 + }, + { + "epoch": 0.2544505730422433, + "grad_norm": 0.24561507999897003, + "learning_rate": 0.00016993504949147, + "loss": 0.0387, + "step": 1926 + }, + { + "epoch": 0.2545826865277273, + "grad_norm": 0.26725414395332336, + "learning_rate": 0.00016990533655775268, + "loss": 0.0189, + "step": 1927 + }, + { + "epoch": 0.25471480001321134, + "grad_norm": 0.21090959012508392, + "learning_rate": 0.0001698756115491955, + "loss": 0.0205, + "step": 1928 + }, + { + "epoch": 0.25484691349869537, + "grad_norm": 0.25421762466430664, + "learning_rate": 0.00016984587447093293, + "loss": 0.0341, + "step": 1929 + }, + { + "epoch": 0.2549790269841794, + "grad_norm": 0.14478729665279388, + "learning_rate": 0.00016981612532810145, + "loss": 0.0202, + "step": 1930 + }, + { + "epoch": 0.25511114046966343, + "grad_norm": 0.18194665014743805, + "learning_rate": 0.0001697863641258397, + "loss": 0.0281, + "step": 1931 + }, + { + "epoch": 0.25524325395514746, + "grad_norm": 0.2574337124824524, + "learning_rate": 0.00016975659086928836, + "loss": 0.0294, + "step": 1932 + }, + { + "epoch": 0.2553753674406315, + "grad_norm": 0.19785122573375702, + "learning_rate": 0.0001697268055635902, + "loss": 0.0201, + "step": 1933 + }, + { + "epoch": 0.2555074809261155, + "grad_norm": 0.37554481625556946, + "learning_rate": 0.00016969700821389008, + "loss": 0.0463, + "step": 1934 + }, + { + "epoch": 0.25563959441159956, + "grad_norm": 0.3940991461277008, + "learning_rate": 0.00016966719882533497, + "loss": 0.0197, + "step": 1935 + }, + { + "epoch": 0.2557717078970836, + "grad_norm": 0.21494190394878387, + "learning_rate": 0.00016963737740307381, + "loss": 0.0378, + "step": 1936 + }, + { + "epoch": 0.2559038213825676, + "grad_norm": 0.24298927187919617, + "learning_rate": 0.00016960754395225775, + "loss": 0.0306, + "step": 1937 + }, + { + "epoch": 0.25603593486805165, + "grad_norm": 0.2017381191253662, + "learning_rate": 0.00016957769847803994, + "loss": 0.021, + "step": 1938 + }, + { + "epoch": 0.2561680483535357, + "grad_norm": 0.21786369383335114, + "learning_rate": 0.00016954784098557565, + "loss": 0.0253, + "step": 1939 + }, + { + "epoch": 0.2563001618390197, + "grad_norm": 0.28797072172164917, + "learning_rate": 0.00016951797148002216, + "loss": 0.0306, + "step": 1940 + }, + { + "epoch": 0.25643227532450374, + "grad_norm": 0.17211690545082092, + "learning_rate": 0.00016948808996653889, + "loss": 0.0189, + "step": 1941 + }, + { + "epoch": 0.2565643888099878, + "grad_norm": 0.2314436137676239, + "learning_rate": 0.00016945819645028731, + "loss": 0.0194, + "step": 1942 + }, + { + "epoch": 0.2566965022954718, + "grad_norm": 0.3278854489326477, + "learning_rate": 0.000169428290936431, + "loss": 0.0297, + "step": 1943 + }, + { + "epoch": 0.25682861578095584, + "grad_norm": 0.1829918473958969, + "learning_rate": 0.00016939837343013552, + "loss": 0.0245, + "step": 1944 + }, + { + "epoch": 0.25696072926643987, + "grad_norm": 0.3587496876716614, + "learning_rate": 0.00016936844393656864, + "loss": 0.027, + "step": 1945 + }, + { + "epoch": 0.2570928427519239, + "grad_norm": 0.20004841685295105, + "learning_rate": 0.0001693385024609, + "loss": 0.018, + "step": 1946 + }, + { + "epoch": 0.25722495623740793, + "grad_norm": 0.24667544662952423, + "learning_rate": 0.00016930854900830156, + "loss": 0.0203, + "step": 1947 + }, + { + "epoch": 0.25735706972289196, + "grad_norm": 0.34233590960502625, + "learning_rate": 0.00016927858358394712, + "loss": 0.0307, + "step": 1948 + }, + { + "epoch": 0.257489183208376, + "grad_norm": 0.16642144322395325, + "learning_rate": 0.00016924860619301271, + "loss": 0.0186, + "step": 1949 + }, + { + "epoch": 0.25762129669386, + "grad_norm": 0.22433346509933472, + "learning_rate": 0.00016921861684067633, + "loss": 0.0145, + "step": 1950 + }, + { + "epoch": 0.25775341017934406, + "grad_norm": 0.9970558285713196, + "learning_rate": 0.0001691886155321181, + "loss": 0.0281, + "step": 1951 + }, + { + "epoch": 0.2578855236648281, + "grad_norm": 0.2507546842098236, + "learning_rate": 0.0001691586022725202, + "loss": 0.0133, + "step": 1952 + }, + { + "epoch": 0.2580176371503121, + "grad_norm": 0.15097548067569733, + "learning_rate": 0.0001691285770670668, + "loss": 0.0192, + "step": 1953 + }, + { + "epoch": 0.25814975063579615, + "grad_norm": 0.22995054721832275, + "learning_rate": 0.0001690985399209442, + "loss": 0.036, + "step": 1954 + }, + { + "epoch": 0.2582818641212802, + "grad_norm": 0.2987491190433502, + "learning_rate": 0.00016906849083934083, + "loss": 0.0299, + "step": 1955 + }, + { + "epoch": 0.2584139776067642, + "grad_norm": 0.3527694344520569, + "learning_rate": 0.00016903842982744704, + "loss": 0.0222, + "step": 1956 + }, + { + "epoch": 0.25854609109224824, + "grad_norm": 0.4488008916378021, + "learning_rate": 0.00016900835689045535, + "loss": 0.0305, + "step": 1957 + }, + { + "epoch": 0.2586782045777323, + "grad_norm": 0.18324284255504608, + "learning_rate": 0.00016897827203356025, + "loss": 0.0204, + "step": 1958 + }, + { + "epoch": 0.2588103180632163, + "grad_norm": 0.303751677274704, + "learning_rate": 0.00016894817526195833, + "loss": 0.0362, + "step": 1959 + }, + { + "epoch": 0.25894243154870034, + "grad_norm": 0.29179519414901733, + "learning_rate": 0.0001689180665808483, + "loss": 0.0364, + "step": 1960 + }, + { + "epoch": 0.25907454503418437, + "grad_norm": 0.16983579099178314, + "learning_rate": 0.00016888794599543089, + "loss": 0.0266, + "step": 1961 + }, + { + "epoch": 0.2592066585196684, + "grad_norm": 0.23435205221176147, + "learning_rate": 0.0001688578135109088, + "loss": 0.0293, + "step": 1962 + }, + { + "epoch": 0.25933877200515243, + "grad_norm": 0.20603543519973755, + "learning_rate": 0.00016882766913248686, + "loss": 0.0168, + "step": 1963 + }, + { + "epoch": 0.25947088549063646, + "grad_norm": 0.32107990980148315, + "learning_rate": 0.000168797512865372, + "loss": 0.0268, + "step": 1964 + }, + { + "epoch": 0.2596029989761205, + "grad_norm": 0.22059336304664612, + "learning_rate": 0.00016876734471477312, + "loss": 0.0187, + "step": 1965 + }, + { + "epoch": 0.2597351124616045, + "grad_norm": 0.23701448738574982, + "learning_rate": 0.00016873716468590117, + "loss": 0.0211, + "step": 1966 + }, + { + "epoch": 0.25986722594708855, + "grad_norm": 0.21445463597774506, + "learning_rate": 0.00016870697278396923, + "loss": 0.0318, + "step": 1967 + }, + { + "epoch": 0.2599993394325726, + "grad_norm": 0.1952168196439743, + "learning_rate": 0.00016867676901419237, + "loss": 0.0284, + "step": 1968 + }, + { + "epoch": 0.2601314529180566, + "grad_norm": 0.2025042474269867, + "learning_rate": 0.00016864655338178777, + "loss": 0.016, + "step": 1969 + }, + { + "epoch": 0.26026356640354065, + "grad_norm": 0.20277979969978333, + "learning_rate": 0.00016861632589197453, + "loss": 0.0275, + "step": 1970 + }, + { + "epoch": 0.2603956798890247, + "grad_norm": 0.18800503015518188, + "learning_rate": 0.00016858608654997395, + "loss": 0.0198, + "step": 1971 + }, + { + "epoch": 0.2605277933745087, + "grad_norm": 0.1867101937532425, + "learning_rate": 0.00016855583536100926, + "loss": 0.0243, + "step": 1972 + }, + { + "epoch": 0.26065990685999274, + "grad_norm": 0.28561997413635254, + "learning_rate": 0.00016852557233030586, + "loss": 0.03, + "step": 1973 + }, + { + "epoch": 0.26079202034547677, + "grad_norm": 0.24048177897930145, + "learning_rate": 0.00016849529746309108, + "loss": 0.0296, + "step": 1974 + }, + { + "epoch": 0.2609241338309608, + "grad_norm": 0.24759027361869812, + "learning_rate": 0.00016846501076459434, + "loss": 0.0327, + "step": 1975 + }, + { + "epoch": 0.26105624731644483, + "grad_norm": 0.27090364694595337, + "learning_rate": 0.00016843471224004704, + "loss": 0.0362, + "step": 1976 + }, + { + "epoch": 0.26118836080192886, + "grad_norm": 0.29066452383995056, + "learning_rate": 0.0001684044018946828, + "loss": 0.0434, + "step": 1977 + }, + { + "epoch": 0.2613204742874129, + "grad_norm": 0.2134980857372284, + "learning_rate": 0.0001683740797337371, + "loss": 0.0242, + "step": 1978 + }, + { + "epoch": 0.2614525877728969, + "grad_norm": 0.256011426448822, + "learning_rate": 0.00016834374576244753, + "loss": 0.0407, + "step": 1979 + }, + { + "epoch": 0.26158470125838096, + "grad_norm": 0.22266724705696106, + "learning_rate": 0.00016831339998605373, + "loss": 0.0241, + "step": 1980 + }, + { + "epoch": 0.261716814743865, + "grad_norm": 0.29499000310897827, + "learning_rate": 0.00016828304240979735, + "loss": 0.0221, + "step": 1981 + }, + { + "epoch": 0.261848928229349, + "grad_norm": 0.19685974717140198, + "learning_rate": 0.0001682526730389221, + "loss": 0.0189, + "step": 1982 + }, + { + "epoch": 0.26198104171483305, + "grad_norm": 0.2072879672050476, + "learning_rate": 0.00016822229187867373, + "loss": 0.0209, + "step": 1983 + }, + { + "epoch": 0.2621131552003171, + "grad_norm": 0.33101019263267517, + "learning_rate": 0.00016819189893429998, + "loss": 0.031, + "step": 1984 + }, + { + "epoch": 0.2622452686858011, + "grad_norm": 0.14240136742591858, + "learning_rate": 0.00016816149421105072, + "loss": 0.0179, + "step": 1985 + }, + { + "epoch": 0.26237738217128515, + "grad_norm": 0.23045289516448975, + "learning_rate": 0.00016813107771417775, + "loss": 0.0238, + "step": 1986 + }, + { + "epoch": 0.2625094956567692, + "grad_norm": 0.2774489223957062, + "learning_rate": 0.000168100649448935, + "loss": 0.0327, + "step": 1987 + }, + { + "epoch": 0.2626416091422532, + "grad_norm": 0.20476806163787842, + "learning_rate": 0.0001680702094205783, + "loss": 0.0251, + "step": 1988 + }, + { + "epoch": 0.26277372262773724, + "grad_norm": 0.28308263421058655, + "learning_rate": 0.0001680397576343657, + "loss": 0.0411, + "step": 1989 + }, + { + "epoch": 0.26290583611322127, + "grad_norm": 0.12856468558311462, + "learning_rate": 0.0001680092940955571, + "loss": 0.0145, + "step": 1990 + }, + { + "epoch": 0.2630379495987053, + "grad_norm": 0.3462192118167877, + "learning_rate": 0.00016797881880941455, + "loss": 0.0345, + "step": 1991 + }, + { + "epoch": 0.26317006308418933, + "grad_norm": 0.18518102169036865, + "learning_rate": 0.00016794833178120205, + "loss": 0.0263, + "step": 1992 + }, + { + "epoch": 0.26330217656967336, + "grad_norm": 0.2983401119709015, + "learning_rate": 0.00016791783301618572, + "loss": 0.0222, + "step": 1993 + }, + { + "epoch": 0.2634342900551574, + "grad_norm": 0.20992180705070496, + "learning_rate": 0.00016788732251963356, + "loss": 0.0222, + "step": 1994 + }, + { + "epoch": 0.2635664035406414, + "grad_norm": 0.24977165460586548, + "learning_rate": 0.0001678568002968158, + "loss": 0.0386, + "step": 1995 + }, + { + "epoch": 0.26369851702612546, + "grad_norm": 0.2119644284248352, + "learning_rate": 0.0001678262663530045, + "loss": 0.0137, + "step": 1996 + }, + { + "epoch": 0.2638306305116095, + "grad_norm": 0.2339543104171753, + "learning_rate": 0.00016779572069347385, + "loss": 0.0329, + "step": 1997 + }, + { + "epoch": 0.2639627439970935, + "grad_norm": 0.25529757142066956, + "learning_rate": 0.00016776516332350005, + "loss": 0.0397, + "step": 1998 + }, + { + "epoch": 0.26409485748257755, + "grad_norm": 0.22438396513462067, + "learning_rate": 0.0001677345942483613, + "loss": 0.0247, + "step": 1999 + }, + { + "epoch": 0.2642269709680616, + "grad_norm": 0.24435588717460632, + "learning_rate": 0.00016770401347333786, + "loss": 0.0139, + "step": 2000 + }, + { + "epoch": 0.2643590844535456, + "grad_norm": 0.2665388286113739, + "learning_rate": 0.00016767342100371195, + "loss": 0.0234, + "step": 2001 + }, + { + "epoch": 0.26449119793902964, + "grad_norm": 0.28348082304000854, + "learning_rate": 0.0001676428168447679, + "loss": 0.0291, + "step": 2002 + }, + { + "epoch": 0.2646233114245137, + "grad_norm": 0.20156100392341614, + "learning_rate": 0.00016761220100179196, + "loss": 0.0266, + "step": 2003 + }, + { + "epoch": 0.2647554249099977, + "grad_norm": 0.21052148938179016, + "learning_rate": 0.00016758157348007246, + "loss": 0.0264, + "step": 2004 + }, + { + "epoch": 0.26488753839548174, + "grad_norm": 0.2750470042228699, + "learning_rate": 0.00016755093428489975, + "loss": 0.048, + "step": 2005 + }, + { + "epoch": 0.26501965188096577, + "grad_norm": 0.2317686378955841, + "learning_rate": 0.0001675202834215661, + "loss": 0.0203, + "step": 2006 + }, + { + "epoch": 0.2651517653664498, + "grad_norm": 0.2592511773109436, + "learning_rate": 0.00016748962089536601, + "loss": 0.0229, + "step": 2007 + }, + { + "epoch": 0.26528387885193383, + "grad_norm": 0.20439298450946808, + "learning_rate": 0.00016745894671159578, + "loss": 0.0217, + "step": 2008 + }, + { + "epoch": 0.26541599233741786, + "grad_norm": 0.1797826588153839, + "learning_rate": 0.00016742826087555375, + "loss": 0.0248, + "step": 2009 + }, + { + "epoch": 0.2655481058229019, + "grad_norm": 0.21490655839443207, + "learning_rate": 0.0001673975633925404, + "loss": 0.0224, + "step": 2010 + }, + { + "epoch": 0.2656802193083859, + "grad_norm": 0.23416008055210114, + "learning_rate": 0.00016736685426785815, + "loss": 0.0201, + "step": 2011 + }, + { + "epoch": 0.26581233279386995, + "grad_norm": 0.2559327483177185, + "learning_rate": 0.00016733613350681137, + "loss": 0.0268, + "step": 2012 + }, + { + "epoch": 0.265944446279354, + "grad_norm": 0.29835590720176697, + "learning_rate": 0.00016730540111470652, + "loss": 0.034, + "step": 2013 + }, + { + "epoch": 0.266076559764838, + "grad_norm": 0.22217904031276703, + "learning_rate": 0.00016727465709685208, + "loss": 0.0296, + "step": 2014 + }, + { + "epoch": 0.26620867325032205, + "grad_norm": 0.23027774691581726, + "learning_rate": 0.00016724390145855846, + "loss": 0.0234, + "step": 2015 + }, + { + "epoch": 0.2663407867358061, + "grad_norm": 0.24703939259052277, + "learning_rate": 0.00016721313420513817, + "loss": 0.0248, + "step": 2016 + }, + { + "epoch": 0.2664729002212901, + "grad_norm": 0.26446980237960815, + "learning_rate": 0.00016718235534190563, + "loss": 0.0216, + "step": 2017 + }, + { + "epoch": 0.26660501370677414, + "grad_norm": 0.16694585978984833, + "learning_rate": 0.0001671515648741773, + "loss": 0.0203, + "step": 2018 + }, + { + "epoch": 0.26673712719225817, + "grad_norm": 0.21245832741260529, + "learning_rate": 0.00016712076280727173, + "loss": 0.0237, + "step": 2019 + }, + { + "epoch": 0.2668692406777422, + "grad_norm": 0.18598133325576782, + "learning_rate": 0.00016708994914650934, + "loss": 0.0207, + "step": 2020 + }, + { + "epoch": 0.26700135416322623, + "grad_norm": 0.24841630458831787, + "learning_rate": 0.00016705912389721267, + "loss": 0.0348, + "step": 2021 + }, + { + "epoch": 0.26713346764871027, + "grad_norm": 0.17656932771205902, + "learning_rate": 0.00016702828706470615, + "loss": 0.0202, + "step": 2022 + }, + { + "epoch": 0.2672655811341943, + "grad_norm": 0.26282253861427307, + "learning_rate": 0.00016699743865431627, + "loss": 0.0303, + "step": 2023 + }, + { + "epoch": 0.2673976946196783, + "grad_norm": 0.2911767065525055, + "learning_rate": 0.00016696657867137156, + "loss": 0.0286, + "step": 2024 + }, + { + "epoch": 0.26752980810516236, + "grad_norm": 0.16267457604408264, + "learning_rate": 0.00016693570712120247, + "loss": 0.0095, + "step": 2025 + }, + { + "epoch": 0.2676619215906464, + "grad_norm": 0.37082239985466003, + "learning_rate": 0.00016690482400914144, + "loss": 0.0298, + "step": 2026 + }, + { + "epoch": 0.2677940350761304, + "grad_norm": 0.18662765622138977, + "learning_rate": 0.00016687392934052305, + "loss": 0.0238, + "step": 2027 + }, + { + "epoch": 0.26792614856161445, + "grad_norm": 0.22798240184783936, + "learning_rate": 0.00016684302312068374, + "loss": 0.0289, + "step": 2028 + }, + { + "epoch": 0.2680582620470985, + "grad_norm": 0.23374059796333313, + "learning_rate": 0.00016681210535496194, + "loss": 0.0281, + "step": 2029 + }, + { + "epoch": 0.2681903755325825, + "grad_norm": 0.19504158198833466, + "learning_rate": 0.00016678117604869815, + "loss": 0.0175, + "step": 2030 + }, + { + "epoch": 0.26832248901806655, + "grad_norm": 0.23058202862739563, + "learning_rate": 0.0001667502352072348, + "loss": 0.0166, + "step": 2031 + }, + { + "epoch": 0.2684546025035506, + "grad_norm": 0.23326201736927032, + "learning_rate": 0.0001667192828359164, + "loss": 0.0317, + "step": 2032 + }, + { + "epoch": 0.2685867159890346, + "grad_norm": 0.2174236923456192, + "learning_rate": 0.00016668831894008936, + "loss": 0.0244, + "step": 2033 + }, + { + "epoch": 0.26871882947451864, + "grad_norm": 0.21321603655815125, + "learning_rate": 0.00016665734352510207, + "loss": 0.0211, + "step": 2034 + }, + { + "epoch": 0.2688509429600026, + "grad_norm": 0.29613959789276123, + "learning_rate": 0.00016662635659630504, + "loss": 0.0411, + "step": 2035 + }, + { + "epoch": 0.26898305644548665, + "grad_norm": 0.2365838587284088, + "learning_rate": 0.00016659535815905064, + "loss": 0.0223, + "step": 2036 + }, + { + "epoch": 0.2691151699309707, + "grad_norm": 0.31860825419425964, + "learning_rate": 0.00016656434821869323, + "loss": 0.035, + "step": 2037 + }, + { + "epoch": 0.2692472834164547, + "grad_norm": 0.17729634046554565, + "learning_rate": 0.00016653332678058928, + "loss": 0.0176, + "step": 2038 + }, + { + "epoch": 0.26937939690193874, + "grad_norm": 0.19119992852210999, + "learning_rate": 0.0001665022938500971, + "loss": 0.018, + "step": 2039 + }, + { + "epoch": 0.26951151038742277, + "grad_norm": 0.18928834795951843, + "learning_rate": 0.0001664712494325771, + "loss": 0.0336, + "step": 2040 + }, + { + "epoch": 0.2696436238729068, + "grad_norm": 0.191539004445076, + "learning_rate": 0.00016644019353339153, + "loss": 0.0225, + "step": 2041 + }, + { + "epoch": 0.26977573735839083, + "grad_norm": 0.3990846872329712, + "learning_rate": 0.00016640912615790483, + "loss": 0.0205, + "step": 2042 + }, + { + "epoch": 0.26990785084387486, + "grad_norm": 0.17020075023174286, + "learning_rate": 0.00016637804731148322, + "loss": 0.0117, + "step": 2043 + }, + { + "epoch": 0.2700399643293589, + "grad_norm": 0.17086337506771088, + "learning_rate": 0.00016634695699949505, + "loss": 0.0297, + "step": 2044 + }, + { + "epoch": 0.2701720778148429, + "grad_norm": 0.16119834780693054, + "learning_rate": 0.00016631585522731054, + "loss": 0.0198, + "step": 2045 + }, + { + "epoch": 0.27030419130032696, + "grad_norm": 0.23394055664539337, + "learning_rate": 0.00016628474200030196, + "loss": 0.0298, + "step": 2046 + }, + { + "epoch": 0.270436304785811, + "grad_norm": 0.21679936349391937, + "learning_rate": 0.0001662536173238436, + "loss": 0.0174, + "step": 2047 + }, + { + "epoch": 0.270568418271295, + "grad_norm": 0.4186050295829773, + "learning_rate": 0.00016622248120331157, + "loss": 0.0397, + "step": 2048 + }, + { + "epoch": 0.27070053175677905, + "grad_norm": 0.22252652049064636, + "learning_rate": 0.0001661913336440841, + "loss": 0.0165, + "step": 2049 + }, + { + "epoch": 0.2708326452422631, + "grad_norm": 0.16150662302970886, + "learning_rate": 0.00016616017465154133, + "loss": 0.0286, + "step": 2050 + }, + { + "epoch": 0.2709647587277471, + "grad_norm": 0.20743726193904877, + "learning_rate": 0.0001661290042310654, + "loss": 0.0216, + "step": 2051 + }, + { + "epoch": 0.27109687221323114, + "grad_norm": 0.2305314987897873, + "learning_rate": 0.0001660978223880404, + "loss": 0.0281, + "step": 2052 + }, + { + "epoch": 0.2712289856987152, + "grad_norm": 0.17613615095615387, + "learning_rate": 0.0001660666291278525, + "loss": 0.0144, + "step": 2053 + }, + { + "epoch": 0.2713610991841992, + "grad_norm": 0.18410120904445648, + "learning_rate": 0.00016603542445588963, + "loss": 0.0183, + "step": 2054 + }, + { + "epoch": 0.27149321266968324, + "grad_norm": 0.16890189051628113, + "learning_rate": 0.0001660042083775419, + "loss": 0.0215, + "step": 2055 + }, + { + "epoch": 0.27162532615516727, + "grad_norm": 0.16772271692752838, + "learning_rate": 0.00016597298089820125, + "loss": 0.0168, + "step": 2056 + }, + { + "epoch": 0.2717574396406513, + "grad_norm": 0.172120600938797, + "learning_rate": 0.00016594174202326167, + "loss": 0.0159, + "step": 2057 + }, + { + "epoch": 0.27188955312613533, + "grad_norm": 0.20007994771003723, + "learning_rate": 0.00016591049175811908, + "loss": 0.0315, + "step": 2058 + }, + { + "epoch": 0.27202166661161936, + "grad_norm": 0.15204408764839172, + "learning_rate": 0.00016587923010817138, + "loss": 0.0112, + "step": 2059 + }, + { + "epoch": 0.2721537800971034, + "grad_norm": 0.26047348976135254, + "learning_rate": 0.00016584795707881846, + "loss": 0.0296, + "step": 2060 + }, + { + "epoch": 0.2722858935825874, + "grad_norm": 0.16377569735050201, + "learning_rate": 0.00016581667267546213, + "loss": 0.0173, + "step": 2061 + }, + { + "epoch": 0.27241800706807145, + "grad_norm": 0.2574384808540344, + "learning_rate": 0.00016578537690350618, + "loss": 0.0282, + "step": 2062 + }, + { + "epoch": 0.2725501205535555, + "grad_norm": 0.2776723802089691, + "learning_rate": 0.00016575406976835637, + "loss": 0.0301, + "step": 2063 + }, + { + "epoch": 0.2726822340390395, + "grad_norm": 0.23735906183719635, + "learning_rate": 0.00016572275127542044, + "loss": 0.0253, + "step": 2064 + }, + { + "epoch": 0.27281434752452355, + "grad_norm": 0.332064688205719, + "learning_rate": 0.00016569142143010805, + "loss": 0.0394, + "step": 2065 + }, + { + "epoch": 0.2729464610100076, + "grad_norm": 0.29127272963523865, + "learning_rate": 0.00016566008023783087, + "loss": 0.0308, + "step": 2066 + }, + { + "epoch": 0.2730785744954916, + "grad_norm": 0.25889071822166443, + "learning_rate": 0.00016562872770400252, + "loss": 0.0247, + "step": 2067 + }, + { + "epoch": 0.27321068798097564, + "grad_norm": 0.17592589557170868, + "learning_rate": 0.0001655973638340385, + "loss": 0.0202, + "step": 2068 + }, + { + "epoch": 0.2733428014664597, + "grad_norm": 0.20446552336215973, + "learning_rate": 0.00016556598863335634, + "loss": 0.0243, + "step": 2069 + }, + { + "epoch": 0.2734749149519437, + "grad_norm": 0.20881833136081696, + "learning_rate": 0.00016553460210737563, + "loss": 0.0297, + "step": 2070 + }, + { + "epoch": 0.27360702843742774, + "grad_norm": 0.17322196066379547, + "learning_rate": 0.00016550320426151767, + "loss": 0.0163, + "step": 2071 + }, + { + "epoch": 0.27373914192291177, + "grad_norm": 0.15687014162540436, + "learning_rate": 0.00016547179510120592, + "loss": 0.0175, + "step": 2072 + }, + { + "epoch": 0.2738712554083958, + "grad_norm": 0.19147679209709167, + "learning_rate": 0.0001654403746318657, + "loss": 0.0176, + "step": 2073 + }, + { + "epoch": 0.27400336889387983, + "grad_norm": 0.2648773193359375, + "learning_rate": 0.00016540894285892432, + "loss": 0.0317, + "step": 2074 + }, + { + "epoch": 0.27413548237936386, + "grad_norm": 0.26171812415122986, + "learning_rate": 0.00016537749978781102, + "loss": 0.0244, + "step": 2075 + }, + { + "epoch": 0.2742675958648479, + "grad_norm": 0.24321086704730988, + "learning_rate": 0.00016534604542395705, + "loss": 0.0123, + "step": 2076 + }, + { + "epoch": 0.2743997093503319, + "grad_norm": 0.23158743977546692, + "learning_rate": 0.00016531457977279548, + "loss": 0.0345, + "step": 2077 + }, + { + "epoch": 0.27453182283581595, + "grad_norm": 0.2649182975292206, + "learning_rate": 0.00016528310283976148, + "loss": 0.0213, + "step": 2078 + }, + { + "epoch": 0.2746639363213, + "grad_norm": 0.21584641933441162, + "learning_rate": 0.00016525161463029208, + "loss": 0.0164, + "step": 2079 + }, + { + "epoch": 0.274796049806784, + "grad_norm": 0.2687309682369232, + "learning_rate": 0.00016522011514982633, + "loss": 0.0124, + "step": 2080 + }, + { + "epoch": 0.27492816329226805, + "grad_norm": 0.20673182606697083, + "learning_rate": 0.00016518860440380503, + "loss": 0.0246, + "step": 2081 + }, + { + "epoch": 0.2750602767777521, + "grad_norm": 0.27289971709251404, + "learning_rate": 0.00016515708239767124, + "loss": 0.0206, + "step": 2082 + }, + { + "epoch": 0.2751923902632361, + "grad_norm": 0.17888841032981873, + "learning_rate": 0.00016512554913686967, + "loss": 0.0197, + "step": 2083 + }, + { + "epoch": 0.27532450374872014, + "grad_norm": 0.16519111394882202, + "learning_rate": 0.0001650940046268472, + "loss": 0.0166, + "step": 2084 + }, + { + "epoch": 0.27545661723420417, + "grad_norm": 0.23809735476970673, + "learning_rate": 0.00016506244887305252, + "loss": 0.0393, + "step": 2085 + }, + { + "epoch": 0.2755887307196882, + "grad_norm": 0.2258623093366623, + "learning_rate": 0.00016503088188093626, + "loss": 0.027, + "step": 2086 + }, + { + "epoch": 0.27572084420517223, + "grad_norm": 0.21751005947589874, + "learning_rate": 0.0001649993036559511, + "loss": 0.0245, + "step": 2087 + }, + { + "epoch": 0.27585295769065626, + "grad_norm": 0.3354283571243286, + "learning_rate": 0.0001649677142035515, + "loss": 0.0269, + "step": 2088 + }, + { + "epoch": 0.2759850711761403, + "grad_norm": 0.3530503511428833, + "learning_rate": 0.000164936113529194, + "loss": 0.0255, + "step": 2089 + }, + { + "epoch": 0.2761171846616243, + "grad_norm": 0.2316848635673523, + "learning_rate": 0.000164904501638337, + "loss": 0.0171, + "step": 2090 + }, + { + "epoch": 0.27624929814710836, + "grad_norm": 0.20455540716648102, + "learning_rate": 0.00016487287853644088, + "loss": 0.027, + "step": 2091 + }, + { + "epoch": 0.2763814116325924, + "grad_norm": 0.2040807604789734, + "learning_rate": 0.00016484124422896796, + "loss": 0.0201, + "step": 2092 + }, + { + "epoch": 0.2765135251180764, + "grad_norm": 0.2649083435535431, + "learning_rate": 0.00016480959872138245, + "loss": 0.0214, + "step": 2093 + }, + { + "epoch": 0.27664563860356045, + "grad_norm": 0.5068869590759277, + "learning_rate": 0.00016477794201915052, + "loss": 0.0265, + "step": 2094 + }, + { + "epoch": 0.2767777520890445, + "grad_norm": 0.26571395993232727, + "learning_rate": 0.00016474627412774027, + "loss": 0.0426, + "step": 2095 + }, + { + "epoch": 0.2769098655745285, + "grad_norm": 0.37573423981666565, + "learning_rate": 0.00016471459505262176, + "loss": 0.0294, + "step": 2096 + }, + { + "epoch": 0.27704197906001254, + "grad_norm": 0.2801987826824188, + "learning_rate": 0.0001646829047992669, + "loss": 0.0207, + "step": 2097 + }, + { + "epoch": 0.2771740925454966, + "grad_norm": 0.32490137219429016, + "learning_rate": 0.00016465120337314968, + "loss": 0.0378, + "step": 2098 + }, + { + "epoch": 0.2773062060309806, + "grad_norm": 0.29033592343330383, + "learning_rate": 0.00016461949077974585, + "loss": 0.0248, + "step": 2099 + }, + { + "epoch": 0.27743831951646464, + "grad_norm": 0.2312108725309372, + "learning_rate": 0.0001645877670245332, + "loss": 0.0358, + "step": 2100 + }, + { + "epoch": 0.27757043300194867, + "grad_norm": 0.2771279215812683, + "learning_rate": 0.0001645560321129914, + "loss": 0.0313, + "step": 2101 + }, + { + "epoch": 0.2777025464874327, + "grad_norm": 0.2926751375198364, + "learning_rate": 0.0001645242860506021, + "loss": 0.0162, + "step": 2102 + }, + { + "epoch": 0.27783465997291673, + "grad_norm": 0.2527139186859131, + "learning_rate": 0.0001644925288428488, + "loss": 0.0244, + "step": 2103 + }, + { + "epoch": 0.27796677345840076, + "grad_norm": 0.20468686521053314, + "learning_rate": 0.000164460760495217, + "loss": 0.0209, + "step": 2104 + }, + { + "epoch": 0.2780988869438848, + "grad_norm": 0.26166459918022156, + "learning_rate": 0.000164428981013194, + "loss": 0.0389, + "step": 2105 + }, + { + "epoch": 0.2782310004293688, + "grad_norm": 0.25610142946243286, + "learning_rate": 0.00016439719040226925, + "loss": 0.0198, + "step": 2106 + }, + { + "epoch": 0.27836311391485286, + "grad_norm": 0.23919682204723358, + "learning_rate": 0.00016436538866793386, + "loss": 0.0256, + "step": 2107 + }, + { + "epoch": 0.2784952274003369, + "grad_norm": 0.29429468512535095, + "learning_rate": 0.00016433357581568107, + "loss": 0.0216, + "step": 2108 + }, + { + "epoch": 0.2786273408858209, + "grad_norm": 0.17881804704666138, + "learning_rate": 0.0001643017518510059, + "loss": 0.0229, + "step": 2109 + }, + { + "epoch": 0.27875945437130495, + "grad_norm": 0.19791945815086365, + "learning_rate": 0.00016426991677940538, + "loss": 0.0165, + "step": 2110 + }, + { + "epoch": 0.278891567856789, + "grad_norm": 0.18446460366249084, + "learning_rate": 0.00016423807060637836, + "loss": 0.0177, + "step": 2111 + }, + { + "epoch": 0.279023681342273, + "grad_norm": 0.19182991981506348, + "learning_rate": 0.0001642062133374258, + "loss": 0.0262, + "step": 2112 + }, + { + "epoch": 0.27915579482775704, + "grad_norm": 0.23441122472286224, + "learning_rate": 0.0001641743449780503, + "loss": 0.0237, + "step": 2113 + }, + { + "epoch": 0.2792879083132411, + "grad_norm": 0.21251298487186432, + "learning_rate": 0.00016414246553375663, + "loss": 0.0247, + "step": 2114 + }, + { + "epoch": 0.2794200217987251, + "grad_norm": 0.19433574378490448, + "learning_rate": 0.0001641105750100513, + "loss": 0.0278, + "step": 2115 + }, + { + "epoch": 0.27955213528420914, + "grad_norm": 0.16363725066184998, + "learning_rate": 0.00016407867341244282, + "loss": 0.0306, + "step": 2116 + }, + { + "epoch": 0.27968424876969317, + "grad_norm": 0.1874036192893982, + "learning_rate": 0.00016404676074644167, + "loss": 0.0299, + "step": 2117 + }, + { + "epoch": 0.2798163622551772, + "grad_norm": 0.27652040123939514, + "learning_rate": 0.00016401483701756003, + "loss": 0.0403, + "step": 2118 + }, + { + "epoch": 0.27994847574066123, + "grad_norm": 0.23305629193782806, + "learning_rate": 0.00016398290223131222, + "loss": 0.0247, + "step": 2119 + }, + { + "epoch": 0.28008058922614526, + "grad_norm": 0.24826198816299438, + "learning_rate": 0.00016395095639321438, + "loss": 0.0175, + "step": 2120 + }, + { + "epoch": 0.2802127027116293, + "grad_norm": 0.16378138959407806, + "learning_rate": 0.0001639189995087845, + "loss": 0.0184, + "step": 2121 + }, + { + "epoch": 0.2803448161971133, + "grad_norm": 0.4624858498573303, + "learning_rate": 0.0001638870315835426, + "loss": 0.0349, + "step": 2122 + }, + { + "epoch": 0.28047692968259735, + "grad_norm": 0.17329254746437073, + "learning_rate": 0.0001638550526230105, + "loss": 0.0355, + "step": 2123 + }, + { + "epoch": 0.2806090431680814, + "grad_norm": 0.258650541305542, + "learning_rate": 0.00016382306263271193, + "loss": 0.0317, + "step": 2124 + }, + { + "epoch": 0.2807411566535654, + "grad_norm": 0.2232353240251541, + "learning_rate": 0.00016379106161817263, + "loss": 0.0315, + "step": 2125 + }, + { + "epoch": 0.28087327013904945, + "grad_norm": 0.17142353951931, + "learning_rate": 0.00016375904958492016, + "loss": 0.0183, + "step": 2126 + }, + { + "epoch": 0.2810053836245335, + "grad_norm": 0.1361655443906784, + "learning_rate": 0.00016372702653848402, + "loss": 0.0169, + "step": 2127 + }, + { + "epoch": 0.2811374971100175, + "grad_norm": 0.3477037250995636, + "learning_rate": 0.00016369499248439554, + "loss": 0.0349, + "step": 2128 + }, + { + "epoch": 0.28126961059550154, + "grad_norm": 0.20569737255573273, + "learning_rate": 0.000163662947428188, + "loss": 0.0244, + "step": 2129 + }, + { + "epoch": 0.28140172408098557, + "grad_norm": 0.17859184741973877, + "learning_rate": 0.0001636308913753967, + "loss": 0.0232, + "step": 2130 + }, + { + "epoch": 0.2815338375664696, + "grad_norm": 0.22174854576587677, + "learning_rate": 0.00016359882433155857, + "loss": 0.0256, + "step": 2131 + }, + { + "epoch": 0.28166595105195363, + "grad_norm": 0.2234748899936676, + "learning_rate": 0.00016356674630221268, + "loss": 0.0192, + "step": 2132 + }, + { + "epoch": 0.28179806453743766, + "grad_norm": 0.3553476333618164, + "learning_rate": 0.0001635346572928999, + "loss": 0.0308, + "step": 2133 + }, + { + "epoch": 0.2819301780229217, + "grad_norm": 0.3469673991203308, + "learning_rate": 0.000163502557309163, + "loss": 0.0322, + "step": 2134 + }, + { + "epoch": 0.2820622915084057, + "grad_norm": 0.27370384335517883, + "learning_rate": 0.00016347044635654662, + "loss": 0.0271, + "step": 2135 + }, + { + "epoch": 0.28219440499388976, + "grad_norm": 0.20264363288879395, + "learning_rate": 0.00016343832444059737, + "loss": 0.016, + "step": 2136 + }, + { + "epoch": 0.2823265184793738, + "grad_norm": 0.1757168471813202, + "learning_rate": 0.0001634061915668637, + "loss": 0.0159, + "step": 2137 + }, + { + "epoch": 0.2824586319648578, + "grad_norm": 0.24907803535461426, + "learning_rate": 0.00016337404774089596, + "loss": 0.0259, + "step": 2138 + }, + { + "epoch": 0.28259074545034185, + "grad_norm": 0.20025473833084106, + "learning_rate": 0.0001633418929682464, + "loss": 0.0262, + "step": 2139 + }, + { + "epoch": 0.2827228589358259, + "grad_norm": 0.33397263288497925, + "learning_rate": 0.00016330972725446915, + "loss": 0.0384, + "step": 2140 + }, + { + "epoch": 0.2828549724213099, + "grad_norm": 0.16113078594207764, + "learning_rate": 0.0001632775506051202, + "loss": 0.0185, + "step": 2141 + }, + { + "epoch": 0.28298708590679394, + "grad_norm": 0.19310183823108673, + "learning_rate": 0.00016324536302575755, + "loss": 0.0281, + "step": 2142 + }, + { + "epoch": 0.283119199392278, + "grad_norm": 0.2211921066045761, + "learning_rate": 0.00016321316452194094, + "loss": 0.0346, + "step": 2143 + }, + { + "epoch": 0.283251312877762, + "grad_norm": 0.27257010340690613, + "learning_rate": 0.0001631809550992321, + "loss": 0.0225, + "step": 2144 + }, + { + "epoch": 0.28338342636324604, + "grad_norm": 0.1994791179895401, + "learning_rate": 0.0001631487347631945, + "loss": 0.0302, + "step": 2145 + }, + { + "epoch": 0.28351553984873007, + "grad_norm": 0.22019606828689575, + "learning_rate": 0.00016311650351939373, + "loss": 0.0255, + "step": 2146 + }, + { + "epoch": 0.2836476533342141, + "grad_norm": 0.22622746229171753, + "learning_rate": 0.00016308426137339707, + "loss": 0.0221, + "step": 2147 + }, + { + "epoch": 0.28377976681969813, + "grad_norm": 0.1966571807861328, + "learning_rate": 0.0001630520083307738, + "loss": 0.0286, + "step": 2148 + }, + { + "epoch": 0.28391188030518216, + "grad_norm": 0.18262293934822083, + "learning_rate": 0.00016301974439709494, + "loss": 0.0197, + "step": 2149 + }, + { + "epoch": 0.2840439937906662, + "grad_norm": 0.21323782205581665, + "learning_rate": 0.00016298746957793355, + "loss": 0.0349, + "step": 2150 + }, + { + "epoch": 0.2841761072761502, + "grad_norm": 0.13299968838691711, + "learning_rate": 0.0001629551838788645, + "loss": 0.012, + "step": 2151 + }, + { + "epoch": 0.28430822076163426, + "grad_norm": 0.19067250192165375, + "learning_rate": 0.0001629228873054645, + "loss": 0.0235, + "step": 2152 + }, + { + "epoch": 0.2844403342471183, + "grad_norm": 0.3284483551979065, + "learning_rate": 0.00016289057986331221, + "loss": 0.0185, + "step": 2153 + }, + { + "epoch": 0.2845724477326023, + "grad_norm": 0.2838576138019562, + "learning_rate": 0.00016285826155798815, + "loss": 0.0288, + "step": 2154 + }, + { + "epoch": 0.28470456121808635, + "grad_norm": 0.20192408561706543, + "learning_rate": 0.00016282593239507466, + "loss": 0.0205, + "step": 2155 + }, + { + "epoch": 0.2848366747035704, + "grad_norm": 0.23177526891231537, + "learning_rate": 0.00016279359238015605, + "loss": 0.0195, + "step": 2156 + }, + { + "epoch": 0.2849687881890544, + "grad_norm": 0.20849239826202393, + "learning_rate": 0.0001627612415188184, + "loss": 0.0469, + "step": 2157 + }, + { + "epoch": 0.28510090167453844, + "grad_norm": 0.30044230818748474, + "learning_rate": 0.00016272887981664974, + "loss": 0.0241, + "step": 2158 + }, + { + "epoch": 0.2852330151600225, + "grad_norm": 0.22462965548038483, + "learning_rate": 0.00016269650727923998, + "loss": 0.0173, + "step": 2159 + }, + { + "epoch": 0.2853651286455065, + "grad_norm": 0.2355768233537674, + "learning_rate": 0.0001626641239121808, + "loss": 0.0286, + "step": 2160 + }, + { + "epoch": 0.28549724213099054, + "grad_norm": 0.24778874218463898, + "learning_rate": 0.0001626317297210659, + "loss": 0.0329, + "step": 2161 + }, + { + "epoch": 0.28562935561647457, + "grad_norm": 0.3856741189956665, + "learning_rate": 0.00016259932471149072, + "loss": 0.0369, + "step": 2162 + }, + { + "epoch": 0.2857614691019586, + "grad_norm": 0.21599425375461578, + "learning_rate": 0.00016256690888905264, + "loss": 0.0271, + "step": 2163 + }, + { + "epoch": 0.28589358258744263, + "grad_norm": 0.2502656579017639, + "learning_rate": 0.00016253448225935087, + "loss": 0.0243, + "step": 2164 + }, + { + "epoch": 0.28602569607292666, + "grad_norm": 0.30705583095550537, + "learning_rate": 0.0001625020448279865, + "loss": 0.0418, + "step": 2165 + }, + { + "epoch": 0.2861578095584107, + "grad_norm": 0.43476346135139465, + "learning_rate": 0.0001624695966005625, + "loss": 0.0356, + "step": 2166 + }, + { + "epoch": 0.2862899230438947, + "grad_norm": 0.19399316608905792, + "learning_rate": 0.00016243713758268372, + "loss": 0.0331, + "step": 2167 + }, + { + "epoch": 0.28642203652937875, + "grad_norm": 0.19930878281593323, + "learning_rate": 0.00016240466777995685, + "loss": 0.0241, + "step": 2168 + }, + { + "epoch": 0.2865541500148628, + "grad_norm": 0.18226459622383118, + "learning_rate": 0.00016237218719799035, + "loss": 0.0179, + "step": 2169 + }, + { + "epoch": 0.2866862635003468, + "grad_norm": 0.35922908782958984, + "learning_rate": 0.00016233969584239478, + "loss": 0.0303, + "step": 2170 + }, + { + "epoch": 0.28681837698583085, + "grad_norm": 0.2960500717163086, + "learning_rate": 0.0001623071937187823, + "loss": 0.0334, + "step": 2171 + }, + { + "epoch": 0.2869504904713149, + "grad_norm": 0.21738512814044952, + "learning_rate": 0.00016227468083276707, + "loss": 0.014, + "step": 2172 + }, + { + "epoch": 0.2870826039567989, + "grad_norm": 0.18073824048042297, + "learning_rate": 0.0001622421571899651, + "loss": 0.0172, + "step": 2173 + }, + { + "epoch": 0.28721471744228294, + "grad_norm": 0.1971302479505539, + "learning_rate": 0.00016220962279599424, + "loss": 0.0254, + "step": 2174 + }, + { + "epoch": 0.28734683092776697, + "grad_norm": 0.2494899183511734, + "learning_rate": 0.0001621770776564742, + "loss": 0.0341, + "step": 2175 + }, + { + "epoch": 0.287478944413251, + "grad_norm": 0.45942479372024536, + "learning_rate": 0.0001621445217770265, + "loss": 0.0218, + "step": 2176 + }, + { + "epoch": 0.28761105789873503, + "grad_norm": 0.13894154131412506, + "learning_rate": 0.0001621119551632746, + "loss": 0.0157, + "step": 2177 + }, + { + "epoch": 0.28774317138421907, + "grad_norm": 0.18475857377052307, + "learning_rate": 0.00016207937782084382, + "loss": 0.0234, + "step": 2178 + }, + { + "epoch": 0.2878752848697031, + "grad_norm": 0.2803318500518799, + "learning_rate": 0.0001620467897553612, + "loss": 0.0352, + "step": 2179 + }, + { + "epoch": 0.2880073983551871, + "grad_norm": 0.16757351160049438, + "learning_rate": 0.00016201419097245577, + "loss": 0.0228, + "step": 2180 + }, + { + "epoch": 0.28813951184067116, + "grad_norm": 0.1853777915239334, + "learning_rate": 0.00016198158147775834, + "loss": 0.0169, + "step": 2181 + }, + { + "epoch": 0.2882716253261552, + "grad_norm": 0.20891882479190826, + "learning_rate": 0.0001619489612769016, + "loss": 0.0237, + "step": 2182 + }, + { + "epoch": 0.2884037388116392, + "grad_norm": 0.3805196285247803, + "learning_rate": 0.00016191633037552006, + "loss": 0.0295, + "step": 2183 + }, + { + "epoch": 0.28853585229712325, + "grad_norm": 0.2402908354997635, + "learning_rate": 0.00016188368877925012, + "loss": 0.0203, + "step": 2184 + }, + { + "epoch": 0.2886679657826073, + "grad_norm": 0.22193406522274017, + "learning_rate": 0.00016185103649373, + "loss": 0.0171, + "step": 2185 + }, + { + "epoch": 0.2888000792680913, + "grad_norm": 0.23514831066131592, + "learning_rate": 0.00016181837352459977, + "loss": 0.0396, + "step": 2186 + }, + { + "epoch": 0.28893219275357535, + "grad_norm": 0.2036595493555069, + "learning_rate": 0.00016178569987750137, + "loss": 0.0344, + "step": 2187 + }, + { + "epoch": 0.2890643062390594, + "grad_norm": 0.21834520995616913, + "learning_rate": 0.0001617530155580785, + "loss": 0.0271, + "step": 2188 + }, + { + "epoch": 0.2891964197245434, + "grad_norm": 0.23453760147094727, + "learning_rate": 0.00016172032057197683, + "loss": 0.0217, + "step": 2189 + }, + { + "epoch": 0.28932853321002744, + "grad_norm": 0.16586275398731232, + "learning_rate": 0.00016168761492484378, + "loss": 0.0217, + "step": 2190 + }, + { + "epoch": 0.28946064669551147, + "grad_norm": 0.46942102909088135, + "learning_rate": 0.00016165489862232866, + "loss": 0.0177, + "step": 2191 + }, + { + "epoch": 0.2895927601809955, + "grad_norm": 0.18020497262477875, + "learning_rate": 0.00016162217167008255, + "loss": 0.0154, + "step": 2192 + }, + { + "epoch": 0.28972487366647953, + "grad_norm": 0.20823246240615845, + "learning_rate": 0.00016158943407375845, + "loss": 0.0289, + "step": 2193 + }, + { + "epoch": 0.28985698715196356, + "grad_norm": 0.32893314957618713, + "learning_rate": 0.0001615566858390112, + "loss": 0.0334, + "step": 2194 + }, + { + "epoch": 0.2899891006374476, + "grad_norm": 0.3370470404624939, + "learning_rate": 0.0001615239269714974, + "loss": 0.0206, + "step": 2195 + }, + { + "epoch": 0.2901212141229316, + "grad_norm": 0.19687312841415405, + "learning_rate": 0.00016149115747687552, + "loss": 0.0249, + "step": 2196 + }, + { + "epoch": 0.29025332760841566, + "grad_norm": 0.19295763969421387, + "learning_rate": 0.00016145837736080592, + "loss": 0.0253, + "step": 2197 + }, + { + "epoch": 0.29038544109389963, + "grad_norm": 0.19667404890060425, + "learning_rate": 0.00016142558662895072, + "loss": 0.0172, + "step": 2198 + }, + { + "epoch": 0.29051755457938366, + "grad_norm": 0.17432467639446259, + "learning_rate": 0.00016139278528697396, + "loss": 0.0268, + "step": 2199 + }, + { + "epoch": 0.2906496680648677, + "grad_norm": 0.18398909270763397, + "learning_rate": 0.0001613599733405414, + "loss": 0.028, + "step": 2200 + }, + { + "epoch": 0.2907817815503517, + "grad_norm": 0.19623810052871704, + "learning_rate": 0.00016132715079532074, + "loss": 0.0286, + "step": 2201 + }, + { + "epoch": 0.29091389503583576, + "grad_norm": 0.18011881411075592, + "learning_rate": 0.00016129431765698137, + "loss": 0.0263, + "step": 2202 + }, + { + "epoch": 0.2910460085213198, + "grad_norm": 0.4542388916015625, + "learning_rate": 0.00016126147393119472, + "loss": 0.023, + "step": 2203 + }, + { + "epoch": 0.2911781220068038, + "grad_norm": 0.19564567506313324, + "learning_rate": 0.0001612286196236338, + "loss": 0.0181, + "step": 2204 + }, + { + "epoch": 0.29131023549228785, + "grad_norm": 0.1872696876525879, + "learning_rate": 0.00016119575473997372, + "loss": 0.0147, + "step": 2205 + }, + { + "epoch": 0.2914423489777719, + "grad_norm": 0.2464790791273117, + "learning_rate": 0.00016116287928589115, + "loss": 0.0228, + "step": 2206 + }, + { + "epoch": 0.2915744624632559, + "grad_norm": 0.19536122679710388, + "learning_rate": 0.00016112999326706482, + "loss": 0.0372, + "step": 2207 + }, + { + "epoch": 0.29170657594873994, + "grad_norm": 0.2512054443359375, + "learning_rate": 0.00016109709668917508, + "loss": 0.0218, + "step": 2208 + }, + { + "epoch": 0.291838689434224, + "grad_norm": 0.22209592163562775, + "learning_rate": 0.00016106418955790422, + "loss": 0.0208, + "step": 2209 + }, + { + "epoch": 0.291970802919708, + "grad_norm": 0.25197240710258484, + "learning_rate": 0.00016103127187893637, + "loss": 0.0146, + "step": 2210 + }, + { + "epoch": 0.29210291640519204, + "grad_norm": 0.34588807821273804, + "learning_rate": 0.0001609983436579574, + "loss": 0.0285, + "step": 2211 + }, + { + "epoch": 0.29223502989067607, + "grad_norm": 0.2277306318283081, + "learning_rate": 0.00016096540490065508, + "loss": 0.0155, + "step": 2212 + }, + { + "epoch": 0.2923671433761601, + "grad_norm": 0.14773651957511902, + "learning_rate": 0.00016093245561271896, + "loss": 0.0167, + "step": 2213 + }, + { + "epoch": 0.29249925686164413, + "grad_norm": 0.13434603810310364, + "learning_rate": 0.0001608994957998404, + "loss": 0.0188, + "step": 2214 + }, + { + "epoch": 0.29263137034712816, + "grad_norm": 0.1497490257024765, + "learning_rate": 0.0001608665254677126, + "loss": 0.0203, + "step": 2215 + }, + { + "epoch": 0.2927634838326122, + "grad_norm": 0.25678667426109314, + "learning_rate": 0.00016083354462203056, + "loss": 0.0314, + "step": 2216 + }, + { + "epoch": 0.2928955973180962, + "grad_norm": 0.2165631651878357, + "learning_rate": 0.00016080055326849109, + "loss": 0.0346, + "step": 2217 + }, + { + "epoch": 0.29302771080358025, + "grad_norm": 0.16122478246688843, + "learning_rate": 0.00016076755141279287, + "loss": 0.0234, + "step": 2218 + }, + { + "epoch": 0.2931598242890643, + "grad_norm": 0.1963433027267456, + "learning_rate": 0.0001607345390606363, + "loss": 0.0243, + "step": 2219 + }, + { + "epoch": 0.2932919377745483, + "grad_norm": 0.21741873025894165, + "learning_rate": 0.00016070151621772372, + "loss": 0.0264, + "step": 2220 + }, + { + "epoch": 0.29342405126003235, + "grad_norm": 0.1384367048740387, + "learning_rate": 0.00016066848288975912, + "loss": 0.0131, + "step": 2221 + }, + { + "epoch": 0.2935561647455164, + "grad_norm": 0.23075279593467712, + "learning_rate": 0.00016063543908244847, + "loss": 0.0259, + "step": 2222 + }, + { + "epoch": 0.2936882782310004, + "grad_norm": 0.33965376019477844, + "learning_rate": 0.0001606023848014994, + "loss": 0.0331, + "step": 2223 + }, + { + "epoch": 0.29382039171648444, + "grad_norm": 0.15662816166877747, + "learning_rate": 0.00016056932005262148, + "loss": 0.0189, + "step": 2224 + }, + { + "epoch": 0.2939525052019685, + "grad_norm": 0.21762795746326447, + "learning_rate": 0.000160536244841526, + "loss": 0.0253, + "step": 2225 + }, + { + "epoch": 0.2940846186874525, + "grad_norm": 0.20064249634742737, + "learning_rate": 0.00016050315917392612, + "loss": 0.0229, + "step": 2226 + }, + { + "epoch": 0.29421673217293653, + "grad_norm": 0.16303327679634094, + "learning_rate": 0.0001604700630555367, + "loss": 0.0217, + "step": 2227 + }, + { + "epoch": 0.29434884565842057, + "grad_norm": 0.18921849131584167, + "learning_rate": 0.0001604369564920745, + "loss": 0.0244, + "step": 2228 + }, + { + "epoch": 0.2944809591439046, + "grad_norm": 0.24990400671958923, + "learning_rate": 0.0001604038394892581, + "loss": 0.0232, + "step": 2229 + }, + { + "epoch": 0.29461307262938863, + "grad_norm": 0.23241092264652252, + "learning_rate": 0.00016037071205280781, + "loss": 0.0254, + "step": 2230 + }, + { + "epoch": 0.29474518611487266, + "grad_norm": 0.17506669461727142, + "learning_rate": 0.00016033757418844577, + "loss": 0.0195, + "step": 2231 + }, + { + "epoch": 0.2948772996003567, + "grad_norm": 0.1743784099817276, + "learning_rate": 0.00016030442590189595, + "loss": 0.0151, + "step": 2232 + }, + { + "epoch": 0.2950094130858407, + "grad_norm": 0.45965102314949036, + "learning_rate": 0.00016027126719888408, + "loss": 0.0377, + "step": 2233 + }, + { + "epoch": 0.29514152657132475, + "grad_norm": 0.17976310849189758, + "learning_rate": 0.0001602380980851377, + "loss": 0.0187, + "step": 2234 + }, + { + "epoch": 0.2952736400568088, + "grad_norm": 0.16373927891254425, + "learning_rate": 0.00016020491856638618, + "loss": 0.0194, + "step": 2235 + }, + { + "epoch": 0.2954057535422928, + "grad_norm": 0.19986402988433838, + "learning_rate": 0.00016017172864836064, + "loss": 0.0164, + "step": 2236 + }, + { + "epoch": 0.29553786702777685, + "grad_norm": 0.2965887486934662, + "learning_rate": 0.00016013852833679398, + "loss": 0.0441, + "step": 2237 + }, + { + "epoch": 0.2956699805132609, + "grad_norm": 0.22505910694599152, + "learning_rate": 0.00016010531763742104, + "loss": 0.025, + "step": 2238 + }, + { + "epoch": 0.2958020939987449, + "grad_norm": 0.3020710051059723, + "learning_rate": 0.00016007209655597828, + "loss": 0.0244, + "step": 2239 + }, + { + "epoch": 0.29593420748422894, + "grad_norm": 0.6665115356445312, + "learning_rate": 0.00016003886509820397, + "loss": 0.0346, + "step": 2240 + }, + { + "epoch": 0.29606632096971297, + "grad_norm": 0.17169487476348877, + "learning_rate": 0.0001600056232698383, + "loss": 0.0193, + "step": 2241 + }, + { + "epoch": 0.296198434455197, + "grad_norm": 0.18180319666862488, + "learning_rate": 0.00015997237107662318, + "loss": 0.0169, + "step": 2242 + }, + { + "epoch": 0.29633054794068103, + "grad_norm": 0.23043014109134674, + "learning_rate": 0.00015993910852430228, + "loss": 0.0195, + "step": 2243 + }, + { + "epoch": 0.29646266142616506, + "grad_norm": 0.2865215241909027, + "learning_rate": 0.00015990583561862102, + "loss": 0.0388, + "step": 2244 + }, + { + "epoch": 0.2965947749116491, + "grad_norm": 0.20309293270111084, + "learning_rate": 0.0001598725523653268, + "loss": 0.0355, + "step": 2245 + }, + { + "epoch": 0.2967268883971331, + "grad_norm": 0.31159254908561707, + "learning_rate": 0.0001598392587701686, + "loss": 0.0252, + "step": 2246 + }, + { + "epoch": 0.29685900188261716, + "grad_norm": 0.15543049573898315, + "learning_rate": 0.00015980595483889725, + "loss": 0.0052, + "step": 2247 + }, + { + "epoch": 0.2969911153681012, + "grad_norm": 0.29889604449272156, + "learning_rate": 0.00015977264057726539, + "loss": 0.0229, + "step": 2248 + }, + { + "epoch": 0.2971232288535852, + "grad_norm": 0.14694826304912567, + "learning_rate": 0.0001597393159910275, + "loss": 0.0188, + "step": 2249 + }, + { + "epoch": 0.29725534233906925, + "grad_norm": 0.2545098066329956, + "learning_rate": 0.00015970598108593973, + "loss": 0.032, + "step": 2250 + }, + { + "epoch": 0.2973874558245533, + "grad_norm": 0.3052002191543579, + "learning_rate": 0.00015967263586776006, + "loss": 0.0253, + "step": 2251 + }, + { + "epoch": 0.2975195693100373, + "grad_norm": 0.13028715550899506, + "learning_rate": 0.00015963928034224824, + "loss": 0.0144, + "step": 2252 + }, + { + "epoch": 0.29765168279552134, + "grad_norm": 0.16676437854766846, + "learning_rate": 0.00015960591451516585, + "loss": 0.023, + "step": 2253 + }, + { + "epoch": 0.2977837962810054, + "grad_norm": 0.2732032835483551, + "learning_rate": 0.0001595725383922762, + "loss": 0.0344, + "step": 2254 + }, + { + "epoch": 0.2979159097664894, + "grad_norm": 0.24554814398288727, + "learning_rate": 0.00015953915197934436, + "loss": 0.0309, + "step": 2255 + }, + { + "epoch": 0.29804802325197344, + "grad_norm": 0.1905503273010254, + "learning_rate": 0.0001595057552821373, + "loss": 0.0243, + "step": 2256 + }, + { + "epoch": 0.29818013673745747, + "grad_norm": 0.1754557490348816, + "learning_rate": 0.00015947234830642355, + "loss": 0.0185, + "step": 2257 + }, + { + "epoch": 0.2983122502229415, + "grad_norm": 0.1937703639268875, + "learning_rate": 0.00015943893105797364, + "loss": 0.0133, + "step": 2258 + }, + { + "epoch": 0.29844436370842553, + "grad_norm": 0.2528638541698456, + "learning_rate": 0.0001594055035425597, + "loss": 0.0425, + "step": 2259 + }, + { + "epoch": 0.29857647719390956, + "grad_norm": 0.16026671230793, + "learning_rate": 0.00015937206576595574, + "loss": 0.0116, + "step": 2260 + }, + { + "epoch": 0.2987085906793936, + "grad_norm": 0.23767255246639252, + "learning_rate": 0.00015933861773393754, + "loss": 0.0244, + "step": 2261 + }, + { + "epoch": 0.2988407041648776, + "grad_norm": 0.40182456374168396, + "learning_rate": 0.0001593051594522826, + "loss": 0.0279, + "step": 2262 + }, + { + "epoch": 0.29897281765036166, + "grad_norm": 0.16921398043632507, + "learning_rate": 0.0001592716909267702, + "loss": 0.0243, + "step": 2263 + }, + { + "epoch": 0.2991049311358457, + "grad_norm": 0.21883520483970642, + "learning_rate": 0.00015923821216318141, + "loss": 0.0262, + "step": 2264 + }, + { + "epoch": 0.2992370446213297, + "grad_norm": 0.15632914006710052, + "learning_rate": 0.00015920472316729908, + "loss": 0.0164, + "step": 2265 + }, + { + "epoch": 0.29936915810681375, + "grad_norm": 0.19226789474487305, + "learning_rate": 0.00015917122394490775, + "loss": 0.0159, + "step": 2266 + }, + { + "epoch": 0.2995012715922978, + "grad_norm": 0.2588377296924591, + "learning_rate": 0.00015913771450179384, + "loss": 0.0199, + "step": 2267 + }, + { + "epoch": 0.2996333850777818, + "grad_norm": 0.2027762234210968, + "learning_rate": 0.00015910419484374547, + "loss": 0.0195, + "step": 2268 + }, + { + "epoch": 0.29976549856326584, + "grad_norm": 0.1688690334558487, + "learning_rate": 0.00015907066497655253, + "loss": 0.0119, + "step": 2269 + }, + { + "epoch": 0.2998976120487499, + "grad_norm": 0.18758751451969147, + "learning_rate": 0.00015903712490600668, + "loss": 0.0247, + "step": 2270 + }, + { + "epoch": 0.3000297255342339, + "grad_norm": 0.23641707003116608, + "learning_rate": 0.00015900357463790132, + "loss": 0.0288, + "step": 2271 + }, + { + "epoch": 0.30016183901971794, + "grad_norm": 0.22951290011405945, + "learning_rate": 0.00015897001417803166, + "loss": 0.026, + "step": 2272 + }, + { + "epoch": 0.30029395250520197, + "grad_norm": 0.20448489487171173, + "learning_rate": 0.00015893644353219463, + "loss": 0.0196, + "step": 2273 + }, + { + "epoch": 0.300426065990686, + "grad_norm": 0.1670645922422409, + "learning_rate": 0.00015890286270618892, + "loss": 0.015, + "step": 2274 + }, + { + "epoch": 0.30055817947617003, + "grad_norm": 0.1759205311536789, + "learning_rate": 0.000158869271705815, + "loss": 0.023, + "step": 2275 + }, + { + "epoch": 0.30069029296165406, + "grad_norm": 0.23238608241081238, + "learning_rate": 0.00015883567053687512, + "loss": 0.0234, + "step": 2276 + }, + { + "epoch": 0.3008224064471381, + "grad_norm": 0.14851292967796326, + "learning_rate": 0.0001588020592051732, + "loss": 0.0178, + "step": 2277 + }, + { + "epoch": 0.3009545199326221, + "grad_norm": 0.17358282208442688, + "learning_rate": 0.00015876843771651497, + "loss": 0.0224, + "step": 2278 + }, + { + "epoch": 0.30108663341810615, + "grad_norm": 0.25880980491638184, + "learning_rate": 0.00015873480607670793, + "loss": 0.0276, + "step": 2279 + }, + { + "epoch": 0.3012187469035902, + "grad_norm": 0.1913393884897232, + "learning_rate": 0.00015870116429156136, + "loss": 0.0193, + "step": 2280 + }, + { + "epoch": 0.3013508603890742, + "grad_norm": 0.16971909999847412, + "learning_rate": 0.00015866751236688617, + "loss": 0.0249, + "step": 2281 + }, + { + "epoch": 0.30148297387455825, + "grad_norm": 0.17388664186000824, + "learning_rate": 0.00015863385030849515, + "loss": 0.0133, + "step": 2282 + }, + { + "epoch": 0.3016150873600423, + "grad_norm": 0.26276373863220215, + "learning_rate": 0.0001586001781222028, + "loss": 0.0361, + "step": 2283 + }, + { + "epoch": 0.3017472008455263, + "grad_norm": 0.21703331172466278, + "learning_rate": 0.00015856649581382534, + "loss": 0.0186, + "step": 2284 + }, + { + "epoch": 0.30187931433101034, + "grad_norm": 0.24544142186641693, + "learning_rate": 0.00015853280338918078, + "loss": 0.0146, + "step": 2285 + }, + { + "epoch": 0.30201142781649437, + "grad_norm": 0.1939631551504135, + "learning_rate": 0.00015849910085408882, + "loss": 0.0295, + "step": 2286 + }, + { + "epoch": 0.3021435413019784, + "grad_norm": 0.21825960278511047, + "learning_rate": 0.00015846538821437094, + "loss": 0.0218, + "step": 2287 + }, + { + "epoch": 0.30227565478746243, + "grad_norm": 0.20250838994979858, + "learning_rate": 0.00015843166547585043, + "loss": 0.0236, + "step": 2288 + }, + { + "epoch": 0.30240776827294646, + "grad_norm": 0.20301519334316254, + "learning_rate": 0.0001583979326443522, + "loss": 0.0256, + "step": 2289 + }, + { + "epoch": 0.3025398817584305, + "grad_norm": 0.12368103861808777, + "learning_rate": 0.00015836418972570298, + "loss": 0.0106, + "step": 2290 + }, + { + "epoch": 0.3026719952439145, + "grad_norm": 0.25658977031707764, + "learning_rate": 0.00015833043672573122, + "loss": 0.0391, + "step": 2291 + }, + { + "epoch": 0.30280410872939856, + "grad_norm": 0.22816288471221924, + "learning_rate": 0.00015829667365026718, + "loss": 0.0217, + "step": 2292 + }, + { + "epoch": 0.3029362222148826, + "grad_norm": 0.34003958106040955, + "learning_rate": 0.00015826290050514273, + "loss": 0.0237, + "step": 2293 + }, + { + "epoch": 0.3030683357003666, + "grad_norm": 0.2503148317337036, + "learning_rate": 0.00015822911729619158, + "loss": 0.0133, + "step": 2294 + }, + { + "epoch": 0.30320044918585065, + "grad_norm": 0.24932456016540527, + "learning_rate": 0.00015819532402924912, + "loss": 0.0277, + "step": 2295 + }, + { + "epoch": 0.3033325626713347, + "grad_norm": 0.27211788296699524, + "learning_rate": 0.00015816152071015255, + "loss": 0.0179, + "step": 2296 + }, + { + "epoch": 0.3034646761568187, + "grad_norm": 0.15673963725566864, + "learning_rate": 0.00015812770734474074, + "loss": 0.0196, + "step": 2297 + }, + { + "epoch": 0.30359678964230274, + "grad_norm": 0.3486553728580475, + "learning_rate": 0.00015809388393885434, + "loss": 0.0349, + "step": 2298 + }, + { + "epoch": 0.3037289031277868, + "grad_norm": 0.34395694732666016, + "learning_rate": 0.00015806005049833564, + "loss": 0.023, + "step": 2299 + }, + { + "epoch": 0.3038610166132708, + "grad_norm": 0.23733146488666534, + "learning_rate": 0.00015802620702902882, + "loss": 0.0192, + "step": 2300 + }, + { + "epoch": 0.30399313009875484, + "grad_norm": 0.18981397151947021, + "learning_rate": 0.00015799235353677963, + "loss": 0.0216, + "step": 2301 + }, + { + "epoch": 0.30412524358423887, + "grad_norm": 0.34877878427505493, + "learning_rate": 0.0001579584900274357, + "loss": 0.0279, + "step": 2302 + }, + { + "epoch": 0.3042573570697229, + "grad_norm": 0.5037104487419128, + "learning_rate": 0.00015792461650684624, + "loss": 0.0301, + "step": 2303 + }, + { + "epoch": 0.30438947055520693, + "grad_norm": 0.16604360938072205, + "learning_rate": 0.00015789073298086236, + "loss": 0.0193, + "step": 2304 + }, + { + "epoch": 0.30452158404069096, + "grad_norm": 0.24134650826454163, + "learning_rate": 0.00015785683945533673, + "loss": 0.0307, + "step": 2305 + }, + { + "epoch": 0.304653697526175, + "grad_norm": 0.2895773947238922, + "learning_rate": 0.00015782293593612386, + "loss": 0.0243, + "step": 2306 + }, + { + "epoch": 0.304785811011659, + "grad_norm": 0.17948000133037567, + "learning_rate": 0.00015778902242907995, + "loss": 0.0243, + "step": 2307 + }, + { + "epoch": 0.30491792449714306, + "grad_norm": 0.2832844853401184, + "learning_rate": 0.00015775509894006286, + "loss": 0.0295, + "step": 2308 + }, + { + "epoch": 0.3050500379826271, + "grad_norm": 0.29266154766082764, + "learning_rate": 0.00015772116547493233, + "loss": 0.018, + "step": 2309 + }, + { + "epoch": 0.3051821514681111, + "grad_norm": 0.23423396050930023, + "learning_rate": 0.0001576872220395497, + "loss": 0.0154, + "step": 2310 + }, + { + "epoch": 0.30531426495359515, + "grad_norm": 0.2711523175239563, + "learning_rate": 0.00015765326863977804, + "loss": 0.0352, + "step": 2311 + }, + { + "epoch": 0.3054463784390792, + "grad_norm": 0.13761219382286072, + "learning_rate": 0.00015761930528148218, + "loss": 0.0139, + "step": 2312 + }, + { + "epoch": 0.3055784919245632, + "grad_norm": 0.21768812835216522, + "learning_rate": 0.00015758533197052867, + "loss": 0.0251, + "step": 2313 + }, + { + "epoch": 0.30571060541004724, + "grad_norm": 0.22113913297653198, + "learning_rate": 0.00015755134871278575, + "loss": 0.0271, + "step": 2314 + }, + { + "epoch": 0.3058427188955313, + "grad_norm": 0.31142762303352356, + "learning_rate": 0.00015751735551412338, + "loss": 0.0319, + "step": 2315 + }, + { + "epoch": 0.3059748323810153, + "grad_norm": 0.22142787277698517, + "learning_rate": 0.00015748335238041324, + "loss": 0.0299, + "step": 2316 + }, + { + "epoch": 0.30610694586649934, + "grad_norm": 0.20579847693443298, + "learning_rate": 0.00015744933931752882, + "loss": 0.0238, + "step": 2317 + }, + { + "epoch": 0.30623905935198337, + "grad_norm": 0.2536778151988983, + "learning_rate": 0.00015741531633134512, + "loss": 0.038, + "step": 2318 + }, + { + "epoch": 0.3063711728374674, + "grad_norm": 0.18283496797084808, + "learning_rate": 0.00015738128342773907, + "loss": 0.0243, + "step": 2319 + }, + { + "epoch": 0.30650328632295143, + "grad_norm": 0.23785123229026794, + "learning_rate": 0.0001573472406125892, + "loss": 0.0284, + "step": 2320 + }, + { + "epoch": 0.30663539980843546, + "grad_norm": 0.19473259150981903, + "learning_rate": 0.0001573131878917757, + "loss": 0.0213, + "step": 2321 + }, + { + "epoch": 0.3067675132939195, + "grad_norm": 0.33988869190216064, + "learning_rate": 0.00015727912527118063, + "loss": 0.0422, + "step": 2322 + }, + { + "epoch": 0.3068996267794035, + "grad_norm": 0.22191791236400604, + "learning_rate": 0.00015724505275668758, + "loss": 0.0344, + "step": 2323 + }, + { + "epoch": 0.30703174026488755, + "grad_norm": 0.25878584384918213, + "learning_rate": 0.00015721097035418206, + "loss": 0.0322, + "step": 2324 + }, + { + "epoch": 0.3071638537503716, + "grad_norm": 0.18412941694259644, + "learning_rate": 0.00015717687806955107, + "loss": 0.021, + "step": 2325 + }, + { + "epoch": 0.3072959672358556, + "grad_norm": 0.19447560608386993, + "learning_rate": 0.00015714277590868345, + "loss": 0.0161, + "step": 2326 + }, + { + "epoch": 0.30742808072133965, + "grad_norm": 0.19321192800998688, + "learning_rate": 0.0001571086638774697, + "loss": 0.0298, + "step": 2327 + }, + { + "epoch": 0.3075601942068237, + "grad_norm": 0.3516335189342499, + "learning_rate": 0.00015707454198180204, + "loss": 0.0274, + "step": 2328 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.3699778914451599, + "learning_rate": 0.00015704041022757438, + "loss": 0.0549, + "step": 2329 + }, + { + "epoch": 0.30782442117779174, + "grad_norm": 0.2168762981891632, + "learning_rate": 0.00015700626862068237, + "loss": 0.0216, + "step": 2330 + }, + { + "epoch": 0.30795653466327577, + "grad_norm": 0.20153917372226715, + "learning_rate": 0.0001569721171670233, + "loss": 0.0151, + "step": 2331 + }, + { + "epoch": 0.3080886481487598, + "grad_norm": 0.15755203366279602, + "learning_rate": 0.00015693795587249623, + "loss": 0.0204, + "step": 2332 + }, + { + "epoch": 0.30822076163424383, + "grad_norm": 0.1630401760339737, + "learning_rate": 0.0001569037847430019, + "loss": 0.0113, + "step": 2333 + }, + { + "epoch": 0.30835287511972787, + "grad_norm": 0.252340167760849, + "learning_rate": 0.00015686960378444266, + "loss": 0.0231, + "step": 2334 + }, + { + "epoch": 0.3084849886052119, + "grad_norm": 0.20554135739803314, + "learning_rate": 0.00015683541300272268, + "loss": 0.0239, + "step": 2335 + }, + { + "epoch": 0.3086171020906959, + "grad_norm": 0.36748287081718445, + "learning_rate": 0.00015680121240374775, + "loss": 0.0345, + "step": 2336 + }, + { + "epoch": 0.30874921557617996, + "grad_norm": 0.32464292645454407, + "learning_rate": 0.00015676700199342543, + "loss": 0.0169, + "step": 2337 + }, + { + "epoch": 0.308881329061664, + "grad_norm": 0.18028365075588226, + "learning_rate": 0.00015673278177766488, + "loss": 0.017, + "step": 2338 + }, + { + "epoch": 0.309013442547148, + "grad_norm": 0.2513674199581146, + "learning_rate": 0.00015669855176237704, + "loss": 0.0261, + "step": 2339 + }, + { + "epoch": 0.30914555603263205, + "grad_norm": 0.21183346211910248, + "learning_rate": 0.0001566643119534745, + "loss": 0.0246, + "step": 2340 + }, + { + "epoch": 0.3092776695181161, + "grad_norm": 0.1958574652671814, + "learning_rate": 0.00015663006235687153, + "loss": 0.0178, + "step": 2341 + }, + { + "epoch": 0.3094097830036001, + "grad_norm": 0.16935515403747559, + "learning_rate": 0.00015659580297848412, + "loss": 0.025, + "step": 2342 + }, + { + "epoch": 0.30954189648908415, + "grad_norm": 0.19944345951080322, + "learning_rate": 0.00015656153382422993, + "loss": 0.0236, + "step": 2343 + }, + { + "epoch": 0.3096740099745682, + "grad_norm": 0.22058166563510895, + "learning_rate": 0.00015652725490002833, + "loss": 0.0389, + "step": 2344 + }, + { + "epoch": 0.3098061234600522, + "grad_norm": 0.13750414550304413, + "learning_rate": 0.00015649296621180034, + "loss": 0.0164, + "step": 2345 + }, + { + "epoch": 0.30993823694553624, + "grad_norm": 0.14292161166667938, + "learning_rate": 0.00015645866776546868, + "loss": 0.0189, + "step": 2346 + }, + { + "epoch": 0.31007035043102027, + "grad_norm": 0.246055006980896, + "learning_rate": 0.0001564243595669578, + "loss": 0.029, + "step": 2347 + }, + { + "epoch": 0.3102024639165043, + "grad_norm": 0.33867743611335754, + "learning_rate": 0.00015639004162219382, + "loss": 0.0326, + "step": 2348 + }, + { + "epoch": 0.31033457740198833, + "grad_norm": 0.17780427634716034, + "learning_rate": 0.00015635571393710445, + "loss": 0.0209, + "step": 2349 + }, + { + "epoch": 0.31046669088747236, + "grad_norm": 0.20596635341644287, + "learning_rate": 0.00015632137651761923, + "loss": 0.0118, + "step": 2350 + }, + { + "epoch": 0.3105988043729564, + "grad_norm": 0.15004736185073853, + "learning_rate": 0.00015628702936966926, + "loss": 0.0091, + "step": 2351 + }, + { + "epoch": 0.3107309178584404, + "grad_norm": 0.34517210721969604, + "learning_rate": 0.00015625267249918737, + "loss": 0.0362, + "step": 2352 + }, + { + "epoch": 0.31086303134392446, + "grad_norm": 0.3376559317111969, + "learning_rate": 0.00015621830591210808, + "loss": 0.0372, + "step": 2353 + }, + { + "epoch": 0.3109951448294085, + "grad_norm": 0.2009844183921814, + "learning_rate": 0.00015618392961436756, + "loss": 0.026, + "step": 2354 + }, + { + "epoch": 0.3111272583148925, + "grad_norm": 0.23881836235523224, + "learning_rate": 0.0001561495436119037, + "loss": 0.0215, + "step": 2355 + }, + { + "epoch": 0.31125937180037655, + "grad_norm": 0.28877967596054077, + "learning_rate": 0.00015611514791065602, + "loss": 0.0376, + "step": 2356 + }, + { + "epoch": 0.3113914852858606, + "grad_norm": 0.20852817595005035, + "learning_rate": 0.00015608074251656574, + "loss": 0.023, + "step": 2357 + }, + { + "epoch": 0.3115235987713446, + "grad_norm": 0.22282958030700684, + "learning_rate": 0.00015604632743557577, + "loss": 0.0257, + "step": 2358 + }, + { + "epoch": 0.31165571225682864, + "grad_norm": 0.22633077204227448, + "learning_rate": 0.00015601190267363062, + "loss": 0.0495, + "step": 2359 + }, + { + "epoch": 0.3117878257423127, + "grad_norm": 0.6149209141731262, + "learning_rate": 0.00015597746823667655, + "loss": 0.0493, + "step": 2360 + }, + { + "epoch": 0.31191993922779665, + "grad_norm": 0.14970439672470093, + "learning_rate": 0.0001559430241306615, + "loss": 0.0164, + "step": 2361 + }, + { + "epoch": 0.3120520527132807, + "grad_norm": 0.2662661373615265, + "learning_rate": 0.00015590857036153498, + "loss": 0.0258, + "step": 2362 + }, + { + "epoch": 0.3121841661987647, + "grad_norm": 0.1701253205537796, + "learning_rate": 0.0001558741069352483, + "loss": 0.0278, + "step": 2363 + }, + { + "epoch": 0.31231627968424874, + "grad_norm": 0.20933538675308228, + "learning_rate": 0.0001558396338577543, + "loss": 0.0187, + "step": 2364 + }, + { + "epoch": 0.3124483931697328, + "grad_norm": 0.24754363298416138, + "learning_rate": 0.00015580515113500763, + "loss": 0.0559, + "step": 2365 + }, + { + "epoch": 0.3125805066552168, + "grad_norm": 0.32924720644950867, + "learning_rate": 0.00015577065877296452, + "loss": 0.0303, + "step": 2366 + }, + { + "epoch": 0.31271262014070084, + "grad_norm": 0.3301711678504944, + "learning_rate": 0.0001557361567775828, + "loss": 0.0436, + "step": 2367 + }, + { + "epoch": 0.31284473362618487, + "grad_norm": 0.1682203859090805, + "learning_rate": 0.00015570164515482215, + "loss": 0.0202, + "step": 2368 + }, + { + "epoch": 0.3129768471116689, + "grad_norm": 0.1786518096923828, + "learning_rate": 0.00015566712391064378, + "loss": 0.0312, + "step": 2369 + }, + { + "epoch": 0.31310896059715293, + "grad_norm": 0.13824589550495148, + "learning_rate": 0.00015563259305101057, + "loss": 0.0191, + "step": 2370 + }, + { + "epoch": 0.31324107408263696, + "grad_norm": 0.2650575637817383, + "learning_rate": 0.00015559805258188707, + "loss": 0.0264, + "step": 2371 + }, + { + "epoch": 0.313373187568121, + "grad_norm": 0.220457062125206, + "learning_rate": 0.00015556350250923954, + "loss": 0.0208, + "step": 2372 + }, + { + "epoch": 0.313505301053605, + "grad_norm": 0.25818371772766113, + "learning_rate": 0.00015552894283903584, + "loss": 0.0266, + "step": 2373 + }, + { + "epoch": 0.31363741453908905, + "grad_norm": 0.25876984000205994, + "learning_rate": 0.00015549437357724547, + "loss": 0.0366, + "step": 2374 + }, + { + "epoch": 0.3137695280245731, + "grad_norm": 0.17257654666900635, + "learning_rate": 0.00015545979472983968, + "loss": 0.0229, + "step": 2375 + }, + { + "epoch": 0.3139016415100571, + "grad_norm": 0.21076619625091553, + "learning_rate": 0.00015542520630279133, + "loss": 0.0135, + "step": 2376 + }, + { + "epoch": 0.31403375499554115, + "grad_norm": 0.2625904083251953, + "learning_rate": 0.00015539060830207485, + "loss": 0.0358, + "step": 2377 + }, + { + "epoch": 0.3141658684810252, + "grad_norm": 0.2343330979347229, + "learning_rate": 0.00015535600073366649, + "loss": 0.0247, + "step": 2378 + }, + { + "epoch": 0.3142979819665092, + "grad_norm": 0.3292272984981537, + "learning_rate": 0.00015532138360354396, + "loss": 0.0307, + "step": 2379 + }, + { + "epoch": 0.31443009545199324, + "grad_norm": 0.24976593255996704, + "learning_rate": 0.00015528675691768676, + "loss": 0.0325, + "step": 2380 + }, + { + "epoch": 0.3145622089374773, + "grad_norm": 0.15806466341018677, + "learning_rate": 0.00015525212068207605, + "loss": 0.0166, + "step": 2381 + }, + { + "epoch": 0.3146943224229613, + "grad_norm": 0.2826812267303467, + "learning_rate": 0.00015521747490269454, + "loss": 0.024, + "step": 2382 + }, + { + "epoch": 0.31482643590844533, + "grad_norm": 0.1427195966243744, + "learning_rate": 0.00015518281958552666, + "loss": 0.0176, + "step": 2383 + }, + { + "epoch": 0.31495854939392937, + "grad_norm": 0.26324498653411865, + "learning_rate": 0.00015514815473655847, + "loss": 0.0364, + "step": 2384 + }, + { + "epoch": 0.3150906628794134, + "grad_norm": 0.31639760732650757, + "learning_rate": 0.00015511348036177766, + "loss": 0.0447, + "step": 2385 + }, + { + "epoch": 0.31522277636489743, + "grad_norm": 0.22099065780639648, + "learning_rate": 0.0001550787964671736, + "loss": 0.0293, + "step": 2386 + }, + { + "epoch": 0.31535488985038146, + "grad_norm": 0.16949567198753357, + "learning_rate": 0.00015504410305873726, + "loss": 0.0203, + "step": 2387 + }, + { + "epoch": 0.3154870033358655, + "grad_norm": 0.18645323812961578, + "learning_rate": 0.0001550094001424613, + "loss": 0.0236, + "step": 2388 + }, + { + "epoch": 0.3156191168213495, + "grad_norm": 0.23583142459392548, + "learning_rate": 0.00015497468772434, + "loss": 0.0153, + "step": 2389 + }, + { + "epoch": 0.31575123030683355, + "grad_norm": 0.41702818870544434, + "learning_rate": 0.00015493996581036928, + "loss": 0.0274, + "step": 2390 + }, + { + "epoch": 0.3158833437923176, + "grad_norm": 0.24233517050743103, + "learning_rate": 0.00015490523440654666, + "loss": 0.0124, + "step": 2391 + }, + { + "epoch": 0.3160154572778016, + "grad_norm": 0.16888493299484253, + "learning_rate": 0.0001548704935188714, + "loss": 0.0211, + "step": 2392 + }, + { + "epoch": 0.31614757076328565, + "grad_norm": 0.24889791011810303, + "learning_rate": 0.00015483574315334431, + "loss": 0.0192, + "step": 2393 + }, + { + "epoch": 0.3162796842487697, + "grad_norm": 0.31684616208076477, + "learning_rate": 0.00015480098331596784, + "loss": 0.0249, + "step": 2394 + }, + { + "epoch": 0.3164117977342537, + "grad_norm": 0.2519291937351227, + "learning_rate": 0.00015476621401274617, + "loss": 0.0287, + "step": 2395 + }, + { + "epoch": 0.31654391121973774, + "grad_norm": 0.15193960070610046, + "learning_rate": 0.00015473143524968497, + "loss": 0.0164, + "step": 2396 + }, + { + "epoch": 0.31667602470522177, + "grad_norm": 0.12480553984642029, + "learning_rate": 0.0001546966470327917, + "loss": 0.0167, + "step": 2397 + }, + { + "epoch": 0.3168081381907058, + "grad_norm": 0.22492715716362, + "learning_rate": 0.00015466184936807528, + "loss": 0.0213, + "step": 2398 + }, + { + "epoch": 0.31694025167618983, + "grad_norm": 0.2097017616033554, + "learning_rate": 0.00015462704226154646, + "loss": 0.031, + "step": 2399 + }, + { + "epoch": 0.31707236516167386, + "grad_norm": 0.18435464799404144, + "learning_rate": 0.00015459222571921743, + "loss": 0.0264, + "step": 2400 + }, + { + "epoch": 0.3172044786471579, + "grad_norm": 0.23296067118644714, + "learning_rate": 0.00015455739974710214, + "loss": 0.0199, + "step": 2401 + }, + { + "epoch": 0.3173365921326419, + "grad_norm": 0.15619218349456787, + "learning_rate": 0.00015452256435121616, + "loss": 0.026, + "step": 2402 + }, + { + "epoch": 0.31746870561812596, + "grad_norm": 0.31736814975738525, + "learning_rate": 0.00015448771953757658, + "loss": 0.0259, + "step": 2403 + }, + { + "epoch": 0.31760081910361, + "grad_norm": 0.21897679567337036, + "learning_rate": 0.0001544528653122022, + "loss": 0.0342, + "step": 2404 + }, + { + "epoch": 0.317732932589094, + "grad_norm": 0.15073131024837494, + "learning_rate": 0.0001544180016811135, + "loss": 0.0206, + "step": 2405 + }, + { + "epoch": 0.31786504607457805, + "grad_norm": 0.199736088514328, + "learning_rate": 0.00015438312865033245, + "loss": 0.0265, + "step": 2406 + }, + { + "epoch": 0.3179971595600621, + "grad_norm": 0.369244247674942, + "learning_rate": 0.00015434824622588276, + "loss": 0.0283, + "step": 2407 + }, + { + "epoch": 0.3181292730455461, + "grad_norm": 0.21546632051467896, + "learning_rate": 0.00015431335441378968, + "loss": 0.0118, + "step": 2408 + }, + { + "epoch": 0.31826138653103014, + "grad_norm": 0.14493218064308167, + "learning_rate": 0.00015427845322008013, + "loss": 0.0137, + "step": 2409 + }, + { + "epoch": 0.3183935000165142, + "grad_norm": 0.1289505809545517, + "learning_rate": 0.00015424354265078266, + "loss": 0.0123, + "step": 2410 + }, + { + "epoch": 0.3185256135019982, + "grad_norm": 0.27677011489868164, + "learning_rate": 0.00015420862271192743, + "loss": 0.0401, + "step": 2411 + }, + { + "epoch": 0.31865772698748224, + "grad_norm": 0.19438651204109192, + "learning_rate": 0.00015417369340954616, + "loss": 0.0292, + "step": 2412 + }, + { + "epoch": 0.31878984047296627, + "grad_norm": 0.2415550947189331, + "learning_rate": 0.00015413875474967222, + "loss": 0.0437, + "step": 2413 + }, + { + "epoch": 0.3189219539584503, + "grad_norm": 0.23373529314994812, + "learning_rate": 0.00015410380673834068, + "loss": 0.0366, + "step": 2414 + }, + { + "epoch": 0.31905406744393433, + "grad_norm": 0.45507702231407166, + "learning_rate": 0.0001540688493815881, + "loss": 0.0236, + "step": 2415 + }, + { + "epoch": 0.31918618092941836, + "grad_norm": 0.26520341634750366, + "learning_rate": 0.00015403388268545276, + "loss": 0.0169, + "step": 2416 + }, + { + "epoch": 0.3193182944149024, + "grad_norm": 0.17684026062488556, + "learning_rate": 0.00015399890665597442, + "loss": 0.0273, + "step": 2417 + }, + { + "epoch": 0.3194504079003864, + "grad_norm": 0.22508570551872253, + "learning_rate": 0.00015396392129919467, + "loss": 0.0176, + "step": 2418 + }, + { + "epoch": 0.31958252138587045, + "grad_norm": 0.20308654010295868, + "learning_rate": 0.00015392892662115644, + "loss": 0.0331, + "step": 2419 + }, + { + "epoch": 0.3197146348713545, + "grad_norm": 0.31760069727897644, + "learning_rate": 0.00015389392262790444, + "loss": 0.0253, + "step": 2420 + }, + { + "epoch": 0.3198467483568385, + "grad_norm": 0.2112942934036255, + "learning_rate": 0.00015385890932548502, + "loss": 0.0279, + "step": 2421 + }, + { + "epoch": 0.31997886184232255, + "grad_norm": 0.25001874566078186, + "learning_rate": 0.00015382388671994599, + "loss": 0.0361, + "step": 2422 + }, + { + "epoch": 0.3201109753278066, + "grad_norm": 0.30203571915626526, + "learning_rate": 0.00015378885481733692, + "loss": 0.0189, + "step": 2423 + }, + { + "epoch": 0.3202430888132906, + "grad_norm": 0.1823374330997467, + "learning_rate": 0.00015375381362370884, + "loss": 0.0217, + "step": 2424 + }, + { + "epoch": 0.32037520229877464, + "grad_norm": 0.2312767654657364, + "learning_rate": 0.00015371876314511455, + "loss": 0.0212, + "step": 2425 + }, + { + "epoch": 0.3205073157842587, + "grad_norm": 0.2587668001651764, + "learning_rate": 0.0001536837033876083, + "loss": 0.024, + "step": 2426 + }, + { + "epoch": 0.3206394292697427, + "grad_norm": 0.29623347520828247, + "learning_rate": 0.00015364863435724606, + "loss": 0.0307, + "step": 2427 + }, + { + "epoch": 0.32077154275522674, + "grad_norm": 0.18185120820999146, + "learning_rate": 0.00015361355606008527, + "loss": 0.0184, + "step": 2428 + }, + { + "epoch": 0.32090365624071077, + "grad_norm": 0.2674174904823303, + "learning_rate": 0.00015357846850218513, + "loss": 0.0335, + "step": 2429 + }, + { + "epoch": 0.3210357697261948, + "grad_norm": 0.22505123913288116, + "learning_rate": 0.00015354337168960633, + "loss": 0.0297, + "step": 2430 + }, + { + "epoch": 0.32116788321167883, + "grad_norm": 0.21133509278297424, + "learning_rate": 0.00015350826562841117, + "loss": 0.0198, + "step": 2431 + }, + { + "epoch": 0.32129999669716286, + "grad_norm": 0.1815750002861023, + "learning_rate": 0.00015347315032466358, + "loss": 0.0222, + "step": 2432 + }, + { + "epoch": 0.3214321101826469, + "grad_norm": 0.16259314119815826, + "learning_rate": 0.0001534380257844291, + "loss": 0.0172, + "step": 2433 + }, + { + "epoch": 0.3215642236681309, + "grad_norm": 0.2182605117559433, + "learning_rate": 0.00015340289201377477, + "loss": 0.027, + "step": 2434 + }, + { + "epoch": 0.32169633715361495, + "grad_norm": 0.1356114149093628, + "learning_rate": 0.00015336774901876936, + "loss": 0.0198, + "step": 2435 + }, + { + "epoch": 0.321828450639099, + "grad_norm": 0.2803022861480713, + "learning_rate": 0.00015333259680548313, + "loss": 0.0238, + "step": 2436 + }, + { + "epoch": 0.321960564124583, + "grad_norm": 0.23340195417404175, + "learning_rate": 0.000153297435379988, + "loss": 0.0224, + "step": 2437 + }, + { + "epoch": 0.32209267761006705, + "grad_norm": 0.15403622388839722, + "learning_rate": 0.0001532622647483574, + "loss": 0.021, + "step": 2438 + }, + { + "epoch": 0.3222247910955511, + "grad_norm": 0.21697868406772614, + "learning_rate": 0.00015322708491666642, + "loss": 0.0309, + "step": 2439 + }, + { + "epoch": 0.3223569045810351, + "grad_norm": 0.27215054631233215, + "learning_rate": 0.00015319189589099174, + "loss": 0.0216, + "step": 2440 + }, + { + "epoch": 0.32248901806651914, + "grad_norm": 0.18713687360286713, + "learning_rate": 0.00015315669767741155, + "loss": 0.0211, + "step": 2441 + }, + { + "epoch": 0.32262113155200317, + "grad_norm": 0.24193894863128662, + "learning_rate": 0.00015312149028200576, + "loss": 0.033, + "step": 2442 + }, + { + "epoch": 0.3227532450374872, + "grad_norm": 0.5523065328598022, + "learning_rate": 0.00015308627371085574, + "loss": 0.0321, + "step": 2443 + }, + { + "epoch": 0.32288535852297123, + "grad_norm": 0.37535157799720764, + "learning_rate": 0.00015305104797004452, + "loss": 0.0162, + "step": 2444 + }, + { + "epoch": 0.32301747200845526, + "grad_norm": 0.220017671585083, + "learning_rate": 0.00015301581306565666, + "loss": 0.0286, + "step": 2445 + }, + { + "epoch": 0.3231495854939393, + "grad_norm": 0.15219157934188843, + "learning_rate": 0.00015298056900377833, + "loss": 0.0165, + "step": 2446 + }, + { + "epoch": 0.3232816989794233, + "grad_norm": 0.2553863525390625, + "learning_rate": 0.00015294531579049733, + "loss": 0.0212, + "step": 2447 + }, + { + "epoch": 0.32341381246490736, + "grad_norm": 0.2089850753545761, + "learning_rate": 0.00015291005343190292, + "loss": 0.0222, + "step": 2448 + }, + { + "epoch": 0.3235459259503914, + "grad_norm": 0.19783416390419006, + "learning_rate": 0.00015287478193408608, + "loss": 0.0214, + "step": 2449 + }, + { + "epoch": 0.3236780394358754, + "grad_norm": 0.2705092132091522, + "learning_rate": 0.00015283950130313926, + "loss": 0.0409, + "step": 2450 + }, + { + "epoch": 0.32381015292135945, + "grad_norm": 0.28372472524642944, + "learning_rate": 0.00015280421154515656, + "loss": 0.0274, + "step": 2451 + }, + { + "epoch": 0.3239422664068435, + "grad_norm": 0.17090560495853424, + "learning_rate": 0.00015276891266623362, + "loss": 0.0151, + "step": 2452 + }, + { + "epoch": 0.3240743798923275, + "grad_norm": 0.17509298026561737, + "learning_rate": 0.00015273360467246762, + "loss": 0.0206, + "step": 2453 + }, + { + "epoch": 0.32420649337781154, + "grad_norm": 0.27054014801979065, + "learning_rate": 0.0001526982875699574, + "loss": 0.025, + "step": 2454 + }, + { + "epoch": 0.3243386068632956, + "grad_norm": 0.17075131833553314, + "learning_rate": 0.00015266296136480333, + "loss": 0.017, + "step": 2455 + }, + { + "epoch": 0.3244707203487796, + "grad_norm": 0.21123848855495453, + "learning_rate": 0.0001526276260631073, + "loss": 0.0274, + "step": 2456 + }, + { + "epoch": 0.32460283383426364, + "grad_norm": 0.15472885966300964, + "learning_rate": 0.00015259228167097287, + "loss": 0.0106, + "step": 2457 + }, + { + "epoch": 0.32473494731974767, + "grad_norm": 0.264280766248703, + "learning_rate": 0.00015255692819450512, + "loss": 0.0288, + "step": 2458 + }, + { + "epoch": 0.3248670608052317, + "grad_norm": 0.3066900670528412, + "learning_rate": 0.00015252156563981073, + "loss": 0.0302, + "step": 2459 + }, + { + "epoch": 0.32499917429071573, + "grad_norm": 0.24605877697467804, + "learning_rate": 0.00015248619401299785, + "loss": 0.0322, + "step": 2460 + }, + { + "epoch": 0.32513128777619976, + "grad_norm": 0.2953263819217682, + "learning_rate": 0.0001524508133201763, + "loss": 0.0308, + "step": 2461 + }, + { + "epoch": 0.3252634012616838, + "grad_norm": 0.25516271591186523, + "learning_rate": 0.00015241542356745749, + "loss": 0.025, + "step": 2462 + }, + { + "epoch": 0.3253955147471678, + "grad_norm": 0.23421679437160492, + "learning_rate": 0.00015238002476095422, + "loss": 0.022, + "step": 2463 + }, + { + "epoch": 0.32552762823265186, + "grad_norm": 0.1896619200706482, + "learning_rate": 0.0001523446169067811, + "loss": 0.0235, + "step": 2464 + }, + { + "epoch": 0.3256597417181359, + "grad_norm": 0.1484084129333496, + "learning_rate": 0.00015230920001105405, + "loss": 0.0216, + "step": 2465 + }, + { + "epoch": 0.3257918552036199, + "grad_norm": 0.20647959411144257, + "learning_rate": 0.00015227377407989073, + "loss": 0.0196, + "step": 2466 + }, + { + "epoch": 0.32592396868910395, + "grad_norm": 0.2000134289264679, + "learning_rate": 0.00015223833911941036, + "loss": 0.0289, + "step": 2467 + }, + { + "epoch": 0.326056082174588, + "grad_norm": 0.15682335197925568, + "learning_rate": 0.00015220289513573362, + "loss": 0.0122, + "step": 2468 + }, + { + "epoch": 0.326188195660072, + "grad_norm": 0.2056158483028412, + "learning_rate": 0.0001521674421349828, + "loss": 0.0246, + "step": 2469 + }, + { + "epoch": 0.32632030914555604, + "grad_norm": 0.47918254137039185, + "learning_rate": 0.0001521319801232817, + "loss": 0.0349, + "step": 2470 + }, + { + "epoch": 0.3264524226310401, + "grad_norm": 0.14794687926769257, + "learning_rate": 0.00015209650910675578, + "loss": 0.0148, + "step": 2471 + }, + { + "epoch": 0.3265845361165241, + "grad_norm": 0.21690905094146729, + "learning_rate": 0.00015206102909153197, + "loss": 0.0227, + "step": 2472 + }, + { + "epoch": 0.32671664960200814, + "grad_norm": 0.3061348795890808, + "learning_rate": 0.0001520255400837388, + "loss": 0.032, + "step": 2473 + }, + { + "epoch": 0.32684876308749217, + "grad_norm": 0.21932987868785858, + "learning_rate": 0.0001519900420895063, + "loss": 0.0317, + "step": 2474 + }, + { + "epoch": 0.3269808765729762, + "grad_norm": 0.23280304670333862, + "learning_rate": 0.0001519545351149661, + "loss": 0.0315, + "step": 2475 + }, + { + "epoch": 0.32711299005846023, + "grad_norm": 0.3209573030471802, + "learning_rate": 0.0001519190191662514, + "loss": 0.0353, + "step": 2476 + }, + { + "epoch": 0.32724510354394426, + "grad_norm": 0.2334751933813095, + "learning_rate": 0.00015188349424949683, + "loss": 0.031, + "step": 2477 + }, + { + "epoch": 0.3273772170294283, + "grad_norm": 0.1476392149925232, + "learning_rate": 0.00015184796037083875, + "loss": 0.0164, + "step": 2478 + }, + { + "epoch": 0.3275093305149123, + "grad_norm": 0.34847354888916016, + "learning_rate": 0.0001518124175364149, + "loss": 0.0243, + "step": 2479 + }, + { + "epoch": 0.32764144400039635, + "grad_norm": 0.2759470045566559, + "learning_rate": 0.0001517768657523647, + "loss": 0.0244, + "step": 2480 + }, + { + "epoch": 0.3277735574858804, + "grad_norm": 0.28842800855636597, + "learning_rate": 0.00015174130502482899, + "loss": 0.039, + "step": 2481 + }, + { + "epoch": 0.3279056709713644, + "grad_norm": 0.14972680807113647, + "learning_rate": 0.00015170573535995029, + "loss": 0.0162, + "step": 2482 + }, + { + "epoch": 0.32803778445684845, + "grad_norm": 0.23471418023109436, + "learning_rate": 0.00015167015676387257, + "loss": 0.0304, + "step": 2483 + }, + { + "epoch": 0.3281698979423325, + "grad_norm": 0.1672179400920868, + "learning_rate": 0.00015163456924274134, + "loss": 0.0155, + "step": 2484 + }, + { + "epoch": 0.3283020114278165, + "grad_norm": 0.24049347639083862, + "learning_rate": 0.00015159897280270373, + "loss": 0.0271, + "step": 2485 + }, + { + "epoch": 0.32843412491330054, + "grad_norm": 0.288613885641098, + "learning_rate": 0.00015156336744990827, + "loss": 0.0335, + "step": 2486 + }, + { + "epoch": 0.32856623839878457, + "grad_norm": 0.2593197822570801, + "learning_rate": 0.00015152775319050523, + "loss": 0.0194, + "step": 2487 + }, + { + "epoch": 0.3286983518842686, + "grad_norm": 0.1788790225982666, + "learning_rate": 0.00015149213003064622, + "loss": 0.0198, + "step": 2488 + }, + { + "epoch": 0.32883046536975263, + "grad_norm": 0.21538689732551575, + "learning_rate": 0.00015145649797648455, + "loss": 0.0231, + "step": 2489 + }, + { + "epoch": 0.32896257885523666, + "grad_norm": 0.2356831431388855, + "learning_rate": 0.0001514208570341749, + "loss": 0.0231, + "step": 2490 + }, + { + "epoch": 0.3290946923407207, + "grad_norm": 0.12992119789123535, + "learning_rate": 0.00015138520720987366, + "loss": 0.0129, + "step": 2491 + }, + { + "epoch": 0.3292268058262047, + "grad_norm": 0.19276019930839539, + "learning_rate": 0.00015134954850973864, + "loss": 0.0189, + "step": 2492 + }, + { + "epoch": 0.32935891931168876, + "grad_norm": 0.20604351162910461, + "learning_rate": 0.00015131388093992916, + "loss": 0.0141, + "step": 2493 + }, + { + "epoch": 0.3294910327971728, + "grad_norm": 0.3511555790901184, + "learning_rate": 0.00015127820450660625, + "loss": 0.0428, + "step": 2494 + }, + { + "epoch": 0.3296231462826568, + "grad_norm": 0.18908363580703735, + "learning_rate": 0.00015124251921593227, + "loss": 0.0233, + "step": 2495 + }, + { + "epoch": 0.32975525976814085, + "grad_norm": 0.1390237957239151, + "learning_rate": 0.00015120682507407113, + "loss": 0.0225, + "step": 2496 + }, + { + "epoch": 0.3298873732536249, + "grad_norm": 0.20404070615768433, + "learning_rate": 0.00015117112208718844, + "loss": 0.0336, + "step": 2497 + }, + { + "epoch": 0.3300194867391089, + "grad_norm": 0.14441095292568207, + "learning_rate": 0.00015113541026145114, + "loss": 0.0142, + "step": 2498 + }, + { + "epoch": 0.33015160022459294, + "grad_norm": 0.11000459641218185, + "learning_rate": 0.00015109968960302784, + "loss": 0.0122, + "step": 2499 + }, + { + "epoch": 0.330283713710077, + "grad_norm": 0.21273118257522583, + "learning_rate": 0.00015106396011808855, + "loss": 0.0184, + "step": 2500 + }, + { + "epoch": 0.330415827195561, + "grad_norm": 0.19047781825065613, + "learning_rate": 0.0001510282218128049, + "loss": 0.0199, + "step": 2501 + }, + { + "epoch": 0.33054794068104504, + "grad_norm": 0.3213901221752167, + "learning_rate": 0.00015099247469335008, + "loss": 0.0299, + "step": 2502 + }, + { + "epoch": 0.33068005416652907, + "grad_norm": 0.2068183869123459, + "learning_rate": 0.00015095671876589863, + "loss": 0.0237, + "step": 2503 + }, + { + "epoch": 0.3308121676520131, + "grad_norm": 0.13071802258491516, + "learning_rate": 0.00015092095403662677, + "loss": 0.0169, + "step": 2504 + }, + { + "epoch": 0.33094428113749713, + "grad_norm": 0.1935047060251236, + "learning_rate": 0.00015088518051171218, + "loss": 0.0188, + "step": 2505 + }, + { + "epoch": 0.33107639462298116, + "grad_norm": 0.20110556483268738, + "learning_rate": 0.0001508493981973341, + "loss": 0.0247, + "step": 2506 + }, + { + "epoch": 0.3312085081084652, + "grad_norm": 0.19901902973651886, + "learning_rate": 0.00015081360709967318, + "loss": 0.0262, + "step": 2507 + }, + { + "epoch": 0.3313406215939492, + "grad_norm": 0.24266882240772247, + "learning_rate": 0.00015077780722491175, + "loss": 0.0291, + "step": 2508 + }, + { + "epoch": 0.33147273507943326, + "grad_norm": 0.1843455731868744, + "learning_rate": 0.00015074199857923352, + "loss": 0.0205, + "step": 2509 + }, + { + "epoch": 0.3316048485649173, + "grad_norm": 0.30212709307670593, + "learning_rate": 0.00015070618116882375, + "loss": 0.0368, + "step": 2510 + }, + { + "epoch": 0.3317369620504013, + "grad_norm": 0.23659828305244446, + "learning_rate": 0.00015067035499986928, + "loss": 0.0154, + "step": 2511 + }, + { + "epoch": 0.33186907553588535, + "grad_norm": 0.2850700318813324, + "learning_rate": 0.00015063452007855834, + "loss": 0.0235, + "step": 2512 + }, + { + "epoch": 0.3320011890213694, + "grad_norm": 0.3431922495365143, + "learning_rate": 0.00015059867641108082, + "loss": 0.0425, + "step": 2513 + }, + { + "epoch": 0.3321333025068534, + "grad_norm": 0.15030327439308167, + "learning_rate": 0.000150562824003628, + "loss": 0.016, + "step": 2514 + }, + { + "epoch": 0.33226541599233744, + "grad_norm": 0.19416648149490356, + "learning_rate": 0.00015052696286239274, + "loss": 0.0147, + "step": 2515 + }, + { + "epoch": 0.3323975294778215, + "grad_norm": 0.2212238758802414, + "learning_rate": 0.00015049109299356933, + "loss": 0.0273, + "step": 2516 + }, + { + "epoch": 0.3325296429633055, + "grad_norm": 0.47250092029571533, + "learning_rate": 0.00015045521440335363, + "loss": 0.0266, + "step": 2517 + }, + { + "epoch": 0.33266175644878954, + "grad_norm": 0.21963316202163696, + "learning_rate": 0.00015041932709794308, + "loss": 0.0239, + "step": 2518 + }, + { + "epoch": 0.33279386993427357, + "grad_norm": 0.1677628606557846, + "learning_rate": 0.00015038343108353646, + "loss": 0.0202, + "step": 2519 + }, + { + "epoch": 0.3329259834197576, + "grad_norm": 0.21540825068950653, + "learning_rate": 0.0001503475263663341, + "loss": 0.0252, + "step": 2520 + }, + { + "epoch": 0.33305809690524163, + "grad_norm": 0.23379550874233246, + "learning_rate": 0.00015031161295253796, + "loss": 0.0292, + "step": 2521 + }, + { + "epoch": 0.33319021039072566, + "grad_norm": 0.22972875833511353, + "learning_rate": 0.00015027569084835138, + "loss": 0.0269, + "step": 2522 + }, + { + "epoch": 0.3333223238762097, + "grad_norm": 0.18469804525375366, + "learning_rate": 0.0001502397600599792, + "loss": 0.0145, + "step": 2523 + }, + { + "epoch": 0.33345443736169367, + "grad_norm": 0.22447286546230316, + "learning_rate": 0.00015020382059362786, + "loss": 0.022, + "step": 2524 + }, + { + "epoch": 0.3335865508471777, + "grad_norm": 0.1887979358434677, + "learning_rate": 0.00015016787245550515, + "loss": 0.0211, + "step": 2525 + }, + { + "epoch": 0.33371866433266173, + "grad_norm": 0.19186192750930786, + "learning_rate": 0.0001501319156518205, + "loss": 0.0188, + "step": 2526 + }, + { + "epoch": 0.33385077781814576, + "grad_norm": 0.22555392980575562, + "learning_rate": 0.00015009595018878472, + "loss": 0.0274, + "step": 2527 + }, + { + "epoch": 0.3339828913036298, + "grad_norm": 0.18418656289577484, + "learning_rate": 0.00015005997607261024, + "loss": 0.0196, + "step": 2528 + }, + { + "epoch": 0.3341150047891138, + "grad_norm": 0.19265905022621155, + "learning_rate": 0.00015002399330951084, + "loss": 0.0291, + "step": 2529 + }, + { + "epoch": 0.33424711827459785, + "grad_norm": 0.374276340007782, + "learning_rate": 0.00014998800190570193, + "loss": 0.0331, + "step": 2530 + }, + { + "epoch": 0.3343792317600819, + "grad_norm": 0.17162999510765076, + "learning_rate": 0.00014995200186740032, + "loss": 0.0182, + "step": 2531 + }, + { + "epoch": 0.3345113452455659, + "grad_norm": 0.2706623077392578, + "learning_rate": 0.00014991599320082438, + "loss": 0.0287, + "step": 2532 + }, + { + "epoch": 0.33464345873104995, + "grad_norm": 0.2407575100660324, + "learning_rate": 0.00014987997591219386, + "loss": 0.028, + "step": 2533 + }, + { + "epoch": 0.334775572216534, + "grad_norm": 0.2555379271507263, + "learning_rate": 0.00014984395000773015, + "loss": 0.0269, + "step": 2534 + }, + { + "epoch": 0.334907685702018, + "grad_norm": 0.1932975947856903, + "learning_rate": 0.00014980791549365602, + "loss": 0.023, + "step": 2535 + }, + { + "epoch": 0.33503979918750204, + "grad_norm": 0.15046212077140808, + "learning_rate": 0.00014977187237619576, + "loss": 0.016, + "step": 2536 + }, + { + "epoch": 0.33517191267298607, + "grad_norm": 0.3287968933582306, + "learning_rate": 0.00014973582066157514, + "loss": 0.0333, + "step": 2537 + }, + { + "epoch": 0.3353040261584701, + "grad_norm": 0.10586533695459366, + "learning_rate": 0.00014969976035602144, + "loss": 0.0084, + "step": 2538 + }, + { + "epoch": 0.33543613964395413, + "grad_norm": 0.25480660796165466, + "learning_rate": 0.00014966369146576338, + "loss": 0.0367, + "step": 2539 + }, + { + "epoch": 0.33556825312943817, + "grad_norm": 0.1262090504169464, + "learning_rate": 0.0001496276139970312, + "loss": 0.0148, + "step": 2540 + }, + { + "epoch": 0.3357003666149222, + "grad_norm": 0.21672266721725464, + "learning_rate": 0.0001495915279560566, + "loss": 0.0272, + "step": 2541 + }, + { + "epoch": 0.3358324801004062, + "grad_norm": 0.2435304820537567, + "learning_rate": 0.00014955543334907277, + "loss": 0.0271, + "step": 2542 + }, + { + "epoch": 0.33596459358589026, + "grad_norm": 0.21179917454719543, + "learning_rate": 0.00014951933018231435, + "loss": 0.0213, + "step": 2543 + }, + { + "epoch": 0.3360967070713743, + "grad_norm": 0.20296365022659302, + "learning_rate": 0.00014948321846201758, + "loss": 0.0239, + "step": 2544 + }, + { + "epoch": 0.3362288205568583, + "grad_norm": 0.24866227805614471, + "learning_rate": 0.00014944709819441994, + "loss": 0.0235, + "step": 2545 + }, + { + "epoch": 0.33636093404234235, + "grad_norm": 0.2591897249221802, + "learning_rate": 0.00014941096938576068, + "loss": 0.0348, + "step": 2546 + }, + { + "epoch": 0.3364930475278264, + "grad_norm": 0.24188803136348724, + "learning_rate": 0.00014937483204228029, + "loss": 0.0199, + "step": 2547 + }, + { + "epoch": 0.3366251610133104, + "grad_norm": 0.20329833030700684, + "learning_rate": 0.00014933868617022085, + "loss": 0.0195, + "step": 2548 + }, + { + "epoch": 0.33675727449879445, + "grad_norm": 0.1957000344991684, + "learning_rate": 0.00014930253177582585, + "loss": 0.0198, + "step": 2549 + }, + { + "epoch": 0.3368893879842785, + "grad_norm": 0.25972720980644226, + "learning_rate": 0.00014926636886534032, + "loss": 0.0277, + "step": 2550 + }, + { + "epoch": 0.3370215014697625, + "grad_norm": 0.16775253415107727, + "learning_rate": 0.00014923019744501073, + "loss": 0.0125, + "step": 2551 + }, + { + "epoch": 0.33715361495524654, + "grad_norm": 0.22409114241600037, + "learning_rate": 0.000149194017521085, + "loss": 0.026, + "step": 2552 + }, + { + "epoch": 0.33728572844073057, + "grad_norm": 0.33436572551727295, + "learning_rate": 0.00014915782909981248, + "loss": 0.0411, + "step": 2553 + }, + { + "epoch": 0.3374178419262146, + "grad_norm": 0.21022121608257294, + "learning_rate": 0.00014912163218744418, + "loss": 0.0336, + "step": 2554 + }, + { + "epoch": 0.33754995541169863, + "grad_norm": 0.1965954452753067, + "learning_rate": 0.0001490854267902323, + "loss": 0.0166, + "step": 2555 + }, + { + "epoch": 0.33768206889718266, + "grad_norm": 0.2767995595932007, + "learning_rate": 0.00014904921291443074, + "loss": 0.0345, + "step": 2556 + }, + { + "epoch": 0.3378141823826667, + "grad_norm": 0.10635353624820709, + "learning_rate": 0.00014901299056629475, + "loss": 0.0149, + "step": 2557 + }, + { + "epoch": 0.3379462958681507, + "grad_norm": 0.23388898372650146, + "learning_rate": 0.000148976759752081, + "loss": 0.0204, + "step": 2558 + }, + { + "epoch": 0.33807840935363476, + "grad_norm": 0.19081705808639526, + "learning_rate": 0.00014894052047804775, + "loss": 0.0254, + "step": 2559 + }, + { + "epoch": 0.3382105228391188, + "grad_norm": 0.2938932478427887, + "learning_rate": 0.00014890427275045468, + "loss": 0.0442, + "step": 2560 + }, + { + "epoch": 0.3383426363246028, + "grad_norm": 0.2676193118095398, + "learning_rate": 0.00014886801657556283, + "loss": 0.0197, + "step": 2561 + }, + { + "epoch": 0.33847474981008685, + "grad_norm": 0.15962931513786316, + "learning_rate": 0.00014883175195963482, + "loss": 0.0188, + "step": 2562 + }, + { + "epoch": 0.3386068632955709, + "grad_norm": 0.4894774854183197, + "learning_rate": 0.00014879547890893469, + "loss": 0.0276, + "step": 2563 + }, + { + "epoch": 0.3387389767810549, + "grad_norm": 0.33002814650535583, + "learning_rate": 0.00014875919742972794, + "loss": 0.0333, + "step": 2564 + }, + { + "epoch": 0.33887109026653894, + "grad_norm": 0.27643507719039917, + "learning_rate": 0.00014872290752828145, + "loss": 0.0223, + "step": 2565 + }, + { + "epoch": 0.339003203752023, + "grad_norm": 0.2616123557090759, + "learning_rate": 0.0001486866092108637, + "loss": 0.02, + "step": 2566 + }, + { + "epoch": 0.339135317237507, + "grad_norm": 0.1584719866514206, + "learning_rate": 0.0001486503024837445, + "loss": 0.0191, + "step": 2567 + }, + { + "epoch": 0.33926743072299104, + "grad_norm": 0.14762555062770844, + "learning_rate": 0.00014861398735319518, + "loss": 0.0138, + "step": 2568 + }, + { + "epoch": 0.33939954420847507, + "grad_norm": 0.198564350605011, + "learning_rate": 0.0001485776638254885, + "loss": 0.035, + "step": 2569 + }, + { + "epoch": 0.3395316576939591, + "grad_norm": 0.15630686283111572, + "learning_rate": 0.00014854133190689867, + "loss": 0.0166, + "step": 2570 + }, + { + "epoch": 0.33966377117944313, + "grad_norm": 0.23185116052627563, + "learning_rate": 0.00014850499160370134, + "loss": 0.0179, + "step": 2571 + }, + { + "epoch": 0.33979588466492716, + "grad_norm": 0.5001983046531677, + "learning_rate": 0.0001484686429221736, + "loss": 0.0247, + "step": 2572 + }, + { + "epoch": 0.3399279981504112, + "grad_norm": 0.2338925302028656, + "learning_rate": 0.00014843228586859406, + "loss": 0.021, + "step": 2573 + }, + { + "epoch": 0.3400601116358952, + "grad_norm": 0.2724718153476715, + "learning_rate": 0.00014839592044924265, + "loss": 0.0215, + "step": 2574 + }, + { + "epoch": 0.34019222512137925, + "grad_norm": 0.1371961534023285, + "learning_rate": 0.00014835954667040085, + "loss": 0.0134, + "step": 2575 + }, + { + "epoch": 0.3403243386068633, + "grad_norm": 0.24581897258758545, + "learning_rate": 0.0001483231645383516, + "loss": 0.029, + "step": 2576 + }, + { + "epoch": 0.3404564520923473, + "grad_norm": 0.30089905858039856, + "learning_rate": 0.00014828677405937917, + "loss": 0.0341, + "step": 2577 + }, + { + "epoch": 0.34058856557783135, + "grad_norm": 0.22530049085617065, + "learning_rate": 0.00014825037523976935, + "loss": 0.0243, + "step": 2578 + }, + { + "epoch": 0.3407206790633154, + "grad_norm": 0.18343088030815125, + "learning_rate": 0.00014821396808580934, + "loss": 0.0186, + "step": 2579 + }, + { + "epoch": 0.3408527925487994, + "grad_norm": 0.2015973925590515, + "learning_rate": 0.00014817755260378786, + "loss": 0.0213, + "step": 2580 + }, + { + "epoch": 0.34098490603428344, + "grad_norm": 0.24317368865013123, + "learning_rate": 0.00014814112879999488, + "loss": 0.0309, + "step": 2581 + }, + { + "epoch": 0.3411170195197675, + "grad_norm": 0.2161998599767685, + "learning_rate": 0.00014810469668072207, + "loss": 0.0305, + "step": 2582 + }, + { + "epoch": 0.3412491330052515, + "grad_norm": 0.22956828773021698, + "learning_rate": 0.00014806825625226234, + "loss": 0.0189, + "step": 2583 + }, + { + "epoch": 0.34138124649073553, + "grad_norm": 0.1916801631450653, + "learning_rate": 0.00014803180752091005, + "loss": 0.0298, + "step": 2584 + }, + { + "epoch": 0.34151335997621957, + "grad_norm": 0.2665329873561859, + "learning_rate": 0.0001479953504929611, + "loss": 0.0266, + "step": 2585 + }, + { + "epoch": 0.3416454734617036, + "grad_norm": 0.2349788397550583, + "learning_rate": 0.0001479588851747127, + "loss": 0.0302, + "step": 2586 + }, + { + "epoch": 0.34177758694718763, + "grad_norm": 0.14647245407104492, + "learning_rate": 0.00014792241157246362, + "loss": 0.0168, + "step": 2587 + }, + { + "epoch": 0.34190970043267166, + "grad_norm": 0.17012257874011993, + "learning_rate": 0.00014788592969251397, + "loss": 0.021, + "step": 2588 + }, + { + "epoch": 0.3420418139181557, + "grad_norm": 0.30817341804504395, + "learning_rate": 0.0001478494395411653, + "loss": 0.0369, + "step": 2589 + }, + { + "epoch": 0.3421739274036397, + "grad_norm": 0.23504842817783356, + "learning_rate": 0.00014781294112472057, + "loss": 0.0213, + "step": 2590 + }, + { + "epoch": 0.34230604088912375, + "grad_norm": 0.18608969449996948, + "learning_rate": 0.00014777643444948424, + "loss": 0.0193, + "step": 2591 + }, + { + "epoch": 0.3424381543746078, + "grad_norm": 0.2825300991535187, + "learning_rate": 0.00014773991952176215, + "loss": 0.0246, + "step": 2592 + }, + { + "epoch": 0.3425702678600918, + "grad_norm": 0.1629979908466339, + "learning_rate": 0.00014770339634786157, + "loss": 0.016, + "step": 2593 + }, + { + "epoch": 0.34270238134557585, + "grad_norm": 0.18643341958522797, + "learning_rate": 0.00014766686493409122, + "loss": 0.0105, + "step": 2594 + }, + { + "epoch": 0.3428344948310599, + "grad_norm": 0.1917845457792282, + "learning_rate": 0.00014763032528676114, + "loss": 0.017, + "step": 2595 + }, + { + "epoch": 0.3429666083165439, + "grad_norm": 0.2119200974702835, + "learning_rate": 0.00014759377741218298, + "loss": 0.0203, + "step": 2596 + }, + { + "epoch": 0.34309872180202794, + "grad_norm": 0.153628870844841, + "learning_rate": 0.00014755722131666962, + "loss": 0.0165, + "step": 2597 + }, + { + "epoch": 0.34323083528751197, + "grad_norm": 0.15713736414909363, + "learning_rate": 0.00014752065700653546, + "loss": 0.0208, + "step": 2598 + }, + { + "epoch": 0.343362948772996, + "grad_norm": 0.18878145515918732, + "learning_rate": 0.00014748408448809631, + "loss": 0.0267, + "step": 2599 + }, + { + "epoch": 0.34349506225848003, + "grad_norm": 0.21092022955417633, + "learning_rate": 0.0001474475037676694, + "loss": 0.023, + "step": 2600 + }, + { + "epoch": 0.34362717574396406, + "grad_norm": 0.22435013949871063, + "learning_rate": 0.00014741091485157335, + "loss": 0.0242, + "step": 2601 + }, + { + "epoch": 0.3437592892294481, + "grad_norm": 0.24122808873653412, + "learning_rate": 0.0001473743177461282, + "loss": 0.0226, + "step": 2602 + }, + { + "epoch": 0.3438914027149321, + "grad_norm": 0.28979021310806274, + "learning_rate": 0.00014733771245765544, + "loss": 0.0236, + "step": 2603 + }, + { + "epoch": 0.34402351620041616, + "grad_norm": 0.1095261350274086, + "learning_rate": 0.00014730109899247794, + "loss": 0.0104, + "step": 2604 + }, + { + "epoch": 0.3441556296859002, + "grad_norm": 0.2334039807319641, + "learning_rate": 0.00014726447735692, + "loss": 0.0262, + "step": 2605 + }, + { + "epoch": 0.3442877431713842, + "grad_norm": 0.144916832447052, + "learning_rate": 0.00014722784755730732, + "loss": 0.0168, + "step": 2606 + }, + { + "epoch": 0.34441985665686825, + "grad_norm": 0.1582469791173935, + "learning_rate": 0.000147191209599967, + "loss": 0.018, + "step": 2607 + }, + { + "epoch": 0.3445519701423523, + "grad_norm": 0.19044512510299683, + "learning_rate": 0.00014715456349122754, + "loss": 0.0261, + "step": 2608 + }, + { + "epoch": 0.3446840836278363, + "grad_norm": 0.15183739364147186, + "learning_rate": 0.00014711790923741894, + "loss": 0.0181, + "step": 2609 + }, + { + "epoch": 0.34481619711332034, + "grad_norm": 0.580545961856842, + "learning_rate": 0.00014708124684487245, + "loss": 0.0423, + "step": 2610 + }, + { + "epoch": 0.3449483105988044, + "grad_norm": 0.19683608412742615, + "learning_rate": 0.00014704457631992091, + "loss": 0.0108, + "step": 2611 + }, + { + "epoch": 0.3450804240842884, + "grad_norm": 0.2760066092014313, + "learning_rate": 0.00014700789766889836, + "loss": 0.0229, + "step": 2612 + }, + { + "epoch": 0.34521253756977244, + "grad_norm": 0.310825914144516, + "learning_rate": 0.00014697121089814042, + "loss": 0.0274, + "step": 2613 + }, + { + "epoch": 0.34534465105525647, + "grad_norm": 0.28714632987976074, + "learning_rate": 0.00014693451601398408, + "loss": 0.025, + "step": 2614 + }, + { + "epoch": 0.3454767645407405, + "grad_norm": 0.15742076933383942, + "learning_rate": 0.0001468978130227676, + "loss": 0.0214, + "step": 2615 + }, + { + "epoch": 0.34560887802622453, + "grad_norm": 0.24934737384319305, + "learning_rate": 0.0001468611019308308, + "loss": 0.0278, + "step": 2616 + }, + { + "epoch": 0.34574099151170856, + "grad_norm": 0.18599407374858856, + "learning_rate": 0.0001468243827445148, + "loss": 0.0309, + "step": 2617 + }, + { + "epoch": 0.3458731049971926, + "grad_norm": 0.13141511380672455, + "learning_rate": 0.0001467876554701622, + "loss": 0.0152, + "step": 2618 + }, + { + "epoch": 0.3460052184826766, + "grad_norm": 0.4108513295650482, + "learning_rate": 0.00014675092011411689, + "loss": 0.028, + "step": 2619 + }, + { + "epoch": 0.34613733196816066, + "grad_norm": 0.1940169632434845, + "learning_rate": 0.00014671417668272424, + "loss": 0.0333, + "step": 2620 + }, + { + "epoch": 0.3462694454536447, + "grad_norm": 0.24660934507846832, + "learning_rate": 0.00014667742518233103, + "loss": 0.0296, + "step": 2621 + }, + { + "epoch": 0.3464015589391287, + "grad_norm": 0.31359758973121643, + "learning_rate": 0.00014664066561928532, + "loss": 0.0197, + "step": 2622 + }, + { + "epoch": 0.34653367242461275, + "grad_norm": 0.24189671874046326, + "learning_rate": 0.00014660389799993673, + "loss": 0.0292, + "step": 2623 + }, + { + "epoch": 0.3466657859100968, + "grad_norm": 0.17720560729503632, + "learning_rate": 0.00014656712233063608, + "loss": 0.0225, + "step": 2624 + }, + { + "epoch": 0.3467978993955808, + "grad_norm": 0.2419784665107727, + "learning_rate": 0.00014653033861773573, + "loss": 0.0338, + "step": 2625 + }, + { + "epoch": 0.34693001288106484, + "grad_norm": 0.202085480093956, + "learning_rate": 0.0001464935468675894, + "loss": 0.0251, + "step": 2626 + }, + { + "epoch": 0.3470621263665489, + "grad_norm": 0.18005435168743134, + "learning_rate": 0.00014645674708655212, + "loss": 0.0188, + "step": 2627 + }, + { + "epoch": 0.3471942398520329, + "grad_norm": 0.2608417272567749, + "learning_rate": 0.00014641993928098042, + "loss": 0.0249, + "step": 2628 + }, + { + "epoch": 0.34732635333751694, + "grad_norm": 0.3798268437385559, + "learning_rate": 0.0001463831234572321, + "loss": 0.0262, + "step": 2629 + }, + { + "epoch": 0.34745846682300097, + "grad_norm": 0.189011350274086, + "learning_rate": 0.0001463462996216665, + "loss": 0.0191, + "step": 2630 + }, + { + "epoch": 0.347590580308485, + "grad_norm": 0.26774030923843384, + "learning_rate": 0.00014630946778064415, + "loss": 0.027, + "step": 2631 + }, + { + "epoch": 0.34772269379396903, + "grad_norm": 0.25474798679351807, + "learning_rate": 0.0001462726279405271, + "loss": 0.0317, + "step": 2632 + }, + { + "epoch": 0.34785480727945306, + "grad_norm": 0.2341891974210739, + "learning_rate": 0.00014623578010767874, + "loss": 0.0268, + "step": 2633 + }, + { + "epoch": 0.3479869207649371, + "grad_norm": 0.18379223346710205, + "learning_rate": 0.00014619892428846388, + "loss": 0.0248, + "step": 2634 + }, + { + "epoch": 0.3481190342504211, + "grad_norm": 0.21996241807937622, + "learning_rate": 0.00014616206048924862, + "loss": 0.0211, + "step": 2635 + }, + { + "epoch": 0.34825114773590515, + "grad_norm": 0.16686783730983734, + "learning_rate": 0.00014612518871640049, + "loss": 0.0264, + "step": 2636 + }, + { + "epoch": 0.3483832612213892, + "grad_norm": 0.20649857819080353, + "learning_rate": 0.00014608830897628846, + "loss": 0.0211, + "step": 2637 + }, + { + "epoch": 0.3485153747068732, + "grad_norm": 0.2630440890789032, + "learning_rate": 0.00014605142127528277, + "loss": 0.0245, + "step": 2638 + }, + { + "epoch": 0.34864748819235725, + "grad_norm": 0.1711365431547165, + "learning_rate": 0.0001460145256197551, + "loss": 0.0101, + "step": 2639 + }, + { + "epoch": 0.3487796016778413, + "grad_norm": 0.1501888632774353, + "learning_rate": 0.0001459776220160785, + "loss": 0.0122, + "step": 2640 + }, + { + "epoch": 0.3489117151633253, + "grad_norm": 0.3269023895263672, + "learning_rate": 0.0001459407104706273, + "loss": 0.0533, + "step": 2641 + }, + { + "epoch": 0.34904382864880934, + "grad_norm": 0.12305083870887756, + "learning_rate": 0.00014590379098977736, + "loss": 0.0146, + "step": 2642 + }, + { + "epoch": 0.34917594213429337, + "grad_norm": 0.5508819222450256, + "learning_rate": 0.00014586686357990578, + "loss": 0.024, + "step": 2643 + }, + { + "epoch": 0.3493080556197774, + "grad_norm": 0.15577666461467743, + "learning_rate": 0.00014582992824739113, + "loss": 0.0169, + "step": 2644 + }, + { + "epoch": 0.34944016910526143, + "grad_norm": 0.3088664710521698, + "learning_rate": 0.00014579298499861325, + "loss": 0.0216, + "step": 2645 + }, + { + "epoch": 0.34957228259074546, + "grad_norm": 0.20767951011657715, + "learning_rate": 0.00014575603383995344, + "loss": 0.024, + "step": 2646 + }, + { + "epoch": 0.3497043960762295, + "grad_norm": 0.2066233605146408, + "learning_rate": 0.0001457190747777943, + "loss": 0.0252, + "step": 2647 + }, + { + "epoch": 0.3498365095617135, + "grad_norm": 0.3334047198295593, + "learning_rate": 0.00014568210781851977, + "loss": 0.0488, + "step": 2648 + }, + { + "epoch": 0.34996862304719756, + "grad_norm": 0.272562175989151, + "learning_rate": 0.0001456451329685153, + "loss": 0.0285, + "step": 2649 + }, + { + "epoch": 0.3501007365326816, + "grad_norm": 0.2628403902053833, + "learning_rate": 0.0001456081502341675, + "loss": 0.0179, + "step": 2650 + }, + { + "epoch": 0.3502328500181656, + "grad_norm": 0.26784005761146545, + "learning_rate": 0.00014557115962186452, + "loss": 0.0237, + "step": 2651 + }, + { + "epoch": 0.35036496350364965, + "grad_norm": 0.2535072863101959, + "learning_rate": 0.00014553416113799575, + "loss": 0.035, + "step": 2652 + }, + { + "epoch": 0.3504970769891337, + "grad_norm": 0.14403533935546875, + "learning_rate": 0.00014549715478895202, + "loss": 0.0223, + "step": 2653 + }, + { + "epoch": 0.3506291904746177, + "grad_norm": 0.17470040917396545, + "learning_rate": 0.00014546014058112552, + "loss": 0.024, + "step": 2654 + }, + { + "epoch": 0.35076130396010174, + "grad_norm": 0.29080143570899963, + "learning_rate": 0.00014542311852090963, + "loss": 0.0245, + "step": 2655 + }, + { + "epoch": 0.3508934174455858, + "grad_norm": 0.17105811834335327, + "learning_rate": 0.00014538608861469938, + "loss": 0.0211, + "step": 2656 + }, + { + "epoch": 0.3510255309310698, + "grad_norm": 0.14506390690803528, + "learning_rate": 0.00014534905086889085, + "loss": 0.0197, + "step": 2657 + }, + { + "epoch": 0.35115764441655384, + "grad_norm": 0.3682821989059448, + "learning_rate": 0.00014531200528988174, + "loss": 0.0184, + "step": 2658 + }, + { + "epoch": 0.35128975790203787, + "grad_norm": 0.2343844622373581, + "learning_rate": 0.0001452749518840709, + "loss": 0.0227, + "step": 2659 + }, + { + "epoch": 0.3514218713875219, + "grad_norm": 0.22336915135383606, + "learning_rate": 0.00014523789065785866, + "loss": 0.0274, + "step": 2660 + }, + { + "epoch": 0.35155398487300593, + "grad_norm": 0.2997768521308899, + "learning_rate": 0.00014520082161764665, + "loss": 0.0301, + "step": 2661 + }, + { + "epoch": 0.35168609835848996, + "grad_norm": 0.2349299192428589, + "learning_rate": 0.00014516374476983775, + "loss": 0.0219, + "step": 2662 + }, + { + "epoch": 0.351818211843974, + "grad_norm": 0.17483294010162354, + "learning_rate": 0.00014512666012083644, + "loss": 0.0166, + "step": 2663 + }, + { + "epoch": 0.351950325329458, + "grad_norm": 0.18980465829372406, + "learning_rate": 0.00014508956767704835, + "loss": 0.0191, + "step": 2664 + }, + { + "epoch": 0.35208243881494206, + "grad_norm": 0.22129544615745544, + "learning_rate": 0.00014505246744488047, + "loss": 0.0225, + "step": 2665 + }, + { + "epoch": 0.3522145523004261, + "grad_norm": 0.2243020087480545, + "learning_rate": 0.00014501535943074124, + "loss": 0.0411, + "step": 2666 + }, + { + "epoch": 0.3523466657859101, + "grad_norm": 0.15792229771614075, + "learning_rate": 0.0001449782436410403, + "loss": 0.0218, + "step": 2667 + }, + { + "epoch": 0.35247877927139415, + "grad_norm": 0.23417407274246216, + "learning_rate": 0.00014494112008218873, + "loss": 0.0248, + "step": 2668 + }, + { + "epoch": 0.3526108927568782, + "grad_norm": 0.16839639842510223, + "learning_rate": 0.00014490398876059897, + "loss": 0.012, + "step": 2669 + }, + { + "epoch": 0.3527430062423622, + "grad_norm": 0.2418825328350067, + "learning_rate": 0.0001448668496826847, + "loss": 0.0211, + "step": 2670 + }, + { + "epoch": 0.35287511972784624, + "grad_norm": 0.17806826531887054, + "learning_rate": 0.00014482970285486108, + "loss": 0.0133, + "step": 2671 + }, + { + "epoch": 0.3530072332133303, + "grad_norm": 0.2244100570678711, + "learning_rate": 0.00014479254828354444, + "loss": 0.0206, + "step": 2672 + }, + { + "epoch": 0.3531393466988143, + "grad_norm": 0.1828577071428299, + "learning_rate": 0.00014475538597515263, + "loss": 0.0195, + "step": 2673 + }, + { + "epoch": 0.35327146018429834, + "grad_norm": 0.1585683524608612, + "learning_rate": 0.00014471821593610467, + "loss": 0.0166, + "step": 2674 + }, + { + "epoch": 0.35340357366978237, + "grad_norm": 0.18276363611221313, + "learning_rate": 0.00014468103817282102, + "loss": 0.0211, + "step": 2675 + }, + { + "epoch": 0.3535356871552664, + "grad_norm": 0.22013506293296814, + "learning_rate": 0.00014464385269172343, + "loss": 0.0219, + "step": 2676 + }, + { + "epoch": 0.35366780064075043, + "grad_norm": 0.3715534508228302, + "learning_rate": 0.000144606659499235, + "loss": 0.0198, + "step": 2677 + }, + { + "epoch": 0.35379991412623446, + "grad_norm": 0.17348001897335052, + "learning_rate": 0.00014456945860178019, + "loss": 0.0226, + "step": 2678 + }, + { + "epoch": 0.3539320276117185, + "grad_norm": 0.16937151551246643, + "learning_rate": 0.00014453225000578472, + "loss": 0.021, + "step": 2679 + }, + { + "epoch": 0.3540641410972025, + "grad_norm": 0.17919118702411652, + "learning_rate": 0.0001444950337176757, + "loss": 0.0254, + "step": 2680 + }, + { + "epoch": 0.35419625458268655, + "grad_norm": 0.3098297119140625, + "learning_rate": 0.00014445780974388153, + "loss": 0.0531, + "step": 2681 + }, + { + "epoch": 0.3543283680681706, + "grad_norm": 0.1814904361963272, + "learning_rate": 0.00014442057809083196, + "loss": 0.0242, + "step": 2682 + }, + { + "epoch": 0.3544604815536546, + "grad_norm": 0.20038634538650513, + "learning_rate": 0.00014438333876495808, + "loss": 0.027, + "step": 2683 + }, + { + "epoch": 0.35459259503913865, + "grad_norm": 0.2531159222126007, + "learning_rate": 0.00014434609177269226, + "loss": 0.033, + "step": 2684 + }, + { + "epoch": 0.3547247085246227, + "grad_norm": 0.18288713693618774, + "learning_rate": 0.00014430883712046827, + "loss": 0.023, + "step": 2685 + }, + { + "epoch": 0.35485682201010665, + "grad_norm": 0.2389320433139801, + "learning_rate": 0.00014427157481472112, + "loss": 0.024, + "step": 2686 + }, + { + "epoch": 0.3549889354955907, + "grad_norm": 0.15338629484176636, + "learning_rate": 0.00014423430486188715, + "loss": 0.0093, + "step": 2687 + }, + { + "epoch": 0.3551210489810747, + "grad_norm": 0.28040778636932373, + "learning_rate": 0.00014419702726840408, + "loss": 0.036, + "step": 2688 + }, + { + "epoch": 0.35525316246655875, + "grad_norm": 0.1844131052494049, + "learning_rate": 0.00014415974204071093, + "loss": 0.0268, + "step": 2689 + }, + { + "epoch": 0.3553852759520428, + "grad_norm": 0.23858432471752167, + "learning_rate": 0.00014412244918524797, + "loss": 0.0274, + "step": 2690 + }, + { + "epoch": 0.3555173894375268, + "grad_norm": 0.22725573182106018, + "learning_rate": 0.0001440851487084569, + "loss": 0.0231, + "step": 2691 + }, + { + "epoch": 0.35564950292301084, + "grad_norm": 0.2215961068868637, + "learning_rate": 0.0001440478406167807, + "loss": 0.0254, + "step": 2692 + }, + { + "epoch": 0.35578161640849487, + "grad_norm": 0.26224878430366516, + "learning_rate": 0.00014401052491666357, + "loss": 0.0309, + "step": 2693 + }, + { + "epoch": 0.3559137298939789, + "grad_norm": 0.18535034358501434, + "learning_rate": 0.00014397320161455116, + "loss": 0.0211, + "step": 2694 + }, + { + "epoch": 0.35604584337946293, + "grad_norm": 0.24484148621559143, + "learning_rate": 0.00014393587071689034, + "loss": 0.0345, + "step": 2695 + }, + { + "epoch": 0.35617795686494697, + "grad_norm": 0.17378197610378265, + "learning_rate": 0.00014389853223012935, + "loss": 0.0159, + "step": 2696 + }, + { + "epoch": 0.356310070350431, + "grad_norm": 0.28994354605674744, + "learning_rate": 0.00014386118616071768, + "loss": 0.0461, + "step": 2697 + }, + { + "epoch": 0.356442183835915, + "grad_norm": 0.2110566794872284, + "learning_rate": 0.00014382383251510618, + "loss": 0.0195, + "step": 2698 + }, + { + "epoch": 0.35657429732139906, + "grad_norm": 0.23441585898399353, + "learning_rate": 0.00014378647129974703, + "loss": 0.0268, + "step": 2699 + }, + { + "epoch": 0.3567064108068831, + "grad_norm": 0.3104618191719055, + "learning_rate": 0.00014374910252109362, + "loss": 0.0259, + "step": 2700 + }, + { + "epoch": 0.3568385242923671, + "grad_norm": 0.21284493803977966, + "learning_rate": 0.00014371172618560073, + "loss": 0.0296, + "step": 2701 + }, + { + "epoch": 0.35697063777785115, + "grad_norm": 0.1908929944038391, + "learning_rate": 0.00014367434229972445, + "loss": 0.0269, + "step": 2702 + }, + { + "epoch": 0.3571027512633352, + "grad_norm": 0.21304036676883698, + "learning_rate": 0.00014363695086992215, + "loss": 0.0278, + "step": 2703 + }, + { + "epoch": 0.3572348647488192, + "grad_norm": 0.14423377811908722, + "learning_rate": 0.00014359955190265246, + "loss": 0.0217, + "step": 2704 + }, + { + "epoch": 0.35736697823430325, + "grad_norm": 0.2462548464536667, + "learning_rate": 0.00014356214540437535, + "loss": 0.0264, + "step": 2705 + }, + { + "epoch": 0.3574990917197873, + "grad_norm": 0.21595892310142517, + "learning_rate": 0.00014352473138155215, + "loss": 0.0247, + "step": 2706 + }, + { + "epoch": 0.3576312052052713, + "grad_norm": 0.2566458582878113, + "learning_rate": 0.00014348730984064539, + "loss": 0.0322, + "step": 2707 + }, + { + "epoch": 0.35776331869075534, + "grad_norm": 0.22518129646778107, + "learning_rate": 0.00014344988078811893, + "loss": 0.0161, + "step": 2708 + }, + { + "epoch": 0.35789543217623937, + "grad_norm": 0.14789043366909027, + "learning_rate": 0.00014341244423043796, + "loss": 0.0127, + "step": 2709 + }, + { + "epoch": 0.3580275456617234, + "grad_norm": 0.21939575672149658, + "learning_rate": 0.00014337500017406899, + "loss": 0.0213, + "step": 2710 + }, + { + "epoch": 0.35815965914720743, + "grad_norm": 0.31216418743133545, + "learning_rate": 0.00014333754862547968, + "loss": 0.0146, + "step": 2711 + }, + { + "epoch": 0.35829177263269146, + "grad_norm": 0.20372824370861053, + "learning_rate": 0.00014330008959113915, + "loss": 0.0254, + "step": 2712 + }, + { + "epoch": 0.3584238861181755, + "grad_norm": 0.33901652693748474, + "learning_rate": 0.00014326262307751773, + "loss": 0.0356, + "step": 2713 + }, + { + "epoch": 0.3585559996036595, + "grad_norm": 0.2312634438276291, + "learning_rate": 0.00014322514909108708, + "loss": 0.0278, + "step": 2714 + }, + { + "epoch": 0.35868811308914356, + "grad_norm": 0.3031659722328186, + "learning_rate": 0.0001431876676383201, + "loss": 0.0255, + "step": 2715 + }, + { + "epoch": 0.3588202265746276, + "grad_norm": 0.1266086995601654, + "learning_rate": 0.00014315017872569105, + "loss": 0.0165, + "step": 2716 + }, + { + "epoch": 0.3589523400601116, + "grad_norm": 0.16536574065685272, + "learning_rate": 0.00014311268235967544, + "loss": 0.0149, + "step": 2717 + }, + { + "epoch": 0.35908445354559565, + "grad_norm": 0.12575633823871613, + "learning_rate": 0.00014307517854675, + "loss": 0.0128, + "step": 2718 + }, + { + "epoch": 0.3592165670310797, + "grad_norm": 0.12982703745365143, + "learning_rate": 0.00014303766729339284, + "loss": 0.0121, + "step": 2719 + }, + { + "epoch": 0.3593486805165637, + "grad_norm": 0.18847040832042694, + "learning_rate": 0.0001430001486060834, + "loss": 0.0125, + "step": 2720 + }, + { + "epoch": 0.35948079400204774, + "grad_norm": 0.1142876073718071, + "learning_rate": 0.00014296262249130224, + "loss": 0.0091, + "step": 2721 + }, + { + "epoch": 0.3596129074875318, + "grad_norm": 0.18693424761295319, + "learning_rate": 0.0001429250889555313, + "loss": 0.0243, + "step": 2722 + }, + { + "epoch": 0.3597450209730158, + "grad_norm": 0.22312524914741516, + "learning_rate": 0.00014288754800525385, + "loss": 0.0262, + "step": 2723 + }, + { + "epoch": 0.35987713445849984, + "grad_norm": 0.2658948600292206, + "learning_rate": 0.00014284999964695437, + "loss": 0.0195, + "step": 2724 + }, + { + "epoch": 0.36000924794398387, + "grad_norm": 0.3503406345844269, + "learning_rate": 0.00014281244388711863, + "loss": 0.0273, + "step": 2725 + }, + { + "epoch": 0.3601413614294679, + "grad_norm": 0.22179050743579865, + "learning_rate": 0.0001427748807322337, + "loss": 0.018, + "step": 2726 + }, + { + "epoch": 0.36027347491495193, + "grad_norm": 0.18278402090072632, + "learning_rate": 0.00014273731018878784, + "loss": 0.0164, + "step": 2727 + }, + { + "epoch": 0.36040558840043596, + "grad_norm": 0.16992351412773132, + "learning_rate": 0.00014269973226327078, + "loss": 0.0191, + "step": 2728 + }, + { + "epoch": 0.36053770188592, + "grad_norm": 0.16629379987716675, + "learning_rate": 0.0001426621469621733, + "loss": 0.0215, + "step": 2729 + }, + { + "epoch": 0.360669815371404, + "grad_norm": 0.211905837059021, + "learning_rate": 0.00014262455429198763, + "loss": 0.0218, + "step": 2730 + }, + { + "epoch": 0.36080192885688805, + "grad_norm": 0.12857602536678314, + "learning_rate": 0.00014258695425920713, + "loss": 0.0106, + "step": 2731 + }, + { + "epoch": 0.3609340423423721, + "grad_norm": 0.1673976629972458, + "learning_rate": 0.00014254934687032658, + "loss": 0.0162, + "step": 2732 + }, + { + "epoch": 0.3610661558278561, + "grad_norm": 0.1890508532524109, + "learning_rate": 0.0001425117321318419, + "loss": 0.0189, + "step": 2733 + }, + { + "epoch": 0.36119826931334015, + "grad_norm": 0.24128776788711548, + "learning_rate": 0.00014247411005025034, + "loss": 0.0184, + "step": 2734 + }, + { + "epoch": 0.3613303827988242, + "grad_norm": 0.264330118894577, + "learning_rate": 0.00014243648063205042, + "loss": 0.0254, + "step": 2735 + }, + { + "epoch": 0.3614624962843082, + "grad_norm": 0.18204659223556519, + "learning_rate": 0.0001423988438837419, + "loss": 0.0303, + "step": 2736 + }, + { + "epoch": 0.36159460976979224, + "grad_norm": 0.49082961678504944, + "learning_rate": 0.00014236119981182589, + "loss": 0.0168, + "step": 2737 + }, + { + "epoch": 0.3617267232552763, + "grad_norm": 0.228216752409935, + "learning_rate": 0.0001423235484228046, + "loss": 0.0227, + "step": 2738 + }, + { + "epoch": 0.3618588367407603, + "grad_norm": 0.24391178786754608, + "learning_rate": 0.00014228588972318168, + "loss": 0.0323, + "step": 2739 + }, + { + "epoch": 0.36199095022624433, + "grad_norm": 0.20811639726161957, + "learning_rate": 0.00014224822371946194, + "loss": 0.0255, + "step": 2740 + }, + { + "epoch": 0.36212306371172837, + "grad_norm": 0.21693377196788788, + "learning_rate": 0.00014221055041815147, + "loss": 0.0252, + "step": 2741 + }, + { + "epoch": 0.3622551771972124, + "grad_norm": 0.21688027679920197, + "learning_rate": 0.00014217286982575765, + "loss": 0.0359, + "step": 2742 + }, + { + "epoch": 0.36238729068269643, + "grad_norm": 0.1932186633348465, + "learning_rate": 0.0001421351819487891, + "loss": 0.0208, + "step": 2743 + }, + { + "epoch": 0.36251940416818046, + "grad_norm": 0.17899347841739655, + "learning_rate": 0.00014209748679375566, + "loss": 0.0324, + "step": 2744 + }, + { + "epoch": 0.3626515176536645, + "grad_norm": 0.20702287554740906, + "learning_rate": 0.00014205978436716848, + "loss": 0.0204, + "step": 2745 + }, + { + "epoch": 0.3627836311391485, + "grad_norm": 0.18478304147720337, + "learning_rate": 0.00014202207467553995, + "loss": 0.0143, + "step": 2746 + }, + { + "epoch": 0.36291574462463255, + "grad_norm": 0.2355339080095291, + "learning_rate": 0.0001419843577253837, + "loss": 0.0328, + "step": 2747 + }, + { + "epoch": 0.3630478581101166, + "grad_norm": 0.17565876245498657, + "learning_rate": 0.0001419466335232147, + "loss": 0.0218, + "step": 2748 + }, + { + "epoch": 0.3631799715956006, + "grad_norm": 0.35660335421562195, + "learning_rate": 0.00014190890207554902, + "loss": 0.0225, + "step": 2749 + }, + { + "epoch": 0.36331208508108465, + "grad_norm": 0.17033158242702484, + "learning_rate": 0.0001418711633889041, + "loss": 0.0164, + "step": 2750 + }, + { + "epoch": 0.3634441985665687, + "grad_norm": 0.17524130642414093, + "learning_rate": 0.00014183341746979857, + "loss": 0.0185, + "step": 2751 + }, + { + "epoch": 0.3635763120520527, + "grad_norm": 0.23012326657772064, + "learning_rate": 0.0001417956643247523, + "loss": 0.0256, + "step": 2752 + }, + { + "epoch": 0.36370842553753674, + "grad_norm": 0.3006463646888733, + "learning_rate": 0.00014175790396028657, + "loss": 0.0366, + "step": 2753 + }, + { + "epoch": 0.36384053902302077, + "grad_norm": 0.16686567664146423, + "learning_rate": 0.00014172013638292366, + "loss": 0.0114, + "step": 2754 + }, + { + "epoch": 0.3639726525085048, + "grad_norm": 0.2867545783519745, + "learning_rate": 0.0001416823615991872, + "loss": 0.0168, + "step": 2755 + }, + { + "epoch": 0.36410476599398883, + "grad_norm": 0.2075122892856598, + "learning_rate": 0.00014164457961560217, + "loss": 0.0168, + "step": 2756 + }, + { + "epoch": 0.36423687947947286, + "grad_norm": 0.13762469589710236, + "learning_rate": 0.0001416067904386946, + "loss": 0.0065, + "step": 2757 + }, + { + "epoch": 0.3643689929649569, + "grad_norm": 0.17714820802211761, + "learning_rate": 0.00014156899407499196, + "loss": 0.0176, + "step": 2758 + }, + { + "epoch": 0.3645011064504409, + "grad_norm": 0.37875422835350037, + "learning_rate": 0.0001415311905310228, + "loss": 0.0254, + "step": 2759 + }, + { + "epoch": 0.36463321993592496, + "grad_norm": 0.23140572011470795, + "learning_rate": 0.000141493379813317, + "loss": 0.0231, + "step": 2760 + }, + { + "epoch": 0.364765333421409, + "grad_norm": 0.1881604641675949, + "learning_rate": 0.00014145556192840566, + "loss": 0.0206, + "step": 2761 + }, + { + "epoch": 0.364897446906893, + "grad_norm": 0.2795705795288086, + "learning_rate": 0.00014141773688282108, + "loss": 0.0337, + "step": 2762 + }, + { + "epoch": 0.36502956039237705, + "grad_norm": 0.3115374445915222, + "learning_rate": 0.0001413799046830969, + "loss": 0.0386, + "step": 2763 + }, + { + "epoch": 0.3651616738778611, + "grad_norm": 0.22487610578536987, + "learning_rate": 0.0001413420653357678, + "loss": 0.0394, + "step": 2764 + }, + { + "epoch": 0.3652937873633451, + "grad_norm": 0.3041042387485504, + "learning_rate": 0.00014130421884736997, + "loss": 0.0266, + "step": 2765 + }, + { + "epoch": 0.36542590084882914, + "grad_norm": 0.30442002415657043, + "learning_rate": 0.00014126636522444056, + "loss": 0.0325, + "step": 2766 + }, + { + "epoch": 0.3655580143343132, + "grad_norm": 0.23418278992176056, + "learning_rate": 0.00014122850447351816, + "loss": 0.0268, + "step": 2767 + }, + { + "epoch": 0.3656901278197972, + "grad_norm": 0.25879621505737305, + "learning_rate": 0.00014119063660114246, + "loss": 0.0317, + "step": 2768 + }, + { + "epoch": 0.36582224130528124, + "grad_norm": 0.1678416132926941, + "learning_rate": 0.00014115276161385444, + "loss": 0.0115, + "step": 2769 + }, + { + "epoch": 0.36595435479076527, + "grad_norm": 0.1974228322505951, + "learning_rate": 0.0001411148795181963, + "loss": 0.0336, + "step": 2770 + }, + { + "epoch": 0.3660864682762493, + "grad_norm": 0.18305741250514984, + "learning_rate": 0.00014107699032071144, + "loss": 0.0148, + "step": 2771 + }, + { + "epoch": 0.36621858176173333, + "grad_norm": 0.24715334177017212, + "learning_rate": 0.00014103909402794456, + "loss": 0.022, + "step": 2772 + }, + { + "epoch": 0.36635069524721736, + "grad_norm": 0.25955766439437866, + "learning_rate": 0.00014100119064644148, + "loss": 0.0381, + "step": 2773 + }, + { + "epoch": 0.3664828087327014, + "grad_norm": 0.14129871129989624, + "learning_rate": 0.00014096328018274937, + "loss": 0.01, + "step": 2774 + }, + { + "epoch": 0.3666149222181854, + "grad_norm": 0.17851941287517548, + "learning_rate": 0.00014092536264341646, + "loss": 0.0146, + "step": 2775 + }, + { + "epoch": 0.36674703570366946, + "grad_norm": 0.13794377446174622, + "learning_rate": 0.00014088743803499236, + "loss": 0.0108, + "step": 2776 + }, + { + "epoch": 0.3668791491891535, + "grad_norm": 0.2250049114227295, + "learning_rate": 0.00014084950636402782, + "loss": 0.0031, + "step": 2777 + }, + { + "epoch": 0.3670112626746375, + "grad_norm": 0.20712882280349731, + "learning_rate": 0.00014081156763707484, + "loss": 0.0224, + "step": 2778 + }, + { + "epoch": 0.36714337616012155, + "grad_norm": 0.15495552122592926, + "learning_rate": 0.00014077362186068658, + "loss": 0.0232, + "step": 2779 + }, + { + "epoch": 0.3672754896456056, + "grad_norm": 0.1719774305820465, + "learning_rate": 0.00014073566904141755, + "loss": 0.0231, + "step": 2780 + }, + { + "epoch": 0.3674076031310896, + "grad_norm": 0.35171160101890564, + "learning_rate": 0.0001406977091858233, + "loss": 0.0271, + "step": 2781 + }, + { + "epoch": 0.36753971661657364, + "grad_norm": 0.16442163288593292, + "learning_rate": 0.00014065974230046075, + "loss": 0.0215, + "step": 2782 + }, + { + "epoch": 0.3676718301020577, + "grad_norm": 0.18466730415821075, + "learning_rate": 0.00014062176839188794, + "loss": 0.0191, + "step": 2783 + }, + { + "epoch": 0.3678039435875417, + "grad_norm": 0.1335364282131195, + "learning_rate": 0.00014058378746666417, + "loss": 0.0141, + "step": 2784 + }, + { + "epoch": 0.36793605707302574, + "grad_norm": 0.21508878469467163, + "learning_rate": 0.0001405457995313499, + "loss": 0.0217, + "step": 2785 + }, + { + "epoch": 0.36806817055850977, + "grad_norm": 0.17799223959445953, + "learning_rate": 0.00014050780459250685, + "loss": 0.0121, + "step": 2786 + }, + { + "epoch": 0.3682002840439938, + "grad_norm": 0.2416561394929886, + "learning_rate": 0.00014046980265669797, + "loss": 0.0308, + "step": 2787 + }, + { + "epoch": 0.36833239752947783, + "grad_norm": 0.2069653421640396, + "learning_rate": 0.0001404317937304873, + "loss": 0.018, + "step": 2788 + }, + { + "epoch": 0.36846451101496186, + "grad_norm": 0.15654677152633667, + "learning_rate": 0.0001403937778204403, + "loss": 0.0148, + "step": 2789 + }, + { + "epoch": 0.3685966245004459, + "grad_norm": 0.20482538640499115, + "learning_rate": 0.00014035575493312341, + "loss": 0.0322, + "step": 2790 + }, + { + "epoch": 0.3687287379859299, + "grad_norm": 0.29183149337768555, + "learning_rate": 0.0001403177250751044, + "loss": 0.0367, + "step": 2791 + }, + { + "epoch": 0.36886085147141395, + "grad_norm": 0.19543787837028503, + "learning_rate": 0.00014027968825295218, + "loss": 0.0246, + "step": 2792 + }, + { + "epoch": 0.368992964956898, + "grad_norm": 0.19467125833034515, + "learning_rate": 0.00014024164447323697, + "loss": 0.0296, + "step": 2793 + }, + { + "epoch": 0.369125078442382, + "grad_norm": 0.19719550013542175, + "learning_rate": 0.00014020359374253006, + "loss": 0.0207, + "step": 2794 + }, + { + "epoch": 0.36925719192786605, + "grad_norm": 0.20642030239105225, + "learning_rate": 0.00014016553606740405, + "loss": 0.0233, + "step": 2795 + }, + { + "epoch": 0.3693893054133501, + "grad_norm": 0.17306675016880035, + "learning_rate": 0.00014012747145443269, + "loss": 0.0228, + "step": 2796 + }, + { + "epoch": 0.3695214188988341, + "grad_norm": 0.19165189564228058, + "learning_rate": 0.00014008939991019085, + "loss": 0.0146, + "step": 2797 + }, + { + "epoch": 0.36965353238431814, + "grad_norm": 0.35364142060279846, + "learning_rate": 0.0001400513214412548, + "loss": 0.0464, + "step": 2798 + }, + { + "epoch": 0.36978564586980217, + "grad_norm": 0.16079017519950867, + "learning_rate": 0.0001400132360542018, + "loss": 0.016, + "step": 2799 + }, + { + "epoch": 0.3699177593552862, + "grad_norm": 0.2617528736591339, + "learning_rate": 0.0001399751437556104, + "loss": 0.0185, + "step": 2800 + }, + { + "epoch": 0.37004987284077023, + "grad_norm": 0.23893366754055023, + "learning_rate": 0.00013993704455206034, + "loss": 0.027, + "step": 2801 + }, + { + "epoch": 0.37018198632625426, + "grad_norm": 0.24953344464302063, + "learning_rate": 0.00013989893845013255, + "loss": 0.0291, + "step": 2802 + }, + { + "epoch": 0.3703140998117383, + "grad_norm": 0.27270838618278503, + "learning_rate": 0.00013986082545640915, + "loss": 0.043, + "step": 2803 + }, + { + "epoch": 0.3704462132972223, + "grad_norm": 0.29569360613822937, + "learning_rate": 0.00013982270557747343, + "loss": 0.0282, + "step": 2804 + }, + { + "epoch": 0.37057832678270636, + "grad_norm": 0.25201651453971863, + "learning_rate": 0.00013978457881990992, + "loss": 0.0146, + "step": 2805 + }, + { + "epoch": 0.3707104402681904, + "grad_norm": 0.17621220648288727, + "learning_rate": 0.0001397464451903043, + "loss": 0.0171, + "step": 2806 + }, + { + "epoch": 0.3708425537536744, + "grad_norm": 0.2043362408876419, + "learning_rate": 0.00013970830469524337, + "loss": 0.0305, + "step": 2807 + }, + { + "epoch": 0.37097466723915845, + "grad_norm": 0.1952420324087143, + "learning_rate": 0.00013967015734131527, + "loss": 0.0134, + "step": 2808 + }, + { + "epoch": 0.3711067807246425, + "grad_norm": 0.2854112386703491, + "learning_rate": 0.0001396320031351092, + "loss": 0.0349, + "step": 2809 + }, + { + "epoch": 0.3712388942101265, + "grad_norm": 0.1703397035598755, + "learning_rate": 0.00013959384208321558, + "loss": 0.0166, + "step": 2810 + }, + { + "epoch": 0.37137100769561054, + "grad_norm": 0.2521183490753174, + "learning_rate": 0.00013955567419222606, + "loss": 0.0367, + "step": 2811 + }, + { + "epoch": 0.3715031211810946, + "grad_norm": 0.20728209614753723, + "learning_rate": 0.0001395174994687334, + "loss": 0.0177, + "step": 2812 + }, + { + "epoch": 0.3716352346665786, + "grad_norm": 0.09481014311313629, + "learning_rate": 0.0001394793179193316, + "loss": 0.0135, + "step": 2813 + }, + { + "epoch": 0.37176734815206264, + "grad_norm": 0.17332907021045685, + "learning_rate": 0.00013944112955061575, + "loss": 0.0194, + "step": 2814 + }, + { + "epoch": 0.37189946163754667, + "grad_norm": 0.1680062860250473, + "learning_rate": 0.00013940293436918226, + "loss": 0.0282, + "step": 2815 + }, + { + "epoch": 0.3720315751230307, + "grad_norm": 0.1549157202243805, + "learning_rate": 0.00013936473238162854, + "loss": 0.0146, + "step": 2816 + }, + { + "epoch": 0.37216368860851473, + "grad_norm": 0.14746297895908356, + "learning_rate": 0.00013932652359455335, + "loss": 0.018, + "step": 2817 + }, + { + "epoch": 0.37229580209399876, + "grad_norm": 0.206549271941185, + "learning_rate": 0.00013928830801455649, + "loss": 0.0292, + "step": 2818 + }, + { + "epoch": 0.3724279155794828, + "grad_norm": 0.16838571429252625, + "learning_rate": 0.00013925008564823899, + "loss": 0.0188, + "step": 2819 + }, + { + "epoch": 0.3725600290649668, + "grad_norm": 0.1660108119249344, + "learning_rate": 0.00013921185650220307, + "loss": 0.0212, + "step": 2820 + }, + { + "epoch": 0.37269214255045086, + "grad_norm": 0.2400975227355957, + "learning_rate": 0.0001391736205830521, + "loss": 0.0206, + "step": 2821 + }, + { + "epoch": 0.3728242560359349, + "grad_norm": 0.18147924542427063, + "learning_rate": 0.00013913537789739063, + "loss": 0.0146, + "step": 2822 + }, + { + "epoch": 0.3729563695214189, + "grad_norm": 0.17276743054389954, + "learning_rate": 0.00013909712845182432, + "loss": 0.0128, + "step": 2823 + }, + { + "epoch": 0.37308848300690295, + "grad_norm": 0.17892836034297943, + "learning_rate": 0.0001390588722529601, + "loss": 0.0267, + "step": 2824 + }, + { + "epoch": 0.373220596492387, + "grad_norm": 0.19831377267837524, + "learning_rate": 0.00013902060930740602, + "loss": 0.0202, + "step": 2825 + }, + { + "epoch": 0.373352709977871, + "grad_norm": 0.2546497583389282, + "learning_rate": 0.0001389823396217712, + "loss": 0.0253, + "step": 2826 + }, + { + "epoch": 0.37348482346335504, + "grad_norm": 0.20438458025455475, + "learning_rate": 0.00013894406320266614, + "loss": 0.0191, + "step": 2827 + }, + { + "epoch": 0.3736169369488391, + "grad_norm": 0.165092334151268, + "learning_rate": 0.00013890578005670224, + "loss": 0.0233, + "step": 2828 + }, + { + "epoch": 0.3737490504343231, + "grad_norm": 0.16347329318523407, + "learning_rate": 0.00013886749019049232, + "loss": 0.0255, + "step": 2829 + }, + { + "epoch": 0.37388116391980714, + "grad_norm": 0.1961696892976761, + "learning_rate": 0.00013882919361065014, + "loss": 0.0123, + "step": 2830 + }, + { + "epoch": 0.37401327740529117, + "grad_norm": 0.15506936609745026, + "learning_rate": 0.0001387908903237908, + "loss": 0.0189, + "step": 2831 + }, + { + "epoch": 0.3741453908907752, + "grad_norm": 0.1768009215593338, + "learning_rate": 0.00013875258033653045, + "loss": 0.0144, + "step": 2832 + }, + { + "epoch": 0.37427750437625923, + "grad_norm": 0.19626346230506897, + "learning_rate": 0.0001387142636554864, + "loss": 0.014, + "step": 2833 + }, + { + "epoch": 0.37440961786174326, + "grad_norm": 0.22314384579658508, + "learning_rate": 0.00013867594028727715, + "loss": 0.0168, + "step": 2834 + }, + { + "epoch": 0.3745417313472273, + "grad_norm": 0.2579716444015503, + "learning_rate": 0.00013863761023852233, + "loss": 0.0249, + "step": 2835 + }, + { + "epoch": 0.3746738448327113, + "grad_norm": 0.14838150143623352, + "learning_rate": 0.00013859927351584278, + "loss": 0.0185, + "step": 2836 + }, + { + "epoch": 0.37480595831819535, + "grad_norm": 0.20027662813663483, + "learning_rate": 0.00013856093012586045, + "loss": 0.0222, + "step": 2837 + }, + { + "epoch": 0.3749380718036794, + "grad_norm": 0.337117463350296, + "learning_rate": 0.00013852258007519838, + "loss": 0.0249, + "step": 2838 + }, + { + "epoch": 0.3750701852891634, + "grad_norm": 0.20685425400733948, + "learning_rate": 0.0001384842233704809, + "loss": 0.0206, + "step": 2839 + }, + { + "epoch": 0.37520229877464745, + "grad_norm": 0.20184406638145447, + "learning_rate": 0.00013844586001833338, + "loss": 0.0198, + "step": 2840 + }, + { + "epoch": 0.3753344122601315, + "grad_norm": 0.21701376140117645, + "learning_rate": 0.00013840749002538236, + "loss": 0.021, + "step": 2841 + }, + { + "epoch": 0.3754665257456155, + "grad_norm": 0.2415953427553177, + "learning_rate": 0.00013836911339825558, + "loss": 0.0219, + "step": 2842 + }, + { + "epoch": 0.37559863923109954, + "grad_norm": 0.29480111598968506, + "learning_rate": 0.00013833073014358184, + "loss": 0.0213, + "step": 2843 + }, + { + "epoch": 0.37573075271658357, + "grad_norm": 0.251859575510025, + "learning_rate": 0.00013829234026799115, + "loss": 0.0293, + "step": 2844 + }, + { + "epoch": 0.3758628662020676, + "grad_norm": 0.23018434643745422, + "learning_rate": 0.00013825394377811465, + "loss": 0.0311, + "step": 2845 + }, + { + "epoch": 0.37599497968755163, + "grad_norm": 0.21823716163635254, + "learning_rate": 0.0001382155406805846, + "loss": 0.0326, + "step": 2846 + }, + { + "epoch": 0.37612709317303566, + "grad_norm": 0.2705758213996887, + "learning_rate": 0.00013817713098203442, + "loss": 0.0168, + "step": 2847 + }, + { + "epoch": 0.3762592066585197, + "grad_norm": 0.1876915544271469, + "learning_rate": 0.00013813871468909867, + "loss": 0.0201, + "step": 2848 + }, + { + "epoch": 0.37639132014400367, + "grad_norm": 0.19940240681171417, + "learning_rate": 0.00013810029180841307, + "loss": 0.0202, + "step": 2849 + }, + { + "epoch": 0.3765234336294877, + "grad_norm": 0.22496294975280762, + "learning_rate": 0.0001380618623466144, + "loss": 0.0335, + "step": 2850 + }, + { + "epoch": 0.37665554711497173, + "grad_norm": 0.31435123085975647, + "learning_rate": 0.00013802342631034068, + "loss": 0.0101, + "step": 2851 + }, + { + "epoch": 0.37678766060045576, + "grad_norm": 0.14207758009433746, + "learning_rate": 0.000137984983706231, + "loss": 0.0152, + "step": 2852 + }, + { + "epoch": 0.3769197740859398, + "grad_norm": 0.18659062683582306, + "learning_rate": 0.00013794653454092559, + "loss": 0.0213, + "step": 2853 + }, + { + "epoch": 0.3770518875714238, + "grad_norm": 0.3208668529987335, + "learning_rate": 0.0001379080788210658, + "loss": 0.0271, + "step": 2854 + }, + { + "epoch": 0.37718400105690786, + "grad_norm": 0.2228097915649414, + "learning_rate": 0.00013786961655329425, + "loss": 0.0354, + "step": 2855 + }, + { + "epoch": 0.3773161145423919, + "grad_norm": 0.19546012580394745, + "learning_rate": 0.00013783114774425448, + "loss": 0.0229, + "step": 2856 + }, + { + "epoch": 0.3774482280278759, + "grad_norm": 0.3273359537124634, + "learning_rate": 0.00013779267240059123, + "loss": 0.0272, + "step": 2857 + }, + { + "epoch": 0.37758034151335995, + "grad_norm": 0.266356885433197, + "learning_rate": 0.0001377541905289505, + "loss": 0.0136, + "step": 2858 + }, + { + "epoch": 0.377712454998844, + "grad_norm": 0.22096264362335205, + "learning_rate": 0.0001377157021359792, + "loss": 0.0327, + "step": 2859 + }, + { + "epoch": 0.377844568484328, + "grad_norm": 0.1980806142091751, + "learning_rate": 0.00013767720722832557, + "loss": 0.0131, + "step": 2860 + }, + { + "epoch": 0.37797668196981205, + "grad_norm": 0.16720090806484222, + "learning_rate": 0.00013763870581263882, + "loss": 0.0253, + "step": 2861 + }, + { + "epoch": 0.3781087954552961, + "grad_norm": 0.1335599720478058, + "learning_rate": 0.00013760019789556944, + "loss": 0.0214, + "step": 2862 + }, + { + "epoch": 0.3782409089407801, + "grad_norm": 0.20969679951667786, + "learning_rate": 0.00013756168348376884, + "loss": 0.0371, + "step": 2863 + }, + { + "epoch": 0.37837302242626414, + "grad_norm": 0.17305636405944824, + "learning_rate": 0.00013752316258388976, + "loss": 0.0207, + "step": 2864 + }, + { + "epoch": 0.37850513591174817, + "grad_norm": 0.29161009192466736, + "learning_rate": 0.0001374846352025859, + "loss": 0.0263, + "step": 2865 + }, + { + "epoch": 0.3786372493972322, + "grad_norm": 0.21840567886829376, + "learning_rate": 0.00013744610134651218, + "loss": 0.0178, + "step": 2866 + }, + { + "epoch": 0.37876936288271623, + "grad_norm": 0.1723618358373642, + "learning_rate": 0.00013740756102232458, + "loss": 0.0219, + "step": 2867 + }, + { + "epoch": 0.37890147636820026, + "grad_norm": 0.1875240057706833, + "learning_rate": 0.00013736901423668023, + "loss": 0.02, + "step": 2868 + }, + { + "epoch": 0.3790335898536843, + "grad_norm": 0.19212710857391357, + "learning_rate": 0.00013733046099623737, + "loss": 0.0179, + "step": 2869 + }, + { + "epoch": 0.3791657033391683, + "grad_norm": 0.1740020364522934, + "learning_rate": 0.00013729190130765538, + "loss": 0.0285, + "step": 2870 + }, + { + "epoch": 0.37929781682465236, + "grad_norm": 0.15244339406490326, + "learning_rate": 0.00013725333517759463, + "loss": 0.0195, + "step": 2871 + }, + { + "epoch": 0.3794299303101364, + "grad_norm": 0.17750827968120575, + "learning_rate": 0.00013721476261271685, + "loss": 0.0193, + "step": 2872 + }, + { + "epoch": 0.3795620437956204, + "grad_norm": 0.2849121391773224, + "learning_rate": 0.0001371761836196846, + "loss": 0.0382, + "step": 2873 + }, + { + "epoch": 0.37969415728110445, + "grad_norm": 0.18970602750778198, + "learning_rate": 0.00013713759820516171, + "loss": 0.0157, + "step": 2874 + }, + { + "epoch": 0.3798262707665885, + "grad_norm": 0.16193434596061707, + "learning_rate": 0.0001370990063758131, + "loss": 0.0185, + "step": 2875 + }, + { + "epoch": 0.3799583842520725, + "grad_norm": 0.18522220849990845, + "learning_rate": 0.00013706040813830483, + "loss": 0.0232, + "step": 2876 + }, + { + "epoch": 0.38009049773755654, + "grad_norm": 0.15757068991661072, + "learning_rate": 0.00013702180349930396, + "loss": 0.013, + "step": 2877 + }, + { + "epoch": 0.3802226112230406, + "grad_norm": 0.14084728062152863, + "learning_rate": 0.00013698319246547872, + "loss": 0.0189, + "step": 2878 + }, + { + "epoch": 0.3803547247085246, + "grad_norm": 0.1844385862350464, + "learning_rate": 0.0001369445750434985, + "loss": 0.0257, + "step": 2879 + }, + { + "epoch": 0.38048683819400864, + "grad_norm": 0.11293325573205948, + "learning_rate": 0.0001369059512400337, + "loss": 0.0131, + "step": 2880 + }, + { + "epoch": 0.38061895167949267, + "grad_norm": 0.22674749791622162, + "learning_rate": 0.00013686732106175587, + "loss": 0.0245, + "step": 2881 + }, + { + "epoch": 0.3807510651649767, + "grad_norm": 0.2409743368625641, + "learning_rate": 0.0001368286845153376, + "loss": 0.0207, + "step": 2882 + }, + { + "epoch": 0.38088317865046073, + "grad_norm": 0.15085680782794952, + "learning_rate": 0.00013679004160745272, + "loss": 0.0232, + "step": 2883 + }, + { + "epoch": 0.38101529213594476, + "grad_norm": 0.15232190489768982, + "learning_rate": 0.00013675139234477603, + "loss": 0.0161, + "step": 2884 + }, + { + "epoch": 0.3811474056214288, + "grad_norm": 0.17391395568847656, + "learning_rate": 0.0001367127367339834, + "loss": 0.0147, + "step": 2885 + }, + { + "epoch": 0.3812795191069128, + "grad_norm": 0.4284667670726776, + "learning_rate": 0.000136674074781752, + "loss": 0.0393, + "step": 2886 + }, + { + "epoch": 0.38141163259239685, + "grad_norm": 0.20392243564128876, + "learning_rate": 0.00013663540649475986, + "loss": 0.0149, + "step": 2887 + }, + { + "epoch": 0.3815437460778809, + "grad_norm": 0.14438848197460175, + "learning_rate": 0.00013659673187968623, + "loss": 0.0195, + "step": 2888 + }, + { + "epoch": 0.3816758595633649, + "grad_norm": 0.27825459837913513, + "learning_rate": 0.00013655805094321143, + "loss": 0.0132, + "step": 2889 + }, + { + "epoch": 0.38180797304884895, + "grad_norm": 0.13513772189617157, + "learning_rate": 0.00013651936369201686, + "loss": 0.0195, + "step": 2890 + }, + { + "epoch": 0.381940086534333, + "grad_norm": 0.1930549293756485, + "learning_rate": 0.00013648067013278503, + "loss": 0.025, + "step": 2891 + }, + { + "epoch": 0.382072200019817, + "grad_norm": 0.18280315399169922, + "learning_rate": 0.0001364419702721995, + "loss": 0.026, + "step": 2892 + }, + { + "epoch": 0.38220431350530104, + "grad_norm": 0.13674134016036987, + "learning_rate": 0.000136403264116945, + "loss": 0.009, + "step": 2893 + }, + { + "epoch": 0.38233642699078507, + "grad_norm": 0.24852058291435242, + "learning_rate": 0.00013636455167370724, + "loss": 0.0282, + "step": 2894 + }, + { + "epoch": 0.3824685404762691, + "grad_norm": 0.4249047636985779, + "learning_rate": 0.00013632583294917314, + "loss": 0.0331, + "step": 2895 + }, + { + "epoch": 0.38260065396175313, + "grad_norm": 0.19219887256622314, + "learning_rate": 0.00013628710795003055, + "loss": 0.0248, + "step": 2896 + }, + { + "epoch": 0.38273276744723717, + "grad_norm": 0.1995341181755066, + "learning_rate": 0.00013624837668296854, + "loss": 0.0237, + "step": 2897 + }, + { + "epoch": 0.3828648809327212, + "grad_norm": 0.21090322732925415, + "learning_rate": 0.00013620963915467722, + "loss": 0.0203, + "step": 2898 + }, + { + "epoch": 0.3829969944182052, + "grad_norm": 0.2628128230571747, + "learning_rate": 0.00013617089537184776, + "loss": 0.0318, + "step": 2899 + }, + { + "epoch": 0.38312910790368926, + "grad_norm": 0.3117437958717346, + "learning_rate": 0.00013613214534117238, + "loss": 0.0215, + "step": 2900 + }, + { + "epoch": 0.3832612213891733, + "grad_norm": 0.22514411807060242, + "learning_rate": 0.0001360933890693445, + "loss": 0.0198, + "step": 2901 + }, + { + "epoch": 0.3833933348746573, + "grad_norm": 0.19629830121994019, + "learning_rate": 0.0001360546265630585, + "loss": 0.0251, + "step": 2902 + }, + { + "epoch": 0.38352544836014135, + "grad_norm": 0.16926242411136627, + "learning_rate": 0.00013601585782900988, + "loss": 0.0229, + "step": 2903 + }, + { + "epoch": 0.3836575618456254, + "grad_norm": 0.1834266632795334, + "learning_rate": 0.00013597708287389524, + "loss": 0.0331, + "step": 2904 + }, + { + "epoch": 0.3837896753311094, + "grad_norm": 0.18873266875743866, + "learning_rate": 0.00013593830170441218, + "loss": 0.0309, + "step": 2905 + }, + { + "epoch": 0.38392178881659345, + "grad_norm": 0.1950349062681198, + "learning_rate": 0.00013589951432725947, + "loss": 0.0213, + "step": 2906 + }, + { + "epoch": 0.3840539023020775, + "grad_norm": 0.18346445262432098, + "learning_rate": 0.00013586072074913685, + "loss": 0.0185, + "step": 2907 + }, + { + "epoch": 0.3841860157875615, + "grad_norm": 0.3406682014465332, + "learning_rate": 0.00013582192097674525, + "loss": 0.0222, + "step": 2908 + }, + { + "epoch": 0.38431812927304554, + "grad_norm": 0.2803246080875397, + "learning_rate": 0.00013578311501678657, + "loss": 0.0266, + "step": 2909 + }, + { + "epoch": 0.38445024275852957, + "grad_norm": 0.24295170605182648, + "learning_rate": 0.0001357443028759638, + "loss": 0.0288, + "step": 2910 + }, + { + "epoch": 0.3845823562440136, + "grad_norm": 0.21152372658252716, + "learning_rate": 0.00013570548456098104, + "loss": 0.0256, + "step": 2911 + }, + { + "epoch": 0.38471446972949763, + "grad_norm": 0.2268865704536438, + "learning_rate": 0.00013566666007854342, + "loss": 0.028, + "step": 2912 + }, + { + "epoch": 0.38484658321498166, + "grad_norm": 0.2392731010913849, + "learning_rate": 0.00013562782943535716, + "loss": 0.0198, + "step": 2913 + }, + { + "epoch": 0.3849786967004657, + "grad_norm": 0.13897490501403809, + "learning_rate": 0.00013558899263812945, + "loss": 0.0218, + "step": 2914 + }, + { + "epoch": 0.3851108101859497, + "grad_norm": 0.1503855586051941, + "learning_rate": 0.00013555014969356873, + "loss": 0.0134, + "step": 2915 + }, + { + "epoch": 0.38524292367143376, + "grad_norm": 0.15839877724647522, + "learning_rate": 0.00013551130060838432, + "loss": 0.0182, + "step": 2916 + }, + { + "epoch": 0.3853750371569178, + "grad_norm": 0.23495671153068542, + "learning_rate": 0.00013547244538928668, + "loss": 0.0315, + "step": 2917 + }, + { + "epoch": 0.3855071506424018, + "grad_norm": 0.17712470889091492, + "learning_rate": 0.00013543358404298736, + "loss": 0.0197, + "step": 2918 + }, + { + "epoch": 0.38563926412788585, + "grad_norm": 0.23429818451404572, + "learning_rate": 0.00013539471657619893, + "loss": 0.0242, + "step": 2919 + }, + { + "epoch": 0.3857713776133699, + "grad_norm": 0.16615261137485504, + "learning_rate": 0.00013535584299563498, + "loss": 0.0273, + "step": 2920 + }, + { + "epoch": 0.3859034910988539, + "grad_norm": 0.20191910862922668, + "learning_rate": 0.00013531696330801017, + "loss": 0.0208, + "step": 2921 + }, + { + "epoch": 0.38603560458433794, + "grad_norm": 0.20236913859844208, + "learning_rate": 0.00013527807752004034, + "loss": 0.0225, + "step": 2922 + }, + { + "epoch": 0.386167718069822, + "grad_norm": 0.18063655495643616, + "learning_rate": 0.00013523918563844217, + "loss": 0.0239, + "step": 2923 + }, + { + "epoch": 0.386299831555306, + "grad_norm": 0.21345975995063782, + "learning_rate": 0.0001352002876699336, + "loss": 0.0273, + "step": 2924 + }, + { + "epoch": 0.38643194504079004, + "grad_norm": 0.20318594574928284, + "learning_rate": 0.00013516138362123346, + "loss": 0.0247, + "step": 2925 + }, + { + "epoch": 0.38656405852627407, + "grad_norm": 0.1923476904630661, + "learning_rate": 0.0001351224734990617, + "loss": 0.0237, + "step": 2926 + }, + { + "epoch": 0.3866961720117581, + "grad_norm": 0.26162242889404297, + "learning_rate": 0.00013508355731013937, + "loss": 0.026, + "step": 2927 + }, + { + "epoch": 0.38682828549724213, + "grad_norm": 0.41296452283859253, + "learning_rate": 0.00013504463506118847, + "loss": 0.0247, + "step": 2928 + }, + { + "epoch": 0.38696039898272616, + "grad_norm": 0.1592453122138977, + "learning_rate": 0.0001350057067589321, + "loss": 0.0149, + "step": 2929 + }, + { + "epoch": 0.3870925124682102, + "grad_norm": 0.20988403260707855, + "learning_rate": 0.00013496677241009442, + "loss": 0.0237, + "step": 2930 + }, + { + "epoch": 0.3872246259536942, + "grad_norm": 0.2678348422050476, + "learning_rate": 0.00013492783202140058, + "loss": 0.0307, + "step": 2931 + }, + { + "epoch": 0.38735673943917825, + "grad_norm": 0.1757078468799591, + "learning_rate": 0.00013488888559957683, + "loss": 0.0338, + "step": 2932 + }, + { + "epoch": 0.3874888529246623, + "grad_norm": 0.19529110193252563, + "learning_rate": 0.00013484993315135036, + "loss": 0.0296, + "step": 2933 + }, + { + "epoch": 0.3876209664101463, + "grad_norm": 0.21215179562568665, + "learning_rate": 0.0001348109746834496, + "loss": 0.0217, + "step": 2934 + }, + { + "epoch": 0.38775307989563035, + "grad_norm": 0.1827051341533661, + "learning_rate": 0.0001347720102026038, + "loss": 0.0193, + "step": 2935 + }, + { + "epoch": 0.3878851933811144, + "grad_norm": 0.1964787244796753, + "learning_rate": 0.00013473303971554338, + "loss": 0.0165, + "step": 2936 + }, + { + "epoch": 0.3880173068665984, + "grad_norm": 0.21198433637619019, + "learning_rate": 0.0001346940632289998, + "loss": 0.0249, + "step": 2937 + }, + { + "epoch": 0.38814942035208244, + "grad_norm": 0.2188272625207901, + "learning_rate": 0.00013465508074970544, + "loss": 0.0178, + "step": 2938 + }, + { + "epoch": 0.3882815338375665, + "grad_norm": 0.17410850524902344, + "learning_rate": 0.00013461609228439386, + "loss": 0.0277, + "step": 2939 + }, + { + "epoch": 0.3884136473230505, + "grad_norm": 0.2147447019815445, + "learning_rate": 0.00013457709783979956, + "loss": 0.0237, + "step": 2940 + }, + { + "epoch": 0.38854576080853453, + "grad_norm": 0.19408878684043884, + "learning_rate": 0.0001345380974226581, + "loss": 0.0174, + "step": 2941 + }, + { + "epoch": 0.38867787429401857, + "grad_norm": 0.17176926136016846, + "learning_rate": 0.00013449909103970605, + "loss": 0.0173, + "step": 2942 + }, + { + "epoch": 0.3888099877795026, + "grad_norm": 0.16017231345176697, + "learning_rate": 0.00013446007869768109, + "loss": 0.0247, + "step": 2943 + }, + { + "epoch": 0.38894210126498663, + "grad_norm": 0.24421115219593048, + "learning_rate": 0.00013442106040332182, + "loss": 0.0284, + "step": 2944 + }, + { + "epoch": 0.38907421475047066, + "grad_norm": 0.21438637375831604, + "learning_rate": 0.00013438203616336798, + "loss": 0.0236, + "step": 2945 + }, + { + "epoch": 0.3892063282359547, + "grad_norm": 0.3065054416656494, + "learning_rate": 0.00013434300598456024, + "loss": 0.0249, + "step": 2946 + }, + { + "epoch": 0.3893384417214387, + "grad_norm": 0.15419889986515045, + "learning_rate": 0.00013430396987364033, + "loss": 0.0155, + "step": 2947 + }, + { + "epoch": 0.38947055520692275, + "grad_norm": 0.20932449400424957, + "learning_rate": 0.00013426492783735102, + "loss": 0.0212, + "step": 2948 + }, + { + "epoch": 0.3896026686924068, + "grad_norm": 0.22069528698921204, + "learning_rate": 0.00013422587988243606, + "loss": 0.0219, + "step": 2949 + }, + { + "epoch": 0.3897347821778908, + "grad_norm": 0.16955885291099548, + "learning_rate": 0.00013418682601564033, + "loss": 0.0147, + "step": 2950 + }, + { + "epoch": 0.38986689566337485, + "grad_norm": 0.23010531067848206, + "learning_rate": 0.00013414776624370958, + "loss": 0.0245, + "step": 2951 + }, + { + "epoch": 0.3899990091488589, + "grad_norm": 0.2735253870487213, + "learning_rate": 0.00013410870057339067, + "loss": 0.0251, + "step": 2952 + }, + { + "epoch": 0.3901311226343429, + "grad_norm": 0.20499961078166962, + "learning_rate": 0.00013406962901143146, + "loss": 0.0159, + "step": 2953 + }, + { + "epoch": 0.39026323611982694, + "grad_norm": 0.2517299950122833, + "learning_rate": 0.0001340305515645809, + "loss": 0.0217, + "step": 2954 + }, + { + "epoch": 0.39039534960531097, + "grad_norm": 0.13152022659778595, + "learning_rate": 0.00013399146823958878, + "loss": 0.0143, + "step": 2955 + }, + { + "epoch": 0.390527463090795, + "grad_norm": 0.23188278079032898, + "learning_rate": 0.0001339523790432061, + "loss": 0.0208, + "step": 2956 + }, + { + "epoch": 0.39065957657627903, + "grad_norm": 0.20910461246967316, + "learning_rate": 0.00013391328398218474, + "loss": 0.0316, + "step": 2957 + }, + { + "epoch": 0.39079169006176306, + "grad_norm": 0.13782212138175964, + "learning_rate": 0.0001338741830632777, + "loss": 0.015, + "step": 2958 + }, + { + "epoch": 0.3909238035472471, + "grad_norm": 0.22674843668937683, + "learning_rate": 0.00013383507629323883, + "loss": 0.0252, + "step": 2959 + }, + { + "epoch": 0.3910559170327311, + "grad_norm": 0.19780975580215454, + "learning_rate": 0.0001337959636788232, + "loss": 0.0251, + "step": 2960 + }, + { + "epoch": 0.39118803051821516, + "grad_norm": 0.21379712224006653, + "learning_rate": 0.00013375684522678674, + "loss": 0.0199, + "step": 2961 + }, + { + "epoch": 0.3913201440036992, + "grad_norm": 0.22542434930801392, + "learning_rate": 0.0001337177209438864, + "loss": 0.0288, + "step": 2962 + }, + { + "epoch": 0.3914522574891832, + "grad_norm": 0.14902955293655396, + "learning_rate": 0.0001336785908368802, + "loss": 0.0119, + "step": 2963 + }, + { + "epoch": 0.39158437097466725, + "grad_norm": 0.38387978076934814, + "learning_rate": 0.00013363945491252715, + "loss": 0.0341, + "step": 2964 + }, + { + "epoch": 0.3917164844601513, + "grad_norm": 0.17453815042972565, + "learning_rate": 0.0001336003131775872, + "loss": 0.0222, + "step": 2965 + }, + { + "epoch": 0.3918485979456353, + "grad_norm": 0.23411858081817627, + "learning_rate": 0.00013356116563882143, + "loss": 0.0224, + "step": 2966 + }, + { + "epoch": 0.39198071143111934, + "grad_norm": 0.17240062355995178, + "learning_rate": 0.00013352201230299176, + "loss": 0.0228, + "step": 2967 + }, + { + "epoch": 0.3921128249166034, + "grad_norm": 0.18300558626651764, + "learning_rate": 0.00013348285317686123, + "loss": 0.0235, + "step": 2968 + }, + { + "epoch": 0.3922449384020874, + "grad_norm": 0.3427295386791229, + "learning_rate": 0.00013344368826719388, + "loss": 0.0277, + "step": 2969 + }, + { + "epoch": 0.39237705188757144, + "grad_norm": 0.45966774225234985, + "learning_rate": 0.00013340451758075468, + "loss": 0.0399, + "step": 2970 + }, + { + "epoch": 0.39250916537305547, + "grad_norm": 0.26486414670944214, + "learning_rate": 0.0001333653411243096, + "loss": 0.0185, + "step": 2971 + }, + { + "epoch": 0.3926412788585395, + "grad_norm": 0.18045035004615784, + "learning_rate": 0.00013332615890462575, + "loss": 0.0186, + "step": 2972 + }, + { + "epoch": 0.39277339234402353, + "grad_norm": 0.28566309809684753, + "learning_rate": 0.000133286970928471, + "loss": 0.0348, + "step": 2973 + }, + { + "epoch": 0.39290550582950756, + "grad_norm": 0.209380641579628, + "learning_rate": 0.00013324777720261443, + "loss": 0.0149, + "step": 2974 + }, + { + "epoch": 0.3930376193149916, + "grad_norm": 0.40362977981567383, + "learning_rate": 0.00013320857773382598, + "loss": 0.0138, + "step": 2975 + }, + { + "epoch": 0.3931697328004756, + "grad_norm": 0.1985737681388855, + "learning_rate": 0.00013316937252887665, + "loss": 0.0188, + "step": 2976 + }, + { + "epoch": 0.39330184628595966, + "grad_norm": 0.22340957820415497, + "learning_rate": 0.0001331301615945384, + "loss": 0.0262, + "step": 2977 + }, + { + "epoch": 0.3934339597714437, + "grad_norm": 0.2279328852891922, + "learning_rate": 0.00013309094493758411, + "loss": 0.0186, + "step": 2978 + }, + { + "epoch": 0.3935660732569277, + "grad_norm": 0.29355117678642273, + "learning_rate": 0.00013305172256478787, + "loss": 0.0292, + "step": 2979 + }, + { + "epoch": 0.39369818674241175, + "grad_norm": 0.4233926832675934, + "learning_rate": 0.0001330124944829245, + "loss": 0.0333, + "step": 2980 + }, + { + "epoch": 0.3938303002278958, + "grad_norm": 0.16877886652946472, + "learning_rate": 0.00013297326069876996, + "loss": 0.0214, + "step": 2981 + }, + { + "epoch": 0.3939624137133798, + "grad_norm": 0.2501515746116638, + "learning_rate": 0.00013293402121910113, + "loss": 0.0257, + "step": 2982 + }, + { + "epoch": 0.39409452719886384, + "grad_norm": 0.26275867223739624, + "learning_rate": 0.00013289477605069595, + "loss": 0.0245, + "step": 2983 + }, + { + "epoch": 0.3942266406843479, + "grad_norm": 0.21009047329425812, + "learning_rate": 0.00013285552520033318, + "loss": 0.0286, + "step": 2984 + }, + { + "epoch": 0.3943587541698319, + "grad_norm": 0.1921718567609787, + "learning_rate": 0.00013281626867479276, + "loss": 0.0221, + "step": 2985 + }, + { + "epoch": 0.39449086765531594, + "grad_norm": 0.23634794354438782, + "learning_rate": 0.00013277700648085556, + "loss": 0.0263, + "step": 2986 + }, + { + "epoch": 0.39462298114079997, + "grad_norm": 0.3247498571872711, + "learning_rate": 0.0001327377386253033, + "loss": 0.0315, + "step": 2987 + }, + { + "epoch": 0.394755094626284, + "grad_norm": 0.21689558029174805, + "learning_rate": 0.0001326984651149188, + "loss": 0.0248, + "step": 2988 + }, + { + "epoch": 0.39488720811176803, + "grad_norm": 0.23579855263233185, + "learning_rate": 0.0001326591859564858, + "loss": 0.0239, + "step": 2989 + }, + { + "epoch": 0.39501932159725206, + "grad_norm": 0.21000924706459045, + "learning_rate": 0.00013261990115678905, + "loss": 0.0137, + "step": 2990 + }, + { + "epoch": 0.3951514350827361, + "grad_norm": 0.15919966995716095, + "learning_rate": 0.00013258061072261433, + "loss": 0.0173, + "step": 2991 + }, + { + "epoch": 0.3952835485682201, + "grad_norm": 0.15629446506500244, + "learning_rate": 0.00013254131466074824, + "loss": 0.0201, + "step": 2992 + }, + { + "epoch": 0.39541566205370415, + "grad_norm": 0.20342610776424408, + "learning_rate": 0.0001325020129779785, + "loss": 0.0345, + "step": 2993 + }, + { + "epoch": 0.3955477755391882, + "grad_norm": 0.17629854381084442, + "learning_rate": 0.00013246270568109374, + "loss": 0.0189, + "step": 2994 + }, + { + "epoch": 0.3956798890246722, + "grad_norm": 0.18273794651031494, + "learning_rate": 0.0001324233927768835, + "loss": 0.0168, + "step": 2995 + }, + { + "epoch": 0.39581200251015625, + "grad_norm": 0.28699859976768494, + "learning_rate": 0.0001323840742721384, + "loss": 0.037, + "step": 2996 + }, + { + "epoch": 0.3959441159956403, + "grad_norm": 0.1269131451845169, + "learning_rate": 0.00013234475017364993, + "loss": 0.0137, + "step": 2997 + }, + { + "epoch": 0.3960762294811243, + "grad_norm": 0.30667126178741455, + "learning_rate": 0.00013230542048821067, + "loss": 0.0324, + "step": 2998 + }, + { + "epoch": 0.39620834296660834, + "grad_norm": 0.30856379866600037, + "learning_rate": 0.000132266085222614, + "loss": 0.0181, + "step": 2999 + }, + { + "epoch": 0.39634045645209237, + "grad_norm": 0.21341513097286224, + "learning_rate": 0.00013222674438365442, + "loss": 0.0209, + "step": 3000 + }, + { + "epoch": 0.3964725699375764, + "grad_norm": 0.22583794593811035, + "learning_rate": 0.00013218739797812731, + "loss": 0.0271, + "step": 3001 + }, + { + "epoch": 0.39660468342306043, + "grad_norm": 0.19691592454910278, + "learning_rate": 0.00013214804601282903, + "loss": 0.0245, + "step": 3002 + }, + { + "epoch": 0.39673679690854446, + "grad_norm": 0.15087178349494934, + "learning_rate": 0.00013210868849455686, + "loss": 0.0209, + "step": 3003 + }, + { + "epoch": 0.3968689103940285, + "grad_norm": 0.196724995970726, + "learning_rate": 0.0001320693254301091, + "loss": 0.0231, + "step": 3004 + }, + { + "epoch": 0.3970010238795125, + "grad_norm": 0.28341200947761536, + "learning_rate": 0.00013202995682628496, + "loss": 0.0291, + "step": 3005 + }, + { + "epoch": 0.39713313736499656, + "grad_norm": 0.1786683052778244, + "learning_rate": 0.00013199058268988467, + "loss": 0.0194, + "step": 3006 + }, + { + "epoch": 0.3972652508504806, + "grad_norm": 0.19349396228790283, + "learning_rate": 0.00013195120302770936, + "loss": 0.0226, + "step": 3007 + }, + { + "epoch": 0.3973973643359646, + "grad_norm": 0.13992834091186523, + "learning_rate": 0.00013191181784656114, + "loss": 0.0215, + "step": 3008 + }, + { + "epoch": 0.39752947782144865, + "grad_norm": 0.14852946996688843, + "learning_rate": 0.000131872427153243, + "loss": 0.0172, + "step": 3009 + }, + { + "epoch": 0.3976615913069327, + "grad_norm": 0.24151255190372467, + "learning_rate": 0.00013183303095455905, + "loss": 0.0317, + "step": 3010 + }, + { + "epoch": 0.3977937047924167, + "grad_norm": 0.22274500131607056, + "learning_rate": 0.00013179362925731415, + "loss": 0.0186, + "step": 3011 + }, + { + "epoch": 0.3979258182779007, + "grad_norm": 0.2599247694015503, + "learning_rate": 0.0001317542220683143, + "loss": 0.0183, + "step": 3012 + }, + { + "epoch": 0.3980579317633847, + "grad_norm": 0.2223990559577942, + "learning_rate": 0.00013171480939436626, + "loss": 0.0226, + "step": 3013 + }, + { + "epoch": 0.39819004524886875, + "grad_norm": 0.20143398642539978, + "learning_rate": 0.0001316753912422779, + "loss": 0.0247, + "step": 3014 + }, + { + "epoch": 0.3983221587343528, + "grad_norm": 0.1691291183233261, + "learning_rate": 0.00013163596761885796, + "loss": 0.0129, + "step": 3015 + }, + { + "epoch": 0.3984542722198368, + "grad_norm": 0.2109353244304657, + "learning_rate": 0.00013159653853091606, + "loss": 0.0182, + "step": 3016 + }, + { + "epoch": 0.39858638570532084, + "grad_norm": 0.08470373600721359, + "learning_rate": 0.00013155710398526295, + "loss": 0.0085, + "step": 3017 + }, + { + "epoch": 0.3987184991908049, + "grad_norm": 0.16234825551509857, + "learning_rate": 0.00013151766398871015, + "loss": 0.015, + "step": 3018 + }, + { + "epoch": 0.3988506126762889, + "grad_norm": 0.30378457903862, + "learning_rate": 0.00013147821854807022, + "loss": 0.036, + "step": 3019 + }, + { + "epoch": 0.39898272616177294, + "grad_norm": 0.16837498545646667, + "learning_rate": 0.00013143876767015655, + "loss": 0.0182, + "step": 3020 + }, + { + "epoch": 0.39911483964725697, + "grad_norm": 0.3299632668495178, + "learning_rate": 0.00013139931136178359, + "loss": 0.035, + "step": 3021 + }, + { + "epoch": 0.399246953132741, + "grad_norm": 0.39903292059898376, + "learning_rate": 0.0001313598496297667, + "loss": 0.025, + "step": 3022 + }, + { + "epoch": 0.39937906661822503, + "grad_norm": 0.1631740927696228, + "learning_rate": 0.00013132038248092208, + "loss": 0.0199, + "step": 3023 + }, + { + "epoch": 0.39951118010370906, + "grad_norm": 0.17485398054122925, + "learning_rate": 0.00013128090992206703, + "loss": 0.0121, + "step": 3024 + }, + { + "epoch": 0.3996432935891931, + "grad_norm": 0.17089051008224487, + "learning_rate": 0.00013124143196001963, + "loss": 0.0159, + "step": 3025 + }, + { + "epoch": 0.3997754070746771, + "grad_norm": 0.21537205576896667, + "learning_rate": 0.00013120194860159902, + "loss": 0.0209, + "step": 3026 + }, + { + "epoch": 0.39990752056016116, + "grad_norm": 0.46143659949302673, + "learning_rate": 0.0001311624598536252, + "loss": 0.0314, + "step": 3027 + }, + { + "epoch": 0.4000396340456452, + "grad_norm": 0.1878209412097931, + "learning_rate": 0.00013112296572291904, + "loss": 0.0216, + "step": 3028 + }, + { + "epoch": 0.4001717475311292, + "grad_norm": 0.13695985078811646, + "learning_rate": 0.0001310834662163025, + "loss": 0.0125, + "step": 3029 + }, + { + "epoch": 0.40030386101661325, + "grad_norm": 0.14801840484142303, + "learning_rate": 0.0001310439613405983, + "loss": 0.0215, + "step": 3030 + }, + { + "epoch": 0.4004359745020973, + "grad_norm": 0.2693830728530884, + "learning_rate": 0.0001310044511026303, + "loss": 0.023, + "step": 3031 + }, + { + "epoch": 0.4005680879875813, + "grad_norm": 0.27653369307518005, + "learning_rate": 0.000130964935509223, + "loss": 0.0229, + "step": 3032 + }, + { + "epoch": 0.40070020147306534, + "grad_norm": 0.15918390452861786, + "learning_rate": 0.0001309254145672021, + "loss": 0.0188, + "step": 3033 + }, + { + "epoch": 0.4008323149585494, + "grad_norm": 0.23531582951545715, + "learning_rate": 0.00013088588828339402, + "loss": 0.0182, + "step": 3034 + }, + { + "epoch": 0.4009644284440334, + "grad_norm": 0.13156545162200928, + "learning_rate": 0.00013084635666462622, + "loss": 0.0131, + "step": 3035 + }, + { + "epoch": 0.40109654192951744, + "grad_norm": 0.17341113090515137, + "learning_rate": 0.00013080681971772707, + "loss": 0.0271, + "step": 3036 + }, + { + "epoch": 0.40122865541500147, + "grad_norm": 0.1439688354730606, + "learning_rate": 0.0001307672774495258, + "loss": 0.0167, + "step": 3037 + }, + { + "epoch": 0.4013607689004855, + "grad_norm": 0.18817555904388428, + "learning_rate": 0.0001307277298668526, + "loss": 0.0197, + "step": 3038 + }, + { + "epoch": 0.40149288238596953, + "grad_norm": 0.28514885902404785, + "learning_rate": 0.0001306881769765386, + "loss": 0.0239, + "step": 3039 + }, + { + "epoch": 0.40162499587145356, + "grad_norm": 0.2204063981771469, + "learning_rate": 0.00013064861878541584, + "loss": 0.0158, + "step": 3040 + }, + { + "epoch": 0.4017571093569376, + "grad_norm": 0.1329529732465744, + "learning_rate": 0.0001306090553003172, + "loss": 0.0125, + "step": 3041 + }, + { + "epoch": 0.4018892228424216, + "grad_norm": 0.1567203551530838, + "learning_rate": 0.0001305694865280766, + "loss": 0.0156, + "step": 3042 + }, + { + "epoch": 0.40202133632790565, + "grad_norm": 0.258801132440567, + "learning_rate": 0.00013052991247552873, + "loss": 0.0284, + "step": 3043 + }, + { + "epoch": 0.4021534498133897, + "grad_norm": 0.14416727423667908, + "learning_rate": 0.00013049033314950931, + "loss": 0.0191, + "step": 3044 + }, + { + "epoch": 0.4022855632988737, + "grad_norm": 0.24458062648773193, + "learning_rate": 0.00013045074855685493, + "loss": 0.0358, + "step": 3045 + }, + { + "epoch": 0.40241767678435775, + "grad_norm": 0.187479630112648, + "learning_rate": 0.0001304111587044031, + "loss": 0.0177, + "step": 3046 + }, + { + "epoch": 0.4025497902698418, + "grad_norm": 0.17222760617733002, + "learning_rate": 0.00013037156359899216, + "loss": 0.0195, + "step": 3047 + }, + { + "epoch": 0.4026819037553258, + "grad_norm": 0.20241984724998474, + "learning_rate": 0.00013033196324746153, + "loss": 0.023, + "step": 3048 + }, + { + "epoch": 0.40281401724080984, + "grad_norm": 0.19136063754558563, + "learning_rate": 0.00013029235765665134, + "loss": 0.023, + "step": 3049 + }, + { + "epoch": 0.40294613072629387, + "grad_norm": 0.17290174961090088, + "learning_rate": 0.0001302527468334028, + "loss": 0.0132, + "step": 3050 + }, + { + "epoch": 0.4030782442117779, + "grad_norm": 0.26126572489738464, + "learning_rate": 0.00013021313078455783, + "loss": 0.0211, + "step": 3051 + }, + { + "epoch": 0.40321035769726193, + "grad_norm": 0.30257081985473633, + "learning_rate": 0.00013017350951695944, + "loss": 0.0237, + "step": 3052 + }, + { + "epoch": 0.40334247118274597, + "grad_norm": 0.1518670916557312, + "learning_rate": 0.00013013388303745145, + "loss": 0.0164, + "step": 3053 + }, + { + "epoch": 0.40347458466823, + "grad_norm": 0.26053333282470703, + "learning_rate": 0.0001300942513528786, + "loss": 0.0167, + "step": 3054 + }, + { + "epoch": 0.403606698153714, + "grad_norm": 0.2704867422580719, + "learning_rate": 0.00013005461447008647, + "loss": 0.0238, + "step": 3055 + }, + { + "epoch": 0.40373881163919806, + "grad_norm": 0.1770847886800766, + "learning_rate": 0.00013001497239592164, + "loss": 0.0103, + "step": 3056 + }, + { + "epoch": 0.4038709251246821, + "grad_norm": 0.44425147771835327, + "learning_rate": 0.00012997532513723154, + "loss": 0.024, + "step": 3057 + }, + { + "epoch": 0.4040030386101661, + "grad_norm": 0.1942521631717682, + "learning_rate": 0.0001299356727008645, + "loss": 0.0178, + "step": 3058 + }, + { + "epoch": 0.40413515209565015, + "grad_norm": 0.24964120984077454, + "learning_rate": 0.00012989601509366967, + "loss": 0.022, + "step": 3059 + }, + { + "epoch": 0.4042672655811342, + "grad_norm": 0.34468597173690796, + "learning_rate": 0.00012985635232249724, + "loss": 0.0253, + "step": 3060 + }, + { + "epoch": 0.4043993790666182, + "grad_norm": 0.19694958627223969, + "learning_rate": 0.00012981668439419815, + "loss": 0.0332, + "step": 3061 + }, + { + "epoch": 0.40453149255210225, + "grad_norm": 0.09895392507314682, + "learning_rate": 0.00012977701131562436, + "loss": 0.009, + "step": 3062 + }, + { + "epoch": 0.4046636060375863, + "grad_norm": 0.14473307132720947, + "learning_rate": 0.00012973733309362857, + "loss": 0.02, + "step": 3063 + }, + { + "epoch": 0.4047957195230703, + "grad_norm": 0.2015502005815506, + "learning_rate": 0.00012969764973506454, + "loss": 0.0279, + "step": 3064 + }, + { + "epoch": 0.40492783300855434, + "grad_norm": 0.17858926951885223, + "learning_rate": 0.00012965796124678677, + "loss": 0.024, + "step": 3065 + }, + { + "epoch": 0.40505994649403837, + "grad_norm": 0.3196292519569397, + "learning_rate": 0.0001296182676356507, + "loss": 0.0288, + "step": 3066 + }, + { + "epoch": 0.4051920599795224, + "grad_norm": 0.20552773773670197, + "learning_rate": 0.0001295785689085127, + "loss": 0.0251, + "step": 3067 + }, + { + "epoch": 0.40532417346500643, + "grad_norm": 0.19985134899616241, + "learning_rate": 0.00012953886507222992, + "loss": 0.0173, + "step": 3068 + }, + { + "epoch": 0.40545628695049046, + "grad_norm": 0.14639391005039215, + "learning_rate": 0.0001294991561336605, + "loss": 0.0225, + "step": 3069 + }, + { + "epoch": 0.4055884004359745, + "grad_norm": 1.9710235595703125, + "learning_rate": 0.00012945944209966345, + "loss": 0.0663, + "step": 3070 + }, + { + "epoch": 0.4057205139214585, + "grad_norm": 0.275114506483078, + "learning_rate": 0.0001294197229770986, + "loss": 0.0314, + "step": 3071 + }, + { + "epoch": 0.40585262740694256, + "grad_norm": 0.12577217817306519, + "learning_rate": 0.00012937999877282662, + "loss": 0.0104, + "step": 3072 + }, + { + "epoch": 0.4059847408924266, + "grad_norm": 0.12943196296691895, + "learning_rate": 0.0001293402694937092, + "loss": 0.0144, + "step": 3073 + }, + { + "epoch": 0.4061168543779106, + "grad_norm": 0.18152198195457458, + "learning_rate": 0.00012930053514660883, + "loss": 0.0144, + "step": 3074 + }, + { + "epoch": 0.40624896786339465, + "grad_norm": 0.1707521229982376, + "learning_rate": 0.0001292607957383888, + "loss": 0.0163, + "step": 3075 + }, + { + "epoch": 0.4063810813488787, + "grad_norm": 0.1473418027162552, + "learning_rate": 0.00012922105127591348, + "loss": 0.0152, + "step": 3076 + }, + { + "epoch": 0.4065131948343627, + "grad_norm": 0.19073860347270966, + "learning_rate": 0.00012918130176604783, + "loss": 0.0218, + "step": 3077 + }, + { + "epoch": 0.40664530831984674, + "grad_norm": 0.23325873911380768, + "learning_rate": 0.00012914154721565795, + "loss": 0.0187, + "step": 3078 + }, + { + "epoch": 0.4067774218053308, + "grad_norm": 0.1258496791124344, + "learning_rate": 0.00012910178763161066, + "loss": 0.0132, + "step": 3079 + }, + { + "epoch": 0.4069095352908148, + "grad_norm": 0.25707176327705383, + "learning_rate": 0.00012906202302077365, + "loss": 0.0192, + "step": 3080 + }, + { + "epoch": 0.40704164877629884, + "grad_norm": 0.2727690041065216, + "learning_rate": 0.00012902225339001558, + "loss": 0.043, + "step": 3081 + }, + { + "epoch": 0.40717376226178287, + "grad_norm": 0.2927294075489044, + "learning_rate": 0.00012898247874620585, + "loss": 0.0261, + "step": 3082 + }, + { + "epoch": 0.4073058757472669, + "grad_norm": 0.20090535283088684, + "learning_rate": 0.0001289426990962148, + "loss": 0.0243, + "step": 3083 + }, + { + "epoch": 0.40743798923275093, + "grad_norm": 0.2168041467666626, + "learning_rate": 0.0001289029144469137, + "loss": 0.0344, + "step": 3084 + }, + { + "epoch": 0.40757010271823496, + "grad_norm": 0.48691752552986145, + "learning_rate": 0.00012886312480517447, + "loss": 0.0255, + "step": 3085 + }, + { + "epoch": 0.407702216203719, + "grad_norm": 0.2309977412223816, + "learning_rate": 0.0001288233301778701, + "loss": 0.0221, + "step": 3086 + }, + { + "epoch": 0.407834329689203, + "grad_norm": 0.17883218824863434, + "learning_rate": 0.00012878353057187435, + "loss": 0.0205, + "step": 3087 + }, + { + "epoch": 0.40796644317468705, + "grad_norm": 0.24214334785938263, + "learning_rate": 0.00012874372599406192, + "loss": 0.027, + "step": 3088 + }, + { + "epoch": 0.4080985566601711, + "grad_norm": 0.15543243288993835, + "learning_rate": 0.00012870391645130818, + "loss": 0.0111, + "step": 3089 + }, + { + "epoch": 0.4082306701456551, + "grad_norm": 0.25050976872444153, + "learning_rate": 0.0001286641019504896, + "loss": 0.0391, + "step": 3090 + }, + { + "epoch": 0.40836278363113915, + "grad_norm": 0.2022220343351364, + "learning_rate": 0.00012862428249848335, + "loss": 0.0196, + "step": 3091 + }, + { + "epoch": 0.4084948971166232, + "grad_norm": 0.19390860199928284, + "learning_rate": 0.00012858445810216747, + "loss": 0.0157, + "step": 3092 + }, + { + "epoch": 0.4086270106021072, + "grad_norm": 0.22381038963794708, + "learning_rate": 0.00012854462876842095, + "loss": 0.0267, + "step": 3093 + }, + { + "epoch": 0.40875912408759124, + "grad_norm": 0.32017675042152405, + "learning_rate": 0.00012850479450412348, + "loss": 0.0227, + "step": 3094 + }, + { + "epoch": 0.4088912375730753, + "grad_norm": 0.253348708152771, + "learning_rate": 0.00012846495531615573, + "loss": 0.0343, + "step": 3095 + }, + { + "epoch": 0.4090233510585593, + "grad_norm": 0.3195996582508087, + "learning_rate": 0.00012842511121139916, + "loss": 0.0157, + "step": 3096 + }, + { + "epoch": 0.40915546454404333, + "grad_norm": 0.1614832580089569, + "learning_rate": 0.0001283852621967361, + "loss": 0.0201, + "step": 3097 + }, + { + "epoch": 0.40928757802952737, + "grad_norm": 0.1978360265493393, + "learning_rate": 0.00012834540827904976, + "loss": 0.0217, + "step": 3098 + }, + { + "epoch": 0.4094196915150114, + "grad_norm": 0.1857193112373352, + "learning_rate": 0.00012830554946522405, + "loss": 0.0137, + "step": 3099 + }, + { + "epoch": 0.40955180500049543, + "grad_norm": 0.2626390755176544, + "learning_rate": 0.00012826568576214398, + "loss": 0.0146, + "step": 3100 + }, + { + "epoch": 0.40968391848597946, + "grad_norm": 0.20259742438793182, + "learning_rate": 0.00012822581717669514, + "loss": 0.0226, + "step": 3101 + }, + { + "epoch": 0.4098160319714635, + "grad_norm": 0.1670604795217514, + "learning_rate": 0.00012818594371576412, + "loss": 0.0168, + "step": 3102 + }, + { + "epoch": 0.4099481454569475, + "grad_norm": 0.15540018677711487, + "learning_rate": 0.00012814606538623835, + "loss": 0.0151, + "step": 3103 + }, + { + "epoch": 0.41008025894243155, + "grad_norm": 0.31598779559135437, + "learning_rate": 0.00012810618219500603, + "loss": 0.0346, + "step": 3104 + }, + { + "epoch": 0.4102123724279156, + "grad_norm": 0.20833928883075714, + "learning_rate": 0.00012806629414895625, + "loss": 0.0234, + "step": 3105 + }, + { + "epoch": 0.4103444859133996, + "grad_norm": 0.12443934381008148, + "learning_rate": 0.00012802640125497892, + "loss": 0.0135, + "step": 3106 + }, + { + "epoch": 0.41047659939888365, + "grad_norm": 0.12392992526292801, + "learning_rate": 0.00012798650351996478, + "loss": 0.0127, + "step": 3107 + }, + { + "epoch": 0.4106087128843677, + "grad_norm": 0.26349079608917236, + "learning_rate": 0.00012794660095080543, + "loss": 0.0233, + "step": 3108 + }, + { + "epoch": 0.4107408263698517, + "grad_norm": 0.14821572601795197, + "learning_rate": 0.0001279066935543933, + "loss": 0.0161, + "step": 3109 + }, + { + "epoch": 0.41087293985533574, + "grad_norm": 0.305846244096756, + "learning_rate": 0.00012786678133762164, + "loss": 0.0147, + "step": 3110 + }, + { + "epoch": 0.41100505334081977, + "grad_norm": 0.2815128266811371, + "learning_rate": 0.00012782686430738453, + "loss": 0.0324, + "step": 3111 + }, + { + "epoch": 0.4111371668263038, + "grad_norm": 0.5089820027351379, + "learning_rate": 0.0001277869424705769, + "loss": 0.0328, + "step": 3112 + }, + { + "epoch": 0.41126928031178783, + "grad_norm": 0.24389733374118805, + "learning_rate": 0.0001277470158340945, + "loss": 0.0256, + "step": 3113 + }, + { + "epoch": 0.41140139379727186, + "grad_norm": 0.281982958316803, + "learning_rate": 0.0001277070844048339, + "loss": 0.0348, + "step": 3114 + }, + { + "epoch": 0.4115335072827559, + "grad_norm": 0.20420657098293304, + "learning_rate": 0.00012766714818969254, + "loss": 0.0229, + "step": 3115 + }, + { + "epoch": 0.4116656207682399, + "grad_norm": 0.19268015027046204, + "learning_rate": 0.0001276272071955686, + "loss": 0.0111, + "step": 3116 + }, + { + "epoch": 0.41179773425372396, + "grad_norm": 0.3611801266670227, + "learning_rate": 0.00012758726142936117, + "loss": 0.0402, + "step": 3117 + }, + { + "epoch": 0.411929847739208, + "grad_norm": 0.18338996171951294, + "learning_rate": 0.00012754731089797015, + "loss": 0.0175, + "step": 3118 + }, + { + "epoch": 0.412061961224692, + "grad_norm": 0.20748326182365417, + "learning_rate": 0.00012750735560829624, + "loss": 0.0255, + "step": 3119 + }, + { + "epoch": 0.41219407471017605, + "grad_norm": 0.20322351157665253, + "learning_rate": 0.00012746739556724091, + "loss": 0.0112, + "step": 3120 + }, + { + "epoch": 0.4123261881956601, + "grad_norm": 0.07140868902206421, + "learning_rate": 0.00012742743078170664, + "loss": 0.0043, + "step": 3121 + }, + { + "epoch": 0.4124583016811441, + "grad_norm": 0.2231295257806778, + "learning_rate": 0.0001273874612585965, + "loss": 0.0225, + "step": 3122 + }, + { + "epoch": 0.41259041516662814, + "grad_norm": 0.21522819995880127, + "learning_rate": 0.00012734748700481444, + "loss": 0.0333, + "step": 3123 + }, + { + "epoch": 0.4127225286521122, + "grad_norm": 0.1936907023191452, + "learning_rate": 0.0001273075080272654, + "loss": 0.0216, + "step": 3124 + }, + { + "epoch": 0.4128546421375962, + "grad_norm": 0.2655859887599945, + "learning_rate": 0.00012726752433285486, + "loss": 0.0306, + "step": 3125 + }, + { + "epoch": 0.41298675562308024, + "grad_norm": 0.2395084798336029, + "learning_rate": 0.00012722753592848935, + "loss": 0.0302, + "step": 3126 + }, + { + "epoch": 0.41311886910856427, + "grad_norm": 0.23985159397125244, + "learning_rate": 0.00012718754282107608, + "loss": 0.0232, + "step": 3127 + }, + { + "epoch": 0.4132509825940483, + "grad_norm": 0.3043936491012573, + "learning_rate": 0.00012714754501752312, + "loss": 0.0345, + "step": 3128 + }, + { + "epoch": 0.41338309607953233, + "grad_norm": 0.20778505504131317, + "learning_rate": 0.00012710754252473935, + "loss": 0.0189, + "step": 3129 + }, + { + "epoch": 0.41351520956501636, + "grad_norm": 0.21860454976558685, + "learning_rate": 0.00012706753534963444, + "loss": 0.0239, + "step": 3130 + }, + { + "epoch": 0.4136473230505004, + "grad_norm": 0.2815905213356018, + "learning_rate": 0.00012702752349911888, + "loss": 0.0339, + "step": 3131 + }, + { + "epoch": 0.4137794365359844, + "grad_norm": 0.22736093401908875, + "learning_rate": 0.000126987506980104, + "loss": 0.025, + "step": 3132 + }, + { + "epoch": 0.41391155002146846, + "grad_norm": 0.41342923045158386, + "learning_rate": 0.00012694748579950187, + "loss": 0.0222, + "step": 3133 + }, + { + "epoch": 0.4140436635069525, + "grad_norm": 0.3165312707424164, + "learning_rate": 0.00012690745996422542, + "loss": 0.0362, + "step": 3134 + }, + { + "epoch": 0.4141757769924365, + "grad_norm": 0.17335399985313416, + "learning_rate": 0.00012686742948118831, + "loss": 0.0216, + "step": 3135 + }, + { + "epoch": 0.41430789047792055, + "grad_norm": 0.2092505544424057, + "learning_rate": 0.0001268273943573052, + "loss": 0.0275, + "step": 3136 + }, + { + "epoch": 0.4144400039634046, + "grad_norm": 0.25818854570388794, + "learning_rate": 0.0001267873545994912, + "loss": 0.0275, + "step": 3137 + }, + { + "epoch": 0.4145721174488886, + "grad_norm": 0.1769459843635559, + "learning_rate": 0.00012674731021466263, + "loss": 0.0203, + "step": 3138 + }, + { + "epoch": 0.41470423093437264, + "grad_norm": 0.26153334975242615, + "learning_rate": 0.00012670726120973631, + "loss": 0.016, + "step": 3139 + }, + { + "epoch": 0.4148363444198567, + "grad_norm": 0.33175545930862427, + "learning_rate": 0.00012666720759162996, + "loss": 0.034, + "step": 3140 + }, + { + "epoch": 0.4149684579053407, + "grad_norm": 0.17180666327476501, + "learning_rate": 0.0001266271493672621, + "loss": 0.0187, + "step": 3141 + }, + { + "epoch": 0.41510057139082474, + "grad_norm": 0.18626075983047485, + "learning_rate": 0.00012658708654355203, + "loss": 0.0193, + "step": 3142 + }, + { + "epoch": 0.41523268487630877, + "grad_norm": 0.32776767015457153, + "learning_rate": 0.00012654701912741988, + "loss": 0.019, + "step": 3143 + }, + { + "epoch": 0.4153647983617928, + "grad_norm": 0.1480986624956131, + "learning_rate": 0.00012650694712578652, + "loss": 0.0105, + "step": 3144 + }, + { + "epoch": 0.41549691184727683, + "grad_norm": 0.18982647359371185, + "learning_rate": 0.00012646687054557366, + "loss": 0.0246, + "step": 3145 + }, + { + "epoch": 0.41562902533276086, + "grad_norm": 0.3359149396419525, + "learning_rate": 0.00012642678939370376, + "loss": 0.0178, + "step": 3146 + }, + { + "epoch": 0.4157611388182449, + "grad_norm": 0.1366925984621048, + "learning_rate": 0.00012638670367710013, + "loss": 0.0204, + "step": 3147 + }, + { + "epoch": 0.4158932523037289, + "grad_norm": 0.18426810204982758, + "learning_rate": 0.0001263466134026868, + "loss": 0.019, + "step": 3148 + }, + { + "epoch": 0.41602536578921295, + "grad_norm": 0.2569717764854431, + "learning_rate": 0.00012630651857738854, + "loss": 0.0189, + "step": 3149 + }, + { + "epoch": 0.416157479274697, + "grad_norm": 0.17694905400276184, + "learning_rate": 0.00012626641920813114, + "loss": 0.0182, + "step": 3150 + }, + { + "epoch": 0.416289592760181, + "grad_norm": 0.3232481777667999, + "learning_rate": 0.0001262263153018409, + "loss": 0.0192, + "step": 3151 + }, + { + "epoch": 0.41642170624566505, + "grad_norm": 0.23598027229309082, + "learning_rate": 0.00012618620686544505, + "loss": 0.0315, + "step": 3152 + }, + { + "epoch": 0.4165538197311491, + "grad_norm": 0.2639772891998291, + "learning_rate": 0.00012614609390587157, + "loss": 0.024, + "step": 3153 + }, + { + "epoch": 0.4166859332166331, + "grad_norm": 0.17504766583442688, + "learning_rate": 0.00012610597643004926, + "loss": 0.0241, + "step": 3154 + }, + { + "epoch": 0.41681804670211714, + "grad_norm": 0.1681995987892151, + "learning_rate": 0.00012606585444490762, + "loss": 0.0191, + "step": 3155 + }, + { + "epoch": 0.41695016018760117, + "grad_norm": 0.13266129791736603, + "learning_rate": 0.00012602572795737695, + "loss": 0.015, + "step": 3156 + }, + { + "epoch": 0.4170822736730852, + "grad_norm": 0.14186996221542358, + "learning_rate": 0.00012598559697438844, + "loss": 0.019, + "step": 3157 + }, + { + "epoch": 0.41721438715856923, + "grad_norm": 0.20277422666549683, + "learning_rate": 0.00012594546150287384, + "loss": 0.0177, + "step": 3158 + }, + { + "epoch": 0.41734650064405326, + "grad_norm": 0.18437455594539642, + "learning_rate": 0.00012590532154976595, + "loss": 0.0211, + "step": 3159 + }, + { + "epoch": 0.4174786141295373, + "grad_norm": 0.22293388843536377, + "learning_rate": 0.00012586517712199807, + "loss": 0.0173, + "step": 3160 + }, + { + "epoch": 0.4176107276150213, + "grad_norm": 0.1834786832332611, + "learning_rate": 0.00012582502822650445, + "loss": 0.0168, + "step": 3161 + }, + { + "epoch": 0.41774284110050536, + "grad_norm": 0.16545066237449646, + "learning_rate": 0.0001257848748702201, + "loss": 0.012, + "step": 3162 + }, + { + "epoch": 0.4178749545859894, + "grad_norm": 0.16071240603923798, + "learning_rate": 0.00012574471706008067, + "loss": 0.0217, + "step": 3163 + }, + { + "epoch": 0.4180070680714734, + "grad_norm": 0.2613682746887207, + "learning_rate": 0.00012570455480302278, + "loss": 0.0285, + "step": 3164 + }, + { + "epoch": 0.41813918155695745, + "grad_norm": 0.1711345762014389, + "learning_rate": 0.00012566438810598365, + "loss": 0.0248, + "step": 3165 + }, + { + "epoch": 0.4182712950424415, + "grad_norm": 0.29203635454177856, + "learning_rate": 0.0001256242169759013, + "loss": 0.03, + "step": 3166 + }, + { + "epoch": 0.4184034085279255, + "grad_norm": 0.18163099884986877, + "learning_rate": 0.0001255840414197146, + "loss": 0.0185, + "step": 3167 + }, + { + "epoch": 0.41853552201340954, + "grad_norm": 0.21773065626621246, + "learning_rate": 0.00012554386144436304, + "loss": 0.0199, + "step": 3168 + }, + { + "epoch": 0.4186676354988936, + "grad_norm": 0.19037318229675293, + "learning_rate": 0.00012550367705678708, + "loss": 0.0278, + "step": 3169 + }, + { + "epoch": 0.4187997489843776, + "grad_norm": 0.23356559872627258, + "learning_rate": 0.00012546348826392772, + "loss": 0.021, + "step": 3170 + }, + { + "epoch": 0.41893186246986164, + "grad_norm": 0.2311427742242813, + "learning_rate": 0.00012542329507272688, + "loss": 0.0234, + "step": 3171 + }, + { + "epoch": 0.41906397595534567, + "grad_norm": 0.17222066223621368, + "learning_rate": 0.00012538309749012715, + "loss": 0.0198, + "step": 3172 + }, + { + "epoch": 0.4191960894408297, + "grad_norm": 0.24496857821941376, + "learning_rate": 0.0001253428955230719, + "loss": 0.037, + "step": 3173 + }, + { + "epoch": 0.41932820292631373, + "grad_norm": 0.2687060534954071, + "learning_rate": 0.00012530268917850535, + "loss": 0.0192, + "step": 3174 + }, + { + "epoch": 0.4194603164117977, + "grad_norm": 0.24776218831539154, + "learning_rate": 0.00012526247846337228, + "loss": 0.0215, + "step": 3175 + }, + { + "epoch": 0.41959242989728174, + "grad_norm": 0.15776574611663818, + "learning_rate": 0.00012522226338461842, + "loss": 0.0153, + "step": 3176 + }, + { + "epoch": 0.41972454338276577, + "grad_norm": 0.3436722159385681, + "learning_rate": 0.00012518204394919015, + "loss": 0.0237, + "step": 3177 + }, + { + "epoch": 0.4198566568682498, + "grad_norm": 0.17449195683002472, + "learning_rate": 0.00012514182016403461, + "loss": 0.0139, + "step": 3178 + }, + { + "epoch": 0.41998877035373383, + "grad_norm": 0.21825307607650757, + "learning_rate": 0.00012510159203609974, + "loss": 0.0312, + "step": 3179 + }, + { + "epoch": 0.42012088383921786, + "grad_norm": 0.207487091422081, + "learning_rate": 0.00012506135957233416, + "loss": 0.0302, + "step": 3180 + }, + { + "epoch": 0.4202529973247019, + "grad_norm": 0.17855414748191833, + "learning_rate": 0.0001250211227796873, + "loss": 0.0176, + "step": 3181 + }, + { + "epoch": 0.4203851108101859, + "grad_norm": 0.20130109786987305, + "learning_rate": 0.00012498088166510931, + "loss": 0.018, + "step": 3182 + }, + { + "epoch": 0.42051722429566996, + "grad_norm": 0.25873619318008423, + "learning_rate": 0.00012494063623555107, + "loss": 0.022, + "step": 3183 + }, + { + "epoch": 0.420649337781154, + "grad_norm": 0.1681220680475235, + "learning_rate": 0.00012490038649796425, + "loss": 0.0241, + "step": 3184 + }, + { + "epoch": 0.420781451266638, + "grad_norm": 0.24111413955688477, + "learning_rate": 0.00012486013245930125, + "loss": 0.0265, + "step": 3185 + }, + { + "epoch": 0.42091356475212205, + "grad_norm": 0.1893637329339981, + "learning_rate": 0.0001248198741265152, + "loss": 0.0233, + "step": 3186 + }, + { + "epoch": 0.4210456782376061, + "grad_norm": 0.1877831071615219, + "learning_rate": 0.0001247796115065599, + "loss": 0.0292, + "step": 3187 + }, + { + "epoch": 0.4211777917230901, + "grad_norm": 0.21083886921405792, + "learning_rate": 0.00012473934460639007, + "loss": 0.0336, + "step": 3188 + }, + { + "epoch": 0.42130990520857414, + "grad_norm": 0.1846085637807846, + "learning_rate": 0.00012469907343296097, + "loss": 0.0175, + "step": 3189 + }, + { + "epoch": 0.4214420186940582, + "grad_norm": 0.17249713838100433, + "learning_rate": 0.00012465879799322877, + "loss": 0.0168, + "step": 3190 + }, + { + "epoch": 0.4215741321795422, + "grad_norm": 0.2629503011703491, + "learning_rate": 0.00012461851829415028, + "loss": 0.0222, + "step": 3191 + }, + { + "epoch": 0.42170624566502624, + "grad_norm": 0.18024857342243195, + "learning_rate": 0.00012457823434268303, + "loss": 0.0196, + "step": 3192 + }, + { + "epoch": 0.42183835915051027, + "grad_norm": 0.23293425142765045, + "learning_rate": 0.00012453794614578537, + "loss": 0.0248, + "step": 3193 + }, + { + "epoch": 0.4219704726359943, + "grad_norm": 0.16819629073143005, + "learning_rate": 0.00012449765371041628, + "loss": 0.0108, + "step": 3194 + }, + { + "epoch": 0.42210258612147833, + "grad_norm": 0.20160499215126038, + "learning_rate": 0.00012445735704353557, + "loss": 0.0185, + "step": 3195 + }, + { + "epoch": 0.42223469960696236, + "grad_norm": 0.18579092621803284, + "learning_rate": 0.00012441705615210368, + "loss": 0.0225, + "step": 3196 + }, + { + "epoch": 0.4223668130924464, + "grad_norm": 0.16678574681282043, + "learning_rate": 0.00012437675104308194, + "loss": 0.0222, + "step": 3197 + }, + { + "epoch": 0.4224989265779304, + "grad_norm": 0.22756989300251007, + "learning_rate": 0.0001243364417234322, + "loss": 0.0211, + "step": 3198 + }, + { + "epoch": 0.42263104006341445, + "grad_norm": 0.2957361042499542, + "learning_rate": 0.00012429612820011717, + "loss": 0.0244, + "step": 3199 + }, + { + "epoch": 0.4227631535488985, + "grad_norm": 0.1491062194108963, + "learning_rate": 0.00012425581048010028, + "loss": 0.0145, + "step": 3200 + }, + { + "epoch": 0.4228952670343825, + "grad_norm": 0.1475045531988144, + "learning_rate": 0.0001242154885703456, + "loss": 0.0145, + "step": 3201 + }, + { + "epoch": 0.42302738051986655, + "grad_norm": 0.23000866174697876, + "learning_rate": 0.0001241751624778181, + "loss": 0.0158, + "step": 3202 + }, + { + "epoch": 0.4231594940053506, + "grad_norm": 0.22191113233566284, + "learning_rate": 0.00012413483220948324, + "loss": 0.0203, + "step": 3203 + }, + { + "epoch": 0.4232916074908346, + "grad_norm": 0.14425142109394073, + "learning_rate": 0.0001240944977723074, + "loss": 0.0112, + "step": 3204 + }, + { + "epoch": 0.42342372097631864, + "grad_norm": 0.5771812796592712, + "learning_rate": 0.00012405415917325757, + "loss": 0.049, + "step": 3205 + }, + { + "epoch": 0.42355583446180267, + "grad_norm": 0.19761916995048523, + "learning_rate": 0.0001240138164193015, + "loss": 0.0186, + "step": 3206 + }, + { + "epoch": 0.4236879479472867, + "grad_norm": 0.19133833050727844, + "learning_rate": 0.0001239734695174076, + "loss": 0.0244, + "step": 3207 + }, + { + "epoch": 0.42382006143277073, + "grad_norm": 0.16783583164215088, + "learning_rate": 0.0001239331184745451, + "loss": 0.0202, + "step": 3208 + }, + { + "epoch": 0.42395217491825476, + "grad_norm": 0.23732124269008636, + "learning_rate": 0.00012389276329768386, + "loss": 0.0196, + "step": 3209 + }, + { + "epoch": 0.4240842884037388, + "grad_norm": 0.0561022087931633, + "learning_rate": 0.0001238524039937945, + "loss": 0.0046, + "step": 3210 + }, + { + "epoch": 0.4242164018892228, + "grad_norm": 0.18584446609020233, + "learning_rate": 0.00012381204056984832, + "loss": 0.0241, + "step": 3211 + }, + { + "epoch": 0.42434851537470686, + "grad_norm": 0.6407673954963684, + "learning_rate": 0.00012377167303281736, + "loss": 0.0226, + "step": 3212 + }, + { + "epoch": 0.4244806288601909, + "grad_norm": 0.2582055926322937, + "learning_rate": 0.00012373130138967434, + "loss": 0.0223, + "step": 3213 + }, + { + "epoch": 0.4246127423456749, + "grad_norm": 0.18671056628227234, + "learning_rate": 0.0001236909256473927, + "loss": 0.0242, + "step": 3214 + }, + { + "epoch": 0.42474485583115895, + "grad_norm": 0.18192487955093384, + "learning_rate": 0.00012365054581294665, + "loss": 0.0214, + "step": 3215 + }, + { + "epoch": 0.424876969316643, + "grad_norm": 0.19129694998264313, + "learning_rate": 0.00012361016189331098, + "loss": 0.027, + "step": 3216 + }, + { + "epoch": 0.425009082802127, + "grad_norm": 0.23282450437545776, + "learning_rate": 0.0001235697738954613, + "loss": 0.0234, + "step": 3217 + }, + { + "epoch": 0.42514119628761105, + "grad_norm": 0.22248545289039612, + "learning_rate": 0.00012352938182637387, + "loss": 0.0178, + "step": 3218 + }, + { + "epoch": 0.4252733097730951, + "grad_norm": 0.20974159240722656, + "learning_rate": 0.00012348898569302565, + "loss": 0.0286, + "step": 3219 + }, + { + "epoch": 0.4254054232585791, + "grad_norm": 0.13747277855873108, + "learning_rate": 0.00012344858550239433, + "loss": 0.0152, + "step": 3220 + }, + { + "epoch": 0.42553753674406314, + "grad_norm": 0.2712138593196869, + "learning_rate": 0.0001234081812614583, + "loss": 0.0259, + "step": 3221 + }, + { + "epoch": 0.42566965022954717, + "grad_norm": 0.21853908896446228, + "learning_rate": 0.00012336777297719667, + "loss": 0.0219, + "step": 3222 + }, + { + "epoch": 0.4258017637150312, + "grad_norm": 0.17728565633296967, + "learning_rate": 0.0001233273606565891, + "loss": 0.0233, + "step": 3223 + }, + { + "epoch": 0.42593387720051523, + "grad_norm": 0.209276482462883, + "learning_rate": 0.00012328694430661618, + "loss": 0.0219, + "step": 3224 + }, + { + "epoch": 0.42606599068599926, + "grad_norm": 0.15013103187084198, + "learning_rate": 0.000123246523934259, + "loss": 0.0143, + "step": 3225 + }, + { + "epoch": 0.4261981041714833, + "grad_norm": 0.19634680449962616, + "learning_rate": 0.0001232060995464995, + "loss": 0.028, + "step": 3226 + }, + { + "epoch": 0.4263302176569673, + "grad_norm": 0.19637438654899597, + "learning_rate": 0.00012316567115032014, + "loss": 0.0363, + "step": 3227 + }, + { + "epoch": 0.42646233114245136, + "grad_norm": 0.3099977374076843, + "learning_rate": 0.00012312523875270426, + "loss": 0.0282, + "step": 3228 + }, + { + "epoch": 0.4265944446279354, + "grad_norm": 0.18816916644573212, + "learning_rate": 0.00012308480236063578, + "loss": 0.0264, + "step": 3229 + }, + { + "epoch": 0.4267265581134194, + "grad_norm": 0.14757655560970306, + "learning_rate": 0.00012304436198109928, + "loss": 0.0201, + "step": 3230 + }, + { + "epoch": 0.42685867159890345, + "grad_norm": 0.24353700876235962, + "learning_rate": 0.00012300391762108014, + "loss": 0.0253, + "step": 3231 + }, + { + "epoch": 0.4269907850843875, + "grad_norm": 0.2125053107738495, + "learning_rate": 0.00012296346928756432, + "loss": 0.0192, + "step": 3232 + }, + { + "epoch": 0.4271228985698715, + "grad_norm": 0.1806890368461609, + "learning_rate": 0.00012292301698753853, + "loss": 0.021, + "step": 3233 + }, + { + "epoch": 0.42725501205535554, + "grad_norm": 0.2877216637134552, + "learning_rate": 0.00012288256072799018, + "loss": 0.0249, + "step": 3234 + }, + { + "epoch": 0.4273871255408396, + "grad_norm": 0.18381038308143616, + "learning_rate": 0.0001228421005159073, + "loss": 0.014, + "step": 3235 + }, + { + "epoch": 0.4275192390263236, + "grad_norm": 0.23033744096755981, + "learning_rate": 0.00012280163635827865, + "loss": 0.0199, + "step": 3236 + }, + { + "epoch": 0.42765135251180764, + "grad_norm": 0.2439228892326355, + "learning_rate": 0.00012276116826209362, + "loss": 0.0279, + "step": 3237 + }, + { + "epoch": 0.42778346599729167, + "grad_norm": 0.1486399620771408, + "learning_rate": 0.00012272069623434236, + "loss": 0.0106, + "step": 3238 + }, + { + "epoch": 0.4279155794827757, + "grad_norm": 0.20771166682243347, + "learning_rate": 0.00012268022028201562, + "loss": 0.0336, + "step": 3239 + }, + { + "epoch": 0.42804769296825973, + "grad_norm": 0.18823282420635223, + "learning_rate": 0.0001226397404121049, + "loss": 0.0185, + "step": 3240 + }, + { + "epoch": 0.42817980645374376, + "grad_norm": 0.14146701991558075, + "learning_rate": 0.00012259925663160232, + "loss": 0.0067, + "step": 3241 + }, + { + "epoch": 0.4283119199392278, + "grad_norm": 0.16572389006614685, + "learning_rate": 0.0001225587689475007, + "loss": 0.0222, + "step": 3242 + }, + { + "epoch": 0.4284440334247118, + "grad_norm": 0.20891334116458893, + "learning_rate": 0.00012251827736679358, + "loss": 0.0331, + "step": 3243 + }, + { + "epoch": 0.42857614691019585, + "grad_norm": 0.16869033873081207, + "learning_rate": 0.00012247778189647502, + "loss": 0.0183, + "step": 3244 + }, + { + "epoch": 0.4287082603956799, + "grad_norm": 0.2835995554924011, + "learning_rate": 0.00012243728254353992, + "loss": 0.0224, + "step": 3245 + }, + { + "epoch": 0.4288403738811639, + "grad_norm": 0.23434291779994965, + "learning_rate": 0.00012239677931498376, + "loss": 0.0227, + "step": 3246 + }, + { + "epoch": 0.42897248736664795, + "grad_norm": 0.1688721477985382, + "learning_rate": 0.00012235627221780278, + "loss": 0.0204, + "step": 3247 + }, + { + "epoch": 0.429104600852132, + "grad_norm": 0.30692258477211, + "learning_rate": 0.00012231576125899373, + "loss": 0.0328, + "step": 3248 + }, + { + "epoch": 0.429236714337616, + "grad_norm": 0.18616972863674164, + "learning_rate": 0.00012227524644555418, + "loss": 0.0162, + "step": 3249 + }, + { + "epoch": 0.42936882782310004, + "grad_norm": 0.12355251610279083, + "learning_rate": 0.0001222347277844823, + "loss": 0.0116, + "step": 3250 + }, + { + "epoch": 0.42950094130858407, + "grad_norm": 0.2097376137971878, + "learning_rate": 0.00012219420528277692, + "loss": 0.0148, + "step": 3251 + }, + { + "epoch": 0.4296330547940681, + "grad_norm": 0.39561423659324646, + "learning_rate": 0.00012215367894743756, + "loss": 0.017, + "step": 3252 + }, + { + "epoch": 0.42976516827955213, + "grad_norm": 0.18149615824222565, + "learning_rate": 0.00012211314878546436, + "loss": 0.0201, + "step": 3253 + }, + { + "epoch": 0.42989728176503617, + "grad_norm": 0.20877254009246826, + "learning_rate": 0.00012207261480385817, + "loss": 0.0189, + "step": 3254 + }, + { + "epoch": 0.4300293952505202, + "grad_norm": 0.17631439864635468, + "learning_rate": 0.00012203207700962047, + "loss": 0.0246, + "step": 3255 + }, + { + "epoch": 0.43016150873600423, + "grad_norm": 0.24336519837379456, + "learning_rate": 0.00012199153540975342, + "loss": 0.0263, + "step": 3256 + }, + { + "epoch": 0.43029362222148826, + "grad_norm": 0.16430045664310455, + "learning_rate": 0.00012195099001125978, + "loss": 0.0089, + "step": 3257 + }, + { + "epoch": 0.4304257357069723, + "grad_norm": 0.12992851436138153, + "learning_rate": 0.00012191044082114305, + "loss": 0.0113, + "step": 3258 + }, + { + "epoch": 0.4305578491924563, + "grad_norm": 0.1720288246870041, + "learning_rate": 0.00012186988784640736, + "loss": 0.015, + "step": 3259 + }, + { + "epoch": 0.43068996267794035, + "grad_norm": 0.16994862258434296, + "learning_rate": 0.0001218293310940574, + "loss": 0.0256, + "step": 3260 + }, + { + "epoch": 0.4308220761634244, + "grad_norm": 0.24179445207118988, + "learning_rate": 0.0001217887705710987, + "loss": 0.0196, + "step": 3261 + }, + { + "epoch": 0.4309541896489084, + "grad_norm": 0.20141147077083588, + "learning_rate": 0.00012174820628453725, + "loss": 0.0191, + "step": 3262 + }, + { + "epoch": 0.43108630313439245, + "grad_norm": 0.11286444216966629, + "learning_rate": 0.00012170763824137978, + "loss": 0.0186, + "step": 3263 + }, + { + "epoch": 0.4312184166198765, + "grad_norm": 0.24733056128025055, + "learning_rate": 0.0001216670664486337, + "loss": 0.0177, + "step": 3264 + }, + { + "epoch": 0.4313505301053605, + "grad_norm": 0.35599496960639954, + "learning_rate": 0.00012162649091330698, + "loss": 0.0328, + "step": 3265 + }, + { + "epoch": 0.43148264359084454, + "grad_norm": 0.13181497156620026, + "learning_rate": 0.00012158591164240833, + "loss": 0.0189, + "step": 3266 + }, + { + "epoch": 0.43161475707632857, + "grad_norm": 0.1531556248664856, + "learning_rate": 0.00012154532864294703, + "loss": 0.0171, + "step": 3267 + }, + { + "epoch": 0.4317468705618126, + "grad_norm": 0.3520362377166748, + "learning_rate": 0.00012150474192193306, + "loss": 0.0141, + "step": 3268 + }, + { + "epoch": 0.43187898404729663, + "grad_norm": 0.18968984484672546, + "learning_rate": 0.00012146415148637702, + "loss": 0.0133, + "step": 3269 + }, + { + "epoch": 0.43201109753278066, + "grad_norm": 0.1430044025182724, + "learning_rate": 0.0001214235573432901, + "loss": 0.0119, + "step": 3270 + }, + { + "epoch": 0.4321432110182647, + "grad_norm": 0.15614572167396545, + "learning_rate": 0.00012138295949968424, + "loss": 0.0108, + "step": 3271 + }, + { + "epoch": 0.4322753245037487, + "grad_norm": 0.2793543338775635, + "learning_rate": 0.0001213423579625719, + "loss": 0.0239, + "step": 3272 + }, + { + "epoch": 0.43240743798923276, + "grad_norm": 0.20650598406791687, + "learning_rate": 0.00012130175273896626, + "loss": 0.0326, + "step": 3273 + }, + { + "epoch": 0.4325395514747168, + "grad_norm": 0.21166659891605377, + "learning_rate": 0.00012126114383588114, + "loss": 0.0181, + "step": 3274 + }, + { + "epoch": 0.4326716649602008, + "grad_norm": 0.22816048562526703, + "learning_rate": 0.00012122053126033096, + "loss": 0.0248, + "step": 3275 + }, + { + "epoch": 0.43280377844568485, + "grad_norm": 0.21115653216838837, + "learning_rate": 0.00012117991501933074, + "loss": 0.0341, + "step": 3276 + }, + { + "epoch": 0.4329358919311689, + "grad_norm": 0.1442337930202484, + "learning_rate": 0.00012113929511989619, + "loss": 0.0105, + "step": 3277 + }, + { + "epoch": 0.4330680054166529, + "grad_norm": 0.2589665949344635, + "learning_rate": 0.00012109867156904371, + "loss": 0.0185, + "step": 3278 + }, + { + "epoch": 0.43320011890213694, + "grad_norm": 0.17965416610240936, + "learning_rate": 0.00012105804437379018, + "loss": 0.0283, + "step": 3279 + }, + { + "epoch": 0.433332232387621, + "grad_norm": 0.19684933125972748, + "learning_rate": 0.00012101741354115321, + "loss": 0.0313, + "step": 3280 + }, + { + "epoch": 0.433464345873105, + "grad_norm": 0.1718248873949051, + "learning_rate": 0.00012097677907815103, + "loss": 0.0165, + "step": 3281 + }, + { + "epoch": 0.43359645935858904, + "grad_norm": 0.22733500599861145, + "learning_rate": 0.00012093614099180246, + "loss": 0.0212, + "step": 3282 + }, + { + "epoch": 0.43372857284407307, + "grad_norm": 0.16537611186504364, + "learning_rate": 0.00012089549928912698, + "loss": 0.0161, + "step": 3283 + }, + { + "epoch": 0.4338606863295571, + "grad_norm": 0.22515685856342316, + "learning_rate": 0.00012085485397714469, + "loss": 0.0244, + "step": 3284 + }, + { + "epoch": 0.43399279981504113, + "grad_norm": 0.31087931990623474, + "learning_rate": 0.0001208142050628763, + "loss": 0.027, + "step": 3285 + }, + { + "epoch": 0.43412491330052516, + "grad_norm": 0.26226183772087097, + "learning_rate": 0.0001207735525533432, + "loss": 0.0215, + "step": 3286 + }, + { + "epoch": 0.4342570267860092, + "grad_norm": 0.14036330580711365, + "learning_rate": 0.00012073289645556724, + "loss": 0.0113, + "step": 3287 + }, + { + "epoch": 0.4343891402714932, + "grad_norm": 0.32060906291007996, + "learning_rate": 0.00012069223677657112, + "loss": 0.0258, + "step": 3288 + }, + { + "epoch": 0.43452125375697725, + "grad_norm": 0.2714420258998871, + "learning_rate": 0.00012065157352337793, + "loss": 0.0214, + "step": 3289 + }, + { + "epoch": 0.4346533672424613, + "grad_norm": 0.2144080549478531, + "learning_rate": 0.00012061090670301158, + "loss": 0.0329, + "step": 3290 + }, + { + "epoch": 0.4347854807279453, + "grad_norm": 0.20472444593906403, + "learning_rate": 0.00012057023632249645, + "loss": 0.016, + "step": 3291 + }, + { + "epoch": 0.43491759421342935, + "grad_norm": 0.19094641506671906, + "learning_rate": 0.00012052956238885762, + "loss": 0.0245, + "step": 3292 + }, + { + "epoch": 0.4350497076989134, + "grad_norm": 0.12641161680221558, + "learning_rate": 0.00012048888490912071, + "loss": 0.0177, + "step": 3293 + }, + { + "epoch": 0.4351818211843974, + "grad_norm": 0.22624894976615906, + "learning_rate": 0.00012044820389031203, + "loss": 0.024, + "step": 3294 + }, + { + "epoch": 0.43531393466988144, + "grad_norm": 0.33124491572380066, + "learning_rate": 0.00012040751933945847, + "loss": 0.0286, + "step": 3295 + }, + { + "epoch": 0.4354460481553655, + "grad_norm": 0.24567212164402008, + "learning_rate": 0.00012036683126358747, + "loss": 0.0382, + "step": 3296 + }, + { + "epoch": 0.4355781616408495, + "grad_norm": 0.296768456697464, + "learning_rate": 0.00012032613966972721, + "loss": 0.0238, + "step": 3297 + }, + { + "epoch": 0.43571027512633353, + "grad_norm": 0.28525975346565247, + "learning_rate": 0.00012028544456490634, + "loss": 0.0274, + "step": 3298 + }, + { + "epoch": 0.43584238861181757, + "grad_norm": 0.19230809807777405, + "learning_rate": 0.00012024474595615422, + "loss": 0.0246, + "step": 3299 + }, + { + "epoch": 0.4359745020973016, + "grad_norm": 0.16705505549907684, + "learning_rate": 0.00012020404385050078, + "loss": 0.0072, + "step": 3300 + }, + { + "epoch": 0.43610661558278563, + "grad_norm": 0.1971859335899353, + "learning_rate": 0.00012016333825497647, + "loss": 0.0167, + "step": 3301 + }, + { + "epoch": 0.43623872906826966, + "grad_norm": 0.17014151811599731, + "learning_rate": 0.00012012262917661252, + "loss": 0.0236, + "step": 3302 + }, + { + "epoch": 0.4363708425537537, + "grad_norm": 0.08884875476360321, + "learning_rate": 0.00012008191662244059, + "loss": 0.0083, + "step": 3303 + }, + { + "epoch": 0.4365029560392377, + "grad_norm": 0.11667829751968384, + "learning_rate": 0.00012004120059949307, + "loss": 0.0147, + "step": 3304 + }, + { + "epoch": 0.43663506952472175, + "grad_norm": 0.17145667970180511, + "learning_rate": 0.00012000048111480283, + "loss": 0.0202, + "step": 3305 + }, + { + "epoch": 0.4367671830102058, + "grad_norm": 0.25768762826919556, + "learning_rate": 0.00011995975817540346, + "loss": 0.0228, + "step": 3306 + }, + { + "epoch": 0.4368992964956898, + "grad_norm": 0.11496775597333908, + "learning_rate": 0.00011991903178832902, + "loss": 0.0103, + "step": 3307 + }, + { + "epoch": 0.43703140998117385, + "grad_norm": 0.2076563835144043, + "learning_rate": 0.00011987830196061429, + "loss": 0.0183, + "step": 3308 + }, + { + "epoch": 0.4371635234666579, + "grad_norm": 0.47022193670272827, + "learning_rate": 0.00011983756869929456, + "loss": 0.0437, + "step": 3309 + }, + { + "epoch": 0.4372956369521419, + "grad_norm": 0.11586523801088333, + "learning_rate": 0.00011979683201140577, + "loss": 0.0163, + "step": 3310 + }, + { + "epoch": 0.43742775043762594, + "grad_norm": 0.2560293972492218, + "learning_rate": 0.00011975609190398438, + "loss": 0.0398, + "step": 3311 + }, + { + "epoch": 0.43755986392310997, + "grad_norm": 0.1786472499370575, + "learning_rate": 0.00011971534838406753, + "loss": 0.0278, + "step": 3312 + }, + { + "epoch": 0.437691977408594, + "grad_norm": 0.1825864315032959, + "learning_rate": 0.00011967460145869282, + "loss": 0.0322, + "step": 3313 + }, + { + "epoch": 0.43782409089407803, + "grad_norm": 0.31292450428009033, + "learning_rate": 0.0001196338511348986, + "loss": 0.0311, + "step": 3314 + }, + { + "epoch": 0.43795620437956206, + "grad_norm": 0.21444229781627655, + "learning_rate": 0.00011959309741972369, + "loss": 0.025, + "step": 3315 + }, + { + "epoch": 0.4380883178650461, + "grad_norm": 0.2052425593137741, + "learning_rate": 0.00011955234032020752, + "loss": 0.0196, + "step": 3316 + }, + { + "epoch": 0.4382204313505301, + "grad_norm": 0.3350735306739807, + "learning_rate": 0.00011951157984339014, + "loss": 0.0315, + "step": 3317 + }, + { + "epoch": 0.43835254483601416, + "grad_norm": 0.1974852830171585, + "learning_rate": 0.00011947081599631218, + "loss": 0.0219, + "step": 3318 + }, + { + "epoch": 0.4384846583214982, + "grad_norm": 0.1646255999803543, + "learning_rate": 0.0001194300487860148, + "loss": 0.0301, + "step": 3319 + }, + { + "epoch": 0.4386167718069822, + "grad_norm": 0.15746988356113434, + "learning_rate": 0.00011938927821953978, + "loss": 0.0203, + "step": 3320 + }, + { + "epoch": 0.43874888529246625, + "grad_norm": 0.10100744664669037, + "learning_rate": 0.00011934850430392948, + "loss": 0.0134, + "step": 3321 + }, + { + "epoch": 0.4388809987779503, + "grad_norm": 0.21147461235523224, + "learning_rate": 0.00011930772704622679, + "loss": 0.0242, + "step": 3322 + }, + { + "epoch": 0.4390131122634343, + "grad_norm": 0.17001758515834808, + "learning_rate": 0.00011926694645347529, + "loss": 0.0154, + "step": 3323 + }, + { + "epoch": 0.43914522574891834, + "grad_norm": 0.11639127880334854, + "learning_rate": 0.00011922616253271901, + "loss": 0.0125, + "step": 3324 + }, + { + "epoch": 0.4392773392344024, + "grad_norm": 0.19367121160030365, + "learning_rate": 0.00011918537529100264, + "loss": 0.0214, + "step": 3325 + }, + { + "epoch": 0.4394094527198864, + "grad_norm": 0.08695891499519348, + "learning_rate": 0.00011914458473537142, + "loss": 0.0081, + "step": 3326 + }, + { + "epoch": 0.43954156620537044, + "grad_norm": 0.17832736670970917, + "learning_rate": 0.00011910379087287111, + "loss": 0.0195, + "step": 3327 + }, + { + "epoch": 0.43967367969085447, + "grad_norm": 0.2097679078578949, + "learning_rate": 0.00011906299371054814, + "loss": 0.0252, + "step": 3328 + }, + { + "epoch": 0.4398057931763385, + "grad_norm": 0.1070525124669075, + "learning_rate": 0.00011902219325544939, + "loss": 0.0087, + "step": 3329 + }, + { + "epoch": 0.43993790666182253, + "grad_norm": 0.2027360051870346, + "learning_rate": 0.00011898138951462248, + "loss": 0.0172, + "step": 3330 + }, + { + "epoch": 0.44007002014730656, + "grad_norm": 0.22899967432022095, + "learning_rate": 0.00011894058249511537, + "loss": 0.0203, + "step": 3331 + }, + { + "epoch": 0.4402021336327906, + "grad_norm": 0.2035856544971466, + "learning_rate": 0.00011889977220397682, + "loss": 0.0221, + "step": 3332 + }, + { + "epoch": 0.4403342471182746, + "grad_norm": 0.17923541367053986, + "learning_rate": 0.00011885895864825599, + "loss": 0.0147, + "step": 3333 + }, + { + "epoch": 0.44046636060375866, + "grad_norm": 0.16749948263168335, + "learning_rate": 0.00011881814183500262, + "loss": 0.0203, + "step": 3334 + }, + { + "epoch": 0.4405984740892427, + "grad_norm": 0.1700792908668518, + "learning_rate": 0.00011877732177126715, + "loss": 0.0207, + "step": 3335 + }, + { + "epoch": 0.4407305875747267, + "grad_norm": 0.19713635742664337, + "learning_rate": 0.00011873649846410038, + "loss": 0.0146, + "step": 3336 + }, + { + "epoch": 0.4408627010602107, + "grad_norm": 0.2588673233985901, + "learning_rate": 0.00011869567192055382, + "loss": 0.0296, + "step": 3337 + }, + { + "epoch": 0.4409948145456947, + "grad_norm": 0.25432080030441284, + "learning_rate": 0.00011865484214767955, + "loss": 0.0244, + "step": 3338 + }, + { + "epoch": 0.44112692803117876, + "grad_norm": 0.23010925948619843, + "learning_rate": 0.00011861400915253005, + "loss": 0.0267, + "step": 3339 + }, + { + "epoch": 0.4412590415166628, + "grad_norm": 0.18061229586601257, + "learning_rate": 0.00011857317294215851, + "loss": 0.0214, + "step": 3340 + }, + { + "epoch": 0.4413911550021468, + "grad_norm": 0.10983574390411377, + "learning_rate": 0.0001185323335236186, + "loss": 0.0131, + "step": 3341 + }, + { + "epoch": 0.44152326848763085, + "grad_norm": 0.5960947275161743, + "learning_rate": 0.00011849149090396461, + "loss": 0.0207, + "step": 3342 + }, + { + "epoch": 0.4416553819731149, + "grad_norm": 0.13170459866523743, + "learning_rate": 0.0001184506450902513, + "loss": 0.0177, + "step": 3343 + }, + { + "epoch": 0.4417874954585989, + "grad_norm": 0.14053009450435638, + "learning_rate": 0.000118409796089534, + "loss": 0.0157, + "step": 3344 + }, + { + "epoch": 0.44191960894408294, + "grad_norm": 0.17476029694080353, + "learning_rate": 0.00011836894390886866, + "loss": 0.013, + "step": 3345 + }, + { + "epoch": 0.442051722429567, + "grad_norm": 0.1530439704656601, + "learning_rate": 0.00011832808855531171, + "loss": 0.0108, + "step": 3346 + }, + { + "epoch": 0.442183835915051, + "grad_norm": 0.2883946895599365, + "learning_rate": 0.00011828723003592015, + "loss": 0.0223, + "step": 3347 + }, + { + "epoch": 0.44231594940053504, + "grad_norm": 0.16099697351455688, + "learning_rate": 0.00011824636835775149, + "loss": 0.0209, + "step": 3348 + }, + { + "epoch": 0.44244806288601907, + "grad_norm": 0.1955551952123642, + "learning_rate": 0.00011820550352786388, + "loss": 0.0174, + "step": 3349 + }, + { + "epoch": 0.4425801763715031, + "grad_norm": 0.2711743414402008, + "learning_rate": 0.00011816463555331594, + "loss": 0.0186, + "step": 3350 + }, + { + "epoch": 0.44271228985698713, + "grad_norm": 0.2542886734008789, + "learning_rate": 0.00011812376444116681, + "loss": 0.0306, + "step": 3351 + }, + { + "epoch": 0.44284440334247116, + "grad_norm": 0.33619576692581177, + "learning_rate": 0.00011808289019847627, + "loss": 0.0232, + "step": 3352 + }, + { + "epoch": 0.4429765168279552, + "grad_norm": 0.21356289088726044, + "learning_rate": 0.00011804201283230452, + "loss": 0.022, + "step": 3353 + }, + { + "epoch": 0.4431086303134392, + "grad_norm": 0.28175997734069824, + "learning_rate": 0.00011800113234971242, + "loss": 0.0265, + "step": 3354 + }, + { + "epoch": 0.44324074379892325, + "grad_norm": 0.16907796263694763, + "learning_rate": 0.00011796024875776123, + "loss": 0.0251, + "step": 3355 + }, + { + "epoch": 0.4433728572844073, + "grad_norm": 0.16458839178085327, + "learning_rate": 0.00011791936206351293, + "loss": 0.021, + "step": 3356 + }, + { + "epoch": 0.4435049707698913, + "grad_norm": 0.12611019611358643, + "learning_rate": 0.00011787847227402986, + "loss": 0.0074, + "step": 3357 + }, + { + "epoch": 0.44363708425537535, + "grad_norm": 0.2491726577281952, + "learning_rate": 0.00011783757939637494, + "loss": 0.0272, + "step": 3358 + }, + { + "epoch": 0.4437691977408594, + "grad_norm": 0.124544657766819, + "learning_rate": 0.00011779668343761174, + "loss": 0.01, + "step": 3359 + }, + { + "epoch": 0.4439013112263434, + "grad_norm": 0.2654660940170288, + "learning_rate": 0.00011775578440480421, + "loss": 0.025, + "step": 3360 + }, + { + "epoch": 0.44403342471182744, + "grad_norm": 0.24884164333343506, + "learning_rate": 0.00011771488230501692, + "loss": 0.025, + "step": 3361 + }, + { + "epoch": 0.44416553819731147, + "grad_norm": 0.11097685992717743, + "learning_rate": 0.00011767397714531492, + "loss": 0.0079, + "step": 3362 + }, + { + "epoch": 0.4442976516827955, + "grad_norm": 0.21636100113391876, + "learning_rate": 0.00011763306893276382, + "loss": 0.0261, + "step": 3363 + }, + { + "epoch": 0.44442976516827953, + "grad_norm": 0.1294645369052887, + "learning_rate": 0.00011759215767442977, + "loss": 0.0091, + "step": 3364 + }, + { + "epoch": 0.44456187865376356, + "grad_norm": 0.20058712363243103, + "learning_rate": 0.00011755124337737937, + "loss": 0.0178, + "step": 3365 + }, + { + "epoch": 0.4446939921392476, + "grad_norm": 0.13472117483615875, + "learning_rate": 0.00011751032604867987, + "loss": 0.0172, + "step": 3366 + }, + { + "epoch": 0.4448261056247316, + "grad_norm": 0.2171383798122406, + "learning_rate": 0.00011746940569539893, + "loss": 0.025, + "step": 3367 + }, + { + "epoch": 0.44495821911021566, + "grad_norm": 0.18214093148708344, + "learning_rate": 0.00011742848232460479, + "loss": 0.0239, + "step": 3368 + }, + { + "epoch": 0.4450903325956997, + "grad_norm": 0.23197366297245026, + "learning_rate": 0.0001173875559433662, + "loss": 0.0231, + "step": 3369 + }, + { + "epoch": 0.4452224460811837, + "grad_norm": 0.24646830558776855, + "learning_rate": 0.00011734662655875242, + "loss": 0.0222, + "step": 3370 + }, + { + "epoch": 0.44535455956666775, + "grad_norm": 0.25738704204559326, + "learning_rate": 0.00011730569417783322, + "loss": 0.0339, + "step": 3371 + }, + { + "epoch": 0.4454866730521518, + "grad_norm": 0.2571242153644562, + "learning_rate": 0.00011726475880767893, + "loss": 0.0306, + "step": 3372 + }, + { + "epoch": 0.4456187865376358, + "grad_norm": 0.18332485854625702, + "learning_rate": 0.00011722382045536036, + "loss": 0.0189, + "step": 3373 + }, + { + "epoch": 0.44575090002311984, + "grad_norm": 0.28679078817367554, + "learning_rate": 0.00011718287912794885, + "loss": 0.0228, + "step": 3374 + }, + { + "epoch": 0.4458830135086039, + "grad_norm": 0.21024517714977264, + "learning_rate": 0.00011714193483251623, + "loss": 0.0338, + "step": 3375 + }, + { + "epoch": 0.4460151269940879, + "grad_norm": 0.2537991404533386, + "learning_rate": 0.0001171009875761349, + "loss": 0.0211, + "step": 3376 + }, + { + "epoch": 0.44614724047957194, + "grad_norm": 0.21349625289440155, + "learning_rate": 0.00011706003736587768, + "loss": 0.0214, + "step": 3377 + }, + { + "epoch": 0.44627935396505597, + "grad_norm": 0.21972428262233734, + "learning_rate": 0.00011701908420881799, + "loss": 0.0275, + "step": 3378 + }, + { + "epoch": 0.44641146745054, + "grad_norm": 0.16195261478424072, + "learning_rate": 0.00011697812811202971, + "loss": 0.0194, + "step": 3379 + }, + { + "epoch": 0.44654358093602403, + "grad_norm": 0.3250785171985626, + "learning_rate": 0.00011693716908258727, + "loss": 0.0283, + "step": 3380 + }, + { + "epoch": 0.44667569442150806, + "grad_norm": 0.1459767371416092, + "learning_rate": 0.00011689620712756553, + "loss": 0.0197, + "step": 3381 + }, + { + "epoch": 0.4468078079069921, + "grad_norm": 0.18223801255226135, + "learning_rate": 0.00011685524225403993, + "loss": 0.0247, + "step": 3382 + }, + { + "epoch": 0.4469399213924761, + "grad_norm": 0.14529427886009216, + "learning_rate": 0.00011681427446908637, + "loss": 0.0164, + "step": 3383 + }, + { + "epoch": 0.44707203487796016, + "grad_norm": 0.1351812779903412, + "learning_rate": 0.00011677330377978127, + "loss": 0.0119, + "step": 3384 + }, + { + "epoch": 0.4472041483634442, + "grad_norm": 0.20510388910770416, + "learning_rate": 0.00011673233019320155, + "loss": 0.0201, + "step": 3385 + }, + { + "epoch": 0.4473362618489282, + "grad_norm": 0.2545018196105957, + "learning_rate": 0.00011669135371642465, + "loss": 0.0263, + "step": 3386 + }, + { + "epoch": 0.44746837533441225, + "grad_norm": 0.1861943006515503, + "learning_rate": 0.00011665037435652849, + "loss": 0.0287, + "step": 3387 + }, + { + "epoch": 0.4476004888198963, + "grad_norm": 0.2189074158668518, + "learning_rate": 0.00011660939212059147, + "loss": 0.0224, + "step": 3388 + }, + { + "epoch": 0.4477326023053803, + "grad_norm": 0.18213927745819092, + "learning_rate": 0.0001165684070156925, + "loss": 0.0196, + "step": 3389 + }, + { + "epoch": 0.44786471579086434, + "grad_norm": 0.2507062256336212, + "learning_rate": 0.000116527419048911, + "loss": 0.0192, + "step": 3390 + }, + { + "epoch": 0.4479968292763484, + "grad_norm": 0.17222446203231812, + "learning_rate": 0.0001164864282273269, + "loss": 0.016, + "step": 3391 + }, + { + "epoch": 0.4481289427618324, + "grad_norm": 0.1732499599456787, + "learning_rate": 0.00011644543455802055, + "loss": 0.0221, + "step": 3392 + }, + { + "epoch": 0.44826105624731644, + "grad_norm": 0.22399833798408508, + "learning_rate": 0.00011640443804807286, + "loss": 0.0317, + "step": 3393 + }, + { + "epoch": 0.44839316973280047, + "grad_norm": 0.21018847823143005, + "learning_rate": 0.00011636343870456523, + "loss": 0.0217, + "step": 3394 + }, + { + "epoch": 0.4485252832182845, + "grad_norm": 0.231175497174263, + "learning_rate": 0.00011632243653457952, + "loss": 0.0271, + "step": 3395 + }, + { + "epoch": 0.44865739670376853, + "grad_norm": 0.26954904198646545, + "learning_rate": 0.00011628143154519806, + "loss": 0.023, + "step": 3396 + }, + { + "epoch": 0.44878951018925256, + "grad_norm": 0.19411055743694305, + "learning_rate": 0.00011624042374350377, + "loss": 0.0201, + "step": 3397 + }, + { + "epoch": 0.4489216236747366, + "grad_norm": 0.1828741729259491, + "learning_rate": 0.00011619941313657987, + "loss": 0.0093, + "step": 3398 + }, + { + "epoch": 0.4490537371602206, + "grad_norm": 0.14031386375427246, + "learning_rate": 0.00011615839973151028, + "loss": 0.0148, + "step": 3399 + }, + { + "epoch": 0.44918585064570465, + "grad_norm": 0.1643749177455902, + "learning_rate": 0.00011611738353537924, + "loss": 0.0149, + "step": 3400 + }, + { + "epoch": 0.4493179641311887, + "grad_norm": 0.16316166520118713, + "learning_rate": 0.00011607636455527155, + "loss": 0.0116, + "step": 3401 + }, + { + "epoch": 0.4494500776166727, + "grad_norm": 0.2305441051721573, + "learning_rate": 0.00011603534279827246, + "loss": 0.031, + "step": 3402 + }, + { + "epoch": 0.44958219110215675, + "grad_norm": 0.20265796780586243, + "learning_rate": 0.00011599431827146772, + "loss": 0.0157, + "step": 3403 + }, + { + "epoch": 0.4497143045876408, + "grad_norm": 0.19166412949562073, + "learning_rate": 0.00011595329098194354, + "loss": 0.0215, + "step": 3404 + }, + { + "epoch": 0.4498464180731248, + "grad_norm": 0.19884948432445526, + "learning_rate": 0.00011591226093678665, + "loss": 0.0213, + "step": 3405 + }, + { + "epoch": 0.44997853155860884, + "grad_norm": 0.15096840262413025, + "learning_rate": 0.00011587122814308418, + "loss": 0.0101, + "step": 3406 + }, + { + "epoch": 0.45011064504409287, + "grad_norm": 0.17579104006290436, + "learning_rate": 0.0001158301926079238, + "loss": 0.0151, + "step": 3407 + }, + { + "epoch": 0.4502427585295769, + "grad_norm": 0.2230728715658188, + "learning_rate": 0.00011578915433839364, + "loss": 0.0164, + "step": 3408 + }, + { + "epoch": 0.45037487201506093, + "grad_norm": 0.17997904121875763, + "learning_rate": 0.00011574811334158227, + "loss": 0.0257, + "step": 3409 + }, + { + "epoch": 0.45050698550054497, + "grad_norm": 0.16918472945690155, + "learning_rate": 0.00011570706962457876, + "loss": 0.0124, + "step": 3410 + }, + { + "epoch": 0.450639098986029, + "grad_norm": 0.2919192910194397, + "learning_rate": 0.00011566602319447266, + "loss": 0.0235, + "step": 3411 + }, + { + "epoch": 0.450771212471513, + "grad_norm": 0.20782966911792755, + "learning_rate": 0.00011562497405835396, + "loss": 0.0208, + "step": 3412 + }, + { + "epoch": 0.45090332595699706, + "grad_norm": 0.17539459466934204, + "learning_rate": 0.00011558392222331313, + "loss": 0.0162, + "step": 3413 + }, + { + "epoch": 0.4510354394424811, + "grad_norm": 0.17549258470535278, + "learning_rate": 0.00011554286769644113, + "loss": 0.0139, + "step": 3414 + }, + { + "epoch": 0.4511675529279651, + "grad_norm": 0.1104462742805481, + "learning_rate": 0.00011550181048482936, + "loss": 0.0116, + "step": 3415 + }, + { + "epoch": 0.45129966641344915, + "grad_norm": 0.17143550515174866, + "learning_rate": 0.00011546075059556965, + "loss": 0.0175, + "step": 3416 + }, + { + "epoch": 0.4514317798989332, + "grad_norm": 0.18821991980075836, + "learning_rate": 0.00011541968803575433, + "loss": 0.0196, + "step": 3417 + }, + { + "epoch": 0.4515638933844172, + "grad_norm": 0.178142711520195, + "learning_rate": 0.00011537862281247624, + "loss": 0.0218, + "step": 3418 + }, + { + "epoch": 0.45169600686990125, + "grad_norm": 0.3631950616836548, + "learning_rate": 0.00011533755493282857, + "loss": 0.0212, + "step": 3419 + }, + { + "epoch": 0.4518281203553853, + "grad_norm": 0.16244377195835114, + "learning_rate": 0.00011529648440390508, + "loss": 0.0195, + "step": 3420 + }, + { + "epoch": 0.4519602338408693, + "grad_norm": 0.09488275647163391, + "learning_rate": 0.00011525541123279991, + "loss": 0.0109, + "step": 3421 + }, + { + "epoch": 0.45209234732635334, + "grad_norm": 0.24750275909900665, + "learning_rate": 0.00011521433542660767, + "loss": 0.0297, + "step": 3422 + }, + { + "epoch": 0.45222446081183737, + "grad_norm": 0.13846619427204132, + "learning_rate": 0.00011517325699242345, + "loss": 0.0183, + "step": 3423 + }, + { + "epoch": 0.4523565742973214, + "grad_norm": 0.15736238658428192, + "learning_rate": 0.00011513217593734277, + "loss": 0.0101, + "step": 3424 + }, + { + "epoch": 0.45248868778280543, + "grad_norm": 0.19229941070079803, + "learning_rate": 0.00011509109226846164, + "loss": 0.0167, + "step": 3425 + }, + { + "epoch": 0.45262080126828946, + "grad_norm": 0.178132563829422, + "learning_rate": 0.0001150500059928765, + "loss": 0.0194, + "step": 3426 + }, + { + "epoch": 0.4527529147537735, + "grad_norm": 0.21312348544597626, + "learning_rate": 0.0001150089171176842, + "loss": 0.0297, + "step": 3427 + }, + { + "epoch": 0.4528850282392575, + "grad_norm": 0.20051683485507965, + "learning_rate": 0.0001149678256499821, + "loss": 0.0126, + "step": 3428 + }, + { + "epoch": 0.45301714172474156, + "grad_norm": 0.22905012965202332, + "learning_rate": 0.00011492673159686797, + "loss": 0.0224, + "step": 3429 + }, + { + "epoch": 0.4531492552102256, + "grad_norm": 0.25302499532699585, + "learning_rate": 0.00011488563496544007, + "loss": 0.0322, + "step": 3430 + }, + { + "epoch": 0.4532813686957096, + "grad_norm": 0.20484831929206848, + "learning_rate": 0.00011484453576279703, + "loss": 0.0148, + "step": 3431 + }, + { + "epoch": 0.45341348218119365, + "grad_norm": 0.22035667300224304, + "learning_rate": 0.00011480343399603799, + "loss": 0.0113, + "step": 3432 + }, + { + "epoch": 0.4535455956666777, + "grad_norm": 0.2351701557636261, + "learning_rate": 0.00011476232967226252, + "loss": 0.0262, + "step": 3433 + }, + { + "epoch": 0.4536777091521617, + "grad_norm": 0.14787419140338898, + "learning_rate": 0.00011472122279857061, + "loss": 0.0113, + "step": 3434 + }, + { + "epoch": 0.45380982263764574, + "grad_norm": 0.47793006896972656, + "learning_rate": 0.00011468011338206271, + "loss": 0.037, + "step": 3435 + }, + { + "epoch": 0.4539419361231298, + "grad_norm": 0.1750878393650055, + "learning_rate": 0.0001146390014298397, + "loss": 0.0179, + "step": 3436 + }, + { + "epoch": 0.4540740496086138, + "grad_norm": 0.3732263147830963, + "learning_rate": 0.00011459788694900289, + "loss": 0.0284, + "step": 3437 + }, + { + "epoch": 0.45420616309409784, + "grad_norm": 0.3407856822013855, + "learning_rate": 0.00011455676994665407, + "loss": 0.0347, + "step": 3438 + }, + { + "epoch": 0.45433827657958187, + "grad_norm": 0.2009911835193634, + "learning_rate": 0.0001145156504298954, + "loss": 0.0255, + "step": 3439 + }, + { + "epoch": 0.4544703900650659, + "grad_norm": 0.14986108243465424, + "learning_rate": 0.00011447452840582952, + "loss": 0.0158, + "step": 3440 + }, + { + "epoch": 0.45460250355054993, + "grad_norm": 0.264070987701416, + "learning_rate": 0.0001144334038815595, + "loss": 0.023, + "step": 3441 + }, + { + "epoch": 0.45473461703603396, + "grad_norm": 0.27384451031684875, + "learning_rate": 0.00011439227686418883, + "loss": 0.0191, + "step": 3442 + }, + { + "epoch": 0.454866730521518, + "grad_norm": 0.2228957712650299, + "learning_rate": 0.00011435114736082142, + "loss": 0.0204, + "step": 3443 + }, + { + "epoch": 0.454998844007002, + "grad_norm": 0.144685298204422, + "learning_rate": 0.00011431001537856163, + "loss": 0.0122, + "step": 3444 + }, + { + "epoch": 0.45513095749248605, + "grad_norm": 0.1886610984802246, + "learning_rate": 0.00011426888092451427, + "loss": 0.0216, + "step": 3445 + }, + { + "epoch": 0.4552630709779701, + "grad_norm": 0.17720267176628113, + "learning_rate": 0.00011422774400578446, + "loss": 0.0191, + "step": 3446 + }, + { + "epoch": 0.4553951844634541, + "grad_norm": 0.30038321018218994, + "learning_rate": 0.00011418660462947795, + "loss": 0.0262, + "step": 3447 + }, + { + "epoch": 0.45552729794893815, + "grad_norm": 0.18168781697750092, + "learning_rate": 0.0001141454628027007, + "loss": 0.0343, + "step": 3448 + }, + { + "epoch": 0.4556594114344222, + "grad_norm": 0.2796495258808136, + "learning_rate": 0.00011410431853255925, + "loss": 0.0247, + "step": 3449 + }, + { + "epoch": 0.4557915249199062, + "grad_norm": 0.23623229563236237, + "learning_rate": 0.00011406317182616049, + "loss": 0.0114, + "step": 3450 + }, + { + "epoch": 0.45592363840539024, + "grad_norm": 0.1999683976173401, + "learning_rate": 0.00011402202269061173, + "loss": 0.0274, + "step": 3451 + }, + { + "epoch": 0.4560557518908743, + "grad_norm": 0.2691020965576172, + "learning_rate": 0.00011398087113302074, + "loss": 0.0258, + "step": 3452 + }, + { + "epoch": 0.4561878653763583, + "grad_norm": 0.31196510791778564, + "learning_rate": 0.00011393971716049563, + "loss": 0.0261, + "step": 3453 + }, + { + "epoch": 0.45631997886184233, + "grad_norm": 0.14454197883605957, + "learning_rate": 0.00011389856078014504, + "loss": 0.0219, + "step": 3454 + }, + { + "epoch": 0.45645209234732637, + "grad_norm": 0.20809254050254822, + "learning_rate": 0.00011385740199907792, + "loss": 0.0218, + "step": 3455 + }, + { + "epoch": 0.4565842058328104, + "grad_norm": 0.21281687915325165, + "learning_rate": 0.00011381624082440374, + "loss": 0.0202, + "step": 3456 + }, + { + "epoch": 0.45671631931829443, + "grad_norm": 0.11839515715837479, + "learning_rate": 0.00011377507726323227, + "loss": 0.0125, + "step": 3457 + }, + { + "epoch": 0.45684843280377846, + "grad_norm": 0.18069903552532196, + "learning_rate": 0.00011373391132267374, + "loss": 0.0206, + "step": 3458 + }, + { + "epoch": 0.4569805462892625, + "grad_norm": 0.11462843418121338, + "learning_rate": 0.00011369274300983886, + "loss": 0.0157, + "step": 3459 + }, + { + "epoch": 0.4571126597747465, + "grad_norm": 0.2575992941856384, + "learning_rate": 0.00011365157233183858, + "loss": 0.0355, + "step": 3460 + }, + { + "epoch": 0.45724477326023055, + "grad_norm": 0.1611703485250473, + "learning_rate": 0.00011361039929578447, + "loss": 0.0135, + "step": 3461 + }, + { + "epoch": 0.4573768867457146, + "grad_norm": 0.2228485494852066, + "learning_rate": 0.00011356922390878834, + "loss": 0.0359, + "step": 3462 + }, + { + "epoch": 0.4575090002311986, + "grad_norm": 0.15648458898067474, + "learning_rate": 0.00011352804617796251, + "loss": 0.013, + "step": 3463 + }, + { + "epoch": 0.45764111371668265, + "grad_norm": 0.1394144892692566, + "learning_rate": 0.00011348686611041963, + "loss": 0.0195, + "step": 3464 + }, + { + "epoch": 0.4577732272021667, + "grad_norm": 0.17651647329330444, + "learning_rate": 0.00011344568371327277, + "loss": 0.0174, + "step": 3465 + }, + { + "epoch": 0.4579053406876507, + "grad_norm": 0.15418827533721924, + "learning_rate": 0.00011340449899363547, + "loss": 0.0238, + "step": 3466 + }, + { + "epoch": 0.45803745417313474, + "grad_norm": 0.09896305948495865, + "learning_rate": 0.00011336331195862159, + "loss": 0.0098, + "step": 3467 + }, + { + "epoch": 0.45816956765861877, + "grad_norm": 0.17593730986118317, + "learning_rate": 0.00011332212261534545, + "loss": 0.0188, + "step": 3468 + }, + { + "epoch": 0.4583016811441028, + "grad_norm": 0.13243882358074188, + "learning_rate": 0.00011328093097092168, + "loss": 0.0102, + "step": 3469 + }, + { + "epoch": 0.45843379462958683, + "grad_norm": 0.16697779297828674, + "learning_rate": 0.00011323973703246542, + "loss": 0.0152, + "step": 3470 + }, + { + "epoch": 0.45856590811507086, + "grad_norm": 0.15453162789344788, + "learning_rate": 0.00011319854080709215, + "loss": 0.0202, + "step": 3471 + }, + { + "epoch": 0.4586980216005549, + "grad_norm": 0.200933039188385, + "learning_rate": 0.0001131573423019177, + "loss": 0.0288, + "step": 3472 + }, + { + "epoch": 0.4588301350860389, + "grad_norm": 0.21381884813308716, + "learning_rate": 0.0001131161415240584, + "loss": 0.0181, + "step": 3473 + }, + { + "epoch": 0.45896224857152296, + "grad_norm": 0.32991519570350647, + "learning_rate": 0.00011307493848063086, + "loss": 0.033, + "step": 3474 + }, + { + "epoch": 0.459094362057007, + "grad_norm": 0.21740446984767914, + "learning_rate": 0.0001130337331787522, + "loss": 0.0132, + "step": 3475 + }, + { + "epoch": 0.459226475542491, + "grad_norm": 0.11426756531000137, + "learning_rate": 0.00011299252562553979, + "loss": 0.0115, + "step": 3476 + }, + { + "epoch": 0.45935858902797505, + "grad_norm": 0.14595068991184235, + "learning_rate": 0.00011295131582811153, + "loss": 0.0149, + "step": 3477 + }, + { + "epoch": 0.4594907025134591, + "grad_norm": 0.1178387925028801, + "learning_rate": 0.00011291010379358563, + "loss": 0.0137, + "step": 3478 + }, + { + "epoch": 0.4596228159989431, + "grad_norm": 0.23638631403446198, + "learning_rate": 0.00011286888952908063, + "loss": 0.0201, + "step": 3479 + }, + { + "epoch": 0.45975492948442714, + "grad_norm": 0.1705041527748108, + "learning_rate": 0.00011282767304171562, + "loss": 0.0146, + "step": 3480 + }, + { + "epoch": 0.4598870429699112, + "grad_norm": 0.12411267310380936, + "learning_rate": 0.00011278645433860991, + "loss": 0.0216, + "step": 3481 + }, + { + "epoch": 0.4600191564553952, + "grad_norm": 0.11123984307050705, + "learning_rate": 0.00011274523342688328, + "loss": 0.0155, + "step": 3482 + }, + { + "epoch": 0.46015126994087924, + "grad_norm": 0.17560520768165588, + "learning_rate": 0.00011270401031365592, + "loss": 0.0209, + "step": 3483 + }, + { + "epoch": 0.46028338342636327, + "grad_norm": 0.15862911939620972, + "learning_rate": 0.00011266278500604826, + "loss": 0.0203, + "step": 3484 + }, + { + "epoch": 0.4604154969118473, + "grad_norm": 0.14899875223636627, + "learning_rate": 0.00011262155751118128, + "loss": 0.0154, + "step": 3485 + }, + { + "epoch": 0.46054761039733133, + "grad_norm": 0.1562994122505188, + "learning_rate": 0.0001125803278361762, + "loss": 0.0148, + "step": 3486 + }, + { + "epoch": 0.46067972388281536, + "grad_norm": 0.11963161826133728, + "learning_rate": 0.00011253909598815474, + "loss": 0.0105, + "step": 3487 + }, + { + "epoch": 0.4608118373682994, + "grad_norm": 0.14713045954704285, + "learning_rate": 0.00011249786197423888, + "loss": 0.0233, + "step": 3488 + }, + { + "epoch": 0.4609439508537834, + "grad_norm": 0.1988755166530609, + "learning_rate": 0.00011245662580155102, + "loss": 0.0227, + "step": 3489 + }, + { + "epoch": 0.46107606433926746, + "grad_norm": 0.15347225964069366, + "learning_rate": 0.00011241538747721401, + "loss": 0.0145, + "step": 3490 + }, + { + "epoch": 0.4612081778247515, + "grad_norm": 0.18890933692455292, + "learning_rate": 0.00011237414700835089, + "loss": 0.0163, + "step": 3491 + }, + { + "epoch": 0.4613402913102355, + "grad_norm": 0.19923219084739685, + "learning_rate": 0.00011233290440208528, + "loss": 0.0233, + "step": 3492 + }, + { + "epoch": 0.46147240479571955, + "grad_norm": 0.12134901434183121, + "learning_rate": 0.00011229165966554098, + "loss": 0.013, + "step": 3493 + }, + { + "epoch": 0.4616045182812036, + "grad_norm": 0.17246650159358978, + "learning_rate": 0.00011225041280584234, + "loss": 0.0226, + "step": 3494 + }, + { + "epoch": 0.4617366317666876, + "grad_norm": 0.1895778924226761, + "learning_rate": 0.00011220916383011393, + "loss": 0.0143, + "step": 3495 + }, + { + "epoch": 0.46186874525217164, + "grad_norm": 0.3674432337284088, + "learning_rate": 0.0001121679127454807, + "loss": 0.0286, + "step": 3496 + }, + { + "epoch": 0.4620008587376557, + "grad_norm": 0.24747756123542786, + "learning_rate": 0.00011212665955906812, + "loss": 0.0117, + "step": 3497 + }, + { + "epoch": 0.4621329722231397, + "grad_norm": 0.17657174170017242, + "learning_rate": 0.00011208540427800178, + "loss": 0.0206, + "step": 3498 + }, + { + "epoch": 0.46226508570862374, + "grad_norm": 0.18776048719882965, + "learning_rate": 0.00011204414690940783, + "loss": 0.0157, + "step": 3499 + }, + { + "epoch": 0.4623971991941077, + "grad_norm": 0.16772229969501495, + "learning_rate": 0.0001120028874604127, + "loss": 0.0163, + "step": 3500 + }, + { + "epoch": 0.46252931267959174, + "grad_norm": 0.19655610620975494, + "learning_rate": 0.00011196162593814319, + "loss": 0.0223, + "step": 3501 + }, + { + "epoch": 0.4626614261650758, + "grad_norm": 0.1830878108739853, + "learning_rate": 0.00011192036234972645, + "loss": 0.0224, + "step": 3502 + }, + { + "epoch": 0.4627935396505598, + "grad_norm": 0.13783776760101318, + "learning_rate": 0.00011187909670228998, + "loss": 0.0154, + "step": 3503 + }, + { + "epoch": 0.46292565313604384, + "grad_norm": 0.5649703741073608, + "learning_rate": 0.00011183782900296168, + "loss": 0.0479, + "step": 3504 + }, + { + "epoch": 0.46305776662152787, + "grad_norm": 0.16268661618232727, + "learning_rate": 0.00011179655925886971, + "loss": 0.0173, + "step": 3505 + }, + { + "epoch": 0.4631898801070119, + "grad_norm": 0.15541048347949982, + "learning_rate": 0.00011175528747714272, + "loss": 0.0201, + "step": 3506 + }, + { + "epoch": 0.46332199359249593, + "grad_norm": 0.33864355087280273, + "learning_rate": 0.00011171401366490961, + "loss": 0.0281, + "step": 3507 + }, + { + "epoch": 0.46345410707797996, + "grad_norm": 0.1839621514081955, + "learning_rate": 0.00011167273782929968, + "loss": 0.0165, + "step": 3508 + }, + { + "epoch": 0.463586220563464, + "grad_norm": 0.17850792407989502, + "learning_rate": 0.0001116314599774425, + "loss": 0.0181, + "step": 3509 + }, + { + "epoch": 0.463718334048948, + "grad_norm": 0.18538732826709747, + "learning_rate": 0.00011159018011646811, + "loss": 0.021, + "step": 3510 + }, + { + "epoch": 0.46385044753443205, + "grad_norm": 0.21953551471233368, + "learning_rate": 0.00011154889825350681, + "loss": 0.0184, + "step": 3511 + }, + { + "epoch": 0.4639825610199161, + "grad_norm": 0.10858046263456345, + "learning_rate": 0.00011150761439568925, + "loss": 0.0121, + "step": 3512 + }, + { + "epoch": 0.4641146745054001, + "grad_norm": 0.14788410067558289, + "learning_rate": 0.00011146632855014647, + "loss": 0.0142, + "step": 3513 + }, + { + "epoch": 0.46424678799088415, + "grad_norm": 0.219330295920372, + "learning_rate": 0.00011142504072400983, + "loss": 0.0296, + "step": 3514 + }, + { + "epoch": 0.4643789014763682, + "grad_norm": 0.12793508172035217, + "learning_rate": 0.00011138375092441102, + "loss": 0.0138, + "step": 3515 + }, + { + "epoch": 0.4645110149618522, + "grad_norm": 0.11018357425928116, + "learning_rate": 0.00011134245915848209, + "loss": 0.0143, + "step": 3516 + }, + { + "epoch": 0.46464312844733624, + "grad_norm": 0.15088607370853424, + "learning_rate": 0.00011130116543335541, + "loss": 0.0108, + "step": 3517 + }, + { + "epoch": 0.46477524193282027, + "grad_norm": 0.19519521296024323, + "learning_rate": 0.0001112598697561637, + "loss": 0.0208, + "step": 3518 + }, + { + "epoch": 0.4649073554183043, + "grad_norm": 0.17513087391853333, + "learning_rate": 0.00011121857213404, + "loss": 0.0241, + "step": 3519 + }, + { + "epoch": 0.46503946890378833, + "grad_norm": 0.19763897359371185, + "learning_rate": 0.00011117727257411776, + "loss": 0.0158, + "step": 3520 + }, + { + "epoch": 0.46517158238927236, + "grad_norm": 0.28380870819091797, + "learning_rate": 0.00011113597108353064, + "loss": 0.0134, + "step": 3521 + }, + { + "epoch": 0.4653036958747564, + "grad_norm": 0.3162674307823181, + "learning_rate": 0.00011109466766941275, + "loss": 0.0371, + "step": 3522 + }, + { + "epoch": 0.4654358093602404, + "grad_norm": 0.21053746342658997, + "learning_rate": 0.00011105336233889845, + "loss": 0.0265, + "step": 3523 + }, + { + "epoch": 0.46556792284572446, + "grad_norm": 0.15278670191764832, + "learning_rate": 0.00011101205509912245, + "loss": 0.0103, + "step": 3524 + }, + { + "epoch": 0.4657000363312085, + "grad_norm": 0.127748042345047, + "learning_rate": 0.00011097074595721985, + "loss": 0.0146, + "step": 3525 + }, + { + "epoch": 0.4658321498166925, + "grad_norm": 0.3685021698474884, + "learning_rate": 0.000110929434920326, + "loss": 0.0292, + "step": 3526 + }, + { + "epoch": 0.46596426330217655, + "grad_norm": 0.23161841928958893, + "learning_rate": 0.00011088812199557663, + "loss": 0.0256, + "step": 3527 + }, + { + "epoch": 0.4660963767876606, + "grad_norm": 0.31696006655693054, + "learning_rate": 0.00011084680719010777, + "loss": 0.0109, + "step": 3528 + }, + { + "epoch": 0.4662284902731446, + "grad_norm": 0.13080094754695892, + "learning_rate": 0.00011080549051105573, + "loss": 0.0108, + "step": 3529 + }, + { + "epoch": 0.46636060375862864, + "grad_norm": 0.2266928255558014, + "learning_rate": 0.00011076417196555728, + "loss": 0.0227, + "step": 3530 + }, + { + "epoch": 0.4664927172441127, + "grad_norm": 0.27095434069633484, + "learning_rate": 0.00011072285156074935, + "loss": 0.0289, + "step": 3531 + }, + { + "epoch": 0.4666248307295967, + "grad_norm": 0.18671102821826935, + "learning_rate": 0.00011068152930376933, + "loss": 0.0199, + "step": 3532 + }, + { + "epoch": 0.46675694421508074, + "grad_norm": 0.18002068996429443, + "learning_rate": 0.00011064020520175482, + "loss": 0.0119, + "step": 3533 + }, + { + "epoch": 0.46688905770056477, + "grad_norm": 0.32328853011131287, + "learning_rate": 0.00011059887926184382, + "loss": 0.0199, + "step": 3534 + }, + { + "epoch": 0.4670211711860488, + "grad_norm": 0.2010609656572342, + "learning_rate": 0.00011055755149117462, + "loss": 0.0247, + "step": 3535 + }, + { + "epoch": 0.46715328467153283, + "grad_norm": 0.2535361647605896, + "learning_rate": 0.00011051622189688575, + "loss": 0.0382, + "step": 3536 + }, + { + "epoch": 0.46728539815701686, + "grad_norm": 0.13432317972183228, + "learning_rate": 0.00011047489048611619, + "loss": 0.0161, + "step": 3537 + }, + { + "epoch": 0.4674175116425009, + "grad_norm": 0.15032632648944855, + "learning_rate": 0.00011043355726600516, + "loss": 0.0145, + "step": 3538 + }, + { + "epoch": 0.4675496251279849, + "grad_norm": 0.15258188545703888, + "learning_rate": 0.0001103922222436922, + "loss": 0.0233, + "step": 3539 + }, + { + "epoch": 0.46768173861346896, + "grad_norm": 0.32376575469970703, + "learning_rate": 0.0001103508854263171, + "loss": 0.0367, + "step": 3540 + }, + { + "epoch": 0.467813852098953, + "grad_norm": 0.18361371755599976, + "learning_rate": 0.00011030954682102011, + "loss": 0.0231, + "step": 3541 + }, + { + "epoch": 0.467945965584437, + "grad_norm": 0.15508785843849182, + "learning_rate": 0.00011026820643494167, + "loss": 0.0156, + "step": 3542 + }, + { + "epoch": 0.46807807906992105, + "grad_norm": 0.18575936555862427, + "learning_rate": 0.00011022686427522255, + "loss": 0.0214, + "step": 3543 + }, + { + "epoch": 0.4682101925554051, + "grad_norm": 0.23005272448062897, + "learning_rate": 0.00011018552034900385, + "loss": 0.0256, + "step": 3544 + }, + { + "epoch": 0.4683423060408891, + "grad_norm": 0.08953981846570969, + "learning_rate": 0.00011014417466342695, + "loss": 0.0095, + "step": 3545 + }, + { + "epoch": 0.46847441952637314, + "grad_norm": 0.19710010290145874, + "learning_rate": 0.00011010282722563354, + "loss": 0.0146, + "step": 3546 + }, + { + "epoch": 0.4686065330118572, + "grad_norm": 0.20660652220249176, + "learning_rate": 0.00011006147804276563, + "loss": 0.0091, + "step": 3547 + }, + { + "epoch": 0.4687386464973412, + "grad_norm": 4.406435489654541, + "learning_rate": 0.0001100201271219655, + "loss": 0.0727, + "step": 3548 + }, + { + "epoch": 0.46887075998282524, + "grad_norm": 0.14570850133895874, + "learning_rate": 0.00010997877447037577, + "loss": 0.0171, + "step": 3549 + }, + { + "epoch": 0.46900287346830927, + "grad_norm": 0.23842819035053253, + "learning_rate": 0.0001099374200951393, + "loss": 0.0273, + "step": 3550 + }, + { + "epoch": 0.4691349869537933, + "grad_norm": 0.15830504894256592, + "learning_rate": 0.00010989606400339933, + "loss": 0.0166, + "step": 3551 + }, + { + "epoch": 0.46926710043927733, + "grad_norm": 0.27519121766090393, + "learning_rate": 0.00010985470620229937, + "loss": 0.0268, + "step": 3552 + }, + { + "epoch": 0.46939921392476136, + "grad_norm": 0.21721608936786652, + "learning_rate": 0.00010981334669898311, + "loss": 0.0126, + "step": 3553 + }, + { + "epoch": 0.4695313274102454, + "grad_norm": 0.16585609316825867, + "learning_rate": 0.00010977198550059471, + "loss": 0.0194, + "step": 3554 + }, + { + "epoch": 0.4696634408957294, + "grad_norm": 0.2184048593044281, + "learning_rate": 0.00010973062261427853, + "loss": 0.0365, + "step": 3555 + }, + { + "epoch": 0.46979555438121345, + "grad_norm": 0.17115822434425354, + "learning_rate": 0.00010968925804717925, + "loss": 0.0128, + "step": 3556 + }, + { + "epoch": 0.4699276678666975, + "grad_norm": 0.10518503934144974, + "learning_rate": 0.00010964789180644175, + "loss": 0.0101, + "step": 3557 + }, + { + "epoch": 0.4700597813521815, + "grad_norm": 0.27113738656044006, + "learning_rate": 0.00010960652389921137, + "loss": 0.0217, + "step": 3558 + }, + { + "epoch": 0.47019189483766555, + "grad_norm": 0.15465222299098969, + "learning_rate": 0.00010956515433263361, + "loss": 0.0168, + "step": 3559 + }, + { + "epoch": 0.4703240083231496, + "grad_norm": 0.1740829050540924, + "learning_rate": 0.00010952378311385426, + "loss": 0.0215, + "step": 3560 + }, + { + "epoch": 0.4704561218086336, + "grad_norm": 0.2015177458524704, + "learning_rate": 0.00010948241025001947, + "loss": 0.021, + "step": 3561 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.14692965149879456, + "learning_rate": 0.00010944103574827555, + "loss": 0.0231, + "step": 3562 + }, + { + "epoch": 0.47072034877960167, + "grad_norm": 0.1983291208744049, + "learning_rate": 0.00010939965961576927, + "loss": 0.0203, + "step": 3563 + }, + { + "epoch": 0.4708524622650857, + "grad_norm": 0.17785078287124634, + "learning_rate": 0.00010935828185964754, + "loss": 0.0087, + "step": 3564 + }, + { + "epoch": 0.47098457575056973, + "grad_norm": 0.1965969204902649, + "learning_rate": 0.00010931690248705759, + "loss": 0.0202, + "step": 3565 + }, + { + "epoch": 0.47111668923605377, + "grad_norm": 0.2205072045326233, + "learning_rate": 0.00010927552150514693, + "loss": 0.0209, + "step": 3566 + }, + { + "epoch": 0.4712488027215378, + "grad_norm": 0.20020027458667755, + "learning_rate": 0.00010923413892106335, + "loss": 0.0207, + "step": 3567 + }, + { + "epoch": 0.4713809162070218, + "grad_norm": 0.17927613854408264, + "learning_rate": 0.00010919275474195494, + "loss": 0.021, + "step": 3568 + }, + { + "epoch": 0.47151302969250586, + "grad_norm": 0.22265446186065674, + "learning_rate": 0.00010915136897497, + "loss": 0.0246, + "step": 3569 + }, + { + "epoch": 0.4716451431779899, + "grad_norm": 0.14600390195846558, + "learning_rate": 0.00010910998162725718, + "loss": 0.0116, + "step": 3570 + }, + { + "epoch": 0.4717772566634739, + "grad_norm": 0.2514609098434448, + "learning_rate": 0.00010906859270596541, + "loss": 0.0244, + "step": 3571 + }, + { + "epoch": 0.47190937014895795, + "grad_norm": 0.2012319266796112, + "learning_rate": 0.00010902720221824376, + "loss": 0.0261, + "step": 3572 + }, + { + "epoch": 0.472041483634442, + "grad_norm": 0.28338226675987244, + "learning_rate": 0.00010898581017124174, + "loss": 0.0229, + "step": 3573 + }, + { + "epoch": 0.472173597119926, + "grad_norm": 0.2096048891544342, + "learning_rate": 0.00010894441657210898, + "loss": 0.0372, + "step": 3574 + }, + { + "epoch": 0.47230571060541005, + "grad_norm": 0.1837790608406067, + "learning_rate": 0.00010890302142799555, + "loss": 0.0142, + "step": 3575 + }, + { + "epoch": 0.4724378240908941, + "grad_norm": 0.14642560482025146, + "learning_rate": 0.00010886162474605159, + "loss": 0.0154, + "step": 3576 + }, + { + "epoch": 0.4725699375763781, + "grad_norm": 0.2872735559940338, + "learning_rate": 0.00010882022653342767, + "loss": 0.0238, + "step": 3577 + }, + { + "epoch": 0.47270205106186214, + "grad_norm": 0.16493089497089386, + "learning_rate": 0.00010877882679727453, + "loss": 0.0247, + "step": 3578 + }, + { + "epoch": 0.47283416454734617, + "grad_norm": 0.4759635329246521, + "learning_rate": 0.00010873742554474317, + "loss": 0.0378, + "step": 3579 + }, + { + "epoch": 0.4729662780328302, + "grad_norm": 0.4725508391857147, + "learning_rate": 0.00010869602278298496, + "loss": 0.0363, + "step": 3580 + }, + { + "epoch": 0.47309839151831423, + "grad_norm": 0.16512827575206757, + "learning_rate": 0.00010865461851915138, + "loss": 0.015, + "step": 3581 + }, + { + "epoch": 0.47323050500379826, + "grad_norm": 0.18740400671958923, + "learning_rate": 0.00010861321276039426, + "loss": 0.0234, + "step": 3582 + }, + { + "epoch": 0.4733626184892823, + "grad_norm": 0.14816854894161224, + "learning_rate": 0.00010857180551386568, + "loss": 0.0206, + "step": 3583 + }, + { + "epoch": 0.4734947319747663, + "grad_norm": 0.3054364025592804, + "learning_rate": 0.00010853039678671799, + "loss": 0.0143, + "step": 3584 + }, + { + "epoch": 0.47362684546025036, + "grad_norm": 0.22532625496387482, + "learning_rate": 0.00010848898658610374, + "loss": 0.0153, + "step": 3585 + }, + { + "epoch": 0.4737589589457344, + "grad_norm": 0.19474582374095917, + "learning_rate": 0.00010844757491917577, + "loss": 0.0163, + "step": 3586 + }, + { + "epoch": 0.4738910724312184, + "grad_norm": 0.11799314618110657, + "learning_rate": 0.00010840616179308718, + "loss": 0.0071, + "step": 3587 + }, + { + "epoch": 0.47402318591670245, + "grad_norm": 0.3400368392467499, + "learning_rate": 0.0001083647472149913, + "loss": 0.0263, + "step": 3588 + }, + { + "epoch": 0.4741552994021865, + "grad_norm": 0.2943856120109558, + "learning_rate": 0.00010832333119204177, + "loss": 0.0295, + "step": 3589 + }, + { + "epoch": 0.4742874128876705, + "grad_norm": 0.16527043282985687, + "learning_rate": 0.00010828191373139238, + "loss": 0.0184, + "step": 3590 + }, + { + "epoch": 0.47441952637315454, + "grad_norm": 0.2068243771791458, + "learning_rate": 0.00010824049484019725, + "loss": 0.029, + "step": 3591 + }, + { + "epoch": 0.4745516398586386, + "grad_norm": 0.1771349161863327, + "learning_rate": 0.00010819907452561071, + "loss": 0.0189, + "step": 3592 + }, + { + "epoch": 0.4746837533441226, + "grad_norm": 0.1710856556892395, + "learning_rate": 0.00010815765279478733, + "loss": 0.0162, + "step": 3593 + }, + { + "epoch": 0.47481586682960664, + "grad_norm": 0.34175270795822144, + "learning_rate": 0.00010811622965488198, + "loss": 0.0243, + "step": 3594 + }, + { + "epoch": 0.47494798031509067, + "grad_norm": 0.21491940319538116, + "learning_rate": 0.00010807480511304968, + "loss": 0.0161, + "step": 3595 + }, + { + "epoch": 0.4750800938005747, + "grad_norm": 0.17027001082897186, + "learning_rate": 0.0001080333791764458, + "loss": 0.0174, + "step": 3596 + }, + { + "epoch": 0.47521220728605873, + "grad_norm": 0.3166252672672272, + "learning_rate": 0.00010799195185222584, + "loss": 0.0326, + "step": 3597 + }, + { + "epoch": 0.47534432077154276, + "grad_norm": 0.16182473301887512, + "learning_rate": 0.00010795052314754565, + "loss": 0.0167, + "step": 3598 + }, + { + "epoch": 0.4754764342570268, + "grad_norm": 0.26130416989326477, + "learning_rate": 0.00010790909306956125, + "loss": 0.0296, + "step": 3599 + }, + { + "epoch": 0.4756085477425108, + "grad_norm": 0.19988104701042175, + "learning_rate": 0.00010786766162542884, + "loss": 0.0229, + "step": 3600 + }, + { + "epoch": 0.47574066122799485, + "grad_norm": 0.1847897619009018, + "learning_rate": 0.00010782622882230504, + "loss": 0.0323, + "step": 3601 + }, + { + "epoch": 0.4758727747134789, + "grad_norm": 0.2483040988445282, + "learning_rate": 0.00010778479466734654, + "loss": 0.0287, + "step": 3602 + }, + { + "epoch": 0.4760048881989629, + "grad_norm": 0.21745510399341583, + "learning_rate": 0.0001077433591677103, + "loss": 0.0226, + "step": 3603 + }, + { + "epoch": 0.47613700168444695, + "grad_norm": 0.4640471041202545, + "learning_rate": 0.00010770192233055356, + "loss": 0.0218, + "step": 3604 + }, + { + "epoch": 0.476269115169931, + "grad_norm": 0.3053547441959381, + "learning_rate": 0.00010766048416303373, + "loss": 0.0304, + "step": 3605 + }, + { + "epoch": 0.476401228655415, + "grad_norm": 0.2106274515390396, + "learning_rate": 0.0001076190446723085, + "loss": 0.0201, + "step": 3606 + }, + { + "epoch": 0.47653334214089904, + "grad_norm": 0.187311053276062, + "learning_rate": 0.00010757760386553574, + "loss": 0.0195, + "step": 3607 + }, + { + "epoch": 0.47666545562638307, + "grad_norm": 0.13241535425186157, + "learning_rate": 0.00010753616174987362, + "loss": 0.0199, + "step": 3608 + }, + { + "epoch": 0.4767975691118671, + "grad_norm": 0.18745078146457672, + "learning_rate": 0.00010749471833248046, + "loss": 0.0294, + "step": 3609 + }, + { + "epoch": 0.47692968259735113, + "grad_norm": 0.5134204626083374, + "learning_rate": 0.0001074532736205148, + "loss": 0.0321, + "step": 3610 + }, + { + "epoch": 0.47706179608283517, + "grad_norm": 0.43000027537345886, + "learning_rate": 0.00010741182762113553, + "loss": 0.0283, + "step": 3611 + }, + { + "epoch": 0.4771939095683192, + "grad_norm": 0.17699387669563293, + "learning_rate": 0.00010737038034150158, + "loss": 0.0178, + "step": 3612 + }, + { + "epoch": 0.47732602305380323, + "grad_norm": 0.19420090317726135, + "learning_rate": 0.00010732893178877225, + "loss": 0.021, + "step": 3613 + }, + { + "epoch": 0.47745813653928726, + "grad_norm": 0.25928303599357605, + "learning_rate": 0.00010728748197010699, + "loss": 0.0215, + "step": 3614 + }, + { + "epoch": 0.4775902500247713, + "grad_norm": 0.149773508310318, + "learning_rate": 0.00010724603089266547, + "loss": 0.0166, + "step": 3615 + }, + { + "epoch": 0.4777223635102553, + "grad_norm": 0.1963023990392685, + "learning_rate": 0.0001072045785636076, + "loss": 0.0231, + "step": 3616 + }, + { + "epoch": 0.47785447699573935, + "grad_norm": 0.22853727638721466, + "learning_rate": 0.00010716312499009346, + "loss": 0.0211, + "step": 3617 + }, + { + "epoch": 0.4779865904812234, + "grad_norm": 0.31241101026535034, + "learning_rate": 0.00010712167017928345, + "loss": 0.0287, + "step": 3618 + }, + { + "epoch": 0.4781187039667074, + "grad_norm": 0.1355995535850525, + "learning_rate": 0.00010708021413833804, + "loss": 0.0135, + "step": 3619 + }, + { + "epoch": 0.47825081745219145, + "grad_norm": 0.23037013411521912, + "learning_rate": 0.00010703875687441804, + "loss": 0.0226, + "step": 3620 + }, + { + "epoch": 0.4783829309376755, + "grad_norm": 0.272693008184433, + "learning_rate": 0.00010699729839468437, + "loss": 0.0315, + "step": 3621 + }, + { + "epoch": 0.4785150444231595, + "grad_norm": 0.1348901391029358, + "learning_rate": 0.00010695583870629827, + "loss": 0.0135, + "step": 3622 + }, + { + "epoch": 0.47864715790864354, + "grad_norm": 0.3382134437561035, + "learning_rate": 0.00010691437781642107, + "loss": 0.0371, + "step": 3623 + }, + { + "epoch": 0.47877927139412757, + "grad_norm": 0.197440043091774, + "learning_rate": 0.00010687291573221436, + "loss": 0.0207, + "step": 3624 + }, + { + "epoch": 0.4789113848796116, + "grad_norm": 0.2662707567214966, + "learning_rate": 0.00010683145246083999, + "loss": 0.0239, + "step": 3625 + }, + { + "epoch": 0.47904349836509563, + "grad_norm": 0.20408916473388672, + "learning_rate": 0.00010678998800945991, + "loss": 0.0284, + "step": 3626 + }, + { + "epoch": 0.47917561185057966, + "grad_norm": 0.16986006498336792, + "learning_rate": 0.00010674852238523639, + "loss": 0.0239, + "step": 3627 + }, + { + "epoch": 0.4793077253360637, + "grad_norm": 0.2010790854692459, + "learning_rate": 0.00010670705559533178, + "loss": 0.0131, + "step": 3628 + }, + { + "epoch": 0.4794398388215477, + "grad_norm": 0.14181357622146606, + "learning_rate": 0.00010666558764690871, + "loss": 0.0083, + "step": 3629 + }, + { + "epoch": 0.47957195230703176, + "grad_norm": 0.3691260814666748, + "learning_rate": 0.00010662411854713004, + "loss": 0.0312, + "step": 3630 + }, + { + "epoch": 0.4797040657925158, + "grad_norm": 0.18499897420406342, + "learning_rate": 0.00010658264830315872, + "loss": 0.0146, + "step": 3631 + }, + { + "epoch": 0.4798361792779998, + "grad_norm": 0.17759235203266144, + "learning_rate": 0.00010654117692215799, + "loss": 0.0245, + "step": 3632 + }, + { + "epoch": 0.47996829276348385, + "grad_norm": 0.21312375366687775, + "learning_rate": 0.00010649970441129124, + "loss": 0.0209, + "step": 3633 + }, + { + "epoch": 0.4801004062489679, + "grad_norm": 0.17901498079299927, + "learning_rate": 0.0001064582307777221, + "loss": 0.0136, + "step": 3634 + }, + { + "epoch": 0.4802325197344519, + "grad_norm": 0.2698601484298706, + "learning_rate": 0.00010641675602861434, + "loss": 0.0277, + "step": 3635 + }, + { + "epoch": 0.48036463321993594, + "grad_norm": 0.17690856754779816, + "learning_rate": 0.00010637528017113192, + "loss": 0.0177, + "step": 3636 + }, + { + "epoch": 0.48049674670542, + "grad_norm": 0.21805188059806824, + "learning_rate": 0.00010633380321243909, + "loss": 0.0103, + "step": 3637 + }, + { + "epoch": 0.480628860190904, + "grad_norm": 0.16631975769996643, + "learning_rate": 0.00010629232515970015, + "loss": 0.0256, + "step": 3638 + }, + { + "epoch": 0.48076097367638804, + "grad_norm": 0.21511219441890717, + "learning_rate": 0.00010625084602007972, + "loss": 0.0336, + "step": 3639 + }, + { + "epoch": 0.48089308716187207, + "grad_norm": 0.22643572092056274, + "learning_rate": 0.00010620936580074248, + "loss": 0.0223, + "step": 3640 + }, + { + "epoch": 0.4810252006473561, + "grad_norm": 0.13022497296333313, + "learning_rate": 0.00010616788450885342, + "loss": 0.0157, + "step": 3641 + }, + { + "epoch": 0.48115731413284013, + "grad_norm": 0.5486253499984741, + "learning_rate": 0.00010612640215157766, + "loss": 0.0341, + "step": 3642 + }, + { + "epoch": 0.48128942761832416, + "grad_norm": 0.18713150918483734, + "learning_rate": 0.00010608491873608041, + "loss": 0.0258, + "step": 3643 + }, + { + "epoch": 0.4814215411038082, + "grad_norm": 0.24169377982616425, + "learning_rate": 0.00010604343426952728, + "loss": 0.0195, + "step": 3644 + }, + { + "epoch": 0.4815536545892922, + "grad_norm": 0.13835279643535614, + "learning_rate": 0.00010600194875908382, + "loss": 0.0102, + "step": 3645 + }, + { + "epoch": 0.48168576807477625, + "grad_norm": 0.21671341359615326, + "learning_rate": 0.00010596046221191598, + "loss": 0.0235, + "step": 3646 + }, + { + "epoch": 0.4818178815602603, + "grad_norm": 0.17495031654834747, + "learning_rate": 0.00010591897463518969, + "loss": 0.018, + "step": 3647 + }, + { + "epoch": 0.4819499950457443, + "grad_norm": 0.2383643090724945, + "learning_rate": 0.00010587748603607124, + "loss": 0.0145, + "step": 3648 + }, + { + "epoch": 0.48208210853122835, + "grad_norm": 0.1689545065164566, + "learning_rate": 0.00010583599642172697, + "loss": 0.0169, + "step": 3649 + }, + { + "epoch": 0.4822142220167124, + "grad_norm": 0.12825123965740204, + "learning_rate": 0.00010579450579932342, + "loss": 0.0108, + "step": 3650 + }, + { + "epoch": 0.4823463355021964, + "grad_norm": 0.17009027302265167, + "learning_rate": 0.00010575301417602734, + "loss": 0.0175, + "step": 3651 + }, + { + "epoch": 0.48247844898768044, + "grad_norm": 0.14984194934368134, + "learning_rate": 0.00010571152155900561, + "loss": 0.0186, + "step": 3652 + }, + { + "epoch": 0.4826105624731645, + "grad_norm": 0.15858642756938934, + "learning_rate": 0.00010567002795542535, + "loss": 0.0225, + "step": 3653 + }, + { + "epoch": 0.4827426759586485, + "grad_norm": 0.24578054249286652, + "learning_rate": 0.00010562853337245373, + "loss": 0.022, + "step": 3654 + }, + { + "epoch": 0.48287478944413254, + "grad_norm": 0.14686349034309387, + "learning_rate": 0.00010558703781725825, + "loss": 0.0254, + "step": 3655 + }, + { + "epoch": 0.48300690292961657, + "grad_norm": 0.23595882952213287, + "learning_rate": 0.00010554554129700646, + "loss": 0.0196, + "step": 3656 + }, + { + "epoch": 0.4831390164151006, + "grad_norm": 0.1301383078098297, + "learning_rate": 0.00010550404381886605, + "loss": 0.0151, + "step": 3657 + }, + { + "epoch": 0.48327112990058463, + "grad_norm": 0.15810106694698334, + "learning_rate": 0.000105462545390005, + "loss": 0.0238, + "step": 3658 + }, + { + "epoch": 0.48340324338606866, + "grad_norm": 0.19346168637275696, + "learning_rate": 0.00010542104601759137, + "loss": 0.0224, + "step": 3659 + }, + { + "epoch": 0.4835353568715527, + "grad_norm": 0.1737697869539261, + "learning_rate": 0.0001053795457087934, + "loss": 0.0182, + "step": 3660 + }, + { + "epoch": 0.4836674703570367, + "grad_norm": 0.21048392355442047, + "learning_rate": 0.0001053380444707795, + "loss": 0.0143, + "step": 3661 + }, + { + "epoch": 0.48379958384252075, + "grad_norm": 0.10285349935293198, + "learning_rate": 0.00010529654231071821, + "loss": 0.0127, + "step": 3662 + }, + { + "epoch": 0.48393169732800473, + "grad_norm": 0.26792213320732117, + "learning_rate": 0.00010525503923577829, + "loss": 0.0287, + "step": 3663 + }, + { + "epoch": 0.48406381081348876, + "grad_norm": 0.18912583589553833, + "learning_rate": 0.00010521353525312859, + "loss": 0.0225, + "step": 3664 + }, + { + "epoch": 0.4841959242989728, + "grad_norm": 0.20273041725158691, + "learning_rate": 0.00010517203036993815, + "loss": 0.0325, + "step": 3665 + }, + { + "epoch": 0.4843280377844568, + "grad_norm": 0.14839552342891693, + "learning_rate": 0.00010513052459337616, + "loss": 0.0114, + "step": 3666 + }, + { + "epoch": 0.48446015126994085, + "grad_norm": 0.4060331881046295, + "learning_rate": 0.00010508901793061196, + "loss": 0.0209, + "step": 3667 + }, + { + "epoch": 0.4845922647554249, + "grad_norm": 0.15798570215702057, + "learning_rate": 0.00010504751038881511, + "loss": 0.0161, + "step": 3668 + }, + { + "epoch": 0.4847243782409089, + "grad_norm": 0.19073081016540527, + "learning_rate": 0.00010500600197515515, + "loss": 0.0183, + "step": 3669 + }, + { + "epoch": 0.48485649172639295, + "grad_norm": 0.16638392210006714, + "learning_rate": 0.000104964492696802, + "loss": 0.0169, + "step": 3670 + }, + { + "epoch": 0.484988605211877, + "grad_norm": 0.2253982424736023, + "learning_rate": 0.00010492298256092552, + "loss": 0.0277, + "step": 3671 + }, + { + "epoch": 0.485120718697361, + "grad_norm": 0.20915861427783966, + "learning_rate": 0.0001048814715746959, + "loss": 0.0256, + "step": 3672 + }, + { + "epoch": 0.48525283218284504, + "grad_norm": 0.24739712476730347, + "learning_rate": 0.00010483995974528332, + "loss": 0.0228, + "step": 3673 + }, + { + "epoch": 0.48538494566832907, + "grad_norm": 0.16098754107952118, + "learning_rate": 0.00010479844707985816, + "loss": 0.0219, + "step": 3674 + }, + { + "epoch": 0.4855170591538131, + "grad_norm": 0.18969130516052246, + "learning_rate": 0.000104756933585591, + "loss": 0.0191, + "step": 3675 + }, + { + "epoch": 0.48564917263929713, + "grad_norm": 0.29885485768318176, + "learning_rate": 0.00010471541926965249, + "loss": 0.0327, + "step": 3676 + }, + { + "epoch": 0.48578128612478116, + "grad_norm": 0.3478846848011017, + "learning_rate": 0.0001046739041392135, + "loss": 0.0399, + "step": 3677 + }, + { + "epoch": 0.4859133996102652, + "grad_norm": 0.21352167427539825, + "learning_rate": 0.00010463238820144492, + "loss": 0.0203, + "step": 3678 + }, + { + "epoch": 0.4860455130957492, + "grad_norm": 0.2575002908706665, + "learning_rate": 0.00010459087146351791, + "loss": 0.0243, + "step": 3679 + }, + { + "epoch": 0.48617762658123326, + "grad_norm": 0.447734534740448, + "learning_rate": 0.0001045493539326037, + "loss": 0.0204, + "step": 3680 + }, + { + "epoch": 0.4863097400667173, + "grad_norm": 0.19277682900428772, + "learning_rate": 0.00010450783561587365, + "loss": 0.0145, + "step": 3681 + }, + { + "epoch": 0.4864418535522013, + "grad_norm": 0.18014197051525116, + "learning_rate": 0.0001044663165204993, + "loss": 0.0253, + "step": 3682 + }, + { + "epoch": 0.48657396703768535, + "grad_norm": 0.1811141073703766, + "learning_rate": 0.00010442479665365224, + "loss": 0.0192, + "step": 3683 + }, + { + "epoch": 0.4867060805231694, + "grad_norm": 0.18892107903957367, + "learning_rate": 0.00010438327602250433, + "loss": 0.0209, + "step": 3684 + }, + { + "epoch": 0.4868381940086534, + "grad_norm": 0.17446766793727875, + "learning_rate": 0.00010434175463422739, + "loss": 0.0284, + "step": 3685 + }, + { + "epoch": 0.48697030749413744, + "grad_norm": 0.29287025332450867, + "learning_rate": 0.00010430023249599357, + "loss": 0.0131, + "step": 3686 + }, + { + "epoch": 0.4871024209796215, + "grad_norm": 0.16245250403881073, + "learning_rate": 0.00010425870961497495, + "loss": 0.0168, + "step": 3687 + }, + { + "epoch": 0.4872345344651055, + "grad_norm": 0.17298054695129395, + "learning_rate": 0.00010421718599834389, + "loss": 0.0176, + "step": 3688 + }, + { + "epoch": 0.48736664795058954, + "grad_norm": 0.21618597209453583, + "learning_rate": 0.00010417566165327279, + "loss": 0.0233, + "step": 3689 + }, + { + "epoch": 0.48749876143607357, + "grad_norm": 0.0877385288476944, + "learning_rate": 0.00010413413658693423, + "loss": 0.0057, + "step": 3690 + }, + { + "epoch": 0.4876308749215576, + "grad_norm": 0.19247932732105255, + "learning_rate": 0.00010409261080650086, + "loss": 0.0323, + "step": 3691 + }, + { + "epoch": 0.48776298840704163, + "grad_norm": 0.23241962492465973, + "learning_rate": 0.0001040510843191455, + "loss": 0.0208, + "step": 3692 + }, + { + "epoch": 0.48789510189252566, + "grad_norm": 0.17630332708358765, + "learning_rate": 0.00010400955713204106, + "loss": 0.0307, + "step": 3693 + }, + { + "epoch": 0.4880272153780097, + "grad_norm": 0.16653984785079956, + "learning_rate": 0.0001039680292523606, + "loss": 0.0151, + "step": 3694 + }, + { + "epoch": 0.4881593288634937, + "grad_norm": 0.2570105493068695, + "learning_rate": 0.00010392650068727728, + "loss": 0.0288, + "step": 3695 + }, + { + "epoch": 0.48829144234897776, + "grad_norm": 0.1397189050912857, + "learning_rate": 0.00010388497144396436, + "loss": 0.0123, + "step": 3696 + }, + { + "epoch": 0.4884235558344618, + "grad_norm": 0.24010951817035675, + "learning_rate": 0.00010384344152959529, + "loss": 0.0179, + "step": 3697 + }, + { + "epoch": 0.4885556693199458, + "grad_norm": 0.26753032207489014, + "learning_rate": 0.00010380191095134355, + "loss": 0.0366, + "step": 3698 + }, + { + "epoch": 0.48868778280542985, + "grad_norm": 0.28516536951065063, + "learning_rate": 0.00010376037971638278, + "loss": 0.0245, + "step": 3699 + }, + { + "epoch": 0.4888198962909139, + "grad_norm": 0.1718878448009491, + "learning_rate": 0.0001037188478318867, + "loss": 0.0229, + "step": 3700 + }, + { + "epoch": 0.4889520097763979, + "grad_norm": 0.1643366664648056, + "learning_rate": 0.00010367731530502923, + "loss": 0.0217, + "step": 3701 + }, + { + "epoch": 0.48908412326188194, + "grad_norm": 0.15962187945842743, + "learning_rate": 0.00010363578214298424, + "loss": 0.0137, + "step": 3702 + }, + { + "epoch": 0.489216236747366, + "grad_norm": 0.2725326418876648, + "learning_rate": 0.00010359424835292591, + "loss": 0.0187, + "step": 3703 + }, + { + "epoch": 0.48934835023285, + "grad_norm": 0.38736462593078613, + "learning_rate": 0.00010355271394202834, + "loss": 0.0292, + "step": 3704 + }, + { + "epoch": 0.48948046371833404, + "grad_norm": 0.23041263222694397, + "learning_rate": 0.00010351117891746592, + "loss": 0.0247, + "step": 3705 + }, + { + "epoch": 0.48961257720381807, + "grad_norm": 0.19857461750507355, + "learning_rate": 0.00010346964328641297, + "loss": 0.0305, + "step": 3706 + }, + { + "epoch": 0.4897446906893021, + "grad_norm": 0.18201592564582825, + "learning_rate": 0.000103428107056044, + "loss": 0.0176, + "step": 3707 + }, + { + "epoch": 0.48987680417478613, + "grad_norm": 0.29200470447540283, + "learning_rate": 0.00010338657023353364, + "loss": 0.0278, + "step": 3708 + }, + { + "epoch": 0.49000891766027016, + "grad_norm": 0.11148899048566818, + "learning_rate": 0.00010334503282605656, + "loss": 0.0163, + "step": 3709 + }, + { + "epoch": 0.4901410311457542, + "grad_norm": 0.1690640151500702, + "learning_rate": 0.00010330349484078765, + "loss": 0.0148, + "step": 3710 + }, + { + "epoch": 0.4902731446312382, + "grad_norm": 0.1554546058177948, + "learning_rate": 0.00010326195628490174, + "loss": 0.0125, + "step": 3711 + }, + { + "epoch": 0.49040525811672225, + "grad_norm": 0.22738389670848846, + "learning_rate": 0.00010322041716557391, + "loss": 0.0214, + "step": 3712 + }, + { + "epoch": 0.4905373716022063, + "grad_norm": 0.1931189000606537, + "learning_rate": 0.0001031788774899792, + "loss": 0.0271, + "step": 3713 + }, + { + "epoch": 0.4906694850876903, + "grad_norm": 0.19231070578098297, + "learning_rate": 0.00010313733726529284, + "loss": 0.0238, + "step": 3714 + }, + { + "epoch": 0.49080159857317435, + "grad_norm": 0.2267046421766281, + "learning_rate": 0.00010309579649869014, + "loss": 0.0178, + "step": 3715 + }, + { + "epoch": 0.4909337120586584, + "grad_norm": 0.18220418691635132, + "learning_rate": 0.00010305425519734646, + "loss": 0.0134, + "step": 3716 + }, + { + "epoch": 0.4910658255441424, + "grad_norm": 0.17011044919490814, + "learning_rate": 0.0001030127133684373, + "loss": 0.0142, + "step": 3717 + }, + { + "epoch": 0.49119793902962644, + "grad_norm": 0.16121694445610046, + "learning_rate": 0.00010297117101913825, + "loss": 0.0149, + "step": 3718 + }, + { + "epoch": 0.49133005251511047, + "grad_norm": 0.1913376748561859, + "learning_rate": 0.00010292962815662494, + "loss": 0.0221, + "step": 3719 + }, + { + "epoch": 0.4914621660005945, + "grad_norm": 0.09859661012887955, + "learning_rate": 0.00010288808478807316, + "loss": 0.0187, + "step": 3720 + }, + { + "epoch": 0.49159427948607853, + "grad_norm": 0.23269788920879364, + "learning_rate": 0.00010284654092065873, + "loss": 0.0333, + "step": 3721 + }, + { + "epoch": 0.49172639297156256, + "grad_norm": 0.24885809421539307, + "learning_rate": 0.00010280499656155757, + "loss": 0.0169, + "step": 3722 + }, + { + "epoch": 0.4918585064570466, + "grad_norm": 0.2113824039697647, + "learning_rate": 0.00010276345171794573, + "loss": 0.0287, + "step": 3723 + }, + { + "epoch": 0.4919906199425306, + "grad_norm": 0.24789120256900787, + "learning_rate": 0.00010272190639699924, + "loss": 0.0205, + "step": 3724 + }, + { + "epoch": 0.49212273342801466, + "grad_norm": 0.09954213351011276, + "learning_rate": 0.00010268036060589432, + "loss": 0.0098, + "step": 3725 + }, + { + "epoch": 0.4922548469134987, + "grad_norm": 0.3465690314769745, + "learning_rate": 0.00010263881435180722, + "loss": 0.0237, + "step": 3726 + }, + { + "epoch": 0.4923869603989827, + "grad_norm": 0.13831698894500732, + "learning_rate": 0.00010259726764191428, + "loss": 0.0129, + "step": 3727 + }, + { + "epoch": 0.49251907388446675, + "grad_norm": 0.24895714223384857, + "learning_rate": 0.0001025557204833919, + "loss": 0.0227, + "step": 3728 + }, + { + "epoch": 0.4926511873699508, + "grad_norm": 0.261017769575119, + "learning_rate": 0.00010251417288341662, + "loss": 0.0233, + "step": 3729 + }, + { + "epoch": 0.4927833008554348, + "grad_norm": 0.22980886697769165, + "learning_rate": 0.000102472624849165, + "loss": 0.019, + "step": 3730 + }, + { + "epoch": 0.49291541434091884, + "grad_norm": 0.2138877809047699, + "learning_rate": 0.00010243107638781365, + "loss": 0.0182, + "step": 3731 + }, + { + "epoch": 0.4930475278264029, + "grad_norm": 0.16170454025268555, + "learning_rate": 0.00010238952750653929, + "loss": 0.0109, + "step": 3732 + }, + { + "epoch": 0.4931796413118869, + "grad_norm": 0.1737380027770996, + "learning_rate": 0.00010234797821251873, + "loss": 0.026, + "step": 3733 + }, + { + "epoch": 0.49331175479737094, + "grad_norm": 0.12878692150115967, + "learning_rate": 0.00010230642851292887, + "loss": 0.0073, + "step": 3734 + }, + { + "epoch": 0.49344386828285497, + "grad_norm": 0.24098946154117584, + "learning_rate": 0.00010226487841494656, + "loss": 0.0277, + "step": 3735 + }, + { + "epoch": 0.493575981768339, + "grad_norm": 0.24644453823566437, + "learning_rate": 0.00010222332792574889, + "loss": 0.024, + "step": 3736 + }, + { + "epoch": 0.49370809525382303, + "grad_norm": 0.12515372037887573, + "learning_rate": 0.00010218177705251289, + "loss": 0.0164, + "step": 3737 + }, + { + "epoch": 0.49384020873930706, + "grad_norm": 0.3598020672798157, + "learning_rate": 0.00010214022580241567, + "loss": 0.0173, + "step": 3738 + }, + { + "epoch": 0.4939723222247911, + "grad_norm": 0.17398720979690552, + "learning_rate": 0.00010209867418263448, + "loss": 0.0154, + "step": 3739 + }, + { + "epoch": 0.4941044357102751, + "grad_norm": 0.13843418657779694, + "learning_rate": 0.00010205712220034654, + "loss": 0.0202, + "step": 3740 + }, + { + "epoch": 0.49423654919575916, + "grad_norm": 0.1310289353132248, + "learning_rate": 0.00010201556986272922, + "loss": 0.0148, + "step": 3741 + }, + { + "epoch": 0.4943686626812432, + "grad_norm": 0.133134126663208, + "learning_rate": 0.0001019740171769599, + "loss": 0.0113, + "step": 3742 + }, + { + "epoch": 0.4945007761667272, + "grad_norm": 0.19342927634716034, + "learning_rate": 0.00010193246415021602, + "loss": 0.0235, + "step": 3743 + }, + { + "epoch": 0.49463288965221125, + "grad_norm": 0.19014842808246613, + "learning_rate": 0.0001018909107896751, + "loss": 0.0174, + "step": 3744 + }, + { + "epoch": 0.4947650031376953, + "grad_norm": 0.2186693698167801, + "learning_rate": 0.00010184935710251467, + "loss": 0.0155, + "step": 3745 + }, + { + "epoch": 0.4948971166231793, + "grad_norm": 0.17435306310653687, + "learning_rate": 0.00010180780309591236, + "loss": 0.0117, + "step": 3746 + }, + { + "epoch": 0.49502923010866334, + "grad_norm": 0.14553245902061462, + "learning_rate": 0.00010176624877704588, + "loss": 0.0176, + "step": 3747 + }, + { + "epoch": 0.4951613435941474, + "grad_norm": 0.19681425392627716, + "learning_rate": 0.00010172469415309297, + "loss": 0.0193, + "step": 3748 + }, + { + "epoch": 0.4952934570796314, + "grad_norm": 0.14469482004642487, + "learning_rate": 0.00010168313923123141, + "loss": 0.0134, + "step": 3749 + }, + { + "epoch": 0.49542557056511544, + "grad_norm": 0.16757139563560486, + "learning_rate": 0.00010164158401863896, + "loss": 0.0189, + "step": 3750 + }, + { + "epoch": 0.49555768405059947, + "grad_norm": 0.22932220995426178, + "learning_rate": 0.00010160002852249361, + "loss": 0.0213, + "step": 3751 + }, + { + "epoch": 0.4956897975360835, + "grad_norm": 0.13100813329219818, + "learning_rate": 0.00010155847274997323, + "loss": 0.0163, + "step": 3752 + }, + { + "epoch": 0.49582191102156753, + "grad_norm": 0.2130032479763031, + "learning_rate": 0.00010151691670825582, + "loss": 0.0155, + "step": 3753 + }, + { + "epoch": 0.49595402450705156, + "grad_norm": 0.14983811974525452, + "learning_rate": 0.00010147536040451942, + "loss": 0.021, + "step": 3754 + }, + { + "epoch": 0.4960861379925356, + "grad_norm": 0.14002719521522522, + "learning_rate": 0.0001014338038459421, + "loss": 0.0152, + "step": 3755 + }, + { + "epoch": 0.4962182514780196, + "grad_norm": 0.21298381686210632, + "learning_rate": 0.00010139224703970198, + "loss": 0.0276, + "step": 3756 + }, + { + "epoch": 0.49635036496350365, + "grad_norm": 0.15027806162834167, + "learning_rate": 0.0001013506899929772, + "loss": 0.0171, + "step": 3757 + }, + { + "epoch": 0.4964824784489877, + "grad_norm": 0.15074260532855988, + "learning_rate": 0.00010130913271294598, + "loss": 0.013, + "step": 3758 + }, + { + "epoch": 0.4966145919344717, + "grad_norm": 0.22300437092781067, + "learning_rate": 0.00010126757520678653, + "loss": 0.0154, + "step": 3759 + }, + { + "epoch": 0.49674670541995575, + "grad_norm": 0.15333257615566254, + "learning_rate": 0.00010122601748167722, + "loss": 0.0135, + "step": 3760 + }, + { + "epoch": 0.4968788189054398, + "grad_norm": 0.13082996010780334, + "learning_rate": 0.00010118445954479627, + "loss": 0.0082, + "step": 3761 + }, + { + "epoch": 0.4970109323909238, + "grad_norm": 0.21996119618415833, + "learning_rate": 0.0001011429014033221, + "loss": 0.0268, + "step": 3762 + }, + { + "epoch": 0.49714304587640784, + "grad_norm": 0.15659058094024658, + "learning_rate": 0.00010110134306443308, + "loss": 0.011, + "step": 3763 + }, + { + "epoch": 0.49727515936189187, + "grad_norm": 0.2219391167163849, + "learning_rate": 0.00010105978453530765, + "loss": 0.0265, + "step": 3764 + }, + { + "epoch": 0.4974072728473759, + "grad_norm": 0.1443316787481308, + "learning_rate": 0.00010101822582312424, + "loss": 0.0182, + "step": 3765 + }, + { + "epoch": 0.49753938633285993, + "grad_norm": 0.2249058187007904, + "learning_rate": 0.00010097666693506134, + "loss": 0.0244, + "step": 3766 + }, + { + "epoch": 0.49767149981834397, + "grad_norm": 0.25690579414367676, + "learning_rate": 0.00010093510787829752, + "loss": 0.0223, + "step": 3767 + }, + { + "epoch": 0.497803613303828, + "grad_norm": 0.19646599888801575, + "learning_rate": 0.00010089354866001129, + "loss": 0.0195, + "step": 3768 + }, + { + "epoch": 0.497935726789312, + "grad_norm": 0.19536004960536957, + "learning_rate": 0.00010085198928738122, + "loss": 0.0373, + "step": 3769 + }, + { + "epoch": 0.49806784027479606, + "grad_norm": 0.16124306619167328, + "learning_rate": 0.00010081042976758597, + "loss": 0.022, + "step": 3770 + }, + { + "epoch": 0.4981999537602801, + "grad_norm": 0.22762157022953033, + "learning_rate": 0.00010076887010780407, + "loss": 0.0214, + "step": 3771 + }, + { + "epoch": 0.4983320672457641, + "grad_norm": 0.14339573681354523, + "learning_rate": 0.00010072731031521428, + "loss": 0.0174, + "step": 3772 + }, + { + "epoch": 0.49846418073124815, + "grad_norm": 0.22775132954120636, + "learning_rate": 0.00010068575039699521, + "loss": 0.0198, + "step": 3773 + }, + { + "epoch": 0.4985962942167322, + "grad_norm": 0.209895059466362, + "learning_rate": 0.0001006441903603256, + "loss": 0.0167, + "step": 3774 + }, + { + "epoch": 0.4987284077022162, + "grad_norm": 0.23028625547885895, + "learning_rate": 0.00010060263021238412, + "loss": 0.0132, + "step": 3775 + }, + { + "epoch": 0.49886052118770025, + "grad_norm": 0.17354623973369598, + "learning_rate": 0.00010056106996034955, + "loss": 0.0119, + "step": 3776 + }, + { + "epoch": 0.4989926346731843, + "grad_norm": 0.1872815638780594, + "learning_rate": 0.00010051950961140066, + "loss": 0.0255, + "step": 3777 + }, + { + "epoch": 0.4991247481586683, + "grad_norm": 0.18477414548397064, + "learning_rate": 0.00010047794917271615, + "loss": 0.0186, + "step": 3778 + }, + { + "epoch": 0.49925686164415234, + "grad_norm": 0.11314979195594788, + "learning_rate": 0.0001004363886514749, + "loss": 0.0125, + "step": 3779 + }, + { + "epoch": 0.49938897512963637, + "grad_norm": 0.1584259271621704, + "learning_rate": 0.00010039482805485567, + "loss": 0.0163, + "step": 3780 + }, + { + "epoch": 0.4995210886151204, + "grad_norm": 0.18248772621154785, + "learning_rate": 0.00010035326739003726, + "loss": 0.0255, + "step": 3781 + }, + { + "epoch": 0.49965320210060443, + "grad_norm": 0.1315157413482666, + "learning_rate": 0.00010031170666419853, + "loss": 0.0158, + "step": 3782 + }, + { + "epoch": 0.49978531558608846, + "grad_norm": 0.11246932297945023, + "learning_rate": 0.00010027014588451827, + "loss": 0.0085, + "step": 3783 + }, + { + "epoch": 0.4999174290715725, + "grad_norm": 0.22276318073272705, + "learning_rate": 0.00010022858505817539, + "loss": 0.025, + "step": 3784 + }, + { + "epoch": 0.5000495425570565, + "grad_norm": 0.15550485253334045, + "learning_rate": 0.0001001870241923487, + "loss": 0.0146, + "step": 3785 + }, + { + "epoch": 0.5001816560425405, + "grad_norm": 0.123907171189785, + "learning_rate": 0.00010014546329421707, + "loss": 0.0146, + "step": 3786 + }, + { + "epoch": 0.5003137695280245, + "grad_norm": 0.22745291888713837, + "learning_rate": 0.00010010390237095941, + "loss": 0.0361, + "step": 3787 + }, + { + "epoch": 0.5004458830135086, + "grad_norm": 0.15615154802799225, + "learning_rate": 0.00010006234142975452, + "loss": 0.0172, + "step": 3788 + }, + { + "epoch": 0.5005779964989926, + "grad_norm": 0.13986553251743317, + "learning_rate": 0.00010002078047778134, + "loss": 0.0163, + "step": 3789 + }, + { + "epoch": 0.5007101099844766, + "grad_norm": 0.2259240746498108, + "learning_rate": 9.99792195222187e-05, + "loss": 0.0304, + "step": 3790 + }, + { + "epoch": 0.5008422234699607, + "grad_norm": 0.21802827715873718, + "learning_rate": 9.993765857024549e-05, + "loss": 0.0186, + "step": 3791 + }, + { + "epoch": 0.5009743369554447, + "grad_norm": 0.13896140456199646, + "learning_rate": 9.989609762904061e-05, + "loss": 0.0197, + "step": 3792 + }, + { + "epoch": 0.5011064504409287, + "grad_norm": 0.24788185954093933, + "learning_rate": 9.985453670578292e-05, + "loss": 0.0257, + "step": 3793 + }, + { + "epoch": 0.5012385639264128, + "grad_norm": 0.23262043297290802, + "learning_rate": 9.981297580765132e-05, + "loss": 0.0283, + "step": 3794 + }, + { + "epoch": 0.5013706774118968, + "grad_norm": 0.1629173457622528, + "learning_rate": 9.977141494182461e-05, + "loss": 0.0123, + "step": 3795 + }, + { + "epoch": 0.5015027908973808, + "grad_norm": 0.22046636044979095, + "learning_rate": 9.972985411548173e-05, + "loss": 0.0234, + "step": 3796 + }, + { + "epoch": 0.5016349043828648, + "grad_norm": 0.1676786243915558, + "learning_rate": 9.96882933358015e-05, + "loss": 0.023, + "step": 3797 + }, + { + "epoch": 0.5017670178683489, + "grad_norm": 0.13438016176223755, + "learning_rate": 9.964673260996274e-05, + "loss": 0.014, + "step": 3798 + }, + { + "epoch": 0.5018991313538329, + "grad_norm": 0.15927131474018097, + "learning_rate": 9.960517194514435e-05, + "loss": 0.0166, + "step": 3799 + }, + { + "epoch": 0.5020312448393169, + "grad_norm": 0.23379889130592346, + "learning_rate": 9.956361134852509e-05, + "loss": 0.0204, + "step": 3800 + }, + { + "epoch": 0.502163358324801, + "grad_norm": 0.1917094886302948, + "learning_rate": 9.952205082728384e-05, + "loss": 0.0235, + "step": 3801 + }, + { + "epoch": 0.502295471810285, + "grad_norm": 0.20136989653110504, + "learning_rate": 9.948049038859935e-05, + "loss": 0.0096, + "step": 3802 + }, + { + "epoch": 0.502427585295769, + "grad_norm": 0.16332395374774933, + "learning_rate": 9.943893003965044e-05, + "loss": 0.0223, + "step": 3803 + }, + { + "epoch": 0.5025596987812531, + "grad_norm": 0.3075923025608063, + "learning_rate": 9.939736978761589e-05, + "loss": 0.0225, + "step": 3804 + }, + { + "epoch": 0.5026918122667371, + "grad_norm": 0.1782737374305725, + "learning_rate": 9.935580963967442e-05, + "loss": 0.0246, + "step": 3805 + }, + { + "epoch": 0.5028239257522211, + "grad_norm": 0.11710914969444275, + "learning_rate": 9.93142496030048e-05, + "loss": 0.0092, + "step": 3806 + }, + { + "epoch": 0.5029560392377052, + "grad_norm": 0.21303799748420715, + "learning_rate": 9.927268968478573e-05, + "loss": 0.0124, + "step": 3807 + }, + { + "epoch": 0.5030881527231892, + "grad_norm": 0.41062068939208984, + "learning_rate": 9.923112989219594e-05, + "loss": 0.0311, + "step": 3808 + }, + { + "epoch": 0.5032202662086732, + "grad_norm": 0.3300868570804596, + "learning_rate": 9.918957023241406e-05, + "loss": 0.0231, + "step": 3809 + }, + { + "epoch": 0.5033523796941572, + "grad_norm": 0.19093488156795502, + "learning_rate": 9.91480107126188e-05, + "loss": 0.0232, + "step": 3810 + }, + { + "epoch": 0.5034844931796413, + "grad_norm": 0.1572832465171814, + "learning_rate": 9.910645133998875e-05, + "loss": 0.0131, + "step": 3811 + }, + { + "epoch": 0.5036166066651253, + "grad_norm": 0.1398879587650299, + "learning_rate": 9.90648921217025e-05, + "loss": 0.0141, + "step": 3812 + }, + { + "epoch": 0.5037487201506093, + "grad_norm": 0.2715822160243988, + "learning_rate": 9.902333306493868e-05, + "loss": 0.0215, + "step": 3813 + }, + { + "epoch": 0.5038808336360934, + "grad_norm": 0.18688692152500153, + "learning_rate": 9.898177417687578e-05, + "loss": 0.0224, + "step": 3814 + }, + { + "epoch": 0.5040129471215774, + "grad_norm": 0.1852293759584427, + "learning_rate": 9.894021546469239e-05, + "loss": 0.0176, + "step": 3815 + }, + { + "epoch": 0.5041450606070614, + "grad_norm": 0.28092116117477417, + "learning_rate": 9.889865693556694e-05, + "loss": 0.0187, + "step": 3816 + }, + { + "epoch": 0.5042771740925455, + "grad_norm": 0.20824526250362396, + "learning_rate": 9.885709859667792e-05, + "loss": 0.0267, + "step": 3817 + }, + { + "epoch": 0.5044092875780295, + "grad_norm": 0.19389167428016663, + "learning_rate": 9.881554045520376e-05, + "loss": 0.0348, + "step": 3818 + }, + { + "epoch": 0.5045414010635135, + "grad_norm": 0.20891070365905762, + "learning_rate": 9.87739825183228e-05, + "loss": 0.0182, + "step": 3819 + }, + { + "epoch": 0.5046735145489976, + "grad_norm": 0.17443668842315674, + "learning_rate": 9.873242479321348e-05, + "loss": 0.0123, + "step": 3820 + }, + { + "epoch": 0.5048056280344816, + "grad_norm": 0.17366589605808258, + "learning_rate": 9.869086728705406e-05, + "loss": 0.0171, + "step": 3821 + }, + { + "epoch": 0.5049377415199656, + "grad_norm": 0.17065440118312836, + "learning_rate": 9.864931000702284e-05, + "loss": 0.0126, + "step": 3822 + }, + { + "epoch": 0.5050698550054497, + "grad_norm": 0.29523712396621704, + "learning_rate": 9.860775296029805e-05, + "loss": 0.023, + "step": 3823 + }, + { + "epoch": 0.5052019684909337, + "grad_norm": 0.21273942291736603, + "learning_rate": 9.856619615405793e-05, + "loss": 0.0204, + "step": 3824 + }, + { + "epoch": 0.5053340819764177, + "grad_norm": 0.2513734698295593, + "learning_rate": 9.85246395954806e-05, + "loss": 0.0222, + "step": 3825 + }, + { + "epoch": 0.5054661954619017, + "grad_norm": 0.18025194108486176, + "learning_rate": 9.848308329174419e-05, + "loss": 0.0157, + "step": 3826 + }, + { + "epoch": 0.5055983089473858, + "grad_norm": 0.13815920054912567, + "learning_rate": 9.84415272500268e-05, + "loss": 0.0125, + "step": 3827 + }, + { + "epoch": 0.5057304224328698, + "grad_norm": 0.246177539229393, + "learning_rate": 9.839997147750641e-05, + "loss": 0.028, + "step": 3828 + }, + { + "epoch": 0.5058625359183538, + "grad_norm": 0.10835841298103333, + "learning_rate": 9.835841598136105e-05, + "loss": 0.0079, + "step": 3829 + }, + { + "epoch": 0.5059946494038379, + "grad_norm": 0.1650642603635788, + "learning_rate": 9.831686076876863e-05, + "loss": 0.0211, + "step": 3830 + }, + { + "epoch": 0.5061267628893219, + "grad_norm": 0.1597212553024292, + "learning_rate": 9.827530584690705e-05, + "loss": 0.0147, + "step": 3831 + }, + { + "epoch": 0.5062588763748059, + "grad_norm": 0.24264469742774963, + "learning_rate": 9.823375122295414e-05, + "loss": 0.0219, + "step": 3832 + }, + { + "epoch": 0.50639098986029, + "grad_norm": 0.3036757707595825, + "learning_rate": 9.819219690408766e-05, + "loss": 0.0323, + "step": 3833 + }, + { + "epoch": 0.506523103345774, + "grad_norm": 0.20398838818073273, + "learning_rate": 9.815064289748538e-05, + "loss": 0.0177, + "step": 3834 + }, + { + "epoch": 0.506655216831258, + "grad_norm": 0.16952387988567352, + "learning_rate": 9.810908921032495e-05, + "loss": 0.013, + "step": 3835 + }, + { + "epoch": 0.5067873303167421, + "grad_norm": 0.19409286975860596, + "learning_rate": 9.806753584978403e-05, + "loss": 0.0182, + "step": 3836 + }, + { + "epoch": 0.5069194438022261, + "grad_norm": 0.1850726455450058, + "learning_rate": 9.802598282304013e-05, + "loss": 0.0206, + "step": 3837 + }, + { + "epoch": 0.5070515572877101, + "grad_norm": 0.19560378789901733, + "learning_rate": 9.798443013727082e-05, + "loss": 0.0242, + "step": 3838 + }, + { + "epoch": 0.5071836707731942, + "grad_norm": 0.13183735311031342, + "learning_rate": 9.79428777996535e-05, + "loss": 0.0229, + "step": 3839 + }, + { + "epoch": 0.5073157842586782, + "grad_norm": 0.24583682417869568, + "learning_rate": 9.790132581736557e-05, + "loss": 0.029, + "step": 3840 + }, + { + "epoch": 0.5074478977441622, + "grad_norm": 0.1971963793039322, + "learning_rate": 9.785977419758439e-05, + "loss": 0.0151, + "step": 3841 + }, + { + "epoch": 0.5075800112296462, + "grad_norm": 0.12109408527612686, + "learning_rate": 9.781822294748716e-05, + "loss": 0.0132, + "step": 3842 + }, + { + "epoch": 0.5077121247151303, + "grad_norm": 0.21014884114265442, + "learning_rate": 9.777667207425116e-05, + "loss": 0.0189, + "step": 3843 + }, + { + "epoch": 0.5078442382006143, + "grad_norm": 0.16626541316509247, + "learning_rate": 9.773512158505345e-05, + "loss": 0.0097, + "step": 3844 + }, + { + "epoch": 0.5079763516860983, + "grad_norm": 0.16020405292510986, + "learning_rate": 9.76935714870712e-05, + "loss": 0.0188, + "step": 3845 + }, + { + "epoch": 0.5081084651715824, + "grad_norm": 0.3006601631641388, + "learning_rate": 9.765202178748132e-05, + "loss": 0.0337, + "step": 3846 + }, + { + "epoch": 0.5082405786570664, + "grad_norm": 0.26183125376701355, + "learning_rate": 9.761047249346076e-05, + "loss": 0.0328, + "step": 3847 + }, + { + "epoch": 0.5083726921425504, + "grad_norm": 0.1901049017906189, + "learning_rate": 9.756892361218642e-05, + "loss": 0.0175, + "step": 3848 + }, + { + "epoch": 0.5085048056280345, + "grad_norm": 0.17531642317771912, + "learning_rate": 9.752737515083501e-05, + "loss": 0.0192, + "step": 3849 + }, + { + "epoch": 0.5086369191135185, + "grad_norm": 0.2943665683269501, + "learning_rate": 9.748582711658336e-05, + "loss": 0.0346, + "step": 3850 + }, + { + "epoch": 0.5087690325990025, + "grad_norm": 0.12566107511520386, + "learning_rate": 9.744427951660809e-05, + "loss": 0.0142, + "step": 3851 + }, + { + "epoch": 0.5089011460844866, + "grad_norm": 0.1161152645945549, + "learning_rate": 9.740273235808572e-05, + "loss": 0.0037, + "step": 3852 + }, + { + "epoch": 0.5090332595699706, + "grad_norm": 0.1520073562860489, + "learning_rate": 9.736118564819279e-05, + "loss": 0.0155, + "step": 3853 + }, + { + "epoch": 0.5091653730554546, + "grad_norm": 0.13466641306877136, + "learning_rate": 9.73196393941057e-05, + "loss": 0.0089, + "step": 3854 + }, + { + "epoch": 0.5092974865409386, + "grad_norm": 0.309459388256073, + "learning_rate": 9.727809360300077e-05, + "loss": 0.0203, + "step": 3855 + }, + { + "epoch": 0.5094296000264227, + "grad_norm": 0.30888691544532776, + "learning_rate": 9.72365482820543e-05, + "loss": 0.0256, + "step": 3856 + }, + { + "epoch": 0.5095617135119067, + "grad_norm": 0.16353288292884827, + "learning_rate": 9.719500343844242e-05, + "loss": 0.0181, + "step": 3857 + }, + { + "epoch": 0.5096938269973907, + "grad_norm": 0.16413989663124084, + "learning_rate": 9.715345907934128e-05, + "loss": 0.0145, + "step": 3858 + }, + { + "epoch": 0.5098259404828748, + "grad_norm": 0.37346822023391724, + "learning_rate": 9.711191521192685e-05, + "loss": 0.0271, + "step": 3859 + }, + { + "epoch": 0.5099580539683588, + "grad_norm": 0.1723538339138031, + "learning_rate": 9.707037184337506e-05, + "loss": 0.0176, + "step": 3860 + }, + { + "epoch": 0.5100901674538428, + "grad_norm": 0.18036048114299774, + "learning_rate": 9.702882898086177e-05, + "loss": 0.0241, + "step": 3861 + }, + { + "epoch": 0.5102222809393269, + "grad_norm": 0.19181881844997406, + "learning_rate": 9.698728663156271e-05, + "loss": 0.0185, + "step": 3862 + }, + { + "epoch": 0.5103543944248109, + "grad_norm": 0.17694401741027832, + "learning_rate": 9.694574480265357e-05, + "loss": 0.016, + "step": 3863 + }, + { + "epoch": 0.5104865079102949, + "grad_norm": 0.20160824060440063, + "learning_rate": 9.690420350130988e-05, + "loss": 0.028, + "step": 3864 + }, + { + "epoch": 0.510618621395779, + "grad_norm": 0.18602226674556732, + "learning_rate": 9.686266273470718e-05, + "loss": 0.0214, + "step": 3865 + }, + { + "epoch": 0.510750734881263, + "grad_norm": 0.2467876374721527, + "learning_rate": 9.682112251002082e-05, + "loss": 0.0225, + "step": 3866 + }, + { + "epoch": 0.510882848366747, + "grad_norm": 0.13355334103107452, + "learning_rate": 9.677958283442612e-05, + "loss": 0.0158, + "step": 3867 + }, + { + "epoch": 0.511014961852231, + "grad_norm": 0.1554861217737198, + "learning_rate": 9.673804371509827e-05, + "loss": 0.0228, + "step": 3868 + }, + { + "epoch": 0.5111470753377151, + "grad_norm": 0.2639622092247009, + "learning_rate": 9.669650515921236e-05, + "loss": 0.0184, + "step": 3869 + }, + { + "epoch": 0.5112791888231991, + "grad_norm": 0.16291341185569763, + "learning_rate": 9.665496717394345e-05, + "loss": 0.0133, + "step": 3870 + }, + { + "epoch": 0.5114113023086831, + "grad_norm": 0.1974400132894516, + "learning_rate": 9.661342976646638e-05, + "loss": 0.0234, + "step": 3871 + }, + { + "epoch": 0.5115434157941672, + "grad_norm": 0.18314112722873688, + "learning_rate": 9.657189294395603e-05, + "loss": 0.0208, + "step": 3872 + }, + { + "epoch": 0.5116755292796512, + "grad_norm": 0.17760257422924042, + "learning_rate": 9.653035671358705e-05, + "loss": 0.0186, + "step": 3873 + }, + { + "epoch": 0.5118076427651352, + "grad_norm": 0.18622662127017975, + "learning_rate": 9.64888210825341e-05, + "loss": 0.0251, + "step": 3874 + }, + { + "epoch": 0.5119397562506193, + "grad_norm": 0.19988639652729034, + "learning_rate": 9.644728605797167e-05, + "loss": 0.0244, + "step": 3875 + }, + { + "epoch": 0.5120718697361033, + "grad_norm": 0.11412037163972855, + "learning_rate": 9.64057516470741e-05, + "loss": 0.0152, + "step": 3876 + }, + { + "epoch": 0.5122039832215873, + "grad_norm": 0.13845624029636383, + "learning_rate": 9.636421785701577e-05, + "loss": 0.0189, + "step": 3877 + }, + { + "epoch": 0.5123360967070714, + "grad_norm": 0.24381454288959503, + "learning_rate": 9.632268469497081e-05, + "loss": 0.0156, + "step": 3878 + }, + { + "epoch": 0.5124682101925554, + "grad_norm": 0.18773894011974335, + "learning_rate": 9.628115216811332e-05, + "loss": 0.0212, + "step": 3879 + }, + { + "epoch": 0.5126003236780394, + "grad_norm": 0.08962608873844147, + "learning_rate": 9.623962028361725e-05, + "loss": 0.0095, + "step": 3880 + }, + { + "epoch": 0.5127324371635235, + "grad_norm": 0.12506498396396637, + "learning_rate": 9.619808904865649e-05, + "loss": 0.0146, + "step": 3881 + }, + { + "epoch": 0.5128645506490075, + "grad_norm": 0.251169353723526, + "learning_rate": 9.615655847040475e-05, + "loss": 0.0267, + "step": 3882 + }, + { + "epoch": 0.5129966641344915, + "grad_norm": 0.22696994245052338, + "learning_rate": 9.611502855603565e-05, + "loss": 0.0224, + "step": 3883 + }, + { + "epoch": 0.5131287776199756, + "grad_norm": 0.08587751537561417, + "learning_rate": 9.607349931272276e-05, + "loss": 0.007, + "step": 3884 + }, + { + "epoch": 0.5132608911054596, + "grad_norm": 0.2083951234817505, + "learning_rate": 9.603197074763942e-05, + "loss": 0.0168, + "step": 3885 + }, + { + "epoch": 0.5133930045909436, + "grad_norm": 0.24723173677921295, + "learning_rate": 9.599044286795896e-05, + "loss": 0.0174, + "step": 3886 + }, + { + "epoch": 0.5135251180764276, + "grad_norm": 0.1207137405872345, + "learning_rate": 9.594891568085452e-05, + "loss": 0.011, + "step": 3887 + }, + { + "epoch": 0.5136572315619117, + "grad_norm": 0.15802474319934845, + "learning_rate": 9.590738919349917e-05, + "loss": 0.0164, + "step": 3888 + }, + { + "epoch": 0.5137893450473957, + "grad_norm": 0.2264273762702942, + "learning_rate": 9.58658634130658e-05, + "loss": 0.0261, + "step": 3889 + }, + { + "epoch": 0.5139214585328797, + "grad_norm": 0.09568583220243454, + "learning_rate": 9.582433834672723e-05, + "loss": 0.0094, + "step": 3890 + }, + { + "epoch": 0.5140535720183638, + "grad_norm": 0.19316567480564117, + "learning_rate": 9.578281400165614e-05, + "loss": 0.0138, + "step": 3891 + }, + { + "epoch": 0.5141856855038478, + "grad_norm": 0.1675315797328949, + "learning_rate": 9.574129038502506e-05, + "loss": 0.0195, + "step": 3892 + }, + { + "epoch": 0.5143177989893318, + "grad_norm": 0.1676243245601654, + "learning_rate": 9.569976750400648e-05, + "loss": 0.0144, + "step": 3893 + }, + { + "epoch": 0.5144499124748159, + "grad_norm": 0.1998102366924286, + "learning_rate": 9.565824536577262e-05, + "loss": 0.013, + "step": 3894 + }, + { + "epoch": 0.5145820259602999, + "grad_norm": 0.2167922407388687, + "learning_rate": 9.561672397749572e-05, + "loss": 0.0206, + "step": 3895 + }, + { + "epoch": 0.5147141394457839, + "grad_norm": 0.15804733335971832, + "learning_rate": 9.557520334634781e-05, + "loss": 0.0194, + "step": 3896 + }, + { + "epoch": 0.514846252931268, + "grad_norm": 0.14819398522377014, + "learning_rate": 9.553368347950076e-05, + "loss": 0.0166, + "step": 3897 + }, + { + "epoch": 0.514978366416752, + "grad_norm": 0.10181767493486404, + "learning_rate": 9.549216438412639e-05, + "loss": 0.0106, + "step": 3898 + }, + { + "epoch": 0.515110479902236, + "grad_norm": 0.22075830399990082, + "learning_rate": 9.545064606739633e-05, + "loss": 0.0218, + "step": 3899 + }, + { + "epoch": 0.51524259338772, + "grad_norm": 0.15722963213920593, + "learning_rate": 9.540912853648212e-05, + "loss": 0.0218, + "step": 3900 + }, + { + "epoch": 0.5153747068732041, + "grad_norm": 0.17477941513061523, + "learning_rate": 9.53676117985551e-05, + "loss": 0.0229, + "step": 3901 + }, + { + "epoch": 0.5155068203586881, + "grad_norm": 0.18030881881713867, + "learning_rate": 9.532609586078655e-05, + "loss": 0.0244, + "step": 3902 + }, + { + "epoch": 0.5156389338441721, + "grad_norm": 0.15806345641613007, + "learning_rate": 9.528458073034755e-05, + "loss": 0.0272, + "step": 3903 + }, + { + "epoch": 0.5157710473296562, + "grad_norm": 0.2096640020608902, + "learning_rate": 9.524306641440904e-05, + "loss": 0.0136, + "step": 3904 + }, + { + "epoch": 0.5159031608151402, + "grad_norm": 0.11178679019212723, + "learning_rate": 9.52015529201419e-05, + "loss": 0.0101, + "step": 3905 + }, + { + "epoch": 0.5160352743006242, + "grad_norm": 0.19791501760482788, + "learning_rate": 9.516004025471675e-05, + "loss": 0.0317, + "step": 3906 + }, + { + "epoch": 0.5161673877861083, + "grad_norm": 0.24150808155536652, + "learning_rate": 9.51185284253041e-05, + "loss": 0.031, + "step": 3907 + }, + { + "epoch": 0.5162995012715923, + "grad_norm": 0.2426425963640213, + "learning_rate": 9.507701743907446e-05, + "loss": 0.0318, + "step": 3908 + }, + { + "epoch": 0.5164316147570763, + "grad_norm": 0.23894216120243073, + "learning_rate": 9.5035507303198e-05, + "loss": 0.0318, + "step": 3909 + }, + { + "epoch": 0.5165637282425604, + "grad_norm": 0.21717692911624908, + "learning_rate": 9.499399802484485e-05, + "loss": 0.0199, + "step": 3910 + }, + { + "epoch": 0.5166958417280444, + "grad_norm": 0.1702679842710495, + "learning_rate": 9.495248961118492e-05, + "loss": 0.0118, + "step": 3911 + }, + { + "epoch": 0.5168279552135284, + "grad_norm": 0.17146004736423492, + "learning_rate": 9.491098206938803e-05, + "loss": 0.0149, + "step": 3912 + }, + { + "epoch": 0.5169600686990125, + "grad_norm": 0.3598407506942749, + "learning_rate": 9.486947540662385e-05, + "loss": 0.0236, + "step": 3913 + }, + { + "epoch": 0.5170921821844965, + "grad_norm": 0.19947464764118195, + "learning_rate": 9.482796963006186e-05, + "loss": 0.0369, + "step": 3914 + }, + { + "epoch": 0.5172242956699805, + "grad_norm": 0.15635952353477478, + "learning_rate": 9.478646474687142e-05, + "loss": 0.0188, + "step": 3915 + }, + { + "epoch": 0.5173564091554645, + "grad_norm": 0.1626300811767578, + "learning_rate": 9.47449607642217e-05, + "loss": 0.0167, + "step": 3916 + }, + { + "epoch": 0.5174885226409486, + "grad_norm": 0.18757295608520508, + "learning_rate": 9.470345768928178e-05, + "loss": 0.0157, + "step": 3917 + }, + { + "epoch": 0.5176206361264326, + "grad_norm": 0.28251126408576965, + "learning_rate": 9.466195552922052e-05, + "loss": 0.0313, + "step": 3918 + }, + { + "epoch": 0.5177527496119166, + "grad_norm": 0.12264434248209, + "learning_rate": 9.46204542912066e-05, + "loss": 0.0192, + "step": 3919 + }, + { + "epoch": 0.5178848630974007, + "grad_norm": 0.1789926439523697, + "learning_rate": 9.457895398240864e-05, + "loss": 0.0135, + "step": 3920 + }, + { + "epoch": 0.5180169765828847, + "grad_norm": 0.14935386180877686, + "learning_rate": 9.4537454609995e-05, + "loss": 0.0115, + "step": 3921 + }, + { + "epoch": 0.5181490900683687, + "grad_norm": 0.47989845275878906, + "learning_rate": 9.449595618113395e-05, + "loss": 0.0112, + "step": 3922 + }, + { + "epoch": 0.5182812035538528, + "grad_norm": 0.14519643783569336, + "learning_rate": 9.445445870299356e-05, + "loss": 0.014, + "step": 3923 + }, + { + "epoch": 0.5184133170393368, + "grad_norm": 0.15153749287128448, + "learning_rate": 9.441296218274176e-05, + "loss": 0.0181, + "step": 3924 + }, + { + "epoch": 0.5185454305248208, + "grad_norm": 0.11909378319978714, + "learning_rate": 9.437146662754628e-05, + "loss": 0.018, + "step": 3925 + }, + { + "epoch": 0.5186775440103049, + "grad_norm": 0.1426568329334259, + "learning_rate": 9.432997204457467e-05, + "loss": 0.014, + "step": 3926 + }, + { + "epoch": 0.5188096574957889, + "grad_norm": 0.21028953790664673, + "learning_rate": 9.428847844099441e-05, + "loss": 0.0168, + "step": 3927 + }, + { + "epoch": 0.5189417709812729, + "grad_norm": 0.37740516662597656, + "learning_rate": 9.424698582397269e-05, + "loss": 0.0188, + "step": 3928 + }, + { + "epoch": 0.519073884466757, + "grad_norm": 0.1522783786058426, + "learning_rate": 9.420549420067661e-05, + "loss": 0.0218, + "step": 3929 + }, + { + "epoch": 0.519205997952241, + "grad_norm": 0.11716365814208984, + "learning_rate": 9.416400357827306e-05, + "loss": 0.0085, + "step": 3930 + }, + { + "epoch": 0.519338111437725, + "grad_norm": 0.24627836048603058, + "learning_rate": 9.412251396392878e-05, + "loss": 0.0115, + "step": 3931 + }, + { + "epoch": 0.519470224923209, + "grad_norm": 0.22646081447601318, + "learning_rate": 9.408102536481033e-05, + "loss": 0.0314, + "step": 3932 + }, + { + "epoch": 0.5196023384086931, + "grad_norm": 0.11769570410251617, + "learning_rate": 9.403953778808406e-05, + "loss": 0.0086, + "step": 3933 + }, + { + "epoch": 0.5197344518941771, + "grad_norm": 0.1721094697713852, + "learning_rate": 9.399805124091619e-05, + "loss": 0.0216, + "step": 3934 + }, + { + "epoch": 0.5198665653796611, + "grad_norm": 0.2551160752773285, + "learning_rate": 9.395656573047276e-05, + "loss": 0.0272, + "step": 3935 + }, + { + "epoch": 0.5199986788651452, + "grad_norm": 0.23127736151218414, + "learning_rate": 9.39150812639196e-05, + "loss": 0.0174, + "step": 3936 + }, + { + "epoch": 0.5201307923506292, + "grad_norm": 0.12697872519493103, + "learning_rate": 9.387359784842238e-05, + "loss": 0.0222, + "step": 3937 + }, + { + "epoch": 0.5202629058361132, + "grad_norm": 0.19013381004333496, + "learning_rate": 9.383211549114659e-05, + "loss": 0.0246, + "step": 3938 + }, + { + "epoch": 0.5203950193215973, + "grad_norm": 0.17478132247924805, + "learning_rate": 9.379063419925753e-05, + "loss": 0.0168, + "step": 3939 + }, + { + "epoch": 0.5205271328070813, + "grad_norm": 0.1878492385149002, + "learning_rate": 9.37491539799203e-05, + "loss": 0.017, + "step": 3940 + }, + { + "epoch": 0.5206592462925653, + "grad_norm": 0.2478381097316742, + "learning_rate": 9.370767484029987e-05, + "loss": 0.0159, + "step": 3941 + }, + { + "epoch": 0.5207913597780494, + "grad_norm": 0.19272209703922272, + "learning_rate": 9.366619678756092e-05, + "loss": 0.0191, + "step": 3942 + }, + { + "epoch": 0.5209234732635334, + "grad_norm": 0.16208535432815552, + "learning_rate": 9.362471982886809e-05, + "loss": 0.0122, + "step": 3943 + }, + { + "epoch": 0.5210555867490174, + "grad_norm": 0.25687727332115173, + "learning_rate": 9.358324397138568e-05, + "loss": 0.0327, + "step": 3944 + }, + { + "epoch": 0.5211877002345014, + "grad_norm": 0.1902291625738144, + "learning_rate": 9.354176922227793e-05, + "loss": 0.013, + "step": 3945 + }, + { + "epoch": 0.5213198137199855, + "grad_norm": 0.22686885297298431, + "learning_rate": 9.350029558870879e-05, + "loss": 0.0269, + "step": 3946 + }, + { + "epoch": 0.5214519272054695, + "grad_norm": 0.16572003066539764, + "learning_rate": 9.345882307784203e-05, + "loss": 0.0188, + "step": 3947 + }, + { + "epoch": 0.5215840406909535, + "grad_norm": 0.23944619297981262, + "learning_rate": 9.341735169684132e-05, + "loss": 0.022, + "step": 3948 + }, + { + "epoch": 0.5217161541764376, + "grad_norm": 0.2262609750032425, + "learning_rate": 9.337588145286998e-05, + "loss": 0.0295, + "step": 3949 + }, + { + "epoch": 0.5218482676619216, + "grad_norm": 0.12378781288862228, + "learning_rate": 9.33344123530913e-05, + "loss": 0.0123, + "step": 3950 + }, + { + "epoch": 0.5219803811474056, + "grad_norm": 0.26705402135849, + "learning_rate": 9.329294440466825e-05, + "loss": 0.0167, + "step": 3951 + }, + { + "epoch": 0.5221124946328897, + "grad_norm": 0.22340580821037292, + "learning_rate": 9.325147761476365e-05, + "loss": 0.0229, + "step": 3952 + }, + { + "epoch": 0.5222446081183737, + "grad_norm": 0.2640053331851959, + "learning_rate": 9.321001199054012e-05, + "loss": 0.0259, + "step": 3953 + }, + { + "epoch": 0.5223767216038577, + "grad_norm": 0.18717001378536224, + "learning_rate": 9.316854753916006e-05, + "loss": 0.0145, + "step": 3954 + }, + { + "epoch": 0.5225088350893418, + "grad_norm": 0.17452330887317657, + "learning_rate": 9.312708426778568e-05, + "loss": 0.022, + "step": 3955 + }, + { + "epoch": 0.5226409485748258, + "grad_norm": 0.1928424835205078, + "learning_rate": 9.308562218357898e-05, + "loss": 0.0206, + "step": 3956 + }, + { + "epoch": 0.5227730620603098, + "grad_norm": 0.16473954916000366, + "learning_rate": 9.30441612937018e-05, + "loss": 0.018, + "step": 3957 + }, + { + "epoch": 0.5229051755457939, + "grad_norm": 0.21050867438316345, + "learning_rate": 9.300270160531566e-05, + "loss": 0.0154, + "step": 3958 + }, + { + "epoch": 0.5230372890312779, + "grad_norm": 0.23349924385547638, + "learning_rate": 9.296124312558201e-05, + "loss": 0.0165, + "step": 3959 + }, + { + "epoch": 0.5231694025167619, + "grad_norm": 0.17704570293426514, + "learning_rate": 9.291978586166201e-05, + "loss": 0.0283, + "step": 3960 + }, + { + "epoch": 0.523301516002246, + "grad_norm": 0.3553520143032074, + "learning_rate": 9.28783298207166e-05, + "loss": 0.0396, + "step": 3961 + }, + { + "epoch": 0.52343362948773, + "grad_norm": 0.19322533905506134, + "learning_rate": 9.283687500990658e-05, + "loss": 0.011, + "step": 3962 + }, + { + "epoch": 0.523565742973214, + "grad_norm": 0.20268426835536957, + "learning_rate": 9.279542143639245e-05, + "loss": 0.0134, + "step": 3963 + }, + { + "epoch": 0.523697856458698, + "grad_norm": 0.2077493667602539, + "learning_rate": 9.275396910733458e-05, + "loss": 0.034, + "step": 3964 + }, + { + "epoch": 0.5238299699441821, + "grad_norm": 0.17809870839118958, + "learning_rate": 9.271251802989305e-05, + "loss": 0.0131, + "step": 3965 + }, + { + "epoch": 0.5239620834296661, + "grad_norm": 0.22379878163337708, + "learning_rate": 9.267106821122774e-05, + "loss": 0.0083, + "step": 3966 + }, + { + "epoch": 0.5240941969151501, + "grad_norm": 0.2034342736005783, + "learning_rate": 9.26296196584984e-05, + "loss": 0.0161, + "step": 3967 + }, + { + "epoch": 0.5242263104006342, + "grad_norm": 0.20792457461357117, + "learning_rate": 9.25881723788645e-05, + "loss": 0.0298, + "step": 3968 + }, + { + "epoch": 0.5243584238861182, + "grad_norm": 0.28438884019851685, + "learning_rate": 9.254672637948518e-05, + "loss": 0.029, + "step": 3969 + }, + { + "epoch": 0.5244905373716022, + "grad_norm": 0.20338405668735504, + "learning_rate": 9.250528166751956e-05, + "loss": 0.0183, + "step": 3970 + }, + { + "epoch": 0.5246226508570863, + "grad_norm": 0.22804833948612213, + "learning_rate": 9.246383825012637e-05, + "loss": 0.021, + "step": 3971 + }, + { + "epoch": 0.5247547643425703, + "grad_norm": 0.16153642535209656, + "learning_rate": 9.242239613446425e-05, + "loss": 0.0119, + "step": 3972 + }, + { + "epoch": 0.5248868778280543, + "grad_norm": 0.2589012086391449, + "learning_rate": 9.238095532769149e-05, + "loss": 0.025, + "step": 3973 + }, + { + "epoch": 0.5250189913135384, + "grad_norm": 0.14929302036762238, + "learning_rate": 9.233951583696628e-05, + "loss": 0.015, + "step": 3974 + }, + { + "epoch": 0.5251511047990224, + "grad_norm": 0.2125391960144043, + "learning_rate": 9.229807766944645e-05, + "loss": 0.0186, + "step": 3975 + }, + { + "epoch": 0.5252832182845064, + "grad_norm": 0.16655296087265015, + "learning_rate": 9.225664083228969e-05, + "loss": 0.0156, + "step": 3976 + }, + { + "epoch": 0.5254153317699904, + "grad_norm": 0.19410564005374908, + "learning_rate": 9.221520533265347e-05, + "loss": 0.0251, + "step": 3977 + }, + { + "epoch": 0.5255474452554745, + "grad_norm": 0.30204856395721436, + "learning_rate": 9.217377117769494e-05, + "loss": 0.0303, + "step": 3978 + }, + { + "epoch": 0.5256795587409585, + "grad_norm": 0.13079041242599487, + "learning_rate": 9.213233837457115e-05, + "loss": 0.0186, + "step": 3979 + }, + { + "epoch": 0.5258116722264425, + "grad_norm": 0.18816164135932922, + "learning_rate": 9.209090693043877e-05, + "loss": 0.0179, + "step": 3980 + }, + { + "epoch": 0.5259437857119266, + "grad_norm": 0.16152295470237732, + "learning_rate": 9.204947685245436e-05, + "loss": 0.0142, + "step": 3981 + }, + { + "epoch": 0.5260758991974106, + "grad_norm": 0.1895074099302292, + "learning_rate": 9.200804814777417e-05, + "loss": 0.0314, + "step": 3982 + }, + { + "epoch": 0.5262080126828946, + "grad_norm": 0.18929599225521088, + "learning_rate": 9.196662082355423e-05, + "loss": 0.0262, + "step": 3983 + }, + { + "epoch": 0.5263401261683787, + "grad_norm": 0.10806319117546082, + "learning_rate": 9.192519488695034e-05, + "loss": 0.0119, + "step": 3984 + }, + { + "epoch": 0.5264722396538627, + "grad_norm": 0.2200048416852951, + "learning_rate": 9.188377034511805e-05, + "loss": 0.0283, + "step": 3985 + }, + { + "epoch": 0.5266043531393467, + "grad_norm": 0.14298328757286072, + "learning_rate": 9.18423472052127e-05, + "loss": 0.0184, + "step": 3986 + }, + { + "epoch": 0.5267364666248308, + "grad_norm": 0.27281659841537476, + "learning_rate": 9.180092547438931e-05, + "loss": 0.0204, + "step": 3987 + }, + { + "epoch": 0.5268685801103148, + "grad_norm": 0.13350260257720947, + "learning_rate": 9.175950515980277e-05, + "loss": 0.0133, + "step": 3988 + }, + { + "epoch": 0.5270006935957988, + "grad_norm": 0.1405045986175537, + "learning_rate": 9.171808626860765e-05, + "loss": 0.0174, + "step": 3989 + }, + { + "epoch": 0.5271328070812829, + "grad_norm": 0.18542882800102234, + "learning_rate": 9.167666880795824e-05, + "loss": 0.0148, + "step": 3990 + }, + { + "epoch": 0.5272649205667669, + "grad_norm": 0.1905541568994522, + "learning_rate": 9.16352527850087e-05, + "loss": 0.018, + "step": 3991 + }, + { + "epoch": 0.5273970340522509, + "grad_norm": 0.16615454852581024, + "learning_rate": 9.159383820691283e-05, + "loss": 0.015, + "step": 3992 + }, + { + "epoch": 0.5275291475377349, + "grad_norm": 0.48656389117240906, + "learning_rate": 9.155242508082425e-05, + "loss": 0.0272, + "step": 3993 + }, + { + "epoch": 0.527661261023219, + "grad_norm": 0.1482691913843155, + "learning_rate": 9.151101341389627e-05, + "loss": 0.0185, + "step": 3994 + }, + { + "epoch": 0.527793374508703, + "grad_norm": 0.21181003749370575, + "learning_rate": 9.146960321328205e-05, + "loss": 0.0217, + "step": 3995 + }, + { + "epoch": 0.527925487994187, + "grad_norm": 0.18101364374160767, + "learning_rate": 9.142819448613433e-05, + "loss": 0.0224, + "step": 3996 + }, + { + "epoch": 0.5280576014796711, + "grad_norm": 0.1256485879421234, + "learning_rate": 9.138678723960575e-05, + "loss": 0.0128, + "step": 3997 + }, + { + "epoch": 0.5281897149651551, + "grad_norm": 0.17414726316928864, + "learning_rate": 9.134538148084866e-05, + "loss": 0.0182, + "step": 3998 + }, + { + "epoch": 0.5283218284506391, + "grad_norm": 0.15414351224899292, + "learning_rate": 9.130397721701506e-05, + "loss": 0.02, + "step": 3999 + }, + { + "epoch": 0.5284539419361232, + "grad_norm": 0.14810499548912048, + "learning_rate": 9.126257445525684e-05, + "loss": 0.0152, + "step": 4000 + }, + { + "epoch": 0.5285860554216072, + "grad_norm": 0.2823812961578369, + "learning_rate": 9.122117320272549e-05, + "loss": 0.0301, + "step": 4001 + }, + { + "epoch": 0.5287181689070912, + "grad_norm": 0.14143690466880798, + "learning_rate": 9.117977346657235e-05, + "loss": 0.015, + "step": 4002 + }, + { + "epoch": 0.5288502823925753, + "grad_norm": 0.17588815093040466, + "learning_rate": 9.113837525394843e-05, + "loss": 0.0164, + "step": 4003 + }, + { + "epoch": 0.5289823958780593, + "grad_norm": 0.14937280118465424, + "learning_rate": 9.109697857200447e-05, + "loss": 0.0204, + "step": 4004 + }, + { + "epoch": 0.5291145093635433, + "grad_norm": 0.19677433371543884, + "learning_rate": 9.105558342789103e-05, + "loss": 0.0117, + "step": 4005 + }, + { + "epoch": 0.5292466228490273, + "grad_norm": 0.3028697371482849, + "learning_rate": 9.10141898287583e-05, + "loss": 0.0197, + "step": 4006 + }, + { + "epoch": 0.5293787363345114, + "grad_norm": 0.19180864095687866, + "learning_rate": 9.097279778175627e-05, + "loss": 0.0145, + "step": 4007 + }, + { + "epoch": 0.5295108498199954, + "grad_norm": 0.1871991902589798, + "learning_rate": 9.093140729403463e-05, + "loss": 0.0253, + "step": 4008 + }, + { + "epoch": 0.5296429633054794, + "grad_norm": 0.20815619826316833, + "learning_rate": 9.089001837274284e-05, + "loss": 0.0211, + "step": 4009 + }, + { + "epoch": 0.5297750767909635, + "grad_norm": 0.19759541749954224, + "learning_rate": 9.084863102503003e-05, + "loss": 0.0119, + "step": 4010 + }, + { + "epoch": 0.5299071902764475, + "grad_norm": 0.19330792129039764, + "learning_rate": 9.08072452580451e-05, + "loss": 0.0276, + "step": 4011 + }, + { + "epoch": 0.5300393037619315, + "grad_norm": 0.25872206687927246, + "learning_rate": 9.07658610789367e-05, + "loss": 0.0285, + "step": 4012 + }, + { + "epoch": 0.5301714172474156, + "grad_norm": 0.30612707138061523, + "learning_rate": 9.072447849485311e-05, + "loss": 0.0369, + "step": 4013 + }, + { + "epoch": 0.5303035307328996, + "grad_norm": 0.1953236609697342, + "learning_rate": 9.068309751294246e-05, + "loss": 0.028, + "step": 4014 + }, + { + "epoch": 0.5304356442183836, + "grad_norm": 0.15855740010738373, + "learning_rate": 9.06417181403525e-05, + "loss": 0.0181, + "step": 4015 + }, + { + "epoch": 0.5305677577038677, + "grad_norm": 0.1605386883020401, + "learning_rate": 9.060034038423076e-05, + "loss": 0.0262, + "step": 4016 + }, + { + "epoch": 0.5306998711893517, + "grad_norm": 0.15657903254032135, + "learning_rate": 9.055896425172448e-05, + "loss": 0.0142, + "step": 4017 + }, + { + "epoch": 0.5308319846748357, + "grad_norm": 0.14779461920261383, + "learning_rate": 9.05175897499806e-05, + "loss": 0.022, + "step": 4018 + }, + { + "epoch": 0.5309640981603198, + "grad_norm": 0.1896156370639801, + "learning_rate": 9.04762168861458e-05, + "loss": 0.0133, + "step": 4019 + }, + { + "epoch": 0.5310962116458038, + "grad_norm": 0.11784496158361435, + "learning_rate": 9.043484566736644e-05, + "loss": 0.0137, + "step": 4020 + }, + { + "epoch": 0.5312283251312878, + "grad_norm": 0.13881589472293854, + "learning_rate": 9.039347610078866e-05, + "loss": 0.0144, + "step": 4021 + }, + { + "epoch": 0.5313604386167718, + "grad_norm": 0.1399046927690506, + "learning_rate": 9.035210819355827e-05, + "loss": 0.0129, + "step": 4022 + }, + { + "epoch": 0.5314925521022559, + "grad_norm": 0.14713436365127563, + "learning_rate": 9.03107419528208e-05, + "loss": 0.0095, + "step": 4023 + }, + { + "epoch": 0.5316246655877399, + "grad_norm": 0.20945364236831665, + "learning_rate": 9.026937738572148e-05, + "loss": 0.0175, + "step": 4024 + }, + { + "epoch": 0.5317567790732239, + "grad_norm": 0.10825034230947495, + "learning_rate": 9.02280144994053e-05, + "loss": 0.0128, + "step": 4025 + }, + { + "epoch": 0.531888892558708, + "grad_norm": 0.1814744919538498, + "learning_rate": 9.01866533010169e-05, + "loss": 0.0164, + "step": 4026 + }, + { + "epoch": 0.532021006044192, + "grad_norm": 0.3017287254333496, + "learning_rate": 9.014529379770067e-05, + "loss": 0.0222, + "step": 4027 + }, + { + "epoch": 0.532153119529676, + "grad_norm": 0.177540123462677, + "learning_rate": 9.010393599660065e-05, + "loss": 0.0147, + "step": 4028 + }, + { + "epoch": 0.5322852330151601, + "grad_norm": 0.16495585441589355, + "learning_rate": 9.00625799048607e-05, + "loss": 0.0132, + "step": 4029 + }, + { + "epoch": 0.5324173465006441, + "grad_norm": 0.1539219319820404, + "learning_rate": 9.002122552962423e-05, + "loss": 0.0123, + "step": 4030 + }, + { + "epoch": 0.5325494599861281, + "grad_norm": 0.19247393310070038, + "learning_rate": 8.997987287803451e-05, + "loss": 0.0192, + "step": 4031 + }, + { + "epoch": 0.5326815734716122, + "grad_norm": 0.17403817176818848, + "learning_rate": 8.993852195723438e-05, + "loss": 0.0201, + "step": 4032 + }, + { + "epoch": 0.5328136869570962, + "grad_norm": 0.20865043997764587, + "learning_rate": 8.989717277436647e-05, + "loss": 0.0225, + "step": 4033 + }, + { + "epoch": 0.5329458004425802, + "grad_norm": 0.1501004695892334, + "learning_rate": 8.985582533657306e-05, + "loss": 0.0106, + "step": 4034 + }, + { + "epoch": 0.5330779139280643, + "grad_norm": 0.329297810792923, + "learning_rate": 8.981447965099616e-05, + "loss": 0.0267, + "step": 4035 + }, + { + "epoch": 0.5332100274135483, + "grad_norm": 0.12887880206108093, + "learning_rate": 8.977313572477745e-05, + "loss": 0.0172, + "step": 4036 + }, + { + "epoch": 0.5333421408990323, + "grad_norm": 0.2528422772884369, + "learning_rate": 8.973179356505834e-05, + "loss": 0.0219, + "step": 4037 + }, + { + "epoch": 0.5334742543845163, + "grad_norm": 0.2842426002025604, + "learning_rate": 8.96904531789799e-05, + "loss": 0.0413, + "step": 4038 + }, + { + "epoch": 0.5336063678700004, + "grad_norm": 0.18451958894729614, + "learning_rate": 8.964911457368292e-05, + "loss": 0.0192, + "step": 4039 + }, + { + "epoch": 0.5337384813554844, + "grad_norm": 0.17478865385055542, + "learning_rate": 8.960777775630784e-05, + "loss": 0.0169, + "step": 4040 + }, + { + "epoch": 0.5338705948409684, + "grad_norm": 0.19136327505111694, + "learning_rate": 8.956644273399487e-05, + "loss": 0.0202, + "step": 4041 + }, + { + "epoch": 0.5340027083264525, + "grad_norm": 0.3356846868991852, + "learning_rate": 8.952510951388382e-05, + "loss": 0.0192, + "step": 4042 + }, + { + "epoch": 0.5341348218119365, + "grad_norm": 0.2409062683582306, + "learning_rate": 8.948377810311427e-05, + "loss": 0.023, + "step": 4043 + }, + { + "epoch": 0.5342669352974205, + "grad_norm": 0.13506470620632172, + "learning_rate": 8.94424485088254e-05, + "loss": 0.0097, + "step": 4044 + }, + { + "epoch": 0.5343990487829046, + "grad_norm": 0.21863852441310883, + "learning_rate": 8.940112073815619e-05, + "loss": 0.0208, + "step": 4045 + }, + { + "epoch": 0.5345311622683886, + "grad_norm": 0.2842344641685486, + "learning_rate": 8.935979479824519e-05, + "loss": 0.0256, + "step": 4046 + }, + { + "epoch": 0.5346632757538726, + "grad_norm": 0.12600763142108917, + "learning_rate": 8.931847069623068e-05, + "loss": 0.0183, + "step": 4047 + }, + { + "epoch": 0.5347953892393567, + "grad_norm": 0.18402507901191711, + "learning_rate": 8.927714843925066e-05, + "loss": 0.0331, + "step": 4048 + }, + { + "epoch": 0.5349275027248407, + "grad_norm": 0.18134433031082153, + "learning_rate": 8.923582803444274e-05, + "loss": 0.0118, + "step": 4049 + }, + { + "epoch": 0.5350596162103247, + "grad_norm": 0.14058002829551697, + "learning_rate": 8.919450948894429e-05, + "loss": 0.0143, + "step": 4050 + }, + { + "epoch": 0.5351917296958087, + "grad_norm": 0.11900404840707779, + "learning_rate": 8.915319280989226e-05, + "loss": 0.0085, + "step": 4051 + }, + { + "epoch": 0.5353238431812928, + "grad_norm": 0.13689814507961273, + "learning_rate": 8.91118780044234e-05, + "loss": 0.0122, + "step": 4052 + }, + { + "epoch": 0.5354559566667768, + "grad_norm": 0.13677994906902313, + "learning_rate": 8.907056507967402e-05, + "loss": 0.0186, + "step": 4053 + }, + { + "epoch": 0.5355880701522608, + "grad_norm": 0.0793822780251503, + "learning_rate": 8.902925404278017e-05, + "loss": 0.0083, + "step": 4054 + }, + { + "epoch": 0.5357201836377449, + "grad_norm": 0.18173062801361084, + "learning_rate": 8.898794490087757e-05, + "loss": 0.0259, + "step": 4055 + }, + { + "epoch": 0.5358522971232289, + "grad_norm": 0.17125774919986725, + "learning_rate": 8.894663766110159e-05, + "loss": 0.0168, + "step": 4056 + }, + { + "epoch": 0.5359844106087129, + "grad_norm": 0.19805628061294556, + "learning_rate": 8.890533233058729e-05, + "loss": 0.0237, + "step": 4057 + }, + { + "epoch": 0.536116524094197, + "grad_norm": 0.15703165531158447, + "learning_rate": 8.886402891646937e-05, + "loss": 0.0157, + "step": 4058 + }, + { + "epoch": 0.536248637579681, + "grad_norm": 0.12223298847675323, + "learning_rate": 8.882272742588226e-05, + "loss": 0.0155, + "step": 4059 + }, + { + "epoch": 0.536380751065165, + "grad_norm": 0.2299620360136032, + "learning_rate": 8.878142786596002e-05, + "loss": 0.0076, + "step": 4060 + }, + { + "epoch": 0.5365128645506491, + "grad_norm": 0.21877168118953705, + "learning_rate": 8.874013024383631e-05, + "loss": 0.0221, + "step": 4061 + }, + { + "epoch": 0.5366449780361331, + "grad_norm": 0.20613844692707062, + "learning_rate": 8.869883456664462e-05, + "loss": 0.0192, + "step": 4062 + }, + { + "epoch": 0.5367770915216171, + "grad_norm": 0.10324428975582123, + "learning_rate": 8.865754084151792e-05, + "loss": 0.0126, + "step": 4063 + }, + { + "epoch": 0.5369092050071012, + "grad_norm": 0.0655319094657898, + "learning_rate": 8.861624907558899e-05, + "loss": 0.0048, + "step": 4064 + }, + { + "epoch": 0.5370413184925852, + "grad_norm": 0.17869147658348083, + "learning_rate": 8.857495927599018e-05, + "loss": 0.0197, + "step": 4065 + }, + { + "epoch": 0.5371734319780692, + "grad_norm": 0.1799241304397583, + "learning_rate": 8.853367144985355e-05, + "loss": 0.0275, + "step": 4066 + }, + { + "epoch": 0.5373055454635532, + "grad_norm": 0.12295150011777878, + "learning_rate": 8.849238560431079e-05, + "loss": 0.0092, + "step": 4067 + }, + { + "epoch": 0.5374376589490373, + "grad_norm": 0.23537562787532806, + "learning_rate": 8.845110174649323e-05, + "loss": 0.0248, + "step": 4068 + }, + { + "epoch": 0.5375697724345213, + "grad_norm": 0.1479044109582901, + "learning_rate": 8.840981988353193e-05, + "loss": 0.0126, + "step": 4069 + }, + { + "epoch": 0.5377018859200052, + "grad_norm": 0.14745278656482697, + "learning_rate": 8.836854002255752e-05, + "loss": 0.0151, + "step": 4070 + }, + { + "epoch": 0.5378339994054893, + "grad_norm": 0.1753583401441574, + "learning_rate": 8.832726217070037e-05, + "loss": 0.0187, + "step": 4071 + }, + { + "epoch": 0.5379661128909733, + "grad_norm": 0.1836109310388565, + "learning_rate": 8.828598633509041e-05, + "loss": 0.0173, + "step": 4072 + }, + { + "epoch": 0.5380982263764573, + "grad_norm": 0.15662620961666107, + "learning_rate": 8.82447125228573e-05, + "loss": 0.0126, + "step": 4073 + }, + { + "epoch": 0.5382303398619414, + "grad_norm": 0.1305164247751236, + "learning_rate": 8.820344074113034e-05, + "loss": 0.0105, + "step": 4074 + }, + { + "epoch": 0.5383624533474254, + "grad_norm": 0.19191725552082062, + "learning_rate": 8.816217099703839e-05, + "loss": 0.0152, + "step": 4075 + }, + { + "epoch": 0.5384945668329094, + "grad_norm": 0.23539984226226807, + "learning_rate": 8.812090329771007e-05, + "loss": 0.0161, + "step": 4076 + }, + { + "epoch": 0.5386266803183934, + "grad_norm": 0.2325924038887024, + "learning_rate": 8.807963765027359e-05, + "loss": 0.0258, + "step": 4077 + }, + { + "epoch": 0.5387587938038775, + "grad_norm": 0.15382160246372223, + "learning_rate": 8.803837406185686e-05, + "loss": 0.0154, + "step": 4078 + }, + { + "epoch": 0.5388909072893615, + "grad_norm": 0.1661686897277832, + "learning_rate": 8.799711253958733e-05, + "loss": 0.0154, + "step": 4079 + }, + { + "epoch": 0.5390230207748455, + "grad_norm": 0.16271163523197174, + "learning_rate": 8.79558530905922e-05, + "loss": 0.0139, + "step": 4080 + }, + { + "epoch": 0.5391551342603296, + "grad_norm": 0.28479182720184326, + "learning_rate": 8.791459572199827e-05, + "loss": 0.023, + "step": 4081 + }, + { + "epoch": 0.5392872477458136, + "grad_norm": 0.22662101686000824, + "learning_rate": 8.787334044093195e-05, + "loss": 0.0359, + "step": 4082 + }, + { + "epoch": 0.5394193612312976, + "grad_norm": 0.16315026581287384, + "learning_rate": 8.783208725451929e-05, + "loss": 0.0142, + "step": 4083 + }, + { + "epoch": 0.5395514747167817, + "grad_norm": 0.15125629305839539, + "learning_rate": 8.779083616988611e-05, + "loss": 0.0158, + "step": 4084 + }, + { + "epoch": 0.5396835882022657, + "grad_norm": 0.11059519648551941, + "learning_rate": 8.774958719415767e-05, + "loss": 0.0094, + "step": 4085 + }, + { + "epoch": 0.5398157016877497, + "grad_norm": 0.16920366883277893, + "learning_rate": 8.770834033445901e-05, + "loss": 0.0203, + "step": 4086 + }, + { + "epoch": 0.5399478151732338, + "grad_norm": 0.15966933965682983, + "learning_rate": 8.766709559791473e-05, + "loss": 0.0122, + "step": 4087 + }, + { + "epoch": 0.5400799286587178, + "grad_norm": 0.1668696254491806, + "learning_rate": 8.762585299164912e-05, + "loss": 0.0148, + "step": 4088 + }, + { + "epoch": 0.5402120421442018, + "grad_norm": 0.14590130746364594, + "learning_rate": 8.758461252278603e-05, + "loss": 0.0142, + "step": 4089 + }, + { + "epoch": 0.5403441556296859, + "grad_norm": 0.2944794297218323, + "learning_rate": 8.754337419844897e-05, + "loss": 0.0271, + "step": 4090 + }, + { + "epoch": 0.5404762691151699, + "grad_norm": 0.20897763967514038, + "learning_rate": 8.750213802576114e-05, + "loss": 0.0183, + "step": 4091 + }, + { + "epoch": 0.5406083826006539, + "grad_norm": 0.18604964017868042, + "learning_rate": 8.746090401184526e-05, + "loss": 0.0174, + "step": 4092 + }, + { + "epoch": 0.540740496086138, + "grad_norm": 0.2030273675918579, + "learning_rate": 8.74196721638238e-05, + "loss": 0.0216, + "step": 4093 + }, + { + "epoch": 0.540872609571622, + "grad_norm": 0.25758877396583557, + "learning_rate": 8.737844248881873e-05, + "loss": 0.0168, + "step": 4094 + }, + { + "epoch": 0.541004723057106, + "grad_norm": 0.18899019062519073, + "learning_rate": 8.733721499395174e-05, + "loss": 0.0111, + "step": 4095 + }, + { + "epoch": 0.54113683654259, + "grad_norm": 0.20160093903541565, + "learning_rate": 8.729598968634412e-05, + "loss": 0.0205, + "step": 4096 + }, + { + "epoch": 0.5412689500280741, + "grad_norm": 0.14404238760471344, + "learning_rate": 8.725476657311671e-05, + "loss": 0.0146, + "step": 4097 + }, + { + "epoch": 0.5414010635135581, + "grad_norm": 0.1566837728023529, + "learning_rate": 8.72135456613901e-05, + "loss": 0.0165, + "step": 4098 + }, + { + "epoch": 0.5415331769990421, + "grad_norm": 0.18331080675125122, + "learning_rate": 8.71723269582844e-05, + "loss": 0.0209, + "step": 4099 + }, + { + "epoch": 0.5416652904845262, + "grad_norm": 0.20835472643375397, + "learning_rate": 8.713111047091939e-05, + "loss": 0.0258, + "step": 4100 + }, + { + "epoch": 0.5417974039700102, + "grad_norm": 0.263864666223526, + "learning_rate": 8.70898962064144e-05, + "loss": 0.0237, + "step": 4101 + }, + { + "epoch": 0.5419295174554942, + "grad_norm": 0.20449554920196533, + "learning_rate": 8.704868417188849e-05, + "loss": 0.0284, + "step": 4102 + }, + { + "epoch": 0.5420616309409783, + "grad_norm": 0.22227758169174194, + "learning_rate": 8.700747437446023e-05, + "loss": 0.018, + "step": 4103 + }, + { + "epoch": 0.5421937444264623, + "grad_norm": 0.18106292188167572, + "learning_rate": 8.696626682124782e-05, + "loss": 0.0101, + "step": 4104 + }, + { + "epoch": 0.5423258579119463, + "grad_norm": 0.28230977058410645, + "learning_rate": 8.692506151936916e-05, + "loss": 0.0135, + "step": 4105 + }, + { + "epoch": 0.5424579713974303, + "grad_norm": 0.16065078973770142, + "learning_rate": 8.688385847594162e-05, + "loss": 0.0204, + "step": 4106 + }, + { + "epoch": 0.5425900848829144, + "grad_norm": 0.41624608635902405, + "learning_rate": 8.684265769808232e-05, + "loss": 0.0154, + "step": 4107 + }, + { + "epoch": 0.5427221983683984, + "grad_norm": 0.17645595967769623, + "learning_rate": 8.680145919290787e-05, + "loss": 0.0354, + "step": 4108 + }, + { + "epoch": 0.5428543118538824, + "grad_norm": 0.16414207220077515, + "learning_rate": 8.676026296753459e-05, + "loss": 0.0138, + "step": 4109 + }, + { + "epoch": 0.5429864253393665, + "grad_norm": 0.20613645017147064, + "learning_rate": 8.671906902907833e-05, + "loss": 0.0144, + "step": 4110 + }, + { + "epoch": 0.5431185388248505, + "grad_norm": 0.2929520010948181, + "learning_rate": 8.667787738465458e-05, + "loss": 0.0298, + "step": 4111 + }, + { + "epoch": 0.5432506523103345, + "grad_norm": 0.2482895404100418, + "learning_rate": 8.663668804137843e-05, + "loss": 0.0263, + "step": 4112 + }, + { + "epoch": 0.5433827657958186, + "grad_norm": 0.15026050806045532, + "learning_rate": 8.659550100636454e-05, + "loss": 0.0185, + "step": 4113 + }, + { + "epoch": 0.5435148792813026, + "grad_norm": 0.1841093897819519, + "learning_rate": 8.655431628672725e-05, + "loss": 0.0171, + "step": 4114 + }, + { + "epoch": 0.5436469927667866, + "grad_norm": 0.36599624156951904, + "learning_rate": 8.65131338895804e-05, + "loss": 0.0197, + "step": 4115 + }, + { + "epoch": 0.5437791062522707, + "grad_norm": 0.24211275577545166, + "learning_rate": 8.647195382203753e-05, + "loss": 0.025, + "step": 4116 + }, + { + "epoch": 0.5439112197377547, + "grad_norm": 0.20414648950099945, + "learning_rate": 8.643077609121168e-05, + "loss": 0.0254, + "step": 4117 + }, + { + "epoch": 0.5440433332232387, + "grad_norm": 0.15253716707229614, + "learning_rate": 8.638960070421554e-05, + "loss": 0.0194, + "step": 4118 + }, + { + "epoch": 0.5441754467087228, + "grad_norm": 0.19736583530902863, + "learning_rate": 8.634842766816143e-05, + "loss": 0.0179, + "step": 4119 + }, + { + "epoch": 0.5443075601942068, + "grad_norm": 0.06327956169843674, + "learning_rate": 8.630725699016118e-05, + "loss": 0.0053, + "step": 4120 + }, + { + "epoch": 0.5444396736796908, + "grad_norm": 0.19995535910129547, + "learning_rate": 8.626608867732627e-05, + "loss": 0.0205, + "step": 4121 + }, + { + "epoch": 0.5445717871651748, + "grad_norm": 0.2537860572338104, + "learning_rate": 8.622492273676774e-05, + "loss": 0.0158, + "step": 4122 + }, + { + "epoch": 0.5447039006506589, + "grad_norm": 0.180791437625885, + "learning_rate": 8.618375917559627e-05, + "loss": 0.0214, + "step": 4123 + }, + { + "epoch": 0.5448360141361429, + "grad_norm": 0.1629365086555481, + "learning_rate": 8.614259800092209e-05, + "loss": 0.0267, + "step": 4124 + }, + { + "epoch": 0.5449681276216269, + "grad_norm": 0.12369240075349808, + "learning_rate": 8.610143921985498e-05, + "loss": 0.0117, + "step": 4125 + }, + { + "epoch": 0.545100241107111, + "grad_norm": 0.16354741156101227, + "learning_rate": 8.606028283950441e-05, + "loss": 0.0138, + "step": 4126 + }, + { + "epoch": 0.545232354592595, + "grad_norm": 0.12069833278656006, + "learning_rate": 8.60191288669793e-05, + "loss": 0.0123, + "step": 4127 + }, + { + "epoch": 0.545364468078079, + "grad_norm": 0.1605876088142395, + "learning_rate": 8.59779773093883e-05, + "loss": 0.0168, + "step": 4128 + }, + { + "epoch": 0.5454965815635631, + "grad_norm": 0.21643571555614471, + "learning_rate": 8.593682817383955e-05, + "loss": 0.0171, + "step": 4129 + }, + { + "epoch": 0.5456286950490471, + "grad_norm": 0.10620002448558807, + "learning_rate": 8.589568146744078e-05, + "loss": 0.0105, + "step": 4130 + }, + { + "epoch": 0.5457608085345311, + "grad_norm": 0.21386931836605072, + "learning_rate": 8.585453719729935e-05, + "loss": 0.0162, + "step": 4131 + }, + { + "epoch": 0.5458929220200152, + "grad_norm": 0.18839512765407562, + "learning_rate": 8.58133953705221e-05, + "loss": 0.024, + "step": 4132 + }, + { + "epoch": 0.5460250355054992, + "grad_norm": 0.2717703878879547, + "learning_rate": 8.577225599421558e-05, + "loss": 0.0203, + "step": 4133 + }, + { + "epoch": 0.5461571489909832, + "grad_norm": 0.16906102001667023, + "learning_rate": 8.573111907548578e-05, + "loss": 0.0206, + "step": 4134 + }, + { + "epoch": 0.5462892624764673, + "grad_norm": 1.166779637336731, + "learning_rate": 8.56899846214384e-05, + "loss": 0.0125, + "step": 4135 + }, + { + "epoch": 0.5464213759619513, + "grad_norm": 0.21797332167625427, + "learning_rate": 8.564885263917861e-05, + "loss": 0.0299, + "step": 4136 + }, + { + "epoch": 0.5465534894474353, + "grad_norm": 0.11769171804189682, + "learning_rate": 8.56077231358112e-05, + "loss": 0.0096, + "step": 4137 + }, + { + "epoch": 0.5466856029329193, + "grad_norm": 0.18901102244853973, + "learning_rate": 8.556659611844054e-05, + "loss": 0.0107, + "step": 4138 + }, + { + "epoch": 0.5468177164184034, + "grad_norm": 0.16076532006263733, + "learning_rate": 8.55254715941705e-05, + "loss": 0.0184, + "step": 4139 + }, + { + "epoch": 0.5469498299038874, + "grad_norm": 0.3321286141872406, + "learning_rate": 8.548434957010464e-05, + "loss": 0.0244, + "step": 4140 + }, + { + "epoch": 0.5470819433893714, + "grad_norm": 0.23241344094276428, + "learning_rate": 8.544323005334596e-05, + "loss": 0.017, + "step": 4141 + }, + { + "epoch": 0.5472140568748555, + "grad_norm": 0.17156289517879486, + "learning_rate": 8.54021130509971e-05, + "loss": 0.0198, + "step": 4142 + }, + { + "epoch": 0.5473461703603395, + "grad_norm": 0.1776096373796463, + "learning_rate": 8.536099857016031e-05, + "loss": 0.0207, + "step": 4143 + }, + { + "epoch": 0.5474782838458235, + "grad_norm": 0.17695477604866028, + "learning_rate": 8.531988661793729e-05, + "loss": 0.0239, + "step": 4144 + }, + { + "epoch": 0.5476103973313076, + "grad_norm": 0.1847931146621704, + "learning_rate": 8.52787772014294e-05, + "loss": 0.0209, + "step": 4145 + }, + { + "epoch": 0.5477425108167916, + "grad_norm": 0.10361135005950928, + "learning_rate": 8.52376703277375e-05, + "loss": 0.0096, + "step": 4146 + }, + { + "epoch": 0.5478746243022756, + "grad_norm": 0.19445562362670898, + "learning_rate": 8.5196566003962e-05, + "loss": 0.0168, + "step": 4147 + }, + { + "epoch": 0.5480067377877597, + "grad_norm": 0.24059665203094482, + "learning_rate": 8.515546423720298e-05, + "loss": 0.0118, + "step": 4148 + }, + { + "epoch": 0.5481388512732437, + "grad_norm": 0.201985165476799, + "learning_rate": 8.511436503455994e-05, + "loss": 0.0134, + "step": 4149 + }, + { + "epoch": 0.5482709647587277, + "grad_norm": 0.21374210715293884, + "learning_rate": 8.507326840313204e-05, + "loss": 0.0167, + "step": 4150 + }, + { + "epoch": 0.5484030782442118, + "grad_norm": 0.16954916715621948, + "learning_rate": 8.503217435001788e-05, + "loss": 0.0146, + "step": 4151 + }, + { + "epoch": 0.5485351917296958, + "grad_norm": 0.21463601291179657, + "learning_rate": 8.499108288231581e-05, + "loss": 0.0216, + "step": 4152 + }, + { + "epoch": 0.5486673052151798, + "grad_norm": 0.21518591046333313, + "learning_rate": 8.494999400712352e-05, + "loss": 0.0226, + "step": 4153 + }, + { + "epoch": 0.5487994187006638, + "grad_norm": 0.20929618179798126, + "learning_rate": 8.490890773153835e-05, + "loss": 0.0154, + "step": 4154 + }, + { + "epoch": 0.5489315321861479, + "grad_norm": 0.27093571424484253, + "learning_rate": 8.486782406265724e-05, + "loss": 0.0306, + "step": 4155 + }, + { + "epoch": 0.5490636456716319, + "grad_norm": 0.14851485192775726, + "learning_rate": 8.482674300757657e-05, + "loss": 0.0174, + "step": 4156 + }, + { + "epoch": 0.5491957591571159, + "grad_norm": 0.11216623336076736, + "learning_rate": 8.478566457339237e-05, + "loss": 0.0087, + "step": 4157 + }, + { + "epoch": 0.5493278726426, + "grad_norm": 0.14842119812965393, + "learning_rate": 8.474458876720011e-05, + "loss": 0.0114, + "step": 4158 + }, + { + "epoch": 0.549459986128084, + "grad_norm": 0.27710989117622375, + "learning_rate": 8.470351559609494e-05, + "loss": 0.0226, + "step": 4159 + }, + { + "epoch": 0.549592099613568, + "grad_norm": 0.16247709095478058, + "learning_rate": 8.466244506717146e-05, + "loss": 0.0166, + "step": 4160 + }, + { + "epoch": 0.5497242130990521, + "grad_norm": 0.22627033293247223, + "learning_rate": 8.462137718752378e-05, + "loss": 0.0114, + "step": 4161 + }, + { + "epoch": 0.5498563265845361, + "grad_norm": 0.12567301094532013, + "learning_rate": 8.458031196424569e-05, + "loss": 0.0117, + "step": 4162 + }, + { + "epoch": 0.5499884400700201, + "grad_norm": 0.11331895738840103, + "learning_rate": 8.453924940443037e-05, + "loss": 0.0095, + "step": 4163 + }, + { + "epoch": 0.5501205535555042, + "grad_norm": 0.13814541697502136, + "learning_rate": 8.449818951517068e-05, + "loss": 0.0152, + "step": 4164 + }, + { + "epoch": 0.5502526670409882, + "grad_norm": 0.15670281648635864, + "learning_rate": 8.445713230355888e-05, + "loss": 0.0152, + "step": 4165 + }, + { + "epoch": 0.5503847805264722, + "grad_norm": 0.1425250917673111, + "learning_rate": 8.441607777668688e-05, + "loss": 0.0143, + "step": 4166 + }, + { + "epoch": 0.5505168940119562, + "grad_norm": 0.16183023154735565, + "learning_rate": 8.437502594164607e-05, + "loss": 0.0121, + "step": 4167 + }, + { + "epoch": 0.5506490074974403, + "grad_norm": 0.18989704549312592, + "learning_rate": 8.433397680552735e-05, + "loss": 0.0173, + "step": 4168 + }, + { + "epoch": 0.5507811209829243, + "grad_norm": 0.24186430871486664, + "learning_rate": 8.429293037542127e-05, + "loss": 0.0211, + "step": 4169 + }, + { + "epoch": 0.5509132344684083, + "grad_norm": 0.1806151121854782, + "learning_rate": 8.425188665841775e-05, + "loss": 0.0216, + "step": 4170 + }, + { + "epoch": 0.5510453479538924, + "grad_norm": 0.2237834483385086, + "learning_rate": 8.42108456616064e-05, + "loss": 0.0426, + "step": 4171 + }, + { + "epoch": 0.5511774614393764, + "grad_norm": 0.23168422281742096, + "learning_rate": 8.416980739207621e-05, + "loss": 0.0285, + "step": 4172 + }, + { + "epoch": 0.5513095749248604, + "grad_norm": 0.14686016738414764, + "learning_rate": 8.412877185691584e-05, + "loss": 0.0128, + "step": 4173 + }, + { + "epoch": 0.5514416884103445, + "grad_norm": 0.14953650534152985, + "learning_rate": 8.408773906321339e-05, + "loss": 0.0159, + "step": 4174 + }, + { + "epoch": 0.5515738018958285, + "grad_norm": 0.21378695964813232, + "learning_rate": 8.404670901805647e-05, + "loss": 0.022, + "step": 4175 + }, + { + "epoch": 0.5517059153813125, + "grad_norm": 0.22658562660217285, + "learning_rate": 8.400568172853232e-05, + "loss": 0.0148, + "step": 4176 + }, + { + "epoch": 0.5518380288667966, + "grad_norm": 0.25272002816200256, + "learning_rate": 8.396465720172755e-05, + "loss": 0.0254, + "step": 4177 + }, + { + "epoch": 0.5519701423522806, + "grad_norm": 0.18372265994548798, + "learning_rate": 8.392363544472848e-05, + "loss": 0.0182, + "step": 4178 + }, + { + "epoch": 0.5521022558377646, + "grad_norm": 0.2196115404367447, + "learning_rate": 8.388261646462077e-05, + "loss": 0.024, + "step": 4179 + }, + { + "epoch": 0.5522343693232487, + "grad_norm": 0.19392231106758118, + "learning_rate": 8.384160026848974e-05, + "loss": 0.023, + "step": 4180 + }, + { + "epoch": 0.5523664828087327, + "grad_norm": 0.1629837453365326, + "learning_rate": 8.380058686342014e-05, + "loss": 0.0183, + "step": 4181 + }, + { + "epoch": 0.5524985962942167, + "grad_norm": 0.28126442432403564, + "learning_rate": 8.375957625649627e-05, + "loss": 0.0321, + "step": 4182 + }, + { + "epoch": 0.5526307097797007, + "grad_norm": 0.2287890464067459, + "learning_rate": 8.371856845480195e-05, + "loss": 0.0204, + "step": 4183 + }, + { + "epoch": 0.5527628232651848, + "grad_norm": 0.18556039035320282, + "learning_rate": 8.36775634654205e-05, + "loss": 0.0273, + "step": 4184 + }, + { + "epoch": 0.5528949367506688, + "grad_norm": 0.21663334965705872, + "learning_rate": 8.363656129543478e-05, + "loss": 0.0238, + "step": 4185 + }, + { + "epoch": 0.5530270502361528, + "grad_norm": 0.24293342232704163, + "learning_rate": 8.359556195192715e-05, + "loss": 0.0249, + "step": 4186 + }, + { + "epoch": 0.5531591637216369, + "grad_norm": 0.18286862969398499, + "learning_rate": 8.355456544197949e-05, + "loss": 0.0322, + "step": 4187 + }, + { + "epoch": 0.5532912772071209, + "grad_norm": 0.12769746780395508, + "learning_rate": 8.351357177267317e-05, + "loss": 0.0161, + "step": 4188 + }, + { + "epoch": 0.5534233906926049, + "grad_norm": 0.17266884446144104, + "learning_rate": 8.347258095108902e-05, + "loss": 0.0196, + "step": 4189 + }, + { + "epoch": 0.553555504178089, + "grad_norm": 0.2900910973548889, + "learning_rate": 8.343159298430755e-05, + "loss": 0.0199, + "step": 4190 + }, + { + "epoch": 0.553687617663573, + "grad_norm": 0.2615102529525757, + "learning_rate": 8.339060787940858e-05, + "loss": 0.0328, + "step": 4191 + }, + { + "epoch": 0.553819731149057, + "grad_norm": 0.19165071845054626, + "learning_rate": 8.334962564347156e-05, + "loss": 0.0214, + "step": 4192 + }, + { + "epoch": 0.5539518446345411, + "grad_norm": 0.2141731232404709, + "learning_rate": 8.330864628357537e-05, + "loss": 0.0236, + "step": 4193 + }, + { + "epoch": 0.5540839581200251, + "grad_norm": 0.471525639295578, + "learning_rate": 8.326766980679849e-05, + "loss": 0.0303, + "step": 4194 + }, + { + "epoch": 0.5542160716055091, + "grad_norm": 0.15691663324832916, + "learning_rate": 8.32266962202188e-05, + "loss": 0.0087, + "step": 4195 + }, + { + "epoch": 0.5543481850909932, + "grad_norm": 0.1748238503932953, + "learning_rate": 8.318572553091368e-05, + "loss": 0.0155, + "step": 4196 + }, + { + "epoch": 0.5544802985764772, + "grad_norm": 0.2593716084957123, + "learning_rate": 8.314475774596014e-05, + "loss": 0.0246, + "step": 4197 + }, + { + "epoch": 0.5546124120619612, + "grad_norm": 0.2592805325984955, + "learning_rate": 8.31037928724345e-05, + "loss": 0.0277, + "step": 4198 + }, + { + "epoch": 0.5547445255474452, + "grad_norm": 0.15535563230514526, + "learning_rate": 8.306283091741278e-05, + "loss": 0.0171, + "step": 4199 + }, + { + "epoch": 0.5548766390329293, + "grad_norm": 0.18834006786346436, + "learning_rate": 8.302187188797029e-05, + "loss": 0.0163, + "step": 4200 + }, + { + "epoch": 0.5550087525184133, + "grad_norm": 0.1360701322555542, + "learning_rate": 8.2980915791182e-05, + "loss": 0.013, + "step": 4201 + }, + { + "epoch": 0.5551408660038973, + "grad_norm": 0.15888836979866028, + "learning_rate": 8.293996263412233e-05, + "loss": 0.0106, + "step": 4202 + }, + { + "epoch": 0.5552729794893814, + "grad_norm": 0.13162779808044434, + "learning_rate": 8.289901242386513e-05, + "loss": 0.0194, + "step": 4203 + }, + { + "epoch": 0.5554050929748654, + "grad_norm": 0.28755760192871094, + "learning_rate": 8.285806516748377e-05, + "loss": 0.0243, + "step": 4204 + }, + { + "epoch": 0.5555372064603494, + "grad_norm": 0.22045432031154633, + "learning_rate": 8.281712087205115e-05, + "loss": 0.0285, + "step": 4205 + }, + { + "epoch": 0.5556693199458335, + "grad_norm": 0.19695398211479187, + "learning_rate": 8.277617954463964e-05, + "loss": 0.0201, + "step": 4206 + }, + { + "epoch": 0.5558014334313175, + "grad_norm": 0.17969001829624176, + "learning_rate": 8.273524119232108e-05, + "loss": 0.0223, + "step": 4207 + }, + { + "epoch": 0.5559335469168015, + "grad_norm": 0.1344221979379654, + "learning_rate": 8.269430582216678e-05, + "loss": 0.0181, + "step": 4208 + }, + { + "epoch": 0.5560656604022856, + "grad_norm": 0.2272825688123703, + "learning_rate": 8.26533734412476e-05, + "loss": 0.0226, + "step": 4209 + }, + { + "epoch": 0.5561977738877696, + "grad_norm": 0.17867231369018555, + "learning_rate": 8.261244405663382e-05, + "loss": 0.0221, + "step": 4210 + }, + { + "epoch": 0.5563298873732536, + "grad_norm": 0.1968875676393509, + "learning_rate": 8.25715176753952e-05, + "loss": 0.019, + "step": 4211 + }, + { + "epoch": 0.5564620008587376, + "grad_norm": 0.1409069150686264, + "learning_rate": 8.253059430460108e-05, + "loss": 0.0194, + "step": 4212 + }, + { + "epoch": 0.5565941143442217, + "grad_norm": 0.19733133912086487, + "learning_rate": 8.248967395132013e-05, + "loss": 0.0168, + "step": 4213 + }, + { + "epoch": 0.5567262278297057, + "grad_norm": 0.3398911654949188, + "learning_rate": 8.244875662262064e-05, + "loss": 0.0266, + "step": 4214 + }, + { + "epoch": 0.5568583413151897, + "grad_norm": 0.1569708287715912, + "learning_rate": 8.240784232557024e-05, + "loss": 0.0151, + "step": 4215 + }, + { + "epoch": 0.5569904548006738, + "grad_norm": 0.12734025716781616, + "learning_rate": 8.23669310672362e-05, + "loss": 0.0092, + "step": 4216 + }, + { + "epoch": 0.5571225682861578, + "grad_norm": 0.18991941213607788, + "learning_rate": 8.232602285468512e-05, + "loss": 0.0159, + "step": 4217 + }, + { + "epoch": 0.5572546817716418, + "grad_norm": 0.14975979924201965, + "learning_rate": 8.22851176949831e-05, + "loss": 0.0202, + "step": 4218 + }, + { + "epoch": 0.5573867952571259, + "grad_norm": 0.17158475518226624, + "learning_rate": 8.224421559519581e-05, + "loss": 0.0249, + "step": 4219 + }, + { + "epoch": 0.5575189087426099, + "grad_norm": 0.08295943588018417, + "learning_rate": 8.220331656238827e-05, + "loss": 0.0058, + "step": 4220 + }, + { + "epoch": 0.5576510222280939, + "grad_norm": 0.13717104494571686, + "learning_rate": 8.216242060362507e-05, + "loss": 0.0139, + "step": 4221 + }, + { + "epoch": 0.557783135713578, + "grad_norm": 0.2490658164024353, + "learning_rate": 8.212152772597018e-05, + "loss": 0.008, + "step": 4222 + }, + { + "epoch": 0.557915249199062, + "grad_norm": 0.17430169880390167, + "learning_rate": 8.208063793648711e-05, + "loss": 0.0185, + "step": 4223 + }, + { + "epoch": 0.558047362684546, + "grad_norm": 0.19915857911109924, + "learning_rate": 8.203975124223878e-05, + "loss": 0.018, + "step": 4224 + }, + { + "epoch": 0.55817947617003, + "grad_norm": 0.14223940670490265, + "learning_rate": 8.199886765028762e-05, + "loss": 0.0127, + "step": 4225 + }, + { + "epoch": 0.5583115896555141, + "grad_norm": 0.13848844170570374, + "learning_rate": 8.19579871676955e-05, + "loss": 0.017, + "step": 4226 + }, + { + "epoch": 0.5584437031409981, + "grad_norm": 0.16251258552074432, + "learning_rate": 8.191710980152374e-05, + "loss": 0.0156, + "step": 4227 + }, + { + "epoch": 0.5585758166264821, + "grad_norm": 0.24074551463127136, + "learning_rate": 8.187623555883321e-05, + "loss": 0.0162, + "step": 4228 + }, + { + "epoch": 0.5587079301119662, + "grad_norm": 0.1603114753961563, + "learning_rate": 8.183536444668407e-05, + "loss": 0.0222, + "step": 4229 + }, + { + "epoch": 0.5588400435974502, + "grad_norm": 0.37383100390434265, + "learning_rate": 8.179449647213613e-05, + "loss": 0.0278, + "step": 4230 + }, + { + "epoch": 0.5589721570829342, + "grad_norm": 0.1535310596227646, + "learning_rate": 8.175363164224853e-05, + "loss": 0.0116, + "step": 4231 + }, + { + "epoch": 0.5591042705684183, + "grad_norm": 0.16033555567264557, + "learning_rate": 8.171276996407989e-05, + "loss": 0.0128, + "step": 4232 + }, + { + "epoch": 0.5592363840539023, + "grad_norm": 0.17384503781795502, + "learning_rate": 8.167191144468832e-05, + "loss": 0.0182, + "step": 4233 + }, + { + "epoch": 0.5593684975393863, + "grad_norm": 0.1818269044160843, + "learning_rate": 8.163105609113135e-05, + "loss": 0.0169, + "step": 4234 + }, + { + "epoch": 0.5595006110248704, + "grad_norm": 0.12374034523963928, + "learning_rate": 8.159020391046601e-05, + "loss": 0.0147, + "step": 4235 + }, + { + "epoch": 0.5596327245103544, + "grad_norm": 0.13295878469944, + "learning_rate": 8.154935490974873e-05, + "loss": 0.0155, + "step": 4236 + }, + { + "epoch": 0.5597648379958384, + "grad_norm": 0.25557607412338257, + "learning_rate": 8.150850909603541e-05, + "loss": 0.0318, + "step": 4237 + }, + { + "epoch": 0.5598969514813225, + "grad_norm": 0.2126423716545105, + "learning_rate": 8.146766647638142e-05, + "loss": 0.0335, + "step": 4238 + }, + { + "epoch": 0.5600290649668065, + "grad_norm": 0.19664879143238068, + "learning_rate": 8.14268270578415e-05, + "loss": 0.03, + "step": 4239 + }, + { + "epoch": 0.5601611784522905, + "grad_norm": 0.1645100712776184, + "learning_rate": 8.138599084746998e-05, + "loss": 0.0226, + "step": 4240 + }, + { + "epoch": 0.5602932919377746, + "grad_norm": 0.1636127531528473, + "learning_rate": 8.134515785232049e-05, + "loss": 0.0279, + "step": 4241 + }, + { + "epoch": 0.5604254054232586, + "grad_norm": 0.1539396494626999, + "learning_rate": 8.130432807944618e-05, + "loss": 0.0113, + "step": 4242 + }, + { + "epoch": 0.5605575189087426, + "grad_norm": 0.16480429470539093, + "learning_rate": 8.126350153589964e-05, + "loss": 0.0306, + "step": 4243 + }, + { + "epoch": 0.5606896323942266, + "grad_norm": 0.16860733926296234, + "learning_rate": 8.12226782287329e-05, + "loss": 0.0256, + "step": 4244 + }, + { + "epoch": 0.5608217458797107, + "grad_norm": 0.19238042831420898, + "learning_rate": 8.118185816499743e-05, + "loss": 0.0203, + "step": 4245 + }, + { + "epoch": 0.5609538593651947, + "grad_norm": 0.176408052444458, + "learning_rate": 8.114104135174408e-05, + "loss": 0.0273, + "step": 4246 + }, + { + "epoch": 0.5610859728506787, + "grad_norm": 0.19235467910766602, + "learning_rate": 8.110022779602323e-05, + "loss": 0.02, + "step": 4247 + }, + { + "epoch": 0.5612180863361628, + "grad_norm": 0.1220388188958168, + "learning_rate": 8.105941750488465e-05, + "loss": 0.008, + "step": 4248 + }, + { + "epoch": 0.5613501998216468, + "grad_norm": 0.15233349800109863, + "learning_rate": 8.101861048537757e-05, + "loss": 0.0237, + "step": 4249 + }, + { + "epoch": 0.5614823133071308, + "grad_norm": 0.2459915727376938, + "learning_rate": 8.097780674455062e-05, + "loss": 0.0249, + "step": 4250 + }, + { + "epoch": 0.5616144267926149, + "grad_norm": 0.14926408231258392, + "learning_rate": 8.093700628945191e-05, + "loss": 0.016, + "step": 4251 + }, + { + "epoch": 0.5617465402780989, + "grad_norm": 0.2099609375, + "learning_rate": 8.089620912712894e-05, + "loss": 0.0302, + "step": 4252 + }, + { + "epoch": 0.5618786537635829, + "grad_norm": 0.24913211166858673, + "learning_rate": 8.085541526462862e-05, + "loss": 0.0193, + "step": 4253 + }, + { + "epoch": 0.562010767249067, + "grad_norm": 0.1699565201997757, + "learning_rate": 8.081462470899738e-05, + "loss": 0.0135, + "step": 4254 + }, + { + "epoch": 0.562142880734551, + "grad_norm": 0.14638663828372955, + "learning_rate": 8.077383746728101e-05, + "loss": 0.0176, + "step": 4255 + }, + { + "epoch": 0.562274994220035, + "grad_norm": 0.22291651368141174, + "learning_rate": 8.073305354652475e-05, + "loss": 0.0208, + "step": 4256 + }, + { + "epoch": 0.562407107705519, + "grad_norm": 0.1744988113641739, + "learning_rate": 8.069227295377322e-05, + "loss": 0.0161, + "step": 4257 + }, + { + "epoch": 0.5625392211910031, + "grad_norm": 0.17007562518119812, + "learning_rate": 8.065149569607057e-05, + "loss": 0.029, + "step": 4258 + }, + { + "epoch": 0.5626713346764871, + "grad_norm": 0.19890165328979492, + "learning_rate": 8.061072178046023e-05, + "loss": 0.0136, + "step": 4259 + }, + { + "epoch": 0.5628034481619711, + "grad_norm": 0.19456276297569275, + "learning_rate": 8.05699512139852e-05, + "loss": 0.0153, + "step": 4260 + }, + { + "epoch": 0.5629355616474552, + "grad_norm": 0.21431830525398254, + "learning_rate": 8.05291840036878e-05, + "loss": 0.02, + "step": 4261 + }, + { + "epoch": 0.5630676751329392, + "grad_norm": 0.1389990746974945, + "learning_rate": 8.048842015660984e-05, + "loss": 0.017, + "step": 4262 + }, + { + "epoch": 0.5631997886184232, + "grad_norm": 0.11417852342128754, + "learning_rate": 8.044765967979247e-05, + "loss": 0.0113, + "step": 4263 + }, + { + "epoch": 0.5633319021039073, + "grad_norm": 0.23878905177116394, + "learning_rate": 8.040690258027632e-05, + "loss": 0.0196, + "step": 4264 + }, + { + "epoch": 0.5634640155893913, + "grad_norm": 0.2255990356206894, + "learning_rate": 8.03661488651014e-05, + "loss": 0.0174, + "step": 4265 + }, + { + "epoch": 0.5635961290748753, + "grad_norm": 0.2871243357658386, + "learning_rate": 8.032539854130719e-05, + "loss": 0.0168, + "step": 4266 + }, + { + "epoch": 0.5637282425603594, + "grad_norm": 0.3738422393798828, + "learning_rate": 8.028465161593251e-05, + "loss": 0.0372, + "step": 4267 + }, + { + "epoch": 0.5638603560458434, + "grad_norm": 0.19977661967277527, + "learning_rate": 8.02439080960156e-05, + "loss": 0.0156, + "step": 4268 + }, + { + "epoch": 0.5639924695313274, + "grad_norm": 0.2826353907585144, + "learning_rate": 8.020316798859424e-05, + "loss": 0.0253, + "step": 4269 + }, + { + "epoch": 0.5641245830168115, + "grad_norm": 0.1400228589773178, + "learning_rate": 8.016243130070542e-05, + "loss": 0.015, + "step": 4270 + }, + { + "epoch": 0.5642566965022955, + "grad_norm": 0.14575998485088348, + "learning_rate": 8.012169803938572e-05, + "loss": 0.0108, + "step": 4271 + }, + { + "epoch": 0.5643888099877795, + "grad_norm": 0.4165917932987213, + "learning_rate": 8.008096821167097e-05, + "loss": 0.0185, + "step": 4272 + }, + { + "epoch": 0.5645209234732635, + "grad_norm": 0.1631278544664383, + "learning_rate": 8.004024182459657e-05, + "loss": 0.0149, + "step": 4273 + }, + { + "epoch": 0.5646530369587476, + "grad_norm": 0.22125643491744995, + "learning_rate": 7.99995188851972e-05, + "loss": 0.0294, + "step": 4274 + }, + { + "epoch": 0.5647851504442316, + "grad_norm": 0.12258799374103546, + "learning_rate": 7.995879940050695e-05, + "loss": 0.017, + "step": 4275 + }, + { + "epoch": 0.5649172639297156, + "grad_norm": 0.11797162145376205, + "learning_rate": 7.991808337755944e-05, + "loss": 0.0111, + "step": 4276 + }, + { + "epoch": 0.5650493774151997, + "grad_norm": 0.1532120555639267, + "learning_rate": 7.98773708233875e-05, + "loss": 0.0177, + "step": 4277 + }, + { + "epoch": 0.5651814909006837, + "grad_norm": 0.24839898943901062, + "learning_rate": 7.983666174502355e-05, + "loss": 0.0164, + "step": 4278 + }, + { + "epoch": 0.5653136043861677, + "grad_norm": 0.19382832944393158, + "learning_rate": 7.979595614949925e-05, + "loss": 0.0235, + "step": 4279 + }, + { + "epoch": 0.5654457178716518, + "grad_norm": 0.11616852134466171, + "learning_rate": 7.97552540438458e-05, + "loss": 0.0089, + "step": 4280 + }, + { + "epoch": 0.5655778313571358, + "grad_norm": 0.24005387723445892, + "learning_rate": 7.971455543509367e-05, + "loss": 0.0167, + "step": 4281 + }, + { + "epoch": 0.5657099448426198, + "grad_norm": 0.14314687252044678, + "learning_rate": 7.967386033027281e-05, + "loss": 0.0177, + "step": 4282 + }, + { + "epoch": 0.5658420583281039, + "grad_norm": 0.23170839250087738, + "learning_rate": 7.963316873641254e-05, + "loss": 0.0205, + "step": 4283 + }, + { + "epoch": 0.5659741718135879, + "grad_norm": 0.4849940538406372, + "learning_rate": 7.959248066054155e-05, + "loss": 0.057, + "step": 4284 + }, + { + "epoch": 0.5661062852990719, + "grad_norm": 0.17027738690376282, + "learning_rate": 7.955179610968799e-05, + "loss": 0.0163, + "step": 4285 + }, + { + "epoch": 0.566238398784556, + "grad_norm": 0.12950539588928223, + "learning_rate": 7.95111150908793e-05, + "loss": 0.0125, + "step": 4286 + }, + { + "epoch": 0.56637051227004, + "grad_norm": 0.18599557876586914, + "learning_rate": 7.947043761114241e-05, + "loss": 0.0203, + "step": 4287 + }, + { + "epoch": 0.566502625755524, + "grad_norm": 0.1637999415397644, + "learning_rate": 7.942976367750357e-05, + "loss": 0.0165, + "step": 4288 + }, + { + "epoch": 0.566634739241008, + "grad_norm": 0.21928507089614868, + "learning_rate": 7.938909329698844e-05, + "loss": 0.0244, + "step": 4289 + }, + { + "epoch": 0.5667668527264921, + "grad_norm": 0.15104596316814423, + "learning_rate": 7.934842647662208e-05, + "loss": 0.0174, + "step": 4290 + }, + { + "epoch": 0.5668989662119761, + "grad_norm": 0.16194121539592743, + "learning_rate": 7.930776322342892e-05, + "loss": 0.0125, + "step": 4291 + }, + { + "epoch": 0.5670310796974601, + "grad_norm": 0.12635424733161926, + "learning_rate": 7.926710354443278e-05, + "loss": 0.0162, + "step": 4292 + }, + { + "epoch": 0.5671631931829442, + "grad_norm": 0.1685585230588913, + "learning_rate": 7.922644744665684e-05, + "loss": 0.0245, + "step": 4293 + }, + { + "epoch": 0.5672953066684282, + "grad_norm": 0.1646813303232193, + "learning_rate": 7.91857949371237e-05, + "loss": 0.0138, + "step": 4294 + }, + { + "epoch": 0.5674274201539122, + "grad_norm": 0.2017247974872589, + "learning_rate": 7.914514602285534e-05, + "loss": 0.0163, + "step": 4295 + }, + { + "epoch": 0.5675595336393963, + "grad_norm": 0.1613525003194809, + "learning_rate": 7.910450071087303e-05, + "loss": 0.0177, + "step": 4296 + }, + { + "epoch": 0.5676916471248803, + "grad_norm": 0.1867039054632187, + "learning_rate": 7.906385900819757e-05, + "loss": 0.0201, + "step": 4297 + }, + { + "epoch": 0.5678237606103643, + "grad_norm": 0.17872785031795502, + "learning_rate": 7.902322092184899e-05, + "loss": 0.0206, + "step": 4298 + }, + { + "epoch": 0.5679558740958484, + "grad_norm": 0.12369637936353683, + "learning_rate": 7.898258645884681e-05, + "loss": 0.0104, + "step": 4299 + }, + { + "epoch": 0.5680879875813324, + "grad_norm": 0.2565552294254303, + "learning_rate": 7.894195562620983e-05, + "loss": 0.0167, + "step": 4300 + }, + { + "epoch": 0.5682201010668164, + "grad_norm": 0.3674928843975067, + "learning_rate": 7.890132843095631e-05, + "loss": 0.0251, + "step": 4301 + }, + { + "epoch": 0.5683522145523004, + "grad_norm": 0.14947600662708282, + "learning_rate": 7.886070488010382e-05, + "loss": 0.011, + "step": 4302 + }, + { + "epoch": 0.5684843280377845, + "grad_norm": 0.18126291036605835, + "learning_rate": 7.882008498066928e-05, + "loss": 0.0196, + "step": 4303 + }, + { + "epoch": 0.5686164415232685, + "grad_norm": 0.159513458609581, + "learning_rate": 7.87794687396691e-05, + "loss": 0.0181, + "step": 4304 + }, + { + "epoch": 0.5687485550087525, + "grad_norm": 0.2177513837814331, + "learning_rate": 7.873885616411888e-05, + "loss": 0.0315, + "step": 4305 + }, + { + "epoch": 0.5688806684942366, + "grad_norm": 0.2580743432044983, + "learning_rate": 7.869824726103376e-05, + "loss": 0.017, + "step": 4306 + }, + { + "epoch": 0.5690127819797206, + "grad_norm": 0.16138285398483276, + "learning_rate": 7.865764203742813e-05, + "loss": 0.0165, + "step": 4307 + }, + { + "epoch": 0.5691448954652046, + "grad_norm": 0.14814524352550507, + "learning_rate": 7.861704050031583e-05, + "loss": 0.0224, + "step": 4308 + }, + { + "epoch": 0.5692770089506887, + "grad_norm": 0.15521246194839478, + "learning_rate": 7.857644265670994e-05, + "loss": 0.0187, + "step": 4309 + }, + { + "epoch": 0.5694091224361727, + "grad_norm": 0.11792432516813278, + "learning_rate": 7.853584851362302e-05, + "loss": 0.0113, + "step": 4310 + }, + { + "epoch": 0.5695412359216567, + "grad_norm": 0.17021100223064423, + "learning_rate": 7.849525807806697e-05, + "loss": 0.0143, + "step": 4311 + }, + { + "epoch": 0.5696733494071408, + "grad_norm": 0.18499933183193207, + "learning_rate": 7.845467135705298e-05, + "loss": 0.0183, + "step": 4312 + }, + { + "epoch": 0.5698054628926248, + "grad_norm": 0.1422792673110962, + "learning_rate": 7.84140883575917e-05, + "loss": 0.0179, + "step": 4313 + }, + { + "epoch": 0.5699375763781088, + "grad_norm": 0.17566844820976257, + "learning_rate": 7.837350908669302e-05, + "loss": 0.0133, + "step": 4314 + }, + { + "epoch": 0.5700696898635929, + "grad_norm": 0.19467930495738983, + "learning_rate": 7.833293355136635e-05, + "loss": 0.0183, + "step": 4315 + }, + { + "epoch": 0.5702018033490769, + "grad_norm": 0.12969674170017242, + "learning_rate": 7.829236175862027e-05, + "loss": 0.0142, + "step": 4316 + }, + { + "epoch": 0.5703339168345609, + "grad_norm": 0.10846174508333206, + "learning_rate": 7.825179371546277e-05, + "loss": 0.0071, + "step": 4317 + }, + { + "epoch": 0.570466030320045, + "grad_norm": 0.5153217315673828, + "learning_rate": 7.82112294289013e-05, + "loss": 0.0175, + "step": 4318 + }, + { + "epoch": 0.570598143805529, + "grad_norm": 0.21997900307178497, + "learning_rate": 7.817066890594259e-05, + "loss": 0.0217, + "step": 4319 + }, + { + "epoch": 0.570730257291013, + "grad_norm": 0.23901112377643585, + "learning_rate": 7.813011215359265e-05, + "loss": 0.0165, + "step": 4320 + }, + { + "epoch": 0.570862370776497, + "grad_norm": 0.22473230957984924, + "learning_rate": 7.808955917885694e-05, + "loss": 0.0319, + "step": 4321 + }, + { + "epoch": 0.5709944842619811, + "grad_norm": 0.1821734756231308, + "learning_rate": 7.80490099887402e-05, + "loss": 0.015, + "step": 4322 + }, + { + "epoch": 0.5711265977474651, + "grad_norm": 0.2418357878923416, + "learning_rate": 7.80084645902466e-05, + "loss": 0.0323, + "step": 4323 + }, + { + "epoch": 0.5712587112329491, + "grad_norm": 0.18403179943561554, + "learning_rate": 7.796792299037954e-05, + "loss": 0.0228, + "step": 4324 + }, + { + "epoch": 0.5713908247184332, + "grad_norm": 0.2135356068611145, + "learning_rate": 7.792738519614182e-05, + "loss": 0.0168, + "step": 4325 + }, + { + "epoch": 0.5715229382039172, + "grad_norm": 0.20723068714141846, + "learning_rate": 7.788685121453564e-05, + "loss": 0.0203, + "step": 4326 + }, + { + "epoch": 0.5716550516894012, + "grad_norm": 0.14351293444633484, + "learning_rate": 7.784632105256244e-05, + "loss": 0.019, + "step": 4327 + }, + { + "epoch": 0.5717871651748853, + "grad_norm": 0.14696793258190155, + "learning_rate": 7.780579471722308e-05, + "loss": 0.0201, + "step": 4328 + }, + { + "epoch": 0.5719192786603693, + "grad_norm": 0.22235284745693207, + "learning_rate": 7.776527221551769e-05, + "loss": 0.0246, + "step": 4329 + }, + { + "epoch": 0.5720513921458533, + "grad_norm": 0.1321786344051361, + "learning_rate": 7.772475355444582e-05, + "loss": 0.0125, + "step": 4330 + }, + { + "epoch": 0.5721835056313374, + "grad_norm": 0.14376619458198547, + "learning_rate": 7.768423874100629e-05, + "loss": 0.02, + "step": 4331 + }, + { + "epoch": 0.5723156191168214, + "grad_norm": 0.285086065530777, + "learning_rate": 7.764372778219723e-05, + "loss": 0.0258, + "step": 4332 + }, + { + "epoch": 0.5724477326023054, + "grad_norm": 0.32190439105033875, + "learning_rate": 7.760322068501624e-05, + "loss": 0.0201, + "step": 4333 + }, + { + "epoch": 0.5725798460877894, + "grad_norm": 0.17368465662002563, + "learning_rate": 7.75627174564601e-05, + "loss": 0.0255, + "step": 4334 + }, + { + "epoch": 0.5727119595732735, + "grad_norm": 0.2123628556728363, + "learning_rate": 7.752221810352501e-05, + "loss": 0.0213, + "step": 4335 + }, + { + "epoch": 0.5728440730587575, + "grad_norm": 0.16958920657634735, + "learning_rate": 7.748172263320646e-05, + "loss": 0.0137, + "step": 4336 + }, + { + "epoch": 0.5729761865442415, + "grad_norm": 0.22543765604496002, + "learning_rate": 7.74412310524993e-05, + "loss": 0.0142, + "step": 4337 + }, + { + "epoch": 0.5731083000297256, + "grad_norm": 0.24494275450706482, + "learning_rate": 7.74007433683977e-05, + "loss": 0.0175, + "step": 4338 + }, + { + "epoch": 0.5732404135152096, + "grad_norm": 0.22296825051307678, + "learning_rate": 7.736025958789512e-05, + "loss": 0.0145, + "step": 4339 + }, + { + "epoch": 0.5733725270006936, + "grad_norm": 0.152304545044899, + "learning_rate": 7.73197797179844e-05, + "loss": 0.0164, + "step": 4340 + }, + { + "epoch": 0.5735046404861777, + "grad_norm": 0.14745329320430756, + "learning_rate": 7.727930376565766e-05, + "loss": 0.017, + "step": 4341 + }, + { + "epoch": 0.5736367539716617, + "grad_norm": 0.13277247548103333, + "learning_rate": 7.723883173790641e-05, + "loss": 0.0153, + "step": 4342 + }, + { + "epoch": 0.5737688674571457, + "grad_norm": 0.13933366537094116, + "learning_rate": 7.719836364172138e-05, + "loss": 0.0128, + "step": 4343 + }, + { + "epoch": 0.5739009809426298, + "grad_norm": 0.21639874577522278, + "learning_rate": 7.715789948409274e-05, + "loss": 0.0259, + "step": 4344 + }, + { + "epoch": 0.5740330944281138, + "grad_norm": 0.29860708117485046, + "learning_rate": 7.711743927200985e-05, + "loss": 0.0138, + "step": 4345 + }, + { + "epoch": 0.5741652079135978, + "grad_norm": 0.2232387363910675, + "learning_rate": 7.707698301246146e-05, + "loss": 0.0233, + "step": 4346 + }, + { + "epoch": 0.5742973213990819, + "grad_norm": 0.15625309944152832, + "learning_rate": 7.703653071243571e-05, + "loss": 0.0132, + "step": 4347 + }, + { + "epoch": 0.5744294348845659, + "grad_norm": 0.084880031645298, + "learning_rate": 7.699608237891988e-05, + "loss": 0.0066, + "step": 4348 + }, + { + "epoch": 0.5745615483700499, + "grad_norm": 0.18512706458568573, + "learning_rate": 7.695563801890074e-05, + "loss": 0.0319, + "step": 4349 + }, + { + "epoch": 0.5746936618555339, + "grad_norm": 0.13333538174629211, + "learning_rate": 7.691519763936424e-05, + "loss": 0.0111, + "step": 4350 + }, + { + "epoch": 0.574825775341018, + "grad_norm": 0.3957698345184326, + "learning_rate": 7.687476124729576e-05, + "loss": 0.0348, + "step": 4351 + }, + { + "epoch": 0.574957888826502, + "grad_norm": 0.1441449224948883, + "learning_rate": 7.683432884967987e-05, + "loss": 0.0177, + "step": 4352 + }, + { + "epoch": 0.575090002311986, + "grad_norm": 0.16597506403923035, + "learning_rate": 7.679390045350054e-05, + "loss": 0.0125, + "step": 4353 + }, + { + "epoch": 0.5752221157974701, + "grad_norm": 0.13051097095012665, + "learning_rate": 7.675347606574102e-05, + "loss": 0.0144, + "step": 4354 + }, + { + "epoch": 0.5753542292829541, + "grad_norm": 0.27841895818710327, + "learning_rate": 7.671305569338385e-05, + "loss": 0.0337, + "step": 4355 + }, + { + "epoch": 0.5754863427684381, + "grad_norm": 0.29610154032707214, + "learning_rate": 7.667263934341092e-05, + "loss": 0.0182, + "step": 4356 + }, + { + "epoch": 0.5756184562539222, + "grad_norm": 0.13614515960216522, + "learning_rate": 7.663222702280337e-05, + "loss": 0.0121, + "step": 4357 + }, + { + "epoch": 0.5757505697394062, + "grad_norm": 0.1613374799489975, + "learning_rate": 7.659181873854171e-05, + "loss": 0.0178, + "step": 4358 + }, + { + "epoch": 0.5758826832248902, + "grad_norm": 0.35530921816825867, + "learning_rate": 7.655141449760569e-05, + "loss": 0.0233, + "step": 4359 + }, + { + "epoch": 0.5760147967103743, + "grad_norm": 0.2377680391073227, + "learning_rate": 7.651101430697439e-05, + "loss": 0.024, + "step": 4360 + }, + { + "epoch": 0.5761469101958583, + "grad_norm": 0.20939378440380096, + "learning_rate": 7.647061817362617e-05, + "loss": 0.02, + "step": 4361 + }, + { + "epoch": 0.5762790236813423, + "grad_norm": 0.1147724911570549, + "learning_rate": 7.643022610453874e-05, + "loss": 0.0101, + "step": 4362 + }, + { + "epoch": 0.5764111371668263, + "grad_norm": 0.10789413750171661, + "learning_rate": 7.638983810668906e-05, + "loss": 0.0087, + "step": 4363 + }, + { + "epoch": 0.5765432506523104, + "grad_norm": 0.18182159960269928, + "learning_rate": 7.634945418705339e-05, + "loss": 0.012, + "step": 4364 + }, + { + "epoch": 0.5766753641377944, + "grad_norm": 0.18856218457221985, + "learning_rate": 7.630907435260733e-05, + "loss": 0.0304, + "step": 4365 + }, + { + "epoch": 0.5768074776232784, + "grad_norm": 0.1378273069858551, + "learning_rate": 7.626869861032571e-05, + "loss": 0.015, + "step": 4366 + }, + { + "epoch": 0.5769395911087625, + "grad_norm": 0.1456385999917984, + "learning_rate": 7.622832696718269e-05, + "loss": 0.0139, + "step": 4367 + }, + { + "epoch": 0.5770717045942465, + "grad_norm": 0.17599926888942719, + "learning_rate": 7.618795943015172e-05, + "loss": 0.0217, + "step": 4368 + }, + { + "epoch": 0.5772038180797305, + "grad_norm": 0.2156069576740265, + "learning_rate": 7.614759600620553e-05, + "loss": 0.0237, + "step": 4369 + }, + { + "epoch": 0.5773359315652146, + "grad_norm": 0.18197335302829742, + "learning_rate": 7.610723670231619e-05, + "loss": 0.0274, + "step": 4370 + }, + { + "epoch": 0.5774680450506986, + "grad_norm": 0.17185929417610168, + "learning_rate": 7.606688152545494e-05, + "loss": 0.0288, + "step": 4371 + }, + { + "epoch": 0.5776001585361826, + "grad_norm": 0.2738763391971588, + "learning_rate": 7.602653048259244e-05, + "loss": 0.0267, + "step": 4372 + }, + { + "epoch": 0.5777322720216667, + "grad_norm": 0.13118204474449158, + "learning_rate": 7.598618358069858e-05, + "loss": 0.0241, + "step": 4373 + }, + { + "epoch": 0.5778643855071507, + "grad_norm": 0.22741925716400146, + "learning_rate": 7.594584082674248e-05, + "loss": 0.0388, + "step": 4374 + }, + { + "epoch": 0.5779964989926347, + "grad_norm": 0.1998399943113327, + "learning_rate": 7.590550222769265e-05, + "loss": 0.0165, + "step": 4375 + }, + { + "epoch": 0.5781286124781188, + "grad_norm": 0.19354160130023956, + "learning_rate": 7.586516779051677e-05, + "loss": 0.0127, + "step": 4376 + }, + { + "epoch": 0.5782607259636028, + "grad_norm": 0.32591480016708374, + "learning_rate": 7.582483752218192e-05, + "loss": 0.0128, + "step": 4377 + }, + { + "epoch": 0.5783928394490868, + "grad_norm": 0.142011359333992, + "learning_rate": 7.57845114296544e-05, + "loss": 0.0174, + "step": 4378 + }, + { + "epoch": 0.5785249529345708, + "grad_norm": 0.21151968836784363, + "learning_rate": 7.574418951989975e-05, + "loss": 0.0183, + "step": 4379 + }, + { + "epoch": 0.5786570664200549, + "grad_norm": 0.20516203343868256, + "learning_rate": 7.570387179988286e-05, + "loss": 0.0234, + "step": 4380 + }, + { + "epoch": 0.5787891799055389, + "grad_norm": 0.17113912105560303, + "learning_rate": 7.566355827656783e-05, + "loss": 0.0242, + "step": 4381 + }, + { + "epoch": 0.5789212933910229, + "grad_norm": 0.2233685404062271, + "learning_rate": 7.562324895691809e-05, + "loss": 0.0204, + "step": 4382 + }, + { + "epoch": 0.579053406876507, + "grad_norm": 0.14263029396533966, + "learning_rate": 7.55829438478963e-05, + "loss": 0.0179, + "step": 4383 + }, + { + "epoch": 0.579185520361991, + "grad_norm": 0.16608430445194244, + "learning_rate": 7.554264295646444e-05, + "loss": 0.0152, + "step": 4384 + }, + { + "epoch": 0.579317633847475, + "grad_norm": 0.1453695148229599, + "learning_rate": 7.550234628958373e-05, + "loss": 0.0106, + "step": 4385 + }, + { + "epoch": 0.5794497473329591, + "grad_norm": 0.10571756213903427, + "learning_rate": 7.546205385421463e-05, + "loss": 0.0098, + "step": 4386 + }, + { + "epoch": 0.5795818608184431, + "grad_norm": 0.1574452966451645, + "learning_rate": 7.542176565731698e-05, + "loss": 0.0143, + "step": 4387 + }, + { + "epoch": 0.5797139743039271, + "grad_norm": 0.17890842258930206, + "learning_rate": 7.538148170584974e-05, + "loss": 0.0215, + "step": 4388 + }, + { + "epoch": 0.5798460877894112, + "grad_norm": 0.1550913155078888, + "learning_rate": 7.534120200677122e-05, + "loss": 0.0111, + "step": 4389 + }, + { + "epoch": 0.5799782012748952, + "grad_norm": 0.14007329940795898, + "learning_rate": 7.530092656703904e-05, + "loss": 0.0231, + "step": 4390 + }, + { + "epoch": 0.5801103147603792, + "grad_norm": 0.13921970129013062, + "learning_rate": 7.526065539360996e-05, + "loss": 0.0155, + "step": 4391 + }, + { + "epoch": 0.5802424282458633, + "grad_norm": 0.20098300278186798, + "learning_rate": 7.522038849344012e-05, + "loss": 0.0273, + "step": 4392 + }, + { + "epoch": 0.5803745417313473, + "grad_norm": 0.1520303636789322, + "learning_rate": 7.518012587348483e-05, + "loss": 0.0162, + "step": 4393 + }, + { + "epoch": 0.5805066552168313, + "grad_norm": 0.24330918490886688, + "learning_rate": 7.513986754069877e-05, + "loss": 0.0273, + "step": 4394 + }, + { + "epoch": 0.5806387687023153, + "grad_norm": 0.18926988542079926, + "learning_rate": 7.509961350203576e-05, + "loss": 0.0124, + "step": 4395 + }, + { + "epoch": 0.5807708821877993, + "grad_norm": 0.2736714780330658, + "learning_rate": 7.505936376444893e-05, + "loss": 0.0119, + "step": 4396 + }, + { + "epoch": 0.5809029956732833, + "grad_norm": 0.1956966370344162, + "learning_rate": 7.501911833489071e-05, + "loss": 0.0224, + "step": 4397 + }, + { + "epoch": 0.5810351091587673, + "grad_norm": 0.21423551440238953, + "learning_rate": 7.497887722031272e-05, + "loss": 0.0316, + "step": 4398 + }, + { + "epoch": 0.5811672226442514, + "grad_norm": 0.21427178382873535, + "learning_rate": 7.493864042766585e-05, + "loss": 0.0249, + "step": 4399 + }, + { + "epoch": 0.5812993361297354, + "grad_norm": 0.15347111225128174, + "learning_rate": 7.489840796390028e-05, + "loss": 0.0188, + "step": 4400 + }, + { + "epoch": 0.5814314496152194, + "grad_norm": 0.2119879424571991, + "learning_rate": 7.485817983596541e-05, + "loss": 0.0237, + "step": 4401 + }, + { + "epoch": 0.5815635631007035, + "grad_norm": 0.16195791959762573, + "learning_rate": 7.481795605080987e-05, + "loss": 0.0177, + "step": 4402 + }, + { + "epoch": 0.5816956765861875, + "grad_norm": 0.18902483582496643, + "learning_rate": 7.477773661538159e-05, + "loss": 0.0362, + "step": 4403 + }, + { + "epoch": 0.5818277900716715, + "grad_norm": 0.15727530419826508, + "learning_rate": 7.473752153662774e-05, + "loss": 0.0161, + "step": 4404 + }, + { + "epoch": 0.5819599035571555, + "grad_norm": 0.19354747235774994, + "learning_rate": 7.469731082149467e-05, + "loss": 0.0112, + "step": 4405 + }, + { + "epoch": 0.5820920170426396, + "grad_norm": 0.17567463219165802, + "learning_rate": 7.465710447692811e-05, + "loss": 0.0279, + "step": 4406 + }, + { + "epoch": 0.5822241305281236, + "grad_norm": 0.1853395253419876, + "learning_rate": 7.461690250987287e-05, + "loss": 0.0147, + "step": 4407 + }, + { + "epoch": 0.5823562440136076, + "grad_norm": 0.1291511058807373, + "learning_rate": 7.457670492727316e-05, + "loss": 0.0178, + "step": 4408 + }, + { + "epoch": 0.5824883574990917, + "grad_norm": 0.11233451217412949, + "learning_rate": 7.45365117360723e-05, + "loss": 0.0179, + "step": 4409 + }, + { + "epoch": 0.5826204709845757, + "grad_norm": 0.1814539134502411, + "learning_rate": 7.449632294321294e-05, + "loss": 0.0142, + "step": 4410 + }, + { + "epoch": 0.5827525844700597, + "grad_norm": 0.22415396571159363, + "learning_rate": 7.445613855563698e-05, + "loss": 0.0247, + "step": 4411 + }, + { + "epoch": 0.5828846979555438, + "grad_norm": 0.1492513120174408, + "learning_rate": 7.441595858028543e-05, + "loss": 0.0123, + "step": 4412 + }, + { + "epoch": 0.5830168114410278, + "grad_norm": 0.32033640146255493, + "learning_rate": 7.437578302409873e-05, + "loss": 0.0279, + "step": 4413 + }, + { + "epoch": 0.5831489249265118, + "grad_norm": 0.15683256089687347, + "learning_rate": 7.433561189401637e-05, + "loss": 0.0069, + "step": 4414 + }, + { + "epoch": 0.5832810384119959, + "grad_norm": 0.16468288004398346, + "learning_rate": 7.429544519697723e-05, + "loss": 0.021, + "step": 4415 + }, + { + "epoch": 0.5834131518974799, + "grad_norm": 0.14463230967521667, + "learning_rate": 7.425528293991932e-05, + "loss": 0.0182, + "step": 4416 + }, + { + "epoch": 0.5835452653829639, + "grad_norm": 0.16453178226947784, + "learning_rate": 7.421512512977993e-05, + "loss": 0.0123, + "step": 4417 + }, + { + "epoch": 0.583677378868448, + "grad_norm": 0.16732649505138397, + "learning_rate": 7.417497177349556e-05, + "loss": 0.0157, + "step": 4418 + }, + { + "epoch": 0.583809492353932, + "grad_norm": 0.20836827158927917, + "learning_rate": 7.413482287800195e-05, + "loss": 0.0265, + "step": 4419 + }, + { + "epoch": 0.583941605839416, + "grad_norm": 0.2837749421596527, + "learning_rate": 7.40946784502341e-05, + "loss": 0.0238, + "step": 4420 + }, + { + "epoch": 0.5840737193249, + "grad_norm": 0.18477781116962433, + "learning_rate": 7.405453849712616e-05, + "loss": 0.0198, + "step": 4421 + }, + { + "epoch": 0.5842058328103841, + "grad_norm": 0.17079903185367584, + "learning_rate": 7.401440302561162e-05, + "loss": 0.0172, + "step": 4422 + }, + { + "epoch": 0.5843379462958681, + "grad_norm": 0.183742955327034, + "learning_rate": 7.397427204262308e-05, + "loss": 0.0179, + "step": 4423 + }, + { + "epoch": 0.5844700597813521, + "grad_norm": 0.12258812040090561, + "learning_rate": 7.393414555509243e-05, + "loss": 0.0137, + "step": 4424 + }, + { + "epoch": 0.5846021732668362, + "grad_norm": 0.28085893392562866, + "learning_rate": 7.389402356995078e-05, + "loss": 0.0274, + "step": 4425 + }, + { + "epoch": 0.5847342867523202, + "grad_norm": 0.2764669358730316, + "learning_rate": 7.385390609412844e-05, + "loss": 0.0289, + "step": 4426 + }, + { + "epoch": 0.5848664002378042, + "grad_norm": 0.08393105864524841, + "learning_rate": 7.381379313455499e-05, + "loss": 0.0075, + "step": 4427 + }, + { + "epoch": 0.5849985137232883, + "grad_norm": 0.1453171968460083, + "learning_rate": 7.377368469815913e-05, + "loss": 0.0172, + "step": 4428 + }, + { + "epoch": 0.5851306272087723, + "grad_norm": 0.1916455179452896, + "learning_rate": 7.37335807918689e-05, + "loss": 0.0234, + "step": 4429 + }, + { + "epoch": 0.5852627406942563, + "grad_norm": 0.1560536026954651, + "learning_rate": 7.369348142261148e-05, + "loss": 0.0177, + "step": 4430 + }, + { + "epoch": 0.5853948541797404, + "grad_norm": 0.17626263201236725, + "learning_rate": 7.365338659731327e-05, + "loss": 0.0182, + "step": 4431 + }, + { + "epoch": 0.5855269676652244, + "grad_norm": 0.12068506330251694, + "learning_rate": 7.361329632289992e-05, + "loss": 0.011, + "step": 4432 + }, + { + "epoch": 0.5856590811507084, + "grad_norm": 0.2062414288520813, + "learning_rate": 7.357321060629626e-05, + "loss": 0.0071, + "step": 4433 + }, + { + "epoch": 0.5857911946361924, + "grad_norm": 0.14255057275295258, + "learning_rate": 7.353312945442639e-05, + "loss": 0.0212, + "step": 4434 + }, + { + "epoch": 0.5859233081216765, + "grad_norm": 0.259384423494339, + "learning_rate": 7.349305287421348e-05, + "loss": 0.0251, + "step": 4435 + }, + { + "epoch": 0.5860554216071605, + "grad_norm": 0.11826782673597336, + "learning_rate": 7.345298087258013e-05, + "loss": 0.0099, + "step": 4436 + }, + { + "epoch": 0.5861875350926445, + "grad_norm": 0.15347795188426971, + "learning_rate": 7.341291345644797e-05, + "loss": 0.0169, + "step": 4437 + }, + { + "epoch": 0.5863196485781286, + "grad_norm": 0.19067321717739105, + "learning_rate": 7.337285063273793e-05, + "loss": 0.0142, + "step": 4438 + }, + { + "epoch": 0.5864517620636126, + "grad_norm": 0.21550928056240082, + "learning_rate": 7.333279240837005e-05, + "loss": 0.0282, + "step": 4439 + }, + { + "epoch": 0.5865838755490966, + "grad_norm": 0.12859563529491425, + "learning_rate": 7.329273879026371e-05, + "loss": 0.0073, + "step": 4440 + }, + { + "epoch": 0.5867159890345807, + "grad_norm": 0.16383865475654602, + "learning_rate": 7.325268978533735e-05, + "loss": 0.0114, + "step": 4441 + }, + { + "epoch": 0.5868481025200647, + "grad_norm": 0.15545353293418884, + "learning_rate": 7.321264540050876e-05, + "loss": 0.0154, + "step": 4442 + }, + { + "epoch": 0.5869802160055487, + "grad_norm": 0.1711602658033371, + "learning_rate": 7.317260564269482e-05, + "loss": 0.0202, + "step": 4443 + }, + { + "epoch": 0.5871123294910328, + "grad_norm": 0.15408815443515778, + "learning_rate": 7.313257051881165e-05, + "loss": 0.0142, + "step": 4444 + }, + { + "epoch": 0.5872444429765168, + "grad_norm": 0.10001006722450256, + "learning_rate": 7.309254003577459e-05, + "loss": 0.0097, + "step": 4445 + }, + { + "epoch": 0.5873765564620008, + "grad_norm": 0.17244599759578705, + "learning_rate": 7.305251420049813e-05, + "loss": 0.016, + "step": 4446 + }, + { + "epoch": 0.5875086699474849, + "grad_norm": 0.20834384858608246, + "learning_rate": 7.301249301989601e-05, + "loss": 0.0201, + "step": 4447 + }, + { + "epoch": 0.5876407834329689, + "grad_norm": 0.21501821279525757, + "learning_rate": 7.29724765008811e-05, + "loss": 0.027, + "step": 4448 + }, + { + "epoch": 0.5877728969184529, + "grad_norm": 0.293439656496048, + "learning_rate": 7.293246465036557e-05, + "loss": 0.0206, + "step": 4449 + }, + { + "epoch": 0.587905010403937, + "grad_norm": 0.20541538298130035, + "learning_rate": 7.289245747526066e-05, + "loss": 0.0277, + "step": 4450 + }, + { + "epoch": 0.588037123889421, + "grad_norm": 0.21135063469409943, + "learning_rate": 7.285245498247689e-05, + "loss": 0.0151, + "step": 4451 + }, + { + "epoch": 0.588169237374905, + "grad_norm": 0.21523725986480713, + "learning_rate": 7.281245717892396e-05, + "loss": 0.0092, + "step": 4452 + }, + { + "epoch": 0.588301350860389, + "grad_norm": 0.4050748646259308, + "learning_rate": 7.277246407151067e-05, + "loss": 0.0288, + "step": 4453 + }, + { + "epoch": 0.5884334643458731, + "grad_norm": 0.15043921768665314, + "learning_rate": 7.273247566714517e-05, + "loss": 0.0172, + "step": 4454 + }, + { + "epoch": 0.5885655778313571, + "grad_norm": 0.12210755795240402, + "learning_rate": 7.269249197273465e-05, + "loss": 0.0135, + "step": 4455 + }, + { + "epoch": 0.5886976913168411, + "grad_norm": 0.16365686058998108, + "learning_rate": 7.265251299518558e-05, + "loss": 0.0168, + "step": 4456 + }, + { + "epoch": 0.5888298048023252, + "grad_norm": 0.15842820703983307, + "learning_rate": 7.261253874140354e-05, + "loss": 0.0164, + "step": 4457 + }, + { + "epoch": 0.5889619182878092, + "grad_norm": 0.11781178414821625, + "learning_rate": 7.25725692182934e-05, + "loss": 0.0102, + "step": 4458 + }, + { + "epoch": 0.5890940317732932, + "grad_norm": 0.13468272984027863, + "learning_rate": 7.253260443275908e-05, + "loss": 0.0168, + "step": 4459 + }, + { + "epoch": 0.5892261452587773, + "grad_norm": 0.20080485939979553, + "learning_rate": 7.249264439170378e-05, + "loss": 0.0284, + "step": 4460 + }, + { + "epoch": 0.5893582587442613, + "grad_norm": 0.1445317268371582, + "learning_rate": 7.245268910202988e-05, + "loss": 0.0131, + "step": 4461 + }, + { + "epoch": 0.5894903722297453, + "grad_norm": 0.21400819718837738, + "learning_rate": 7.241273857063884e-05, + "loss": 0.0206, + "step": 4462 + }, + { + "epoch": 0.5896224857152293, + "grad_norm": 0.42813754081726074, + "learning_rate": 7.237279280443143e-05, + "loss": 0.0348, + "step": 4463 + }, + { + "epoch": 0.5897545992007134, + "grad_norm": 0.1779608279466629, + "learning_rate": 7.23328518103075e-05, + "loss": 0.0245, + "step": 4464 + }, + { + "epoch": 0.5898867126861974, + "grad_norm": 0.1536242812871933, + "learning_rate": 7.229291559516612e-05, + "loss": 0.0205, + "step": 4465 + }, + { + "epoch": 0.5900188261716814, + "grad_norm": 0.17254947125911713, + "learning_rate": 7.225298416590554e-05, + "loss": 0.0207, + "step": 4466 + }, + { + "epoch": 0.5901509396571655, + "grad_norm": 0.2188318520784378, + "learning_rate": 7.221305752942313e-05, + "loss": 0.0275, + "step": 4467 + }, + { + "epoch": 0.5902830531426495, + "grad_norm": 0.1342228502035141, + "learning_rate": 7.21731356926155e-05, + "loss": 0.0155, + "step": 4468 + }, + { + "epoch": 0.5904151666281335, + "grad_norm": 0.1679522544145584, + "learning_rate": 7.213321866237837e-05, + "loss": 0.0157, + "step": 4469 + }, + { + "epoch": 0.5905472801136176, + "grad_norm": 0.12442659586668015, + "learning_rate": 7.209330644560673e-05, + "loss": 0.0132, + "step": 4470 + }, + { + "epoch": 0.5906793935991016, + "grad_norm": 0.30356472730636597, + "learning_rate": 7.205339904919456e-05, + "loss": 0.0325, + "step": 4471 + }, + { + "epoch": 0.5908115070845856, + "grad_norm": 0.2188597321510315, + "learning_rate": 7.201349648003524e-05, + "loss": 0.0227, + "step": 4472 + }, + { + "epoch": 0.5909436205700697, + "grad_norm": 0.2081504613161087, + "learning_rate": 7.197359874502112e-05, + "loss": 0.0213, + "step": 4473 + }, + { + "epoch": 0.5910757340555537, + "grad_norm": 0.09858710318803787, + "learning_rate": 7.193370585104377e-05, + "loss": 0.0121, + "step": 4474 + }, + { + "epoch": 0.5912078475410377, + "grad_norm": 0.18282747268676758, + "learning_rate": 7.1893817804994e-05, + "loss": 0.013, + "step": 4475 + }, + { + "epoch": 0.5913399610265218, + "grad_norm": 0.16425664722919464, + "learning_rate": 7.185393461376166e-05, + "loss": 0.019, + "step": 4476 + }, + { + "epoch": 0.5914720745120058, + "grad_norm": 0.22384493052959442, + "learning_rate": 7.18140562842359e-05, + "loss": 0.0207, + "step": 4477 + }, + { + "epoch": 0.5916041879974898, + "grad_norm": 0.18941235542297363, + "learning_rate": 7.17741828233049e-05, + "loss": 0.0345, + "step": 4478 + }, + { + "epoch": 0.5917363014829738, + "grad_norm": 0.34682533144950867, + "learning_rate": 7.173431423785609e-05, + "loss": 0.051, + "step": 4479 + }, + { + "epoch": 0.5918684149684579, + "grad_norm": 0.1639723926782608, + "learning_rate": 7.169445053477599e-05, + "loss": 0.0274, + "step": 4480 + }, + { + "epoch": 0.5920005284539419, + "grad_norm": 0.1771511286497116, + "learning_rate": 7.16545917209503e-05, + "loss": 0.0151, + "step": 4481 + }, + { + "epoch": 0.5921326419394259, + "grad_norm": 0.1359596848487854, + "learning_rate": 7.161473780326393e-05, + "loss": 0.0103, + "step": 4482 + }, + { + "epoch": 0.59226475542491, + "grad_norm": 0.14128483831882477, + "learning_rate": 7.157488878860087e-05, + "loss": 0.015, + "step": 4483 + }, + { + "epoch": 0.592396868910394, + "grad_norm": 0.1506226509809494, + "learning_rate": 7.153504468384431e-05, + "loss": 0.02, + "step": 4484 + }, + { + "epoch": 0.592528982395878, + "grad_norm": 0.1360141783952713, + "learning_rate": 7.149520549587656e-05, + "loss": 0.0097, + "step": 4485 + }, + { + "epoch": 0.5926610958813621, + "grad_norm": 0.19670647382736206, + "learning_rate": 7.14553712315791e-05, + "loss": 0.0167, + "step": 4486 + }, + { + "epoch": 0.5927932093668461, + "grad_norm": 0.2583443224430084, + "learning_rate": 7.141554189783256e-05, + "loss": 0.0243, + "step": 4487 + }, + { + "epoch": 0.5929253228523301, + "grad_norm": 0.22510045766830444, + "learning_rate": 7.137571750151668e-05, + "loss": 0.0223, + "step": 4488 + }, + { + "epoch": 0.5930574363378142, + "grad_norm": 0.201186403632164, + "learning_rate": 7.133589804951044e-05, + "loss": 0.0226, + "step": 4489 + }, + { + "epoch": 0.5931895498232982, + "grad_norm": 0.13508693873882294, + "learning_rate": 7.129608354869184e-05, + "loss": 0.0128, + "step": 4490 + }, + { + "epoch": 0.5933216633087822, + "grad_norm": 0.12510472536087036, + "learning_rate": 7.125627400593815e-05, + "loss": 0.0139, + "step": 4491 + }, + { + "epoch": 0.5934537767942663, + "grad_norm": 0.2334831953048706, + "learning_rate": 7.121646942812566e-05, + "loss": 0.0264, + "step": 4492 + }, + { + "epoch": 0.5935858902797503, + "grad_norm": 0.13933241367340088, + "learning_rate": 7.117666982212989e-05, + "loss": 0.0182, + "step": 4493 + }, + { + "epoch": 0.5937180037652343, + "grad_norm": 0.19615226984024048, + "learning_rate": 7.113687519482555e-05, + "loss": 0.0239, + "step": 4494 + }, + { + "epoch": 0.5938501172507183, + "grad_norm": 0.1652141511440277, + "learning_rate": 7.109708555308634e-05, + "loss": 0.017, + "step": 4495 + }, + { + "epoch": 0.5939822307362024, + "grad_norm": 0.1287144422531128, + "learning_rate": 7.105730090378517e-05, + "loss": 0.0182, + "step": 4496 + }, + { + "epoch": 0.5941143442216864, + "grad_norm": 0.1629124879837036, + "learning_rate": 7.101752125379414e-05, + "loss": 0.0137, + "step": 4497 + }, + { + "epoch": 0.5942464577071704, + "grad_norm": 0.13514606654644012, + "learning_rate": 7.097774660998442e-05, + "loss": 0.0172, + "step": 4498 + }, + { + "epoch": 0.5943785711926545, + "grad_norm": 0.1825205534696579, + "learning_rate": 7.093797697922635e-05, + "loss": 0.0172, + "step": 4499 + }, + { + "epoch": 0.5945106846781385, + "grad_norm": 0.1886797845363617, + "learning_rate": 7.089821236838934e-05, + "loss": 0.024, + "step": 4500 + }, + { + "epoch": 0.5946427981636225, + "grad_norm": 0.2030637115240097, + "learning_rate": 7.085845278434206e-05, + "loss": 0.0206, + "step": 4501 + }, + { + "epoch": 0.5947749116491066, + "grad_norm": 0.2834545075893402, + "learning_rate": 7.081869823395217e-05, + "loss": 0.0249, + "step": 4502 + }, + { + "epoch": 0.5949070251345906, + "grad_norm": 0.11763806641101837, + "learning_rate": 7.077894872408655e-05, + "loss": 0.0086, + "step": 4503 + }, + { + "epoch": 0.5950391386200746, + "grad_norm": 0.11526691913604736, + "learning_rate": 7.073920426161121e-05, + "loss": 0.0067, + "step": 4504 + }, + { + "epoch": 0.5951712521055587, + "grad_norm": 0.12258782237768173, + "learning_rate": 7.069946485339118e-05, + "loss": 0.0105, + "step": 4505 + }, + { + "epoch": 0.5953033655910427, + "grad_norm": 0.36307042837142944, + "learning_rate": 7.065973050629081e-05, + "loss": 0.0201, + "step": 4506 + }, + { + "epoch": 0.5954354790765267, + "grad_norm": 0.26752519607543945, + "learning_rate": 7.062000122717338e-05, + "loss": 0.0321, + "step": 4507 + }, + { + "epoch": 0.5955675925620108, + "grad_norm": 0.17262883484363556, + "learning_rate": 7.058027702290144e-05, + "loss": 0.0253, + "step": 4508 + }, + { + "epoch": 0.5956997060474948, + "grad_norm": 0.132182776927948, + "learning_rate": 7.054055790033655e-05, + "loss": 0.0148, + "step": 4509 + }, + { + "epoch": 0.5958318195329788, + "grad_norm": 0.1420915126800537, + "learning_rate": 7.050084386633948e-05, + "loss": 0.0138, + "step": 4510 + }, + { + "epoch": 0.5959639330184628, + "grad_norm": 0.3453371226787567, + "learning_rate": 7.046113492777009e-05, + "loss": 0.016, + "step": 4511 + }, + { + "epoch": 0.5960960465039469, + "grad_norm": 0.12592832744121552, + "learning_rate": 7.042143109148733e-05, + "loss": 0.0098, + "step": 4512 + }, + { + "epoch": 0.5962281599894309, + "grad_norm": 0.17428047955036163, + "learning_rate": 7.038173236434933e-05, + "loss": 0.0183, + "step": 4513 + }, + { + "epoch": 0.5963602734749149, + "grad_norm": 0.17273180186748505, + "learning_rate": 7.034203875321326e-05, + "loss": 0.013, + "step": 4514 + }, + { + "epoch": 0.596492386960399, + "grad_norm": 0.13108059763908386, + "learning_rate": 7.030235026493548e-05, + "loss": 0.0184, + "step": 4515 + }, + { + "epoch": 0.596624500445883, + "grad_norm": 0.20756517350673676, + "learning_rate": 7.026266690637145e-05, + "loss": 0.0193, + "step": 4516 + }, + { + "epoch": 0.596756613931367, + "grad_norm": 0.13016660511493683, + "learning_rate": 7.022298868437567e-05, + "loss": 0.0165, + "step": 4517 + }, + { + "epoch": 0.5968887274168511, + "grad_norm": 0.11081884056329727, + "learning_rate": 7.018331560580187e-05, + "loss": 0.0108, + "step": 4518 + }, + { + "epoch": 0.5970208409023351, + "grad_norm": 0.11399253457784653, + "learning_rate": 7.014364767750277e-05, + "loss": 0.0186, + "step": 4519 + }, + { + "epoch": 0.5971529543878191, + "grad_norm": 0.11999227106571198, + "learning_rate": 7.010398490633035e-05, + "loss": 0.0147, + "step": 4520 + }, + { + "epoch": 0.5972850678733032, + "grad_norm": 0.13482487201690674, + "learning_rate": 7.006432729913552e-05, + "loss": 0.0121, + "step": 4521 + }, + { + "epoch": 0.5974171813587872, + "grad_norm": 0.16840508580207825, + "learning_rate": 7.002467486276847e-05, + "loss": 0.0078, + "step": 4522 + }, + { + "epoch": 0.5975492948442712, + "grad_norm": 0.13563968241214752, + "learning_rate": 6.998502760407838e-05, + "loss": 0.0125, + "step": 4523 + }, + { + "epoch": 0.5976814083297552, + "grad_norm": 0.18716301023960114, + "learning_rate": 6.994538552991354e-05, + "loss": 0.0224, + "step": 4524 + }, + { + "epoch": 0.5978135218152393, + "grad_norm": 0.16288627684116364, + "learning_rate": 6.990574864712144e-05, + "loss": 0.0164, + "step": 4525 + }, + { + "epoch": 0.5979456353007233, + "grad_norm": 0.19571024179458618, + "learning_rate": 6.986611696254857e-05, + "loss": 0.0232, + "step": 4526 + }, + { + "epoch": 0.5980777487862073, + "grad_norm": 0.09460264444351196, + "learning_rate": 6.982649048304057e-05, + "loss": 0.0092, + "step": 4527 + }, + { + "epoch": 0.5982098622716914, + "grad_norm": 0.3020939230918884, + "learning_rate": 6.978686921544218e-05, + "loss": 0.0218, + "step": 4528 + }, + { + "epoch": 0.5983419757571754, + "grad_norm": 0.15641634166240692, + "learning_rate": 6.974725316659725e-05, + "loss": 0.0086, + "step": 4529 + }, + { + "epoch": 0.5984740892426594, + "grad_norm": 0.17395232617855072, + "learning_rate": 6.970764234334868e-05, + "loss": 0.0134, + "step": 4530 + }, + { + "epoch": 0.5986062027281435, + "grad_norm": 0.14680452644824982, + "learning_rate": 6.966803675253848e-05, + "loss": 0.0134, + "step": 4531 + }, + { + "epoch": 0.5987383162136275, + "grad_norm": 0.1943461298942566, + "learning_rate": 6.962843640100785e-05, + "loss": 0.0233, + "step": 4532 + }, + { + "epoch": 0.5988704296991115, + "grad_norm": 0.10282100737094879, + "learning_rate": 6.958884129559693e-05, + "loss": 0.0118, + "step": 4533 + }, + { + "epoch": 0.5990025431845956, + "grad_norm": 0.1541905701160431, + "learning_rate": 6.954925144314511e-05, + "loss": 0.014, + "step": 4534 + }, + { + "epoch": 0.5991346566700796, + "grad_norm": 0.12435435503721237, + "learning_rate": 6.950966685049071e-05, + "loss": 0.0178, + "step": 4535 + }, + { + "epoch": 0.5992667701555636, + "grad_norm": 0.17765118181705475, + "learning_rate": 6.947008752447131e-05, + "loss": 0.0207, + "step": 4536 + }, + { + "epoch": 0.5993988836410477, + "grad_norm": 0.13467107713222504, + "learning_rate": 6.943051347192346e-05, + "loss": 0.0131, + "step": 4537 + }, + { + "epoch": 0.5995309971265317, + "grad_norm": 0.14471574127674103, + "learning_rate": 6.939094469968282e-05, + "loss": 0.0162, + "step": 4538 + }, + { + "epoch": 0.5996631106120157, + "grad_norm": 0.176444873213768, + "learning_rate": 6.93513812145842e-05, + "loss": 0.0252, + "step": 4539 + }, + { + "epoch": 0.5997952240974997, + "grad_norm": 0.18236540257930756, + "learning_rate": 6.931182302346142e-05, + "loss": 0.0155, + "step": 4540 + }, + { + "epoch": 0.5999273375829838, + "grad_norm": 0.19694674015045166, + "learning_rate": 6.927227013314743e-05, + "loss": 0.0128, + "step": 4541 + }, + { + "epoch": 0.6000594510684678, + "grad_norm": 0.12531161308288574, + "learning_rate": 6.923272255047424e-05, + "loss": 0.0088, + "step": 4542 + }, + { + "epoch": 0.6001915645539518, + "grad_norm": 0.18968115746974945, + "learning_rate": 6.919318028227298e-05, + "loss": 0.029, + "step": 4543 + }, + { + "epoch": 0.6003236780394359, + "grad_norm": 0.15968884527683258, + "learning_rate": 6.915364333537383e-05, + "loss": 0.011, + "step": 4544 + }, + { + "epoch": 0.6004557915249199, + "grad_norm": 0.18767327070236206, + "learning_rate": 6.911411171660602e-05, + "loss": 0.0233, + "step": 4545 + }, + { + "epoch": 0.6005879050104039, + "grad_norm": 0.19067466259002686, + "learning_rate": 6.907458543279797e-05, + "loss": 0.0175, + "step": 4546 + }, + { + "epoch": 0.600720018495888, + "grad_norm": 0.7749344706535339, + "learning_rate": 6.903506449077704e-05, + "loss": 0.0203, + "step": 4547 + }, + { + "epoch": 0.600852131981372, + "grad_norm": 0.19207334518432617, + "learning_rate": 6.899554889736976e-05, + "loss": 0.0201, + "step": 4548 + }, + { + "epoch": 0.600984245466856, + "grad_norm": 0.12247084081172943, + "learning_rate": 6.89560386594017e-05, + "loss": 0.0156, + "step": 4549 + }, + { + "epoch": 0.6011163589523401, + "grad_norm": 0.27735868096351624, + "learning_rate": 6.891653378369754e-05, + "loss": 0.0202, + "step": 4550 + }, + { + "epoch": 0.6012484724378241, + "grad_norm": 0.14426004886627197, + "learning_rate": 6.887703427708101e-05, + "loss": 0.0132, + "step": 4551 + }, + { + "epoch": 0.6013805859233081, + "grad_norm": 0.1699414700269699, + "learning_rate": 6.883754014637483e-05, + "loss": 0.0187, + "step": 4552 + }, + { + "epoch": 0.6015126994087922, + "grad_norm": 0.136097714304924, + "learning_rate": 6.879805139840096e-05, + "loss": 0.0138, + "step": 4553 + }, + { + "epoch": 0.6016448128942762, + "grad_norm": 0.14943480491638184, + "learning_rate": 6.875856803998035e-05, + "loss": 0.0143, + "step": 4554 + }, + { + "epoch": 0.6017769263797602, + "grad_norm": 0.23178283870220184, + "learning_rate": 6.871909007793296e-05, + "loss": 0.0186, + "step": 4555 + }, + { + "epoch": 0.6019090398652442, + "grad_norm": 0.1980462372303009, + "learning_rate": 6.867961751907792e-05, + "loss": 0.024, + "step": 4556 + }, + { + "epoch": 0.6020411533507283, + "grad_norm": 0.1882098913192749, + "learning_rate": 6.864015037023332e-05, + "loss": 0.0195, + "step": 4557 + }, + { + "epoch": 0.6021732668362123, + "grad_norm": 0.18741685152053833, + "learning_rate": 6.860068863821641e-05, + "loss": 0.0138, + "step": 4558 + }, + { + "epoch": 0.6023053803216963, + "grad_norm": 0.1524559110403061, + "learning_rate": 6.856123232984347e-05, + "loss": 0.0221, + "step": 4559 + }, + { + "epoch": 0.6024374938071804, + "grad_norm": 0.1414857655763626, + "learning_rate": 6.852178145192981e-05, + "loss": 0.0177, + "step": 4560 + }, + { + "epoch": 0.6025696072926644, + "grad_norm": 0.16209979355335236, + "learning_rate": 6.848233601128985e-05, + "loss": 0.017, + "step": 4561 + }, + { + "epoch": 0.6027017207781484, + "grad_norm": 0.23777304589748383, + "learning_rate": 6.844289601473704e-05, + "loss": 0.0207, + "step": 4562 + }, + { + "epoch": 0.6028338342636325, + "grad_norm": 0.2142631858587265, + "learning_rate": 6.840346146908394e-05, + "loss": 0.013, + "step": 4563 + }, + { + "epoch": 0.6029659477491165, + "grad_norm": 0.22291328012943268, + "learning_rate": 6.836403238114206e-05, + "loss": 0.017, + "step": 4564 + }, + { + "epoch": 0.6030980612346005, + "grad_norm": 0.14466069638729095, + "learning_rate": 6.83246087577221e-05, + "loss": 0.0229, + "step": 4565 + }, + { + "epoch": 0.6032301747200846, + "grad_norm": 0.1637120097875595, + "learning_rate": 6.828519060563376e-05, + "loss": 0.0164, + "step": 4566 + }, + { + "epoch": 0.6033622882055686, + "grad_norm": 0.1512189507484436, + "learning_rate": 6.824577793168573e-05, + "loss": 0.0135, + "step": 4567 + }, + { + "epoch": 0.6034944016910526, + "grad_norm": 0.19535644352436066, + "learning_rate": 6.820637074268585e-05, + "loss": 0.0205, + "step": 4568 + }, + { + "epoch": 0.6036265151765366, + "grad_norm": 0.20127522945404053, + "learning_rate": 6.816696904544097e-05, + "loss": 0.0326, + "step": 4569 + }, + { + "epoch": 0.6037586286620207, + "grad_norm": 0.15570081770420074, + "learning_rate": 6.812757284675702e-05, + "loss": 0.0157, + "step": 4570 + }, + { + "epoch": 0.6038907421475047, + "grad_norm": 0.22564396262168884, + "learning_rate": 6.80881821534389e-05, + "loss": 0.0256, + "step": 4571 + }, + { + "epoch": 0.6040228556329887, + "grad_norm": 0.1477121114730835, + "learning_rate": 6.804879697229068e-05, + "loss": 0.0139, + "step": 4572 + }, + { + "epoch": 0.6041549691184728, + "grad_norm": 0.19079719483852386, + "learning_rate": 6.800941731011537e-05, + "loss": 0.031, + "step": 4573 + }, + { + "epoch": 0.6042870826039568, + "grad_norm": 0.21819524466991425, + "learning_rate": 6.797004317371507e-05, + "loss": 0.0214, + "step": 4574 + }, + { + "epoch": 0.6044191960894408, + "grad_norm": 0.16312861442565918, + "learning_rate": 6.793067456989095e-05, + "loss": 0.0167, + "step": 4575 + }, + { + "epoch": 0.6045513095749249, + "grad_norm": 0.19186748564243317, + "learning_rate": 6.789131150544316e-05, + "loss": 0.0228, + "step": 4576 + }, + { + "epoch": 0.6046834230604089, + "grad_norm": 0.1798516809940338, + "learning_rate": 6.785195398717101e-05, + "loss": 0.0261, + "step": 4577 + }, + { + "epoch": 0.6048155365458929, + "grad_norm": 0.11643780767917633, + "learning_rate": 6.78126020218727e-05, + "loss": 0.0066, + "step": 4578 + }, + { + "epoch": 0.604947650031377, + "grad_norm": 0.13909032940864563, + "learning_rate": 6.777325561634557e-05, + "loss": 0.0152, + "step": 4579 + }, + { + "epoch": 0.605079763516861, + "grad_norm": 0.18303309381008148, + "learning_rate": 6.773391477738603e-05, + "loss": 0.0195, + "step": 4580 + }, + { + "epoch": 0.605211877002345, + "grad_norm": 0.12904851138591766, + "learning_rate": 6.769457951178935e-05, + "loss": 0.0178, + "step": 4581 + }, + { + "epoch": 0.605343990487829, + "grad_norm": 0.1660364419221878, + "learning_rate": 6.765524982635009e-05, + "loss": 0.0214, + "step": 4582 + }, + { + "epoch": 0.6054761039733131, + "grad_norm": 0.18555185198783875, + "learning_rate": 6.761592572786164e-05, + "loss": 0.0145, + "step": 4583 + }, + { + "epoch": 0.6056082174587971, + "grad_norm": 0.1879284679889679, + "learning_rate": 6.757660722311653e-05, + "loss": 0.0214, + "step": 4584 + }, + { + "epoch": 0.6057403309442811, + "grad_norm": 0.18958178162574768, + "learning_rate": 6.75372943189063e-05, + "loss": 0.0199, + "step": 4585 + }, + { + "epoch": 0.6058724444297652, + "grad_norm": 0.17727269232273102, + "learning_rate": 6.749798702202151e-05, + "loss": 0.0221, + "step": 4586 + }, + { + "epoch": 0.6060045579152492, + "grad_norm": 0.24471880495548248, + "learning_rate": 6.745868533925177e-05, + "loss": 0.0229, + "step": 4587 + }, + { + "epoch": 0.6061366714007332, + "grad_norm": 0.27247729897499084, + "learning_rate": 6.741938927738568e-05, + "loss": 0.0215, + "step": 4588 + }, + { + "epoch": 0.6062687848862173, + "grad_norm": 0.137033611536026, + "learning_rate": 6.738009884321094e-05, + "loss": 0.009, + "step": 4589 + }, + { + "epoch": 0.6064008983717013, + "grad_norm": 0.20412221550941467, + "learning_rate": 6.734081404351423e-05, + "loss": 0.0179, + "step": 4590 + }, + { + "epoch": 0.6065330118571853, + "grad_norm": 0.17425969243049622, + "learning_rate": 6.730153488508124e-05, + "loss": 0.0108, + "step": 4591 + }, + { + "epoch": 0.6066651253426694, + "grad_norm": 0.2222953587770462, + "learning_rate": 6.726226137469673e-05, + "loss": 0.0222, + "step": 4592 + }, + { + "epoch": 0.6067972388281534, + "grad_norm": 0.16521556675434113, + "learning_rate": 6.722299351914448e-05, + "loss": 0.0138, + "step": 4593 + }, + { + "epoch": 0.6069293523136374, + "grad_norm": 0.2272307425737381, + "learning_rate": 6.718373132520724e-05, + "loss": 0.0239, + "step": 4594 + }, + { + "epoch": 0.6070614657991215, + "grad_norm": 0.12938404083251953, + "learning_rate": 6.714447479966683e-05, + "loss": 0.0124, + "step": 4595 + }, + { + "epoch": 0.6071935792846055, + "grad_norm": 0.1868760585784912, + "learning_rate": 6.710522394930412e-05, + "loss": 0.0228, + "step": 4596 + }, + { + "epoch": 0.6073256927700895, + "grad_norm": 0.18667960166931152, + "learning_rate": 6.706597878089888e-05, + "loss": 0.0148, + "step": 4597 + }, + { + "epoch": 0.6074578062555736, + "grad_norm": 0.2412571907043457, + "learning_rate": 6.702673930123009e-05, + "loss": 0.0295, + "step": 4598 + }, + { + "epoch": 0.6075899197410576, + "grad_norm": 0.1540968120098114, + "learning_rate": 6.698750551707553e-05, + "loss": 0.0183, + "step": 4599 + }, + { + "epoch": 0.6077220332265416, + "grad_norm": 0.1171368882060051, + "learning_rate": 6.694827743521217e-05, + "loss": 0.0096, + "step": 4600 + }, + { + "epoch": 0.6078541467120256, + "grad_norm": 0.1463003009557724, + "learning_rate": 6.690905506241591e-05, + "loss": 0.0151, + "step": 4601 + }, + { + "epoch": 0.6079862601975097, + "grad_norm": 0.12538869678974152, + "learning_rate": 6.686983840546166e-05, + "loss": 0.0207, + "step": 4602 + }, + { + "epoch": 0.6081183736829937, + "grad_norm": 0.5921977162361145, + "learning_rate": 6.683062747112341e-05, + "loss": 0.038, + "step": 4603 + }, + { + "epoch": 0.6082504871684777, + "grad_norm": 0.14486072957515717, + "learning_rate": 6.679142226617406e-05, + "loss": 0.0168, + "step": 4604 + }, + { + "epoch": 0.6083826006539618, + "grad_norm": 0.1616198569536209, + "learning_rate": 6.675222279738562e-05, + "loss": 0.0208, + "step": 4605 + }, + { + "epoch": 0.6085147141394458, + "grad_norm": 0.15971048176288605, + "learning_rate": 6.671302907152902e-05, + "loss": 0.019, + "step": 4606 + }, + { + "epoch": 0.6086468276249298, + "grad_norm": 0.17092269659042358, + "learning_rate": 6.66738410953743e-05, + "loss": 0.0195, + "step": 4607 + }, + { + "epoch": 0.6087789411104139, + "grad_norm": 0.1348509043455124, + "learning_rate": 6.663465887569043e-05, + "loss": 0.0153, + "step": 4608 + }, + { + "epoch": 0.6089110545958979, + "grad_norm": 0.12598492205142975, + "learning_rate": 6.659548241924537e-05, + "loss": 0.0111, + "step": 4609 + }, + { + "epoch": 0.6090431680813819, + "grad_norm": 0.17370101809501648, + "learning_rate": 6.655631173280613e-05, + "loss": 0.0173, + "step": 4610 + }, + { + "epoch": 0.609175281566866, + "grad_norm": 0.18581904470920563, + "learning_rate": 6.651714682313877e-05, + "loss": 0.0131, + "step": 4611 + }, + { + "epoch": 0.60930739505235, + "grad_norm": 0.14752104878425598, + "learning_rate": 6.647798769700824e-05, + "loss": 0.0159, + "step": 4612 + }, + { + "epoch": 0.609439508537834, + "grad_norm": 0.3449000418186188, + "learning_rate": 6.64388343611786e-05, + "loss": 0.0429, + "step": 4613 + }, + { + "epoch": 0.609571622023318, + "grad_norm": 0.12423645704984665, + "learning_rate": 6.639968682241277e-05, + "loss": 0.0111, + "step": 4614 + }, + { + "epoch": 0.6097037355088021, + "grad_norm": 0.25737959146499634, + "learning_rate": 6.636054508747286e-05, + "loss": 0.0232, + "step": 4615 + }, + { + "epoch": 0.6098358489942861, + "grad_norm": 0.20272405445575714, + "learning_rate": 6.632140916311981e-05, + "loss": 0.0195, + "step": 4616 + }, + { + "epoch": 0.6099679624797701, + "grad_norm": 0.3123978078365326, + "learning_rate": 6.62822790561136e-05, + "loss": 0.0341, + "step": 4617 + }, + { + "epoch": 0.6101000759652542, + "grad_norm": 0.15190282464027405, + "learning_rate": 6.624315477321328e-05, + "loss": 0.017, + "step": 4618 + }, + { + "epoch": 0.6102321894507382, + "grad_norm": 0.1630333811044693, + "learning_rate": 6.62040363211768e-05, + "loss": 0.0185, + "step": 4619 + }, + { + "epoch": 0.6103643029362222, + "grad_norm": 0.16900700330734253, + "learning_rate": 6.616492370676114e-05, + "loss": 0.0183, + "step": 4620 + }, + { + "epoch": 0.6104964164217063, + "grad_norm": 0.1657828986644745, + "learning_rate": 6.612581693672231e-05, + "loss": 0.0291, + "step": 4621 + }, + { + "epoch": 0.6106285299071903, + "grad_norm": 0.1700577735900879, + "learning_rate": 6.608671601781525e-05, + "loss": 0.0158, + "step": 4622 + }, + { + "epoch": 0.6107606433926743, + "grad_norm": 0.18941977620124817, + "learning_rate": 6.60476209567939e-05, + "loss": 0.0286, + "step": 4623 + }, + { + "epoch": 0.6108927568781584, + "grad_norm": 0.21406427025794983, + "learning_rate": 6.600853176041121e-05, + "loss": 0.0257, + "step": 4624 + }, + { + "epoch": 0.6110248703636424, + "grad_norm": 0.36508557200431824, + "learning_rate": 6.596944843541913e-05, + "loss": 0.0158, + "step": 4625 + }, + { + "epoch": 0.6111569838491264, + "grad_norm": 0.31032535433769226, + "learning_rate": 6.593037098856853e-05, + "loss": 0.0232, + "step": 4626 + }, + { + "epoch": 0.6112890973346105, + "grad_norm": 0.30940091609954834, + "learning_rate": 6.589129942660936e-05, + "loss": 0.0227, + "step": 4627 + }, + { + "epoch": 0.6114212108200945, + "grad_norm": 0.19503773748874664, + "learning_rate": 6.585223375629044e-05, + "loss": 0.0217, + "step": 4628 + }, + { + "epoch": 0.6115533243055785, + "grad_norm": 0.19577208161354065, + "learning_rate": 6.58131739843597e-05, + "loss": 0.026, + "step": 4629 + }, + { + "epoch": 0.6116854377910625, + "grad_norm": 0.0925767570734024, + "learning_rate": 6.577412011756394e-05, + "loss": 0.0078, + "step": 4630 + }, + { + "epoch": 0.6118175512765466, + "grad_norm": 0.6488147974014282, + "learning_rate": 6.5735072162649e-05, + "loss": 0.0224, + "step": 4631 + }, + { + "epoch": 0.6119496647620306, + "grad_norm": 0.16737666726112366, + "learning_rate": 6.569603012635969e-05, + "loss": 0.0208, + "step": 4632 + }, + { + "epoch": 0.6120817782475146, + "grad_norm": 0.12599046528339386, + "learning_rate": 6.565699401543977e-05, + "loss": 0.0113, + "step": 4633 + }, + { + "epoch": 0.6122138917329987, + "grad_norm": 0.19616693258285522, + "learning_rate": 6.561796383663203e-05, + "loss": 0.0108, + "step": 4634 + }, + { + "epoch": 0.6123460052184827, + "grad_norm": 0.1805287003517151, + "learning_rate": 6.557893959667817e-05, + "loss": 0.0133, + "step": 4635 + }, + { + "epoch": 0.6124781187039667, + "grad_norm": 0.16672852635383606, + "learning_rate": 6.553992130231892e-05, + "loss": 0.0189, + "step": 4636 + }, + { + "epoch": 0.6126102321894508, + "grad_norm": 0.10782662779092789, + "learning_rate": 6.550090896029397e-05, + "loss": 0.0119, + "step": 4637 + }, + { + "epoch": 0.6127423456749348, + "grad_norm": 0.21042101085186005, + "learning_rate": 6.546190257734194e-05, + "loss": 0.0238, + "step": 4638 + }, + { + "epoch": 0.6128744591604188, + "grad_norm": 0.16405358910560608, + "learning_rate": 6.542290216020048e-05, + "loss": 0.017, + "step": 4639 + }, + { + "epoch": 0.6130065726459029, + "grad_norm": 0.22960861027240753, + "learning_rate": 6.538390771560616e-05, + "loss": 0.0229, + "step": 4640 + }, + { + "epoch": 0.6131386861313869, + "grad_norm": 0.1733035296201706, + "learning_rate": 6.534491925029458e-05, + "loss": 0.0073, + "step": 4641 + }, + { + "epoch": 0.6132707996168709, + "grad_norm": 0.1495857983827591, + "learning_rate": 6.530593677100025e-05, + "loss": 0.0172, + "step": 4642 + }, + { + "epoch": 0.613402913102355, + "grad_norm": 0.18175730109214783, + "learning_rate": 6.526696028445663e-05, + "loss": 0.0201, + "step": 4643 + }, + { + "epoch": 0.613535026587839, + "grad_norm": 0.1300545483827591, + "learning_rate": 6.522798979739622e-05, + "loss": 0.0121, + "step": 4644 + }, + { + "epoch": 0.613667140073323, + "grad_norm": 0.16516652703285217, + "learning_rate": 6.518902531655043e-05, + "loss": 0.0181, + "step": 4645 + }, + { + "epoch": 0.613799253558807, + "grad_norm": 0.19759850203990936, + "learning_rate": 6.515006684864963e-05, + "loss": 0.0216, + "step": 4646 + }, + { + "epoch": 0.6139313670442911, + "grad_norm": 0.22777485847473145, + "learning_rate": 6.51111144004232e-05, + "loss": 0.0171, + "step": 4647 + }, + { + "epoch": 0.6140634805297751, + "grad_norm": 0.20385077595710754, + "learning_rate": 6.507216797859944e-05, + "loss": 0.0115, + "step": 4648 + }, + { + "epoch": 0.6141955940152591, + "grad_norm": 0.13005998730659485, + "learning_rate": 6.503322758990559e-05, + "loss": 0.0119, + "step": 4649 + }, + { + "epoch": 0.6143277075007432, + "grad_norm": 0.16478468477725983, + "learning_rate": 6.49942932410679e-05, + "loss": 0.0166, + "step": 4650 + }, + { + "epoch": 0.6144598209862272, + "grad_norm": 0.1881481558084488, + "learning_rate": 6.495536493881154e-05, + "loss": 0.0305, + "step": 4651 + }, + { + "epoch": 0.6145919344717112, + "grad_norm": 0.15473182499408722, + "learning_rate": 6.491644268986064e-05, + "loss": 0.0238, + "step": 4652 + }, + { + "epoch": 0.6147240479571953, + "grad_norm": 0.13667252659797668, + "learning_rate": 6.487752650093832e-05, + "loss": 0.0111, + "step": 4653 + }, + { + "epoch": 0.6148561614426793, + "grad_norm": 0.29269251227378845, + "learning_rate": 6.483861637876657e-05, + "loss": 0.0307, + "step": 4654 + }, + { + "epoch": 0.6149882749281633, + "grad_norm": 0.22216090559959412, + "learning_rate": 6.479971233006645e-05, + "loss": 0.0186, + "step": 4655 + }, + { + "epoch": 0.6151203884136474, + "grad_norm": 0.1534397006034851, + "learning_rate": 6.476081436155787e-05, + "loss": 0.008, + "step": 4656 + }, + { + "epoch": 0.6152525018991314, + "grad_norm": 0.19428527355194092, + "learning_rate": 6.472192247995971e-05, + "loss": 0.0124, + "step": 4657 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.16967034339904785, + "learning_rate": 6.468303669198985e-05, + "loss": 0.0167, + "step": 4658 + }, + { + "epoch": 0.6155167288700994, + "grad_norm": 0.17743651568889618, + "learning_rate": 6.464415700436506e-05, + "loss": 0.0156, + "step": 4659 + }, + { + "epoch": 0.6156488423555835, + "grad_norm": 0.19646964967250824, + "learning_rate": 6.460528342380112e-05, + "loss": 0.0197, + "step": 4660 + }, + { + "epoch": 0.6157809558410675, + "grad_norm": 0.5039405822753906, + "learning_rate": 6.456641595701265e-05, + "loss": 0.0292, + "step": 4661 + }, + { + "epoch": 0.6159130693265515, + "grad_norm": 0.17759563028812408, + "learning_rate": 6.452755461071334e-05, + "loss": 0.0154, + "step": 4662 + }, + { + "epoch": 0.6160451828120356, + "grad_norm": 0.19328869879245758, + "learning_rate": 6.448869939161573e-05, + "loss": 0.0101, + "step": 4663 + }, + { + "epoch": 0.6161772962975196, + "grad_norm": 0.17122767865657806, + "learning_rate": 6.444985030643131e-05, + "loss": 0.0163, + "step": 4664 + }, + { + "epoch": 0.6163094097830036, + "grad_norm": 0.1447526514530182, + "learning_rate": 6.441100736187058e-05, + "loss": 0.0128, + "step": 4665 + }, + { + "epoch": 0.6164415232684877, + "grad_norm": 0.19974549114704132, + "learning_rate": 6.43721705646429e-05, + "loss": 0.0261, + "step": 4666 + }, + { + "epoch": 0.6165736367539717, + "grad_norm": 0.19671718776226044, + "learning_rate": 6.433333992145662e-05, + "loss": 0.0304, + "step": 4667 + }, + { + "epoch": 0.6167057502394557, + "grad_norm": 0.1849774569272995, + "learning_rate": 6.429451543901899e-05, + "loss": 0.0241, + "step": 4668 + }, + { + "epoch": 0.6168378637249398, + "grad_norm": 0.20412255823612213, + "learning_rate": 6.425569712403619e-05, + "loss": 0.0193, + "step": 4669 + }, + { + "epoch": 0.6169699772104238, + "grad_norm": 0.6266623735427856, + "learning_rate": 6.421688498321344e-05, + "loss": 0.0203, + "step": 4670 + }, + { + "epoch": 0.6171020906959078, + "grad_norm": 0.1785677969455719, + "learning_rate": 6.417807902325477e-05, + "loss": 0.0174, + "step": 4671 + }, + { + "epoch": 0.6172342041813919, + "grad_norm": 0.0982740968465805, + "learning_rate": 6.413927925086313e-05, + "loss": 0.01, + "step": 4672 + }, + { + "epoch": 0.6173663176668759, + "grad_norm": 0.12710632383823395, + "learning_rate": 6.410048567274056e-05, + "loss": 0.0121, + "step": 4673 + }, + { + "epoch": 0.6174984311523599, + "grad_norm": 0.30231770873069763, + "learning_rate": 6.406169829558781e-05, + "loss": 0.0206, + "step": 4674 + }, + { + "epoch": 0.617630544637844, + "grad_norm": 0.1966603398323059, + "learning_rate": 6.402291712610477e-05, + "loss": 0.0164, + "step": 4675 + }, + { + "epoch": 0.617762658123328, + "grad_norm": 0.23650838434696198, + "learning_rate": 6.398414217099011e-05, + "loss": 0.0312, + "step": 4676 + }, + { + "epoch": 0.617894771608812, + "grad_norm": 0.19775453209877014, + "learning_rate": 6.39453734369415e-05, + "loss": 0.0226, + "step": 4677 + }, + { + "epoch": 0.618026885094296, + "grad_norm": 0.14034952223300934, + "learning_rate": 6.39066109306555e-05, + "loss": 0.0179, + "step": 4678 + }, + { + "epoch": 0.6181589985797801, + "grad_norm": 0.21675540506839752, + "learning_rate": 6.386785465882761e-05, + "loss": 0.0164, + "step": 4679 + }, + { + "epoch": 0.6182911120652641, + "grad_norm": 0.18577510118484497, + "learning_rate": 6.382910462815228e-05, + "loss": 0.0171, + "step": 4680 + }, + { + "epoch": 0.6184232255507481, + "grad_norm": 0.19053871929645538, + "learning_rate": 6.379036084532279e-05, + "loss": 0.0139, + "step": 4681 + }, + { + "epoch": 0.6185553390362322, + "grad_norm": 0.2634298801422119, + "learning_rate": 6.375162331703146e-05, + "loss": 0.0209, + "step": 4682 + }, + { + "epoch": 0.6186874525217162, + "grad_norm": 0.1316649615764618, + "learning_rate": 6.371289204996946e-05, + "loss": 0.0082, + "step": 4683 + }, + { + "epoch": 0.6188195660072002, + "grad_norm": 0.17446738481521606, + "learning_rate": 6.36741670508269e-05, + "loss": 0.0209, + "step": 4684 + }, + { + "epoch": 0.6189516794926843, + "grad_norm": 0.24308434128761292, + "learning_rate": 6.363544832629277e-05, + "loss": 0.0125, + "step": 4685 + }, + { + "epoch": 0.6190837929781683, + "grad_norm": 0.15134310722351074, + "learning_rate": 6.359673588305502e-05, + "loss": 0.0147, + "step": 4686 + }, + { + "epoch": 0.6192159064636523, + "grad_norm": 0.24589499831199646, + "learning_rate": 6.355802972780052e-05, + "loss": 0.0296, + "step": 4687 + }, + { + "epoch": 0.6193480199491364, + "grad_norm": 0.16589580476284027, + "learning_rate": 6.351932986721498e-05, + "loss": 0.0143, + "step": 4688 + }, + { + "epoch": 0.6194801334346204, + "grad_norm": 0.16544070839881897, + "learning_rate": 6.348063630798316e-05, + "loss": 0.0222, + "step": 4689 + }, + { + "epoch": 0.6196122469201044, + "grad_norm": 0.21558035910129547, + "learning_rate": 6.344194905678858e-05, + "loss": 0.0191, + "step": 4690 + }, + { + "epoch": 0.6197443604055884, + "grad_norm": 0.12022353708744049, + "learning_rate": 6.340326812031378e-05, + "loss": 0.0156, + "step": 4691 + }, + { + "epoch": 0.6198764738910725, + "grad_norm": 0.15502390265464783, + "learning_rate": 6.336459350524016e-05, + "loss": 0.0116, + "step": 4692 + }, + { + "epoch": 0.6200085873765565, + "grad_norm": 0.10256446897983551, + "learning_rate": 6.332592521824801e-05, + "loss": 0.0061, + "step": 4693 + }, + { + "epoch": 0.6201407008620405, + "grad_norm": 0.40712037682533264, + "learning_rate": 6.328726326601659e-05, + "loss": 0.0313, + "step": 4694 + }, + { + "epoch": 0.6202728143475246, + "grad_norm": 0.23398032784461975, + "learning_rate": 6.3248607655224e-05, + "loss": 0.0134, + "step": 4695 + }, + { + "epoch": 0.6204049278330086, + "grad_norm": 0.14428845047950745, + "learning_rate": 6.320995839254731e-05, + "loss": 0.0139, + "step": 4696 + }, + { + "epoch": 0.6205370413184926, + "grad_norm": 0.14608009159564972, + "learning_rate": 6.31713154846624e-05, + "loss": 0.0141, + "step": 4697 + }, + { + "epoch": 0.6206691548039767, + "grad_norm": 0.23591239750385284, + "learning_rate": 6.313267893824417e-05, + "loss": 0.0179, + "step": 4698 + }, + { + "epoch": 0.6208012682894607, + "grad_norm": 0.18376502394676208, + "learning_rate": 6.309404875996632e-05, + "loss": 0.0214, + "step": 4699 + }, + { + "epoch": 0.6209333817749447, + "grad_norm": 0.20980310440063477, + "learning_rate": 6.30554249565015e-05, + "loss": 0.0261, + "step": 4700 + }, + { + "epoch": 0.6210654952604288, + "grad_norm": 0.2826692759990692, + "learning_rate": 6.301680753452129e-05, + "loss": 0.0372, + "step": 4701 + }, + { + "epoch": 0.6211976087459128, + "grad_norm": 0.2074815183877945, + "learning_rate": 6.297819650069605e-05, + "loss": 0.0118, + "step": 4702 + }, + { + "epoch": 0.6213297222313968, + "grad_norm": 0.16915234923362732, + "learning_rate": 6.293959186169521e-05, + "loss": 0.0185, + "step": 4703 + }, + { + "epoch": 0.6214618357168809, + "grad_norm": 0.14982269704341888, + "learning_rate": 6.290099362418689e-05, + "loss": 0.0247, + "step": 4704 + }, + { + "epoch": 0.6215939492023649, + "grad_norm": 0.16975736618041992, + "learning_rate": 6.286240179483831e-05, + "loss": 0.0156, + "step": 4705 + }, + { + "epoch": 0.6217260626878489, + "grad_norm": 0.1671624630689621, + "learning_rate": 6.282381638031545e-05, + "loss": 0.0152, + "step": 4706 + }, + { + "epoch": 0.6218581761733329, + "grad_norm": 0.16445261240005493, + "learning_rate": 6.278523738728317e-05, + "loss": 0.0126, + "step": 4707 + }, + { + "epoch": 0.621990289658817, + "grad_norm": 0.18265871703624725, + "learning_rate": 6.274666482240536e-05, + "loss": 0.0112, + "step": 4708 + }, + { + "epoch": 0.622122403144301, + "grad_norm": 0.17513689398765564, + "learning_rate": 6.270809869234466e-05, + "loss": 0.0254, + "step": 4709 + }, + { + "epoch": 0.622254516629785, + "grad_norm": 0.19515810906887054, + "learning_rate": 6.266953900376265e-05, + "loss": 0.0146, + "step": 4710 + }, + { + "epoch": 0.6223866301152691, + "grad_norm": 0.17156194150447845, + "learning_rate": 6.263098576331978e-05, + "loss": 0.0222, + "step": 4711 + }, + { + "epoch": 0.6225187436007531, + "grad_norm": 0.23880699276924133, + "learning_rate": 6.259243897767546e-05, + "loss": 0.0183, + "step": 4712 + }, + { + "epoch": 0.6226508570862371, + "grad_norm": 0.14619049429893494, + "learning_rate": 6.255389865348787e-05, + "loss": 0.0136, + "step": 4713 + }, + { + "epoch": 0.6227829705717212, + "grad_norm": 0.19037936627864838, + "learning_rate": 6.251536479741414e-05, + "loss": 0.0096, + "step": 4714 + }, + { + "epoch": 0.6229150840572052, + "grad_norm": 0.15618152916431427, + "learning_rate": 6.24768374161103e-05, + "loss": 0.0232, + "step": 4715 + }, + { + "epoch": 0.6230471975426892, + "grad_norm": 0.14753462374210358, + "learning_rate": 6.243831651623118e-05, + "loss": 0.0215, + "step": 4716 + }, + { + "epoch": 0.6231793110281733, + "grad_norm": 0.14098712801933289, + "learning_rate": 6.239980210443061e-05, + "loss": 0.019, + "step": 4717 + }, + { + "epoch": 0.6233114245136573, + "grad_norm": 0.24728527665138245, + "learning_rate": 6.236129418736118e-05, + "loss": 0.0247, + "step": 4718 + }, + { + "epoch": 0.6234435379991413, + "grad_norm": 0.20924845337867737, + "learning_rate": 6.232279277167448e-05, + "loss": 0.0266, + "step": 4719 + }, + { + "epoch": 0.6235756514846253, + "grad_norm": 0.18728812038898468, + "learning_rate": 6.228429786402084e-05, + "loss": 0.0139, + "step": 4720 + }, + { + "epoch": 0.6237077649701093, + "grad_norm": 0.17054276168346405, + "learning_rate": 6.224580947104957e-05, + "loss": 0.0142, + "step": 4721 + }, + { + "epoch": 0.6238398784555933, + "grad_norm": 0.14980489015579224, + "learning_rate": 6.220732759940882e-05, + "loss": 0.0168, + "step": 4722 + }, + { + "epoch": 0.6239719919410773, + "grad_norm": 0.10852868854999542, + "learning_rate": 6.216885225574558e-05, + "loss": 0.0107, + "step": 4723 + }, + { + "epoch": 0.6241041054265614, + "grad_norm": 0.14671917259693146, + "learning_rate": 6.213038344670579e-05, + "loss": 0.0185, + "step": 4724 + }, + { + "epoch": 0.6242362189120454, + "grad_norm": 0.1404392421245575, + "learning_rate": 6.209192117893418e-05, + "loss": 0.0118, + "step": 4725 + }, + { + "epoch": 0.6243683323975294, + "grad_norm": 0.17956751585006714, + "learning_rate": 6.205346545907445e-05, + "loss": 0.0196, + "step": 4726 + }, + { + "epoch": 0.6245004458830135, + "grad_norm": 0.14269684255123138, + "learning_rate": 6.201501629376906e-05, + "loss": 0.0148, + "step": 4727 + }, + { + "epoch": 0.6246325593684975, + "grad_norm": 0.11928752064704895, + "learning_rate": 6.197657368965934e-05, + "loss": 0.0058, + "step": 4728 + }, + { + "epoch": 0.6247646728539815, + "grad_norm": 0.27102896571159363, + "learning_rate": 6.193813765338561e-05, + "loss": 0.0393, + "step": 4729 + }, + { + "epoch": 0.6248967863394655, + "grad_norm": 0.1431749016046524, + "learning_rate": 6.189970819158696e-05, + "loss": 0.0079, + "step": 4730 + }, + { + "epoch": 0.6250288998249496, + "grad_norm": 0.12428173422813416, + "learning_rate": 6.186128531090132e-05, + "loss": 0.0142, + "step": 4731 + }, + { + "epoch": 0.6251610133104336, + "grad_norm": 0.12851598858833313, + "learning_rate": 6.182286901796558e-05, + "loss": 0.0165, + "step": 4732 + }, + { + "epoch": 0.6252931267959176, + "grad_norm": 0.14972122013568878, + "learning_rate": 6.17844593194154e-05, + "loss": 0.0221, + "step": 4733 + }, + { + "epoch": 0.6254252402814017, + "grad_norm": 0.1200818419456482, + "learning_rate": 6.174605622188536e-05, + "loss": 0.0173, + "step": 4734 + }, + { + "epoch": 0.6255573537668857, + "grad_norm": 0.10524090379476547, + "learning_rate": 6.170765973200887e-05, + "loss": 0.0198, + "step": 4735 + }, + { + "epoch": 0.6256894672523697, + "grad_norm": 0.07244350016117096, + "learning_rate": 6.166926985641817e-05, + "loss": 0.0084, + "step": 4736 + }, + { + "epoch": 0.6258215807378538, + "grad_norm": 0.1618015170097351, + "learning_rate": 6.163088660174443e-05, + "loss": 0.0174, + "step": 4737 + }, + { + "epoch": 0.6259536942233378, + "grad_norm": 0.3594222664833069, + "learning_rate": 6.159250997461763e-05, + "loss": 0.0215, + "step": 4738 + }, + { + "epoch": 0.6260858077088218, + "grad_norm": 0.163102388381958, + "learning_rate": 6.155413998166664e-05, + "loss": 0.0136, + "step": 4739 + }, + { + "epoch": 0.6262179211943059, + "grad_norm": 0.1344098001718521, + "learning_rate": 6.15157766295191e-05, + "loss": 0.0156, + "step": 4740 + }, + { + "epoch": 0.6263500346797899, + "grad_norm": 0.12539459764957428, + "learning_rate": 6.147741992480163e-05, + "loss": 0.0197, + "step": 4741 + }, + { + "epoch": 0.6264821481652739, + "grad_norm": 0.1360604166984558, + "learning_rate": 6.143906987413959e-05, + "loss": 0.0184, + "step": 4742 + }, + { + "epoch": 0.626614261650758, + "grad_norm": 0.12114249914884567, + "learning_rate": 6.140072648415722e-05, + "loss": 0.0107, + "step": 4743 + }, + { + "epoch": 0.626746375136242, + "grad_norm": 0.19674722850322723, + "learning_rate": 6.136238976147768e-05, + "loss": 0.0165, + "step": 4744 + }, + { + "epoch": 0.626878488621726, + "grad_norm": 0.23598308861255646, + "learning_rate": 6.132405971272288e-05, + "loss": 0.0221, + "step": 4745 + }, + { + "epoch": 0.62701060210721, + "grad_norm": 0.12908229231834412, + "learning_rate": 6.128573634451364e-05, + "loss": 0.0131, + "step": 4746 + }, + { + "epoch": 0.6271427155926941, + "grad_norm": 0.32704001665115356, + "learning_rate": 6.124741966346956e-05, + "loss": 0.0318, + "step": 4747 + }, + { + "epoch": 0.6272748290781781, + "grad_norm": 0.1676252782344818, + "learning_rate": 6.120910967620921e-05, + "loss": 0.0216, + "step": 4748 + }, + { + "epoch": 0.6274069425636621, + "grad_norm": 0.15792980790138245, + "learning_rate": 6.117080638934987e-05, + "loss": 0.0179, + "step": 4749 + }, + { + "epoch": 0.6275390560491462, + "grad_norm": 0.12319156527519226, + "learning_rate": 6.113250980950772e-05, + "loss": 0.0142, + "step": 4750 + }, + { + "epoch": 0.6276711695346302, + "grad_norm": 0.17394210398197174, + "learning_rate": 6.109421994329778e-05, + "loss": 0.0188, + "step": 4751 + }, + { + "epoch": 0.6278032830201142, + "grad_norm": 0.19802264869213104, + "learning_rate": 6.10559367973339e-05, + "loss": 0.017, + "step": 4752 + }, + { + "epoch": 0.6279353965055983, + "grad_norm": 0.14336781203746796, + "learning_rate": 6.1017660378228814e-05, + "loss": 0.0161, + "step": 4753 + }, + { + "epoch": 0.6280675099910823, + "grad_norm": 0.19559355080127716, + "learning_rate": 6.097939069259402e-05, + "loss": 0.0112, + "step": 4754 + }, + { + "epoch": 0.6281996234765663, + "grad_norm": 0.11833098530769348, + "learning_rate": 6.0941127747039914e-05, + "loss": 0.0076, + "step": 4755 + }, + { + "epoch": 0.6283317369620504, + "grad_norm": 0.2858635187149048, + "learning_rate": 6.09028715481757e-05, + "loss": 0.028, + "step": 4756 + }, + { + "epoch": 0.6284638504475344, + "grad_norm": 0.2030288726091385, + "learning_rate": 6.0864622102609395e-05, + "loss": 0.0207, + "step": 4757 + }, + { + "epoch": 0.6285959639330184, + "grad_norm": 0.1523391306400299, + "learning_rate": 6.082637941694792e-05, + "loss": 0.0165, + "step": 4758 + }, + { + "epoch": 0.6287280774185025, + "grad_norm": 0.17553837597370148, + "learning_rate": 6.078814349779693e-05, + "loss": 0.018, + "step": 4759 + }, + { + "epoch": 0.6288601909039865, + "grad_norm": 0.214276984333992, + "learning_rate": 6.074991435176103e-05, + "loss": 0.0291, + "step": 4760 + }, + { + "epoch": 0.6289923043894705, + "grad_norm": 0.14726516604423523, + "learning_rate": 6.071169198544353e-05, + "loss": 0.022, + "step": 4761 + }, + { + "epoch": 0.6291244178749545, + "grad_norm": 0.13246333599090576, + "learning_rate": 6.067347640544668e-05, + "loss": 0.0102, + "step": 4762 + }, + { + "epoch": 0.6292565313604386, + "grad_norm": 0.204450324177742, + "learning_rate": 6.0635267618371485e-05, + "loss": 0.027, + "step": 4763 + }, + { + "epoch": 0.6293886448459226, + "grad_norm": 0.14314386248588562, + "learning_rate": 6.059706563081777e-05, + "loss": 0.0129, + "step": 4764 + }, + { + "epoch": 0.6295207583314066, + "grad_norm": 0.3385053873062134, + "learning_rate": 6.055887044938426e-05, + "loss": 0.018, + "step": 4765 + }, + { + "epoch": 0.6296528718168907, + "grad_norm": 0.14576059579849243, + "learning_rate": 6.052068208066841e-05, + "loss": 0.0083, + "step": 4766 + }, + { + "epoch": 0.6297849853023747, + "grad_norm": 0.22583547234535217, + "learning_rate": 6.048250053126661e-05, + "loss": 0.0127, + "step": 4767 + }, + { + "epoch": 0.6299170987878587, + "grad_norm": 0.26217544078826904, + "learning_rate": 6.044432580777395e-05, + "loss": 0.0125, + "step": 4768 + }, + { + "epoch": 0.6300492122733428, + "grad_norm": 0.12332071363925934, + "learning_rate": 6.040615791678444e-05, + "loss": 0.0104, + "step": 4769 + }, + { + "epoch": 0.6301813257588268, + "grad_norm": 0.2592184543609619, + "learning_rate": 6.036799686489085e-05, + "loss": 0.0328, + "step": 4770 + }, + { + "epoch": 0.6303134392443108, + "grad_norm": 0.10757103562355042, + "learning_rate": 6.032984265868478e-05, + "loss": 0.0145, + "step": 4771 + }, + { + "epoch": 0.6304455527297949, + "grad_norm": 0.13460460305213928, + "learning_rate": 6.029169530475668e-05, + "loss": 0.0179, + "step": 4772 + }, + { + "epoch": 0.6305776662152789, + "grad_norm": 0.1447414755821228, + "learning_rate": 6.0253554809695765e-05, + "loss": 0.0176, + "step": 4773 + }, + { + "epoch": 0.6307097797007629, + "grad_norm": 0.2827383875846863, + "learning_rate": 6.021542118009012e-05, + "loss": 0.0216, + "step": 4774 + }, + { + "epoch": 0.630841893186247, + "grad_norm": 0.13897140324115753, + "learning_rate": 6.0177294422526584e-05, + "loss": 0.0149, + "step": 4775 + }, + { + "epoch": 0.630974006671731, + "grad_norm": 0.22664396464824677, + "learning_rate": 6.013917454359088e-05, + "loss": 0.0114, + "step": 4776 + }, + { + "epoch": 0.631106120157215, + "grad_norm": 0.19409899413585663, + "learning_rate": 6.01010615498675e-05, + "loss": 0.0221, + "step": 4777 + }, + { + "epoch": 0.631238233642699, + "grad_norm": 0.21891489624977112, + "learning_rate": 6.0062955447939694e-05, + "loss": 0.031, + "step": 4778 + }, + { + "epoch": 0.6313703471281831, + "grad_norm": 0.1206694096326828, + "learning_rate": 6.002485624438965e-05, + "loss": 0.0104, + "step": 4779 + }, + { + "epoch": 0.6315024606136671, + "grad_norm": 0.23604559898376465, + "learning_rate": 5.998676394579824e-05, + "loss": 0.023, + "step": 4780 + }, + { + "epoch": 0.6316345740991511, + "grad_norm": 0.22032544016838074, + "learning_rate": 5.994867855874524e-05, + "loss": 0.016, + "step": 4781 + }, + { + "epoch": 0.6317666875846352, + "grad_norm": 0.15425023436546326, + "learning_rate": 5.991060008980916e-05, + "loss": 0.0132, + "step": 4782 + }, + { + "epoch": 0.6318988010701192, + "grad_norm": 0.1916843056678772, + "learning_rate": 5.9872528545567365e-05, + "loss": 0.0273, + "step": 4783 + }, + { + "epoch": 0.6320309145556032, + "grad_norm": 0.22243209183216095, + "learning_rate": 5.9834463932595984e-05, + "loss": 0.0239, + "step": 4784 + }, + { + "epoch": 0.6321630280410873, + "grad_norm": 0.16001495718955994, + "learning_rate": 5.979640625746996e-05, + "loss": 0.0135, + "step": 4785 + }, + { + "epoch": 0.6322951415265713, + "grad_norm": 0.20562537014484406, + "learning_rate": 5.975835552676303e-05, + "loss": 0.0169, + "step": 4786 + }, + { + "epoch": 0.6324272550120553, + "grad_norm": 0.19932161271572113, + "learning_rate": 5.972031174704782e-05, + "loss": 0.0173, + "step": 4787 + }, + { + "epoch": 0.6325593684975394, + "grad_norm": 0.17273645102977753, + "learning_rate": 5.968227492489562e-05, + "loss": 0.0225, + "step": 4788 + }, + { + "epoch": 0.6326914819830234, + "grad_norm": 0.20375372469425201, + "learning_rate": 5.96442450668766e-05, + "loss": 0.0197, + "step": 4789 + }, + { + "epoch": 0.6328235954685074, + "grad_norm": 0.189348503947258, + "learning_rate": 5.960622217955969e-05, + "loss": 0.0156, + "step": 4790 + }, + { + "epoch": 0.6329557089539914, + "grad_norm": 0.1797255575656891, + "learning_rate": 5.956820626951267e-05, + "loss": 0.0157, + "step": 4791 + }, + { + "epoch": 0.6330878224394755, + "grad_norm": 0.178375706076622, + "learning_rate": 5.9530197343302054e-05, + "loss": 0.0136, + "step": 4792 + }, + { + "epoch": 0.6332199359249595, + "grad_norm": 0.15374751389026642, + "learning_rate": 5.9492195407493144e-05, + "loss": 0.0177, + "step": 4793 + }, + { + "epoch": 0.6333520494104435, + "grad_norm": 0.17693348228931427, + "learning_rate": 5.945420046865011e-05, + "loss": 0.0204, + "step": 4794 + }, + { + "epoch": 0.6334841628959276, + "grad_norm": 0.236887127161026, + "learning_rate": 5.941621253333585e-05, + "loss": 0.0131, + "step": 4795 + }, + { + "epoch": 0.6336162763814116, + "grad_norm": 0.13650371134281158, + "learning_rate": 5.937823160811207e-05, + "loss": 0.0148, + "step": 4796 + }, + { + "epoch": 0.6337483898668956, + "grad_norm": 0.17048349976539612, + "learning_rate": 5.9340257699539236e-05, + "loss": 0.0119, + "step": 4797 + }, + { + "epoch": 0.6338805033523797, + "grad_norm": 0.11208688467741013, + "learning_rate": 5.9302290814176684e-05, + "loss": 0.0083, + "step": 4798 + }, + { + "epoch": 0.6340126168378637, + "grad_norm": 0.1634460985660553, + "learning_rate": 5.926433095858247e-05, + "loss": 0.0128, + "step": 4799 + }, + { + "epoch": 0.6341447303233477, + "grad_norm": 0.16812430322170258, + "learning_rate": 5.922637813931341e-05, + "loss": 0.0181, + "step": 4800 + }, + { + "epoch": 0.6342768438088318, + "grad_norm": 0.13605019450187683, + "learning_rate": 5.9188432362925196e-05, + "loss": 0.014, + "step": 4801 + }, + { + "epoch": 0.6344089572943158, + "grad_norm": 0.11597549915313721, + "learning_rate": 5.9150493635972194e-05, + "loss": 0.0168, + "step": 4802 + }, + { + "epoch": 0.6345410707797998, + "grad_norm": 0.2631230056285858, + "learning_rate": 5.9112561965007676e-05, + "loss": 0.0202, + "step": 4803 + }, + { + "epoch": 0.6346731842652839, + "grad_norm": 0.1404346376657486, + "learning_rate": 5.9074637356583564e-05, + "loss": 0.0124, + "step": 4804 + }, + { + "epoch": 0.6348052977507679, + "grad_norm": 0.13995184004306793, + "learning_rate": 5.9036719817250675e-05, + "loss": 0.0122, + "step": 4805 + }, + { + "epoch": 0.6349374112362519, + "grad_norm": 0.12033804506063461, + "learning_rate": 5.899880935355854e-05, + "loss": 0.0094, + "step": 4806 + }, + { + "epoch": 0.635069524721736, + "grad_norm": 0.19234386086463928, + "learning_rate": 5.896090597205546e-05, + "loss": 0.0171, + "step": 4807 + }, + { + "epoch": 0.63520163820722, + "grad_norm": 0.1520492434501648, + "learning_rate": 5.8923009679288565e-05, + "loss": 0.0184, + "step": 4808 + }, + { + "epoch": 0.635333751692704, + "grad_norm": 0.15152016282081604, + "learning_rate": 5.8885120481803715e-05, + "loss": 0.0178, + "step": 4809 + }, + { + "epoch": 0.635465865178188, + "grad_norm": 0.15282461047172546, + "learning_rate": 5.884723838614559e-05, + "loss": 0.0156, + "step": 4810 + }, + { + "epoch": 0.6355979786636721, + "grad_norm": 0.2525296211242676, + "learning_rate": 5.880936339885754e-05, + "loss": 0.0305, + "step": 4811 + }, + { + "epoch": 0.6357300921491561, + "grad_norm": 0.19699762761592865, + "learning_rate": 5.877149552648186e-05, + "loss": 0.0171, + "step": 4812 + }, + { + "epoch": 0.6358622056346401, + "grad_norm": 0.10816171765327454, + "learning_rate": 5.8733634775559456e-05, + "loss": 0.0124, + "step": 4813 + }, + { + "epoch": 0.6359943191201242, + "grad_norm": 0.34312406182289124, + "learning_rate": 5.869578115263006e-05, + "loss": 0.0277, + "step": 4814 + }, + { + "epoch": 0.6361264326056082, + "grad_norm": 0.1626928299665451, + "learning_rate": 5.8657934664232205e-05, + "loss": 0.0242, + "step": 4815 + }, + { + "epoch": 0.6362585460910922, + "grad_norm": 0.12160996347665787, + "learning_rate": 5.862009531690313e-05, + "loss": 0.014, + "step": 4816 + }, + { + "epoch": 0.6363906595765763, + "grad_norm": 0.2601321339607239, + "learning_rate": 5.858226311717894e-05, + "loss": 0.0173, + "step": 4817 + }, + { + "epoch": 0.6365227730620603, + "grad_norm": 0.1506567746400833, + "learning_rate": 5.8544438071594354e-05, + "loss": 0.0216, + "step": 4818 + }, + { + "epoch": 0.6366548865475443, + "grad_norm": 0.20273055136203766, + "learning_rate": 5.8506620186683014e-05, + "loss": 0.0238, + "step": 4819 + }, + { + "epoch": 0.6367870000330283, + "grad_norm": 0.1697157919406891, + "learning_rate": 5.846880946897722e-05, + "loss": 0.0164, + "step": 4820 + }, + { + "epoch": 0.6369191135185124, + "grad_norm": 0.10192979127168655, + "learning_rate": 5.843100592500805e-05, + "loss": 0.0113, + "step": 4821 + }, + { + "epoch": 0.6370512270039964, + "grad_norm": 0.1476384401321411, + "learning_rate": 5.839320956130542e-05, + "loss": 0.0095, + "step": 4822 + }, + { + "epoch": 0.6371833404894804, + "grad_norm": 0.16391359269618988, + "learning_rate": 5.8355420384397865e-05, + "loss": 0.0176, + "step": 4823 + }, + { + "epoch": 0.6373154539749645, + "grad_norm": 0.124471016228199, + "learning_rate": 5.83176384008128e-05, + "loss": 0.0102, + "step": 4824 + }, + { + "epoch": 0.6374475674604485, + "grad_norm": 0.17791259288787842, + "learning_rate": 5.82798636170764e-05, + "loss": 0.0136, + "step": 4825 + }, + { + "epoch": 0.6375796809459325, + "grad_norm": 0.2600245475769043, + "learning_rate": 5.824209603971347e-05, + "loss": 0.0181, + "step": 4826 + }, + { + "epoch": 0.6377117944314166, + "grad_norm": 0.2228402942419052, + "learning_rate": 5.8204335675247676e-05, + "loss": 0.0164, + "step": 4827 + }, + { + "epoch": 0.6378439079169006, + "grad_norm": 0.1939469873905182, + "learning_rate": 5.8166582530201483e-05, + "loss": 0.0214, + "step": 4828 + }, + { + "epoch": 0.6379760214023846, + "grad_norm": 0.38613161444664, + "learning_rate": 5.812883661109594e-05, + "loss": 0.0219, + "step": 4829 + }, + { + "epoch": 0.6381081348878687, + "grad_norm": 0.1431112140417099, + "learning_rate": 5.8091097924451e-05, + "loss": 0.0173, + "step": 4830 + }, + { + "epoch": 0.6382402483733527, + "grad_norm": 0.17689582705497742, + "learning_rate": 5.805336647678531e-05, + "loss": 0.0213, + "step": 4831 + }, + { + "epoch": 0.6383723618588367, + "grad_norm": 0.12897051870822906, + "learning_rate": 5.801564227461631e-05, + "loss": 0.0157, + "step": 4832 + }, + { + "epoch": 0.6385044753443208, + "grad_norm": 0.1856435388326645, + "learning_rate": 5.797792532446008e-05, + "loss": 0.0159, + "step": 4833 + }, + { + "epoch": 0.6386365888298048, + "grad_norm": 0.21794654428958893, + "learning_rate": 5.794021563283154e-05, + "loss": 0.0111, + "step": 4834 + }, + { + "epoch": 0.6387687023152888, + "grad_norm": 0.21521590650081635, + "learning_rate": 5.7902513206244404e-05, + "loss": 0.0244, + "step": 4835 + }, + { + "epoch": 0.6389008158007728, + "grad_norm": 0.18167127668857574, + "learning_rate": 5.786481805121096e-05, + "loss": 0.0131, + "step": 4836 + }, + { + "epoch": 0.6390329292862569, + "grad_norm": 0.1692265123128891, + "learning_rate": 5.782713017424237e-05, + "loss": 0.0151, + "step": 4837 + }, + { + "epoch": 0.6391650427717409, + "grad_norm": 0.25949326157569885, + "learning_rate": 5.7789449581848534e-05, + "loss": 0.0149, + "step": 4838 + }, + { + "epoch": 0.6392971562572249, + "grad_norm": 0.09822472929954529, + "learning_rate": 5.7751776280538104e-05, + "loss": 0.0088, + "step": 4839 + }, + { + "epoch": 0.639429269742709, + "grad_norm": 0.16368919610977173, + "learning_rate": 5.7714110276818354e-05, + "loss": 0.0161, + "step": 4840 + }, + { + "epoch": 0.639561383228193, + "grad_norm": 0.12360408902168274, + "learning_rate": 5.7676451577195425e-05, + "loss": 0.0135, + "step": 4841 + }, + { + "epoch": 0.639693496713677, + "grad_norm": 0.17056262493133545, + "learning_rate": 5.763880018817418e-05, + "loss": 0.0191, + "step": 4842 + }, + { + "epoch": 0.6398256101991611, + "grad_norm": 0.1338781863451004, + "learning_rate": 5.760115611625814e-05, + "loss": 0.0167, + "step": 4843 + }, + { + "epoch": 0.6399577236846451, + "grad_norm": 0.30153313279151917, + "learning_rate": 5.756351936794961e-05, + "loss": 0.0173, + "step": 4844 + }, + { + "epoch": 0.6400898371701291, + "grad_norm": 0.1391494870185852, + "learning_rate": 5.75258899497497e-05, + "loss": 0.0094, + "step": 4845 + }, + { + "epoch": 0.6402219506556132, + "grad_norm": 0.20900259912014008, + "learning_rate": 5.748826786815813e-05, + "loss": 0.0297, + "step": 4846 + }, + { + "epoch": 0.6403540641410972, + "grad_norm": 0.16079430282115936, + "learning_rate": 5.745065312967344e-05, + "loss": 0.0143, + "step": 4847 + }, + { + "epoch": 0.6404861776265812, + "grad_norm": 0.19876495003700256, + "learning_rate": 5.741304574079285e-05, + "loss": 0.0127, + "step": 4848 + }, + { + "epoch": 0.6406182911120653, + "grad_norm": 0.13469110429286957, + "learning_rate": 5.73754457080124e-05, + "loss": 0.0098, + "step": 4849 + }, + { + "epoch": 0.6407504045975493, + "grad_norm": 0.15722264349460602, + "learning_rate": 5.7337853037826706e-05, + "loss": 0.018, + "step": 4850 + }, + { + "epoch": 0.6408825180830333, + "grad_norm": 0.1624094843864441, + "learning_rate": 5.730026773672923e-05, + "loss": 0.0168, + "step": 4851 + }, + { + "epoch": 0.6410146315685173, + "grad_norm": 0.35877251625061035, + "learning_rate": 5.726268981121217e-05, + "loss": 0.0136, + "step": 4852 + }, + { + "epoch": 0.6411467450540014, + "grad_norm": 0.15343332290649414, + "learning_rate": 5.7225119267766326e-05, + "loss": 0.0125, + "step": 4853 + }, + { + "epoch": 0.6412788585394854, + "grad_norm": 0.21046748757362366, + "learning_rate": 5.718755611288137e-05, + "loss": 0.021, + "step": 4854 + }, + { + "epoch": 0.6414109720249694, + "grad_norm": 0.13725142180919647, + "learning_rate": 5.715000035304561e-05, + "loss": 0.011, + "step": 4855 + }, + { + "epoch": 0.6415430855104535, + "grad_norm": 0.17193253338336945, + "learning_rate": 5.7112451994746154e-05, + "loss": 0.0238, + "step": 4856 + }, + { + "epoch": 0.6416751989959375, + "grad_norm": 0.1537133902311325, + "learning_rate": 5.7074911044468696e-05, + "loss": 0.0146, + "step": 4857 + }, + { + "epoch": 0.6418073124814215, + "grad_norm": 0.12296643853187561, + "learning_rate": 5.7037377508697774e-05, + "loss": 0.0086, + "step": 4858 + }, + { + "epoch": 0.6419394259669056, + "grad_norm": 0.19924704730510712, + "learning_rate": 5.6999851393916645e-05, + "loss": 0.0096, + "step": 4859 + }, + { + "epoch": 0.6420715394523896, + "grad_norm": 0.15559297800064087, + "learning_rate": 5.696233270660716e-05, + "loss": 0.0148, + "step": 4860 + }, + { + "epoch": 0.6422036529378736, + "grad_norm": 0.2160009890794754, + "learning_rate": 5.692482145325002e-05, + "loss": 0.0216, + "step": 4861 + }, + { + "epoch": 0.6423357664233577, + "grad_norm": 0.2480064034461975, + "learning_rate": 5.688731764032458e-05, + "loss": 0.0223, + "step": 4862 + }, + { + "epoch": 0.6424678799088417, + "grad_norm": 0.1594134420156479, + "learning_rate": 5.684982127430895e-05, + "loss": 0.0184, + "step": 4863 + }, + { + "epoch": 0.6425999933943257, + "grad_norm": 0.25493767857551575, + "learning_rate": 5.681233236167989e-05, + "loss": 0.0167, + "step": 4864 + }, + { + "epoch": 0.6427321068798098, + "grad_norm": 0.1988016664981842, + "learning_rate": 5.6774850908912926e-05, + "loss": 0.0168, + "step": 4865 + }, + { + "epoch": 0.6428642203652938, + "grad_norm": 0.19576209783554077, + "learning_rate": 5.6737376922482296e-05, + "loss": 0.0172, + "step": 4866 + }, + { + "epoch": 0.6429963338507778, + "grad_norm": 0.16945160925388336, + "learning_rate": 5.669991040886088e-05, + "loss": 0.0156, + "step": 4867 + }, + { + "epoch": 0.6431284473362618, + "grad_norm": 0.147054985165596, + "learning_rate": 5.666245137452034e-05, + "loss": 0.0107, + "step": 4868 + }, + { + "epoch": 0.6432605608217459, + "grad_norm": 0.1534264087677002, + "learning_rate": 5.662499982593104e-05, + "loss": 0.0215, + "step": 4869 + }, + { + "epoch": 0.6433926743072299, + "grad_norm": 0.15351586043834686, + "learning_rate": 5.6587555769562064e-05, + "loss": 0.0116, + "step": 4870 + }, + { + "epoch": 0.6435247877927139, + "grad_norm": 0.10792558640241623, + "learning_rate": 5.6550119211881095e-05, + "loss": 0.0099, + "step": 4871 + }, + { + "epoch": 0.643656901278198, + "grad_norm": 0.20718395709991455, + "learning_rate": 5.651269015935463e-05, + "loss": 0.022, + "step": 4872 + }, + { + "epoch": 0.643789014763682, + "grad_norm": 0.15308907628059387, + "learning_rate": 5.6475268618447896e-05, + "loss": 0.0166, + "step": 4873 + }, + { + "epoch": 0.643921128249166, + "grad_norm": 0.13652434945106506, + "learning_rate": 5.643785459562466e-05, + "loss": 0.0148, + "step": 4874 + }, + { + "epoch": 0.6440532417346501, + "grad_norm": 0.14849905669689178, + "learning_rate": 5.640044809734756e-05, + "loss": 0.0182, + "step": 4875 + }, + { + "epoch": 0.6441853552201341, + "grad_norm": 0.14648254215717316, + "learning_rate": 5.636304913007786e-05, + "loss": 0.0115, + "step": 4876 + }, + { + "epoch": 0.6443174687056181, + "grad_norm": 0.21516041457653046, + "learning_rate": 5.6325657700275555e-05, + "loss": 0.0135, + "step": 4877 + }, + { + "epoch": 0.6444495821911022, + "grad_norm": 0.1507827490568161, + "learning_rate": 5.6288273814399276e-05, + "loss": 0.0209, + "step": 4878 + }, + { + "epoch": 0.6445816956765862, + "grad_norm": 0.17382963001728058, + "learning_rate": 5.6250897478906396e-05, + "loss": 0.0056, + "step": 4879 + }, + { + "epoch": 0.6447138091620702, + "grad_norm": 0.19124169647693634, + "learning_rate": 5.621352870025302e-05, + "loss": 0.021, + "step": 4880 + }, + { + "epoch": 0.6448459226475542, + "grad_norm": 0.1883854866027832, + "learning_rate": 5.617616748489384e-05, + "loss": 0.0276, + "step": 4881 + }, + { + "epoch": 0.6449780361330383, + "grad_norm": 0.15622587502002716, + "learning_rate": 5.6138813839282346e-05, + "loss": 0.0125, + "step": 4882 + }, + { + "epoch": 0.6451101496185223, + "grad_norm": 0.23187081515789032, + "learning_rate": 5.6101467769870666e-05, + "loss": 0.0176, + "step": 4883 + }, + { + "epoch": 0.6452422631040063, + "grad_norm": 0.25262168049812317, + "learning_rate": 5.606412928310969e-05, + "loss": 0.0224, + "step": 4884 + }, + { + "epoch": 0.6453743765894904, + "grad_norm": 0.1560918241739273, + "learning_rate": 5.6026798385448866e-05, + "loss": 0.023, + "step": 4885 + }, + { + "epoch": 0.6455064900749744, + "grad_norm": 0.16323047876358032, + "learning_rate": 5.598947508333643e-05, + "loss": 0.0235, + "step": 4886 + }, + { + "epoch": 0.6456386035604584, + "grad_norm": 0.16999828815460205, + "learning_rate": 5.595215938321934e-05, + "loss": 0.0088, + "step": 4887 + }, + { + "epoch": 0.6457707170459425, + "grad_norm": 0.12709566950798035, + "learning_rate": 5.5914851291543104e-05, + "loss": 0.0105, + "step": 4888 + }, + { + "epoch": 0.6459028305314265, + "grad_norm": 0.17557081580162048, + "learning_rate": 5.587755081475203e-05, + "loss": 0.0171, + "step": 4889 + }, + { + "epoch": 0.6460349440169105, + "grad_norm": 0.14582808315753937, + "learning_rate": 5.584025795928909e-05, + "loss": 0.0136, + "step": 4890 + }, + { + "epoch": 0.6461670575023946, + "grad_norm": 0.20256875455379486, + "learning_rate": 5.580297273159596e-05, + "loss": 0.0205, + "step": 4891 + }, + { + "epoch": 0.6462991709878786, + "grad_norm": 0.2181100994348526, + "learning_rate": 5.5765695138112896e-05, + "loss": 0.0252, + "step": 4892 + }, + { + "epoch": 0.6464312844733626, + "grad_norm": 0.17208850383758545, + "learning_rate": 5.572842518527892e-05, + "loss": 0.0211, + "step": 4893 + }, + { + "epoch": 0.6465633979588467, + "grad_norm": 0.1517520546913147, + "learning_rate": 5.5691162879531774e-05, + "loss": 0.0148, + "step": 4894 + }, + { + "epoch": 0.6466955114443307, + "grad_norm": 0.13908398151397705, + "learning_rate": 5.5653908227307764e-05, + "loss": 0.0136, + "step": 4895 + }, + { + "epoch": 0.6468276249298147, + "grad_norm": 0.20299845933914185, + "learning_rate": 5.5616661235041945e-05, + "loss": 0.0157, + "step": 4896 + }, + { + "epoch": 0.6469597384152987, + "grad_norm": 0.12722185254096985, + "learning_rate": 5.557942190916805e-05, + "loss": 0.0079, + "step": 4897 + }, + { + "epoch": 0.6470918519007828, + "grad_norm": 0.27305638790130615, + "learning_rate": 5.554219025611853e-05, + "loss": 0.0303, + "step": 4898 + }, + { + "epoch": 0.6472239653862668, + "grad_norm": 0.19381427764892578, + "learning_rate": 5.550496628232435e-05, + "loss": 0.022, + "step": 4899 + }, + { + "epoch": 0.6473560788717508, + "grad_norm": 0.17927169799804688, + "learning_rate": 5.5467749994215315e-05, + "loss": 0.0175, + "step": 4900 + }, + { + "epoch": 0.6474881923572349, + "grad_norm": 0.10161993652582169, + "learning_rate": 5.543054139821986e-05, + "loss": 0.0061, + "step": 4901 + }, + { + "epoch": 0.6476203058427189, + "grad_norm": 0.14280255138874054, + "learning_rate": 5.539334050076503e-05, + "loss": 0.0125, + "step": 4902 + }, + { + "epoch": 0.6477524193282029, + "grad_norm": 0.22020979225635529, + "learning_rate": 5.535614730827656e-05, + "loss": 0.0193, + "step": 4903 + }, + { + "epoch": 0.647884532813687, + "grad_norm": 0.17913341522216797, + "learning_rate": 5.531896182717901e-05, + "loss": 0.0127, + "step": 4904 + }, + { + "epoch": 0.648016646299171, + "grad_norm": 0.10980281233787537, + "learning_rate": 5.528178406389535e-05, + "loss": 0.0091, + "step": 4905 + }, + { + "epoch": 0.648148759784655, + "grad_norm": 0.1731892079114914, + "learning_rate": 5.5244614024847374e-05, + "loss": 0.0151, + "step": 4906 + }, + { + "epoch": 0.6482808732701391, + "grad_norm": 0.15978483855724335, + "learning_rate": 5.520745171645556e-05, + "loss": 0.0192, + "step": 4907 + }, + { + "epoch": 0.6484129867556231, + "grad_norm": 0.15524441003799438, + "learning_rate": 5.517029714513893e-05, + "loss": 0.0138, + "step": 4908 + }, + { + "epoch": 0.6485451002411071, + "grad_norm": 0.11588185280561447, + "learning_rate": 5.513315031731527e-05, + "loss": 0.0107, + "step": 4909 + }, + { + "epoch": 0.6486772137265912, + "grad_norm": 0.13747727870941162, + "learning_rate": 5.509601123940103e-05, + "loss": 0.0171, + "step": 4910 + }, + { + "epoch": 0.6488093272120752, + "grad_norm": 0.14048703014850616, + "learning_rate": 5.5058879917811276e-05, + "loss": 0.0212, + "step": 4911 + }, + { + "epoch": 0.6489414406975592, + "grad_norm": 0.17134098708629608, + "learning_rate": 5.502175635895972e-05, + "loss": 0.017, + "step": 4912 + }, + { + "epoch": 0.6490735541830432, + "grad_norm": 0.1490405946969986, + "learning_rate": 5.498464056925876e-05, + "loss": 0.0183, + "step": 4913 + }, + { + "epoch": 0.6492056676685273, + "grad_norm": 0.2269468903541565, + "learning_rate": 5.494753255511953e-05, + "loss": 0.0236, + "step": 4914 + }, + { + "epoch": 0.6493377811540113, + "grad_norm": 0.16695119440555573, + "learning_rate": 5.4910432322951656e-05, + "loss": 0.0218, + "step": 4915 + }, + { + "epoch": 0.6494698946394953, + "grad_norm": 0.14104008674621582, + "learning_rate": 5.4873339879163545e-05, + "loss": 0.0114, + "step": 4916 + }, + { + "epoch": 0.6496020081249794, + "grad_norm": 0.17982810735702515, + "learning_rate": 5.483625523016223e-05, + "loss": 0.0179, + "step": 4917 + }, + { + "epoch": 0.6497341216104634, + "grad_norm": 0.18194417655467987, + "learning_rate": 5.4799178382353425e-05, + "loss": 0.018, + "step": 4918 + }, + { + "epoch": 0.6498662350959474, + "grad_norm": 0.22586305439472198, + "learning_rate": 5.476210934214137e-05, + "loss": 0.017, + "step": 4919 + }, + { + "epoch": 0.6499983485814315, + "grad_norm": 0.4077160060405731, + "learning_rate": 5.47250481159291e-05, + "loss": 0.0202, + "step": 4920 + }, + { + "epoch": 0.6501304620669155, + "grad_norm": 0.1317765861749649, + "learning_rate": 5.46879947101183e-05, + "loss": 0.0161, + "step": 4921 + }, + { + "epoch": 0.6502625755523995, + "grad_norm": 0.12023100256919861, + "learning_rate": 5.465094913110915e-05, + "loss": 0.0144, + "step": 4922 + }, + { + "epoch": 0.6503946890378836, + "grad_norm": 0.3004123866558075, + "learning_rate": 5.461391138530065e-05, + "loss": 0.0153, + "step": 4923 + }, + { + "epoch": 0.6505268025233676, + "grad_norm": 0.11634991317987442, + "learning_rate": 5.457688147909036e-05, + "loss": 0.0116, + "step": 4924 + }, + { + "epoch": 0.6506589160088516, + "grad_norm": 0.12899671494960785, + "learning_rate": 5.453985941887454e-05, + "loss": 0.0112, + "step": 4925 + }, + { + "epoch": 0.6507910294943356, + "grad_norm": 0.2067309319972992, + "learning_rate": 5.450284521104798e-05, + "loss": 0.0135, + "step": 4926 + }, + { + "epoch": 0.6509231429798197, + "grad_norm": 0.15902547538280487, + "learning_rate": 5.446583886200425e-05, + "loss": 0.0145, + "step": 4927 + }, + { + "epoch": 0.6510552564653037, + "grad_norm": 0.17149271070957184, + "learning_rate": 5.4428840378135524e-05, + "loss": 0.0241, + "step": 4928 + }, + { + "epoch": 0.6511873699507877, + "grad_norm": 0.14578677713871002, + "learning_rate": 5.439184976583254e-05, + "loss": 0.0195, + "step": 4929 + }, + { + "epoch": 0.6513194834362718, + "grad_norm": 0.1228649690747261, + "learning_rate": 5.4354867031484736e-05, + "loss": 0.0105, + "step": 4930 + }, + { + "epoch": 0.6514515969217558, + "grad_norm": 0.210816890001297, + "learning_rate": 5.4317892181480226e-05, + "loss": 0.0296, + "step": 4931 + }, + { + "epoch": 0.6515837104072398, + "grad_norm": 0.18472038209438324, + "learning_rate": 5.428092522220576e-05, + "loss": 0.0265, + "step": 4932 + }, + { + "epoch": 0.6517158238927239, + "grad_norm": 0.22026722133159637, + "learning_rate": 5.424396616004659e-05, + "loss": 0.0312, + "step": 4933 + }, + { + "epoch": 0.6518479373782079, + "grad_norm": 0.17818962037563324, + "learning_rate": 5.420701500138674e-05, + "loss": 0.0176, + "step": 4934 + }, + { + "epoch": 0.6519800508636919, + "grad_norm": 0.15920813381671906, + "learning_rate": 5.417007175260891e-05, + "loss": 0.0188, + "step": 4935 + }, + { + "epoch": 0.652112164349176, + "grad_norm": 0.19341981410980225, + "learning_rate": 5.4133136420094234e-05, + "loss": 0.015, + "step": 4936 + }, + { + "epoch": 0.65224427783466, + "grad_norm": 0.24290108680725098, + "learning_rate": 5.4096209010222654e-05, + "loss": 0.0303, + "step": 4937 + }, + { + "epoch": 0.652376391320144, + "grad_norm": 0.13119028508663177, + "learning_rate": 5.4059289529372704e-05, + "loss": 0.0143, + "step": 4938 + }, + { + "epoch": 0.652508504805628, + "grad_norm": 0.1338653862476349, + "learning_rate": 5.402237798392156e-05, + "loss": 0.0141, + "step": 4939 + }, + { + "epoch": 0.6526406182911121, + "grad_norm": 0.18375612795352936, + "learning_rate": 5.398547438024492e-05, + "loss": 0.0281, + "step": 4940 + }, + { + "epoch": 0.6527727317765961, + "grad_norm": 0.17538763582706451, + "learning_rate": 5.3948578724717236e-05, + "loss": 0.0113, + "step": 4941 + }, + { + "epoch": 0.6529048452620801, + "grad_norm": 0.12488379329442978, + "learning_rate": 5.3911691023711565e-05, + "loss": 0.0129, + "step": 4942 + }, + { + "epoch": 0.6530369587475642, + "grad_norm": 0.22224709391593933, + "learning_rate": 5.3874811283599524e-05, + "loss": 0.0118, + "step": 4943 + }, + { + "epoch": 0.6531690722330482, + "grad_norm": 0.10177693516016006, + "learning_rate": 5.383793951075141e-05, + "loss": 0.0087, + "step": 4944 + }, + { + "epoch": 0.6533011857185322, + "grad_norm": 0.15108650922775269, + "learning_rate": 5.380107571153614e-05, + "loss": 0.0117, + "step": 4945 + }, + { + "epoch": 0.6534332992040163, + "grad_norm": 0.15009194612503052, + "learning_rate": 5.37642198923213e-05, + "loss": 0.0132, + "step": 4946 + }, + { + "epoch": 0.6535654126895003, + "grad_norm": 0.35668736696243286, + "learning_rate": 5.3727372059472934e-05, + "loss": 0.0265, + "step": 4947 + }, + { + "epoch": 0.6536975261749843, + "grad_norm": 0.16157856583595276, + "learning_rate": 5.369053221935588e-05, + "loss": 0.0146, + "step": 4948 + }, + { + "epoch": 0.6538296396604684, + "grad_norm": 0.15010638535022736, + "learning_rate": 5.365370037833357e-05, + "loss": 0.0113, + "step": 4949 + }, + { + "epoch": 0.6539617531459524, + "grad_norm": 0.1262449473142624, + "learning_rate": 5.3616876542767924e-05, + "loss": 0.0156, + "step": 4950 + }, + { + "epoch": 0.6540938666314364, + "grad_norm": 0.10484588891267776, + "learning_rate": 5.358006071901962e-05, + "loss": 0.0168, + "step": 4951 + }, + { + "epoch": 0.6542259801169205, + "grad_norm": 0.15599007904529572, + "learning_rate": 5.3543252913447894e-05, + "loss": 0.0198, + "step": 4952 + }, + { + "epoch": 0.6543580936024045, + "grad_norm": 0.24297498166561127, + "learning_rate": 5.350645313241066e-05, + "loss": 0.0242, + "step": 4953 + }, + { + "epoch": 0.6544902070878885, + "grad_norm": 0.20962654054164886, + "learning_rate": 5.34696613822643e-05, + "loss": 0.0198, + "step": 4954 + }, + { + "epoch": 0.6546223205733726, + "grad_norm": 0.13655038177967072, + "learning_rate": 5.3432877669363956e-05, + "loss": 0.0158, + "step": 4955 + }, + { + "epoch": 0.6547544340588566, + "grad_norm": 0.12744346261024475, + "learning_rate": 5.339610200006334e-05, + "loss": 0.0131, + "step": 4956 + }, + { + "epoch": 0.6548865475443406, + "grad_norm": 0.09227416664361954, + "learning_rate": 5.335933438071471e-05, + "loss": 0.0038, + "step": 4957 + }, + { + "epoch": 0.6550186610298246, + "grad_norm": 0.29073062539100647, + "learning_rate": 5.3322574817669004e-05, + "loss": 0.0217, + "step": 4958 + }, + { + "epoch": 0.6551507745153087, + "grad_norm": 0.11703348159790039, + "learning_rate": 5.328582331727576e-05, + "loss": 0.0094, + "step": 4959 + }, + { + "epoch": 0.6552828880007927, + "grad_norm": 0.17218007147312164, + "learning_rate": 5.324907988588316e-05, + "loss": 0.0183, + "step": 4960 + }, + { + "epoch": 0.6554150014862767, + "grad_norm": 0.16797365248203278, + "learning_rate": 5.321234452983786e-05, + "loss": 0.0199, + "step": 4961 + }, + { + "epoch": 0.6555471149717608, + "grad_norm": 0.117110975086689, + "learning_rate": 5.317561725548518e-05, + "loss": 0.0116, + "step": 4962 + }, + { + "epoch": 0.6556792284572448, + "grad_norm": 0.23052513599395752, + "learning_rate": 5.313889806916921e-05, + "loss": 0.0239, + "step": 4963 + }, + { + "epoch": 0.6558113419427288, + "grad_norm": 0.11556703597307205, + "learning_rate": 5.310218697723239e-05, + "loss": 0.009, + "step": 4964 + }, + { + "epoch": 0.6559434554282129, + "grad_norm": 0.22753220796585083, + "learning_rate": 5.306548398601592e-05, + "loss": 0.0213, + "step": 4965 + }, + { + "epoch": 0.6560755689136969, + "grad_norm": 0.2390792965888977, + "learning_rate": 5.302878910185958e-05, + "loss": 0.0215, + "step": 4966 + }, + { + "epoch": 0.6562076823991809, + "grad_norm": 0.22379370033740997, + "learning_rate": 5.299210233110163e-05, + "loss": 0.0228, + "step": 4967 + }, + { + "epoch": 0.656339795884665, + "grad_norm": 0.18010953068733215, + "learning_rate": 5.295542368007911e-05, + "loss": 0.0236, + "step": 4968 + }, + { + "epoch": 0.656471909370149, + "grad_norm": 0.13127058744430542, + "learning_rate": 5.291875315512753e-05, + "loss": 0.0139, + "step": 4969 + }, + { + "epoch": 0.656604022855633, + "grad_norm": 0.15722498297691345, + "learning_rate": 5.288209076258109e-05, + "loss": 0.0167, + "step": 4970 + }, + { + "epoch": 0.656736136341117, + "grad_norm": 0.16607514023780823, + "learning_rate": 5.284543650877246e-05, + "loss": 0.0175, + "step": 4971 + }, + { + "epoch": 0.6568682498266011, + "grad_norm": 0.17862720787525177, + "learning_rate": 5.2808790400033015e-05, + "loss": 0.0176, + "step": 4972 + }, + { + "epoch": 0.6570003633120851, + "grad_norm": 0.198783740401268, + "learning_rate": 5.2772152442692715e-05, + "loss": 0.0146, + "step": 4973 + }, + { + "epoch": 0.6571324767975691, + "grad_norm": 0.1337892860174179, + "learning_rate": 5.2735522643080014e-05, + "loss": 0.0181, + "step": 4974 + }, + { + "epoch": 0.6572645902830532, + "grad_norm": 0.11915634572505951, + "learning_rate": 5.269890100752205e-05, + "loss": 0.0139, + "step": 4975 + }, + { + "epoch": 0.6573967037685372, + "grad_norm": 0.1657743602991104, + "learning_rate": 5.266228754234455e-05, + "loss": 0.0183, + "step": 4976 + }, + { + "epoch": 0.6575288172540212, + "grad_norm": 0.16712002456188202, + "learning_rate": 5.262568225387181e-05, + "loss": 0.0168, + "step": 4977 + }, + { + "epoch": 0.6576609307395053, + "grad_norm": 0.19823117554187775, + "learning_rate": 5.258908514842667e-05, + "loss": 0.0264, + "step": 4978 + }, + { + "epoch": 0.6577930442249893, + "grad_norm": 0.2605159282684326, + "learning_rate": 5.2552496232330605e-05, + "loss": 0.0134, + "step": 4979 + }, + { + "epoch": 0.6579251577104733, + "grad_norm": 0.188632994890213, + "learning_rate": 5.2515915511903715e-05, + "loss": 0.0214, + "step": 4980 + }, + { + "epoch": 0.6580572711959574, + "grad_norm": 0.1341782957315445, + "learning_rate": 5.247934299346455e-05, + "loss": 0.0196, + "step": 4981 + }, + { + "epoch": 0.6581893846814414, + "grad_norm": 0.27716678380966187, + "learning_rate": 5.2442778683330405e-05, + "loss": 0.0185, + "step": 4982 + }, + { + "epoch": 0.6583214981669254, + "grad_norm": 0.3105461597442627, + "learning_rate": 5.240622258781702e-05, + "loss": 0.0315, + "step": 4983 + }, + { + "epoch": 0.6584536116524095, + "grad_norm": 0.3102239668369293, + "learning_rate": 5.236967471323888e-05, + "loss": 0.0194, + "step": 4984 + }, + { + "epoch": 0.6585857251378935, + "grad_norm": 0.14362885057926178, + "learning_rate": 5.233313506590881e-05, + "loss": 0.0098, + "step": 4985 + }, + { + "epoch": 0.6587178386233775, + "grad_norm": 0.21245473623275757, + "learning_rate": 5.229660365213843e-05, + "loss": 0.0147, + "step": 4986 + }, + { + "epoch": 0.6588499521088615, + "grad_norm": 0.10704601556062698, + "learning_rate": 5.226008047823788e-05, + "loss": 0.0086, + "step": 4987 + }, + { + "epoch": 0.6589820655943456, + "grad_norm": 0.26376500725746155, + "learning_rate": 5.222356555051579e-05, + "loss": 0.0234, + "step": 4988 + }, + { + "epoch": 0.6591141790798296, + "grad_norm": 0.15192173421382904, + "learning_rate": 5.218705887527946e-05, + "loss": 0.0186, + "step": 4989 + }, + { + "epoch": 0.6592462925653136, + "grad_norm": 0.1726227104663849, + "learning_rate": 5.215056045883473e-05, + "loss": 0.0267, + "step": 4990 + }, + { + "epoch": 0.6593784060507977, + "grad_norm": 0.1516808718442917, + "learning_rate": 5.211407030748607e-05, + "loss": 0.0137, + "step": 4991 + }, + { + "epoch": 0.6595105195362817, + "grad_norm": 0.07529601454734802, + "learning_rate": 5.207758842753638e-05, + "loss": 0.0056, + "step": 4992 + }, + { + "epoch": 0.6596426330217657, + "grad_norm": 0.140614315867424, + "learning_rate": 5.2041114825287284e-05, + "loss": 0.0118, + "step": 4993 + }, + { + "epoch": 0.6597747465072498, + "grad_norm": 0.22331038117408752, + "learning_rate": 5.200464950703894e-05, + "loss": 0.0119, + "step": 4994 + }, + { + "epoch": 0.6599068599927338, + "grad_norm": 0.16355013847351074, + "learning_rate": 5.196819247908997e-05, + "loss": 0.0142, + "step": 4995 + }, + { + "epoch": 0.6600389734782178, + "grad_norm": 0.1185201033949852, + "learning_rate": 5.193174374773768e-05, + "loss": 0.0069, + "step": 4996 + }, + { + "epoch": 0.6601710869637019, + "grad_norm": 0.1419517993927002, + "learning_rate": 5.189530331927792e-05, + "loss": 0.017, + "step": 4997 + }, + { + "epoch": 0.6603032004491859, + "grad_norm": 0.16416120529174805, + "learning_rate": 5.185887120000512e-05, + "loss": 0.02, + "step": 4998 + }, + { + "epoch": 0.6604353139346699, + "grad_norm": 0.5249677896499634, + "learning_rate": 5.182244739621218e-05, + "loss": 0.0137, + "step": 4999 + }, + { + "epoch": 0.660567427420154, + "grad_norm": 0.11340130120515823, + "learning_rate": 5.178603191419066e-05, + "loss": 0.0063, + "step": 5000 + }, + { + "epoch": 0.660699540905638, + "grad_norm": 0.17886826395988464, + "learning_rate": 5.174962476023069e-05, + "loss": 0.0144, + "step": 5001 + }, + { + "epoch": 0.660831654391122, + "grad_norm": 0.1607665717601776, + "learning_rate": 5.171322594062086e-05, + "loss": 0.0179, + "step": 5002 + }, + { + "epoch": 0.660963767876606, + "grad_norm": 0.20411817729473114, + "learning_rate": 5.167683546164841e-05, + "loss": 0.0207, + "step": 5003 + }, + { + "epoch": 0.6610958813620901, + "grad_norm": 0.14540739357471466, + "learning_rate": 5.164045332959913e-05, + "loss": 0.0102, + "step": 5004 + }, + { + "epoch": 0.6612279948475741, + "grad_norm": 0.17851538956165314, + "learning_rate": 5.160407955075739e-05, + "loss": 0.0205, + "step": 5005 + }, + { + "epoch": 0.6613601083330581, + "grad_norm": 0.1295069456100464, + "learning_rate": 5.1567714131405984e-05, + "loss": 0.016, + "step": 5006 + }, + { + "epoch": 0.6614922218185422, + "grad_norm": 0.15553872287273407, + "learning_rate": 5.153135707782641e-05, + "loss": 0.0183, + "step": 5007 + }, + { + "epoch": 0.6616243353040262, + "grad_norm": 0.0835447609424591, + "learning_rate": 5.1495008396298726e-05, + "loss": 0.0067, + "step": 5008 + }, + { + "epoch": 0.6617564487895102, + "grad_norm": 0.09932634234428406, + "learning_rate": 5.1458668093101384e-05, + "loss": 0.0086, + "step": 5009 + }, + { + "epoch": 0.6618885622749943, + "grad_norm": 0.09297791868448257, + "learning_rate": 5.142233617451153e-05, + "loss": 0.0104, + "step": 5010 + }, + { + "epoch": 0.6620206757604783, + "grad_norm": 0.16000889241695404, + "learning_rate": 5.1386012646804826e-05, + "loss": 0.0157, + "step": 5011 + }, + { + "epoch": 0.6621527892459623, + "grad_norm": 0.1270449310541153, + "learning_rate": 5.1349697516255535e-05, + "loss": 0.0186, + "step": 5012 + }, + { + "epoch": 0.6622849027314464, + "grad_norm": 0.18740740418434143, + "learning_rate": 5.131339078913634e-05, + "loss": 0.0265, + "step": 5013 + }, + { + "epoch": 0.6624170162169304, + "grad_norm": 0.13335908949375153, + "learning_rate": 5.1277092471718566e-05, + "loss": 0.014, + "step": 5014 + }, + { + "epoch": 0.6625491297024144, + "grad_norm": 0.2011447548866272, + "learning_rate": 5.1240802570272126e-05, + "loss": 0.0221, + "step": 5015 + }, + { + "epoch": 0.6626812431878984, + "grad_norm": 0.07277203351259232, + "learning_rate": 5.120452109106535e-05, + "loss": 0.0081, + "step": 5016 + }, + { + "epoch": 0.6628133566733825, + "grad_norm": 0.172392338514328, + "learning_rate": 5.11682480403652e-05, + "loss": 0.0159, + "step": 5017 + }, + { + "epoch": 0.6629454701588665, + "grad_norm": 0.2753337323665619, + "learning_rate": 5.113198342443719e-05, + "loss": 0.0284, + "step": 5018 + }, + { + "epoch": 0.6630775836443505, + "grad_norm": 0.16935433447360992, + "learning_rate": 5.109572724954538e-05, + "loss": 0.0175, + "step": 5019 + }, + { + "epoch": 0.6632096971298346, + "grad_norm": 0.1387585699558258, + "learning_rate": 5.105947952195227e-05, + "loss": 0.013, + "step": 5020 + }, + { + "epoch": 0.6633418106153186, + "grad_norm": 0.23182858526706696, + "learning_rate": 5.102324024791902e-05, + "loss": 0.0269, + "step": 5021 + }, + { + "epoch": 0.6634739241008026, + "grad_norm": 0.2846618890762329, + "learning_rate": 5.098700943370528e-05, + "loss": 0.0264, + "step": 5022 + }, + { + "epoch": 0.6636060375862867, + "grad_norm": 0.21124884486198425, + "learning_rate": 5.0950787085569265e-05, + "loss": 0.0186, + "step": 5023 + }, + { + "epoch": 0.6637381510717707, + "grad_norm": 0.21159759163856506, + "learning_rate": 5.091457320976767e-05, + "loss": 0.0335, + "step": 5024 + }, + { + "epoch": 0.6638702645572547, + "grad_norm": 0.2589932084083557, + "learning_rate": 5.087836781255585e-05, + "loss": 0.0141, + "step": 5025 + }, + { + "epoch": 0.6640023780427388, + "grad_norm": 0.2727756202220917, + "learning_rate": 5.08421709001875e-05, + "loss": 0.0202, + "step": 5026 + }, + { + "epoch": 0.6641344915282228, + "grad_norm": 0.09600035101175308, + "learning_rate": 5.0805982478915015e-05, + "loss": 0.009, + "step": 5027 + }, + { + "epoch": 0.6642666050137068, + "grad_norm": 0.18655623495578766, + "learning_rate": 5.076980255498931e-05, + "loss": 0.0134, + "step": 5028 + }, + { + "epoch": 0.6643987184991909, + "grad_norm": 0.13871648907661438, + "learning_rate": 5.073363113465969e-05, + "loss": 0.0108, + "step": 5029 + }, + { + "epoch": 0.6645308319846749, + "grad_norm": 0.1454835832118988, + "learning_rate": 5.069746822417415e-05, + "loss": 0.0173, + "step": 5030 + }, + { + "epoch": 0.6646629454701589, + "grad_norm": 0.18189920485019684, + "learning_rate": 5.066131382977914e-05, + "loss": 0.014, + "step": 5031 + }, + { + "epoch": 0.664795058955643, + "grad_norm": 0.19891028106212616, + "learning_rate": 5.0625167957719724e-05, + "loss": 0.0279, + "step": 5032 + }, + { + "epoch": 0.664927172441127, + "grad_norm": 0.1992846429347992, + "learning_rate": 5.058903061423932e-05, + "loss": 0.0164, + "step": 5033 + }, + { + "epoch": 0.665059285926611, + "grad_norm": 0.13385896384716034, + "learning_rate": 5.0552901805580034e-05, + "loss": 0.0189, + "step": 5034 + }, + { + "epoch": 0.665191399412095, + "grad_norm": 0.5015331506729126, + "learning_rate": 5.051678153798247e-05, + "loss": 0.0159, + "step": 5035 + }, + { + "epoch": 0.6653235128975791, + "grad_norm": 0.21186839044094086, + "learning_rate": 5.0480669817685656e-05, + "loss": 0.0191, + "step": 5036 + }, + { + "epoch": 0.6654556263830631, + "grad_norm": 0.10766605287790298, + "learning_rate": 5.044456665092725e-05, + "loss": 0.01, + "step": 5037 + }, + { + "epoch": 0.6655877398685471, + "grad_norm": 0.11912591010332108, + "learning_rate": 5.040847204394341e-05, + "loss": 0.0079, + "step": 5038 + }, + { + "epoch": 0.6657198533540312, + "grad_norm": 0.15789036452770233, + "learning_rate": 5.037238600296883e-05, + "loss": 0.0084, + "step": 5039 + }, + { + "epoch": 0.6658519668395152, + "grad_norm": 0.14228802919387817, + "learning_rate": 5.033630853423663e-05, + "loss": 0.0163, + "step": 5040 + }, + { + "epoch": 0.6659840803249992, + "grad_norm": 0.19446247816085815, + "learning_rate": 5.030023964397857e-05, + "loss": 0.0208, + "step": 5041 + }, + { + "epoch": 0.6661161938104833, + "grad_norm": 0.15082289278507233, + "learning_rate": 5.026417933842489e-05, + "loss": 0.0172, + "step": 5042 + }, + { + "epoch": 0.6662483072959673, + "grad_norm": 0.24349327385425568, + "learning_rate": 5.0228127623804266e-05, + "loss": 0.0171, + "step": 5043 + }, + { + "epoch": 0.6663804207814513, + "grad_norm": 0.1227944940328598, + "learning_rate": 5.019208450634398e-05, + "loss": 0.0157, + "step": 5044 + }, + { + "epoch": 0.6665125342669354, + "grad_norm": 0.16644077003002167, + "learning_rate": 5.015604999226985e-05, + "loss": 0.0138, + "step": 5045 + }, + { + "epoch": 0.6666446477524194, + "grad_norm": 0.15668226778507233, + "learning_rate": 5.012002408780616e-05, + "loss": 0.014, + "step": 5046 + }, + { + "epoch": 0.6667767612379033, + "grad_norm": 0.18227237462997437, + "learning_rate": 5.008400679917567e-05, + "loss": 0.0228, + "step": 5047 + }, + { + "epoch": 0.6669088747233873, + "grad_norm": 0.18179632723331451, + "learning_rate": 5.004799813259968e-05, + "loss": 0.0189, + "step": 5048 + }, + { + "epoch": 0.6670409882088714, + "grad_norm": 0.20042505860328674, + "learning_rate": 5.001199809429811e-05, + "loss": 0.0206, + "step": 5049 + }, + { + "epoch": 0.6671731016943554, + "grad_norm": 0.11901211738586426, + "learning_rate": 4.9976006690489184e-05, + "loss": 0.0165, + "step": 5050 + }, + { + "epoch": 0.6673052151798394, + "grad_norm": 0.1777932196855545, + "learning_rate": 4.9940023927389786e-05, + "loss": 0.0181, + "step": 5051 + }, + { + "epoch": 0.6674373286653235, + "grad_norm": 0.16570496559143066, + "learning_rate": 4.990404981121528e-05, + "loss": 0.0219, + "step": 5052 + }, + { + "epoch": 0.6675694421508075, + "grad_norm": 0.18598343431949615, + "learning_rate": 4.986808434817954e-05, + "loss": 0.0238, + "step": 5053 + }, + { + "epoch": 0.6677015556362915, + "grad_norm": 0.17573808133602142, + "learning_rate": 4.983212754449487e-05, + "loss": 0.0141, + "step": 5054 + }, + { + "epoch": 0.6678336691217756, + "grad_norm": 0.1362745463848114, + "learning_rate": 4.979617940637216e-05, + "loss": 0.021, + "step": 5055 + }, + { + "epoch": 0.6679657826072596, + "grad_norm": 0.26245835423469543, + "learning_rate": 4.976023994002081e-05, + "loss": 0.0208, + "step": 5056 + }, + { + "epoch": 0.6680978960927436, + "grad_norm": 0.15304391086101532, + "learning_rate": 4.972430915164864e-05, + "loss": 0.0126, + "step": 5057 + }, + { + "epoch": 0.6682300095782276, + "grad_norm": 0.1346382051706314, + "learning_rate": 4.968838704746205e-05, + "loss": 0.0115, + "step": 5058 + }, + { + "epoch": 0.6683621230637117, + "grad_norm": 0.12355558574199677, + "learning_rate": 4.9652473633665896e-05, + "loss": 0.0085, + "step": 5059 + }, + { + "epoch": 0.6684942365491957, + "grad_norm": 0.19657886028289795, + "learning_rate": 4.96165689164636e-05, + "loss": 0.018, + "step": 5060 + }, + { + "epoch": 0.6686263500346797, + "grad_norm": 0.13189183175563812, + "learning_rate": 4.9580672902056954e-05, + "loss": 0.0146, + "step": 5061 + }, + { + "epoch": 0.6687584635201638, + "grad_norm": 0.15036815404891968, + "learning_rate": 4.954478559664636e-05, + "loss": 0.0117, + "step": 5062 + }, + { + "epoch": 0.6688905770056478, + "grad_norm": 0.11169622093439102, + "learning_rate": 4.9508907006430724e-05, + "loss": 0.0142, + "step": 5063 + }, + { + "epoch": 0.6690226904911318, + "grad_norm": 0.21585702896118164, + "learning_rate": 4.947303713760731e-05, + "loss": 0.0162, + "step": 5064 + }, + { + "epoch": 0.6691548039766159, + "grad_norm": 0.13905102014541626, + "learning_rate": 4.943717599637202e-05, + "loss": 0.014, + "step": 5065 + }, + { + "epoch": 0.6692869174620999, + "grad_norm": 0.16844500601291656, + "learning_rate": 4.940132358891919e-05, + "loss": 0.0233, + "step": 5066 + }, + { + "epoch": 0.6694190309475839, + "grad_norm": 0.18952059745788574, + "learning_rate": 4.9365479921441684e-05, + "loss": 0.0144, + "step": 5067 + }, + { + "epoch": 0.669551144433068, + "grad_norm": 0.2647591233253479, + "learning_rate": 4.932964500013077e-05, + "loss": 0.0269, + "step": 5068 + }, + { + "epoch": 0.669683257918552, + "grad_norm": 0.13148827850818634, + "learning_rate": 4.929381883117626e-05, + "loss": 0.0242, + "step": 5069 + }, + { + "epoch": 0.669815371404036, + "grad_norm": 0.19894959032535553, + "learning_rate": 4.925800142076654e-05, + "loss": 0.0215, + "step": 5070 + }, + { + "epoch": 0.66994748488952, + "grad_norm": 0.15500599145889282, + "learning_rate": 4.9222192775088296e-05, + "loss": 0.0138, + "step": 5071 + }, + { + "epoch": 0.6700795983750041, + "grad_norm": 0.22023585438728333, + "learning_rate": 4.9186392900326836e-05, + "loss": 0.0249, + "step": 5072 + }, + { + "epoch": 0.6702117118604881, + "grad_norm": 0.14289738237857819, + "learning_rate": 4.915060180266593e-05, + "loss": 0.0114, + "step": 5073 + }, + { + "epoch": 0.6703438253459721, + "grad_norm": 0.15781345963478088, + "learning_rate": 4.9114819488287855e-05, + "loss": 0.0175, + "step": 5074 + }, + { + "epoch": 0.6704759388314562, + "grad_norm": 0.2148284912109375, + "learning_rate": 4.907904596337326e-05, + "loss": 0.0161, + "step": 5075 + }, + { + "epoch": 0.6706080523169402, + "grad_norm": 0.2217796891927719, + "learning_rate": 4.90432812341014e-05, + "loss": 0.0141, + "step": 5076 + }, + { + "epoch": 0.6707401658024242, + "grad_norm": 0.4486676752567291, + "learning_rate": 4.900752530664998e-05, + "loss": 0.0325, + "step": 5077 + }, + { + "epoch": 0.6708722792879083, + "grad_norm": 0.15495853126049042, + "learning_rate": 4.897177818719512e-05, + "loss": 0.0078, + "step": 5078 + }, + { + "epoch": 0.6710043927733923, + "grad_norm": 0.22847874462604523, + "learning_rate": 4.893603988191145e-05, + "loss": 0.0205, + "step": 5079 + }, + { + "epoch": 0.6711365062588763, + "grad_norm": 0.12668964266777039, + "learning_rate": 4.890031039697219e-05, + "loss": 0.0138, + "step": 5080 + }, + { + "epoch": 0.6712686197443604, + "grad_norm": 0.16602325439453125, + "learning_rate": 4.886458973854886e-05, + "loss": 0.0205, + "step": 5081 + }, + { + "epoch": 0.6714007332298444, + "grad_norm": 0.18135517835617065, + "learning_rate": 4.882887791281157e-05, + "loss": 0.0346, + "step": 5082 + }, + { + "epoch": 0.6715328467153284, + "grad_norm": 0.17789128422737122, + "learning_rate": 4.8793174925928884e-05, + "loss": 0.0207, + "step": 5083 + }, + { + "epoch": 0.6716649602008125, + "grad_norm": 0.2250068634748459, + "learning_rate": 4.8757480784067764e-05, + "loss": 0.0225, + "step": 5084 + }, + { + "epoch": 0.6717970736862965, + "grad_norm": 0.2617173194885254, + "learning_rate": 4.872179549339375e-05, + "loss": 0.0336, + "step": 5085 + }, + { + "epoch": 0.6719291871717805, + "grad_norm": 0.14353157579898834, + "learning_rate": 4.86861190600708e-05, + "loss": 0.0164, + "step": 5086 + }, + { + "epoch": 0.6720613006572645, + "grad_norm": 0.17198024690151215, + "learning_rate": 4.8650451490261386e-05, + "loss": 0.0229, + "step": 5087 + }, + { + "epoch": 0.6721934141427486, + "grad_norm": 0.15205325186252594, + "learning_rate": 4.861479279012635e-05, + "loss": 0.0128, + "step": 5088 + }, + { + "epoch": 0.6723255276282326, + "grad_norm": 0.09790311753749847, + "learning_rate": 4.857914296582509e-05, + "loss": 0.0117, + "step": 5089 + }, + { + "epoch": 0.6724576411137166, + "grad_norm": 0.21584492921829224, + "learning_rate": 4.85435020235155e-05, + "loss": 0.0204, + "step": 5090 + }, + { + "epoch": 0.6725897545992007, + "grad_norm": 0.10409726947546005, + "learning_rate": 4.8507869969353794e-05, + "loss": 0.0148, + "step": 5091 + }, + { + "epoch": 0.6727218680846847, + "grad_norm": 0.14929348230361938, + "learning_rate": 4.8472246809494784e-05, + "loss": 0.0151, + "step": 5092 + }, + { + "epoch": 0.6728539815701687, + "grad_norm": 0.17571406066417694, + "learning_rate": 4.843663255009171e-05, + "loss": 0.025, + "step": 5093 + }, + { + "epoch": 0.6729860950556528, + "grad_norm": 0.13798776268959045, + "learning_rate": 4.840102719729631e-05, + "loss": 0.0161, + "step": 5094 + }, + { + "epoch": 0.6731182085411368, + "grad_norm": 0.25582829117774963, + "learning_rate": 4.836543075725867e-05, + "loss": 0.0255, + "step": 5095 + }, + { + "epoch": 0.6732503220266208, + "grad_norm": 0.14417283236980438, + "learning_rate": 4.832984323612744e-05, + "loss": 0.02, + "step": 5096 + }, + { + "epoch": 0.6733824355121049, + "grad_norm": 0.23102858662605286, + "learning_rate": 4.829426464004974e-05, + "loss": 0.0158, + "step": 5097 + }, + { + "epoch": 0.6735145489975889, + "grad_norm": 0.16588328778743744, + "learning_rate": 4.825869497517102e-05, + "loss": 0.0107, + "step": 5098 + }, + { + "epoch": 0.6736466624830729, + "grad_norm": 0.19536004960536957, + "learning_rate": 4.8223134247635316e-05, + "loss": 0.0163, + "step": 5099 + }, + { + "epoch": 0.673778775968557, + "grad_norm": 0.15462182462215424, + "learning_rate": 4.818758246358509e-05, + "loss": 0.0133, + "step": 5100 + }, + { + "epoch": 0.673910889454041, + "grad_norm": 0.6555613875389099, + "learning_rate": 4.81520396291613e-05, + "loss": 0.0187, + "step": 5101 + }, + { + "epoch": 0.674043002939525, + "grad_norm": 0.1688404679298401, + "learning_rate": 4.811650575050318e-05, + "loss": 0.0192, + "step": 5102 + }, + { + "epoch": 0.674175116425009, + "grad_norm": 0.16610851883888245, + "learning_rate": 4.808098083374863e-05, + "loss": 0.0117, + "step": 5103 + }, + { + "epoch": 0.6743072299104931, + "grad_norm": 0.1498517096042633, + "learning_rate": 4.804546488503393e-05, + "loss": 0.0183, + "step": 5104 + }, + { + "epoch": 0.6744393433959771, + "grad_norm": 0.0877426490187645, + "learning_rate": 4.800995791049373e-05, + "loss": 0.0105, + "step": 5105 + }, + { + "epoch": 0.6745714568814611, + "grad_norm": 0.2841498851776123, + "learning_rate": 4.797445991626123e-05, + "loss": 0.0165, + "step": 5106 + }, + { + "epoch": 0.6747035703669452, + "grad_norm": 0.11339423060417175, + "learning_rate": 4.793897090846803e-05, + "loss": 0.0161, + "step": 5107 + }, + { + "epoch": 0.6748356838524292, + "grad_norm": 0.23054392635822296, + "learning_rate": 4.790349089324425e-05, + "loss": 0.0185, + "step": 5108 + }, + { + "epoch": 0.6749677973379132, + "grad_norm": 0.10517584532499313, + "learning_rate": 4.786801987671833e-05, + "loss": 0.0126, + "step": 5109 + }, + { + "epoch": 0.6750999108233973, + "grad_norm": 0.19798646867275238, + "learning_rate": 4.7832557865017235e-05, + "loss": 0.0161, + "step": 5110 + }, + { + "epoch": 0.6752320243088813, + "grad_norm": 0.2040533870458603, + "learning_rate": 4.779710486426643e-05, + "loss": 0.0119, + "step": 5111 + }, + { + "epoch": 0.6753641377943653, + "grad_norm": 0.13550186157226562, + "learning_rate": 4.7761660880589666e-05, + "loss": 0.0119, + "step": 5112 + }, + { + "epoch": 0.6754962512798494, + "grad_norm": 0.15335050225257874, + "learning_rate": 4.772622592010927e-05, + "loss": 0.0151, + "step": 5113 + }, + { + "epoch": 0.6756283647653334, + "grad_norm": 0.19715610146522522, + "learning_rate": 4.7690799988945963e-05, + "loss": 0.0148, + "step": 5114 + }, + { + "epoch": 0.6757604782508174, + "grad_norm": 0.17787174880504608, + "learning_rate": 4.765538309321896e-05, + "loss": 0.0271, + "step": 5115 + }, + { + "epoch": 0.6758925917363015, + "grad_norm": 0.22029972076416016, + "learning_rate": 4.761997523904579e-05, + "loss": 0.0211, + "step": 5116 + }, + { + "epoch": 0.6760247052217855, + "grad_norm": 0.14438408613204956, + "learning_rate": 4.758457643254254e-05, + "loss": 0.0102, + "step": 5117 + }, + { + "epoch": 0.6761568187072695, + "grad_norm": 0.15860889852046967, + "learning_rate": 4.754918667982371e-05, + "loss": 0.0188, + "step": 5118 + }, + { + "epoch": 0.6762889321927535, + "grad_norm": 0.17438390851020813, + "learning_rate": 4.7513805987002166e-05, + "loss": 0.0177, + "step": 5119 + }, + { + "epoch": 0.6764210456782376, + "grad_norm": 0.1780308187007904, + "learning_rate": 4.7478434360189284e-05, + "loss": 0.0188, + "step": 5120 + }, + { + "epoch": 0.6765531591637216, + "grad_norm": 0.16482895612716675, + "learning_rate": 4.7443071805494865e-05, + "loss": 0.0128, + "step": 5121 + }, + { + "epoch": 0.6766852726492056, + "grad_norm": 0.1242159903049469, + "learning_rate": 4.740771832902715e-05, + "loss": 0.0114, + "step": 5122 + }, + { + "epoch": 0.6768173861346897, + "grad_norm": 0.1562412679195404, + "learning_rate": 4.737237393689272e-05, + "loss": 0.0166, + "step": 5123 + }, + { + "epoch": 0.6769494996201737, + "grad_norm": 0.16549445688724518, + "learning_rate": 4.7337038635196704e-05, + "loss": 0.0183, + "step": 5124 + }, + { + "epoch": 0.6770816131056577, + "grad_norm": 0.25368818640708923, + "learning_rate": 4.730171243004265e-05, + "loss": 0.0186, + "step": 5125 + }, + { + "epoch": 0.6772137265911418, + "grad_norm": 0.10777774453163147, + "learning_rate": 4.726639532753243e-05, + "loss": 0.0094, + "step": 5126 + }, + { + "epoch": 0.6773458400766258, + "grad_norm": 0.1838887333869934, + "learning_rate": 4.7231087333766435e-05, + "loss": 0.024, + "step": 5127 + }, + { + "epoch": 0.6774779535621098, + "grad_norm": 0.25211283564567566, + "learning_rate": 4.719578845484346e-05, + "loss": 0.0191, + "step": 5128 + }, + { + "epoch": 0.6776100670475939, + "grad_norm": 0.09369178861379623, + "learning_rate": 4.716049869686078e-05, + "loss": 0.0086, + "step": 5129 + }, + { + "epoch": 0.6777421805330779, + "grad_norm": 0.11376402527093887, + "learning_rate": 4.712521806591396e-05, + "loss": 0.0086, + "step": 5130 + }, + { + "epoch": 0.6778742940185619, + "grad_norm": 0.1679966002702713, + "learning_rate": 4.70899465680971e-05, + "loss": 0.0183, + "step": 5131 + }, + { + "epoch": 0.678006407504046, + "grad_norm": 0.1598343551158905, + "learning_rate": 4.705468420950273e-05, + "loss": 0.0149, + "step": 5132 + }, + { + "epoch": 0.67813852098953, + "grad_norm": 0.23596306145191193, + "learning_rate": 4.70194309962217e-05, + "loss": 0.0242, + "step": 5133 + }, + { + "epoch": 0.678270634475014, + "grad_norm": 0.21659186482429504, + "learning_rate": 4.698418693434338e-05, + "loss": 0.0207, + "step": 5134 + }, + { + "epoch": 0.678402747960498, + "grad_norm": 0.15793080627918243, + "learning_rate": 4.6948952029955506e-05, + "loss": 0.0198, + "step": 5135 + }, + { + "epoch": 0.6785348614459821, + "grad_norm": 0.1283729374408722, + "learning_rate": 4.69137262891443e-05, + "loss": 0.016, + "step": 5136 + }, + { + "epoch": 0.6786669749314661, + "grad_norm": 0.11209773272275925, + "learning_rate": 4.687850971799427e-05, + "loss": 0.0038, + "step": 5137 + }, + { + "epoch": 0.6787990884169501, + "grad_norm": 0.1797674298286438, + "learning_rate": 4.6843302322588423e-05, + "loss": 0.0104, + "step": 5138 + }, + { + "epoch": 0.6789312019024342, + "grad_norm": 0.10780422389507294, + "learning_rate": 4.680810410900829e-05, + "loss": 0.0146, + "step": 5139 + }, + { + "epoch": 0.6790633153879182, + "grad_norm": 0.10967176407575607, + "learning_rate": 4.6772915083333576e-05, + "loss": 0.0128, + "step": 5140 + }, + { + "epoch": 0.6791954288734022, + "grad_norm": 0.20822446048259735, + "learning_rate": 4.67377352516426e-05, + "loss": 0.0174, + "step": 5141 + }, + { + "epoch": 0.6793275423588863, + "grad_norm": 0.13655947148799896, + "learning_rate": 4.6702564620012035e-05, + "loss": 0.0192, + "step": 5142 + }, + { + "epoch": 0.6794596558443703, + "grad_norm": 0.12891127169132233, + "learning_rate": 4.666740319451687e-05, + "loss": 0.0196, + "step": 5143 + }, + { + "epoch": 0.6795917693298543, + "grad_norm": 0.2711154818534851, + "learning_rate": 4.663225098123063e-05, + "loss": 0.0046, + "step": 5144 + }, + { + "epoch": 0.6797238828153384, + "grad_norm": 0.1241656094789505, + "learning_rate": 4.659710798622521e-05, + "loss": 0.0144, + "step": 5145 + }, + { + "epoch": 0.6798559963008224, + "grad_norm": 0.12367275357246399, + "learning_rate": 4.656197421557092e-05, + "loss": 0.0149, + "step": 5146 + }, + { + "epoch": 0.6799881097863064, + "grad_norm": 0.13578025996685028, + "learning_rate": 4.652684967533641e-05, + "loss": 0.0122, + "step": 5147 + }, + { + "epoch": 0.6801202232717904, + "grad_norm": 0.18295130133628845, + "learning_rate": 4.649173437158882e-05, + "loss": 0.0217, + "step": 5148 + }, + { + "epoch": 0.6802523367572745, + "grad_norm": 0.0850638747215271, + "learning_rate": 4.64566283103937e-05, + "loss": 0.0101, + "step": 5149 + }, + { + "epoch": 0.6803844502427585, + "grad_norm": 0.20700609683990479, + "learning_rate": 4.642153149781488e-05, + "loss": 0.0186, + "step": 5150 + }, + { + "epoch": 0.6805165637282425, + "grad_norm": 0.0652320459485054, + "learning_rate": 4.638644393991472e-05, + "loss": 0.0056, + "step": 5151 + }, + { + "epoch": 0.6806486772137266, + "grad_norm": 0.12985356152057648, + "learning_rate": 4.635136564275395e-05, + "loss": 0.0111, + "step": 5152 + }, + { + "epoch": 0.6807807906992106, + "grad_norm": 0.3793286979198456, + "learning_rate": 4.631629661239171e-05, + "loss": 0.0326, + "step": 5153 + }, + { + "epoch": 0.6809129041846946, + "grad_norm": 0.15389122068881989, + "learning_rate": 4.6281236854885456e-05, + "loss": 0.0194, + "step": 5154 + }, + { + "epoch": 0.6810450176701787, + "grad_norm": 0.1676003634929657, + "learning_rate": 4.624618637629115e-05, + "loss": 0.024, + "step": 5155 + }, + { + "epoch": 0.6811771311556627, + "grad_norm": 0.23593369126319885, + "learning_rate": 4.621114518266313e-05, + "loss": 0.0212, + "step": 5156 + }, + { + "epoch": 0.6813092446411467, + "grad_norm": 0.340415894985199, + "learning_rate": 4.617611328005403e-05, + "loss": 0.0186, + "step": 5157 + }, + { + "epoch": 0.6814413581266308, + "grad_norm": 0.15453080832958221, + "learning_rate": 4.6141090674515006e-05, + "loss": 0.0121, + "step": 5158 + }, + { + "epoch": 0.6815734716121148, + "grad_norm": 0.1813836246728897, + "learning_rate": 4.6106077372095556e-05, + "loss": 0.0219, + "step": 5159 + }, + { + "epoch": 0.6817055850975988, + "grad_norm": 0.2668374478816986, + "learning_rate": 4.607107337884361e-05, + "loss": 0.0169, + "step": 5160 + }, + { + "epoch": 0.6818376985830829, + "grad_norm": 0.41349849104881287, + "learning_rate": 4.603607870080537e-05, + "loss": 0.02, + "step": 5161 + }, + { + "epoch": 0.6819698120685669, + "grad_norm": 0.15275467932224274, + "learning_rate": 4.600109334402556e-05, + "loss": 0.0151, + "step": 5162 + }, + { + "epoch": 0.6821019255540509, + "grad_norm": 0.1274024397134781, + "learning_rate": 4.596611731454728e-05, + "loss": 0.0094, + "step": 5163 + }, + { + "epoch": 0.682234039039535, + "grad_norm": 0.17443904280662537, + "learning_rate": 4.593115061841191e-05, + "loss": 0.0151, + "step": 5164 + }, + { + "epoch": 0.682366152525019, + "grad_norm": 0.1481851190328598, + "learning_rate": 4.589619326165932e-05, + "loss": 0.014, + "step": 5165 + }, + { + "epoch": 0.682498266010503, + "grad_norm": 0.15162219107151031, + "learning_rate": 4.5861245250327764e-05, + "loss": 0.0146, + "step": 5166 + }, + { + "epoch": 0.682630379495987, + "grad_norm": 0.19407233595848083, + "learning_rate": 4.582630659045388e-05, + "loss": 0.0161, + "step": 5167 + }, + { + "epoch": 0.6827624929814711, + "grad_norm": 0.1700238138437271, + "learning_rate": 4.57913772880726e-05, + "loss": 0.0182, + "step": 5168 + }, + { + "epoch": 0.6828946064669551, + "grad_norm": 0.12905320525169373, + "learning_rate": 4.575645734921733e-05, + "loss": 0.0116, + "step": 5169 + }, + { + "epoch": 0.6830267199524391, + "grad_norm": 0.10565055906772614, + "learning_rate": 4.572154677991989e-05, + "loss": 0.0068, + "step": 5170 + }, + { + "epoch": 0.6831588334379232, + "grad_norm": 0.13061608374118805, + "learning_rate": 4.568664558621034e-05, + "loss": 0.0172, + "step": 5171 + }, + { + "epoch": 0.6832909469234072, + "grad_norm": 0.11616747826337814, + "learning_rate": 4.5651753774117255e-05, + "loss": 0.0093, + "step": 5172 + }, + { + "epoch": 0.6834230604088912, + "grad_norm": 0.16339434683322906, + "learning_rate": 4.561687134966755e-05, + "loss": 0.0152, + "step": 5173 + }, + { + "epoch": 0.6835551738943753, + "grad_norm": 0.09892938286066055, + "learning_rate": 4.558199831888653e-05, + "loss": 0.0128, + "step": 5174 + }, + { + "epoch": 0.6836872873798593, + "grad_norm": 0.15133710205554962, + "learning_rate": 4.554713468779781e-05, + "loss": 0.0112, + "step": 5175 + }, + { + "epoch": 0.6838194008653433, + "grad_norm": 0.16055816411972046, + "learning_rate": 4.551228046242344e-05, + "loss": 0.0104, + "step": 5176 + }, + { + "epoch": 0.6839515143508273, + "grad_norm": 0.24436479806900024, + "learning_rate": 4.5477435648783885e-05, + "loss": 0.0191, + "step": 5177 + }, + { + "epoch": 0.6840836278363114, + "grad_norm": 0.16935010254383087, + "learning_rate": 4.544260025289787e-05, + "loss": 0.0136, + "step": 5178 + }, + { + "epoch": 0.6842157413217954, + "grad_norm": 0.11614146828651428, + "learning_rate": 4.540777428078258e-05, + "loss": 0.0123, + "step": 5179 + }, + { + "epoch": 0.6843478548072794, + "grad_norm": 0.32731178402900696, + "learning_rate": 4.537295773845356e-05, + "loss": 0.0189, + "step": 5180 + }, + { + "epoch": 0.6844799682927635, + "grad_norm": 0.21909061074256897, + "learning_rate": 4.5338150631924745e-05, + "loss": 0.024, + "step": 5181 + }, + { + "epoch": 0.6846120817782475, + "grad_norm": 0.4557572901248932, + "learning_rate": 4.530335296720835e-05, + "loss": 0.0205, + "step": 5182 + }, + { + "epoch": 0.6847441952637315, + "grad_norm": 0.16844063997268677, + "learning_rate": 4.526856475031504e-05, + "loss": 0.0215, + "step": 5183 + }, + { + "epoch": 0.6848763087492156, + "grad_norm": 0.30879849195480347, + "learning_rate": 4.52337859872539e-05, + "loss": 0.0209, + "step": 5184 + }, + { + "epoch": 0.6850084222346996, + "grad_norm": 0.23125775158405304, + "learning_rate": 4.51990166840322e-05, + "loss": 0.02, + "step": 5185 + }, + { + "epoch": 0.6851405357201836, + "grad_norm": 0.1605955809354782, + "learning_rate": 4.5164256846655737e-05, + "loss": 0.0112, + "step": 5186 + }, + { + "epoch": 0.6852726492056677, + "grad_norm": 0.16270045936107635, + "learning_rate": 4.512950648112864e-05, + "loss": 0.0171, + "step": 5187 + }, + { + "epoch": 0.6854047626911517, + "grad_norm": 0.09362021088600159, + "learning_rate": 4.509476559345339e-05, + "loss": 0.0092, + "step": 5188 + }, + { + "epoch": 0.6855368761766357, + "grad_norm": 0.13662898540496826, + "learning_rate": 4.5060034189630774e-05, + "loss": 0.0202, + "step": 5189 + }, + { + "epoch": 0.6856689896621198, + "grad_norm": 0.19320911169052124, + "learning_rate": 4.5025312275660025e-05, + "loss": 0.0268, + "step": 5190 + }, + { + "epoch": 0.6858011031476038, + "grad_norm": 0.1628652662038803, + "learning_rate": 4.499059985753874e-05, + "loss": 0.0181, + "step": 5191 + }, + { + "epoch": 0.6859332166330878, + "grad_norm": 0.16020523011684418, + "learning_rate": 4.495589694126278e-05, + "loss": 0.0118, + "step": 5192 + }, + { + "epoch": 0.6860653301185718, + "grad_norm": 0.15857800841331482, + "learning_rate": 4.492120353282643e-05, + "loss": 0.0173, + "step": 5193 + }, + { + "epoch": 0.6861974436040559, + "grad_norm": 0.13342146575450897, + "learning_rate": 4.4886519638222355e-05, + "loss": 0.0111, + "step": 5194 + }, + { + "epoch": 0.6863295570895399, + "grad_norm": 0.35289087891578674, + "learning_rate": 4.485184526344157e-05, + "loss": 0.0127, + "step": 5195 + }, + { + "epoch": 0.6864616705750239, + "grad_norm": 0.17435653507709503, + "learning_rate": 4.4817180414473333e-05, + "loss": 0.0211, + "step": 5196 + }, + { + "epoch": 0.686593784060508, + "grad_norm": 0.21706287562847137, + "learning_rate": 4.478252509730548e-05, + "loss": 0.0201, + "step": 5197 + }, + { + "epoch": 0.686725897545992, + "grad_norm": 0.14080168306827545, + "learning_rate": 4.4747879317923966e-05, + "loss": 0.016, + "step": 5198 + }, + { + "epoch": 0.686858011031476, + "grad_norm": 0.11725561320781708, + "learning_rate": 4.471324308231323e-05, + "loss": 0.0105, + "step": 5199 + }, + { + "epoch": 0.6869901245169601, + "grad_norm": 0.1532520353794098, + "learning_rate": 4.467861639645604e-05, + "loss": 0.0124, + "step": 5200 + }, + { + "epoch": 0.6871222380024441, + "grad_norm": 0.16441814601421356, + "learning_rate": 4.4643999266333544e-05, + "loss": 0.0163, + "step": 5201 + }, + { + "epoch": 0.6872543514879281, + "grad_norm": 0.188445582985878, + "learning_rate": 4.460939169792514e-05, + "loss": 0.0261, + "step": 5202 + }, + { + "epoch": 0.6873864649734122, + "grad_norm": 0.15494723618030548, + "learning_rate": 4.4574793697208675e-05, + "loss": 0.0173, + "step": 5203 + }, + { + "epoch": 0.6875185784588962, + "grad_norm": 0.214800164103508, + "learning_rate": 4.4540205270160316e-05, + "loss": 0.0314, + "step": 5204 + }, + { + "epoch": 0.6876506919443802, + "grad_norm": 0.2373284250497818, + "learning_rate": 4.450562642275452e-05, + "loss": 0.0359, + "step": 5205 + }, + { + "epoch": 0.6877828054298643, + "grad_norm": 0.12864165008068085, + "learning_rate": 4.447105716096417e-05, + "loss": 0.0145, + "step": 5206 + }, + { + "epoch": 0.6879149189153483, + "grad_norm": 0.2078774869441986, + "learning_rate": 4.443649749076045e-05, + "loss": 0.0318, + "step": 5207 + }, + { + "epoch": 0.6880470324008323, + "grad_norm": 0.13967713713645935, + "learning_rate": 4.440194741811295e-05, + "loss": 0.021, + "step": 5208 + }, + { + "epoch": 0.6881791458863163, + "grad_norm": 0.16248448193073273, + "learning_rate": 4.436740694898946e-05, + "loss": 0.0159, + "step": 5209 + }, + { + "epoch": 0.6883112593718004, + "grad_norm": 0.19797800481319427, + "learning_rate": 4.433287608935622e-05, + "loss": 0.0158, + "step": 5210 + }, + { + "epoch": 0.6884433728572844, + "grad_norm": 0.21106760203838348, + "learning_rate": 4.429835484517788e-05, + "loss": 0.0139, + "step": 5211 + }, + { + "epoch": 0.6885754863427684, + "grad_norm": 0.14479297399520874, + "learning_rate": 4.4263843222417224e-05, + "loss": 0.0154, + "step": 5212 + }, + { + "epoch": 0.6887075998282525, + "grad_norm": 0.1643540859222412, + "learning_rate": 4.4229341227035525e-05, + "loss": 0.0211, + "step": 5213 + }, + { + "epoch": 0.6888397133137365, + "grad_norm": 0.14425019919872284, + "learning_rate": 4.419484886499239e-05, + "loss": 0.0179, + "step": 5214 + }, + { + "epoch": 0.6889718267992205, + "grad_norm": 0.1826268434524536, + "learning_rate": 4.416036614224574e-05, + "loss": 0.019, + "step": 5215 + }, + { + "epoch": 0.6891039402847046, + "grad_norm": 0.23135367035865784, + "learning_rate": 4.412589306475174e-05, + "loss": 0.0303, + "step": 5216 + }, + { + "epoch": 0.6892360537701886, + "grad_norm": 0.13859330117702484, + "learning_rate": 4.409142963846503e-05, + "loss": 0.0133, + "step": 5217 + }, + { + "epoch": 0.6893681672556726, + "grad_norm": 0.08792946487665176, + "learning_rate": 4.4056975869338544e-05, + "loss": 0.0092, + "step": 5218 + }, + { + "epoch": 0.6895002807411567, + "grad_norm": 0.24918882548809052, + "learning_rate": 4.402253176332347e-05, + "loss": 0.0126, + "step": 5219 + }, + { + "epoch": 0.6896323942266407, + "grad_norm": 0.25261256098747253, + "learning_rate": 4.3988097326369396e-05, + "loss": 0.0207, + "step": 5220 + }, + { + "epoch": 0.6897645077121247, + "grad_norm": 0.23432409763336182, + "learning_rate": 4.395367256442424e-05, + "loss": 0.0194, + "step": 5221 + }, + { + "epoch": 0.6898966211976088, + "grad_norm": 0.1472122222185135, + "learning_rate": 4.3919257483434284e-05, + "loss": 0.0163, + "step": 5222 + }, + { + "epoch": 0.6900287346830928, + "grad_norm": 0.23036111891269684, + "learning_rate": 4.3884852089344e-05, + "loss": 0.0234, + "step": 5223 + }, + { + "epoch": 0.6901608481685768, + "grad_norm": 0.21664488315582275, + "learning_rate": 4.38504563880963e-05, + "loss": 0.0308, + "step": 5224 + }, + { + "epoch": 0.6902929616540608, + "grad_norm": 0.18760326504707336, + "learning_rate": 4.381607038563247e-05, + "loss": 0.0185, + "step": 5225 + }, + { + "epoch": 0.6904250751395449, + "grad_norm": 0.13710762560367584, + "learning_rate": 4.378169408789196e-05, + "loss": 0.0118, + "step": 5226 + }, + { + "epoch": 0.6905571886250289, + "grad_norm": 0.19669315218925476, + "learning_rate": 4.374732750081265e-05, + "loss": 0.0348, + "step": 5227 + }, + { + "epoch": 0.6906893021105129, + "grad_norm": 0.18398793041706085, + "learning_rate": 4.371297063033075e-05, + "loss": 0.0155, + "step": 5228 + }, + { + "epoch": 0.690821415595997, + "grad_norm": 0.194778174161911, + "learning_rate": 4.3678623482380806e-05, + "loss": 0.025, + "step": 5229 + }, + { + "epoch": 0.690953529081481, + "grad_norm": 0.12789078056812286, + "learning_rate": 4.364428606289556e-05, + "loss": 0.0218, + "step": 5230 + }, + { + "epoch": 0.691085642566965, + "grad_norm": 0.1261407881975174, + "learning_rate": 4.3609958377806194e-05, + "loss": 0.0122, + "step": 5231 + }, + { + "epoch": 0.6912177560524491, + "grad_norm": 0.2067817747592926, + "learning_rate": 4.3575640433042206e-05, + "loss": 0.0179, + "step": 5232 + }, + { + "epoch": 0.6913498695379331, + "grad_norm": 0.19638954102993011, + "learning_rate": 4.354133223453133e-05, + "loss": 0.0259, + "step": 5233 + }, + { + "epoch": 0.6914819830234171, + "grad_norm": 0.16034339368343353, + "learning_rate": 4.350703378819968e-05, + "loss": 0.0142, + "step": 5234 + }, + { + "epoch": 0.6916140965089012, + "grad_norm": 0.16898213326931, + "learning_rate": 4.347274509997169e-05, + "loss": 0.0204, + "step": 5235 + }, + { + "epoch": 0.6917462099943852, + "grad_norm": 0.4070425033569336, + "learning_rate": 4.34384661757701e-05, + "loss": 0.0292, + "step": 5236 + }, + { + "epoch": 0.6918783234798692, + "grad_norm": 0.15171203017234802, + "learning_rate": 4.34041970215159e-05, + "loss": 0.0198, + "step": 5237 + }, + { + "epoch": 0.6920104369653532, + "grad_norm": 0.35594651103019714, + "learning_rate": 4.3369937643128475e-05, + "loss": 0.0221, + "step": 5238 + }, + { + "epoch": 0.6921425504508373, + "grad_norm": 0.18406252562999725, + "learning_rate": 4.3335688046525534e-05, + "loss": 0.0299, + "step": 5239 + }, + { + "epoch": 0.6922746639363213, + "grad_norm": 0.1940668672323227, + "learning_rate": 4.330144823762299e-05, + "loss": 0.0205, + "step": 5240 + }, + { + "epoch": 0.6924067774218053, + "grad_norm": 0.1684083491563797, + "learning_rate": 4.326721822233514e-05, + "loss": 0.0295, + "step": 5241 + }, + { + "epoch": 0.6925388909072894, + "grad_norm": 0.16625122725963593, + "learning_rate": 4.32329980065746e-05, + "loss": 0.0125, + "step": 5242 + }, + { + "epoch": 0.6926710043927734, + "grad_norm": 0.1683010458946228, + "learning_rate": 4.31987875962523e-05, + "loss": 0.0192, + "step": 5243 + }, + { + "epoch": 0.6928031178782574, + "grad_norm": 0.16990509629249573, + "learning_rate": 4.316458699727738e-05, + "loss": 0.016, + "step": 5244 + }, + { + "epoch": 0.6929352313637415, + "grad_norm": 0.13994839787483215, + "learning_rate": 4.313039621555738e-05, + "loss": 0.0173, + "step": 5245 + }, + { + "epoch": 0.6930673448492255, + "grad_norm": 0.2251029908657074, + "learning_rate": 4.3096215256998175e-05, + "loss": 0.0141, + "step": 5246 + }, + { + "epoch": 0.6931994583347095, + "grad_norm": 0.14937463402748108, + "learning_rate": 4.30620441275038e-05, + "loss": 0.0118, + "step": 5247 + }, + { + "epoch": 0.6933315718201936, + "grad_norm": 0.20369993150234222, + "learning_rate": 4.302788283297672e-05, + "loss": 0.017, + "step": 5248 + }, + { + "epoch": 0.6934636853056776, + "grad_norm": 0.10524363815784454, + "learning_rate": 4.299373137931765e-05, + "loss": 0.0103, + "step": 5249 + }, + { + "epoch": 0.6935957987911616, + "grad_norm": 0.0838766023516655, + "learning_rate": 4.295958977242566e-05, + "loss": 0.0066, + "step": 5250 + }, + { + "epoch": 0.6937279122766457, + "grad_norm": 0.16826088726520538, + "learning_rate": 4.292545801819801e-05, + "loss": 0.0118, + "step": 5251 + }, + { + "epoch": 0.6938600257621297, + "grad_norm": 0.16063901782035828, + "learning_rate": 4.2891336122530335e-05, + "loss": 0.0204, + "step": 5252 + }, + { + "epoch": 0.6939921392476137, + "grad_norm": 0.1232612207531929, + "learning_rate": 4.2857224091316615e-05, + "loss": 0.0184, + "step": 5253 + }, + { + "epoch": 0.6941242527330977, + "grad_norm": 0.16036061942577362, + "learning_rate": 4.282312193044897e-05, + "loss": 0.0139, + "step": 5254 + }, + { + "epoch": 0.6942563662185818, + "grad_norm": 0.1817048043012619, + "learning_rate": 4.2789029645817945e-05, + "loss": 0.013, + "step": 5255 + }, + { + "epoch": 0.6943884797040658, + "grad_norm": 0.16562654078006744, + "learning_rate": 4.275494724331242e-05, + "loss": 0.0105, + "step": 5256 + }, + { + "epoch": 0.6945205931895498, + "grad_norm": 0.19718943536281586, + "learning_rate": 4.272087472881939e-05, + "loss": 0.0182, + "step": 5257 + }, + { + "epoch": 0.6946527066750339, + "grad_norm": 0.10483067482709885, + "learning_rate": 4.2686812108224294e-05, + "loss": 0.0104, + "step": 5258 + }, + { + "epoch": 0.6947848201605179, + "grad_norm": 0.14471067488193512, + "learning_rate": 4.2652759387410814e-05, + "loss": 0.0075, + "step": 5259 + }, + { + "epoch": 0.6949169336460019, + "grad_norm": 0.07532824575901031, + "learning_rate": 4.2618716572260944e-05, + "loss": 0.0059, + "step": 5260 + }, + { + "epoch": 0.695049047131486, + "grad_norm": 0.16629336774349213, + "learning_rate": 4.258468366865487e-05, + "loss": 0.0123, + "step": 5261 + }, + { + "epoch": 0.69518116061697, + "grad_norm": 0.19355374574661255, + "learning_rate": 4.255066068247118e-05, + "loss": 0.0158, + "step": 5262 + }, + { + "epoch": 0.695313274102454, + "grad_norm": 0.20441442728042603, + "learning_rate": 4.251664761958676e-05, + "loss": 0.0259, + "step": 5263 + }, + { + "epoch": 0.6954453875879381, + "grad_norm": 0.16446572542190552, + "learning_rate": 4.248264448587663e-05, + "loss": 0.0142, + "step": 5264 + }, + { + "epoch": 0.6955775010734221, + "grad_norm": 0.13875527679920197, + "learning_rate": 4.244865128721426e-05, + "loss": 0.0141, + "step": 5265 + }, + { + "epoch": 0.6957096145589061, + "grad_norm": 0.08903460204601288, + "learning_rate": 4.241466802947133e-05, + "loss": 0.0071, + "step": 5266 + }, + { + "epoch": 0.6958417280443902, + "grad_norm": 0.13762259483337402, + "learning_rate": 4.238069471851783e-05, + "loss": 0.0079, + "step": 5267 + }, + { + "epoch": 0.6959738415298742, + "grad_norm": 0.1628267914056778, + "learning_rate": 4.234673136022197e-05, + "loss": 0.0199, + "step": 5268 + }, + { + "epoch": 0.6961059550153582, + "grad_norm": 0.12279197573661804, + "learning_rate": 4.23127779604503e-05, + "loss": 0.0078, + "step": 5269 + }, + { + "epoch": 0.6962380685008422, + "grad_norm": 0.19203080236911774, + "learning_rate": 4.227883452506769e-05, + "loss": 0.0177, + "step": 5270 + }, + { + "epoch": 0.6963701819863263, + "grad_norm": 0.1306513249874115, + "learning_rate": 4.2244901059937144e-05, + "loss": 0.0085, + "step": 5271 + }, + { + "epoch": 0.6965022954718103, + "grad_norm": 0.15973524749279022, + "learning_rate": 4.2210977570920085e-05, + "loss": 0.0173, + "step": 5272 + }, + { + "epoch": 0.6966344089572943, + "grad_norm": 0.17098453640937805, + "learning_rate": 4.2177064063876145e-05, + "loss": 0.0211, + "step": 5273 + }, + { + "epoch": 0.6967665224427784, + "grad_norm": 0.11793388426303864, + "learning_rate": 4.21431605446633e-05, + "loss": 0.0151, + "step": 5274 + }, + { + "epoch": 0.6968986359282624, + "grad_norm": 0.11402009427547455, + "learning_rate": 4.2109267019137656e-05, + "loss": 0.0117, + "step": 5275 + }, + { + "epoch": 0.6970307494137464, + "grad_norm": 0.09435441344976425, + "learning_rate": 4.207538349315375e-05, + "loss": 0.0127, + "step": 5276 + }, + { + "epoch": 0.6971628628992305, + "grad_norm": 0.2840203642845154, + "learning_rate": 4.204150997256434e-05, + "loss": 0.019, + "step": 5277 + }, + { + "epoch": 0.6972949763847145, + "grad_norm": 0.306834876537323, + "learning_rate": 4.2007646463220384e-05, + "loss": 0.0146, + "step": 5278 + }, + { + "epoch": 0.6974270898701985, + "grad_norm": 0.24466602504253387, + "learning_rate": 4.197379297097121e-05, + "loss": 0.0231, + "step": 5279 + }, + { + "epoch": 0.6975592033556826, + "grad_norm": 0.19821403920650482, + "learning_rate": 4.193994950166435e-05, + "loss": 0.0143, + "step": 5280 + }, + { + "epoch": 0.6976913168411666, + "grad_norm": 0.1387878656387329, + "learning_rate": 4.190611606114571e-05, + "loss": 0.0114, + "step": 5281 + }, + { + "epoch": 0.6978234303266506, + "grad_norm": 0.14503693580627441, + "learning_rate": 4.1872292655259274e-05, + "loss": 0.0223, + "step": 5282 + }, + { + "epoch": 0.6979555438121346, + "grad_norm": 0.14114725589752197, + "learning_rate": 4.1838479289847456e-05, + "loss": 0.0177, + "step": 5283 + }, + { + "epoch": 0.6980876572976187, + "grad_norm": 0.18414288759231567, + "learning_rate": 4.1804675970750906e-05, + "loss": 0.0148, + "step": 5284 + }, + { + "epoch": 0.6982197707831027, + "grad_norm": 0.14067934453487396, + "learning_rate": 4.177088270380846e-05, + "loss": 0.0129, + "step": 5285 + }, + { + "epoch": 0.6983518842685867, + "grad_norm": 0.16954699158668518, + "learning_rate": 4.17370994948573e-05, + "loss": 0.0192, + "step": 5286 + }, + { + "epoch": 0.6984839977540708, + "grad_norm": 0.18649275600910187, + "learning_rate": 4.170332634973284e-05, + "loss": 0.0164, + "step": 5287 + }, + { + "epoch": 0.6986161112395548, + "grad_norm": 0.11534375697374344, + "learning_rate": 4.166956327426881e-05, + "loss": 0.0062, + "step": 5288 + }, + { + "epoch": 0.6987482247250388, + "grad_norm": 0.1631186604499817, + "learning_rate": 4.163581027429706e-05, + "loss": 0.0111, + "step": 5289 + }, + { + "epoch": 0.6988803382105229, + "grad_norm": 0.21578098833560944, + "learning_rate": 4.160206735564783e-05, + "loss": 0.0155, + "step": 5290 + }, + { + "epoch": 0.6990124516960069, + "grad_norm": 0.1441599726676941, + "learning_rate": 4.156833452414963e-05, + "loss": 0.011, + "step": 5291 + }, + { + "epoch": 0.6991445651814909, + "grad_norm": 0.18236543238162994, + "learning_rate": 4.1534611785629087e-05, + "loss": 0.028, + "step": 5292 + }, + { + "epoch": 0.699276678666975, + "grad_norm": 0.18258638679981232, + "learning_rate": 4.150089914591121e-05, + "loss": 0.0169, + "step": 5293 + }, + { + "epoch": 0.699408792152459, + "grad_norm": 0.12462858110666275, + "learning_rate": 4.1467196610819234e-05, + "loss": 0.0082, + "step": 5294 + }, + { + "epoch": 0.699540905637943, + "grad_norm": 0.14412254095077515, + "learning_rate": 4.143350418617469e-05, + "loss": 0.0104, + "step": 5295 + }, + { + "epoch": 0.699673019123427, + "grad_norm": 0.1670442372560501, + "learning_rate": 4.1399821877797205e-05, + "loss": 0.0183, + "step": 5296 + }, + { + "epoch": 0.6998051326089111, + "grad_norm": 0.1631360799074173, + "learning_rate": 4.136614969150484e-05, + "loss": 0.0176, + "step": 5297 + }, + { + "epoch": 0.6999372460943951, + "grad_norm": 0.2254563421010971, + "learning_rate": 4.133248763311386e-05, + "loss": 0.0146, + "step": 5298 + }, + { + "epoch": 0.7000693595798791, + "grad_norm": 0.1660764068365097, + "learning_rate": 4.129883570843868e-05, + "loss": 0.0184, + "step": 5299 + }, + { + "epoch": 0.7002014730653632, + "grad_norm": 0.14396461844444275, + "learning_rate": 4.1265193923292076e-05, + "loss": 0.0147, + "step": 5300 + }, + { + "epoch": 0.7003335865508472, + "grad_norm": 0.11012925952672958, + "learning_rate": 4.123156228348505e-05, + "loss": 0.0109, + "step": 5301 + }, + { + "epoch": 0.7004657000363312, + "grad_norm": 0.14452813565731049, + "learning_rate": 4.119794079482686e-05, + "loss": 0.0208, + "step": 5302 + }, + { + "epoch": 0.7005978135218153, + "grad_norm": 0.2126447707414627, + "learning_rate": 4.116432946312493e-05, + "loss": 0.027, + "step": 5303 + }, + { + "epoch": 0.7007299270072993, + "grad_norm": 0.18720543384552002, + "learning_rate": 4.113072829418502e-05, + "loss": 0.021, + "step": 5304 + }, + { + "epoch": 0.7008620404927833, + "grad_norm": 0.2129470854997635, + "learning_rate": 4.109713729381113e-05, + "loss": 0.0254, + "step": 5305 + }, + { + "epoch": 0.7009941539782674, + "grad_norm": 0.20480234920978546, + "learning_rate": 4.106355646780541e-05, + "loss": 0.0193, + "step": 5306 + }, + { + "epoch": 0.7011262674637514, + "grad_norm": 0.11257723718881607, + "learning_rate": 4.1029985821968366e-05, + "loss": 0.0092, + "step": 5307 + }, + { + "epoch": 0.7012583809492354, + "grad_norm": 0.13825350999832153, + "learning_rate": 4.099642536209869e-05, + "loss": 0.0151, + "step": 5308 + }, + { + "epoch": 0.7013904944347195, + "grad_norm": 0.150064617395401, + "learning_rate": 4.096287509399337e-05, + "loss": 0.0206, + "step": 5309 + }, + { + "epoch": 0.7015226079202035, + "grad_norm": 0.14601023495197296, + "learning_rate": 4.09293350234475e-05, + "loss": 0.0147, + "step": 5310 + }, + { + "epoch": 0.7016547214056875, + "grad_norm": 0.14445830881595612, + "learning_rate": 4.089580515625454e-05, + "loss": 0.0185, + "step": 5311 + }, + { + "epoch": 0.7017868348911716, + "grad_norm": 0.15438856184482574, + "learning_rate": 4.08622854982062e-05, + "loss": 0.0146, + "step": 5312 + }, + { + "epoch": 0.7019189483766556, + "grad_norm": 0.1915358155965805, + "learning_rate": 4.082877605509229e-05, + "loss": 0.016, + "step": 5313 + }, + { + "epoch": 0.7020510618621396, + "grad_norm": 0.25274017453193665, + "learning_rate": 4.079527683270093e-05, + "loss": 0.0155, + "step": 5314 + }, + { + "epoch": 0.7021831753476236, + "grad_norm": 0.17730306088924408, + "learning_rate": 4.076178783681861e-05, + "loss": 0.0232, + "step": 5315 + }, + { + "epoch": 0.7023152888331077, + "grad_norm": 0.1441715657711029, + "learning_rate": 4.072830907322981e-05, + "loss": 0.013, + "step": 5316 + }, + { + "epoch": 0.7024474023185917, + "grad_norm": 0.23619748651981354, + "learning_rate": 4.0694840547717394e-05, + "loss": 0.0233, + "step": 5317 + }, + { + "epoch": 0.7025795158040757, + "grad_norm": 0.16689759492874146, + "learning_rate": 4.0661382266062475e-05, + "loss": 0.0125, + "step": 5318 + }, + { + "epoch": 0.7027116292895598, + "grad_norm": 0.14752112329006195, + "learning_rate": 4.062793423404426e-05, + "loss": 0.0101, + "step": 5319 + }, + { + "epoch": 0.7028437427750438, + "grad_norm": 0.14175213873386383, + "learning_rate": 4.0594496457440314e-05, + "loss": 0.0162, + "step": 5320 + }, + { + "epoch": 0.7029758562605278, + "grad_norm": 0.16413505375385284, + "learning_rate": 4.056106894202637e-05, + "loss": 0.0122, + "step": 5321 + }, + { + "epoch": 0.7031079697460119, + "grad_norm": 0.18232764303684235, + "learning_rate": 4.0527651693576463e-05, + "loss": 0.0204, + "step": 5322 + }, + { + "epoch": 0.7032400832314959, + "grad_norm": 0.13424061238765717, + "learning_rate": 4.049424471786273e-05, + "loss": 0.0193, + "step": 5323 + }, + { + "epoch": 0.7033721967169799, + "grad_norm": 0.13029293715953827, + "learning_rate": 4.046084802065562e-05, + "loss": 0.0097, + "step": 5324 + }, + { + "epoch": 0.703504310202464, + "grad_norm": 0.11119290441274643, + "learning_rate": 4.042746160772382e-05, + "loss": 0.0161, + "step": 5325 + }, + { + "epoch": 0.703636423687948, + "grad_norm": 0.17957177758216858, + "learning_rate": 4.039408548483416e-05, + "loss": 0.0161, + "step": 5326 + }, + { + "epoch": 0.703768537173432, + "grad_norm": 0.23708337545394897, + "learning_rate": 4.036071965775175e-05, + "loss": 0.0258, + "step": 5327 + }, + { + "epoch": 0.703900650658916, + "grad_norm": 0.18829962611198425, + "learning_rate": 4.032736413223994e-05, + "loss": 0.0153, + "step": 5328 + }, + { + "epoch": 0.7040327641444001, + "grad_norm": 0.27105703949928284, + "learning_rate": 4.02940189140603e-05, + "loss": 0.0318, + "step": 5329 + }, + { + "epoch": 0.7041648776298841, + "grad_norm": 0.11548773944377899, + "learning_rate": 4.026068400897251e-05, + "loss": 0.0109, + "step": 5330 + }, + { + "epoch": 0.7042969911153681, + "grad_norm": 0.10774447023868561, + "learning_rate": 4.02273594227346e-05, + "loss": 0.0143, + "step": 5331 + }, + { + "epoch": 0.7044291046008522, + "grad_norm": 0.1054147332906723, + "learning_rate": 4.019404516110279e-05, + "loss": 0.0151, + "step": 5332 + }, + { + "epoch": 0.7045612180863362, + "grad_norm": 0.12237270921468735, + "learning_rate": 4.016074122983144e-05, + "loss": 0.0151, + "step": 5333 + }, + { + "epoch": 0.7046933315718202, + "grad_norm": 0.1550246775150299, + "learning_rate": 4.012744763467322e-05, + "loss": 0.0152, + "step": 5334 + }, + { + "epoch": 0.7048254450573043, + "grad_norm": 0.1900588423013687, + "learning_rate": 4.0094164381378964e-05, + "loss": 0.0169, + "step": 5335 + }, + { + "epoch": 0.7049575585427883, + "grad_norm": 0.12014942616224289, + "learning_rate": 4.006089147569776e-05, + "loss": 0.0096, + "step": 5336 + }, + { + "epoch": 0.7050896720282723, + "grad_norm": 0.13331158459186554, + "learning_rate": 4.002762892337684e-05, + "loss": 0.0114, + "step": 5337 + }, + { + "epoch": 0.7052217855137564, + "grad_norm": 0.1908334344625473, + "learning_rate": 3.9994376730161685e-05, + "loss": 0.0168, + "step": 5338 + }, + { + "epoch": 0.7053538989992404, + "grad_norm": 0.11662087589502335, + "learning_rate": 3.996113490179605e-05, + "loss": 0.014, + "step": 5339 + }, + { + "epoch": 0.7054860124847244, + "grad_norm": 0.17463147640228271, + "learning_rate": 3.992790344402176e-05, + "loss": 0.0148, + "step": 5340 + }, + { + "epoch": 0.7056181259702085, + "grad_norm": 0.1223292127251625, + "learning_rate": 3.989468236257897e-05, + "loss": 0.0084, + "step": 5341 + }, + { + "epoch": 0.7057502394556925, + "grad_norm": 0.2078678011894226, + "learning_rate": 3.986147166320599e-05, + "loss": 0.0237, + "step": 5342 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.145186647772789, + "learning_rate": 3.98282713516394e-05, + "loss": 0.0053, + "step": 5343 + }, + { + "epoch": 0.7060144664266605, + "grad_norm": 0.1675570160150528, + "learning_rate": 3.979508143361385e-05, + "loss": 0.0118, + "step": 5344 + }, + { + "epoch": 0.7061465799121446, + "grad_norm": 0.14878936111927032, + "learning_rate": 3.976190191486231e-05, + "loss": 0.0197, + "step": 5345 + }, + { + "epoch": 0.7062786933976286, + "grad_norm": 0.1630447953939438, + "learning_rate": 3.972873280111597e-05, + "loss": 0.0181, + "step": 5346 + }, + { + "epoch": 0.7064108068831126, + "grad_norm": 0.28922614455223083, + "learning_rate": 3.969557409810408e-05, + "loss": 0.0206, + "step": 5347 + }, + { + "epoch": 0.7065429203685967, + "grad_norm": 0.12345405668020248, + "learning_rate": 3.966242581155424e-05, + "loss": 0.008, + "step": 5348 + }, + { + "epoch": 0.7066750338540807, + "grad_norm": 0.16507402062416077, + "learning_rate": 3.9629287947192196e-05, + "loss": 0.0221, + "step": 5349 + }, + { + "epoch": 0.7068071473395647, + "grad_norm": 0.23153814673423767, + "learning_rate": 3.959616051074193e-05, + "loss": 0.0161, + "step": 5350 + }, + { + "epoch": 0.7069392608250488, + "grad_norm": 0.20211441814899445, + "learning_rate": 3.9563043507925514e-05, + "loss": 0.0312, + "step": 5351 + }, + { + "epoch": 0.7070713743105328, + "grad_norm": 0.14686128497123718, + "learning_rate": 3.952993694446332e-05, + "loss": 0.0174, + "step": 5352 + }, + { + "epoch": 0.7072034877960168, + "grad_norm": 0.19460754096508026, + "learning_rate": 3.949684082607393e-05, + "loss": 0.0259, + "step": 5353 + }, + { + "epoch": 0.7073356012815009, + "grad_norm": 0.24404078722000122, + "learning_rate": 3.9463755158474e-05, + "loss": 0.017, + "step": 5354 + }, + { + "epoch": 0.7074677147669849, + "grad_norm": 0.13295841217041016, + "learning_rate": 3.9430679947378526e-05, + "loss": 0.0108, + "step": 5355 + }, + { + "epoch": 0.7075998282524689, + "grad_norm": 0.11946427822113037, + "learning_rate": 3.939761519850059e-05, + "loss": 0.0098, + "step": 5356 + }, + { + "epoch": 0.707731941737953, + "grad_norm": 0.1620432436466217, + "learning_rate": 3.9364560917551574e-05, + "loss": 0.0173, + "step": 5357 + }, + { + "epoch": 0.707864055223437, + "grad_norm": 0.17194488644599915, + "learning_rate": 3.93315171102409e-05, + "loss": 0.0117, + "step": 5358 + }, + { + "epoch": 0.707996168708921, + "grad_norm": 0.19578425586223602, + "learning_rate": 3.929848378227632e-05, + "loss": 0.0237, + "step": 5359 + }, + { + "epoch": 0.708128282194405, + "grad_norm": 0.11690971255302429, + "learning_rate": 3.926546093936374e-05, + "loss": 0.0138, + "step": 5360 + }, + { + "epoch": 0.7082603956798891, + "grad_norm": 0.12806108593940735, + "learning_rate": 3.923244858720718e-05, + "loss": 0.0144, + "step": 5361 + }, + { + "epoch": 0.7083925091653731, + "grad_norm": 0.2078218311071396, + "learning_rate": 3.919944673150894e-05, + "loss": 0.0222, + "step": 5362 + }, + { + "epoch": 0.7085246226508571, + "grad_norm": 0.1328975111246109, + "learning_rate": 3.916645537796947e-05, + "loss": 0.0121, + "step": 5363 + }, + { + "epoch": 0.7086567361363412, + "grad_norm": 0.12743492424488068, + "learning_rate": 3.9133474532287453e-05, + "loss": 0.0207, + "step": 5364 + }, + { + "epoch": 0.7087888496218252, + "grad_norm": 0.11305283010005951, + "learning_rate": 3.910050420015964e-05, + "loss": 0.0083, + "step": 5365 + }, + { + "epoch": 0.7089209631073092, + "grad_norm": 0.20388872921466827, + "learning_rate": 3.906754438728106e-05, + "loss": 0.0219, + "step": 5366 + }, + { + "epoch": 0.7090530765927933, + "grad_norm": 0.17367129027843475, + "learning_rate": 3.9034595099344964e-05, + "loss": 0.0194, + "step": 5367 + }, + { + "epoch": 0.7091851900782773, + "grad_norm": 0.18559734523296356, + "learning_rate": 3.900165634204263e-05, + "loss": 0.0219, + "step": 5368 + }, + { + "epoch": 0.7093173035637613, + "grad_norm": 0.14173002541065216, + "learning_rate": 3.896872812106367e-05, + "loss": 0.0099, + "step": 5369 + }, + { + "epoch": 0.7094494170492454, + "grad_norm": 0.20723380148410797, + "learning_rate": 3.89358104420958e-05, + "loss": 0.0108, + "step": 5370 + }, + { + "epoch": 0.7095815305347294, + "grad_norm": 0.6518858075141907, + "learning_rate": 3.890290331082499e-05, + "loss": 0.0205, + "step": 5371 + }, + { + "epoch": 0.7097136440202133, + "grad_norm": 0.15502440929412842, + "learning_rate": 3.8870006732935206e-05, + "loss": 0.0129, + "step": 5372 + }, + { + "epoch": 0.7098457575056973, + "grad_norm": 0.13822178542613983, + "learning_rate": 3.883712071410882e-05, + "loss": 0.0171, + "step": 5373 + }, + { + "epoch": 0.7099778709911814, + "grad_norm": 0.1479300558567047, + "learning_rate": 3.880424526002631e-05, + "loss": 0.0137, + "step": 5374 + }, + { + "epoch": 0.7101099844766654, + "grad_norm": 0.21749626100063324, + "learning_rate": 3.8771380376366186e-05, + "loss": 0.0216, + "step": 5375 + }, + { + "epoch": 0.7102420979621494, + "grad_norm": 0.20055606961250305, + "learning_rate": 3.873852606880529e-05, + "loss": 0.0194, + "step": 5376 + }, + { + "epoch": 0.7103742114476335, + "grad_norm": 0.13387270271778107, + "learning_rate": 3.8705682343018645e-05, + "loss": 0.0103, + "step": 5377 + }, + { + "epoch": 0.7105063249331175, + "grad_norm": 0.15972426533699036, + "learning_rate": 3.86728492046793e-05, + "loss": 0.0201, + "step": 5378 + }, + { + "epoch": 0.7106384384186015, + "grad_norm": 0.07348188757896423, + "learning_rate": 3.864002665945859e-05, + "loss": 0.0059, + "step": 5379 + }, + { + "epoch": 0.7107705519040856, + "grad_norm": 0.07760182023048401, + "learning_rate": 3.8607214713026016e-05, + "loss": 0.007, + "step": 5380 + }, + { + "epoch": 0.7109026653895696, + "grad_norm": 0.1682509332895279, + "learning_rate": 3.8574413371049264e-05, + "loss": 0.0116, + "step": 5381 + }, + { + "epoch": 0.7110347788750536, + "grad_norm": 0.2266969084739685, + "learning_rate": 3.854162263919408e-05, + "loss": 0.015, + "step": 5382 + }, + { + "epoch": 0.7111668923605377, + "grad_norm": 0.14937283098697662, + "learning_rate": 3.8508842523124466e-05, + "loss": 0.0089, + "step": 5383 + }, + { + "epoch": 0.7112990058460217, + "grad_norm": 0.1911482810974121, + "learning_rate": 3.8476073028502634e-05, + "loss": 0.016, + "step": 5384 + }, + { + "epoch": 0.7114311193315057, + "grad_norm": 0.13798931241035461, + "learning_rate": 3.844331416098882e-05, + "loss": 0.0214, + "step": 5385 + }, + { + "epoch": 0.7115632328169897, + "grad_norm": 0.12851540744304657, + "learning_rate": 3.841056592624155e-05, + "loss": 0.0175, + "step": 5386 + }, + { + "epoch": 0.7116953463024738, + "grad_norm": 0.18764302134513855, + "learning_rate": 3.8377828329917456e-05, + "loss": 0.0166, + "step": 5387 + }, + { + "epoch": 0.7118274597879578, + "grad_norm": 0.1515864133834839, + "learning_rate": 3.834510137767138e-05, + "loss": 0.0184, + "step": 5388 + }, + { + "epoch": 0.7119595732734418, + "grad_norm": 0.19872577488422394, + "learning_rate": 3.831238507515623e-05, + "loss": 0.0197, + "step": 5389 + }, + { + "epoch": 0.7120916867589259, + "grad_norm": 0.27276068925857544, + "learning_rate": 3.827967942802317e-05, + "loss": 0.0258, + "step": 5390 + }, + { + "epoch": 0.7122238002444099, + "grad_norm": 0.15714675188064575, + "learning_rate": 3.824698444192153e-05, + "loss": 0.0137, + "step": 5391 + }, + { + "epoch": 0.7123559137298939, + "grad_norm": 0.1761281043291092, + "learning_rate": 3.821430012249867e-05, + "loss": 0.0226, + "step": 5392 + }, + { + "epoch": 0.712488027215378, + "grad_norm": 0.14952731132507324, + "learning_rate": 3.818162647540024e-05, + "loss": 0.0155, + "step": 5393 + }, + { + "epoch": 0.712620140700862, + "grad_norm": 0.16243423521518707, + "learning_rate": 3.814896350627001e-05, + "loss": 0.015, + "step": 5394 + }, + { + "epoch": 0.712752254186346, + "grad_norm": 0.13705137372016907, + "learning_rate": 3.8116311220749915e-05, + "loss": 0.0151, + "step": 5395 + }, + { + "epoch": 0.71288436767183, + "grad_norm": 0.13219165802001953, + "learning_rate": 3.8083669624479964e-05, + "loss": 0.0138, + "step": 5396 + }, + { + "epoch": 0.7130164811573141, + "grad_norm": 0.2142573893070221, + "learning_rate": 3.805103872309843e-05, + "loss": 0.0163, + "step": 5397 + }, + { + "epoch": 0.7131485946427981, + "grad_norm": 0.12394694983959198, + "learning_rate": 3.8018418522241705e-05, + "loss": 0.015, + "step": 5398 + }, + { + "epoch": 0.7132807081282821, + "grad_norm": 0.17286397516727448, + "learning_rate": 3.798580902754426e-05, + "loss": 0.015, + "step": 5399 + }, + { + "epoch": 0.7134128216137662, + "grad_norm": 0.7974072694778442, + "learning_rate": 3.795321024463882e-05, + "loss": 0.0205, + "step": 5400 + }, + { + "epoch": 0.7135449350992502, + "grad_norm": 0.16915547847747803, + "learning_rate": 3.7920622179156194e-05, + "loss": 0.0171, + "step": 5401 + }, + { + "epoch": 0.7136770485847342, + "grad_norm": 0.1622791886329651, + "learning_rate": 3.78880448367254e-05, + "loss": 0.0162, + "step": 5402 + }, + { + "epoch": 0.7138091620702183, + "grad_norm": 0.2695022523403168, + "learning_rate": 3.785547822297352e-05, + "loss": 0.0136, + "step": 5403 + }, + { + "epoch": 0.7139412755557023, + "grad_norm": 0.15922003984451294, + "learning_rate": 3.7822922343525826e-05, + "loss": 0.0136, + "step": 5404 + }, + { + "epoch": 0.7140733890411863, + "grad_norm": 0.1592172086238861, + "learning_rate": 3.77903772040058e-05, + "loss": 0.0179, + "step": 5405 + }, + { + "epoch": 0.7142055025266704, + "grad_norm": 0.2182742953300476, + "learning_rate": 3.775784281003493e-05, + "loss": 0.0189, + "step": 5406 + }, + { + "epoch": 0.7143376160121544, + "grad_norm": 0.16970114409923553, + "learning_rate": 3.772531916723294e-05, + "loss": 0.0261, + "step": 5407 + }, + { + "epoch": 0.7144697294976384, + "grad_norm": 0.21392595767974854, + "learning_rate": 3.769280628121772e-05, + "loss": 0.0196, + "step": 5408 + }, + { + "epoch": 0.7146018429831225, + "grad_norm": 0.09747704863548279, + "learning_rate": 3.766030415760525e-05, + "loss": 0.0069, + "step": 5409 + }, + { + "epoch": 0.7147339564686065, + "grad_norm": 0.16599935293197632, + "learning_rate": 3.762781280200964e-05, + "loss": 0.0174, + "step": 5410 + }, + { + "epoch": 0.7148660699540905, + "grad_norm": 0.25641050934791565, + "learning_rate": 3.759533222004318e-05, + "loss": 0.0117, + "step": 5411 + }, + { + "epoch": 0.7149981834395746, + "grad_norm": 0.1257232427597046, + "learning_rate": 3.75628624173163e-05, + "loss": 0.0109, + "step": 5412 + }, + { + "epoch": 0.7151302969250586, + "grad_norm": 0.0978141725063324, + "learning_rate": 3.7530403399437506e-05, + "loss": 0.0048, + "step": 5413 + }, + { + "epoch": 0.7152624104105426, + "grad_norm": 0.12040681391954422, + "learning_rate": 3.749795517201352e-05, + "loss": 0.0074, + "step": 5414 + }, + { + "epoch": 0.7153945238960266, + "grad_norm": 0.20707263052463531, + "learning_rate": 3.746551774064915e-05, + "loss": 0.0269, + "step": 5415 + }, + { + "epoch": 0.7155266373815107, + "grad_norm": 0.10615110397338867, + "learning_rate": 3.7433091110947406e-05, + "loss": 0.0084, + "step": 5416 + }, + { + "epoch": 0.7156587508669947, + "grad_norm": 0.14646649360656738, + "learning_rate": 3.740067528850931e-05, + "loss": 0.0182, + "step": 5417 + }, + { + "epoch": 0.7157908643524787, + "grad_norm": 0.16973420977592468, + "learning_rate": 3.736827027893411e-05, + "loss": 0.0195, + "step": 5418 + }, + { + "epoch": 0.7159229778379628, + "grad_norm": 0.2045861929655075, + "learning_rate": 3.733587608781922e-05, + "loss": 0.025, + "step": 5419 + }, + { + "epoch": 0.7160550913234468, + "grad_norm": 0.12251792848110199, + "learning_rate": 3.730349272076006e-05, + "loss": 0.0182, + "step": 5420 + }, + { + "epoch": 0.7161872048089308, + "grad_norm": 0.14665895700454712, + "learning_rate": 3.7271120183350274e-05, + "loss": 0.0127, + "step": 5421 + }, + { + "epoch": 0.7163193182944149, + "grad_norm": 0.15170572698116302, + "learning_rate": 3.7238758481181614e-05, + "loss": 0.0128, + "step": 5422 + }, + { + "epoch": 0.7164514317798989, + "grad_norm": 0.20112718641757965, + "learning_rate": 3.7206407619843995e-05, + "loss": 0.0158, + "step": 5423 + }, + { + "epoch": 0.7165835452653829, + "grad_norm": 0.13899517059326172, + "learning_rate": 3.7174067604925354e-05, + "loss": 0.0133, + "step": 5424 + }, + { + "epoch": 0.716715658750867, + "grad_norm": 0.17532852292060852, + "learning_rate": 3.714173844201187e-05, + "loss": 0.0194, + "step": 5425 + }, + { + "epoch": 0.716847772236351, + "grad_norm": 0.1427544355392456, + "learning_rate": 3.710942013668782e-05, + "loss": 0.0109, + "step": 5426 + }, + { + "epoch": 0.716979885721835, + "grad_norm": 0.158977672457695, + "learning_rate": 3.707711269453553e-05, + "loss": 0.0181, + "step": 5427 + }, + { + "epoch": 0.717111999207319, + "grad_norm": 0.10790096968412399, + "learning_rate": 3.704481612113554e-05, + "loss": 0.0104, + "step": 5428 + }, + { + "epoch": 0.7172441126928031, + "grad_norm": 0.12931567430496216, + "learning_rate": 3.701253042206646e-05, + "loss": 0.0142, + "step": 5429 + }, + { + "epoch": 0.7173762261782871, + "grad_norm": 0.09444590657949448, + "learning_rate": 3.69802556029051e-05, + "loss": 0.0117, + "step": 5430 + }, + { + "epoch": 0.7175083396637711, + "grad_norm": 0.22288116812705994, + "learning_rate": 3.6947991669226225e-05, + "loss": 0.024, + "step": 5431 + }, + { + "epoch": 0.7176404531492552, + "grad_norm": 0.1592830866575241, + "learning_rate": 3.6915738626602936e-05, + "loss": 0.0085, + "step": 5432 + }, + { + "epoch": 0.7177725666347392, + "grad_norm": 0.1483837217092514, + "learning_rate": 3.688349648060628e-05, + "loss": 0.0118, + "step": 5433 + }, + { + "epoch": 0.7179046801202232, + "grad_norm": 0.11007316410541534, + "learning_rate": 3.68512652368055e-05, + "loss": 0.0132, + "step": 5434 + }, + { + "epoch": 0.7180367936057073, + "grad_norm": 0.20218022167682648, + "learning_rate": 3.681904490076793e-05, + "loss": 0.0206, + "step": 5435 + }, + { + "epoch": 0.7181689070911913, + "grad_norm": 0.16886425018310547, + "learning_rate": 3.678683547805908e-05, + "loss": 0.0221, + "step": 5436 + }, + { + "epoch": 0.7183010205766753, + "grad_norm": 0.1661931276321411, + "learning_rate": 3.675463697424246e-05, + "loss": 0.0176, + "step": 5437 + }, + { + "epoch": 0.7184331340621594, + "grad_norm": 0.13714224100112915, + "learning_rate": 3.6722449394879774e-05, + "loss": 0.0113, + "step": 5438 + }, + { + "epoch": 0.7185652475476434, + "grad_norm": 0.13536618649959564, + "learning_rate": 3.669027274553088e-05, + "loss": 0.0171, + "step": 5439 + }, + { + "epoch": 0.7186973610331274, + "grad_norm": 0.2858543395996094, + "learning_rate": 3.665810703175362e-05, + "loss": 0.0207, + "step": 5440 + }, + { + "epoch": 0.7188294745186115, + "grad_norm": 0.16099917888641357, + "learning_rate": 3.6625952259104045e-05, + "loss": 0.0187, + "step": 5441 + }, + { + "epoch": 0.7189615880040955, + "grad_norm": 0.21166282892227173, + "learning_rate": 3.65938084331363e-05, + "loss": 0.0206, + "step": 5442 + }, + { + "epoch": 0.7190937014895795, + "grad_norm": 0.17520025372505188, + "learning_rate": 3.656167555940265e-05, + "loss": 0.0256, + "step": 5443 + }, + { + "epoch": 0.7192258149750635, + "grad_norm": 0.15425994992256165, + "learning_rate": 3.65295536434534e-05, + "loss": 0.014, + "step": 5444 + }, + { + "epoch": 0.7193579284605476, + "grad_norm": 0.14609721302986145, + "learning_rate": 3.6497442690837025e-05, + "loss": 0.0142, + "step": 5445 + }, + { + "epoch": 0.7194900419460316, + "grad_norm": 0.4000420570373535, + "learning_rate": 3.6465342707100136e-05, + "loss": 0.015, + "step": 5446 + }, + { + "epoch": 0.7196221554315156, + "grad_norm": 0.21911656856536865, + "learning_rate": 3.643325369778734e-05, + "loss": 0.0137, + "step": 5447 + }, + { + "epoch": 0.7197542689169997, + "grad_norm": 0.19847404956817627, + "learning_rate": 3.640117566844144e-05, + "loss": 0.0275, + "step": 5448 + }, + { + "epoch": 0.7198863824024837, + "grad_norm": 0.22252239286899567, + "learning_rate": 3.636910862460332e-05, + "loss": 0.0295, + "step": 5449 + }, + { + "epoch": 0.7200184958879677, + "grad_norm": 0.15440602600574493, + "learning_rate": 3.633705257181199e-05, + "loss": 0.0093, + "step": 5450 + }, + { + "epoch": 0.7201506093734518, + "grad_norm": 0.13582760095596313, + "learning_rate": 3.6305007515604484e-05, + "loss": 0.012, + "step": 5451 + }, + { + "epoch": 0.7202827228589358, + "grad_norm": 0.26740747690200806, + "learning_rate": 3.6272973461515994e-05, + "loss": 0.0175, + "step": 5452 + }, + { + "epoch": 0.7204148363444198, + "grad_norm": 0.11057921499013901, + "learning_rate": 3.624095041507985e-05, + "loss": 0.0145, + "step": 5453 + }, + { + "epoch": 0.7205469498299039, + "grad_norm": 0.17326849699020386, + "learning_rate": 3.620893838182737e-05, + "loss": 0.0138, + "step": 5454 + }, + { + "epoch": 0.7206790633153879, + "grad_norm": 0.13421256840229034, + "learning_rate": 3.617693736728808e-05, + "loss": 0.0125, + "step": 5455 + }, + { + "epoch": 0.7208111768008719, + "grad_norm": 0.22888900339603424, + "learning_rate": 3.6144947376989525e-05, + "loss": 0.0186, + "step": 5456 + }, + { + "epoch": 0.720943290286356, + "grad_norm": 0.10150676965713501, + "learning_rate": 3.611296841645744e-05, + "loss": 0.0069, + "step": 5457 + }, + { + "epoch": 0.72107540377184, + "grad_norm": 0.15139426290988922, + "learning_rate": 3.608100049121551e-05, + "loss": 0.0148, + "step": 5458 + }, + { + "epoch": 0.721207517257324, + "grad_norm": 0.25483521819114685, + "learning_rate": 3.604904360678563e-05, + "loss": 0.0203, + "step": 5459 + }, + { + "epoch": 0.721339630742808, + "grad_norm": 0.23597757518291473, + "learning_rate": 3.601709776868779e-05, + "loss": 0.0205, + "step": 5460 + }, + { + "epoch": 0.7214717442282921, + "grad_norm": 0.14819002151489258, + "learning_rate": 3.598516298243998e-05, + "loss": 0.0155, + "step": 5461 + }, + { + "epoch": 0.7216038577137761, + "grad_norm": 0.15574060380458832, + "learning_rate": 3.595323925355836e-05, + "loss": 0.0156, + "step": 5462 + }, + { + "epoch": 0.7217359711992601, + "grad_norm": 0.21066302061080933, + "learning_rate": 3.592132658755716e-05, + "loss": 0.0146, + "step": 5463 + }, + { + "epoch": 0.7218680846847442, + "grad_norm": 0.13125142455101013, + "learning_rate": 3.5889424989948725e-05, + "loss": 0.0105, + "step": 5464 + }, + { + "epoch": 0.7220001981702282, + "grad_norm": 0.13324397802352905, + "learning_rate": 3.5857534466243404e-05, + "loss": 0.0133, + "step": 5465 + }, + { + "epoch": 0.7221323116557122, + "grad_norm": 0.27421751618385315, + "learning_rate": 3.58256550219497e-05, + "loss": 0.0137, + "step": 5466 + }, + { + "epoch": 0.7222644251411963, + "grad_norm": 0.14124605059623718, + "learning_rate": 3.5793786662574255e-05, + "loss": 0.0144, + "step": 5467 + }, + { + "epoch": 0.7223965386266803, + "grad_norm": 0.14751692116260529, + "learning_rate": 3.576192939362164e-05, + "loss": 0.0188, + "step": 5468 + }, + { + "epoch": 0.7225286521121643, + "grad_norm": 0.123799629509449, + "learning_rate": 3.5730083220594646e-05, + "loss": 0.0082, + "step": 5469 + }, + { + "epoch": 0.7226607655976484, + "grad_norm": 0.12402285635471344, + "learning_rate": 3.5698248148994106e-05, + "loss": 0.0167, + "step": 5470 + }, + { + "epoch": 0.7227928790831324, + "grad_norm": 0.18801485002040863, + "learning_rate": 3.566642418431897e-05, + "loss": 0.0184, + "step": 5471 + }, + { + "epoch": 0.7229249925686164, + "grad_norm": 0.15218046307563782, + "learning_rate": 3.563461133206616e-05, + "loss": 0.0215, + "step": 5472 + }, + { + "epoch": 0.7230571060541005, + "grad_norm": 0.1386367380619049, + "learning_rate": 3.560280959773078e-05, + "loss": 0.0168, + "step": 5473 + }, + { + "epoch": 0.7231892195395845, + "grad_norm": 0.1280830353498459, + "learning_rate": 3.557101898680601e-05, + "loss": 0.013, + "step": 5474 + }, + { + "epoch": 0.7233213330250685, + "grad_norm": 0.1812014877796173, + "learning_rate": 3.553923950478305e-05, + "loss": 0.0152, + "step": 5475 + }, + { + "epoch": 0.7234534465105525, + "grad_norm": 0.12511831521987915, + "learning_rate": 3.5507471157151214e-05, + "loss": 0.0136, + "step": 5476 + }, + { + "epoch": 0.7235855599960366, + "grad_norm": 0.1868468075990677, + "learning_rate": 3.5475713949397914e-05, + "loss": 0.0083, + "step": 5477 + }, + { + "epoch": 0.7237176734815206, + "grad_norm": 0.17483773827552795, + "learning_rate": 3.544396788700863e-05, + "loss": 0.0154, + "step": 5478 + }, + { + "epoch": 0.7238497869670046, + "grad_norm": 0.0938841700553894, + "learning_rate": 3.541223297546683e-05, + "loss": 0.0086, + "step": 5479 + }, + { + "epoch": 0.7239819004524887, + "grad_norm": 0.13073642551898956, + "learning_rate": 3.5380509220254176e-05, + "loss": 0.0126, + "step": 5480 + }, + { + "epoch": 0.7241140139379727, + "grad_norm": 0.14963223040103912, + "learning_rate": 3.534879662685038e-05, + "loss": 0.0183, + "step": 5481 + }, + { + "epoch": 0.7242461274234567, + "grad_norm": 0.11111201345920563, + "learning_rate": 3.531709520073313e-05, + "loss": 0.0104, + "step": 5482 + }, + { + "epoch": 0.7243782409089408, + "grad_norm": 0.22355780005455017, + "learning_rate": 3.528540494737829e-05, + "loss": 0.0241, + "step": 5483 + }, + { + "epoch": 0.7245103543944248, + "grad_norm": 0.22933557629585266, + "learning_rate": 3.5253725872259756e-05, + "loss": 0.015, + "step": 5484 + }, + { + "epoch": 0.7246424678799088, + "grad_norm": 0.19833749532699585, + "learning_rate": 3.522205798084954e-05, + "loss": 0.0243, + "step": 5485 + }, + { + "epoch": 0.7247745813653929, + "grad_norm": 0.18630573153495789, + "learning_rate": 3.51904012786176e-05, + "loss": 0.0221, + "step": 5486 + }, + { + "epoch": 0.7249066948508769, + "grad_norm": 0.2737390100955963, + "learning_rate": 3.515875577103207e-05, + "loss": 0.0285, + "step": 5487 + }, + { + "epoch": 0.7250388083363609, + "grad_norm": 0.16898681223392487, + "learning_rate": 3.5127121463559165e-05, + "loss": 0.0163, + "step": 5488 + }, + { + "epoch": 0.725170921821845, + "grad_norm": 0.14862626791000366, + "learning_rate": 3.5095498361663015e-05, + "loss": 0.0104, + "step": 5489 + }, + { + "epoch": 0.725303035307329, + "grad_norm": 0.2469290941953659, + "learning_rate": 3.5063886470806015e-05, + "loss": 0.0314, + "step": 5490 + }, + { + "epoch": 0.725435148792813, + "grad_norm": 0.116226427257061, + "learning_rate": 3.503228579644854e-05, + "loss": 0.0112, + "step": 5491 + }, + { + "epoch": 0.725567262278297, + "grad_norm": 0.08892618864774704, + "learning_rate": 3.5000696344048934e-05, + "loss": 0.0101, + "step": 5492 + }, + { + "epoch": 0.7256993757637811, + "grad_norm": 0.16128192842006683, + "learning_rate": 3.496911811906373e-05, + "loss": 0.0202, + "step": 5493 + }, + { + "epoch": 0.7258314892492651, + "grad_norm": 0.20747168362140656, + "learning_rate": 3.4937551126947465e-05, + "loss": 0.0136, + "step": 5494 + }, + { + "epoch": 0.7259636027347491, + "grad_norm": 0.16624540090560913, + "learning_rate": 3.490599537315279e-05, + "loss": 0.0151, + "step": 5495 + }, + { + "epoch": 0.7260957162202332, + "grad_norm": 0.1490861475467682, + "learning_rate": 3.48744508631303e-05, + "loss": 0.0149, + "step": 5496 + }, + { + "epoch": 0.7262278297057172, + "grad_norm": 0.1925140619277954, + "learning_rate": 3.484291760232876e-05, + "loss": 0.0133, + "step": 5497 + }, + { + "epoch": 0.7263599431912012, + "grad_norm": 0.18869644403457642, + "learning_rate": 3.481139559619497e-05, + "loss": 0.0177, + "step": 5498 + }, + { + "epoch": 0.7264920566766853, + "grad_norm": 0.16262781620025635, + "learning_rate": 3.47798848501737e-05, + "loss": 0.0103, + "step": 5499 + }, + { + "epoch": 0.7266241701621693, + "grad_norm": 0.15225239098072052, + "learning_rate": 3.4748385369707906e-05, + "loss": 0.0161, + "step": 5500 + }, + { + "epoch": 0.7267562836476533, + "grad_norm": 0.1811465173959732, + "learning_rate": 3.471689716023849e-05, + "loss": 0.0109, + "step": 5501 + }, + { + "epoch": 0.7268883971331374, + "grad_norm": 0.1548740416765213, + "learning_rate": 3.4685420227204526e-05, + "loss": 0.0169, + "step": 5502 + }, + { + "epoch": 0.7270205106186214, + "grad_norm": 0.11394735425710678, + "learning_rate": 3.465395457604297e-05, + "loss": 0.0148, + "step": 5503 + }, + { + "epoch": 0.7271526241041054, + "grad_norm": 0.1491413563489914, + "learning_rate": 3.4622500212188966e-05, + "loss": 0.0135, + "step": 5504 + }, + { + "epoch": 0.7272847375895894, + "grad_norm": 0.23304897546768188, + "learning_rate": 3.459105714107571e-05, + "loss": 0.016, + "step": 5505 + }, + { + "epoch": 0.7274168510750735, + "grad_norm": 0.318596214056015, + "learning_rate": 3.455962536813432e-05, + "loss": 0.0168, + "step": 5506 + }, + { + "epoch": 0.7275489645605575, + "grad_norm": 0.2458907663822174, + "learning_rate": 3.4528204898794104e-05, + "loss": 0.0184, + "step": 5507 + }, + { + "epoch": 0.7276810780460415, + "grad_norm": 0.2839372456073761, + "learning_rate": 3.449679573848233e-05, + "loss": 0.0173, + "step": 5508 + }, + { + "epoch": 0.7278131915315256, + "grad_norm": 0.19794289767742157, + "learning_rate": 3.4465397892624417e-05, + "loss": 0.0155, + "step": 5509 + }, + { + "epoch": 0.7279453050170096, + "grad_norm": 0.17745184898376465, + "learning_rate": 3.4434011366643645e-05, + "loss": 0.022, + "step": 5510 + }, + { + "epoch": 0.7280774185024936, + "grad_norm": 0.16284947097301483, + "learning_rate": 3.4402636165961524e-05, + "loss": 0.0171, + "step": 5511 + }, + { + "epoch": 0.7282095319879777, + "grad_norm": 0.1310679018497467, + "learning_rate": 3.437127229599754e-05, + "loss": 0.0112, + "step": 5512 + }, + { + "epoch": 0.7283416454734617, + "grad_norm": 0.21692624688148499, + "learning_rate": 3.433991976216915e-05, + "loss": 0.0348, + "step": 5513 + }, + { + "epoch": 0.7284737589589457, + "grad_norm": 0.16403941810131073, + "learning_rate": 3.430857856989196e-05, + "loss": 0.0196, + "step": 5514 + }, + { + "epoch": 0.7286058724444298, + "grad_norm": 0.19069243967533112, + "learning_rate": 3.427724872457957e-05, + "loss": 0.0215, + "step": 5515 + }, + { + "epoch": 0.7287379859299138, + "grad_norm": 0.17088550329208374, + "learning_rate": 3.424593023164366e-05, + "loss": 0.0116, + "step": 5516 + }, + { + "epoch": 0.7288700994153978, + "grad_norm": 0.19054432213306427, + "learning_rate": 3.421462309649385e-05, + "loss": 0.0175, + "step": 5517 + }, + { + "epoch": 0.7290022129008819, + "grad_norm": 0.17557857930660248, + "learning_rate": 3.41833273245379e-05, + "loss": 0.0155, + "step": 5518 + }, + { + "epoch": 0.7291343263863659, + "grad_norm": 0.13773734867572784, + "learning_rate": 3.4152042921181584e-05, + "loss": 0.0141, + "step": 5519 + }, + { + "epoch": 0.7292664398718499, + "grad_norm": 0.20974580943584442, + "learning_rate": 3.412076989182864e-05, + "loss": 0.029, + "step": 5520 + }, + { + "epoch": 0.729398553357334, + "grad_norm": 0.17927774786949158, + "learning_rate": 3.408950824188094e-05, + "loss": 0.024, + "step": 5521 + }, + { + "epoch": 0.729530666842818, + "grad_norm": 0.20614896714687347, + "learning_rate": 3.405825797673835e-05, + "loss": 0.0305, + "step": 5522 + }, + { + "epoch": 0.729662780328302, + "grad_norm": 0.2965283691883087, + "learning_rate": 3.402701910179879e-05, + "loss": 0.026, + "step": 5523 + }, + { + "epoch": 0.729794893813786, + "grad_norm": 0.1992853283882141, + "learning_rate": 3.399579162245814e-05, + "loss": 0.0199, + "step": 5524 + }, + { + "epoch": 0.7299270072992701, + "grad_norm": 0.24798119068145752, + "learning_rate": 3.396457554411038e-05, + "loss": 0.0276, + "step": 5525 + }, + { + "epoch": 0.7300591207847541, + "grad_norm": 0.07950809597969055, + "learning_rate": 3.393337087214755e-05, + "loss": 0.0069, + "step": 5526 + }, + { + "epoch": 0.7301912342702381, + "grad_norm": 0.1755135953426361, + "learning_rate": 3.3902177611959606e-05, + "loss": 0.0148, + "step": 5527 + }, + { + "epoch": 0.7303233477557222, + "grad_norm": 0.13987764716148376, + "learning_rate": 3.387099576893462e-05, + "loss": 0.0178, + "step": 5528 + }, + { + "epoch": 0.7304554612412062, + "grad_norm": 0.15794652700424194, + "learning_rate": 3.38398253484587e-05, + "loss": 0.0214, + "step": 5529 + }, + { + "epoch": 0.7305875747266902, + "grad_norm": 0.1341640204191208, + "learning_rate": 3.3808666355915954e-05, + "loss": 0.0144, + "step": 5530 + }, + { + "epoch": 0.7307196882121743, + "grad_norm": 0.152041494846344, + "learning_rate": 3.377751879668847e-05, + "loss": 0.0153, + "step": 5531 + }, + { + "epoch": 0.7308518016976583, + "grad_norm": 0.10529367625713348, + "learning_rate": 3.374638267615643e-05, + "loss": 0.0043, + "step": 5532 + }, + { + "epoch": 0.7309839151831423, + "grad_norm": 0.13288097083568573, + "learning_rate": 3.371525799969806e-05, + "loss": 0.0159, + "step": 5533 + }, + { + "epoch": 0.7311160286686263, + "grad_norm": 0.1831805258989334, + "learning_rate": 3.3684144772689494e-05, + "loss": 0.0204, + "step": 5534 + }, + { + "epoch": 0.7312481421541104, + "grad_norm": 0.21946914494037628, + "learning_rate": 3.365304300050499e-05, + "loss": 0.0278, + "step": 5535 + }, + { + "epoch": 0.7313802556395944, + "grad_norm": 0.17427095770835876, + "learning_rate": 3.36219526885168e-05, + "loss": 0.0222, + "step": 5536 + }, + { + "epoch": 0.7315123691250784, + "grad_norm": 0.10152660310268402, + "learning_rate": 3.359087384209523e-05, + "loss": 0.0108, + "step": 5537 + }, + { + "epoch": 0.7316444826105625, + "grad_norm": 0.21317848563194275, + "learning_rate": 3.35598064666085e-05, + "loss": 0.0276, + "step": 5538 + }, + { + "epoch": 0.7317765960960465, + "grad_norm": 0.10813872516155243, + "learning_rate": 3.352875056742295e-05, + "loss": 0.0065, + "step": 5539 + }, + { + "epoch": 0.7319087095815305, + "grad_norm": 0.13802549242973328, + "learning_rate": 3.3497706149902944e-05, + "loss": 0.0146, + "step": 5540 + }, + { + "epoch": 0.7320408230670146, + "grad_norm": 0.1580958366394043, + "learning_rate": 3.346667321941076e-05, + "loss": 0.0158, + "step": 5541 + }, + { + "epoch": 0.7321729365524986, + "grad_norm": 0.1649957150220871, + "learning_rate": 3.343565178130678e-05, + "loss": 0.0171, + "step": 5542 + }, + { + "epoch": 0.7323050500379826, + "grad_norm": 0.1353890597820282, + "learning_rate": 3.340464184094938e-05, + "loss": 0.0075, + "step": 5543 + }, + { + "epoch": 0.7324371635234667, + "grad_norm": 0.14383232593536377, + "learning_rate": 3.337364340369499e-05, + "loss": 0.0205, + "step": 5544 + }, + { + "epoch": 0.7325692770089507, + "grad_norm": 0.12641078233718872, + "learning_rate": 3.334265647489794e-05, + "loss": 0.0116, + "step": 5545 + }, + { + "epoch": 0.7327013904944347, + "grad_norm": 0.1792270541191101, + "learning_rate": 3.331168105991067e-05, + "loss": 0.0248, + "step": 5546 + }, + { + "epoch": 0.7328335039799188, + "grad_norm": 0.14551857113838196, + "learning_rate": 3.328071716408364e-05, + "loss": 0.0102, + "step": 5547 + }, + { + "epoch": 0.7329656174654028, + "grad_norm": 0.15047802031040192, + "learning_rate": 3.324976479276518e-05, + "loss": 0.0043, + "step": 5548 + }, + { + "epoch": 0.7330977309508868, + "grad_norm": 0.20460718870162964, + "learning_rate": 3.321882395130185e-05, + "loss": 0.0155, + "step": 5549 + }, + { + "epoch": 0.7332298444363708, + "grad_norm": 0.09014448523521423, + "learning_rate": 3.318789464503808e-05, + "loss": 0.0095, + "step": 5550 + }, + { + "epoch": 0.7333619579218549, + "grad_norm": 0.145356684923172, + "learning_rate": 3.315697687931627e-05, + "loss": 0.0129, + "step": 5551 + }, + { + "epoch": 0.7334940714073389, + "grad_norm": 0.29295435547828674, + "learning_rate": 3.312607065947693e-05, + "loss": 0.0171, + "step": 5552 + }, + { + "epoch": 0.7336261848928229, + "grad_norm": 0.2355080246925354, + "learning_rate": 3.309517599085855e-05, + "loss": 0.0095, + "step": 5553 + }, + { + "epoch": 0.733758298378307, + "grad_norm": 0.17795896530151367, + "learning_rate": 3.3064292878797556e-05, + "loss": 0.014, + "step": 5554 + }, + { + "epoch": 0.733890411863791, + "grad_norm": 0.1291665881872177, + "learning_rate": 3.3033421328628447e-05, + "loss": 0.0163, + "step": 5555 + }, + { + "epoch": 0.734022525349275, + "grad_norm": 0.27356666326522827, + "learning_rate": 3.3002561345683715e-05, + "loss": 0.0275, + "step": 5556 + }, + { + "epoch": 0.7341546388347591, + "grad_norm": 0.22058287262916565, + "learning_rate": 3.297171293529386e-05, + "loss": 0.0217, + "step": 5557 + }, + { + "epoch": 0.7342867523202431, + "grad_norm": 0.12218068540096283, + "learning_rate": 3.2940876102787336e-05, + "loss": 0.016, + "step": 5558 + }, + { + "epoch": 0.7344188658057271, + "grad_norm": 0.07197923213243484, + "learning_rate": 3.291005085349062e-05, + "loss": 0.0077, + "step": 5559 + }, + { + "epoch": 0.7345509792912112, + "grad_norm": 0.1551055759191513, + "learning_rate": 3.2879237192728276e-05, + "loss": 0.024, + "step": 5560 + }, + { + "epoch": 0.7346830927766952, + "grad_norm": 0.24776233732700348, + "learning_rate": 3.284843512582268e-05, + "loss": 0.0305, + "step": 5561 + }, + { + "epoch": 0.7348152062621792, + "grad_norm": 0.20823752880096436, + "learning_rate": 3.2817644658094384e-05, + "loss": 0.0215, + "step": 5562 + }, + { + "epoch": 0.7349473197476633, + "grad_norm": 0.17506052553653717, + "learning_rate": 3.278686579486183e-05, + "loss": 0.0101, + "step": 5563 + }, + { + "epoch": 0.7350794332331473, + "grad_norm": 0.14158056676387787, + "learning_rate": 3.275609854144155e-05, + "loss": 0.0124, + "step": 5564 + }, + { + "epoch": 0.7352115467186313, + "grad_norm": 0.14460735023021698, + "learning_rate": 3.2725342903147936e-05, + "loss": 0.0201, + "step": 5565 + }, + { + "epoch": 0.7353436602041153, + "grad_norm": 0.2015376091003418, + "learning_rate": 3.2694598885293485e-05, + "loss": 0.0197, + "step": 5566 + }, + { + "epoch": 0.7354757736895994, + "grad_norm": 0.1528630405664444, + "learning_rate": 3.266386649318868e-05, + "loss": 0.0114, + "step": 5567 + }, + { + "epoch": 0.7356078871750834, + "grad_norm": 0.2069101631641388, + "learning_rate": 3.263314573214189e-05, + "loss": 0.0239, + "step": 5568 + }, + { + "epoch": 0.7357400006605674, + "grad_norm": 0.1767331212759018, + "learning_rate": 3.260243660745961e-05, + "loss": 0.0151, + "step": 5569 + }, + { + "epoch": 0.7358721141460515, + "grad_norm": 0.13966885209083557, + "learning_rate": 3.2571739124446255e-05, + "loss": 0.0186, + "step": 5570 + }, + { + "epoch": 0.7360042276315355, + "grad_norm": 0.16529060900211334, + "learning_rate": 3.254105328840428e-05, + "loss": 0.0178, + "step": 5571 + }, + { + "epoch": 0.7361363411170195, + "grad_norm": 0.11949215084314346, + "learning_rate": 3.2510379104634e-05, + "loss": 0.0082, + "step": 5572 + }, + { + "epoch": 0.7362684546025036, + "grad_norm": 0.1410934329032898, + "learning_rate": 3.2479716578433884e-05, + "loss": 0.017, + "step": 5573 + }, + { + "epoch": 0.7364005680879876, + "grad_norm": 0.1328480988740921, + "learning_rate": 3.24490657151003e-05, + "loss": 0.0179, + "step": 5574 + }, + { + "epoch": 0.7365326815734716, + "grad_norm": 0.23808583617210388, + "learning_rate": 3.241842651992757e-05, + "loss": 0.0139, + "step": 5575 + }, + { + "epoch": 0.7366647950589557, + "grad_norm": 0.7010180354118347, + "learning_rate": 3.2387798998208064e-05, + "loss": 0.014, + "step": 5576 + }, + { + "epoch": 0.7367969085444397, + "grad_norm": 0.16123968362808228, + "learning_rate": 3.2357183155232106e-05, + "loss": 0.0109, + "step": 5577 + }, + { + "epoch": 0.7369290220299237, + "grad_norm": 0.2551591992378235, + "learning_rate": 3.232657899628807e-05, + "loss": 0.0156, + "step": 5578 + }, + { + "epoch": 0.7370611355154078, + "grad_norm": 0.2279757261276245, + "learning_rate": 3.229598652666217e-05, + "loss": 0.0254, + "step": 5579 + }, + { + "epoch": 0.7371932490008918, + "grad_norm": 0.21289393305778503, + "learning_rate": 3.226540575163871e-05, + "loss": 0.0187, + "step": 5580 + }, + { + "epoch": 0.7373253624863758, + "grad_norm": 0.11979267001152039, + "learning_rate": 3.223483667649999e-05, + "loss": 0.0192, + "step": 5581 + }, + { + "epoch": 0.7374574759718598, + "grad_norm": 0.21272334456443787, + "learning_rate": 3.2204279306526175e-05, + "loss": 0.0211, + "step": 5582 + }, + { + "epoch": 0.7375895894573439, + "grad_norm": 0.1978456676006317, + "learning_rate": 3.2173733646995516e-05, + "loss": 0.0165, + "step": 5583 + }, + { + "epoch": 0.7377217029428279, + "grad_norm": 0.13364800810813904, + "learning_rate": 3.214319970318421e-05, + "loss": 0.0133, + "step": 5584 + }, + { + "epoch": 0.7378538164283119, + "grad_norm": 0.15159595012664795, + "learning_rate": 3.211267748036645e-05, + "loss": 0.0245, + "step": 5585 + }, + { + "epoch": 0.737985929913796, + "grad_norm": 0.4125491976737976, + "learning_rate": 3.208216698381431e-05, + "loss": 0.046, + "step": 5586 + }, + { + "epoch": 0.73811804339928, + "grad_norm": 0.2198030799627304, + "learning_rate": 3.205166821879795e-05, + "loss": 0.0164, + "step": 5587 + }, + { + "epoch": 0.738250156884764, + "grad_norm": 0.13166871666908264, + "learning_rate": 3.202118119058548e-05, + "loss": 0.0155, + "step": 5588 + }, + { + "epoch": 0.7383822703702481, + "grad_norm": 0.17269164323806763, + "learning_rate": 3.199070590444292e-05, + "loss": 0.0135, + "step": 5589 + }, + { + "epoch": 0.7385143838557321, + "grad_norm": 0.13313475251197815, + "learning_rate": 3.1960242365634316e-05, + "loss": 0.0118, + "step": 5590 + }, + { + "epoch": 0.7386464973412161, + "grad_norm": 0.15216787159442902, + "learning_rate": 3.192979057942169e-05, + "loss": 0.0144, + "step": 5591 + }, + { + "epoch": 0.7387786108267002, + "grad_norm": 0.17991912364959717, + "learning_rate": 3.189935055106506e-05, + "loss": 0.0167, + "step": 5592 + }, + { + "epoch": 0.7389107243121842, + "grad_norm": 0.20824044942855835, + "learning_rate": 3.1868922285822265e-05, + "loss": 0.0209, + "step": 5593 + }, + { + "epoch": 0.7390428377976682, + "grad_norm": 0.15515249967575073, + "learning_rate": 3.18385057889493e-05, + "loss": 0.0092, + "step": 5594 + }, + { + "epoch": 0.7391749512831522, + "grad_norm": 0.15328307449817657, + "learning_rate": 3.180810106570006e-05, + "loss": 0.0176, + "step": 5595 + }, + { + "epoch": 0.7393070647686363, + "grad_norm": 0.15170076489448547, + "learning_rate": 3.1777708121326324e-05, + "loss": 0.0118, + "step": 5596 + }, + { + "epoch": 0.7394391782541203, + "grad_norm": 0.12343598157167435, + "learning_rate": 3.174732696107793e-05, + "loss": 0.0148, + "step": 5597 + }, + { + "epoch": 0.7395712917396043, + "grad_norm": 0.16363227367401123, + "learning_rate": 3.171695759020267e-05, + "loss": 0.0238, + "step": 5598 + }, + { + "epoch": 0.7397034052250884, + "grad_norm": 0.15410636365413666, + "learning_rate": 3.168660001394631e-05, + "loss": 0.019, + "step": 5599 + }, + { + "epoch": 0.7398355187105724, + "grad_norm": 0.1777234524488449, + "learning_rate": 3.1656254237552495e-05, + "loss": 0.0189, + "step": 5600 + }, + { + "epoch": 0.7399676321960564, + "grad_norm": 0.1454850733280182, + "learning_rate": 3.162592026626291e-05, + "loss": 0.0137, + "step": 5601 + }, + { + "epoch": 0.7400997456815405, + "grad_norm": 0.31561899185180664, + "learning_rate": 3.159559810531724e-05, + "loss": 0.012, + "step": 5602 + }, + { + "epoch": 0.7402318591670245, + "grad_norm": 0.13987892866134644, + "learning_rate": 3.156528775995298e-05, + "loss": 0.0097, + "step": 5603 + }, + { + "epoch": 0.7403639726525085, + "grad_norm": 0.17761053144931793, + "learning_rate": 3.153498923540571e-05, + "loss": 0.0116, + "step": 5604 + }, + { + "epoch": 0.7404960861379926, + "grad_norm": 0.1544499695301056, + "learning_rate": 3.1504702536908946e-05, + "loss": 0.0168, + "step": 5605 + }, + { + "epoch": 0.7406281996234766, + "grad_norm": 0.15382182598114014, + "learning_rate": 3.147442766969417e-05, + "loss": 0.0166, + "step": 5606 + }, + { + "epoch": 0.7407603131089606, + "grad_norm": 0.1814822107553482, + "learning_rate": 3.144416463899071e-05, + "loss": 0.0158, + "step": 5607 + }, + { + "epoch": 0.7408924265944447, + "grad_norm": 0.13609567284584045, + "learning_rate": 3.1413913450026047e-05, + "loss": 0.0164, + "step": 5608 + }, + { + "epoch": 0.7410245400799287, + "grad_norm": 0.175263911485672, + "learning_rate": 3.1383674108025484e-05, + "loss": 0.0176, + "step": 5609 + }, + { + "epoch": 0.7411566535654127, + "grad_norm": 0.15628427267074585, + "learning_rate": 3.135344661821226e-05, + "loss": 0.0175, + "step": 5610 + }, + { + "epoch": 0.7412887670508967, + "grad_norm": 0.22614404559135437, + "learning_rate": 3.1323230985807614e-05, + "loss": 0.0382, + "step": 5611 + }, + { + "epoch": 0.7414208805363808, + "grad_norm": 0.1333487331867218, + "learning_rate": 3.129302721603078e-05, + "loss": 0.0138, + "step": 5612 + }, + { + "epoch": 0.7415529940218648, + "grad_norm": 0.25842511653900146, + "learning_rate": 3.1262835314098835e-05, + "loss": 0.0177, + "step": 5613 + }, + { + "epoch": 0.7416851075073488, + "grad_norm": 0.17177551984786987, + "learning_rate": 3.12326552852269e-05, + "loss": 0.0138, + "step": 5614 + }, + { + "epoch": 0.7418172209928329, + "grad_norm": 0.21476882696151733, + "learning_rate": 3.120248713462799e-05, + "loss": 0.017, + "step": 5615 + }, + { + "epoch": 0.7419493344783169, + "grad_norm": 0.11701980978250504, + "learning_rate": 3.1172330867513135e-05, + "loss": 0.0123, + "step": 5616 + }, + { + "epoch": 0.7420814479638009, + "grad_norm": 0.17461326718330383, + "learning_rate": 3.1142186489091206e-05, + "loss": 0.0193, + "step": 5617 + }, + { + "epoch": 0.742213561449285, + "grad_norm": 0.1957211196422577, + "learning_rate": 3.11120540045691e-05, + "loss": 0.0195, + "step": 5618 + }, + { + "epoch": 0.742345674934769, + "grad_norm": 0.15636511147022247, + "learning_rate": 3.108193341915169e-05, + "loss": 0.0189, + "step": 5619 + }, + { + "epoch": 0.742477788420253, + "grad_norm": 0.12543295323848724, + "learning_rate": 3.1051824738041666e-05, + "loss": 0.0142, + "step": 5620 + }, + { + "epoch": 0.7426099019057371, + "grad_norm": 0.19471098482608795, + "learning_rate": 3.1021727966439773e-05, + "loss": 0.0189, + "step": 5621 + }, + { + "epoch": 0.7427420153912211, + "grad_norm": 0.12781931459903717, + "learning_rate": 3.099164310954468e-05, + "loss": 0.0107, + "step": 5622 + }, + { + "epoch": 0.7428741288767051, + "grad_norm": 0.2116575390100479, + "learning_rate": 3.096157017255299e-05, + "loss": 0.0194, + "step": 5623 + }, + { + "epoch": 0.7430062423621892, + "grad_norm": 0.16661399602890015, + "learning_rate": 3.09315091606592e-05, + "loss": 0.0139, + "step": 5624 + }, + { + "epoch": 0.7431383558476732, + "grad_norm": 0.13406845927238464, + "learning_rate": 3.09014600790558e-05, + "loss": 0.0107, + "step": 5625 + }, + { + "epoch": 0.7432704693331572, + "grad_norm": 0.16295015811920166, + "learning_rate": 3.087142293293326e-05, + "loss": 0.0158, + "step": 5626 + }, + { + "epoch": 0.7434025828186412, + "grad_norm": 0.15222692489624023, + "learning_rate": 3.084139772747985e-05, + "loss": 0.0111, + "step": 5627 + }, + { + "epoch": 0.7435346963041253, + "grad_norm": 0.14454548060894012, + "learning_rate": 3.081138446788191e-05, + "loss": 0.0115, + "step": 5628 + }, + { + "epoch": 0.7436668097896093, + "grad_norm": 0.14269648492336273, + "learning_rate": 3.078138315932366e-05, + "loss": 0.0125, + "step": 5629 + }, + { + "epoch": 0.7437989232750933, + "grad_norm": 0.30650556087493896, + "learning_rate": 3.07513938069873e-05, + "loss": 0.0176, + "step": 5630 + }, + { + "epoch": 0.7439310367605774, + "grad_norm": 0.12681737542152405, + "learning_rate": 3.0721416416052884e-05, + "loss": 0.0093, + "step": 5631 + }, + { + "epoch": 0.7440631502460614, + "grad_norm": 0.19957475364208221, + "learning_rate": 3.0691450991698456e-05, + "loss": 0.0139, + "step": 5632 + }, + { + "epoch": 0.7441952637315454, + "grad_norm": 0.15414023399353027, + "learning_rate": 3.066149753910002e-05, + "loss": 0.0081, + "step": 5633 + }, + { + "epoch": 0.7443273772170295, + "grad_norm": 0.12280531227588654, + "learning_rate": 3.06315560634314e-05, + "loss": 0.0138, + "step": 5634 + }, + { + "epoch": 0.7444594907025135, + "grad_norm": 0.14327071607112885, + "learning_rate": 3.060162656986448e-05, + "loss": 0.0107, + "step": 5635 + }, + { + "epoch": 0.7445916041879975, + "grad_norm": 0.13788668811321259, + "learning_rate": 3.057170906356901e-05, + "loss": 0.018, + "step": 5636 + }, + { + "epoch": 0.7447237176734816, + "grad_norm": 0.1548607051372528, + "learning_rate": 3.05418035497127e-05, + "loss": 0.0116, + "step": 5637 + }, + { + "epoch": 0.7448558311589656, + "grad_norm": 0.363374799489975, + "learning_rate": 3.0511910033461134e-05, + "loss": 0.0199, + "step": 5638 + }, + { + "epoch": 0.7449879446444496, + "grad_norm": 0.1580091416835785, + "learning_rate": 3.0482028519977857e-05, + "loss": 0.0158, + "step": 5639 + }, + { + "epoch": 0.7451200581299336, + "grad_norm": 0.18859657645225525, + "learning_rate": 3.0452159014424396e-05, + "loss": 0.0164, + "step": 5640 + }, + { + "epoch": 0.7452521716154177, + "grad_norm": 0.16260674595832825, + "learning_rate": 3.0422301521960074e-05, + "loss": 0.0138, + "step": 5641 + }, + { + "epoch": 0.7453842851009017, + "grad_norm": 0.14317110180854797, + "learning_rate": 3.0392456047742257e-05, + "loss": 0.0134, + "step": 5642 + }, + { + "epoch": 0.7455163985863857, + "grad_norm": 0.20928920805454254, + "learning_rate": 3.036262259692618e-05, + "loss": 0.0195, + "step": 5643 + }, + { + "epoch": 0.7456485120718698, + "grad_norm": 0.2817302644252777, + "learning_rate": 3.033280117466506e-05, + "loss": 0.0303, + "step": 5644 + }, + { + "epoch": 0.7457806255573538, + "grad_norm": 0.13203193247318268, + "learning_rate": 3.0302991786109913e-05, + "loss": 0.0183, + "step": 5645 + }, + { + "epoch": 0.7459127390428378, + "grad_norm": 0.4010050594806671, + "learning_rate": 3.027319443640979e-05, + "loss": 0.0273, + "step": 5646 + }, + { + "epoch": 0.7460448525283219, + "grad_norm": 0.2017243355512619, + "learning_rate": 3.0243409130711665e-05, + "loss": 0.0275, + "step": 5647 + }, + { + "epoch": 0.7461769660138059, + "grad_norm": 0.18311437964439392, + "learning_rate": 3.0213635874160316e-05, + "loss": 0.0144, + "step": 5648 + }, + { + "epoch": 0.7463090794992899, + "grad_norm": 0.11720288544893265, + "learning_rate": 3.018387467189856e-05, + "loss": 0.0105, + "step": 5649 + }, + { + "epoch": 0.746441192984774, + "grad_norm": 0.11851353943347931, + "learning_rate": 3.015412552906708e-05, + "loss": 0.009, + "step": 5650 + }, + { + "epoch": 0.746573306470258, + "grad_norm": 0.10665460675954819, + "learning_rate": 3.012438845080452e-05, + "loss": 0.0088, + "step": 5651 + }, + { + "epoch": 0.746705419955742, + "grad_norm": 0.11494076997041702, + "learning_rate": 3.009466344224734e-05, + "loss": 0.0131, + "step": 5652 + }, + { + "epoch": 0.746837533441226, + "grad_norm": 0.12239422649145126, + "learning_rate": 3.006495050853001e-05, + "loss": 0.0084, + "step": 5653 + }, + { + "epoch": 0.7469696469267101, + "grad_norm": 0.12373550236225128, + "learning_rate": 3.0035249654784926e-05, + "loss": 0.0072, + "step": 5654 + }, + { + "epoch": 0.7471017604121941, + "grad_norm": 0.17365232110023499, + "learning_rate": 3.000556088614227e-05, + "loss": 0.0136, + "step": 5655 + }, + { + "epoch": 0.7472338738976781, + "grad_norm": 0.17435801029205322, + "learning_rate": 2.9975884207730275e-05, + "loss": 0.0148, + "step": 5656 + }, + { + "epoch": 0.7473659873831622, + "grad_norm": 0.17454983294010162, + "learning_rate": 2.994621962467502e-05, + "loss": 0.0249, + "step": 5657 + }, + { + "epoch": 0.7474981008686462, + "grad_norm": 0.31457021832466125, + "learning_rate": 2.9916567142100538e-05, + "loss": 0.0167, + "step": 5658 + }, + { + "epoch": 0.7476302143541302, + "grad_norm": 0.24514144659042358, + "learning_rate": 2.9886926765128688e-05, + "loss": 0.0192, + "step": 5659 + }, + { + "epoch": 0.7477623278396143, + "grad_norm": 0.18715476989746094, + "learning_rate": 2.9857298498879306e-05, + "loss": 0.0193, + "step": 5660 + }, + { + "epoch": 0.7478944413250983, + "grad_norm": 0.15529604256153107, + "learning_rate": 2.9827682348470178e-05, + "loss": 0.0119, + "step": 5661 + }, + { + "epoch": 0.7480265548105823, + "grad_norm": 0.18206310272216797, + "learning_rate": 2.979807831901684e-05, + "loss": 0.0168, + "step": 5662 + }, + { + "epoch": 0.7481586682960664, + "grad_norm": 0.14686575531959534, + "learning_rate": 2.9768486415632914e-05, + "loss": 0.0174, + "step": 5663 + }, + { + "epoch": 0.7482907817815504, + "grad_norm": 0.14675019681453705, + "learning_rate": 2.973890664342981e-05, + "loss": 0.0185, + "step": 5664 + }, + { + "epoch": 0.7484228952670344, + "grad_norm": 0.1723177284002304, + "learning_rate": 2.970933900751689e-05, + "loss": 0.0167, + "step": 5665 + }, + { + "epoch": 0.7485550087525185, + "grad_norm": 0.1251523345708847, + "learning_rate": 2.9679783513001412e-05, + "loss": 0.0125, + "step": 5666 + }, + { + "epoch": 0.7486871222380025, + "grad_norm": 0.1644349992275238, + "learning_rate": 2.9650240164988563e-05, + "loss": 0.0217, + "step": 5667 + }, + { + "epoch": 0.7488192357234865, + "grad_norm": 0.17225313186645508, + "learning_rate": 2.9620708968581356e-05, + "loss": 0.018, + "step": 5668 + }, + { + "epoch": 0.7489513492089706, + "grad_norm": 0.19602759182453156, + "learning_rate": 2.959118992888077e-05, + "loss": 0.0189, + "step": 5669 + }, + { + "epoch": 0.7490834626944546, + "grad_norm": 0.14376594126224518, + "learning_rate": 2.9561683050985677e-05, + "loss": 0.0159, + "step": 5670 + }, + { + "epoch": 0.7492155761799386, + "grad_norm": 0.1839967966079712, + "learning_rate": 2.953218833999285e-05, + "loss": 0.0209, + "step": 5671 + }, + { + "epoch": 0.7493476896654226, + "grad_norm": 0.20183296501636505, + "learning_rate": 2.950270580099691e-05, + "loss": 0.0253, + "step": 5672 + }, + { + "epoch": 0.7494798031509067, + "grad_norm": 0.24092161655426025, + "learning_rate": 2.947323543909044e-05, + "loss": 0.0162, + "step": 5673 + }, + { + "epoch": 0.7496119166363907, + "grad_norm": 0.3723273277282715, + "learning_rate": 2.9443777259363912e-05, + "loss": 0.0208, + "step": 5674 + }, + { + "epoch": 0.7497440301218747, + "grad_norm": 0.19728074967861176, + "learning_rate": 2.9414331266905627e-05, + "loss": 0.0167, + "step": 5675 + }, + { + "epoch": 0.7498761436073588, + "grad_norm": 0.12510357797145844, + "learning_rate": 2.9384897466801852e-05, + "loss": 0.0131, + "step": 5676 + }, + { + "epoch": 0.7500082570928428, + "grad_norm": 0.22277474403381348, + "learning_rate": 2.935547586413674e-05, + "loss": 0.0152, + "step": 5677 + }, + { + "epoch": 0.7501403705783268, + "grad_norm": 0.1679277867078781, + "learning_rate": 2.932606646399233e-05, + "loss": 0.0209, + "step": 5678 + }, + { + "epoch": 0.7502724840638109, + "grad_norm": 0.1746317446231842, + "learning_rate": 2.929666927144851e-05, + "loss": 0.0092, + "step": 5679 + }, + { + "epoch": 0.7504045975492949, + "grad_norm": 0.14948616921901703, + "learning_rate": 2.926728429158311e-05, + "loss": 0.0181, + "step": 5680 + }, + { + "epoch": 0.7505367110347789, + "grad_norm": 0.13942132890224457, + "learning_rate": 2.9237911529471862e-05, + "loss": 0.0109, + "step": 5681 + }, + { + "epoch": 0.750668824520263, + "grad_norm": 0.1250900775194168, + "learning_rate": 2.9208550990188312e-05, + "loss": 0.0168, + "step": 5682 + }, + { + "epoch": 0.750800938005747, + "grad_norm": 0.1930038183927536, + "learning_rate": 2.9179202678803973e-05, + "loss": 0.0216, + "step": 5683 + }, + { + "epoch": 0.750933051491231, + "grad_norm": 0.14589211344718933, + "learning_rate": 2.914986660038822e-05, + "loss": 0.0134, + "step": 5684 + }, + { + "epoch": 0.751065164976715, + "grad_norm": 0.19304388761520386, + "learning_rate": 2.912054276000834e-05, + "loss": 0.0155, + "step": 5685 + }, + { + "epoch": 0.7511972784621991, + "grad_norm": 0.2510669231414795, + "learning_rate": 2.9091231162729403e-05, + "loss": 0.0218, + "step": 5686 + }, + { + "epoch": 0.7513293919476831, + "grad_norm": 0.13263855874538422, + "learning_rate": 2.9061931813614497e-05, + "loss": 0.0118, + "step": 5687 + }, + { + "epoch": 0.7514615054331671, + "grad_norm": 0.1308731585741043, + "learning_rate": 2.9032644717724543e-05, + "loss": 0.0129, + "step": 5688 + }, + { + "epoch": 0.7515936189186512, + "grad_norm": 0.453821063041687, + "learning_rate": 2.900336988011829e-05, + "loss": 0.0257, + "step": 5689 + }, + { + "epoch": 0.7517257324041352, + "grad_norm": 0.14083927869796753, + "learning_rate": 2.897410730585245e-05, + "loss": 0.0118, + "step": 5690 + }, + { + "epoch": 0.7518578458896192, + "grad_norm": 0.15414288640022278, + "learning_rate": 2.8944856999981572e-05, + "loss": 0.0137, + "step": 5691 + }, + { + "epoch": 0.7519899593751033, + "grad_norm": 0.2339172065258026, + "learning_rate": 2.8915618967558144e-05, + "loss": 0.024, + "step": 5692 + }, + { + "epoch": 0.7521220728605873, + "grad_norm": 0.17207179963588715, + "learning_rate": 2.8886393213632435e-05, + "loss": 0.0183, + "step": 5693 + }, + { + "epoch": 0.7522541863460713, + "grad_norm": 0.08931787312030792, + "learning_rate": 2.885717974325266e-05, + "loss": 0.0093, + "step": 5694 + }, + { + "epoch": 0.7523862998315554, + "grad_norm": 0.1811581701040268, + "learning_rate": 2.8827978561464943e-05, + "loss": 0.0285, + "step": 5695 + }, + { + "epoch": 0.7525184133170394, + "grad_norm": 0.14569365978240967, + "learning_rate": 2.8798789673313164e-05, + "loss": 0.0099, + "step": 5696 + }, + { + "epoch": 0.7526505268025234, + "grad_norm": 0.3001488149166107, + "learning_rate": 2.8769613083839208e-05, + "loss": 0.0276, + "step": 5697 + }, + { + "epoch": 0.7527826402880073, + "grad_norm": 0.15972566604614258, + "learning_rate": 2.8740448798082786e-05, + "loss": 0.0101, + "step": 5698 + }, + { + "epoch": 0.7529147537734914, + "grad_norm": 0.1297338753938675, + "learning_rate": 2.871129682108149e-05, + "loss": 0.0174, + "step": 5699 + }, + { + "epoch": 0.7530468672589754, + "grad_norm": 0.19060315191745758, + "learning_rate": 2.868215715787075e-05, + "loss": 0.0276, + "step": 5700 + }, + { + "epoch": 0.7531789807444594, + "grad_norm": 0.11152473092079163, + "learning_rate": 2.86530298134839e-05, + "loss": 0.0102, + "step": 5701 + }, + { + "epoch": 0.7533110942299435, + "grad_norm": 0.2610558271408081, + "learning_rate": 2.8623914792952188e-05, + "loss": 0.0196, + "step": 5702 + }, + { + "epoch": 0.7534432077154275, + "grad_norm": 0.1260378360748291, + "learning_rate": 2.8594812101304624e-05, + "loss": 0.0125, + "step": 5703 + }, + { + "epoch": 0.7535753212009115, + "grad_norm": 0.14703889191150665, + "learning_rate": 2.8565721743568195e-05, + "loss": 0.0132, + "step": 5704 + }, + { + "epoch": 0.7537074346863956, + "grad_norm": 0.27930232882499695, + "learning_rate": 2.853664372476771e-05, + "loss": 0.0163, + "step": 5705 + }, + { + "epoch": 0.7538395481718796, + "grad_norm": 0.17410196363925934, + "learning_rate": 2.8507578049925875e-05, + "loss": 0.0173, + "step": 5706 + }, + { + "epoch": 0.7539716616573636, + "grad_norm": 0.1297542154788971, + "learning_rate": 2.8478524724063195e-05, + "loss": 0.0127, + "step": 5707 + }, + { + "epoch": 0.7541037751428477, + "grad_norm": 0.18584750592708588, + "learning_rate": 2.84494837521981e-05, + "loss": 0.0179, + "step": 5708 + }, + { + "epoch": 0.7542358886283317, + "grad_norm": 0.1973746418952942, + "learning_rate": 2.8420455139346935e-05, + "loss": 0.0144, + "step": 5709 + }, + { + "epoch": 0.7543680021138157, + "grad_norm": 0.18765583634376526, + "learning_rate": 2.8391438890523757e-05, + "loss": 0.0161, + "step": 5710 + }, + { + "epoch": 0.7545001155992997, + "grad_norm": 0.11874744296073914, + "learning_rate": 2.836243501074064e-05, + "loss": 0.0082, + "step": 5711 + }, + { + "epoch": 0.7546322290847838, + "grad_norm": 0.17037716507911682, + "learning_rate": 2.833344350500744e-05, + "loss": 0.0158, + "step": 5712 + }, + { + "epoch": 0.7547643425702678, + "grad_norm": 0.13802199065685272, + "learning_rate": 2.830446437833193e-05, + "loss": 0.0136, + "step": 5713 + }, + { + "epoch": 0.7548964560557518, + "grad_norm": 0.15014030039310455, + "learning_rate": 2.8275497635719663e-05, + "loss": 0.0152, + "step": 5714 + }, + { + "epoch": 0.7550285695412359, + "grad_norm": 0.30440592765808105, + "learning_rate": 2.824654328217413e-05, + "loss": 0.0265, + "step": 5715 + }, + { + "epoch": 0.7551606830267199, + "grad_norm": 0.1533273160457611, + "learning_rate": 2.8217601322696675e-05, + "loss": 0.0176, + "step": 5716 + }, + { + "epoch": 0.7552927965122039, + "grad_norm": 0.09308493882417679, + "learning_rate": 2.8188671762286434e-05, + "loss": 0.0033, + "step": 5717 + }, + { + "epoch": 0.755424909997688, + "grad_norm": 0.10201088339090347, + "learning_rate": 2.815975460594047e-05, + "loss": 0.0094, + "step": 5718 + }, + { + "epoch": 0.755557023483172, + "grad_norm": 0.18083249032497406, + "learning_rate": 2.8130849858653673e-05, + "loss": 0.0194, + "step": 5719 + }, + { + "epoch": 0.755689136968656, + "grad_norm": 0.14451417326927185, + "learning_rate": 2.8101957525418842e-05, + "loss": 0.0125, + "step": 5720 + }, + { + "epoch": 0.7558212504541401, + "grad_norm": 0.10200035572052002, + "learning_rate": 2.8073077611226518e-05, + "loss": 0.0086, + "step": 5721 + }, + { + "epoch": 0.7559533639396241, + "grad_norm": 0.1436675488948822, + "learning_rate": 2.8044210121065195e-05, + "loss": 0.011, + "step": 5722 + }, + { + "epoch": 0.7560854774251081, + "grad_norm": 0.2443426251411438, + "learning_rate": 2.8015355059921235e-05, + "loss": 0.0392, + "step": 5723 + }, + { + "epoch": 0.7562175909105922, + "grad_norm": 0.18814495205879211, + "learning_rate": 2.798651243277871e-05, + "loss": 0.026, + "step": 5724 + }, + { + "epoch": 0.7563497043960762, + "grad_norm": 0.20542608201503754, + "learning_rate": 2.7957682244619733e-05, + "loss": 0.0227, + "step": 5725 + }, + { + "epoch": 0.7564818178815602, + "grad_norm": 0.1900263875722885, + "learning_rate": 2.792886450042419e-05, + "loss": 0.0185, + "step": 5726 + }, + { + "epoch": 0.7566139313670442, + "grad_norm": 0.12506645917892456, + "learning_rate": 2.790005920516974e-05, + "loss": 0.0144, + "step": 5727 + }, + { + "epoch": 0.7567460448525283, + "grad_norm": 0.14222222566604614, + "learning_rate": 2.7871266363831983e-05, + "loss": 0.0195, + "step": 5728 + }, + { + "epoch": 0.7568781583380123, + "grad_norm": 0.13620705902576447, + "learning_rate": 2.784248598138435e-05, + "loss": 0.0264, + "step": 5729 + }, + { + "epoch": 0.7570102718234963, + "grad_norm": 0.17226412892341614, + "learning_rate": 2.7813718062798156e-05, + "loss": 0.0133, + "step": 5730 + }, + { + "epoch": 0.7571423853089804, + "grad_norm": 0.14862510561943054, + "learning_rate": 2.7784962613042442e-05, + "loss": 0.0164, + "step": 5731 + }, + { + "epoch": 0.7572744987944644, + "grad_norm": 0.18596234917640686, + "learning_rate": 2.7756219637084212e-05, + "loss": 0.0186, + "step": 5732 + }, + { + "epoch": 0.7574066122799484, + "grad_norm": 0.1989210993051529, + "learning_rate": 2.772748913988832e-05, + "loss": 0.0146, + "step": 5733 + }, + { + "epoch": 0.7575387257654325, + "grad_norm": 0.18087489902973175, + "learning_rate": 2.7698771126417333e-05, + "loss": 0.0189, + "step": 5734 + }, + { + "epoch": 0.7576708392509165, + "grad_norm": 0.10741331428289413, + "learning_rate": 2.767006560163181e-05, + "loss": 0.0104, + "step": 5735 + }, + { + "epoch": 0.7578029527364005, + "grad_norm": 0.14782588183879852, + "learning_rate": 2.7641372570490076e-05, + "loss": 0.0197, + "step": 5736 + }, + { + "epoch": 0.7579350662218846, + "grad_norm": 0.16277161240577698, + "learning_rate": 2.7612692037948352e-05, + "loss": 0.0123, + "step": 5737 + }, + { + "epoch": 0.7580671797073686, + "grad_norm": 0.17703665792942047, + "learning_rate": 2.7584024008960607e-05, + "loss": 0.0117, + "step": 5738 + }, + { + "epoch": 0.7581992931928526, + "grad_norm": 0.3099222779273987, + "learning_rate": 2.7555368488478727e-05, + "loss": 0.0265, + "step": 5739 + }, + { + "epoch": 0.7583314066783367, + "grad_norm": 0.18668995797634125, + "learning_rate": 2.7526725481452464e-05, + "loss": 0.0232, + "step": 5740 + }, + { + "epoch": 0.7584635201638207, + "grad_norm": 0.213165283203125, + "learning_rate": 2.7498094992829283e-05, + "loss": 0.0227, + "step": 5741 + }, + { + "epoch": 0.7585956336493047, + "grad_norm": 0.1654159426689148, + "learning_rate": 2.74694770275546e-05, + "loss": 0.017, + "step": 5742 + }, + { + "epoch": 0.7587277471347887, + "grad_norm": 0.12483178824186325, + "learning_rate": 2.744087159057165e-05, + "loss": 0.0155, + "step": 5743 + }, + { + "epoch": 0.7588598606202728, + "grad_norm": 0.3348318040370941, + "learning_rate": 2.7412278686821502e-05, + "loss": 0.0245, + "step": 5744 + }, + { + "epoch": 0.7589919741057568, + "grad_norm": 0.16964954137802124, + "learning_rate": 2.738369832124298e-05, + "loss": 0.022, + "step": 5745 + }, + { + "epoch": 0.7591240875912408, + "grad_norm": 0.21486179530620575, + "learning_rate": 2.735513049877285e-05, + "loss": 0.0209, + "step": 5746 + }, + { + "epoch": 0.7592562010767249, + "grad_norm": 0.15283586084842682, + "learning_rate": 2.7326575224345697e-05, + "loss": 0.0169, + "step": 5747 + }, + { + "epoch": 0.7593883145622089, + "grad_norm": 0.15681065618991852, + "learning_rate": 2.7298032502893855e-05, + "loss": 0.0141, + "step": 5748 + }, + { + "epoch": 0.7595204280476929, + "grad_norm": 0.26059630513191223, + "learning_rate": 2.7269502339347564e-05, + "loss": 0.0222, + "step": 5749 + }, + { + "epoch": 0.759652541533177, + "grad_norm": 0.15516114234924316, + "learning_rate": 2.7240984738634877e-05, + "loss": 0.0161, + "step": 5750 + }, + { + "epoch": 0.759784655018661, + "grad_norm": 0.16622744500637054, + "learning_rate": 2.7212479705681715e-05, + "loss": 0.02, + "step": 5751 + }, + { + "epoch": 0.759916768504145, + "grad_norm": 0.17459112405776978, + "learning_rate": 2.7183987245411724e-05, + "loss": 0.0199, + "step": 5752 + }, + { + "epoch": 0.760048881989629, + "grad_norm": 0.17584876716136932, + "learning_rate": 2.7155507362746478e-05, + "loss": 0.0197, + "step": 5753 + }, + { + "epoch": 0.7601809954751131, + "grad_norm": 0.1639474630355835, + "learning_rate": 2.712704006260538e-05, + "loss": 0.0188, + "step": 5754 + }, + { + "epoch": 0.7603131089605971, + "grad_norm": 0.15318681299686432, + "learning_rate": 2.7098585349905547e-05, + "loss": 0.0157, + "step": 5755 + }, + { + "epoch": 0.7604452224460811, + "grad_norm": 0.15242336690425873, + "learning_rate": 2.707014322956204e-05, + "loss": 0.0154, + "step": 5756 + }, + { + "epoch": 0.7605773359315652, + "grad_norm": 0.17166076600551605, + "learning_rate": 2.7041713706487692e-05, + "loss": 0.0214, + "step": 5757 + }, + { + "epoch": 0.7607094494170492, + "grad_norm": 0.3508407771587372, + "learning_rate": 2.7013296785593223e-05, + "loss": 0.0325, + "step": 5758 + }, + { + "epoch": 0.7608415629025332, + "grad_norm": 0.22720810770988464, + "learning_rate": 2.698489247178705e-05, + "loss": 0.0274, + "step": 5759 + }, + { + "epoch": 0.7609736763880173, + "grad_norm": 0.16163846850395203, + "learning_rate": 2.6956500769975512e-05, + "loss": 0.0179, + "step": 5760 + }, + { + "epoch": 0.7611057898735013, + "grad_norm": 0.12275891751050949, + "learning_rate": 2.692812168506278e-05, + "loss": 0.0135, + "step": 5761 + }, + { + "epoch": 0.7612379033589853, + "grad_norm": 0.22080405056476593, + "learning_rate": 2.6899755221950764e-05, + "loss": 0.0192, + "step": 5762 + }, + { + "epoch": 0.7613700168444694, + "grad_norm": 0.1974334418773651, + "learning_rate": 2.687140138553925e-05, + "loss": 0.0113, + "step": 5763 + }, + { + "epoch": 0.7615021303299534, + "grad_norm": 0.18567024171352386, + "learning_rate": 2.6843060180725844e-05, + "loss": 0.014, + "step": 5764 + }, + { + "epoch": 0.7616342438154374, + "grad_norm": 0.25421595573425293, + "learning_rate": 2.6814731612405987e-05, + "loss": 0.0178, + "step": 5765 + }, + { + "epoch": 0.7617663573009215, + "grad_norm": 0.09751256555318832, + "learning_rate": 2.6786415685472843e-05, + "loss": 0.0132, + "step": 5766 + }, + { + "epoch": 0.7618984707864055, + "grad_norm": 0.26204532384872437, + "learning_rate": 2.6758112404817503e-05, + "loss": 0.0276, + "step": 5767 + }, + { + "epoch": 0.7620305842718895, + "grad_norm": 0.15766333043575287, + "learning_rate": 2.6729821775328844e-05, + "loss": 0.0114, + "step": 5768 + }, + { + "epoch": 0.7621626977573736, + "grad_norm": 0.15987655520439148, + "learning_rate": 2.670154380189349e-05, + "loss": 0.0189, + "step": 5769 + }, + { + "epoch": 0.7622948112428576, + "grad_norm": 0.11225584894418716, + "learning_rate": 2.667327848939597e-05, + "loss": 0.0091, + "step": 5770 + }, + { + "epoch": 0.7624269247283416, + "grad_norm": 0.12827295064926147, + "learning_rate": 2.6645025842718587e-05, + "loss": 0.0116, + "step": 5771 + }, + { + "epoch": 0.7625590382138256, + "grad_norm": 0.11164995282888412, + "learning_rate": 2.6616785866741467e-05, + "loss": 0.0133, + "step": 5772 + }, + { + "epoch": 0.7626911516993097, + "grad_norm": 0.10456161201000214, + "learning_rate": 2.65885585663425e-05, + "loss": 0.0082, + "step": 5773 + }, + { + "epoch": 0.7628232651847937, + "grad_norm": 0.11802589893341064, + "learning_rate": 2.656034394639745e-05, + "loss": 0.0093, + "step": 5774 + }, + { + "epoch": 0.7629553786702777, + "grad_norm": 0.24839834868907928, + "learning_rate": 2.653214201177988e-05, + "loss": 0.0264, + "step": 5775 + }, + { + "epoch": 0.7630874921557618, + "grad_norm": 0.14445337653160095, + "learning_rate": 2.6503952767361117e-05, + "loss": 0.0189, + "step": 5776 + }, + { + "epoch": 0.7632196056412458, + "grad_norm": 0.14437027275562286, + "learning_rate": 2.647577621801033e-05, + "loss": 0.0137, + "step": 5777 + }, + { + "epoch": 0.7633517191267298, + "grad_norm": 0.14201775193214417, + "learning_rate": 2.6447612368594488e-05, + "loss": 0.0208, + "step": 5778 + }, + { + "epoch": 0.7634838326122139, + "grad_norm": 0.21351680159568787, + "learning_rate": 2.6419461223978425e-05, + "loss": 0.0189, + "step": 5779 + }, + { + "epoch": 0.7636159460976979, + "grad_norm": 0.1488891988992691, + "learning_rate": 2.639132278902464e-05, + "loss": 0.0168, + "step": 5780 + }, + { + "epoch": 0.7637480595831819, + "grad_norm": 0.1703590601682663, + "learning_rate": 2.636319706859357e-05, + "loss": 0.0144, + "step": 5781 + }, + { + "epoch": 0.763880173068666, + "grad_norm": 0.2704126536846161, + "learning_rate": 2.633508406754339e-05, + "loss": 0.0216, + "step": 5782 + }, + { + "epoch": 0.76401228655415, + "grad_norm": 0.15560157597064972, + "learning_rate": 2.6306983790730112e-05, + "loss": 0.0152, + "step": 5783 + }, + { + "epoch": 0.764144400039634, + "grad_norm": 0.15597420930862427, + "learning_rate": 2.627889624300752e-05, + "loss": 0.0171, + "step": 5784 + }, + { + "epoch": 0.764276513525118, + "grad_norm": 0.1779235601425171, + "learning_rate": 2.6250821429227258e-05, + "loss": 0.0118, + "step": 5785 + }, + { + "epoch": 0.7644086270106021, + "grad_norm": 0.14752568304538727, + "learning_rate": 2.6222759354238645e-05, + "loss": 0.0182, + "step": 5786 + }, + { + "epoch": 0.7645407404960861, + "grad_norm": 0.1394970864057541, + "learning_rate": 2.6194710022888937e-05, + "loss": 0.0212, + "step": 5787 + }, + { + "epoch": 0.7646728539815701, + "grad_norm": 0.09506494551897049, + "learning_rate": 2.6166673440023127e-05, + "loss": 0.0101, + "step": 5788 + }, + { + "epoch": 0.7648049674670542, + "grad_norm": 0.12080918252468109, + "learning_rate": 2.613864961048398e-05, + "loss": 0.0119, + "step": 5789 + }, + { + "epoch": 0.7649370809525382, + "grad_norm": 0.17146851122379303, + "learning_rate": 2.6110638539112098e-05, + "loss": 0.022, + "step": 5790 + }, + { + "epoch": 0.7650691944380222, + "grad_norm": 0.1401028335094452, + "learning_rate": 2.608264023074588e-05, + "loss": 0.0162, + "step": 5791 + }, + { + "epoch": 0.7652013079235063, + "grad_norm": 0.20553302764892578, + "learning_rate": 2.605465469022155e-05, + "loss": 0.025, + "step": 5792 + }, + { + "epoch": 0.7653334214089903, + "grad_norm": 0.20330609381198883, + "learning_rate": 2.6026681922372998e-05, + "loss": 0.016, + "step": 5793 + }, + { + "epoch": 0.7654655348944743, + "grad_norm": 0.12444961071014404, + "learning_rate": 2.5998721932032056e-05, + "loss": 0.0132, + "step": 5794 + }, + { + "epoch": 0.7655976483799584, + "grad_norm": 0.14549531042575836, + "learning_rate": 2.5970774724028314e-05, + "loss": 0.0066, + "step": 5795 + }, + { + "epoch": 0.7657297618654424, + "grad_norm": 0.16223883628845215, + "learning_rate": 2.5942840303189055e-05, + "loss": 0.0266, + "step": 5796 + }, + { + "epoch": 0.7658618753509264, + "grad_norm": 0.219892218708992, + "learning_rate": 2.591491867433946e-05, + "loss": 0.0123, + "step": 5797 + }, + { + "epoch": 0.7659939888364105, + "grad_norm": 0.14039765298366547, + "learning_rate": 2.588700984230249e-05, + "loss": 0.0223, + "step": 5798 + }, + { + "epoch": 0.7661261023218945, + "grad_norm": 0.14343780279159546, + "learning_rate": 2.5859113811898885e-05, + "loss": 0.0137, + "step": 5799 + }, + { + "epoch": 0.7662582158073785, + "grad_norm": 0.11566592007875443, + "learning_rate": 2.5831230587947097e-05, + "loss": 0.013, + "step": 5800 + }, + { + "epoch": 0.7663903292928625, + "grad_norm": 0.5028908252716064, + "learning_rate": 2.580336017526348e-05, + "loss": 0.0214, + "step": 5801 + }, + { + "epoch": 0.7665224427783466, + "grad_norm": 0.17451821267604828, + "learning_rate": 2.5775502578662148e-05, + "loss": 0.0249, + "step": 5802 + }, + { + "epoch": 0.7666545562638306, + "grad_norm": 0.1618116796016693, + "learning_rate": 2.5747657802954918e-05, + "loss": 0.0244, + "step": 5803 + }, + { + "epoch": 0.7667866697493146, + "grad_norm": 0.17778781056404114, + "learning_rate": 2.5719825852951484e-05, + "loss": 0.0226, + "step": 5804 + }, + { + "epoch": 0.7669187832347987, + "grad_norm": 0.09891565144062042, + "learning_rate": 2.5692006733459294e-05, + "loss": 0.0066, + "step": 5805 + }, + { + "epoch": 0.7670508967202827, + "grad_norm": 0.15232637524604797, + "learning_rate": 2.5664200449283627e-05, + "loss": 0.0155, + "step": 5806 + }, + { + "epoch": 0.7671830102057667, + "grad_norm": 0.24721577763557434, + "learning_rate": 2.5636407005227413e-05, + "loss": 0.0165, + "step": 5807 + }, + { + "epoch": 0.7673151236912508, + "grad_norm": 0.08634576201438904, + "learning_rate": 2.5608626406091507e-05, + "loss": 0.0049, + "step": 5808 + }, + { + "epoch": 0.7674472371767348, + "grad_norm": 0.15494407713413239, + "learning_rate": 2.558085865667449e-05, + "loss": 0.0153, + "step": 5809 + }, + { + "epoch": 0.7675793506622188, + "grad_norm": 0.2543340027332306, + "learning_rate": 2.555310376177268e-05, + "loss": 0.0208, + "step": 5810 + }, + { + "epoch": 0.7677114641477029, + "grad_norm": 0.1335250288248062, + "learning_rate": 2.5525361726180243e-05, + "loss": 0.0151, + "step": 5811 + }, + { + "epoch": 0.7678435776331869, + "grad_norm": 0.1976231038570404, + "learning_rate": 2.549763255468909e-05, + "loss": 0.0156, + "step": 5812 + }, + { + "epoch": 0.7679756911186709, + "grad_norm": 0.1935354620218277, + "learning_rate": 2.5469916252088954e-05, + "loss": 0.024, + "step": 5813 + }, + { + "epoch": 0.768107804604155, + "grad_norm": 0.1447988748550415, + "learning_rate": 2.5442212823167243e-05, + "loss": 0.0168, + "step": 5814 + }, + { + "epoch": 0.768239918089639, + "grad_norm": 0.10164899379014969, + "learning_rate": 2.5414522272709253e-05, + "loss": 0.015, + "step": 5815 + }, + { + "epoch": 0.768372031575123, + "grad_norm": 0.07989451289176941, + "learning_rate": 2.5386844605498015e-05, + "loss": 0.0061, + "step": 5816 + }, + { + "epoch": 0.768504145060607, + "grad_norm": 0.13421711325645447, + "learning_rate": 2.5359179826314283e-05, + "loss": 0.0075, + "step": 5817 + }, + { + "epoch": 0.7686362585460911, + "grad_norm": 0.06816146522760391, + "learning_rate": 2.533152793993665e-05, + "loss": 0.0044, + "step": 5818 + }, + { + "epoch": 0.7687683720315751, + "grad_norm": 0.13832999765872955, + "learning_rate": 2.5303888951141476e-05, + "loss": 0.0162, + "step": 5819 + }, + { + "epoch": 0.7689004855170591, + "grad_norm": 0.17114047706127167, + "learning_rate": 2.5276262864702895e-05, + "loss": 0.0185, + "step": 5820 + }, + { + "epoch": 0.7690325990025432, + "grad_norm": 0.12861768901348114, + "learning_rate": 2.5248649685392743e-05, + "loss": 0.0118, + "step": 5821 + }, + { + "epoch": 0.7691647124880272, + "grad_norm": 0.16697217524051666, + "learning_rate": 2.5221049417980726e-05, + "loss": 0.0116, + "step": 5822 + }, + { + "epoch": 0.7692968259735112, + "grad_norm": 0.10849463194608688, + "learning_rate": 2.5193462067234275e-05, + "loss": 0.0052, + "step": 5823 + }, + { + "epoch": 0.7694289394589953, + "grad_norm": 0.2121121883392334, + "learning_rate": 2.516588763791855e-05, + "loss": 0.0218, + "step": 5824 + }, + { + "epoch": 0.7695610529444793, + "grad_norm": 0.23473268747329712, + "learning_rate": 2.5138326134796543e-05, + "loss": 0.0253, + "step": 5825 + }, + { + "epoch": 0.7696931664299633, + "grad_norm": 0.1477525532245636, + "learning_rate": 2.5110777562628985e-05, + "loss": 0.011, + "step": 5826 + }, + { + "epoch": 0.7698252799154474, + "grad_norm": 0.16129234433174133, + "learning_rate": 2.5083241926174406e-05, + "loss": 0.0226, + "step": 5827 + }, + { + "epoch": 0.7699573934009314, + "grad_norm": 0.17213378846645355, + "learning_rate": 2.5055719230189013e-05, + "loss": 0.0161, + "step": 5828 + }, + { + "epoch": 0.7700895068864154, + "grad_norm": 0.16342443227767944, + "learning_rate": 2.502820947942688e-05, + "loss": 0.0181, + "step": 5829 + }, + { + "epoch": 0.7702216203718995, + "grad_norm": 0.16742447018623352, + "learning_rate": 2.5000712678639815e-05, + "loss": 0.0129, + "step": 5830 + }, + { + "epoch": 0.7703537338573835, + "grad_norm": 0.08959437161684036, + "learning_rate": 2.4973228832577324e-05, + "loss": 0.0053, + "step": 5831 + }, + { + "epoch": 0.7704858473428675, + "grad_norm": 0.07583253085613251, + "learning_rate": 2.4945757945986748e-05, + "loss": 0.0067, + "step": 5832 + }, + { + "epoch": 0.7706179608283515, + "grad_norm": 0.14746922254562378, + "learning_rate": 2.4918300023613183e-05, + "loss": 0.0137, + "step": 5833 + }, + { + "epoch": 0.7707500743138356, + "grad_norm": 0.18567243218421936, + "learning_rate": 2.4890855070199505e-05, + "loss": 0.0147, + "step": 5834 + }, + { + "epoch": 0.7708821877993196, + "grad_norm": 0.3144448697566986, + "learning_rate": 2.486342309048624e-05, + "loss": 0.023, + "step": 5835 + }, + { + "epoch": 0.7710143012848036, + "grad_norm": 0.23152709007263184, + "learning_rate": 2.4836004089211785e-05, + "loss": 0.016, + "step": 5836 + }, + { + "epoch": 0.7711464147702877, + "grad_norm": 0.11881105601787567, + "learning_rate": 2.4808598071112288e-05, + "loss": 0.0066, + "step": 5837 + }, + { + "epoch": 0.7712785282557717, + "grad_norm": 0.17560985684394836, + "learning_rate": 2.4781205040921584e-05, + "loss": 0.0123, + "step": 5838 + }, + { + "epoch": 0.7714106417412557, + "grad_norm": 0.15621699392795563, + "learning_rate": 2.475382500337131e-05, + "loss": 0.0204, + "step": 5839 + }, + { + "epoch": 0.7715427552267398, + "grad_norm": 0.12854041159152985, + "learning_rate": 2.4726457963190875e-05, + "loss": 0.0117, + "step": 5840 + }, + { + "epoch": 0.7716748687122238, + "grad_norm": 0.2113855928182602, + "learning_rate": 2.4699103925107413e-05, + "loss": 0.0167, + "step": 5841 + }, + { + "epoch": 0.7718069821977078, + "grad_norm": 0.14856669306755066, + "learning_rate": 2.4671762893845828e-05, + "loss": 0.0112, + "step": 5842 + }, + { + "epoch": 0.7719390956831919, + "grad_norm": 0.12639237940311432, + "learning_rate": 2.4644434874128776e-05, + "loss": 0.0069, + "step": 5843 + }, + { + "epoch": 0.7720712091686759, + "grad_norm": 0.18509209156036377, + "learning_rate": 2.4617119870676676e-05, + "loss": 0.0155, + "step": 5844 + }, + { + "epoch": 0.7722033226541599, + "grad_norm": 0.11860229820013046, + "learning_rate": 2.4589817888207645e-05, + "loss": 0.0097, + "step": 5845 + }, + { + "epoch": 0.772335436139644, + "grad_norm": 0.12120820581912994, + "learning_rate": 2.45625289314376e-05, + "loss": 0.0145, + "step": 5846 + }, + { + "epoch": 0.772467549625128, + "grad_norm": 0.14185993373394012, + "learning_rate": 2.453525300508024e-05, + "loss": 0.0127, + "step": 5847 + }, + { + "epoch": 0.772599663110612, + "grad_norm": 0.14186576008796692, + "learning_rate": 2.4507990113846913e-05, + "loss": 0.0093, + "step": 5848 + }, + { + "epoch": 0.772731776596096, + "grad_norm": 0.13555972278118134, + "learning_rate": 2.44807402624468e-05, + "loss": 0.0184, + "step": 5849 + }, + { + "epoch": 0.7728638900815801, + "grad_norm": 0.14067058265209198, + "learning_rate": 2.445350345558679e-05, + "loss": 0.0166, + "step": 5850 + }, + { + "epoch": 0.7729960035670641, + "grad_norm": 0.1019555851817131, + "learning_rate": 2.4426279697971587e-05, + "loss": 0.0103, + "step": 5851 + }, + { + "epoch": 0.7731281170525481, + "grad_norm": 0.08230996876955032, + "learning_rate": 2.439906899430351e-05, + "loss": 0.0074, + "step": 5852 + }, + { + "epoch": 0.7732602305380322, + "grad_norm": 0.21562132239341736, + "learning_rate": 2.4371871349282727e-05, + "loss": 0.0113, + "step": 5853 + }, + { + "epoch": 0.7733923440235162, + "grad_norm": 0.34086093306541443, + "learning_rate": 2.4344686767607172e-05, + "loss": 0.0284, + "step": 5854 + }, + { + "epoch": 0.7735244575090002, + "grad_norm": 0.1579699069261551, + "learning_rate": 2.431751525397239e-05, + "loss": 0.0206, + "step": 5855 + }, + { + "epoch": 0.7736565709944843, + "grad_norm": 0.2254895567893982, + "learning_rate": 2.429035681307179e-05, + "loss": 0.0329, + "step": 5856 + }, + { + "epoch": 0.7737886844799683, + "grad_norm": 0.22535626590251923, + "learning_rate": 2.426321144959649e-05, + "loss": 0.0262, + "step": 5857 + }, + { + "epoch": 0.7739207979654523, + "grad_norm": 0.14168086647987366, + "learning_rate": 2.423607916823537e-05, + "loss": 0.0116, + "step": 5858 + }, + { + "epoch": 0.7740529114509364, + "grad_norm": 0.1516113579273224, + "learning_rate": 2.420895997367497e-05, + "loss": 0.0193, + "step": 5859 + }, + { + "epoch": 0.7741850249364204, + "grad_norm": 0.19735190272331238, + "learning_rate": 2.4181853870599648e-05, + "loss": 0.0131, + "step": 5860 + }, + { + "epoch": 0.7743171384219044, + "grad_norm": 0.256398469209671, + "learning_rate": 2.4154760863691505e-05, + "loss": 0.0066, + "step": 5861 + }, + { + "epoch": 0.7744492519073884, + "grad_norm": 0.13530072569847107, + "learning_rate": 2.4127680957630295e-05, + "loss": 0.0137, + "step": 5862 + }, + { + "epoch": 0.7745813653928725, + "grad_norm": 0.2853071391582489, + "learning_rate": 2.4100614157093593e-05, + "loss": 0.0287, + "step": 5863 + }, + { + "epoch": 0.7747134788783565, + "grad_norm": 0.10476741939783096, + "learning_rate": 2.4073560466756682e-05, + "loss": 0.011, + "step": 5864 + }, + { + "epoch": 0.7748455923638405, + "grad_norm": 0.15531739592552185, + "learning_rate": 2.4046519891292607e-05, + "loss": 0.0169, + "step": 5865 + }, + { + "epoch": 0.7749777058493246, + "grad_norm": 0.19582904875278473, + "learning_rate": 2.4019492435372083e-05, + "loss": 0.0132, + "step": 5866 + }, + { + "epoch": 0.7751098193348086, + "grad_norm": 0.1912730634212494, + "learning_rate": 2.3992478103663606e-05, + "loss": 0.0181, + "step": 5867 + }, + { + "epoch": 0.7752419328202926, + "grad_norm": 0.1268959492444992, + "learning_rate": 2.3965476900833428e-05, + "loss": 0.0147, + "step": 5868 + }, + { + "epoch": 0.7753740463057767, + "grad_norm": 0.13870106637477875, + "learning_rate": 2.3938488831545446e-05, + "loss": 0.0166, + "step": 5869 + }, + { + "epoch": 0.7755061597912607, + "grad_norm": 0.13729718327522278, + "learning_rate": 2.3911513900461392e-05, + "loss": 0.0172, + "step": 5870 + }, + { + "epoch": 0.7756382732767447, + "grad_norm": 0.08225198090076447, + "learning_rate": 2.3884552112240655e-05, + "loss": 0.0065, + "step": 5871 + }, + { + "epoch": 0.7757703867622288, + "grad_norm": 0.1484571397304535, + "learning_rate": 2.3857603471540414e-05, + "loss": 0.0066, + "step": 5872 + }, + { + "epoch": 0.7759025002477128, + "grad_norm": 0.21782946586608887, + "learning_rate": 2.3830667983015486e-05, + "loss": 0.01, + "step": 5873 + }, + { + "epoch": 0.7760346137331968, + "grad_norm": 0.1634368747472763, + "learning_rate": 2.380374565131852e-05, + "loss": 0.0196, + "step": 5874 + }, + { + "epoch": 0.7761667272186809, + "grad_norm": 0.21178163588047028, + "learning_rate": 2.377683648109985e-05, + "loss": 0.0185, + "step": 5875 + }, + { + "epoch": 0.7762988407041649, + "grad_norm": 0.09029104560613632, + "learning_rate": 2.3749940477007482e-05, + "loss": 0.0063, + "step": 5876 + }, + { + "epoch": 0.7764309541896489, + "grad_norm": 0.1842057704925537, + "learning_rate": 2.372305764368723e-05, + "loss": 0.0142, + "step": 5877 + }, + { + "epoch": 0.776563067675133, + "grad_norm": 0.16264231503009796, + "learning_rate": 2.3696187985782602e-05, + "loss": 0.0152, + "step": 5878 + }, + { + "epoch": 0.776695181160617, + "grad_norm": 0.17312869429588318, + "learning_rate": 2.3669331507934856e-05, + "loss": 0.0193, + "step": 5879 + }, + { + "epoch": 0.776827294646101, + "grad_norm": 0.21124525368213654, + "learning_rate": 2.3642488214782886e-05, + "loss": 0.0283, + "step": 5880 + }, + { + "epoch": 0.776959408131585, + "grad_norm": 0.1384699046611786, + "learning_rate": 2.36156581109634e-05, + "loss": 0.0098, + "step": 5881 + }, + { + "epoch": 0.7770915216170691, + "grad_norm": 0.1894114464521408, + "learning_rate": 2.358884120111082e-05, + "loss": 0.0225, + "step": 5882 + }, + { + "epoch": 0.7772236351025531, + "grad_norm": 0.0946621522307396, + "learning_rate": 2.3562037489857226e-05, + "loss": 0.0087, + "step": 5883 + }, + { + "epoch": 0.7773557485880371, + "grad_norm": 0.2099320888519287, + "learning_rate": 2.353524698183246e-05, + "loss": 0.022, + "step": 5884 + }, + { + "epoch": 0.7774878620735212, + "grad_norm": 0.1482798159122467, + "learning_rate": 2.3508469681664102e-05, + "loss": 0.0209, + "step": 5885 + }, + { + "epoch": 0.7776199755590052, + "grad_norm": 0.15039187669754028, + "learning_rate": 2.3481705593977456e-05, + "loss": 0.0137, + "step": 5886 + }, + { + "epoch": 0.7777520890444892, + "grad_norm": 0.17164143919944763, + "learning_rate": 2.345495472339545e-05, + "loss": 0.0128, + "step": 5887 + }, + { + "epoch": 0.7778842025299733, + "grad_norm": 0.2455880045890808, + "learning_rate": 2.342821707453884e-05, + "loss": 0.0136, + "step": 5888 + }, + { + "epoch": 0.7780163160154573, + "grad_norm": 0.15861807763576508, + "learning_rate": 2.340149265202607e-05, + "loss": 0.0209, + "step": 5889 + }, + { + "epoch": 0.7781484295009413, + "grad_norm": 0.1428367793560028, + "learning_rate": 2.3374781460473226e-05, + "loss": 0.0166, + "step": 5890 + }, + { + "epoch": 0.7782805429864253, + "grad_norm": 0.1530083864927292, + "learning_rate": 2.334808350449421e-05, + "loss": 0.0218, + "step": 5891 + }, + { + "epoch": 0.7784126564719094, + "grad_norm": 0.11520033329725266, + "learning_rate": 2.3321398788700622e-05, + "loss": 0.0133, + "step": 5892 + }, + { + "epoch": 0.7785447699573934, + "grad_norm": 0.23249691724777222, + "learning_rate": 2.3294727317701672e-05, + "loss": 0.0089, + "step": 5893 + }, + { + "epoch": 0.7786768834428774, + "grad_norm": 0.1689661145210266, + "learning_rate": 2.3268069096104406e-05, + "loss": 0.0178, + "step": 5894 + }, + { + "epoch": 0.7788089969283615, + "grad_norm": 0.1357138752937317, + "learning_rate": 2.3241424128513522e-05, + "loss": 0.0124, + "step": 5895 + }, + { + "epoch": 0.7789411104138455, + "grad_norm": 0.142770916223526, + "learning_rate": 2.3214792419531473e-05, + "loss": 0.0139, + "step": 5896 + }, + { + "epoch": 0.7790732238993295, + "grad_norm": 0.14981845021247864, + "learning_rate": 2.318817397375833e-05, + "loss": 0.0122, + "step": 5897 + }, + { + "epoch": 0.7792053373848136, + "grad_norm": 0.21843738853931427, + "learning_rate": 2.3161568795791965e-05, + "loss": 0.017, + "step": 5898 + }, + { + "epoch": 0.7793374508702976, + "grad_norm": 0.12665678560733795, + "learning_rate": 2.3134976890227923e-05, + "loss": 0.0128, + "step": 5899 + }, + { + "epoch": 0.7794695643557816, + "grad_norm": 0.20801448822021484, + "learning_rate": 2.3108398261659447e-05, + "loss": 0.0213, + "step": 5900 + }, + { + "epoch": 0.7796016778412657, + "grad_norm": 0.1517508625984192, + "learning_rate": 2.3081832914677514e-05, + "loss": 0.0255, + "step": 5901 + }, + { + "epoch": 0.7797337913267497, + "grad_norm": 0.1105872094631195, + "learning_rate": 2.305528085387082e-05, + "loss": 0.015, + "step": 5902 + }, + { + "epoch": 0.7798659048122337, + "grad_norm": 0.3150453567504883, + "learning_rate": 2.302874208382567e-05, + "loss": 0.0267, + "step": 5903 + }, + { + "epoch": 0.7799980182977178, + "grad_norm": 0.1409350335597992, + "learning_rate": 2.300221660912617e-05, + "loss": 0.0104, + "step": 5904 + }, + { + "epoch": 0.7801301317832018, + "grad_norm": 0.1132245883345604, + "learning_rate": 2.2975704434354096e-05, + "loss": 0.0147, + "step": 5905 + }, + { + "epoch": 0.7802622452686858, + "grad_norm": 0.24876229465007782, + "learning_rate": 2.294920556408897e-05, + "loss": 0.0213, + "step": 5906 + }, + { + "epoch": 0.7803943587541698, + "grad_norm": 0.1730552315711975, + "learning_rate": 2.2922720002907926e-05, + "loss": 0.0088, + "step": 5907 + }, + { + "epoch": 0.7805264722396539, + "grad_norm": 0.18458811938762665, + "learning_rate": 2.2896247755385857e-05, + "loss": 0.0177, + "step": 5908 + }, + { + "epoch": 0.7806585857251379, + "grad_norm": 0.15846845507621765, + "learning_rate": 2.2869788826095383e-05, + "loss": 0.0143, + "step": 5909 + }, + { + "epoch": 0.7807906992106219, + "grad_norm": 0.141799196600914, + "learning_rate": 2.284334321960674e-05, + "loss": 0.0083, + "step": 5910 + }, + { + "epoch": 0.780922812696106, + "grad_norm": 0.19804981350898743, + "learning_rate": 2.2816910940487935e-05, + "loss": 0.0234, + "step": 5911 + }, + { + "epoch": 0.78105492618159, + "grad_norm": 0.16551578044891357, + "learning_rate": 2.279049199330465e-05, + "loss": 0.02, + "step": 5912 + }, + { + "epoch": 0.781187039667074, + "grad_norm": 0.12346825003623962, + "learning_rate": 2.276408638262031e-05, + "loss": 0.0141, + "step": 5913 + }, + { + "epoch": 0.7813191531525581, + "grad_norm": 0.12936897575855255, + "learning_rate": 2.27376941129959e-05, + "loss": 0.0098, + "step": 5914 + }, + { + "epoch": 0.7814512666380421, + "grad_norm": 0.22945274412631989, + "learning_rate": 2.2711315188990247e-05, + "loss": 0.0152, + "step": 5915 + }, + { + "epoch": 0.7815833801235261, + "grad_norm": 0.17532186210155487, + "learning_rate": 2.2684949615159834e-05, + "loss": 0.017, + "step": 5916 + }, + { + "epoch": 0.7817154936090102, + "grad_norm": 0.27837732434272766, + "learning_rate": 2.2658597396058768e-05, + "loss": 0.013, + "step": 5917 + }, + { + "epoch": 0.7818476070944942, + "grad_norm": 0.1335458606481552, + "learning_rate": 2.2632258536238915e-05, + "loss": 0.0172, + "step": 5918 + }, + { + "epoch": 0.7819797205799782, + "grad_norm": 0.17067228257656097, + "learning_rate": 2.260593304024985e-05, + "loss": 0.013, + "step": 5919 + }, + { + "epoch": 0.7821118340654623, + "grad_norm": 0.19821752607822418, + "learning_rate": 2.257962091263882e-05, + "loss": 0.0227, + "step": 5920 + }, + { + "epoch": 0.7822439475509463, + "grad_norm": 0.19090385735034943, + "learning_rate": 2.2553322157950696e-05, + "loss": 0.0163, + "step": 5921 + }, + { + "epoch": 0.7823760610364303, + "grad_norm": 0.16109652817249298, + "learning_rate": 2.2527036780728128e-05, + "loss": 0.0252, + "step": 5922 + }, + { + "epoch": 0.7825081745219143, + "grad_norm": 0.1403699666261673, + "learning_rate": 2.250076478551145e-05, + "loss": 0.0079, + "step": 5923 + }, + { + "epoch": 0.7826402880073984, + "grad_norm": 0.14240224659442902, + "learning_rate": 2.2474506176838605e-05, + "loss": 0.0105, + "step": 5924 + }, + { + "epoch": 0.7827724014928824, + "grad_norm": 0.19920337200164795, + "learning_rate": 2.2448260959245304e-05, + "loss": 0.0234, + "step": 5925 + }, + { + "epoch": 0.7829045149783664, + "grad_norm": 0.1772182583808899, + "learning_rate": 2.242202913726491e-05, + "loss": 0.0208, + "step": 5926 + }, + { + "epoch": 0.7830366284638505, + "grad_norm": 0.2252548635005951, + "learning_rate": 2.239581071542852e-05, + "loss": 0.0219, + "step": 5927 + }, + { + "epoch": 0.7831687419493345, + "grad_norm": 0.13585056364536285, + "learning_rate": 2.2369605698264817e-05, + "loss": 0.0079, + "step": 5928 + }, + { + "epoch": 0.7833008554348185, + "grad_norm": 0.22028230130672455, + "learning_rate": 2.234341409030024e-05, + "loss": 0.026, + "step": 5929 + }, + { + "epoch": 0.7834329689203026, + "grad_norm": 0.11685667932033539, + "learning_rate": 2.2317235896058953e-05, + "loss": 0.0183, + "step": 5930 + }, + { + "epoch": 0.7835650824057866, + "grad_norm": 0.14003141224384308, + "learning_rate": 2.229107112006268e-05, + "loss": 0.0205, + "step": 5931 + }, + { + "epoch": 0.7836971958912706, + "grad_norm": 0.19484570622444153, + "learning_rate": 2.2264919766830927e-05, + "loss": 0.0204, + "step": 5932 + }, + { + "epoch": 0.7838293093767547, + "grad_norm": 0.19756214320659637, + "learning_rate": 2.223878184088084e-05, + "loss": 0.0194, + "step": 5933 + }, + { + "epoch": 0.7839614228622387, + "grad_norm": 0.11648136377334595, + "learning_rate": 2.2212657346727307e-05, + "loss": 0.0039, + "step": 5934 + }, + { + "epoch": 0.7840935363477227, + "grad_norm": 0.21022813022136688, + "learning_rate": 2.218654628888277e-05, + "loss": 0.0224, + "step": 5935 + }, + { + "epoch": 0.7842256498332068, + "grad_norm": 0.15801659226417542, + "learning_rate": 2.216044867185747e-05, + "loss": 0.0151, + "step": 5936 + }, + { + "epoch": 0.7843577633186908, + "grad_norm": 0.2231561243534088, + "learning_rate": 2.21343645001593e-05, + "loss": 0.017, + "step": 5937 + }, + { + "epoch": 0.7844898768041748, + "grad_norm": 0.16394644975662231, + "learning_rate": 2.2108293778293754e-05, + "loss": 0.0187, + "step": 5938 + }, + { + "epoch": 0.7846219902896588, + "grad_norm": 0.1428672969341278, + "learning_rate": 2.2082236510764098e-05, + "loss": 0.0147, + "step": 5939 + }, + { + "epoch": 0.7847541037751429, + "grad_norm": 0.22260761260986328, + "learning_rate": 2.2056192702071233e-05, + "loss": 0.0148, + "step": 5940 + }, + { + "epoch": 0.7848862172606269, + "grad_norm": 0.25145766139030457, + "learning_rate": 2.203016235671378e-05, + "loss": 0.0246, + "step": 5941 + }, + { + "epoch": 0.7850183307461109, + "grad_norm": 0.4117206633090973, + "learning_rate": 2.2004145479187922e-05, + "loss": 0.0179, + "step": 5942 + }, + { + "epoch": 0.785150444231595, + "grad_norm": 0.17742082476615906, + "learning_rate": 2.1978142073987617e-05, + "loss": 0.0236, + "step": 5943 + }, + { + "epoch": 0.785282557717079, + "grad_norm": 0.10295175760984421, + "learning_rate": 2.195215214560451e-05, + "loss": 0.0066, + "step": 5944 + }, + { + "epoch": 0.785414671202563, + "grad_norm": 0.149460569024086, + "learning_rate": 2.1926175698527806e-05, + "loss": 0.0173, + "step": 5945 + }, + { + "epoch": 0.7855467846880471, + "grad_norm": 0.15568538010120392, + "learning_rate": 2.1900212737244484e-05, + "loss": 0.0147, + "step": 5946 + }, + { + "epoch": 0.7856788981735311, + "grad_norm": 0.18868260085582733, + "learning_rate": 2.187426326623916e-05, + "loss": 0.0186, + "step": 5947 + }, + { + "epoch": 0.7858110116590151, + "grad_norm": 0.19524319469928741, + "learning_rate": 2.1848327289994143e-05, + "loss": 0.017, + "step": 5948 + }, + { + "epoch": 0.7859431251444992, + "grad_norm": 0.29335448145866394, + "learning_rate": 2.182240481298934e-05, + "loss": 0.0109, + "step": 5949 + }, + { + "epoch": 0.7860752386299832, + "grad_norm": 0.16811603307724, + "learning_rate": 2.1796495839702392e-05, + "loss": 0.0157, + "step": 5950 + }, + { + "epoch": 0.7862073521154672, + "grad_norm": 0.1527618020772934, + "learning_rate": 2.177060037460863e-05, + "loss": 0.0135, + "step": 5951 + }, + { + "epoch": 0.7863394656009512, + "grad_norm": 0.11982507258653641, + "learning_rate": 2.1744718422180945e-05, + "loss": 0.0087, + "step": 5952 + }, + { + "epoch": 0.7864715790864353, + "grad_norm": 0.2619565427303314, + "learning_rate": 2.171884998688999e-05, + "loss": 0.019, + "step": 5953 + }, + { + "epoch": 0.7866036925719193, + "grad_norm": 0.1242903470993042, + "learning_rate": 2.169299507320406e-05, + "loss": 0.0148, + "step": 5954 + }, + { + "epoch": 0.7867358060574033, + "grad_norm": 0.150680810213089, + "learning_rate": 2.1667153685589124e-05, + "loss": 0.013, + "step": 5955 + }, + { + "epoch": 0.7868679195428874, + "grad_norm": 0.14262771606445312, + "learning_rate": 2.1641325828508764e-05, + "loss": 0.0171, + "step": 5956 + }, + { + "epoch": 0.7870000330283714, + "grad_norm": 0.2812264561653137, + "learning_rate": 2.161551150642427e-05, + "loss": 0.025, + "step": 5957 + }, + { + "epoch": 0.7871321465138554, + "grad_norm": 0.1431320160627365, + "learning_rate": 2.1589710723794575e-05, + "loss": 0.011, + "step": 5958 + }, + { + "epoch": 0.7872642599993395, + "grad_norm": 0.20700080692768097, + "learning_rate": 2.156392348507631e-05, + "loss": 0.0219, + "step": 5959 + }, + { + "epoch": 0.7873963734848235, + "grad_norm": 0.10715476423501968, + "learning_rate": 2.1538149794723706e-05, + "loss": 0.0109, + "step": 5960 + }, + { + "epoch": 0.7875284869703075, + "grad_norm": 0.10400570929050446, + "learning_rate": 2.1512389657188748e-05, + "loss": 0.0084, + "step": 5961 + }, + { + "epoch": 0.7876606004557916, + "grad_norm": 0.1712016463279724, + "learning_rate": 2.1486643076920932e-05, + "loss": 0.0197, + "step": 5962 + }, + { + "epoch": 0.7877927139412756, + "grad_norm": 0.0920313149690628, + "learning_rate": 2.1460910058367543e-05, + "loss": 0.0099, + "step": 5963 + }, + { + "epoch": 0.7879248274267596, + "grad_norm": 0.10525844246149063, + "learning_rate": 2.14351906059735e-05, + "loss": 0.0073, + "step": 5964 + }, + { + "epoch": 0.7880569409122437, + "grad_norm": 0.30008089542388916, + "learning_rate": 2.1409484724181306e-05, + "loss": 0.0242, + "step": 5965 + }, + { + "epoch": 0.7881890543977277, + "grad_norm": 0.12237905710935593, + "learning_rate": 2.138379241743119e-05, + "loss": 0.0181, + "step": 5966 + }, + { + "epoch": 0.7883211678832117, + "grad_norm": 0.13109365105628967, + "learning_rate": 2.135811369016104e-05, + "loss": 0.0111, + "step": 5967 + }, + { + "epoch": 0.7884532813686957, + "grad_norm": 0.12001150846481323, + "learning_rate": 2.1332448546806382e-05, + "loss": 0.0079, + "step": 5968 + }, + { + "epoch": 0.7885853948541798, + "grad_norm": 0.11735592782497406, + "learning_rate": 2.1306796991800337e-05, + "loss": 0.0167, + "step": 5969 + }, + { + "epoch": 0.7887175083396638, + "grad_norm": 0.1726117581129074, + "learning_rate": 2.1281159029573772e-05, + "loss": 0.0137, + "step": 5970 + }, + { + "epoch": 0.7888496218251478, + "grad_norm": 0.20775265991687775, + "learning_rate": 2.1255534664555175e-05, + "loss": 0.0211, + "step": 5971 + }, + { + "epoch": 0.7889817353106319, + "grad_norm": 0.15340012311935425, + "learning_rate": 2.1229923901170646e-05, + "loss": 0.0099, + "step": 5972 + }, + { + "epoch": 0.7891138487961159, + "grad_norm": 0.1911078691482544, + "learning_rate": 2.1204326743843962e-05, + "loss": 0.0236, + "step": 5973 + }, + { + "epoch": 0.7892459622815999, + "grad_norm": 0.1343451589345932, + "learning_rate": 2.1178743196996576e-05, + "loss": 0.012, + "step": 5974 + }, + { + "epoch": 0.789378075767084, + "grad_norm": 0.29213130474090576, + "learning_rate": 2.115317326504759e-05, + "loss": 0.0219, + "step": 5975 + }, + { + "epoch": 0.789510189252568, + "grad_norm": 0.30969685316085815, + "learning_rate": 2.1127616952413666e-05, + "loss": 0.0286, + "step": 5976 + }, + { + "epoch": 0.789642302738052, + "grad_norm": 0.12944519519805908, + "learning_rate": 2.110207426350922e-05, + "loss": 0.0105, + "step": 5977 + }, + { + "epoch": 0.7897744162235361, + "grad_norm": 0.14682377874851227, + "learning_rate": 2.10765452027463e-05, + "loss": 0.0167, + "step": 5978 + }, + { + "epoch": 0.7899065297090201, + "grad_norm": 0.19225028157234192, + "learning_rate": 2.1051029774534504e-05, + "loss": 0.0151, + "step": 5979 + }, + { + "epoch": 0.7900386431945041, + "grad_norm": 0.18472789227962494, + "learning_rate": 2.102552798328119e-05, + "loss": 0.0204, + "step": 5980 + }, + { + "epoch": 0.7901707566799882, + "grad_norm": 0.11308620870113373, + "learning_rate": 2.1000039833391318e-05, + "loss": 0.0139, + "step": 5981 + }, + { + "epoch": 0.7903028701654722, + "grad_norm": 0.1950245052576065, + "learning_rate": 2.0974565329267502e-05, + "loss": 0.0196, + "step": 5982 + }, + { + "epoch": 0.7904349836509562, + "grad_norm": 0.21679812669754028, + "learning_rate": 2.0949104475309933e-05, + "loss": 0.0198, + "step": 5983 + }, + { + "epoch": 0.7905670971364402, + "grad_norm": 0.12117374688386917, + "learning_rate": 2.092365727591654e-05, + "loss": 0.015, + "step": 5984 + }, + { + "epoch": 0.7906992106219243, + "grad_norm": 0.09717853367328644, + "learning_rate": 2.0898223735482857e-05, + "loss": 0.0106, + "step": 5985 + }, + { + "epoch": 0.7908313241074083, + "grad_norm": 0.16154758632183075, + "learning_rate": 2.0872803858402013e-05, + "loss": 0.0216, + "step": 5986 + }, + { + "epoch": 0.7909634375928923, + "grad_norm": 0.3653877079486847, + "learning_rate": 2.084739764906485e-05, + "loss": 0.0397, + "step": 5987 + }, + { + "epoch": 0.7910955510783764, + "grad_norm": 0.12396520376205444, + "learning_rate": 2.082200511185979e-05, + "loss": 0.0165, + "step": 5988 + }, + { + "epoch": 0.7912276645638604, + "grad_norm": 0.1662580817937851, + "learning_rate": 2.0796626251172968e-05, + "loss": 0.0197, + "step": 5989 + }, + { + "epoch": 0.7913597780493444, + "grad_norm": 0.13176557421684265, + "learning_rate": 2.0771261071388047e-05, + "loss": 0.0181, + "step": 5990 + }, + { + "epoch": 0.7914918915348285, + "grad_norm": 0.14741529524326324, + "learning_rate": 2.0745909576886414e-05, + "loss": 0.0116, + "step": 5991 + }, + { + "epoch": 0.7916240050203125, + "grad_norm": 0.20206743478775024, + "learning_rate": 2.0720571772047092e-05, + "loss": 0.0165, + "step": 5992 + }, + { + "epoch": 0.7917561185057965, + "grad_norm": 0.11618878692388535, + "learning_rate": 2.0695247661246665e-05, + "loss": 0.0089, + "step": 5993 + }, + { + "epoch": 0.7918882319912806, + "grad_norm": 0.16395948827266693, + "learning_rate": 2.0669937248859416e-05, + "loss": 0.0167, + "step": 5994 + }, + { + "epoch": 0.7920203454767646, + "grad_norm": 0.17397412657737732, + "learning_rate": 2.0644640539257266e-05, + "loss": 0.0181, + "step": 5995 + }, + { + "epoch": 0.7921524589622486, + "grad_norm": 0.12989552319049835, + "learning_rate": 2.0619357536809746e-05, + "loss": 0.0155, + "step": 5996 + }, + { + "epoch": 0.7922845724477326, + "grad_norm": 0.140245720744133, + "learning_rate": 2.0594088245883982e-05, + "loss": 0.0132, + "step": 5997 + }, + { + "epoch": 0.7924166859332167, + "grad_norm": 0.13462164998054504, + "learning_rate": 2.0568832670844805e-05, + "loss": 0.0104, + "step": 5998 + }, + { + "epoch": 0.7925487994187007, + "grad_norm": 0.10564357042312622, + "learning_rate": 2.054359081605467e-05, + "loss": 0.0082, + "step": 5999 + }, + { + "epoch": 0.7926809129041847, + "grad_norm": 0.13238169252872467, + "learning_rate": 2.051836268587357e-05, + "loss": 0.0127, + "step": 6000 + }, + { + "epoch": 0.7928130263896688, + "grad_norm": 0.1388818621635437, + "learning_rate": 2.0493148284659225e-05, + "loss": 0.0165, + "step": 6001 + }, + { + "epoch": 0.7929451398751528, + "grad_norm": 0.13400214910507202, + "learning_rate": 2.046794761676696e-05, + "loss": 0.0108, + "step": 6002 + }, + { + "epoch": 0.7930772533606368, + "grad_norm": 0.27306875586509705, + "learning_rate": 2.0442760686549732e-05, + "loss": 0.023, + "step": 6003 + }, + { + "epoch": 0.7932093668461209, + "grad_norm": 0.26368728280067444, + "learning_rate": 2.041758749835806e-05, + "loss": 0.019, + "step": 6004 + }, + { + "epoch": 0.7933414803316049, + "grad_norm": 0.13244707882404327, + "learning_rate": 2.039242805654018e-05, + "loss": 0.0165, + "step": 6005 + }, + { + "epoch": 0.7934735938170889, + "grad_norm": 0.13773098587989807, + "learning_rate": 2.036728236544194e-05, + "loss": 0.013, + "step": 6006 + }, + { + "epoch": 0.793605707302573, + "grad_norm": 0.18311674892902374, + "learning_rate": 2.0342150429406727e-05, + "loss": 0.0141, + "step": 6007 + }, + { + "epoch": 0.793737820788057, + "grad_norm": 0.20516015589237213, + "learning_rate": 2.0317032252775638e-05, + "loss": 0.0289, + "step": 6008 + }, + { + "epoch": 0.793869934273541, + "grad_norm": 0.16869759559631348, + "learning_rate": 2.0291927839887383e-05, + "loss": 0.017, + "step": 6009 + }, + { + "epoch": 0.794002047759025, + "grad_norm": 0.24092960357666016, + "learning_rate": 2.026683719507828e-05, + "loss": 0.0159, + "step": 6010 + }, + { + "epoch": 0.7941341612445091, + "grad_norm": 0.14799204468727112, + "learning_rate": 2.0241760322682247e-05, + "loss": 0.0196, + "step": 6011 + }, + { + "epoch": 0.7942662747299931, + "grad_norm": 0.18595071136951447, + "learning_rate": 2.0216697227030855e-05, + "loss": 0.0109, + "step": 6012 + }, + { + "epoch": 0.7943983882154771, + "grad_norm": 0.112423837184906, + "learning_rate": 2.0191647912453317e-05, + "loss": 0.0057, + "step": 6013 + }, + { + "epoch": 0.7945305017009612, + "grad_norm": 0.1787814199924469, + "learning_rate": 2.016661238327636e-05, + "loss": 0.0096, + "step": 6014 + }, + { + "epoch": 0.7946626151864452, + "grad_norm": 0.14639584720134735, + "learning_rate": 2.014159064382446e-05, + "loss": 0.0119, + "step": 6015 + }, + { + "epoch": 0.7947947286719292, + "grad_norm": 0.1412990540266037, + "learning_rate": 2.0116582698419638e-05, + "loss": 0.016, + "step": 6016 + }, + { + "epoch": 0.7949268421574133, + "grad_norm": 0.1513022631406784, + "learning_rate": 2.009158855138156e-05, + "loss": 0.012, + "step": 6017 + }, + { + "epoch": 0.7950589556428973, + "grad_norm": 0.37479570508003235, + "learning_rate": 2.006660820702748e-05, + "loss": 0.0187, + "step": 6018 + }, + { + "epoch": 0.7951910691283813, + "grad_norm": 0.13417895138263702, + "learning_rate": 2.0041641669672305e-05, + "loss": 0.0104, + "step": 6019 + }, + { + "epoch": 0.7953231826138654, + "grad_norm": 0.18156084418296814, + "learning_rate": 2.001668894362856e-05, + "loss": 0.0228, + "step": 6020 + }, + { + "epoch": 0.7954552960993494, + "grad_norm": 0.14486129581928253, + "learning_rate": 1.999175003320629e-05, + "loss": 0.0091, + "step": 6021 + }, + { + "epoch": 0.7955874095848334, + "grad_norm": 0.16329576075077057, + "learning_rate": 1.996682494271327e-05, + "loss": 0.0131, + "step": 6022 + }, + { + "epoch": 0.7957195230703173, + "grad_norm": 0.13377881050109863, + "learning_rate": 1.9941913676454872e-05, + "loss": 0.0121, + "step": 6023 + }, + { + "epoch": 0.7958516365558014, + "grad_norm": 0.21078723669052124, + "learning_rate": 1.9917016238733976e-05, + "loss": 0.009, + "step": 6024 + }, + { + "epoch": 0.7959837500412854, + "grad_norm": 0.16063301265239716, + "learning_rate": 1.9892132633851214e-05, + "loss": 0.0152, + "step": 6025 + }, + { + "epoch": 0.7961158635267694, + "grad_norm": 0.22228194773197174, + "learning_rate": 1.986726286610472e-05, + "loss": 0.014, + "step": 6026 + }, + { + "epoch": 0.7962479770122535, + "grad_norm": 0.12263708561658859, + "learning_rate": 1.9842406939790337e-05, + "loss": 0.0171, + "step": 6027 + }, + { + "epoch": 0.7963800904977375, + "grad_norm": 0.08849442005157471, + "learning_rate": 1.981756485920141e-05, + "loss": 0.0051, + "step": 6028 + }, + { + "epoch": 0.7965122039832215, + "grad_norm": 0.1484423279762268, + "learning_rate": 1.979273662862895e-05, + "loss": 0.0111, + "step": 6029 + }, + { + "epoch": 0.7966443174687056, + "grad_norm": 0.20627662539482117, + "learning_rate": 1.9767922252361603e-05, + "loss": 0.0295, + "step": 6030 + }, + { + "epoch": 0.7967764309541896, + "grad_norm": 0.18533170223236084, + "learning_rate": 1.9743121734685545e-05, + "loss": 0.0181, + "step": 6031 + }, + { + "epoch": 0.7969085444396736, + "grad_norm": 0.13025982677936554, + "learning_rate": 1.971833507988462e-05, + "loss": 0.0132, + "step": 6032 + }, + { + "epoch": 0.7970406579251577, + "grad_norm": 0.13095568120479584, + "learning_rate": 1.9693562292240265e-05, + "loss": 0.0093, + "step": 6033 + }, + { + "epoch": 0.7971727714106417, + "grad_norm": 0.2232745885848999, + "learning_rate": 1.966880337603154e-05, + "loss": 0.0243, + "step": 6034 + }, + { + "epoch": 0.7973048848961257, + "grad_norm": 0.11646081507205963, + "learning_rate": 1.964405833553503e-05, + "loss": 0.0077, + "step": 6035 + }, + { + "epoch": 0.7974369983816098, + "grad_norm": 0.14113196730613708, + "learning_rate": 1.9619327175025004e-05, + "loss": 0.0137, + "step": 6036 + }, + { + "epoch": 0.7975691118670938, + "grad_norm": 0.12095404416322708, + "learning_rate": 1.9594609898773343e-05, + "loss": 0.0103, + "step": 6037 + }, + { + "epoch": 0.7977012253525778, + "grad_norm": 0.45921650528907776, + "learning_rate": 1.956990651104943e-05, + "loss": 0.0236, + "step": 6038 + }, + { + "epoch": 0.7978333388380618, + "grad_norm": 0.23896144330501556, + "learning_rate": 1.9545217016120342e-05, + "loss": 0.036, + "step": 6039 + }, + { + "epoch": 0.7979654523235459, + "grad_norm": 0.1391197144985199, + "learning_rate": 1.9520541418250727e-05, + "loss": 0.0115, + "step": 6040 + }, + { + "epoch": 0.7980975658090299, + "grad_norm": 0.2012115865945816, + "learning_rate": 1.949587972170286e-05, + "loss": 0.0149, + "step": 6041 + }, + { + "epoch": 0.7982296792945139, + "grad_norm": 0.1714034527540207, + "learning_rate": 1.9471231930736546e-05, + "loss": 0.0211, + "step": 6042 + }, + { + "epoch": 0.798361792779998, + "grad_norm": 0.12861862778663635, + "learning_rate": 1.9446598049609245e-05, + "loss": 0.0146, + "step": 6043 + }, + { + "epoch": 0.798493906265482, + "grad_norm": 0.1933763325214386, + "learning_rate": 1.942197808257602e-05, + "loss": 0.0105, + "step": 6044 + }, + { + "epoch": 0.798626019750966, + "grad_norm": 0.12401843816041946, + "learning_rate": 1.939737203388948e-05, + "loss": 0.0163, + "step": 6045 + }, + { + "epoch": 0.7987581332364501, + "grad_norm": 0.16275176405906677, + "learning_rate": 1.9372779907799865e-05, + "loss": 0.0142, + "step": 6046 + }, + { + "epoch": 0.7988902467219341, + "grad_norm": 0.1322012096643448, + "learning_rate": 1.9348201708555015e-05, + "loss": 0.0102, + "step": 6047 + }, + { + "epoch": 0.7990223602074181, + "grad_norm": 0.1378653198480606, + "learning_rate": 1.9323637440400365e-05, + "loss": 0.0242, + "step": 6048 + }, + { + "epoch": 0.7991544736929022, + "grad_norm": 0.18538536131381989, + "learning_rate": 1.9299087107578908e-05, + "loss": 0.0123, + "step": 6049 + }, + { + "epoch": 0.7992865871783862, + "grad_norm": 0.1685674786567688, + "learning_rate": 1.9274550714331253e-05, + "loss": 0.0172, + "step": 6050 + }, + { + "epoch": 0.7994187006638702, + "grad_norm": 0.16482692956924438, + "learning_rate": 1.925002826489566e-05, + "loss": 0.0157, + "step": 6051 + }, + { + "epoch": 0.7995508141493542, + "grad_norm": 0.11981571465730667, + "learning_rate": 1.9225519763507838e-05, + "loss": 0.0063, + "step": 6052 + }, + { + "epoch": 0.7996829276348383, + "grad_norm": 0.17827653884887695, + "learning_rate": 1.9201025214401223e-05, + "loss": 0.0119, + "step": 6053 + }, + { + "epoch": 0.7998150411203223, + "grad_norm": 0.238278329372406, + "learning_rate": 1.917654462180678e-05, + "loss": 0.0319, + "step": 6054 + }, + { + "epoch": 0.7999471546058063, + "grad_norm": 0.18031644821166992, + "learning_rate": 1.9152077989953097e-05, + "loss": 0.0164, + "step": 6055 + }, + { + "epoch": 0.8000792680912904, + "grad_norm": 0.21356049180030823, + "learning_rate": 1.912762532306628e-05, + "loss": 0.0147, + "step": 6056 + }, + { + "epoch": 0.8002113815767744, + "grad_norm": 0.1678241789340973, + "learning_rate": 1.9103186625370095e-05, + "loss": 0.0128, + "step": 6057 + }, + { + "epoch": 0.8003434950622584, + "grad_norm": 0.2096024453639984, + "learning_rate": 1.9078761901085905e-05, + "loss": 0.0211, + "step": 6058 + }, + { + "epoch": 0.8004756085477425, + "grad_norm": 0.15075698494911194, + "learning_rate": 1.905435115443256e-05, + "loss": 0.0161, + "step": 6059 + }, + { + "epoch": 0.8006077220332265, + "grad_norm": 0.11386120319366455, + "learning_rate": 1.902995438962659e-05, + "loss": 0.0135, + "step": 6060 + }, + { + "epoch": 0.8007398355187105, + "grad_norm": 0.22624462842941284, + "learning_rate": 1.900557161088208e-05, + "loss": 0.0127, + "step": 6061 + }, + { + "epoch": 0.8008719490041946, + "grad_norm": 0.16365881264209747, + "learning_rate": 1.8981202822410725e-05, + "loss": 0.0203, + "step": 6062 + }, + { + "epoch": 0.8010040624896786, + "grad_norm": 0.2588220536708832, + "learning_rate": 1.8956848028421725e-05, + "loss": 0.0195, + "step": 6063 + }, + { + "epoch": 0.8011361759751626, + "grad_norm": 0.16350001096725464, + "learning_rate": 1.8932507233121944e-05, + "loss": 0.0221, + "step": 6064 + }, + { + "epoch": 0.8012682894606467, + "grad_norm": 0.14093947410583496, + "learning_rate": 1.8908180440715828e-05, + "loss": 0.0114, + "step": 6065 + }, + { + "epoch": 0.8014004029461307, + "grad_norm": 0.12100745737552643, + "learning_rate": 1.8883867655405306e-05, + "loss": 0.0122, + "step": 6066 + }, + { + "epoch": 0.8015325164316147, + "grad_norm": 0.15186220407485962, + "learning_rate": 1.885956888139001e-05, + "loss": 0.0183, + "step": 6067 + }, + { + "epoch": 0.8016646299170987, + "grad_norm": 0.17774933576583862, + "learning_rate": 1.883528412286708e-05, + "loss": 0.0196, + "step": 6068 + }, + { + "epoch": 0.8017967434025828, + "grad_norm": 0.16041073203086853, + "learning_rate": 1.8811013384031283e-05, + "loss": 0.0257, + "step": 6069 + }, + { + "epoch": 0.8019288568880668, + "grad_norm": 0.15739186108112335, + "learning_rate": 1.878675666907489e-05, + "loss": 0.0088, + "step": 6070 + }, + { + "epoch": 0.8020609703735508, + "grad_norm": 0.18149971961975098, + "learning_rate": 1.8762513982187812e-05, + "loss": 0.0202, + "step": 6071 + }, + { + "epoch": 0.8021930838590349, + "grad_norm": 0.13623569905757904, + "learning_rate": 1.8738285327557546e-05, + "loss": 0.0121, + "step": 6072 + }, + { + "epoch": 0.8023251973445189, + "grad_norm": 0.2696712613105774, + "learning_rate": 1.8714070709369103e-05, + "loss": 0.0127, + "step": 6073 + }, + { + "epoch": 0.8024573108300029, + "grad_norm": 0.1282675415277481, + "learning_rate": 1.868987013180511e-05, + "loss": 0.0056, + "step": 6074 + }, + { + "epoch": 0.802589424315487, + "grad_norm": 0.1721215546131134, + "learning_rate": 1.866568359904578e-05, + "loss": 0.017, + "step": 6075 + }, + { + "epoch": 0.802721537800971, + "grad_norm": 0.31074950098991394, + "learning_rate": 1.8641511115268873e-05, + "loss": 0.0204, + "step": 6076 + }, + { + "epoch": 0.802853651286455, + "grad_norm": 0.13287027180194855, + "learning_rate": 1.8617352684649736e-05, + "loss": 0.009, + "step": 6077 + }, + { + "epoch": 0.8029857647719391, + "grad_norm": 0.17623546719551086, + "learning_rate": 1.859320831136132e-05, + "loss": 0.0133, + "step": 6078 + }, + { + "epoch": 0.8031178782574231, + "grad_norm": 0.309251993894577, + "learning_rate": 1.856907799957406e-05, + "loss": 0.0276, + "step": 6079 + }, + { + "epoch": 0.8032499917429071, + "grad_norm": 0.12996280193328857, + "learning_rate": 1.8544961753456037e-05, + "loss": 0.0123, + "step": 6080 + }, + { + "epoch": 0.8033821052283912, + "grad_norm": 0.2583175301551819, + "learning_rate": 1.8520859577172887e-05, + "loss": 0.0095, + "step": 6081 + }, + { + "epoch": 0.8035142187138752, + "grad_norm": 0.21694119274616241, + "learning_rate": 1.8496771474887832e-05, + "loss": 0.0205, + "step": 6082 + }, + { + "epoch": 0.8036463321993592, + "grad_norm": 0.1103861853480339, + "learning_rate": 1.847269745076159e-05, + "loss": 0.0113, + "step": 6083 + }, + { + "epoch": 0.8037784456848432, + "grad_norm": 0.14845329523086548, + "learning_rate": 1.8448637508952526e-05, + "loss": 0.0129, + "step": 6084 + }, + { + "epoch": 0.8039105591703273, + "grad_norm": 0.2458660900592804, + "learning_rate": 1.8424591653616564e-05, + "loss": 0.0214, + "step": 6085 + }, + { + "epoch": 0.8040426726558113, + "grad_norm": 0.2118985801935196, + "learning_rate": 1.840055988890714e-05, + "loss": 0.0189, + "step": 6086 + }, + { + "epoch": 0.8041747861412953, + "grad_norm": 0.1412656158208847, + "learning_rate": 1.83765422189753e-05, + "loss": 0.0138, + "step": 6087 + }, + { + "epoch": 0.8043068996267794, + "grad_norm": 0.24748028814792633, + "learning_rate": 1.8352538647969662e-05, + "loss": 0.0264, + "step": 6088 + }, + { + "epoch": 0.8044390131122634, + "grad_norm": 0.41792353987693787, + "learning_rate": 1.8328549180036412e-05, + "loss": 0.0254, + "step": 6089 + }, + { + "epoch": 0.8045711265977474, + "grad_norm": 0.23838505148887634, + "learning_rate": 1.8304573819319226e-05, + "loss": 0.0168, + "step": 6090 + }, + { + "epoch": 0.8047032400832315, + "grad_norm": 0.15929056704044342, + "learning_rate": 1.8280612569959443e-05, + "loss": 0.0143, + "step": 6091 + }, + { + "epoch": 0.8048353535687155, + "grad_norm": 0.21665242314338684, + "learning_rate": 1.825666543609592e-05, + "loss": 0.0143, + "step": 6092 + }, + { + "epoch": 0.8049674670541995, + "grad_norm": 0.20312796533107758, + "learning_rate": 1.823273242186505e-05, + "loss": 0.0189, + "step": 6093 + }, + { + "epoch": 0.8050995805396836, + "grad_norm": 0.11221221834421158, + "learning_rate": 1.8208813531400827e-05, + "loss": 0.0082, + "step": 6094 + }, + { + "epoch": 0.8052316940251676, + "grad_norm": 0.23713520169258118, + "learning_rate": 1.8184908768834796e-05, + "loss": 0.026, + "step": 6095 + }, + { + "epoch": 0.8053638075106516, + "grad_norm": 0.18237431347370148, + "learning_rate": 1.8161018138296083e-05, + "loss": 0.0206, + "step": 6096 + }, + { + "epoch": 0.8054959209961357, + "grad_norm": 0.13127483427524567, + "learning_rate": 1.8137141643911294e-05, + "loss": 0.0091, + "step": 6097 + }, + { + "epoch": 0.8056280344816197, + "grad_norm": 0.13977746665477753, + "learning_rate": 1.8113279289804673e-05, + "loss": 0.0158, + "step": 6098 + }, + { + "epoch": 0.8057601479671037, + "grad_norm": 0.15803252160549164, + "learning_rate": 1.808943108009802e-05, + "loss": 0.0169, + "step": 6099 + }, + { + "epoch": 0.8058922614525877, + "grad_norm": 0.240422323346138, + "learning_rate": 1.8065597018910617e-05, + "loss": 0.0096, + "step": 6100 + }, + { + "epoch": 0.8060243749380718, + "grad_norm": 0.17421869933605194, + "learning_rate": 1.804177711035938e-05, + "loss": 0.0182, + "step": 6101 + }, + { + "epoch": 0.8061564884235558, + "grad_norm": 0.27411147952079773, + "learning_rate": 1.801797135855876e-05, + "loss": 0.0278, + "step": 6102 + }, + { + "epoch": 0.8062886019090398, + "grad_norm": 0.0999927967786789, + "learning_rate": 1.7994179767620766e-05, + "loss": 0.0094, + "step": 6103 + }, + { + "epoch": 0.8064207153945239, + "grad_norm": 0.37366190552711487, + "learning_rate": 1.7970402341654902e-05, + "loss": 0.0269, + "step": 6104 + }, + { + "epoch": 0.8065528288800079, + "grad_norm": 0.17533250153064728, + "learning_rate": 1.794663908476831e-05, + "loss": 0.0107, + "step": 6105 + }, + { + "epoch": 0.8066849423654919, + "grad_norm": 0.13562843203544617, + "learning_rate": 1.7922890001065673e-05, + "loss": 0.0139, + "step": 6106 + }, + { + "epoch": 0.806817055850976, + "grad_norm": 0.24872201681137085, + "learning_rate": 1.7899155094649135e-05, + "loss": 0.0257, + "step": 6107 + }, + { + "epoch": 0.80694916933646, + "grad_norm": 0.15552151203155518, + "learning_rate": 1.7875434369618495e-05, + "loss": 0.0146, + "step": 6108 + }, + { + "epoch": 0.807081282821944, + "grad_norm": 0.11690077930688858, + "learning_rate": 1.7851727830071063e-05, + "loss": 0.0094, + "step": 6109 + }, + { + "epoch": 0.807213396307428, + "grad_norm": 0.17192292213439941, + "learning_rate": 1.7828035480101722e-05, + "loss": 0.0187, + "step": 6110 + }, + { + "epoch": 0.8073455097929121, + "grad_norm": 0.24925702810287476, + "learning_rate": 1.7804357323802845e-05, + "loss": 0.0325, + "step": 6111 + }, + { + "epoch": 0.8074776232783961, + "grad_norm": 0.10455185174942017, + "learning_rate": 1.778069336526439e-05, + "loss": 0.0135, + "step": 6112 + }, + { + "epoch": 0.8076097367638801, + "grad_norm": 0.09632913768291473, + "learning_rate": 1.775704360857392e-05, + "loss": 0.0063, + "step": 6113 + }, + { + "epoch": 0.8077418502493642, + "grad_norm": 0.15340298414230347, + "learning_rate": 1.7733408057816413e-05, + "loss": 0.0227, + "step": 6114 + }, + { + "epoch": 0.8078739637348482, + "grad_norm": 0.10595980286598206, + "learning_rate": 1.7709786717074504e-05, + "loss": 0.0089, + "step": 6115 + }, + { + "epoch": 0.8080060772203322, + "grad_norm": 0.11453713476657867, + "learning_rate": 1.7686179590428344e-05, + "loss": 0.0111, + "step": 6116 + }, + { + "epoch": 0.8081381907058163, + "grad_norm": 0.23642563819885254, + "learning_rate": 1.766258668195564e-05, + "loss": 0.0243, + "step": 6117 + }, + { + "epoch": 0.8082703041913003, + "grad_norm": 0.23531033098697662, + "learning_rate": 1.763900799573157e-05, + "loss": 0.0161, + "step": 6118 + }, + { + "epoch": 0.8084024176767843, + "grad_norm": 0.1391163170337677, + "learning_rate": 1.7615443535828945e-05, + "loss": 0.013, + "step": 6119 + }, + { + "epoch": 0.8085345311622684, + "grad_norm": 0.14361713826656342, + "learning_rate": 1.759189330631811e-05, + "loss": 0.0183, + "step": 6120 + }, + { + "epoch": 0.8086666446477524, + "grad_norm": 0.2131107896566391, + "learning_rate": 1.756835731126687e-05, + "loss": 0.0212, + "step": 6121 + }, + { + "epoch": 0.8087987581332364, + "grad_norm": 0.14085067808628082, + "learning_rate": 1.754483555474067e-05, + "loss": 0.0153, + "step": 6122 + }, + { + "epoch": 0.8089308716187205, + "grad_norm": 0.14828112721443176, + "learning_rate": 1.7521328040802422e-05, + "loss": 0.0151, + "step": 6123 + }, + { + "epoch": 0.8090629851042045, + "grad_norm": 0.1502753645181656, + "learning_rate": 1.7497834773512666e-05, + "loss": 0.0118, + "step": 6124 + }, + { + "epoch": 0.8091950985896885, + "grad_norm": 0.16962772607803345, + "learning_rate": 1.747435575692936e-05, + "loss": 0.0202, + "step": 6125 + }, + { + "epoch": 0.8093272120751726, + "grad_norm": 0.1703941524028778, + "learning_rate": 1.7450890995108095e-05, + "loss": 0.0202, + "step": 6126 + }, + { + "epoch": 0.8094593255606566, + "grad_norm": 0.17077529430389404, + "learning_rate": 1.7427440492101986e-05, + "loss": 0.0162, + "step": 6127 + }, + { + "epoch": 0.8095914390461406, + "grad_norm": 0.3168460428714752, + "learning_rate": 1.7404004251961635e-05, + "loss": 0.0215, + "step": 6128 + }, + { + "epoch": 0.8097235525316246, + "grad_norm": 0.17691540718078613, + "learning_rate": 1.7380582278735224e-05, + "loss": 0.0219, + "step": 6129 + }, + { + "epoch": 0.8098556660171087, + "grad_norm": 0.1743171364068985, + "learning_rate": 1.735717457646847e-05, + "loss": 0.0123, + "step": 6130 + }, + { + "epoch": 0.8099877795025927, + "grad_norm": 0.1374894380569458, + "learning_rate": 1.733378114920463e-05, + "loss": 0.0104, + "step": 6131 + }, + { + "epoch": 0.8101198929880767, + "grad_norm": 0.18259602785110474, + "learning_rate": 1.731040200098445e-05, + "loss": 0.0143, + "step": 6132 + }, + { + "epoch": 0.8102520064735608, + "grad_norm": 0.25777730345726013, + "learning_rate": 1.728703713584624e-05, + "loss": 0.0277, + "step": 6133 + }, + { + "epoch": 0.8103841199590448, + "grad_norm": 0.12829157710075378, + "learning_rate": 1.7263686557825864e-05, + "loss": 0.0136, + "step": 6134 + }, + { + "epoch": 0.8105162334445288, + "grad_norm": 0.15571993589401245, + "learning_rate": 1.7240350270956697e-05, + "loss": 0.0167, + "step": 6135 + }, + { + "epoch": 0.8106483469300129, + "grad_norm": 0.1591111421585083, + "learning_rate": 1.7217028279269644e-05, + "loss": 0.0136, + "step": 6136 + }, + { + "epoch": 0.8107804604154969, + "grad_norm": 0.14531998336315155, + "learning_rate": 1.719372058679315e-05, + "loss": 0.0117, + "step": 6137 + }, + { + "epoch": 0.8109125739009809, + "grad_norm": 0.17569659650325775, + "learning_rate": 1.7170427197553164e-05, + "loss": 0.0137, + "step": 6138 + }, + { + "epoch": 0.811044687386465, + "grad_norm": 0.18718554079532623, + "learning_rate": 1.7147148115573175e-05, + "loss": 0.0124, + "step": 6139 + }, + { + "epoch": 0.811176800871949, + "grad_norm": 0.14613309502601624, + "learning_rate": 1.712388334487425e-05, + "loss": 0.016, + "step": 6140 + }, + { + "epoch": 0.811308914357433, + "grad_norm": 0.14430725574493408, + "learning_rate": 1.710063288947492e-05, + "loss": 0.0133, + "step": 6141 + }, + { + "epoch": 0.811441027842917, + "grad_norm": 0.16222628951072693, + "learning_rate": 1.7077396753391262e-05, + "loss": 0.0118, + "step": 6142 + }, + { + "epoch": 0.8115731413284011, + "grad_norm": 0.1703341156244278, + "learning_rate": 1.705417494063687e-05, + "loss": 0.0191, + "step": 6143 + }, + { + "epoch": 0.8117052548138851, + "grad_norm": 0.11394409835338593, + "learning_rate": 1.7030967455222936e-05, + "loss": 0.0143, + "step": 6144 + }, + { + "epoch": 0.8118373682993691, + "grad_norm": 0.1658678501844406, + "learning_rate": 1.7007774301158054e-05, + "loss": 0.0216, + "step": 6145 + }, + { + "epoch": 0.8119694817848532, + "grad_norm": 0.14283902943134308, + "learning_rate": 1.6984595482448418e-05, + "loss": 0.0114, + "step": 6146 + }, + { + "epoch": 0.8121015952703372, + "grad_norm": 0.1382722705602646, + "learning_rate": 1.696143100309776e-05, + "loss": 0.0184, + "step": 6147 + }, + { + "epoch": 0.8122337087558212, + "grad_norm": 0.13045085966587067, + "learning_rate": 1.6938280867107335e-05, + "loss": 0.0076, + "step": 6148 + }, + { + "epoch": 0.8123658222413053, + "grad_norm": 0.16077639162540436, + "learning_rate": 1.6915145078475824e-05, + "loss": 0.0157, + "step": 6149 + }, + { + "epoch": 0.8124979357267893, + "grad_norm": 0.12093255668878555, + "learning_rate": 1.689202364119955e-05, + "loss": 0.0137, + "step": 6150 + }, + { + "epoch": 0.8126300492122733, + "grad_norm": 0.14236758649349213, + "learning_rate": 1.686891655927232e-05, + "loss": 0.0103, + "step": 6151 + }, + { + "epoch": 0.8127621626977574, + "grad_norm": 0.1466006338596344, + "learning_rate": 1.6845823836685413e-05, + "loss": 0.0109, + "step": 6152 + }, + { + "epoch": 0.8128942761832414, + "grad_norm": 0.12988708913326263, + "learning_rate": 1.682274547742767e-05, + "loss": 0.0148, + "step": 6153 + }, + { + "epoch": 0.8130263896687254, + "grad_norm": 0.14889812469482422, + "learning_rate": 1.6799681485485464e-05, + "loss": 0.0078, + "step": 6154 + }, + { + "epoch": 0.8131585031542095, + "grad_norm": 0.1435319036245346, + "learning_rate": 1.6776631864842685e-05, + "loss": 0.014, + "step": 6155 + }, + { + "epoch": 0.8132906166396935, + "grad_norm": 0.13213448226451874, + "learning_rate": 1.6753596619480684e-05, + "loss": 0.0082, + "step": 6156 + }, + { + "epoch": 0.8134227301251775, + "grad_norm": 0.16954585909843445, + "learning_rate": 1.6730575753378375e-05, + "loss": 0.0153, + "step": 6157 + }, + { + "epoch": 0.8135548436106615, + "grad_norm": 0.18840613961219788, + "learning_rate": 1.6707569270512224e-05, + "loss": 0.0178, + "step": 6158 + }, + { + "epoch": 0.8136869570961456, + "grad_norm": 0.07202202081680298, + "learning_rate": 1.6684577174856118e-05, + "loss": 0.0041, + "step": 6159 + }, + { + "epoch": 0.8138190705816296, + "grad_norm": 0.18953919410705566, + "learning_rate": 1.666159947038153e-05, + "loss": 0.0237, + "step": 6160 + }, + { + "epoch": 0.8139511840671136, + "grad_norm": 0.1994379609823227, + "learning_rate": 1.6638636161057442e-05, + "loss": 0.0253, + "step": 6161 + }, + { + "epoch": 0.8140832975525977, + "grad_norm": 0.1680680215358734, + "learning_rate": 1.6615687250850344e-05, + "loss": 0.0133, + "step": 6162 + }, + { + "epoch": 0.8142154110380817, + "grad_norm": 0.1264996975660324, + "learning_rate": 1.659275274372418e-05, + "loss": 0.0087, + "step": 6163 + }, + { + "epoch": 0.8143475245235657, + "grad_norm": 0.11757846176624298, + "learning_rate": 1.6569832643640505e-05, + "loss": 0.0119, + "step": 6164 + }, + { + "epoch": 0.8144796380090498, + "grad_norm": 0.17562219500541687, + "learning_rate": 1.654692695455835e-05, + "loss": 0.015, + "step": 6165 + }, + { + "epoch": 0.8146117514945338, + "grad_norm": 0.14509950578212738, + "learning_rate": 1.652403568043418e-05, + "loss": 0.0173, + "step": 6166 + }, + { + "epoch": 0.8147438649800178, + "grad_norm": 0.17648886144161224, + "learning_rate": 1.6501158825222085e-05, + "loss": 0.0191, + "step": 6167 + }, + { + "epoch": 0.8148759784655019, + "grad_norm": 0.17454563081264496, + "learning_rate": 1.6478296392873603e-05, + "loss": 0.0137, + "step": 6168 + }, + { + "epoch": 0.8150080919509859, + "grad_norm": 0.13573813438415527, + "learning_rate": 1.6455448387337812e-05, + "loss": 0.0142, + "step": 6169 + }, + { + "epoch": 0.8151402054364699, + "grad_norm": 0.12894220650196075, + "learning_rate": 1.643261481256123e-05, + "loss": 0.0152, + "step": 6170 + }, + { + "epoch": 0.815272318921954, + "grad_norm": 0.15514323115348816, + "learning_rate": 1.640979567248796e-05, + "loss": 0.0148, + "step": 6171 + }, + { + "epoch": 0.815404432407438, + "grad_norm": 0.2614951729774475, + "learning_rate": 1.63869909710596e-05, + "loss": 0.0127, + "step": 6172 + }, + { + "epoch": 0.815536545892922, + "grad_norm": 0.26733624935150146, + "learning_rate": 1.6364200712215194e-05, + "loss": 0.0322, + "step": 6173 + }, + { + "epoch": 0.815668659378406, + "grad_norm": 0.13763479888439178, + "learning_rate": 1.6341424899891355e-05, + "loss": 0.0155, + "step": 6174 + }, + { + "epoch": 0.8158007728638901, + "grad_norm": 0.3485512137413025, + "learning_rate": 1.631866353802217e-05, + "loss": 0.0181, + "step": 6175 + }, + { + "epoch": 0.8159328863493741, + "grad_norm": 0.13298067450523376, + "learning_rate": 1.6295916630539286e-05, + "loss": 0.0218, + "step": 6176 + }, + { + "epoch": 0.8160649998348581, + "grad_norm": 0.11390813440084457, + "learning_rate": 1.6273184181371724e-05, + "loss": 0.0089, + "step": 6177 + }, + { + "epoch": 0.8161971133203422, + "grad_norm": 0.19629289209842682, + "learning_rate": 1.6250466194446147e-05, + "loss": 0.0156, + "step": 6178 + }, + { + "epoch": 0.8163292268058262, + "grad_norm": 0.1922062635421753, + "learning_rate": 1.6227762673686665e-05, + "loss": 0.0193, + "step": 6179 + }, + { + "epoch": 0.8164613402913102, + "grad_norm": 0.225606307387352, + "learning_rate": 1.620507362301483e-05, + "loss": 0.0152, + "step": 6180 + }, + { + "epoch": 0.8165934537767943, + "grad_norm": 0.16128596663475037, + "learning_rate": 1.6182399046349806e-05, + "loss": 0.0147, + "step": 6181 + }, + { + "epoch": 0.8167255672622783, + "grad_norm": 0.09733172506093979, + "learning_rate": 1.6159738947608184e-05, + "loss": 0.0127, + "step": 6182 + }, + { + "epoch": 0.8168576807477623, + "grad_norm": 0.17848917841911316, + "learning_rate": 1.61370933307041e-05, + "loss": 0.0119, + "step": 6183 + }, + { + "epoch": 0.8169897942332464, + "grad_norm": 0.19904151558876038, + "learning_rate": 1.6114462199549106e-05, + "loss": 0.0181, + "step": 6184 + }, + { + "epoch": 0.8171219077187304, + "grad_norm": 0.07581532746553421, + "learning_rate": 1.6091845558052343e-05, + "loss": 0.0073, + "step": 6185 + }, + { + "epoch": 0.8172540212042144, + "grad_norm": 0.3024727702140808, + "learning_rate": 1.6069243410120427e-05, + "loss": 0.0215, + "step": 6186 + }, + { + "epoch": 0.8173861346896985, + "grad_norm": 0.08493215590715408, + "learning_rate": 1.6046655759657413e-05, + "loss": 0.0071, + "step": 6187 + }, + { + "epoch": 0.8175182481751825, + "grad_norm": 0.1411338597536087, + "learning_rate": 1.6024082610564918e-05, + "loss": 0.0125, + "step": 6188 + }, + { + "epoch": 0.8176503616606665, + "grad_norm": 0.11337349563837051, + "learning_rate": 1.6001523966742025e-05, + "loss": 0.0118, + "step": 6189 + }, + { + "epoch": 0.8177824751461505, + "grad_norm": 0.2526368200778961, + "learning_rate": 1.597897983208536e-05, + "loss": 0.0281, + "step": 6190 + }, + { + "epoch": 0.8179145886316346, + "grad_norm": 0.22702518105506897, + "learning_rate": 1.5956450210488936e-05, + "loss": 0.0265, + "step": 6191 + }, + { + "epoch": 0.8180467021171186, + "grad_norm": 0.17712631821632385, + "learning_rate": 1.5933935105844345e-05, + "loss": 0.0167, + "step": 6192 + }, + { + "epoch": 0.8181788156026026, + "grad_norm": 0.22530877590179443, + "learning_rate": 1.591143452204067e-05, + "loss": 0.0147, + "step": 6193 + }, + { + "epoch": 0.8183109290880867, + "grad_norm": 0.13503044843673706, + "learning_rate": 1.588894846296445e-05, + "loss": 0.0118, + "step": 6194 + }, + { + "epoch": 0.8184430425735707, + "grad_norm": 0.22267352044582367, + "learning_rate": 1.586647693249973e-05, + "loss": 0.0241, + "step": 6195 + }, + { + "epoch": 0.8185751560590547, + "grad_norm": 0.1505124866962433, + "learning_rate": 1.5844019934528088e-05, + "loss": 0.016, + "step": 6196 + }, + { + "epoch": 0.8187072695445388, + "grad_norm": 0.1663101613521576, + "learning_rate": 1.5821577472928484e-05, + "loss": 0.016, + "step": 6197 + }, + { + "epoch": 0.8188393830300228, + "grad_norm": 0.135949045419693, + "learning_rate": 1.5799149551577464e-05, + "loss": 0.0138, + "step": 6198 + }, + { + "epoch": 0.8189714965155068, + "grad_norm": 0.15780101716518402, + "learning_rate": 1.577673617434906e-05, + "loss": 0.0093, + "step": 6199 + }, + { + "epoch": 0.8191036100009909, + "grad_norm": 0.2328743040561676, + "learning_rate": 1.575433734511471e-05, + "loss": 0.0176, + "step": 6200 + }, + { + "epoch": 0.8192357234864749, + "grad_norm": 0.3225279748439789, + "learning_rate": 1.573195306774342e-05, + "loss": 0.0189, + "step": 6201 + }, + { + "epoch": 0.8193678369719589, + "grad_norm": 0.2229243665933609, + "learning_rate": 1.5709583346101653e-05, + "loss": 0.0219, + "step": 6202 + }, + { + "epoch": 0.819499950457443, + "grad_norm": 0.17149214446544647, + "learning_rate": 1.5687228184053393e-05, + "loss": 0.0154, + "step": 6203 + }, + { + "epoch": 0.819632063942927, + "grad_norm": 0.18319302797317505, + "learning_rate": 1.566488758546002e-05, + "loss": 0.0204, + "step": 6204 + }, + { + "epoch": 0.819764177428411, + "grad_norm": 0.1378282606601715, + "learning_rate": 1.564256155418047e-05, + "loss": 0.0158, + "step": 6205 + }, + { + "epoch": 0.819896290913895, + "grad_norm": 0.17166048288345337, + "learning_rate": 1.5620250094071188e-05, + "loss": 0.0112, + "step": 6206 + }, + { + "epoch": 0.8200284043993791, + "grad_norm": 0.17572304606437683, + "learning_rate": 1.5597953208986017e-05, + "loss": 0.0151, + "step": 6207 + }, + { + "epoch": 0.8201605178848631, + "grad_norm": 0.1491813212633133, + "learning_rate": 1.557567090277633e-05, + "loss": 0.0105, + "step": 6208 + }, + { + "epoch": 0.8202926313703471, + "grad_norm": 0.18040210008621216, + "learning_rate": 1.5553403179290994e-05, + "loss": 0.0159, + "step": 6209 + }, + { + "epoch": 0.8204247448558312, + "grad_norm": 0.18628931045532227, + "learning_rate": 1.553115004237635e-05, + "loss": 0.0187, + "step": 6210 + }, + { + "epoch": 0.8205568583413152, + "grad_norm": 0.24394503235816956, + "learning_rate": 1.5508911495876188e-05, + "loss": 0.0226, + "step": 6211 + }, + { + "epoch": 0.8206889718267992, + "grad_norm": 0.14266592264175415, + "learning_rate": 1.5486687543631807e-05, + "loss": 0.0207, + "step": 6212 + }, + { + "epoch": 0.8208210853122833, + "grad_norm": 0.11844737082719803, + "learning_rate": 1.5464478189482013e-05, + "loss": 0.0126, + "step": 6213 + }, + { + "epoch": 0.8209531987977673, + "grad_norm": 0.1422780603170395, + "learning_rate": 1.5442283437263005e-05, + "loss": 0.0178, + "step": 6214 + }, + { + "epoch": 0.8210853122832513, + "grad_norm": 0.1076541319489479, + "learning_rate": 1.542010329080853e-05, + "loss": 0.0056, + "step": 6215 + }, + { + "epoch": 0.8212174257687354, + "grad_norm": 0.13329507410526276, + "learning_rate": 1.5397937753949798e-05, + "loss": 0.015, + "step": 6216 + }, + { + "epoch": 0.8213495392542194, + "grad_norm": 0.14316102862358093, + "learning_rate": 1.5375786830515515e-05, + "loss": 0.0104, + "step": 6217 + }, + { + "epoch": 0.8214816527397034, + "grad_norm": 0.13540340960025787, + "learning_rate": 1.5353650524331787e-05, + "loss": 0.0106, + "step": 6218 + }, + { + "epoch": 0.8216137662251874, + "grad_norm": 0.1679118573665619, + "learning_rate": 1.5331528839222287e-05, + "loss": 0.0145, + "step": 6219 + }, + { + "epoch": 0.8217458797106715, + "grad_norm": 0.21163515746593475, + "learning_rate": 1.5309421779008125e-05, + "loss": 0.0152, + "step": 6220 + }, + { + "epoch": 0.8218779931961555, + "grad_norm": 0.3536343574523926, + "learning_rate": 1.528732934750785e-05, + "loss": 0.0184, + "step": 6221 + }, + { + "epoch": 0.8220101066816395, + "grad_norm": 0.16983529925346375, + "learning_rate": 1.526525154853753e-05, + "loss": 0.0181, + "step": 6222 + }, + { + "epoch": 0.8221422201671236, + "grad_norm": 0.20116670429706573, + "learning_rate": 1.52431883859107e-05, + "loss": 0.0223, + "step": 6223 + }, + { + "epoch": 0.8222743336526076, + "grad_norm": 0.20575542747974396, + "learning_rate": 1.5221139863438372e-05, + "loss": 0.0225, + "step": 6224 + }, + { + "epoch": 0.8224064471380916, + "grad_norm": 0.12439869344234467, + "learning_rate": 1.5199105984928985e-05, + "loss": 0.0139, + "step": 6225 + }, + { + "epoch": 0.8225385606235757, + "grad_norm": 0.10762098431587219, + "learning_rate": 1.5177086754188486e-05, + "loss": 0.007, + "step": 6226 + }, + { + "epoch": 0.8226706741090597, + "grad_norm": 0.20593222975730896, + "learning_rate": 1.5155082175020318e-05, + "loss": 0.0219, + "step": 6227 + }, + { + "epoch": 0.8228027875945437, + "grad_norm": 0.1083982065320015, + "learning_rate": 1.513309225122531e-05, + "loss": 0.0094, + "step": 6228 + }, + { + "epoch": 0.8229349010800278, + "grad_norm": 0.12976078689098358, + "learning_rate": 1.5111116986601848e-05, + "loss": 0.009, + "step": 6229 + }, + { + "epoch": 0.8230670145655118, + "grad_norm": 0.17364388704299927, + "learning_rate": 1.5089156384945725e-05, + "loss": 0.0152, + "step": 6230 + }, + { + "epoch": 0.8231991280509958, + "grad_norm": 0.11856499314308167, + "learning_rate": 1.5067210450050261e-05, + "loss": 0.0128, + "step": 6231 + }, + { + "epoch": 0.8233312415364799, + "grad_norm": 0.2314266711473465, + "learning_rate": 1.5045279185706162e-05, + "loss": 0.0183, + "step": 6232 + }, + { + "epoch": 0.8234633550219639, + "grad_norm": 0.11939840018749237, + "learning_rate": 1.5023362595701652e-05, + "loss": 0.0127, + "step": 6233 + }, + { + "epoch": 0.8235954685074479, + "grad_norm": 0.10636747628450394, + "learning_rate": 1.5001460683822456e-05, + "loss": 0.0103, + "step": 6234 + }, + { + "epoch": 0.823727581992932, + "grad_norm": 0.14871914684772491, + "learning_rate": 1.4979573453851658e-05, + "loss": 0.0128, + "step": 6235 + }, + { + "epoch": 0.823859695478416, + "grad_norm": 0.16616952419281006, + "learning_rate": 1.4957700909569894e-05, + "loss": 0.0159, + "step": 6236 + }, + { + "epoch": 0.8239918089639, + "grad_norm": 0.167495995759964, + "learning_rate": 1.4935843054755238e-05, + "loss": 0.015, + "step": 6237 + }, + { + "epoch": 0.824123922449384, + "grad_norm": 0.15769924223423004, + "learning_rate": 1.491399989318325e-05, + "loss": 0.0135, + "step": 6238 + }, + { + "epoch": 0.8242560359348681, + "grad_norm": 0.1334027200937271, + "learning_rate": 1.489217142862689e-05, + "loss": 0.0134, + "step": 6239 + }, + { + "epoch": 0.8243881494203521, + "grad_norm": 0.2270067185163498, + "learning_rate": 1.4870357664856626e-05, + "loss": 0.0108, + "step": 6240 + }, + { + "epoch": 0.8245202629058361, + "grad_norm": 0.12010292708873749, + "learning_rate": 1.4848558605640406e-05, + "loss": 0.0064, + "step": 6241 + }, + { + "epoch": 0.8246523763913202, + "grad_norm": 0.23596997559070587, + "learning_rate": 1.4826774254743559e-05, + "loss": 0.0175, + "step": 6242 + }, + { + "epoch": 0.8247844898768042, + "grad_norm": 0.24235551059246063, + "learning_rate": 1.4805004615928953e-05, + "loss": 0.0251, + "step": 6243 + }, + { + "epoch": 0.8249166033622882, + "grad_norm": 0.1819847673177719, + "learning_rate": 1.478324969295689e-05, + "loss": 0.0122, + "step": 6244 + }, + { + "epoch": 0.8250487168477723, + "grad_norm": 0.23536427319049835, + "learning_rate": 1.4761509489585146e-05, + "loss": 0.0103, + "step": 6245 + }, + { + "epoch": 0.8251808303332563, + "grad_norm": 0.19968615472316742, + "learning_rate": 1.473978400956889e-05, + "loss": 0.0244, + "step": 6246 + }, + { + "epoch": 0.8253129438187403, + "grad_norm": 0.2093326598405838, + "learning_rate": 1.4718073256660802e-05, + "loss": 0.0135, + "step": 6247 + }, + { + "epoch": 0.8254450573042243, + "grad_norm": 0.1305864006280899, + "learning_rate": 1.4696377234611058e-05, + "loss": 0.0141, + "step": 6248 + }, + { + "epoch": 0.8255771707897084, + "grad_norm": 0.2174091637134552, + "learning_rate": 1.4674695947167171e-05, + "loss": 0.0229, + "step": 6249 + }, + { + "epoch": 0.8257092842751924, + "grad_norm": 0.2369208186864853, + "learning_rate": 1.4653029398074202e-05, + "loss": 0.015, + "step": 6250 + }, + { + "epoch": 0.8258413977606764, + "grad_norm": 0.13431859016418457, + "learning_rate": 1.4631377591074658e-05, + "loss": 0.0108, + "step": 6251 + }, + { + "epoch": 0.8259735112461605, + "grad_norm": 0.12542381882667542, + "learning_rate": 1.4609740529908467e-05, + "loss": 0.0139, + "step": 6252 + }, + { + "epoch": 0.8261056247316445, + "grad_norm": 0.22405870258808136, + "learning_rate": 1.4588118218313041e-05, + "loss": 0.022, + "step": 6253 + }, + { + "epoch": 0.8262377382171285, + "grad_norm": 0.10808577388525009, + "learning_rate": 1.4566510660023225e-05, + "loss": 0.0134, + "step": 6254 + }, + { + "epoch": 0.8263698517026126, + "grad_norm": 0.14455221593379974, + "learning_rate": 1.4544917858771335e-05, + "loss": 0.0167, + "step": 6255 + }, + { + "epoch": 0.8265019651880966, + "grad_norm": 0.15318076312541962, + "learning_rate": 1.4523339818287085e-05, + "loss": 0.0102, + "step": 6256 + }, + { + "epoch": 0.8266340786735806, + "grad_norm": 0.21173270046710968, + "learning_rate": 1.4501776542297706e-05, + "loss": 0.0208, + "step": 6257 + }, + { + "epoch": 0.8267661921590647, + "grad_norm": 0.1443268209695816, + "learning_rate": 1.4480228034527876e-05, + "loss": 0.0137, + "step": 6258 + }, + { + "epoch": 0.8268983056445487, + "grad_norm": 0.10800027847290039, + "learning_rate": 1.4458694298699626e-05, + "loss": 0.0145, + "step": 6259 + }, + { + "epoch": 0.8270304191300327, + "grad_norm": 0.1543063521385193, + "learning_rate": 1.443717533853256e-05, + "loss": 0.0174, + "step": 6260 + }, + { + "epoch": 0.8271625326155168, + "grad_norm": 0.1753726601600647, + "learning_rate": 1.441567115774366e-05, + "loss": 0.0208, + "step": 6261 + }, + { + "epoch": 0.8272946461010008, + "grad_norm": 0.1982455551624298, + "learning_rate": 1.4394181760047398e-05, + "loss": 0.0106, + "step": 6262 + }, + { + "epoch": 0.8274267595864848, + "grad_norm": 0.25313007831573486, + "learning_rate": 1.4372707149155617e-05, + "loss": 0.0231, + "step": 6263 + }, + { + "epoch": 0.8275588730719688, + "grad_norm": 0.15842032432556152, + "learning_rate": 1.4351247328777672e-05, + "loss": 0.0231, + "step": 6264 + }, + { + "epoch": 0.8276909865574529, + "grad_norm": 0.22092995047569275, + "learning_rate": 1.4329802302620388e-05, + "loss": 0.0193, + "step": 6265 + }, + { + "epoch": 0.8278231000429369, + "grad_norm": 0.1875273585319519, + "learning_rate": 1.4308372074387933e-05, + "loss": 0.0187, + "step": 6266 + }, + { + "epoch": 0.8279552135284209, + "grad_norm": 0.17721475660800934, + "learning_rate": 1.428695664778199e-05, + "loss": 0.0119, + "step": 6267 + }, + { + "epoch": 0.828087327013905, + "grad_norm": 0.11427704244852066, + "learning_rate": 1.4265556026501703e-05, + "loss": 0.011, + "step": 6268 + }, + { + "epoch": 0.828219440499389, + "grad_norm": 0.16599473357200623, + "learning_rate": 1.4244170214243624e-05, + "loss": 0.0098, + "step": 6269 + }, + { + "epoch": 0.828351553984873, + "grad_norm": 0.1077287420630455, + "learning_rate": 1.4222799214701721e-05, + "loss": 0.0094, + "step": 6270 + }, + { + "epoch": 0.8284836674703571, + "grad_norm": 0.16686657071113586, + "learning_rate": 1.4201443031567451e-05, + "loss": 0.0113, + "step": 6271 + }, + { + "epoch": 0.8286157809558411, + "grad_norm": 0.17287173867225647, + "learning_rate": 1.4180101668529721e-05, + "loss": 0.0046, + "step": 6272 + }, + { + "epoch": 0.8287478944413251, + "grad_norm": 0.10685086995363235, + "learning_rate": 1.4158775129274815e-05, + "loss": 0.0078, + "step": 6273 + }, + { + "epoch": 0.8288800079268092, + "grad_norm": 0.23552167415618896, + "learning_rate": 1.4137463417486495e-05, + "loss": 0.0187, + "step": 6274 + }, + { + "epoch": 0.8290121214122932, + "grad_norm": 0.24776104092597961, + "learning_rate": 1.4116166536845988e-05, + "loss": 0.0167, + "step": 6275 + }, + { + "epoch": 0.8291442348977772, + "grad_norm": 0.15167926251888275, + "learning_rate": 1.4094884491031934e-05, + "loss": 0.02, + "step": 6276 + }, + { + "epoch": 0.8292763483832613, + "grad_norm": 0.22547586262226105, + "learning_rate": 1.4073617283720376e-05, + "loss": 0.0244, + "step": 6277 + }, + { + "epoch": 0.8294084618687453, + "grad_norm": 0.13497163355350494, + "learning_rate": 1.4052364918584837e-05, + "loss": 0.0127, + "step": 6278 + }, + { + "epoch": 0.8295405753542293, + "grad_norm": 0.14811024069786072, + "learning_rate": 1.4031127399296296e-05, + "loss": 0.0194, + "step": 6279 + }, + { + "epoch": 0.8296726888397133, + "grad_norm": 0.1362769901752472, + "learning_rate": 1.4009904729523083e-05, + "loss": 0.0157, + "step": 6280 + }, + { + "epoch": 0.8298048023251974, + "grad_norm": 0.16739948093891144, + "learning_rate": 1.3988696912931065e-05, + "loss": 0.0076, + "step": 6281 + }, + { + "epoch": 0.8299369158106814, + "grad_norm": 0.1778571456670761, + "learning_rate": 1.396750395318347e-05, + "loss": 0.0116, + "step": 6282 + }, + { + "epoch": 0.8300690292961654, + "grad_norm": 0.18066781759262085, + "learning_rate": 1.3946325853941012e-05, + "loss": 0.0104, + "step": 6283 + }, + { + "epoch": 0.8302011427816495, + "grad_norm": 0.15547846257686615, + "learning_rate": 1.3925162618861776e-05, + "loss": 0.0076, + "step": 6284 + }, + { + "epoch": 0.8303332562671335, + "grad_norm": 0.2607351541519165, + "learning_rate": 1.3904014251601328e-05, + "loss": 0.0104, + "step": 6285 + }, + { + "epoch": 0.8304653697526175, + "grad_norm": 0.14374835789203644, + "learning_rate": 1.3882880755812689e-05, + "loss": 0.0135, + "step": 6286 + }, + { + "epoch": 0.8305974832381016, + "grad_norm": 0.2493097335100174, + "learning_rate": 1.3861762135146217e-05, + "loss": 0.0216, + "step": 6287 + }, + { + "epoch": 0.8307295967235856, + "grad_norm": 0.15661141276359558, + "learning_rate": 1.3840658393249784e-05, + "loss": 0.0151, + "step": 6288 + }, + { + "epoch": 0.8308617102090696, + "grad_norm": 0.1942441165447235, + "learning_rate": 1.3819569533768673e-05, + "loss": 0.0124, + "step": 6289 + }, + { + "epoch": 0.8309938236945537, + "grad_norm": 0.10488870739936829, + "learning_rate": 1.3798495560345603e-05, + "loss": 0.0078, + "step": 6290 + }, + { + "epoch": 0.8311259371800377, + "grad_norm": 0.1622200906276703, + "learning_rate": 1.3777436476620675e-05, + "loss": 0.0115, + "step": 6291 + }, + { + "epoch": 0.8312580506655217, + "grad_norm": 0.22020554542541504, + "learning_rate": 1.3756392286231468e-05, + "loss": 0.0153, + "step": 6292 + }, + { + "epoch": 0.8313901641510058, + "grad_norm": 0.11919867992401123, + "learning_rate": 1.3735362992812994e-05, + "loss": 0.0146, + "step": 6293 + }, + { + "epoch": 0.8315222776364898, + "grad_norm": 0.1653178334236145, + "learning_rate": 1.3714348599997628e-05, + "loss": 0.0156, + "step": 6294 + }, + { + "epoch": 0.8316543911219738, + "grad_norm": 0.09803801029920578, + "learning_rate": 1.3693349111415243e-05, + "loss": 0.0113, + "step": 6295 + }, + { + "epoch": 0.8317865046074578, + "grad_norm": 0.20118437707424164, + "learning_rate": 1.3672364530693094e-05, + "loss": 0.0268, + "step": 6296 + }, + { + "epoch": 0.8319186180929419, + "grad_norm": 0.12770311534404755, + "learning_rate": 1.3651394861455902e-05, + "loss": 0.0113, + "step": 6297 + }, + { + "epoch": 0.8320507315784259, + "grad_norm": 0.12034901976585388, + "learning_rate": 1.3630440107325737e-05, + "loss": 0.014, + "step": 6298 + }, + { + "epoch": 0.8321828450639099, + "grad_norm": 0.16769957542419434, + "learning_rate": 1.3609500271922171e-05, + "loss": 0.0135, + "step": 6299 + }, + { + "epoch": 0.832314958549394, + "grad_norm": 0.16110952198505402, + "learning_rate": 1.3588575358862188e-05, + "loss": 0.0189, + "step": 6300 + }, + { + "epoch": 0.832447072034878, + "grad_norm": 0.09111898392438889, + "learning_rate": 1.3567665371760141e-05, + "loss": 0.0064, + "step": 6301 + }, + { + "epoch": 0.832579185520362, + "grad_norm": 0.1689785271883011, + "learning_rate": 1.3546770314227841e-05, + "loss": 0.0126, + "step": 6302 + }, + { + "epoch": 0.8327112990058461, + "grad_norm": 0.19347065687179565, + "learning_rate": 1.3525890189874536e-05, + "loss": 0.0109, + "step": 6303 + }, + { + "epoch": 0.8328434124913301, + "grad_norm": 0.18107086420059204, + "learning_rate": 1.3505025002306893e-05, + "loss": 0.0156, + "step": 6304 + }, + { + "epoch": 0.8329755259768141, + "grad_norm": 0.16515274345874786, + "learning_rate": 1.3484174755128932e-05, + "loss": 0.0211, + "step": 6305 + }, + { + "epoch": 0.8331076394622982, + "grad_norm": 0.10837220400571823, + "learning_rate": 1.3463339451942181e-05, + "loss": 0.0082, + "step": 6306 + }, + { + "epoch": 0.8332397529477822, + "grad_norm": 0.18501004576683044, + "learning_rate": 1.3442519096345563e-05, + "loss": 0.0301, + "step": 6307 + }, + { + "epoch": 0.8333718664332662, + "grad_norm": 0.12566491961479187, + "learning_rate": 1.3421713691935356e-05, + "loss": 0.0104, + "step": 6308 + }, + { + "epoch": 0.8335039799187502, + "grad_norm": 0.14533939957618713, + "learning_rate": 1.340092324230533e-05, + "loss": 0.0126, + "step": 6309 + }, + { + "epoch": 0.8336360934042343, + "grad_norm": 0.11882299929857254, + "learning_rate": 1.3380147751046646e-05, + "loss": 0.0096, + "step": 6310 + }, + { + "epoch": 0.8337682068897183, + "grad_norm": 0.19842758774757385, + "learning_rate": 1.3359387221747876e-05, + "loss": 0.015, + "step": 6311 + }, + { + "epoch": 0.8339003203752023, + "grad_norm": 0.1555911749601364, + "learning_rate": 1.3338641657995033e-05, + "loss": 0.0254, + "step": 6312 + }, + { + "epoch": 0.8340324338606864, + "grad_norm": 0.12339769303798676, + "learning_rate": 1.3317911063371536e-05, + "loss": 0.0098, + "step": 6313 + }, + { + "epoch": 0.8341645473461704, + "grad_norm": 0.15423816442489624, + "learning_rate": 1.3297195441458154e-05, + "loss": 0.0155, + "step": 6314 + }, + { + "epoch": 0.8342966608316544, + "grad_norm": 0.2922482192516327, + "learning_rate": 1.327649479583316e-05, + "loss": 0.0221, + "step": 6315 + }, + { + "epoch": 0.8344287743171385, + "grad_norm": 0.19567056000232697, + "learning_rate": 1.3255809130072194e-05, + "loss": 0.0133, + "step": 6316 + }, + { + "epoch": 0.8345608878026225, + "grad_norm": 0.18917563557624817, + "learning_rate": 1.3235138447748342e-05, + "loss": 0.0175, + "step": 6317 + }, + { + "epoch": 0.8346930012881065, + "grad_norm": 0.1895277500152588, + "learning_rate": 1.3214482752432033e-05, + "loss": 0.0193, + "step": 6318 + }, + { + "epoch": 0.8348251147735906, + "grad_norm": 0.10119043290615082, + "learning_rate": 1.3193842047691174e-05, + "loss": 0.0102, + "step": 6319 + }, + { + "epoch": 0.8349572282590746, + "grad_norm": 0.21565113961696625, + "learning_rate": 1.3173216337091098e-05, + "loss": 0.0242, + "step": 6320 + }, + { + "epoch": 0.8350893417445586, + "grad_norm": 0.20141518115997314, + "learning_rate": 1.3152605624194436e-05, + "loss": 0.0182, + "step": 6321 + }, + { + "epoch": 0.8352214552300427, + "grad_norm": 0.08046968281269073, + "learning_rate": 1.3132009912561361e-05, + "loss": 0.0064, + "step": 6322 + }, + { + "epoch": 0.8353535687155267, + "grad_norm": 0.13898640871047974, + "learning_rate": 1.311142920574937e-05, + "loss": 0.0135, + "step": 6323 + }, + { + "epoch": 0.8354856822010107, + "grad_norm": 0.12394464761018753, + "learning_rate": 1.3090863507313422e-05, + "loss": 0.0107, + "step": 6324 + }, + { + "epoch": 0.8356177956864947, + "grad_norm": 0.23427864909172058, + "learning_rate": 1.307031282080582e-05, + "loss": 0.0229, + "step": 6325 + }, + { + "epoch": 0.8357499091719788, + "grad_norm": 0.13156038522720337, + "learning_rate": 1.3049777149776332e-05, + "loss": 0.0175, + "step": 6326 + }, + { + "epoch": 0.8358820226574628, + "grad_norm": 0.17762839794158936, + "learning_rate": 1.3029256497772136e-05, + "loss": 0.0177, + "step": 6327 + }, + { + "epoch": 0.8360141361429468, + "grad_norm": 0.10986623913049698, + "learning_rate": 1.3008750868337738e-05, + "loss": 0.0153, + "step": 6328 + }, + { + "epoch": 0.8361462496284309, + "grad_norm": 0.23786404728889465, + "learning_rate": 1.2988260265015128e-05, + "loss": 0.0173, + "step": 6329 + }, + { + "epoch": 0.8362783631139149, + "grad_norm": 0.13705205917358398, + "learning_rate": 1.2967784691343676e-05, + "loss": 0.0129, + "step": 6330 + }, + { + "epoch": 0.8364104765993989, + "grad_norm": 0.1301295906305313, + "learning_rate": 1.2947324150860174e-05, + "loss": 0.0174, + "step": 6331 + }, + { + "epoch": 0.836542590084883, + "grad_norm": 0.17995145916938782, + "learning_rate": 1.2926878647098762e-05, + "loss": 0.0178, + "step": 6332 + }, + { + "epoch": 0.836674703570367, + "grad_norm": 0.11307818442583084, + "learning_rate": 1.2906448183591024e-05, + "loss": 0.0083, + "step": 6333 + }, + { + "epoch": 0.836806817055851, + "grad_norm": 0.2079186737537384, + "learning_rate": 1.2886032763865975e-05, + "loss": 0.0111, + "step": 6334 + }, + { + "epoch": 0.8369389305413351, + "grad_norm": 0.15454719960689545, + "learning_rate": 1.2865632391449956e-05, + "loss": 0.0187, + "step": 6335 + }, + { + "epoch": 0.8370710440268191, + "grad_norm": 0.1949504017829895, + "learning_rate": 1.2845247069866761e-05, + "loss": 0.0207, + "step": 6336 + }, + { + "epoch": 0.8372031575123031, + "grad_norm": 0.11500924825668335, + "learning_rate": 1.2824876802637586e-05, + "loss": 0.0128, + "step": 6337 + }, + { + "epoch": 0.8373352709977872, + "grad_norm": 0.1913449466228485, + "learning_rate": 1.2804521593281016e-05, + "loss": 0.0188, + "step": 6338 + }, + { + "epoch": 0.8374673844832712, + "grad_norm": 0.19859804213047028, + "learning_rate": 1.2784181445313015e-05, + "loss": 0.0271, + "step": 6339 + }, + { + "epoch": 0.8375994979687552, + "grad_norm": 0.17501254379749298, + "learning_rate": 1.2763856362246962e-05, + "loss": 0.02, + "step": 6340 + }, + { + "epoch": 0.8377316114542392, + "grad_norm": 0.15442462265491486, + "learning_rate": 1.2743546347593672e-05, + "loss": 0.0149, + "step": 6341 + }, + { + "epoch": 0.8378637249397233, + "grad_norm": 0.17348526418209076, + "learning_rate": 1.2723251404861259e-05, + "loss": 0.0178, + "step": 6342 + }, + { + "epoch": 0.8379958384252073, + "grad_norm": 0.1573207974433899, + "learning_rate": 1.270297153755534e-05, + "loss": 0.027, + "step": 6343 + }, + { + "epoch": 0.8381279519106913, + "grad_norm": 0.9048620462417603, + "learning_rate": 1.2682706749178874e-05, + "loss": 0.0215, + "step": 6344 + }, + { + "epoch": 0.8382600653961754, + "grad_norm": 0.12792988121509552, + "learning_rate": 1.2662457043232235e-05, + "loss": 0.0092, + "step": 6345 + }, + { + "epoch": 0.8383921788816594, + "grad_norm": 0.1024986281991005, + "learning_rate": 1.2642222423213145e-05, + "loss": 0.0105, + "step": 6346 + }, + { + "epoch": 0.8385242923671434, + "grad_norm": 0.16506196558475494, + "learning_rate": 1.262200289261679e-05, + "loss": 0.0158, + "step": 6347 + }, + { + "epoch": 0.8386564058526275, + "grad_norm": 0.11527720093727112, + "learning_rate": 1.260179845493572e-05, + "loss": 0.0124, + "step": 6348 + }, + { + "epoch": 0.8387885193381114, + "grad_norm": 0.09696735441684723, + "learning_rate": 1.2581609113659842e-05, + "loss": 0.0161, + "step": 6349 + }, + { + "epoch": 0.8389206328235954, + "grad_norm": 0.16589656472206116, + "learning_rate": 1.25614348722765e-05, + "loss": 0.0157, + "step": 6350 + }, + { + "epoch": 0.8390527463090794, + "grad_norm": 0.16736134886741638, + "learning_rate": 1.2541275734270419e-05, + "loss": 0.0159, + "step": 6351 + }, + { + "epoch": 0.8391848597945635, + "grad_norm": 0.13019654154777527, + "learning_rate": 1.2521131703123745e-05, + "loss": 0.0089, + "step": 6352 + }, + { + "epoch": 0.8393169732800475, + "grad_norm": 0.15413515269756317, + "learning_rate": 1.2501002782315918e-05, + "loss": 0.0146, + "step": 6353 + }, + { + "epoch": 0.8394490867655315, + "grad_norm": 0.14028388261795044, + "learning_rate": 1.248088897532388e-05, + "loss": 0.0148, + "step": 6354 + }, + { + "epoch": 0.8395812002510156, + "grad_norm": 0.12065458297729492, + "learning_rate": 1.2460790285621916e-05, + "loss": 0.0117, + "step": 6355 + }, + { + "epoch": 0.8397133137364996, + "grad_norm": 0.19000723958015442, + "learning_rate": 1.2440706716681672e-05, + "loss": 0.0273, + "step": 6356 + }, + { + "epoch": 0.8398454272219836, + "grad_norm": 0.2658844590187073, + "learning_rate": 1.2420638271972218e-05, + "loss": 0.0329, + "step": 6357 + }, + { + "epoch": 0.8399775407074677, + "grad_norm": 0.22127166390419006, + "learning_rate": 1.2400584954960016e-05, + "loss": 0.0139, + "step": 6358 + }, + { + "epoch": 0.8401096541929517, + "grad_norm": 0.13642321527004242, + "learning_rate": 1.2380546769108903e-05, + "loss": 0.0162, + "step": 6359 + }, + { + "epoch": 0.8402417676784357, + "grad_norm": 0.24587512016296387, + "learning_rate": 1.2360523717880068e-05, + "loss": 0.031, + "step": 6360 + }, + { + "epoch": 0.8403738811639198, + "grad_norm": 0.17175354063510895, + "learning_rate": 1.2340515804732155e-05, + "loss": 0.0194, + "step": 6361 + }, + { + "epoch": 0.8405059946494038, + "grad_norm": 0.1904156357049942, + "learning_rate": 1.2320523033121156e-05, + "loss": 0.015, + "step": 6362 + }, + { + "epoch": 0.8406381081348878, + "grad_norm": 0.12668251991271973, + "learning_rate": 1.2300545406500408e-05, + "loss": 0.011, + "step": 6363 + }, + { + "epoch": 0.8407702216203718, + "grad_norm": 0.17923864722251892, + "learning_rate": 1.2280582928320717e-05, + "loss": 0.0188, + "step": 6364 + }, + { + "epoch": 0.8409023351058559, + "grad_norm": 0.154831200838089, + "learning_rate": 1.2260635602030202e-05, + "loss": 0.0175, + "step": 6365 + }, + { + "epoch": 0.8410344485913399, + "grad_norm": 0.13334999978542328, + "learning_rate": 1.2240703431074418e-05, + "loss": 0.013, + "step": 6366 + }, + { + "epoch": 0.8411665620768239, + "grad_norm": 0.17862273752689362, + "learning_rate": 1.2220786418896236e-05, + "loss": 0.0161, + "step": 6367 + }, + { + "epoch": 0.841298675562308, + "grad_norm": 0.2127719670534134, + "learning_rate": 1.2200884568935956e-05, + "loss": 0.0221, + "step": 6368 + }, + { + "epoch": 0.841430789047792, + "grad_norm": 0.09783175587654114, + "learning_rate": 1.2180997884631296e-05, + "loss": 0.0088, + "step": 6369 + }, + { + "epoch": 0.841562902533276, + "grad_norm": 0.15384633839130402, + "learning_rate": 1.2161126369417252e-05, + "loss": 0.0143, + "step": 6370 + }, + { + "epoch": 0.8416950160187601, + "grad_norm": 0.14649969339370728, + "learning_rate": 1.2141270026726293e-05, + "loss": 0.0163, + "step": 6371 + }, + { + "epoch": 0.8418271295042441, + "grad_norm": 0.1692427098751068, + "learning_rate": 1.2121428859988227e-05, + "loss": 0.0216, + "step": 6372 + }, + { + "epoch": 0.8419592429897281, + "grad_norm": 0.2643669545650482, + "learning_rate": 1.2101602872630224e-05, + "loss": 0.0154, + "step": 6373 + }, + { + "epoch": 0.8420913564752122, + "grad_norm": 0.0852733626961708, + "learning_rate": 1.2081792068076858e-05, + "loss": 0.0067, + "step": 6374 + }, + { + "epoch": 0.8422234699606962, + "grad_norm": 0.13838987052440643, + "learning_rate": 1.206199644975009e-05, + "loss": 0.0146, + "step": 6375 + }, + { + "epoch": 0.8423555834461802, + "grad_norm": 0.1700979620218277, + "learning_rate": 1.2042216021069252e-05, + "loss": 0.0219, + "step": 6376 + }, + { + "epoch": 0.8424876969316643, + "grad_norm": 0.14072169363498688, + "learning_rate": 1.2022450785451001e-05, + "loss": 0.0125, + "step": 6377 + }, + { + "epoch": 0.8426198104171483, + "grad_norm": 0.15077058970928192, + "learning_rate": 1.2002700746309437e-05, + "loss": 0.0109, + "step": 6378 + }, + { + "epoch": 0.8427519239026323, + "grad_norm": 0.26554206013679504, + "learning_rate": 1.1982965907056032e-05, + "loss": 0.0241, + "step": 6379 + }, + { + "epoch": 0.8428840373881163, + "grad_norm": 0.10113240778446198, + "learning_rate": 1.1963246271099571e-05, + "loss": 0.0041, + "step": 6380 + }, + { + "epoch": 0.8430161508736004, + "grad_norm": 0.3097294569015503, + "learning_rate": 1.1943541841846262e-05, + "loss": 0.0248, + "step": 6381 + }, + { + "epoch": 0.8431482643590844, + "grad_norm": 0.13991662859916687, + "learning_rate": 1.1923852622699693e-05, + "loss": 0.0058, + "step": 6382 + }, + { + "epoch": 0.8432803778445684, + "grad_norm": 0.13752928376197815, + "learning_rate": 1.1904178617060812e-05, + "loss": 0.0102, + "step": 6383 + }, + { + "epoch": 0.8434124913300525, + "grad_norm": 0.16692815721035004, + "learning_rate": 1.1884519828327912e-05, + "loss": 0.0153, + "step": 6384 + }, + { + "epoch": 0.8435446048155365, + "grad_norm": 0.12915128469467163, + "learning_rate": 1.1864876259896684e-05, + "loss": 0.012, + "step": 6385 + }, + { + "epoch": 0.8436767183010205, + "grad_norm": 0.12745681405067444, + "learning_rate": 1.1845247915160219e-05, + "loss": 0.0116, + "step": 6386 + }, + { + "epoch": 0.8438088317865046, + "grad_norm": 0.13691522181034088, + "learning_rate": 1.18256347975089e-05, + "loss": 0.0193, + "step": 6387 + }, + { + "epoch": 0.8439409452719886, + "grad_norm": 0.24836905300617218, + "learning_rate": 1.1806036910330554e-05, + "loss": 0.0289, + "step": 6388 + }, + { + "epoch": 0.8440730587574726, + "grad_norm": 0.12612557411193848, + "learning_rate": 1.1786454257010337e-05, + "loss": 0.0081, + "step": 6389 + }, + { + "epoch": 0.8442051722429567, + "grad_norm": 0.1398400068283081, + "learning_rate": 1.1766886840930824e-05, + "loss": 0.0217, + "step": 6390 + }, + { + "epoch": 0.8443372857284407, + "grad_norm": 0.19320712983608246, + "learning_rate": 1.1747334665471865e-05, + "loss": 0.0297, + "step": 6391 + }, + { + "epoch": 0.8444693992139247, + "grad_norm": 0.12965945899486542, + "learning_rate": 1.1727797734010771e-05, + "loss": 0.0153, + "step": 6392 + }, + { + "epoch": 0.8446015126994088, + "grad_norm": 0.08290174603462219, + "learning_rate": 1.1708276049922174e-05, + "loss": 0.008, + "step": 6393 + }, + { + "epoch": 0.8447336261848928, + "grad_norm": 0.28244665265083313, + "learning_rate": 1.1688769616578067e-05, + "loss": 0.019, + "step": 6394 + }, + { + "epoch": 0.8448657396703768, + "grad_norm": 0.23474980890750885, + "learning_rate": 1.1669278437347819e-05, + "loss": 0.0266, + "step": 6395 + }, + { + "epoch": 0.8449978531558608, + "grad_norm": 0.11438631266355515, + "learning_rate": 1.1649802515598185e-05, + "loss": 0.0194, + "step": 6396 + }, + { + "epoch": 0.8451299666413449, + "grad_norm": 0.11677414923906326, + "learning_rate": 1.1630341854693273e-05, + "loss": 0.0106, + "step": 6397 + }, + { + "epoch": 0.8452620801268289, + "grad_norm": 0.1963653713464737, + "learning_rate": 1.1610896457994513e-05, + "loss": 0.0202, + "step": 6398 + }, + { + "epoch": 0.8453941936123129, + "grad_norm": 0.13961468636989594, + "learning_rate": 1.1591466328860756e-05, + "loss": 0.0181, + "step": 6399 + }, + { + "epoch": 0.845526307097797, + "grad_norm": 0.1912894994020462, + "learning_rate": 1.1572051470648216e-05, + "loss": 0.0173, + "step": 6400 + }, + { + "epoch": 0.845658420583281, + "grad_norm": 0.11899048835039139, + "learning_rate": 1.1552651886710398e-05, + "loss": 0.0149, + "step": 6401 + }, + { + "epoch": 0.845790534068765, + "grad_norm": 0.22597384452819824, + "learning_rate": 1.1533267580398254e-05, + "loss": 0.0179, + "step": 6402 + }, + { + "epoch": 0.8459226475542491, + "grad_norm": 0.1366269737482071, + "learning_rate": 1.1513898555060033e-05, + "loss": 0.0067, + "step": 6403 + }, + { + "epoch": 0.8460547610397331, + "grad_norm": 0.22207403182983398, + "learning_rate": 1.1494544814041419e-05, + "loss": 0.0179, + "step": 6404 + }, + { + "epoch": 0.8461868745252171, + "grad_norm": 0.17005328834056854, + "learning_rate": 1.1475206360685353e-05, + "loss": 0.0162, + "step": 6405 + }, + { + "epoch": 0.8463189880107012, + "grad_norm": 0.0824325904250145, + "learning_rate": 1.1455883198332217e-05, + "loss": 0.0036, + "step": 6406 + }, + { + "epoch": 0.8464511014961852, + "grad_norm": 0.22979408502578735, + "learning_rate": 1.1436575330319744e-05, + "loss": 0.0226, + "step": 6407 + }, + { + "epoch": 0.8465832149816692, + "grad_norm": 0.08795081824064255, + "learning_rate": 1.1417282759982972e-05, + "loss": 0.009, + "step": 6408 + }, + { + "epoch": 0.8467153284671532, + "grad_norm": 0.30979159474372864, + "learning_rate": 1.1398005490654352e-05, + "loss": 0.0271, + "step": 6409 + }, + { + "epoch": 0.8468474419526373, + "grad_norm": 0.1569080799818039, + "learning_rate": 1.1378743525663659e-05, + "loss": 0.0145, + "step": 6410 + }, + { + "epoch": 0.8469795554381213, + "grad_norm": 0.08720047026872635, + "learning_rate": 1.1359496868338072e-05, + "loss": 0.0099, + "step": 6411 + }, + { + "epoch": 0.8471116689236053, + "grad_norm": 0.2238951474428177, + "learning_rate": 1.1340265522002036e-05, + "loss": 0.0138, + "step": 6412 + }, + { + "epoch": 0.8472437824090894, + "grad_norm": 0.17409300804138184, + "learning_rate": 1.1321049489977443e-05, + "loss": 0.0181, + "step": 6413 + }, + { + "epoch": 0.8473758958945734, + "grad_norm": 0.23001131415367126, + "learning_rate": 1.1301848775583513e-05, + "loss": 0.0222, + "step": 6414 + }, + { + "epoch": 0.8475080093800574, + "grad_norm": 0.15086467564105988, + "learning_rate": 1.1282663382136783e-05, + "loss": 0.0132, + "step": 6415 + }, + { + "epoch": 0.8476401228655415, + "grad_norm": 0.08834061026573181, + "learning_rate": 1.1263493312951168e-05, + "loss": 0.0057, + "step": 6416 + }, + { + "epoch": 0.8477722363510255, + "grad_norm": 0.12019912898540497, + "learning_rate": 1.1244338571337964e-05, + "loss": 0.0165, + "step": 6417 + }, + { + "epoch": 0.8479043498365095, + "grad_norm": 0.29346486926078796, + "learning_rate": 1.1225199160605793e-05, + "loss": 0.0147, + "step": 6418 + }, + { + "epoch": 0.8480364633219936, + "grad_norm": 0.13543280959129333, + "learning_rate": 1.1206075084060608e-05, + "loss": 0.0154, + "step": 6419 + }, + { + "epoch": 0.8481685768074776, + "grad_norm": 0.1138705313205719, + "learning_rate": 1.1186966345005745e-05, + "loss": 0.0128, + "step": 6420 + }, + { + "epoch": 0.8483006902929616, + "grad_norm": 0.1780928373336792, + "learning_rate": 1.1167872946741909e-05, + "loss": 0.0218, + "step": 6421 + }, + { + "epoch": 0.8484328037784457, + "grad_norm": 0.16972623765468597, + "learning_rate": 1.1148794892567071e-05, + "loss": 0.0147, + "step": 6422 + }, + { + "epoch": 0.8485649172639297, + "grad_norm": 0.14040710031986237, + "learning_rate": 1.1129732185776654e-05, + "loss": 0.0171, + "step": 6423 + }, + { + "epoch": 0.8486970307494137, + "grad_norm": 0.1257903128862381, + "learning_rate": 1.1110684829663364e-05, + "loss": 0.0168, + "step": 6424 + }, + { + "epoch": 0.8488291442348977, + "grad_norm": 0.12174368649721146, + "learning_rate": 1.1091652827517296e-05, + "loss": 0.011, + "step": 6425 + }, + { + "epoch": 0.8489612577203818, + "grad_norm": 0.19201435148715973, + "learning_rate": 1.1072636182625851e-05, + "loss": 0.0216, + "step": 6426 + }, + { + "epoch": 0.8490933712058658, + "grad_norm": 0.1880090832710266, + "learning_rate": 1.1053634898273802e-05, + "loss": 0.0252, + "step": 6427 + }, + { + "epoch": 0.8492254846913498, + "grad_norm": 0.15559418499469757, + "learning_rate": 1.1034648977743267e-05, + "loss": 0.0176, + "step": 6428 + }, + { + "epoch": 0.8493575981768339, + "grad_norm": 0.16924457252025604, + "learning_rate": 1.1015678424313713e-05, + "loss": 0.015, + "step": 6429 + }, + { + "epoch": 0.8494897116623179, + "grad_norm": 0.11257937550544739, + "learning_rate": 1.0996723241261942e-05, + "loss": 0.0124, + "step": 6430 + }, + { + "epoch": 0.8496218251478019, + "grad_norm": 0.19125103950500488, + "learning_rate": 1.0977783431862144e-05, + "loss": 0.0184, + "step": 6431 + }, + { + "epoch": 0.849753938633286, + "grad_norm": 0.28629064559936523, + "learning_rate": 1.0958858999385758e-05, + "loss": 0.0297, + "step": 6432 + }, + { + "epoch": 0.84988605211877, + "grad_norm": 0.17692437767982483, + "learning_rate": 1.0939949947101646e-05, + "loss": 0.0095, + "step": 6433 + }, + { + "epoch": 0.850018165604254, + "grad_norm": 0.12471568584442139, + "learning_rate": 1.0921056278276031e-05, + "loss": 0.0106, + "step": 6434 + }, + { + "epoch": 0.8501502790897381, + "grad_norm": 0.1254560351371765, + "learning_rate": 1.0902177996172392e-05, + "loss": 0.0136, + "step": 6435 + }, + { + "epoch": 0.8502823925752221, + "grad_norm": 0.0843840092420578, + "learning_rate": 1.0883315104051617e-05, + "loss": 0.0068, + "step": 6436 + }, + { + "epoch": 0.8504145060607061, + "grad_norm": 0.14639145135879517, + "learning_rate": 1.0864467605171912e-05, + "loss": 0.0176, + "step": 6437 + }, + { + "epoch": 0.8505466195461902, + "grad_norm": 0.204713374376297, + "learning_rate": 1.0845635502788865e-05, + "loss": 0.0178, + "step": 6438 + }, + { + "epoch": 0.8506787330316742, + "grad_norm": 0.15403705835342407, + "learning_rate": 1.0826818800155313e-05, + "loss": 0.0244, + "step": 6439 + }, + { + "epoch": 0.8508108465171582, + "grad_norm": 0.16191795468330383, + "learning_rate": 1.0808017500521528e-05, + "loss": 0.0118, + "step": 6440 + }, + { + "epoch": 0.8509429600026422, + "grad_norm": 0.1546401083469391, + "learning_rate": 1.0789231607135086e-05, + "loss": 0.0149, + "step": 6441 + }, + { + "epoch": 0.8510750734881263, + "grad_norm": 0.09436122328042984, + "learning_rate": 1.0770461123240871e-05, + "loss": 0.0087, + "step": 6442 + }, + { + "epoch": 0.8512071869736103, + "grad_norm": 0.31698280572891235, + "learning_rate": 1.0751706052081146e-05, + "loss": 0.0299, + "step": 6443 + }, + { + "epoch": 0.8513393004590943, + "grad_norm": 0.20286083221435547, + "learning_rate": 1.0732966396895494e-05, + "loss": 0.0232, + "step": 6444 + }, + { + "epoch": 0.8514714139445784, + "grad_norm": 0.20891332626342773, + "learning_rate": 1.0714242160920873e-05, + "loss": 0.0158, + "step": 6445 + }, + { + "epoch": 0.8516035274300624, + "grad_norm": 0.25830474495887756, + "learning_rate": 1.0695533347391507e-05, + "loss": 0.0451, + "step": 6446 + }, + { + "epoch": 0.8517356409155464, + "grad_norm": 0.15606266260147095, + "learning_rate": 1.0676839959538986e-05, + "loss": 0.0165, + "step": 6447 + }, + { + "epoch": 0.8518677544010305, + "grad_norm": 0.1563085913658142, + "learning_rate": 1.0658162000592298e-05, + "loss": 0.0202, + "step": 6448 + }, + { + "epoch": 0.8519998678865145, + "grad_norm": 0.13570018112659454, + "learning_rate": 1.0639499473777648e-05, + "loss": 0.0083, + "step": 6449 + }, + { + "epoch": 0.8521319813719985, + "grad_norm": 0.11977182328701019, + "learning_rate": 1.0620852382318669e-05, + "loss": 0.0142, + "step": 6450 + }, + { + "epoch": 0.8522640948574826, + "grad_norm": 0.15313532948493958, + "learning_rate": 1.0602220729436297e-05, + "loss": 0.0192, + "step": 6451 + }, + { + "epoch": 0.8523962083429666, + "grad_norm": 0.12061281502246857, + "learning_rate": 1.0583604518348821e-05, + "loss": 0.0162, + "step": 6452 + }, + { + "epoch": 0.8525283218284506, + "grad_norm": 0.1367337554693222, + "learning_rate": 1.0565003752271796e-05, + "loss": 0.0148, + "step": 6453 + }, + { + "epoch": 0.8526604353139347, + "grad_norm": 0.14763393998146057, + "learning_rate": 1.0546418434418181e-05, + "loss": 0.0186, + "step": 6454 + }, + { + "epoch": 0.8527925487994187, + "grad_norm": 0.19914959371089935, + "learning_rate": 1.0527848567998266e-05, + "loss": 0.014, + "step": 6455 + }, + { + "epoch": 0.8529246622849027, + "grad_norm": 0.2014371007680893, + "learning_rate": 1.0509294156219608e-05, + "loss": 0.0295, + "step": 6456 + }, + { + "epoch": 0.8530567757703867, + "grad_norm": 0.1509523242712021, + "learning_rate": 1.049075520228715e-05, + "loss": 0.017, + "step": 6457 + }, + { + "epoch": 0.8531888892558708, + "grad_norm": 0.17880138754844666, + "learning_rate": 1.0472231709403157e-05, + "loss": 0.0091, + "step": 6458 + }, + { + "epoch": 0.8533210027413548, + "grad_norm": 0.09584607183933258, + "learning_rate": 1.0453723680767225e-05, + "loss": 0.0125, + "step": 6459 + }, + { + "epoch": 0.8534531162268388, + "grad_norm": 0.20094123482704163, + "learning_rate": 1.0435231119576239e-05, + "loss": 0.0164, + "step": 6460 + }, + { + "epoch": 0.8535852297123229, + "grad_norm": 0.1413847953081131, + "learning_rate": 1.0416754029024467e-05, + "loss": 0.014, + "step": 6461 + }, + { + "epoch": 0.8537173431978069, + "grad_norm": 0.14215205609798431, + "learning_rate": 1.0398292412303478e-05, + "loss": 0.0179, + "step": 6462 + }, + { + "epoch": 0.8538494566832909, + "grad_norm": 0.1754753738641739, + "learning_rate": 1.0379846272602156e-05, + "loss": 0.0107, + "step": 6463 + }, + { + "epoch": 0.853981570168775, + "grad_norm": 0.14488495886325836, + "learning_rate": 1.036141561310674e-05, + "loss": 0.0108, + "step": 6464 + }, + { + "epoch": 0.854113683654259, + "grad_norm": 0.15773916244506836, + "learning_rate": 1.0343000437000783e-05, + "loss": 0.0137, + "step": 6465 + }, + { + "epoch": 0.854245797139743, + "grad_norm": 0.16901792585849762, + "learning_rate": 1.0324600747465174e-05, + "loss": 0.0159, + "step": 6466 + }, + { + "epoch": 0.854377910625227, + "grad_norm": 0.1686817854642868, + "learning_rate": 1.0306216547678082e-05, + "loss": 0.0144, + "step": 6467 + }, + { + "epoch": 0.8545100241107111, + "grad_norm": 0.1480901539325714, + "learning_rate": 1.0287847840815046e-05, + "loss": 0.022, + "step": 6468 + }, + { + "epoch": 0.8546421375961951, + "grad_norm": 0.1318725347518921, + "learning_rate": 1.0269494630048948e-05, + "loss": 0.0152, + "step": 6469 + }, + { + "epoch": 0.8547742510816791, + "grad_norm": 0.10686702281236649, + "learning_rate": 1.025115691854992e-05, + "loss": 0.0097, + "step": 6470 + }, + { + "epoch": 0.8549063645671632, + "grad_norm": 0.15249253809452057, + "learning_rate": 1.0232834709485472e-05, + "loss": 0.0153, + "step": 6471 + }, + { + "epoch": 0.8550384780526472, + "grad_norm": 0.10670942068099976, + "learning_rate": 1.0214528006020429e-05, + "loss": 0.0148, + "step": 6472 + }, + { + "epoch": 0.8551705915381312, + "grad_norm": 0.16914334893226624, + "learning_rate": 1.019623681131695e-05, + "loss": 0.0242, + "step": 6473 + }, + { + "epoch": 0.8553027050236153, + "grad_norm": 0.13639064133167267, + "learning_rate": 1.0177961128534453e-05, + "loss": 0.0144, + "step": 6474 + }, + { + "epoch": 0.8554348185090993, + "grad_norm": 0.1852792203426361, + "learning_rate": 1.0159700960829744e-05, + "loss": 0.0135, + "step": 6475 + }, + { + "epoch": 0.8555669319945833, + "grad_norm": 0.15428900718688965, + "learning_rate": 1.0141456311356945e-05, + "loss": 0.0148, + "step": 6476 + }, + { + "epoch": 0.8556990454800674, + "grad_norm": 0.11808072775602341, + "learning_rate": 1.0123227183267437e-05, + "loss": 0.0152, + "step": 6477 + }, + { + "epoch": 0.8558311589655514, + "grad_norm": 0.11304613947868347, + "learning_rate": 1.0105013579709987e-05, + "loss": 0.0093, + "step": 6478 + }, + { + "epoch": 0.8559632724510354, + "grad_norm": 0.1460811197757721, + "learning_rate": 1.008681550383065e-05, + "loss": 0.0137, + "step": 6479 + }, + { + "epoch": 0.8560953859365195, + "grad_norm": 0.15243810415267944, + "learning_rate": 1.0068632958772829e-05, + "loss": 0.0169, + "step": 6480 + }, + { + "epoch": 0.8562274994220035, + "grad_norm": 0.17395997047424316, + "learning_rate": 1.0050465947677167e-05, + "loss": 0.0165, + "step": 6481 + }, + { + "epoch": 0.8563596129074875, + "grad_norm": 0.14886704087257385, + "learning_rate": 1.0032314473681692e-05, + "loss": 0.0143, + "step": 6482 + }, + { + "epoch": 0.8564917263929716, + "grad_norm": 0.1678609549999237, + "learning_rate": 1.0014178539921782e-05, + "loss": 0.019, + "step": 6483 + }, + { + "epoch": 0.8566238398784556, + "grad_norm": 0.13445799052715302, + "learning_rate": 9.996058149530008e-06, + "loss": 0.0108, + "step": 6484 + }, + { + "epoch": 0.8567559533639396, + "grad_norm": 0.09424842149019241, + "learning_rate": 9.977953305636335e-06, + "loss": 0.0099, + "step": 6485 + }, + { + "epoch": 0.8568880668494236, + "grad_norm": 0.22159160673618317, + "learning_rate": 9.959864011368115e-06, + "loss": 0.0179, + "step": 6486 + }, + { + "epoch": 0.8570201803349077, + "grad_norm": 0.17116421461105347, + "learning_rate": 9.941790269849848e-06, + "loss": 0.0147, + "step": 6487 + }, + { + "epoch": 0.8571522938203917, + "grad_norm": 0.1429840326309204, + "learning_rate": 9.923732084203475e-06, + "loss": 0.0106, + "step": 6488 + }, + { + "epoch": 0.8572844073058757, + "grad_norm": 0.17208239436149597, + "learning_rate": 9.905689457548207e-06, + "loss": 0.0227, + "step": 6489 + }, + { + "epoch": 0.8574165207913598, + "grad_norm": 0.18521353602409363, + "learning_rate": 9.887662393000585e-06, + "loss": 0.0181, + "step": 6490 + }, + { + "epoch": 0.8575486342768438, + "grad_norm": 0.21980801224708557, + "learning_rate": 9.86965089367441e-06, + "loss": 0.0096, + "step": 6491 + }, + { + "epoch": 0.8576807477623278, + "grad_norm": 0.16842244565486908, + "learning_rate": 9.851654962680856e-06, + "loss": 0.012, + "step": 6492 + }, + { + "epoch": 0.8578128612478119, + "grad_norm": 0.12133488804101944, + "learning_rate": 9.833674603128395e-06, + "loss": 0.0128, + "step": 6493 + }, + { + "epoch": 0.8579449747332959, + "grad_norm": 0.13182833790779114, + "learning_rate": 9.815709818122753e-06, + "loss": 0.0104, + "step": 6494 + }, + { + "epoch": 0.8580770882187799, + "grad_norm": 0.1352997124195099, + "learning_rate": 9.797760610767049e-06, + "loss": 0.0132, + "step": 6495 + }, + { + "epoch": 0.858209201704264, + "grad_norm": 0.1745452880859375, + "learning_rate": 9.779826984161666e-06, + "loss": 0.0174, + "step": 6496 + }, + { + "epoch": 0.858341315189748, + "grad_norm": 0.1221051961183548, + "learning_rate": 9.761908941404319e-06, + "loss": 0.0153, + "step": 6497 + }, + { + "epoch": 0.858473428675232, + "grad_norm": 0.191941037774086, + "learning_rate": 9.744006485589974e-06, + "loss": 0.018, + "step": 6498 + }, + { + "epoch": 0.858605542160716, + "grad_norm": 0.1384330838918686, + "learning_rate": 9.726119619810969e-06, + "loss": 0.0136, + "step": 6499 + }, + { + "epoch": 0.8587376556462001, + "grad_norm": 0.19022780656814575, + "learning_rate": 9.708248347156946e-06, + "loss": 0.016, + "step": 6500 + }, + { + "epoch": 0.8588697691316841, + "grad_norm": 0.16361016035079956, + "learning_rate": 9.690392670714787e-06, + "loss": 0.0149, + "step": 6501 + }, + { + "epoch": 0.8590018826171681, + "grad_norm": 0.14385753870010376, + "learning_rate": 9.672552593568751e-06, + "loss": 0.0088, + "step": 6502 + }, + { + "epoch": 0.8591339961026522, + "grad_norm": 0.1275266408920288, + "learning_rate": 9.654728118800383e-06, + "loss": 0.01, + "step": 6503 + }, + { + "epoch": 0.8592661095881362, + "grad_norm": 0.18232011795043945, + "learning_rate": 9.636919249488541e-06, + "loss": 0.0139, + "step": 6504 + }, + { + "epoch": 0.8593982230736202, + "grad_norm": 0.1661888062953949, + "learning_rate": 9.619125988709332e-06, + "loss": 0.0137, + "step": 6505 + }, + { + "epoch": 0.8595303365591043, + "grad_norm": 0.18684124946594238, + "learning_rate": 9.601348339536232e-06, + "loss": 0.0142, + "step": 6506 + }, + { + "epoch": 0.8596624500445883, + "grad_norm": 0.1515057533979416, + "learning_rate": 9.583586305040016e-06, + "loss": 0.0094, + "step": 6507 + }, + { + "epoch": 0.8597945635300723, + "grad_norm": 0.12896044552326202, + "learning_rate": 9.56583988828872e-06, + "loss": 0.0169, + "step": 6508 + }, + { + "epoch": 0.8599266770155564, + "grad_norm": 0.15606920421123505, + "learning_rate": 9.548109092347702e-06, + "loss": 0.0205, + "step": 6509 + }, + { + "epoch": 0.8600587905010404, + "grad_norm": 0.09972091764211655, + "learning_rate": 9.530393920279624e-06, + "loss": 0.0102, + "step": 6510 + }, + { + "epoch": 0.8601909039865244, + "grad_norm": 0.18018166720867157, + "learning_rate": 9.512694375144494e-06, + "loss": 0.017, + "step": 6511 + }, + { + "epoch": 0.8603230174720085, + "grad_norm": 0.2273482084274292, + "learning_rate": 9.49501045999952e-06, + "loss": 0.0141, + "step": 6512 + }, + { + "epoch": 0.8604551309574925, + "grad_norm": 0.19603601098060608, + "learning_rate": 9.477342177899296e-06, + "loss": 0.0202, + "step": 6513 + }, + { + "epoch": 0.8605872444429765, + "grad_norm": 0.0983457863330841, + "learning_rate": 9.4596895318957e-06, + "loss": 0.0124, + "step": 6514 + }, + { + "epoch": 0.8607193579284605, + "grad_norm": 0.23922207951545715, + "learning_rate": 9.44205252503787e-06, + "loss": 0.0164, + "step": 6515 + }, + { + "epoch": 0.8608514714139446, + "grad_norm": 0.2205938994884491, + "learning_rate": 9.42443116037227e-06, + "loss": 0.0194, + "step": 6516 + }, + { + "epoch": 0.8609835848994286, + "grad_norm": 0.17030929028987885, + "learning_rate": 9.406825440942678e-06, + "loss": 0.0208, + "step": 6517 + }, + { + "epoch": 0.8611156983849126, + "grad_norm": 0.26909691095352173, + "learning_rate": 9.389235369790162e-06, + "loss": 0.0169, + "step": 6518 + }, + { + "epoch": 0.8612478118703967, + "grad_norm": 0.24096190929412842, + "learning_rate": 9.371660949953043e-06, + "loss": 0.0194, + "step": 6519 + }, + { + "epoch": 0.8613799253558807, + "grad_norm": 0.21997550129890442, + "learning_rate": 9.354102184466984e-06, + "loss": 0.0173, + "step": 6520 + }, + { + "epoch": 0.8615120388413647, + "grad_norm": 0.20401428639888763, + "learning_rate": 9.33655907636497e-06, + "loss": 0.016, + "step": 6521 + }, + { + "epoch": 0.8616441523268488, + "grad_norm": 0.14194954931735992, + "learning_rate": 9.3190316286772e-06, + "loss": 0.0216, + "step": 6522 + }, + { + "epoch": 0.8617762658123328, + "grad_norm": 0.1661507934331894, + "learning_rate": 9.301519844431217e-06, + "loss": 0.0245, + "step": 6523 + }, + { + "epoch": 0.8619083792978168, + "grad_norm": 0.11384832113981247, + "learning_rate": 9.28402372665188e-06, + "loss": 0.0102, + "step": 6524 + }, + { + "epoch": 0.8620404927833009, + "grad_norm": 0.12378716468811035, + "learning_rate": 9.266543278361318e-06, + "loss": 0.0124, + "step": 6525 + }, + { + "epoch": 0.8621726062687849, + "grad_norm": 0.2553863823413849, + "learning_rate": 9.249078502578913e-06, + "loss": 0.0125, + "step": 6526 + }, + { + "epoch": 0.8623047197542689, + "grad_norm": 0.16368991136550903, + "learning_rate": 9.231629402321406e-06, + "loss": 0.0205, + "step": 6527 + }, + { + "epoch": 0.862436833239753, + "grad_norm": 0.2544208765029907, + "learning_rate": 9.214195980602813e-06, + "loss": 0.0244, + "step": 6528 + }, + { + "epoch": 0.862568946725237, + "grad_norm": 0.1652582883834839, + "learning_rate": 9.196778240434401e-06, + "loss": 0.0096, + "step": 6529 + }, + { + "epoch": 0.862701060210721, + "grad_norm": 0.18706029653549194, + "learning_rate": 9.179376184824785e-06, + "loss": 0.0118, + "step": 6530 + }, + { + "epoch": 0.862833173696205, + "grad_norm": 0.12401444464921951, + "learning_rate": 9.161989816779825e-06, + "loss": 0.0128, + "step": 6531 + }, + { + "epoch": 0.8629652871816891, + "grad_norm": 0.1702900379896164, + "learning_rate": 9.144619139302712e-06, + "loss": 0.0218, + "step": 6532 + }, + { + "epoch": 0.8630974006671731, + "grad_norm": 0.16278304159641266, + "learning_rate": 9.12726415539389e-06, + "loss": 0.0271, + "step": 6533 + }, + { + "epoch": 0.8632295141526571, + "grad_norm": 0.14746056497097015, + "learning_rate": 9.109924868051112e-06, + "loss": 0.0119, + "step": 6534 + }, + { + "epoch": 0.8633616276381412, + "grad_norm": 0.20100457966327667, + "learning_rate": 9.09260128026943e-06, + "loss": 0.027, + "step": 6535 + }, + { + "epoch": 0.8634937411236252, + "grad_norm": 0.16931913793087006, + "learning_rate": 9.07529339504114e-06, + "loss": 0.0189, + "step": 6536 + }, + { + "epoch": 0.8636258546091092, + "grad_norm": 0.16593056917190552, + "learning_rate": 9.058001215355872e-06, + "loss": 0.0229, + "step": 6537 + }, + { + "epoch": 0.8637579680945933, + "grad_norm": 0.2710971534252167, + "learning_rate": 9.040724744200524e-06, + "loss": 0.0207, + "step": 6538 + }, + { + "epoch": 0.8638900815800773, + "grad_norm": 0.13531526923179626, + "learning_rate": 9.02346398455931e-06, + "loss": 0.0135, + "step": 6539 + }, + { + "epoch": 0.8640221950655613, + "grad_norm": 0.1501166820526123, + "learning_rate": 9.006218939413658e-06, + "loss": 0.0159, + "step": 6540 + }, + { + "epoch": 0.8641543085510454, + "grad_norm": 0.16745105385780334, + "learning_rate": 8.98898961174235e-06, + "loss": 0.0154, + "step": 6541 + }, + { + "epoch": 0.8642864220365294, + "grad_norm": 0.20952169597148895, + "learning_rate": 8.971776004521449e-06, + "loss": 0.0147, + "step": 6542 + }, + { + "epoch": 0.8644185355220134, + "grad_norm": 0.17372874915599823, + "learning_rate": 8.954578120724244e-06, + "loss": 0.0174, + "step": 6543 + }, + { + "epoch": 0.8645506490074975, + "grad_norm": 0.193229541182518, + "learning_rate": 8.937395963321338e-06, + "loss": 0.0183, + "step": 6544 + }, + { + "epoch": 0.8646827624929815, + "grad_norm": 0.1361188441514969, + "learning_rate": 8.920229535280712e-06, + "loss": 0.0135, + "step": 6545 + }, + { + "epoch": 0.8648148759784655, + "grad_norm": 0.16256161034107208, + "learning_rate": 8.903078839567457e-06, + "loss": 0.0131, + "step": 6546 + }, + { + "epoch": 0.8649469894639495, + "grad_norm": 0.1579897105693817, + "learning_rate": 8.885943879144076e-06, + "loss": 0.0241, + "step": 6547 + }, + { + "epoch": 0.8650791029494336, + "grad_norm": 0.14809980988502502, + "learning_rate": 8.868824656970332e-06, + "loss": 0.011, + "step": 6548 + }, + { + "epoch": 0.8652112164349176, + "grad_norm": 0.17065958678722382, + "learning_rate": 8.851721176003192e-06, + "loss": 0.0158, + "step": 6549 + }, + { + "epoch": 0.8653433299204016, + "grad_norm": 0.13681527972221375, + "learning_rate": 8.834633439197004e-06, + "loss": 0.0125, + "step": 6550 + }, + { + "epoch": 0.8654754434058857, + "grad_norm": 0.13763427734375, + "learning_rate": 8.817561449503343e-06, + "loss": 0.0106, + "step": 6551 + }, + { + "epoch": 0.8656075568913697, + "grad_norm": 0.10495647042989731, + "learning_rate": 8.80050520987109e-06, + "loss": 0.0106, + "step": 6552 + }, + { + "epoch": 0.8657396703768537, + "grad_norm": 0.11058825254440308, + "learning_rate": 8.783464723246371e-06, + "loss": 0.0094, + "step": 6553 + }, + { + "epoch": 0.8658717838623378, + "grad_norm": 0.2615870237350464, + "learning_rate": 8.766439992572618e-06, + "loss": 0.0122, + "step": 6554 + }, + { + "epoch": 0.8660038973478218, + "grad_norm": 0.17798537015914917, + "learning_rate": 8.749431020790555e-06, + "loss": 0.0162, + "step": 6555 + }, + { + "epoch": 0.8661360108333058, + "grad_norm": 0.2039807289838791, + "learning_rate": 8.732437810838124e-06, + "loss": 0.0294, + "step": 6556 + }, + { + "epoch": 0.8662681243187899, + "grad_norm": 0.11167783290147781, + "learning_rate": 8.715460365650607e-06, + "loss": 0.0065, + "step": 6557 + }, + { + "epoch": 0.8664002378042739, + "grad_norm": 0.18214501440525055, + "learning_rate": 8.69849868816055e-06, + "loss": 0.0084, + "step": 6558 + }, + { + "epoch": 0.8665323512897579, + "grad_norm": 0.17260056734085083, + "learning_rate": 8.681552781297763e-06, + "loss": 0.0162, + "step": 6559 + }, + { + "epoch": 0.866664464775242, + "grad_norm": 0.16543985903263092, + "learning_rate": 8.664622647989317e-06, + "loss": 0.0102, + "step": 6560 + }, + { + "epoch": 0.866796578260726, + "grad_norm": 0.23392952978610992, + "learning_rate": 8.647708291159583e-06, + "loss": 0.021, + "step": 6561 + }, + { + "epoch": 0.86692869174621, + "grad_norm": 0.11497484147548676, + "learning_rate": 8.630809713730226e-06, + "loss": 0.0075, + "step": 6562 + }, + { + "epoch": 0.867060805231694, + "grad_norm": 0.1542213410139084, + "learning_rate": 8.613926918620108e-06, + "loss": 0.0125, + "step": 6563 + }, + { + "epoch": 0.8671929187171781, + "grad_norm": 0.16790266335010529, + "learning_rate": 8.597059908745453e-06, + "loss": 0.0198, + "step": 6564 + }, + { + "epoch": 0.8673250322026621, + "grad_norm": 0.10018698126077652, + "learning_rate": 8.580208687019709e-06, + "loss": 0.0068, + "step": 6565 + }, + { + "epoch": 0.8674571456881461, + "grad_norm": 0.13718554377555847, + "learning_rate": 8.563373256353635e-06, + "loss": 0.0105, + "step": 6566 + }, + { + "epoch": 0.8675892591736302, + "grad_norm": 0.1705043762922287, + "learning_rate": 8.546553619655196e-06, + "loss": 0.017, + "step": 6567 + }, + { + "epoch": 0.8677213726591142, + "grad_norm": 0.11266963928937912, + "learning_rate": 8.529749779829688e-06, + "loss": 0.0115, + "step": 6568 + }, + { + "epoch": 0.8678534861445982, + "grad_norm": 0.1814977079629898, + "learning_rate": 8.512961739779678e-06, + "loss": 0.0192, + "step": 6569 + }, + { + "epoch": 0.8679855996300823, + "grad_norm": 0.2693195343017578, + "learning_rate": 8.49618950240495e-06, + "loss": 0.0232, + "step": 6570 + }, + { + "epoch": 0.8681177131155663, + "grad_norm": 0.14247596263885498, + "learning_rate": 8.479433070602616e-06, + "loss": 0.0167, + "step": 6571 + }, + { + "epoch": 0.8682498266010503, + "grad_norm": 0.14830470085144043, + "learning_rate": 8.46269244726704e-06, + "loss": 0.0107, + "step": 6572 + }, + { + "epoch": 0.8683819400865344, + "grad_norm": 0.14798453450202942, + "learning_rate": 8.445967635289854e-06, + "loss": 0.0204, + "step": 6573 + }, + { + "epoch": 0.8685140535720184, + "grad_norm": 0.24593189358711243, + "learning_rate": 8.429258637559933e-06, + "loss": 0.0119, + "step": 6574 + }, + { + "epoch": 0.8686461670575024, + "grad_norm": 0.15432271361351013, + "learning_rate": 8.41256545696346e-06, + "loss": 0.0092, + "step": 6575 + }, + { + "epoch": 0.8687782805429864, + "grad_norm": 0.12995439767837524, + "learning_rate": 8.395888096383897e-06, + "loss": 0.0172, + "step": 6576 + }, + { + "epoch": 0.8689103940284705, + "grad_norm": 0.10527817904949188, + "learning_rate": 8.37922655870189e-06, + "loss": 0.0091, + "step": 6577 + }, + { + "epoch": 0.8690425075139545, + "grad_norm": 0.16179177165031433, + "learning_rate": 8.362580846795443e-06, + "loss": 0.0131, + "step": 6578 + }, + { + "epoch": 0.8691746209994385, + "grad_norm": 0.27898624539375305, + "learning_rate": 8.345950963539772e-06, + "loss": 0.0204, + "step": 6579 + }, + { + "epoch": 0.8693067344849226, + "grad_norm": 0.156561017036438, + "learning_rate": 8.329336911807417e-06, + "loss": 0.0145, + "step": 6580 + }, + { + "epoch": 0.8694388479704066, + "grad_norm": 0.19675347208976746, + "learning_rate": 8.312738694468103e-06, + "loss": 0.0241, + "step": 6581 + }, + { + "epoch": 0.8695709614558906, + "grad_norm": 0.1354101300239563, + "learning_rate": 8.29615631438888e-06, + "loss": 0.014, + "step": 6582 + }, + { + "epoch": 0.8697030749413747, + "grad_norm": 0.11665624380111694, + "learning_rate": 8.27958977443406e-06, + "loss": 0.0136, + "step": 6583 + }, + { + "epoch": 0.8698351884268587, + "grad_norm": 0.15645678341388702, + "learning_rate": 8.263039077465163e-06, + "loss": 0.0188, + "step": 6584 + }, + { + "epoch": 0.8699673019123427, + "grad_norm": 0.19669926166534424, + "learning_rate": 8.246504226341035e-06, + "loss": 0.0206, + "step": 6585 + }, + { + "epoch": 0.8700994153978268, + "grad_norm": 0.09903514385223389, + "learning_rate": 8.229985223917757e-06, + "loss": 0.009, + "step": 6586 + }, + { + "epoch": 0.8702315288833108, + "grad_norm": 0.35547196865081787, + "learning_rate": 8.213482073048707e-06, + "loss": 0.0254, + "step": 6587 + }, + { + "epoch": 0.8703636423687948, + "grad_norm": 0.22408826649188995, + "learning_rate": 8.196994776584455e-06, + "loss": 0.0136, + "step": 6588 + }, + { + "epoch": 0.8704957558542789, + "grad_norm": 0.16860933601856232, + "learning_rate": 8.180523337372881e-06, + "loss": 0.0177, + "step": 6589 + }, + { + "epoch": 0.8706278693397629, + "grad_norm": 0.14488452672958374, + "learning_rate": 8.164067758259153e-06, + "loss": 0.0146, + "step": 6590 + }, + { + "epoch": 0.8707599828252469, + "grad_norm": 0.1677248477935791, + "learning_rate": 8.14762804208562e-06, + "loss": 0.0119, + "step": 6591 + }, + { + "epoch": 0.870892096310731, + "grad_norm": 0.13122430443763733, + "learning_rate": 8.131204191691954e-06, + "loss": 0.0136, + "step": 6592 + }, + { + "epoch": 0.871024209796215, + "grad_norm": 0.23603014647960663, + "learning_rate": 8.114796209915066e-06, + "loss": 0.0074, + "step": 6593 + }, + { + "epoch": 0.871156323281699, + "grad_norm": 0.10830773413181305, + "learning_rate": 8.098404099589141e-06, + "loss": 0.0148, + "step": 6594 + }, + { + "epoch": 0.871288436767183, + "grad_norm": 0.12386782467365265, + "learning_rate": 8.082027863545594e-06, + "loss": 0.0145, + "step": 6595 + }, + { + "epoch": 0.8714205502526671, + "grad_norm": 0.0879635289311409, + "learning_rate": 8.065667504613107e-06, + "loss": 0.0085, + "step": 6596 + }, + { + "epoch": 0.8715526637381511, + "grad_norm": 0.13344678282737732, + "learning_rate": 8.049323025617662e-06, + "loss": 0.0155, + "step": 6597 + }, + { + "epoch": 0.8716847772236351, + "grad_norm": 0.1500246524810791, + "learning_rate": 8.032994429382412e-06, + "loss": 0.0184, + "step": 6598 + }, + { + "epoch": 0.8718168907091192, + "grad_norm": 0.16355468332767487, + "learning_rate": 8.016681718727848e-06, + "loss": 0.0156, + "step": 6599 + }, + { + "epoch": 0.8719490041946032, + "grad_norm": 0.17966286838054657, + "learning_rate": 8.00038489647168e-06, + "loss": 0.0198, + "step": 6600 + }, + { + "epoch": 0.8720811176800872, + "grad_norm": 0.18434107303619385, + "learning_rate": 7.984103965428902e-06, + "loss": 0.0268, + "step": 6601 + }, + { + "epoch": 0.8722132311655713, + "grad_norm": 0.15422309935092926, + "learning_rate": 7.967838928411698e-06, + "loss": 0.0159, + "step": 6602 + }, + { + "epoch": 0.8723453446510553, + "grad_norm": 0.11695985496044159, + "learning_rate": 7.951589788229542e-06, + "loss": 0.0225, + "step": 6603 + }, + { + "epoch": 0.8724774581365393, + "grad_norm": 0.11366904526948929, + "learning_rate": 7.935356547689244e-06, + "loss": 0.0086, + "step": 6604 + }, + { + "epoch": 0.8726095716220234, + "grad_norm": 0.08670374006032944, + "learning_rate": 7.919139209594717e-06, + "loss": 0.0085, + "step": 6605 + }, + { + "epoch": 0.8727416851075074, + "grad_norm": 0.1729203164577484, + "learning_rate": 7.902937776747232e-06, + "loss": 0.013, + "step": 6606 + }, + { + "epoch": 0.8728737985929914, + "grad_norm": 0.12578381597995758, + "learning_rate": 7.886752251945306e-06, + "loss": 0.0067, + "step": 6607 + }, + { + "epoch": 0.8730059120784754, + "grad_norm": 0.11218776553869247, + "learning_rate": 7.870582637984636e-06, + "loss": 0.0127, + "step": 6608 + }, + { + "epoch": 0.8731380255639595, + "grad_norm": 0.2189989537000656, + "learning_rate": 7.854428937658253e-06, + "loss": 0.0173, + "step": 6609 + }, + { + "epoch": 0.8732701390494435, + "grad_norm": 0.15799090266227722, + "learning_rate": 7.838291153756395e-06, + "loss": 0.0095, + "step": 6610 + }, + { + "epoch": 0.8734022525349275, + "grad_norm": 0.1464925855398178, + "learning_rate": 7.822169289066583e-06, + "loss": 0.0181, + "step": 6611 + }, + { + "epoch": 0.8735343660204116, + "grad_norm": 0.17121390998363495, + "learning_rate": 7.80606334637355e-06, + "loss": 0.0152, + "step": 6612 + }, + { + "epoch": 0.8736664795058956, + "grad_norm": 0.13004009425640106, + "learning_rate": 7.789973328459288e-06, + "loss": 0.0174, + "step": 6613 + }, + { + "epoch": 0.8737985929913796, + "grad_norm": 0.08923876285552979, + "learning_rate": 7.773899238103066e-06, + "loss": 0.011, + "step": 6614 + }, + { + "epoch": 0.8739307064768637, + "grad_norm": 0.21297034621238708, + "learning_rate": 7.757841078081373e-06, + "loss": 0.0254, + "step": 6615 + }, + { + "epoch": 0.8740628199623477, + "grad_norm": 0.11785417795181274, + "learning_rate": 7.741798851167947e-06, + "loss": 0.0124, + "step": 6616 + }, + { + "epoch": 0.8741949334478317, + "grad_norm": 0.2768336534500122, + "learning_rate": 7.725772560133792e-06, + "loss": 0.0245, + "step": 6617 + }, + { + "epoch": 0.8743270469333158, + "grad_norm": 0.15279079973697662, + "learning_rate": 7.709762207747173e-06, + "loss": 0.0181, + "step": 6618 + }, + { + "epoch": 0.8744591604187998, + "grad_norm": 0.15320299565792084, + "learning_rate": 7.693767796773543e-06, + "loss": 0.0192, + "step": 6619 + }, + { + "epoch": 0.8745912739042838, + "grad_norm": 0.1632274091243744, + "learning_rate": 7.677789329975648e-06, + "loss": 0.0165, + "step": 6620 + }, + { + "epoch": 0.8747233873897678, + "grad_norm": 0.16090667247772217, + "learning_rate": 7.661826810113493e-06, + "loss": 0.0141, + "step": 6621 + }, + { + "epoch": 0.8748555008752519, + "grad_norm": 0.16104121506214142, + "learning_rate": 7.645880239944259e-06, + "loss": 0.0167, + "step": 6622 + }, + { + "epoch": 0.8749876143607359, + "grad_norm": 0.27421918511390686, + "learning_rate": 7.629949622222443e-06, + "loss": 0.0249, + "step": 6623 + }, + { + "epoch": 0.8751197278462199, + "grad_norm": 0.17468570172786713, + "learning_rate": 7.6140349596997675e-06, + "loss": 0.0156, + "step": 6624 + }, + { + "epoch": 0.875251841331704, + "grad_norm": 0.11296243220567703, + "learning_rate": 7.59813625512521e-06, + "loss": 0.0139, + "step": 6625 + }, + { + "epoch": 0.875383954817188, + "grad_norm": 0.1640525609254837, + "learning_rate": 7.58225351124493e-06, + "loss": 0.0235, + "step": 6626 + }, + { + "epoch": 0.875516068302672, + "grad_norm": 0.09715352207422256, + "learning_rate": 7.566386730802388e-06, + "loss": 0.0069, + "step": 6627 + }, + { + "epoch": 0.8756481817881561, + "grad_norm": 0.136221244931221, + "learning_rate": 7.550535916538304e-06, + "loss": 0.0143, + "step": 6628 + }, + { + "epoch": 0.8757802952736401, + "grad_norm": 0.17743642628192902, + "learning_rate": 7.534701071190575e-06, + "loss": 0.0116, + "step": 6629 + }, + { + "epoch": 0.8759124087591241, + "grad_norm": 0.21691836416721344, + "learning_rate": 7.518882197494382e-06, + "loss": 0.0194, + "step": 6630 + }, + { + "epoch": 0.8760445222446082, + "grad_norm": 0.1735524982213974, + "learning_rate": 7.503079298182147e-06, + "loss": 0.0136, + "step": 6631 + }, + { + "epoch": 0.8761766357300922, + "grad_norm": 0.2355421930551529, + "learning_rate": 7.487292375983545e-06, + "loss": 0.018, + "step": 6632 + }, + { + "epoch": 0.8763087492155762, + "grad_norm": 0.21247804164886475, + "learning_rate": 7.471521433625428e-06, + "loss": 0.0246, + "step": 6633 + }, + { + "epoch": 0.8764408627010603, + "grad_norm": 0.15796038508415222, + "learning_rate": 7.455766473831949e-06, + "loss": 0.0128, + "step": 6634 + }, + { + "epoch": 0.8765729761865443, + "grad_norm": 0.16143396496772766, + "learning_rate": 7.440027499324509e-06, + "loss": 0.0141, + "step": 6635 + }, + { + "epoch": 0.8767050896720283, + "grad_norm": 0.2800748646259308, + "learning_rate": 7.424304512821678e-06, + "loss": 0.0109, + "step": 6636 + }, + { + "epoch": 0.8768372031575123, + "grad_norm": 0.16707810759544373, + "learning_rate": 7.4085975170393395e-06, + "loss": 0.0178, + "step": 6637 + }, + { + "epoch": 0.8769693166429964, + "grad_norm": 0.1927337348461151, + "learning_rate": 7.392906514690567e-06, + "loss": 0.0177, + "step": 6638 + }, + { + "epoch": 0.8771014301284804, + "grad_norm": 0.10031986981630325, + "learning_rate": 7.377231508485705e-06, + "loss": 0.0071, + "step": 6639 + }, + { + "epoch": 0.8772335436139644, + "grad_norm": 0.15559269487857819, + "learning_rate": 7.3615725011322964e-06, + "loss": 0.0198, + "step": 6640 + }, + { + "epoch": 0.8773656570994485, + "grad_norm": 0.142380028963089, + "learning_rate": 7.345929495335158e-06, + "loss": 0.0195, + "step": 6641 + }, + { + "epoch": 0.8774977705849325, + "grad_norm": 0.21019425988197327, + "learning_rate": 7.330302493796326e-06, + "loss": 0.0195, + "step": 6642 + }, + { + "epoch": 0.8776298840704165, + "grad_norm": 0.09215234220027924, + "learning_rate": 7.314691499215054e-06, + "loss": 0.0076, + "step": 6643 + }, + { + "epoch": 0.8777619975559006, + "grad_norm": 0.13140788674354553, + "learning_rate": 7.299096514287862e-06, + "loss": 0.009, + "step": 6644 + }, + { + "epoch": 0.8778941110413846, + "grad_norm": 0.2494693100452423, + "learning_rate": 7.2835175417084954e-06, + "loss": 0.0176, + "step": 6645 + }, + { + "epoch": 0.8780262245268686, + "grad_norm": 0.21612705290317535, + "learning_rate": 7.2679545841679464e-06, + "loss": 0.0178, + "step": 6646 + }, + { + "epoch": 0.8781583380123527, + "grad_norm": 0.20610679686069489, + "learning_rate": 7.252407644354397e-06, + "loss": 0.0235, + "step": 6647 + }, + { + "epoch": 0.8782904514978367, + "grad_norm": 0.19679805636405945, + "learning_rate": 7.23687672495329e-06, + "loss": 0.0362, + "step": 6648 + }, + { + "epoch": 0.8784225649833207, + "grad_norm": 0.12437400966882706, + "learning_rate": 7.221361828647333e-06, + "loss": 0.0109, + "step": 6649 + }, + { + "epoch": 0.8785546784688048, + "grad_norm": 0.11433594673871994, + "learning_rate": 7.205862958116394e-06, + "loss": 0.0098, + "step": 6650 + }, + { + "epoch": 0.8786867919542888, + "grad_norm": 0.1080607920885086, + "learning_rate": 7.190380116037631e-06, + "loss": 0.0108, + "step": 6651 + }, + { + "epoch": 0.8788189054397728, + "grad_norm": 0.14211799204349518, + "learning_rate": 7.174913305085406e-06, + "loss": 0.0149, + "step": 6652 + }, + { + "epoch": 0.8789510189252568, + "grad_norm": 0.22237631678581238, + "learning_rate": 7.15946252793136e-06, + "loss": 0.0226, + "step": 6653 + }, + { + "epoch": 0.8790831324107409, + "grad_norm": 0.18012912571430206, + "learning_rate": 7.144027787244289e-06, + "loss": 0.0103, + "step": 6654 + }, + { + "epoch": 0.8792152458962249, + "grad_norm": 0.11783156543970108, + "learning_rate": 7.128609085690252e-06, + "loss": 0.0132, + "step": 6655 + }, + { + "epoch": 0.8793473593817089, + "grad_norm": 0.12326275557279587, + "learning_rate": 7.113206425932573e-06, + "loss": 0.0118, + "step": 6656 + }, + { + "epoch": 0.879479472867193, + "grad_norm": 0.16605113446712494, + "learning_rate": 7.097819810631734e-06, + "loss": 0.0138, + "step": 6657 + }, + { + "epoch": 0.879611586352677, + "grad_norm": 0.13420824706554413, + "learning_rate": 7.08244924244551e-06, + "loss": 0.0188, + "step": 6658 + }, + { + "epoch": 0.879743699838161, + "grad_norm": 0.16448046267032623, + "learning_rate": 7.0670947240288775e-06, + "loss": 0.0241, + "step": 6659 + }, + { + "epoch": 0.8798758133236451, + "grad_norm": 0.1652592271566391, + "learning_rate": 7.051756258034048e-06, + "loss": 0.0233, + "step": 6660 + }, + { + "epoch": 0.8800079268091291, + "grad_norm": 0.14599083364009857, + "learning_rate": 7.036433847110424e-06, + "loss": 0.0175, + "step": 6661 + }, + { + "epoch": 0.8801400402946131, + "grad_norm": 0.14731009304523468, + "learning_rate": 7.021127493904711e-06, + "loss": 0.014, + "step": 6662 + }, + { + "epoch": 0.8802721537800972, + "grad_norm": 0.18450333178043365, + "learning_rate": 7.005837201060761e-06, + "loss": 0.0166, + "step": 6663 + }, + { + "epoch": 0.8804042672655812, + "grad_norm": 0.17317119240760803, + "learning_rate": 6.990562971219694e-06, + "loss": 0.0244, + "step": 6664 + }, + { + "epoch": 0.8805363807510652, + "grad_norm": 0.13952180743217468, + "learning_rate": 6.9753048070198554e-06, + "loss": 0.0158, + "step": 6665 + }, + { + "epoch": 0.8806684942365492, + "grad_norm": 0.09438351541757584, + "learning_rate": 6.9600627110968155e-06, + "loss": 0.011, + "step": 6666 + }, + { + "epoch": 0.8808006077220333, + "grad_norm": 0.1911882907152176, + "learning_rate": 6.944836686083334e-06, + "loss": 0.023, + "step": 6667 + }, + { + "epoch": 0.8809327212075173, + "grad_norm": 0.12660865485668182, + "learning_rate": 6.9296267346094405e-06, + "loss": 0.0169, + "step": 6668 + }, + { + "epoch": 0.8810648346930013, + "grad_norm": 0.15999475121498108, + "learning_rate": 6.914432859302377e-06, + "loss": 0.0099, + "step": 6669 + }, + { + "epoch": 0.8811969481784854, + "grad_norm": 0.2235376089811325, + "learning_rate": 6.89925506278658e-06, + "loss": 0.0259, + "step": 6670 + }, + { + "epoch": 0.8813290616639694, + "grad_norm": 0.10345666855573654, + "learning_rate": 6.88409334768374e-06, + "loss": 0.0071, + "step": 6671 + }, + { + "epoch": 0.8814611751494534, + "grad_norm": 0.1981154978275299, + "learning_rate": 6.868947716612762e-06, + "loss": 0.0148, + "step": 6672 + }, + { + "epoch": 0.8815932886349375, + "grad_norm": 0.2624906897544861, + "learning_rate": 6.853818172189774e-06, + "loss": 0.0123, + "step": 6673 + }, + { + "epoch": 0.8817254021204214, + "grad_norm": 0.1436055451631546, + "learning_rate": 6.838704717028111e-06, + "loss": 0.0123, + "step": 6674 + }, + { + "epoch": 0.8818575156059054, + "grad_norm": 0.11317655444145203, + "learning_rate": 6.8236073537383485e-06, + "loss": 0.0087, + "step": 6675 + }, + { + "epoch": 0.8819896290913894, + "grad_norm": 0.19594664871692657, + "learning_rate": 6.80852608492828e-06, + "loss": 0.0171, + "step": 6676 + }, + { + "epoch": 0.8821217425768735, + "grad_norm": 0.16084632277488708, + "learning_rate": 6.7934609132028985e-06, + "loss": 0.0132, + "step": 6677 + }, + { + "epoch": 0.8822538560623575, + "grad_norm": 0.15218578279018402, + "learning_rate": 6.778411841164423e-06, + "loss": 0.0145, + "step": 6678 + }, + { + "epoch": 0.8823859695478415, + "grad_norm": 0.130471333861351, + "learning_rate": 6.763378871412318e-06, + "loss": 0.0075, + "step": 6679 + }, + { + "epoch": 0.8825180830333256, + "grad_norm": 0.1638917773962021, + "learning_rate": 6.748362006543263e-06, + "loss": 0.0169, + "step": 6680 + }, + { + "epoch": 0.8826501965188096, + "grad_norm": 0.18149948120117188, + "learning_rate": 6.733361249151104e-06, + "loss": 0.0142, + "step": 6681 + }, + { + "epoch": 0.8827823100042936, + "grad_norm": 0.3009966313838959, + "learning_rate": 6.718376601826948e-06, + "loss": 0.021, + "step": 6682 + }, + { + "epoch": 0.8829144234897777, + "grad_norm": 0.12758490443229675, + "learning_rate": 6.7034080671591446e-06, + "loss": 0.0109, + "step": 6683 + }, + { + "epoch": 0.8830465369752617, + "grad_norm": 0.0941629558801651, + "learning_rate": 6.6884556477331936e-06, + "loss": 0.0083, + "step": 6684 + }, + { + "epoch": 0.8831786504607457, + "grad_norm": 0.1017313301563263, + "learning_rate": 6.673519346131851e-06, + "loss": 0.0078, + "step": 6685 + }, + { + "epoch": 0.8833107639462298, + "grad_norm": 0.16601882874965668, + "learning_rate": 6.658599164935097e-06, + "loss": 0.0175, + "step": 6686 + }, + { + "epoch": 0.8834428774317138, + "grad_norm": 0.1425500214099884, + "learning_rate": 6.6436951067201155e-06, + "loss": 0.0144, + "step": 6687 + }, + { + "epoch": 0.8835749909171978, + "grad_norm": 0.21121959388256073, + "learning_rate": 6.628807174061291e-06, + "loss": 0.0183, + "step": 6688 + }, + { + "epoch": 0.8837071044026819, + "grad_norm": 0.15938931703567505, + "learning_rate": 6.613935369530233e-06, + "loss": 0.0129, + "step": 6689 + }, + { + "epoch": 0.8838392178881659, + "grad_norm": 0.11776001751422882, + "learning_rate": 6.5990796956957865e-06, + "loss": 0.0142, + "step": 6690 + }, + { + "epoch": 0.8839713313736499, + "grad_norm": 0.12473277002573013, + "learning_rate": 6.584240155123977e-06, + "loss": 0.0069, + "step": 6691 + }, + { + "epoch": 0.884103444859134, + "grad_norm": 0.14708156883716583, + "learning_rate": 6.569416750378055e-06, + "loss": 0.0218, + "step": 6692 + }, + { + "epoch": 0.884235558344618, + "grad_norm": 0.11134511977434158, + "learning_rate": 6.554609484018492e-06, + "loss": 0.0072, + "step": 6693 + }, + { + "epoch": 0.884367671830102, + "grad_norm": 0.13585522770881653, + "learning_rate": 6.5398183586029786e-06, + "loss": 0.0174, + "step": 6694 + }, + { + "epoch": 0.884499785315586, + "grad_norm": 0.16885550320148468, + "learning_rate": 6.52504337668638e-06, + "loss": 0.0117, + "step": 6695 + }, + { + "epoch": 0.8846318988010701, + "grad_norm": 0.15060658752918243, + "learning_rate": 6.51028454082081e-06, + "loss": 0.0249, + "step": 6696 + }, + { + "epoch": 0.8847640122865541, + "grad_norm": 0.14600324630737305, + "learning_rate": 6.495541853555609e-06, + "loss": 0.0257, + "step": 6697 + }, + { + "epoch": 0.8848961257720381, + "grad_norm": 0.12542365491390228, + "learning_rate": 6.48081531743725e-06, + "loss": 0.0152, + "step": 6698 + }, + { + "epoch": 0.8850282392575222, + "grad_norm": 0.1343553513288498, + "learning_rate": 6.466104935009487e-06, + "loss": 0.0168, + "step": 6699 + }, + { + "epoch": 0.8851603527430062, + "grad_norm": 0.1018458679318428, + "learning_rate": 6.451410708813277e-06, + "loss": 0.0088, + "step": 6700 + }, + { + "epoch": 0.8852924662284902, + "grad_norm": 0.15615858137607574, + "learning_rate": 6.436732641386778e-06, + "loss": 0.0205, + "step": 6701 + }, + { + "epoch": 0.8854245797139743, + "grad_norm": 0.1561017632484436, + "learning_rate": 6.422070735265318e-06, + "loss": 0.0136, + "step": 6702 + }, + { + "epoch": 0.8855566931994583, + "grad_norm": 0.2412395179271698, + "learning_rate": 6.4074249929814815e-06, + "loss": 0.0205, + "step": 6703 + }, + { + "epoch": 0.8856888066849423, + "grad_norm": 0.17810501158237457, + "learning_rate": 6.392795417065078e-06, + "loss": 0.0147, + "step": 6704 + }, + { + "epoch": 0.8858209201704264, + "grad_norm": 0.16065442562103271, + "learning_rate": 6.378182010043044e-06, + "loss": 0.0131, + "step": 6705 + }, + { + "epoch": 0.8859530336559104, + "grad_norm": 0.12913085520267487, + "learning_rate": 6.363584774439601e-06, + "loss": 0.0123, + "step": 6706 + }, + { + "epoch": 0.8860851471413944, + "grad_norm": 0.3279435336589813, + "learning_rate": 6.349003712776136e-06, + "loss": 0.0271, + "step": 6707 + }, + { + "epoch": 0.8862172606268784, + "grad_norm": 0.14449019730091095, + "learning_rate": 6.3344388275712875e-06, + "loss": 0.0116, + "step": 6708 + }, + { + "epoch": 0.8863493741123625, + "grad_norm": 0.16083396971225739, + "learning_rate": 6.31989012134081e-06, + "loss": 0.014, + "step": 6709 + }, + { + "epoch": 0.8864814875978465, + "grad_norm": 0.25596585869789124, + "learning_rate": 6.305357596597761e-06, + "loss": 0.0225, + "step": 6710 + }, + { + "epoch": 0.8866136010833305, + "grad_norm": 0.18523181974887848, + "learning_rate": 6.290841255852375e-06, + "loss": 0.0165, + "step": 6711 + }, + { + "epoch": 0.8867457145688146, + "grad_norm": 0.19872015714645386, + "learning_rate": 6.2763411016120265e-06, + "loss": 0.0179, + "step": 6712 + }, + { + "epoch": 0.8868778280542986, + "grad_norm": 0.10085300356149673, + "learning_rate": 6.261857136381388e-06, + "loss": 0.0103, + "step": 6713 + }, + { + "epoch": 0.8870099415397826, + "grad_norm": 0.13313262164592743, + "learning_rate": 6.24738936266227e-06, + "loss": 0.0076, + "step": 6714 + }, + { + "epoch": 0.8871420550252667, + "grad_norm": 0.1402639001607895, + "learning_rate": 6.232937782953752e-06, + "loss": 0.0135, + "step": 6715 + }, + { + "epoch": 0.8872741685107507, + "grad_norm": 0.14339350163936615, + "learning_rate": 6.218502399752013e-06, + "loss": 0.0112, + "step": 6716 + }, + { + "epoch": 0.8874062819962347, + "grad_norm": 0.1772618293762207, + "learning_rate": 6.204083215550538e-06, + "loss": 0.0209, + "step": 6717 + }, + { + "epoch": 0.8875383954817188, + "grad_norm": 0.18298009037971497, + "learning_rate": 6.1896802328399675e-06, + "loss": 0.0197, + "step": 6718 + }, + { + "epoch": 0.8876705089672028, + "grad_norm": 0.1741136759519577, + "learning_rate": 6.175293454108122e-06, + "loss": 0.0159, + "step": 6719 + }, + { + "epoch": 0.8878026224526868, + "grad_norm": 0.1649044305086136, + "learning_rate": 6.1609228818400585e-06, + "loss": 0.0106, + "step": 6720 + }, + { + "epoch": 0.8879347359381708, + "grad_norm": 0.10933619737625122, + "learning_rate": 6.146568518518059e-06, + "loss": 0.0154, + "step": 6721 + }, + { + "epoch": 0.8880668494236549, + "grad_norm": 0.09617764502763748, + "learning_rate": 6.132230366621527e-06, + "loss": 0.0132, + "step": 6722 + }, + { + "epoch": 0.8881989629091389, + "grad_norm": 0.15637919306755066, + "learning_rate": 6.117908428627139e-06, + "loss": 0.0156, + "step": 6723 + }, + { + "epoch": 0.8883310763946229, + "grad_norm": 0.18323490023612976, + "learning_rate": 6.103602707008726e-06, + "loss": 0.0091, + "step": 6724 + }, + { + "epoch": 0.888463189880107, + "grad_norm": 0.13832803070545197, + "learning_rate": 6.089313204237346e-06, + "loss": 0.0098, + "step": 6725 + }, + { + "epoch": 0.888595303365591, + "grad_norm": 0.22622539103031158, + "learning_rate": 6.075039922781234e-06, + "loss": 0.0232, + "step": 6726 + }, + { + "epoch": 0.888727416851075, + "grad_norm": 0.14251096546649933, + "learning_rate": 6.060782865105819e-06, + "loss": 0.0076, + "step": 6727 + }, + { + "epoch": 0.8888595303365591, + "grad_norm": 0.11312871426343918, + "learning_rate": 6.046542033673786e-06, + "loss": 0.0122, + "step": 6728 + }, + { + "epoch": 0.8889916438220431, + "grad_norm": 0.14361023902893066, + "learning_rate": 6.032317430944923e-06, + "loss": 0.0167, + "step": 6729 + }, + { + "epoch": 0.8891237573075271, + "grad_norm": 0.18894532322883606, + "learning_rate": 6.018109059376287e-06, + "loss": 0.0246, + "step": 6730 + }, + { + "epoch": 0.8892558707930112, + "grad_norm": 0.13724221289157867, + "learning_rate": 6.003916921422115e-06, + "loss": 0.0125, + "step": 6731 + }, + { + "epoch": 0.8893879842784952, + "grad_norm": 0.42069005966186523, + "learning_rate": 5.989741019533812e-06, + "loss": 0.0158, + "step": 6732 + }, + { + "epoch": 0.8895200977639792, + "grad_norm": 0.14127984642982483, + "learning_rate": 5.975581356160009e-06, + "loss": 0.0119, + "step": 6733 + }, + { + "epoch": 0.8896522112494633, + "grad_norm": 0.1737101823091507, + "learning_rate": 5.961437933746539e-06, + "loss": 0.0127, + "step": 6734 + }, + { + "epoch": 0.8897843247349473, + "grad_norm": 0.28091874718666077, + "learning_rate": 5.947310754736402e-06, + "loss": 0.0143, + "step": 6735 + }, + { + "epoch": 0.8899164382204313, + "grad_norm": 0.16934643685817719, + "learning_rate": 5.933199821569801e-06, + "loss": 0.022, + "step": 6736 + }, + { + "epoch": 0.8900485517059153, + "grad_norm": 0.13304175436496735, + "learning_rate": 5.919105136684133e-06, + "loss": 0.0113, + "step": 6737 + }, + { + "epoch": 0.8901806651913994, + "grad_norm": 0.12925884127616882, + "learning_rate": 5.905026702514005e-06, + "loss": 0.0065, + "step": 6738 + }, + { + "epoch": 0.8903127786768834, + "grad_norm": 0.15998664498329163, + "learning_rate": 5.890964521491182e-06, + "loss": 0.0179, + "step": 6739 + }, + { + "epoch": 0.8904448921623674, + "grad_norm": 0.11898373067378998, + "learning_rate": 5.876918596044667e-06, + "loss": 0.0044, + "step": 6740 + }, + { + "epoch": 0.8905770056478515, + "grad_norm": 0.152593195438385, + "learning_rate": 5.862888928600607e-06, + "loss": 0.0138, + "step": 6741 + }, + { + "epoch": 0.8907091191333355, + "grad_norm": 0.19877228140830994, + "learning_rate": 5.8488755215823975e-06, + "loss": 0.0071, + "step": 6742 + }, + { + "epoch": 0.8908412326188195, + "grad_norm": 0.20507656037807465, + "learning_rate": 5.834878377410557e-06, + "loss": 0.0191, + "step": 6743 + }, + { + "epoch": 0.8909733461043036, + "grad_norm": 0.15045703947544098, + "learning_rate": 5.8208974985028535e-06, + "loss": 0.0116, + "step": 6744 + }, + { + "epoch": 0.8911054595897876, + "grad_norm": 0.15195822715759277, + "learning_rate": 5.806932887274219e-06, + "loss": 0.0123, + "step": 6745 + }, + { + "epoch": 0.8912375730752716, + "grad_norm": 0.17124532163143158, + "learning_rate": 5.792984546136759e-06, + "loss": 0.01, + "step": 6746 + }, + { + "epoch": 0.8913696865607557, + "grad_norm": 0.21854707598686218, + "learning_rate": 5.7790524774998136e-06, + "loss": 0.025, + "step": 6747 + }, + { + "epoch": 0.8915018000462397, + "grad_norm": 0.12468767166137695, + "learning_rate": 5.765136683769868e-06, + "loss": 0.0108, + "step": 6748 + }, + { + "epoch": 0.8916339135317237, + "grad_norm": 0.12682819366455078, + "learning_rate": 5.751237167350643e-06, + "loss": 0.013, + "step": 6749 + }, + { + "epoch": 0.8917660270172078, + "grad_norm": 0.15752992033958435, + "learning_rate": 5.737353930642997e-06, + "loss": 0.0182, + "step": 6750 + }, + { + "epoch": 0.8918981405026918, + "grad_norm": 0.07519881427288055, + "learning_rate": 5.723486976045001e-06, + "loss": 0.0066, + "step": 6751 + }, + { + "epoch": 0.8920302539881758, + "grad_norm": 0.2467702180147171, + "learning_rate": 5.709636305951926e-06, + "loss": 0.0228, + "step": 6752 + }, + { + "epoch": 0.8921623674736598, + "grad_norm": 0.14889715611934662, + "learning_rate": 5.695801922756194e-06, + "loss": 0.0139, + "step": 6753 + }, + { + "epoch": 0.8922944809591439, + "grad_norm": 0.39248546957969666, + "learning_rate": 5.681983828847448e-06, + "loss": 0.0226, + "step": 6754 + }, + { + "epoch": 0.8924265944446279, + "grad_norm": 0.16626907885074615, + "learning_rate": 5.6681820266125006e-06, + "loss": 0.0312, + "step": 6755 + }, + { + "epoch": 0.8925587079301119, + "grad_norm": 0.11991027742624283, + "learning_rate": 5.65439651843539e-06, + "loss": 0.0153, + "step": 6756 + }, + { + "epoch": 0.892690821415596, + "grad_norm": 0.2725294530391693, + "learning_rate": 5.640627306697244e-06, + "loss": 0.0379, + "step": 6757 + }, + { + "epoch": 0.89282293490108, + "grad_norm": 0.16560478508472443, + "learning_rate": 5.626874393776482e-06, + "loss": 0.0128, + "step": 6758 + }, + { + "epoch": 0.892955048386564, + "grad_norm": 0.19976456463336945, + "learning_rate": 5.61313778204865e-06, + "loss": 0.0194, + "step": 6759 + }, + { + "epoch": 0.8930871618720481, + "grad_norm": 0.20092976093292236, + "learning_rate": 5.599417473886481e-06, + "loss": 0.0192, + "step": 6760 + }, + { + "epoch": 0.8932192753575321, + "grad_norm": 0.09612659364938736, + "learning_rate": 5.585713471659915e-06, + "loss": 0.0109, + "step": 6761 + }, + { + "epoch": 0.8933513888430161, + "grad_norm": 0.11753436177968979, + "learning_rate": 5.572025777736056e-06, + "loss": 0.0164, + "step": 6762 + }, + { + "epoch": 0.8934835023285002, + "grad_norm": 0.153545081615448, + "learning_rate": 5.558354394479204e-06, + "loss": 0.0164, + "step": 6763 + }, + { + "epoch": 0.8936156158139842, + "grad_norm": 0.19137759506702423, + "learning_rate": 5.5446993242508235e-06, + "loss": 0.0229, + "step": 6764 + }, + { + "epoch": 0.8937477292994682, + "grad_norm": 0.13186432421207428, + "learning_rate": 5.531060569409574e-06, + "loss": 0.0162, + "step": 6765 + }, + { + "epoch": 0.8938798427849522, + "grad_norm": 0.23756030201911926, + "learning_rate": 5.517438132311315e-06, + "loss": 0.0189, + "step": 6766 + }, + { + "epoch": 0.8940119562704363, + "grad_norm": 0.1321917027235031, + "learning_rate": 5.50383201530904e-06, + "loss": 0.0119, + "step": 6767 + }, + { + "epoch": 0.8941440697559203, + "grad_norm": 0.1857648640871048, + "learning_rate": 5.490242220752961e-06, + "loss": 0.019, + "step": 6768 + }, + { + "epoch": 0.8942761832414043, + "grad_norm": 0.19691239297389984, + "learning_rate": 5.476668750990466e-06, + "loss": 0.0182, + "step": 6769 + }, + { + "epoch": 0.8944082967268884, + "grad_norm": 0.29870468378067017, + "learning_rate": 5.463111608366122e-06, + "loss": 0.013, + "step": 6770 + }, + { + "epoch": 0.8945404102123724, + "grad_norm": 0.17927725613117218, + "learning_rate": 5.449570795221659e-06, + "loss": 0.0147, + "step": 6771 + }, + { + "epoch": 0.8946725236978564, + "grad_norm": 0.12823109328746796, + "learning_rate": 5.436046313896015e-06, + "loss": 0.0152, + "step": 6772 + }, + { + "epoch": 0.8948046371833405, + "grad_norm": 0.18331226706504822, + "learning_rate": 5.422538166725277e-06, + "loss": 0.0131, + "step": 6773 + }, + { + "epoch": 0.8949367506688245, + "grad_norm": 0.17507404088974, + "learning_rate": 5.409046356042735e-06, + "loss": 0.0172, + "step": 6774 + }, + { + "epoch": 0.8950688641543085, + "grad_norm": 0.08184914290904999, + "learning_rate": 5.395570884178824e-06, + "loss": 0.0079, + "step": 6775 + }, + { + "epoch": 0.8952009776397926, + "grad_norm": 0.12260734289884567, + "learning_rate": 5.3821117534612165e-06, + "loss": 0.0141, + "step": 6776 + }, + { + "epoch": 0.8953330911252766, + "grad_norm": 0.15787914395332336, + "learning_rate": 5.368668966214707e-06, + "loss": 0.014, + "step": 6777 + }, + { + "epoch": 0.8954652046107606, + "grad_norm": 0.09953964501619339, + "learning_rate": 5.355242524761261e-06, + "loss": 0.0087, + "step": 6778 + }, + { + "epoch": 0.8955973180962447, + "grad_norm": 0.16149206459522247, + "learning_rate": 5.341832431420091e-06, + "loss": 0.017, + "step": 6779 + }, + { + "epoch": 0.8957294315817287, + "grad_norm": 0.22760704159736633, + "learning_rate": 5.3284386885075195e-06, + "loss": 0.0177, + "step": 6780 + }, + { + "epoch": 0.8958615450672127, + "grad_norm": 0.09379708766937256, + "learning_rate": 5.315061298337065e-06, + "loss": 0.0111, + "step": 6781 + }, + { + "epoch": 0.8959936585526967, + "grad_norm": 0.14150847494602203, + "learning_rate": 5.301700263219411e-06, + "loss": 0.022, + "step": 6782 + }, + { + "epoch": 0.8961257720381808, + "grad_norm": 0.10962291061878204, + "learning_rate": 5.2883555854624565e-06, + "loss": 0.0128, + "step": 6783 + }, + { + "epoch": 0.8962578855236648, + "grad_norm": 0.11838527023792267, + "learning_rate": 5.275027267371213e-06, + "loss": 0.0127, + "step": 6784 + }, + { + "epoch": 0.8963899990091488, + "grad_norm": 0.15095725655555725, + "learning_rate": 5.2617153112479055e-06, + "loss": 0.0111, + "step": 6785 + }, + { + "epoch": 0.8965221124946329, + "grad_norm": 0.15705175697803497, + "learning_rate": 5.2484197193919286e-06, + "loss": 0.0155, + "step": 6786 + }, + { + "epoch": 0.8966542259801169, + "grad_norm": 0.12951944768428802, + "learning_rate": 5.235140494099866e-06, + "loss": 0.0164, + "step": 6787 + }, + { + "epoch": 0.8967863394656009, + "grad_norm": 0.13726073503494263, + "learning_rate": 5.221877637665429e-06, + "loss": 0.0157, + "step": 6788 + }, + { + "epoch": 0.896918452951085, + "grad_norm": 0.14434929192066193, + "learning_rate": 5.208631152379528e-06, + "loss": 0.0138, + "step": 6789 + }, + { + "epoch": 0.897050566436569, + "grad_norm": 0.12067513167858124, + "learning_rate": 5.195401040530279e-06, + "loss": 0.0087, + "step": 6790 + }, + { + "epoch": 0.897182679922053, + "grad_norm": 0.17936775088310242, + "learning_rate": 5.182187304402897e-06, + "loss": 0.0172, + "step": 6791 + }, + { + "epoch": 0.8973147934075371, + "grad_norm": 0.2976245880126953, + "learning_rate": 5.168989946279823e-06, + "loss": 0.038, + "step": 6792 + }, + { + "epoch": 0.8974469068930211, + "grad_norm": 0.2362768054008484, + "learning_rate": 5.155808968440645e-06, + "loss": 0.0188, + "step": 6793 + }, + { + "epoch": 0.8975790203785051, + "grad_norm": 0.1400487720966339, + "learning_rate": 5.142644373162164e-06, + "loss": 0.0043, + "step": 6794 + }, + { + "epoch": 0.8977111338639892, + "grad_norm": 0.1404900997877121, + "learning_rate": 5.129496162718284e-06, + "loss": 0.0172, + "step": 6795 + }, + { + "epoch": 0.8978432473494732, + "grad_norm": 0.14347364008426666, + "learning_rate": 5.11636433938012e-06, + "loss": 0.0124, + "step": 6796 + }, + { + "epoch": 0.8979753608349572, + "grad_norm": 0.17006570100784302, + "learning_rate": 5.103248905415958e-06, + "loss": 0.0183, + "step": 6797 + }, + { + "epoch": 0.8981074743204412, + "grad_norm": 0.34829583764076233, + "learning_rate": 5.09014986309122e-06, + "loss": 0.0187, + "step": 6798 + }, + { + "epoch": 0.8982395878059253, + "grad_norm": 0.13791634142398834, + "learning_rate": 5.077067214668552e-06, + "loss": 0.0118, + "step": 6799 + }, + { + "epoch": 0.8983717012914093, + "grad_norm": 0.1171623021364212, + "learning_rate": 5.064000962407711e-06, + "loss": 0.0116, + "step": 6800 + }, + { + "epoch": 0.8985038147768933, + "grad_norm": 0.21284325420856476, + "learning_rate": 5.050951108565682e-06, + "loss": 0.0194, + "step": 6801 + }, + { + "epoch": 0.8986359282623774, + "grad_norm": 0.1664762794971466, + "learning_rate": 5.03791765539654e-06, + "loss": 0.0182, + "step": 6802 + }, + { + "epoch": 0.8987680417478614, + "grad_norm": 0.21837081015110016, + "learning_rate": 5.024900605151595e-06, + "loss": 0.0258, + "step": 6803 + }, + { + "epoch": 0.8989001552333454, + "grad_norm": 0.17388522624969482, + "learning_rate": 5.011899960079303e-06, + "loss": 0.0136, + "step": 6804 + }, + { + "epoch": 0.8990322687188295, + "grad_norm": 0.10882629454135895, + "learning_rate": 4.998915722425268e-06, + "loss": 0.0033, + "step": 6805 + }, + { + "epoch": 0.8991643822043135, + "grad_norm": 0.11351826041936874, + "learning_rate": 4.985947894432286e-06, + "loss": 0.0083, + "step": 6806 + }, + { + "epoch": 0.8992964956897975, + "grad_norm": 0.16879047453403473, + "learning_rate": 4.972996478340286e-06, + "loss": 0.0143, + "step": 6807 + }, + { + "epoch": 0.8994286091752816, + "grad_norm": 0.25198879837989807, + "learning_rate": 4.960061476386424e-06, + "loss": 0.0156, + "step": 6808 + }, + { + "epoch": 0.8995607226607656, + "grad_norm": 0.1777067482471466, + "learning_rate": 4.9471428908049345e-06, + "loss": 0.0201, + "step": 6809 + }, + { + "epoch": 0.8996928361462496, + "grad_norm": 0.14899660646915436, + "learning_rate": 4.934240723827288e-06, + "loss": 0.0106, + "step": 6810 + }, + { + "epoch": 0.8998249496317337, + "grad_norm": 0.14987502992153168, + "learning_rate": 4.921354977682091e-06, + "loss": 0.011, + "step": 6811 + }, + { + "epoch": 0.8999570631172177, + "grad_norm": 0.16233918070793152, + "learning_rate": 4.908485654595107e-06, + "loss": 0.0172, + "step": 6812 + }, + { + "epoch": 0.9000891766027017, + "grad_norm": 0.13594211637973785, + "learning_rate": 4.89563275678927e-06, + "loss": 0.0132, + "step": 6813 + }, + { + "epoch": 0.9002212900881857, + "grad_norm": 0.1069660559296608, + "learning_rate": 4.882796286484681e-06, + "loss": 0.0138, + "step": 6814 + }, + { + "epoch": 0.9003534035736698, + "grad_norm": 0.15343253314495087, + "learning_rate": 4.8699762458986106e-06, + "loss": 0.0229, + "step": 6815 + }, + { + "epoch": 0.9004855170591538, + "grad_norm": 0.15718504786491394, + "learning_rate": 4.857172637245466e-06, + "loss": 0.0102, + "step": 6816 + }, + { + "epoch": 0.9006176305446378, + "grad_norm": 0.18040478229522705, + "learning_rate": 4.844385462736834e-06, + "loss": 0.0137, + "step": 6817 + }, + { + "epoch": 0.9007497440301219, + "grad_norm": 0.15509812533855438, + "learning_rate": 4.831614724581468e-06, + "loss": 0.0196, + "step": 6818 + }, + { + "epoch": 0.9008818575156059, + "grad_norm": 0.12873555719852448, + "learning_rate": 4.818860424985272e-06, + "loss": 0.0128, + "step": 6819 + }, + { + "epoch": 0.9010139710010899, + "grad_norm": 0.16008254885673523, + "learning_rate": 4.806122566151294e-06, + "loss": 0.0167, + "step": 6820 + }, + { + "epoch": 0.901146084486574, + "grad_norm": 0.11089048534631729, + "learning_rate": 4.793401150279786e-06, + "loss": 0.0092, + "step": 6821 + }, + { + "epoch": 0.901278197972058, + "grad_norm": 0.1454131156206131, + "learning_rate": 4.780696179568133e-06, + "loss": 0.0137, + "step": 6822 + }, + { + "epoch": 0.901410311457542, + "grad_norm": 0.3379257023334503, + "learning_rate": 4.768007656210871e-06, + "loss": 0.0206, + "step": 6823 + }, + { + "epoch": 0.901542424943026, + "grad_norm": 0.17755140364170074, + "learning_rate": 4.7553355823997e-06, + "loss": 0.0139, + "step": 6824 + }, + { + "epoch": 0.9016745384285101, + "grad_norm": 0.11262303590774536, + "learning_rate": 4.7426799603235036e-06, + "loss": 0.0112, + "step": 6825 + }, + { + "epoch": 0.9018066519139941, + "grad_norm": 0.16604208946228027, + "learning_rate": 4.730040792168289e-06, + "loss": 0.0156, + "step": 6826 + }, + { + "epoch": 0.9019387653994781, + "grad_norm": 0.09217502176761627, + "learning_rate": 4.7174180801172305e-06, + "loss": 0.0062, + "step": 6827 + }, + { + "epoch": 0.9020708788849622, + "grad_norm": 0.10860409587621689, + "learning_rate": 4.704811826350675e-06, + "loss": 0.0109, + "step": 6828 + }, + { + "epoch": 0.9022029923704462, + "grad_norm": 0.1001148670911789, + "learning_rate": 4.692222033046134e-06, + "loss": 0.0112, + "step": 6829 + }, + { + "epoch": 0.9023351058559302, + "grad_norm": 0.13455744087696075, + "learning_rate": 4.6796487023782145e-06, + "loss": 0.0108, + "step": 6830 + }, + { + "epoch": 0.9024672193414143, + "grad_norm": 0.13678915798664093, + "learning_rate": 4.667091836518766e-06, + "loss": 0.018, + "step": 6831 + }, + { + "epoch": 0.9025993328268983, + "grad_norm": 0.15132828056812286, + "learning_rate": 4.654551437636745e-06, + "loss": 0.0111, + "step": 6832 + }, + { + "epoch": 0.9027314463123823, + "grad_norm": 0.19527070224285126, + "learning_rate": 4.64202750789825e-06, + "loss": 0.0185, + "step": 6833 + }, + { + "epoch": 0.9028635597978664, + "grad_norm": 0.11000338196754456, + "learning_rate": 4.629520049466573e-06, + "loss": 0.0089, + "step": 6834 + }, + { + "epoch": 0.9029956732833504, + "grad_norm": 0.09668418020009995, + "learning_rate": 4.617029064502132e-06, + "loss": 0.0092, + "step": 6835 + }, + { + "epoch": 0.9031277867688344, + "grad_norm": 0.16052260994911194, + "learning_rate": 4.6045545551625434e-06, + "loss": 0.0149, + "step": 6836 + }, + { + "epoch": 0.9032599002543185, + "grad_norm": 0.3567523956298828, + "learning_rate": 4.592096523602485e-06, + "loss": 0.0233, + "step": 6837 + }, + { + "epoch": 0.9033920137398025, + "grad_norm": 0.18221727013587952, + "learning_rate": 4.579654971973912e-06, + "loss": 0.0197, + "step": 6838 + }, + { + "epoch": 0.9035241272252865, + "grad_norm": 0.12295905500650406, + "learning_rate": 4.567229902425829e-06, + "loss": 0.018, + "step": 6839 + }, + { + "epoch": 0.9036562407107706, + "grad_norm": 0.15242114663124084, + "learning_rate": 4.554821317104452e-06, + "loss": 0.0128, + "step": 6840 + }, + { + "epoch": 0.9037883541962546, + "grad_norm": 0.14071272313594818, + "learning_rate": 4.542429218153121e-06, + "loss": 0.0097, + "step": 6841 + }, + { + "epoch": 0.9039204676817386, + "grad_norm": 0.11783215403556824, + "learning_rate": 4.53005360771237e-06, + "loss": 0.0104, + "step": 6842 + }, + { + "epoch": 0.9040525811672226, + "grad_norm": 0.17464067041873932, + "learning_rate": 4.51769448791981e-06, + "loss": 0.0186, + "step": 6843 + }, + { + "epoch": 0.9041846946527067, + "grad_norm": 0.1822936236858368, + "learning_rate": 4.505351860910268e-06, + "loss": 0.0166, + "step": 6844 + }, + { + "epoch": 0.9043168081381907, + "grad_norm": 0.13986508548259735, + "learning_rate": 4.493025728815725e-06, + "loss": 0.0148, + "step": 6845 + }, + { + "epoch": 0.9044489216236747, + "grad_norm": 0.24553631246089935, + "learning_rate": 4.480716093765247e-06, + "loss": 0.0196, + "step": 6846 + }, + { + "epoch": 0.9045810351091588, + "grad_norm": 0.20718611776828766, + "learning_rate": 4.46842295788511e-06, + "loss": 0.0145, + "step": 6847 + }, + { + "epoch": 0.9047131485946428, + "grad_norm": 0.3277038633823395, + "learning_rate": 4.4561463232987265e-06, + "loss": 0.0216, + "step": 6848 + }, + { + "epoch": 0.9048452620801268, + "grad_norm": 0.12654957175254822, + "learning_rate": 4.443886192126679e-06, + "loss": 0.0125, + "step": 6849 + }, + { + "epoch": 0.9049773755656109, + "grad_norm": 0.1359531730413437, + "learning_rate": 4.431642566486638e-06, + "loss": 0.0144, + "step": 6850 + }, + { + "epoch": 0.9051094890510949, + "grad_norm": 0.18986926972866058, + "learning_rate": 4.419415448493469e-06, + "loss": 0.018, + "step": 6851 + }, + { + "epoch": 0.9052416025365789, + "grad_norm": 0.4173817038536072, + "learning_rate": 4.4072048402592045e-06, + "loss": 0.0228, + "step": 6852 + }, + { + "epoch": 0.905373716022063, + "grad_norm": 0.2071004956960678, + "learning_rate": 4.395010743892957e-06, + "loss": 0.0208, + "step": 6853 + }, + { + "epoch": 0.905505829507547, + "grad_norm": 0.11187253892421722, + "learning_rate": 4.382833161501065e-06, + "loss": 0.0122, + "step": 6854 + }, + { + "epoch": 0.905637942993031, + "grad_norm": 0.19523876905441284, + "learning_rate": 4.370672095186956e-06, + "loss": 0.0156, + "step": 6855 + }, + { + "epoch": 0.905770056478515, + "grad_norm": 0.10688573122024536, + "learning_rate": 4.35852754705125e-06, + "loss": 0.0108, + "step": 6856 + }, + { + "epoch": 0.9059021699639991, + "grad_norm": 0.08100339025259018, + "learning_rate": 4.346399519191657e-06, + "loss": 0.0078, + "step": 6857 + }, + { + "epoch": 0.9060342834494831, + "grad_norm": 0.11563509702682495, + "learning_rate": 4.334288013703091e-06, + "loss": 0.0118, + "step": 6858 + }, + { + "epoch": 0.9061663969349671, + "grad_norm": 0.20008042454719543, + "learning_rate": 4.322193032677602e-06, + "loss": 0.0232, + "step": 6859 + }, + { + "epoch": 0.9062985104204512, + "grad_norm": 0.14969661831855774, + "learning_rate": 4.310114578204327e-06, + "loss": 0.011, + "step": 6860 + }, + { + "epoch": 0.9064306239059352, + "grad_norm": 0.14968842267990112, + "learning_rate": 4.298052652369633e-06, + "loss": 0.0146, + "step": 6861 + }, + { + "epoch": 0.9065627373914192, + "grad_norm": 0.25634080171585083, + "learning_rate": 4.286007257256963e-06, + "loss": 0.0267, + "step": 6862 + }, + { + "epoch": 0.9066948508769033, + "grad_norm": 0.1451036036014557, + "learning_rate": 4.2739783949469645e-06, + "loss": 0.0189, + "step": 6863 + }, + { + "epoch": 0.9068269643623873, + "grad_norm": 0.09582092612981796, + "learning_rate": 4.261966067517375e-06, + "loss": 0.0163, + "step": 6864 + }, + { + "epoch": 0.9069590778478713, + "grad_norm": 0.1373281031847, + "learning_rate": 4.249970277043114e-06, + "loss": 0.0108, + "step": 6865 + }, + { + "epoch": 0.9070911913333554, + "grad_norm": 0.14923548698425293, + "learning_rate": 4.237991025596222e-06, + "loss": 0.0207, + "step": 6866 + }, + { + "epoch": 0.9072233048188394, + "grad_norm": 0.3042697608470917, + "learning_rate": 4.226028315245889e-06, + "loss": 0.0217, + "step": 6867 + }, + { + "epoch": 0.9073554183043234, + "grad_norm": 0.11148915439844131, + "learning_rate": 4.214082148058451e-06, + "loss": 0.0065, + "step": 6868 + }, + { + "epoch": 0.9074875317898075, + "grad_norm": 0.1816503405570984, + "learning_rate": 4.20215252609738e-06, + "loss": 0.0215, + "step": 6869 + }, + { + "epoch": 0.9076196452752915, + "grad_norm": 0.24329888820648193, + "learning_rate": 4.190239451423317e-06, + "loss": 0.0175, + "step": 6870 + }, + { + "epoch": 0.9077517587607755, + "grad_norm": 0.1107625663280487, + "learning_rate": 4.178342926094003e-06, + "loss": 0.0106, + "step": 6871 + }, + { + "epoch": 0.9078838722462595, + "grad_norm": 0.12002073973417282, + "learning_rate": 4.166462952164341e-06, + "loss": 0.0136, + "step": 6872 + }, + { + "epoch": 0.9080159857317436, + "grad_norm": 0.1341729611158371, + "learning_rate": 4.154599531686387e-06, + "loss": 0.0126, + "step": 6873 + }, + { + "epoch": 0.9081480992172276, + "grad_norm": 0.14817306399345398, + "learning_rate": 4.142752666709304e-06, + "loss": 0.0132, + "step": 6874 + }, + { + "epoch": 0.9082802127027116, + "grad_norm": 0.10765121132135391, + "learning_rate": 4.130922359279432e-06, + "loss": 0.0101, + "step": 6875 + }, + { + "epoch": 0.9084123261881957, + "grad_norm": 0.11597680300474167, + "learning_rate": 4.119108611440225e-06, + "loss": 0.0052, + "step": 6876 + }, + { + "epoch": 0.9085444396736797, + "grad_norm": 0.090935617685318, + "learning_rate": 4.107311425232319e-06, + "loss": 0.0067, + "step": 6877 + }, + { + "epoch": 0.9086765531591637, + "grad_norm": 0.24119891226291656, + "learning_rate": 4.095530802693404e-06, + "loss": 0.0303, + "step": 6878 + }, + { + "epoch": 0.9088086666446478, + "grad_norm": 0.11198988556861877, + "learning_rate": 4.083766745858408e-06, + "loss": 0.0134, + "step": 6879 + }, + { + "epoch": 0.9089407801301318, + "grad_norm": 0.21285560727119446, + "learning_rate": 4.07201925675933e-06, + "loss": 0.0224, + "step": 6880 + }, + { + "epoch": 0.9090728936156158, + "grad_norm": 0.2002650648355484, + "learning_rate": 4.060288337425333e-06, + "loss": 0.0212, + "step": 6881 + }, + { + "epoch": 0.9092050071010999, + "grad_norm": 0.1997833251953125, + "learning_rate": 4.04857398988272e-06, + "loss": 0.0205, + "step": 6882 + }, + { + "epoch": 0.9093371205865839, + "grad_norm": 0.13389846682548523, + "learning_rate": 4.036876216154906e-06, + "loss": 0.0117, + "step": 6883 + }, + { + "epoch": 0.9094692340720679, + "grad_norm": 0.1802111119031906, + "learning_rate": 4.025195018262495e-06, + "loss": 0.0201, + "step": 6884 + }, + { + "epoch": 0.909601347557552, + "grad_norm": 0.16740918159484863, + "learning_rate": 4.0135303982231755e-06, + "loss": 0.0227, + "step": 6885 + }, + { + "epoch": 0.909733461043036, + "grad_norm": 0.1399318426847458, + "learning_rate": 4.001882358051779e-06, + "loss": 0.0142, + "step": 6886 + }, + { + "epoch": 0.90986557452852, + "grad_norm": 0.16021879017353058, + "learning_rate": 3.9902508997603175e-06, + "loss": 0.0141, + "step": 6887 + }, + { + "epoch": 0.909997688014004, + "grad_norm": 0.1452077478170395, + "learning_rate": 3.978636025357885e-06, + "loss": 0.0141, + "step": 6888 + }, + { + "epoch": 0.9101298014994881, + "grad_norm": 0.1472679078578949, + "learning_rate": 3.967037736850743e-06, + "loss": 0.0097, + "step": 6889 + }, + { + "epoch": 0.9102619149849721, + "grad_norm": 0.2741580903530121, + "learning_rate": 3.9554560362422775e-06, + "loss": 0.0192, + "step": 6890 + }, + { + "epoch": 0.9103940284704561, + "grad_norm": 0.13638873398303986, + "learning_rate": 3.943890925533022e-06, + "loss": 0.016, + "step": 6891 + }, + { + "epoch": 0.9105261419559402, + "grad_norm": 0.20388948917388916, + "learning_rate": 3.932342406720602e-06, + "loss": 0.0169, + "step": 6892 + }, + { + "epoch": 0.9106582554414242, + "grad_norm": 0.11608298122882843, + "learning_rate": 3.920810481799841e-06, + "loss": 0.0094, + "step": 6893 + }, + { + "epoch": 0.9107903689269082, + "grad_norm": 0.12942413985729218, + "learning_rate": 3.909295152762648e-06, + "loss": 0.01, + "step": 6894 + }, + { + "epoch": 0.9109224824123923, + "grad_norm": 0.13718636333942413, + "learning_rate": 3.897796421598088e-06, + "loss": 0.0125, + "step": 6895 + }, + { + "epoch": 0.9110545958978763, + "grad_norm": 0.19180552661418915, + "learning_rate": 3.886314290292326e-06, + "loss": 0.0184, + "step": 6896 + }, + { + "epoch": 0.9111867093833603, + "grad_norm": 0.16199803352355957, + "learning_rate": 3.874848760828731e-06, + "loss": 0.0191, + "step": 6897 + }, + { + "epoch": 0.9113188228688444, + "grad_norm": 0.10444027185440063, + "learning_rate": 3.86339983518772e-06, + "loss": 0.0086, + "step": 6898 + }, + { + "epoch": 0.9114509363543284, + "grad_norm": 0.19840379059314728, + "learning_rate": 3.8519675153468995e-06, + "loss": 0.0168, + "step": 6899 + }, + { + "epoch": 0.9115830498398124, + "grad_norm": 0.21695414185523987, + "learning_rate": 3.8405518032809785e-06, + "loss": 0.0187, + "step": 6900 + }, + { + "epoch": 0.9117151633252965, + "grad_norm": 0.07511827349662781, + "learning_rate": 3.829152700961835e-06, + "loss": 0.0044, + "step": 6901 + }, + { + "epoch": 0.9118472768107805, + "grad_norm": 0.15764155983924866, + "learning_rate": 3.817770210358407e-06, + "loss": 0.0243, + "step": 6902 + }, + { + "epoch": 0.9119793902962645, + "grad_norm": 0.11205530911684036, + "learning_rate": 3.8064043334368416e-06, + "loss": 0.0112, + "step": 6903 + }, + { + "epoch": 0.9121115037817485, + "grad_norm": 0.14387571811676025, + "learning_rate": 3.7950550721603696e-06, + "loss": 0.0109, + "step": 6904 + }, + { + "epoch": 0.9122436172672326, + "grad_norm": 0.2204601764678955, + "learning_rate": 3.783722428489367e-06, + "loss": 0.0215, + "step": 6905 + }, + { + "epoch": 0.9123757307527166, + "grad_norm": 0.10634907335042953, + "learning_rate": 3.7724064043813232e-06, + "loss": 0.0087, + "step": 6906 + }, + { + "epoch": 0.9125078442382006, + "grad_norm": 0.19405195116996765, + "learning_rate": 3.7611070017908757e-06, + "loss": 0.0179, + "step": 6907 + }, + { + "epoch": 0.9126399577236847, + "grad_norm": 0.1435571163892746, + "learning_rate": 3.7498242226698066e-06, + "loss": 0.0096, + "step": 6908 + }, + { + "epoch": 0.9127720712091687, + "grad_norm": 0.159325510263443, + "learning_rate": 3.7385580689669796e-06, + "loss": 0.0147, + "step": 6909 + }, + { + "epoch": 0.9129041846946527, + "grad_norm": 0.1752527505159378, + "learning_rate": 3.727308542628416e-06, + "loss": 0.0305, + "step": 6910 + }, + { + "epoch": 0.9130362981801368, + "grad_norm": 0.17263196408748627, + "learning_rate": 3.716075645597272e-06, + "loss": 0.0214, + "step": 6911 + }, + { + "epoch": 0.9131684116656208, + "grad_norm": 0.07739172875881195, + "learning_rate": 3.7048593798138077e-06, + "loss": 0.0052, + "step": 6912 + }, + { + "epoch": 0.9133005251511048, + "grad_norm": 0.21998505294322968, + "learning_rate": 3.6936597472154277e-06, + "loss": 0.0325, + "step": 6913 + }, + { + "epoch": 0.9134326386365889, + "grad_norm": 0.22086240351200104, + "learning_rate": 3.682476749736663e-06, + "loss": 0.0175, + "step": 6914 + }, + { + "epoch": 0.9135647521220729, + "grad_norm": 0.1500852108001709, + "learning_rate": 3.671310389309168e-06, + "loss": 0.0132, + "step": 6915 + }, + { + "epoch": 0.9136968656075569, + "grad_norm": 0.2943585515022278, + "learning_rate": 3.660160667861712e-06, + "loss": 0.0152, + "step": 6916 + }, + { + "epoch": 0.913828979093041, + "grad_norm": 0.16644565761089325, + "learning_rate": 3.6490275873202085e-06, + "loss": 0.0124, + "step": 6917 + }, + { + "epoch": 0.913961092578525, + "grad_norm": 0.2597222924232483, + "learning_rate": 3.637911149607709e-06, + "loss": 0.0221, + "step": 6918 + }, + { + "epoch": 0.914093206064009, + "grad_norm": 0.13682372868061066, + "learning_rate": 3.6268113566443327e-06, + "loss": 0.0144, + "step": 6919 + }, + { + "epoch": 0.914225319549493, + "grad_norm": 0.1653459668159485, + "learning_rate": 3.615728210347369e-06, + "loss": 0.021, + "step": 6920 + }, + { + "epoch": 0.9143574330349771, + "grad_norm": 0.17368349432945251, + "learning_rate": 3.604661712631241e-06, + "loss": 0.0117, + "step": 6921 + }, + { + "epoch": 0.9144895465204611, + "grad_norm": 0.14491091668605804, + "learning_rate": 3.5936118654074867e-06, + "loss": 0.0136, + "step": 6922 + }, + { + "epoch": 0.9146216600059451, + "grad_norm": 0.17515389621257782, + "learning_rate": 3.5825786705847354e-06, + "loss": 0.0167, + "step": 6923 + }, + { + "epoch": 0.9147537734914292, + "grad_norm": 0.1294727772474289, + "learning_rate": 3.571562130068773e-06, + "loss": 0.0113, + "step": 6924 + }, + { + "epoch": 0.9148858869769132, + "grad_norm": 0.25797829031944275, + "learning_rate": 3.560562245762522e-06, + "loss": 0.0219, + "step": 6925 + }, + { + "epoch": 0.9150180004623972, + "grad_norm": 0.11501938849687576, + "learning_rate": 3.549579019565974e-06, + "loss": 0.0094, + "step": 6926 + }, + { + "epoch": 0.9151501139478813, + "grad_norm": 0.1509166806936264, + "learning_rate": 3.5386124533762775e-06, + "loss": 0.0133, + "step": 6927 + }, + { + "epoch": 0.9152822274333653, + "grad_norm": 0.156303271651268, + "learning_rate": 3.527662549087729e-06, + "loss": 0.0104, + "step": 6928 + }, + { + "epoch": 0.9154143409188493, + "grad_norm": 0.39509233832359314, + "learning_rate": 3.5167293085917153e-06, + "loss": 0.0125, + "step": 6929 + }, + { + "epoch": 0.9155464544043334, + "grad_norm": 0.14178279042243958, + "learning_rate": 3.5058127337767253e-06, + "loss": 0.0188, + "step": 6930 + }, + { + "epoch": 0.9156785678898174, + "grad_norm": 0.1454003006219864, + "learning_rate": 3.494912826528407e-06, + "loss": 0.0119, + "step": 6931 + }, + { + "epoch": 0.9158106813753014, + "grad_norm": 0.13929800689220428, + "learning_rate": 3.4840295887295315e-06, + "loss": 0.0113, + "step": 6932 + }, + { + "epoch": 0.9159427948607854, + "grad_norm": 0.1115756407380104, + "learning_rate": 3.47316302225994e-06, + "loss": 0.0138, + "step": 6933 + }, + { + "epoch": 0.9160749083462695, + "grad_norm": 0.12636327743530273, + "learning_rate": 3.4623131289966525e-06, + "loss": 0.0113, + "step": 6934 + }, + { + "epoch": 0.9162070218317535, + "grad_norm": 0.11508063971996307, + "learning_rate": 3.451479910813782e-06, + "loss": 0.0094, + "step": 6935 + }, + { + "epoch": 0.9163391353172375, + "grad_norm": 0.14656183123588562, + "learning_rate": 3.4406633695825752e-06, + "loss": 0.0134, + "step": 6936 + }, + { + "epoch": 0.9164712488027216, + "grad_norm": 0.14659330248832703, + "learning_rate": 3.42986350717136e-06, + "loss": 0.0118, + "step": 6937 + }, + { + "epoch": 0.9166033622882056, + "grad_norm": 0.08480847626924515, + "learning_rate": 3.419080325445634e-06, + "loss": 0.0063, + "step": 6938 + }, + { + "epoch": 0.9167354757736896, + "grad_norm": 0.1341126710176468, + "learning_rate": 3.408313826267984e-06, + "loss": 0.0132, + "step": 6939 + }, + { + "epoch": 0.9168675892591737, + "grad_norm": 0.24358314275741577, + "learning_rate": 3.397564011498111e-06, + "loss": 0.02, + "step": 6940 + }, + { + "epoch": 0.9169997027446577, + "grad_norm": 0.23924903571605682, + "learning_rate": 3.386830882992853e-06, + "loss": 0.0172, + "step": 6941 + }, + { + "epoch": 0.9171318162301417, + "grad_norm": 0.16076403856277466, + "learning_rate": 3.37611444260616e-06, + "loss": 0.0154, + "step": 6942 + }, + { + "epoch": 0.9172639297156258, + "grad_norm": 0.20377475023269653, + "learning_rate": 3.365414692189106e-06, + "loss": 0.0272, + "step": 6943 + }, + { + "epoch": 0.9173960432011098, + "grad_norm": 0.13567671179771423, + "learning_rate": 3.354731633589847e-06, + "loss": 0.0096, + "step": 6944 + }, + { + "epoch": 0.9175281566865938, + "grad_norm": 0.18160465359687805, + "learning_rate": 3.3440652686536957e-06, + "loss": 0.0213, + "step": 6945 + }, + { + "epoch": 0.9176602701720779, + "grad_norm": 0.13435958325862885, + "learning_rate": 3.3334155992230776e-06, + "loss": 0.0133, + "step": 6946 + }, + { + "epoch": 0.9177923836575619, + "grad_norm": 0.14226631820201874, + "learning_rate": 3.3227826271374997e-06, + "loss": 0.0083, + "step": 6947 + }, + { + "epoch": 0.9179244971430459, + "grad_norm": 0.12110575288534164, + "learning_rate": 3.3121663542336256e-06, + "loss": 0.011, + "step": 6948 + }, + { + "epoch": 0.91805661062853, + "grad_norm": 0.1794366091489792, + "learning_rate": 3.301566782345211e-06, + "loss": 0.0214, + "step": 6949 + }, + { + "epoch": 0.918188724114014, + "grad_norm": 0.1318056434392929, + "learning_rate": 3.290983913303147e-06, + "loss": 0.0115, + "step": 6950 + }, + { + "epoch": 0.918320837599498, + "grad_norm": 0.14880859851837158, + "learning_rate": 3.280417748935416e-06, + "loss": 0.0105, + "step": 6951 + }, + { + "epoch": 0.918452951084982, + "grad_norm": 0.14884725213050842, + "learning_rate": 3.269868291067124e-06, + "loss": 0.0109, + "step": 6952 + }, + { + "epoch": 0.9185850645704661, + "grad_norm": 0.19536083936691284, + "learning_rate": 3.259335541520503e-06, + "loss": 0.0113, + "step": 6953 + }, + { + "epoch": 0.9187171780559501, + "grad_norm": 0.13253453373908997, + "learning_rate": 3.2488195021148525e-06, + "loss": 0.014, + "step": 6954 + }, + { + "epoch": 0.9188492915414341, + "grad_norm": 0.11469663679599762, + "learning_rate": 3.238320174666676e-06, + "loss": 0.0182, + "step": 6955 + }, + { + "epoch": 0.9189814050269182, + "grad_norm": 0.12924715876579285, + "learning_rate": 3.227837560989511e-06, + "loss": 0.0105, + "step": 6956 + }, + { + "epoch": 0.9191135185124022, + "grad_norm": 0.13562564551830292, + "learning_rate": 3.2173716628940198e-06, + "loss": 0.0074, + "step": 6957 + }, + { + "epoch": 0.9192456319978862, + "grad_norm": 0.1238885447382927, + "learning_rate": 3.2069224821880127e-06, + "loss": 0.0165, + "step": 6958 + }, + { + "epoch": 0.9193777454833703, + "grad_norm": 0.1311292201280594, + "learning_rate": 3.196490020676379e-06, + "loss": 0.0113, + "step": 6959 + }, + { + "epoch": 0.9195098589688543, + "grad_norm": 0.14047713577747345, + "learning_rate": 3.186074280161122e-06, + "loss": 0.0117, + "step": 6960 + }, + { + "epoch": 0.9196419724543383, + "grad_norm": 0.1856159120798111, + "learning_rate": 3.175675262441391e-06, + "loss": 0.0202, + "step": 6961 + }, + { + "epoch": 0.9197740859398224, + "grad_norm": 0.1386088728904724, + "learning_rate": 3.1652929693133935e-06, + "loss": 0.0133, + "step": 6962 + }, + { + "epoch": 0.9199061994253064, + "grad_norm": 0.21501292288303375, + "learning_rate": 3.1549274025705065e-06, + "loss": 0.0237, + "step": 6963 + }, + { + "epoch": 0.9200383129107904, + "grad_norm": 0.14579859375953674, + "learning_rate": 3.1445785640031646e-06, + "loss": 0.0169, + "step": 6964 + }, + { + "epoch": 0.9201704263962744, + "grad_norm": 0.2133261114358902, + "learning_rate": 3.134246455398937e-06, + "loss": 0.0159, + "step": 6965 + }, + { + "epoch": 0.9203025398817585, + "grad_norm": 0.21119467914104462, + "learning_rate": 3.1239310785425192e-06, + "loss": 0.0227, + "step": 6966 + }, + { + "epoch": 0.9204346533672425, + "grad_norm": 0.13757382333278656, + "learning_rate": 3.1136324352156852e-06, + "loss": 0.0142, + "step": 6967 + }, + { + "epoch": 0.9205667668527265, + "grad_norm": 0.18206921219825745, + "learning_rate": 3.103350527197335e-06, + "loss": 0.0138, + "step": 6968 + }, + { + "epoch": 0.9206988803382106, + "grad_norm": 0.1949104219675064, + "learning_rate": 3.093085356263481e-06, + "loss": 0.0293, + "step": 6969 + }, + { + "epoch": 0.9208309938236946, + "grad_norm": 0.14741899073123932, + "learning_rate": 3.082836924187238e-06, + "loss": 0.0149, + "step": 6970 + }, + { + "epoch": 0.9209631073091786, + "grad_norm": 0.11937861144542694, + "learning_rate": 3.072605232738823e-06, + "loss": 0.0149, + "step": 6971 + }, + { + "epoch": 0.9210952207946627, + "grad_norm": 0.09795008599758148, + "learning_rate": 3.062390283685579e-06, + "loss": 0.0074, + "step": 6972 + }, + { + "epoch": 0.9212273342801467, + "grad_norm": 0.09507367014884949, + "learning_rate": 3.05219207879196e-06, + "loss": 0.0047, + "step": 6973 + }, + { + "epoch": 0.9213594477656307, + "grad_norm": 0.1361870914697647, + "learning_rate": 3.0420106198194797e-06, + "loss": 0.019, + "step": 6974 + }, + { + "epoch": 0.9214915612511148, + "grad_norm": 0.21445150673389435, + "learning_rate": 3.03184590852682e-06, + "loss": 0.0137, + "step": 6975 + }, + { + "epoch": 0.9216236747365988, + "grad_norm": 0.18917971849441528, + "learning_rate": 3.0216979466697436e-06, + "loss": 0.0212, + "step": 6976 + }, + { + "epoch": 0.9217557882220828, + "grad_norm": 0.20330829918384552, + "learning_rate": 3.0115667360011144e-06, + "loss": 0.0178, + "step": 6977 + }, + { + "epoch": 0.9218879017075668, + "grad_norm": 0.14233195781707764, + "learning_rate": 3.001452278270911e-06, + "loss": 0.0116, + "step": 6978 + }, + { + "epoch": 0.9220200151930509, + "grad_norm": 0.1598593294620514, + "learning_rate": 2.991354575226224e-06, + "loss": 0.0166, + "step": 6979 + }, + { + "epoch": 0.9221521286785349, + "grad_norm": 0.24355551600456238, + "learning_rate": 2.9812736286112365e-06, + "loss": 0.034, + "step": 6980 + }, + { + "epoch": 0.9222842421640189, + "grad_norm": 0.15401063859462738, + "learning_rate": 2.971209440167244e-06, + "loss": 0.0115, + "step": 6981 + }, + { + "epoch": 0.922416355649503, + "grad_norm": 0.13340553641319275, + "learning_rate": 2.9611620116326346e-06, + "loss": 0.0092, + "step": 6982 + }, + { + "epoch": 0.922548469134987, + "grad_norm": 0.1478549987077713, + "learning_rate": 2.9511313447429303e-06, + "loss": 0.0144, + "step": 6983 + }, + { + "epoch": 0.922680582620471, + "grad_norm": 0.1342504918575287, + "learning_rate": 2.941117441230756e-06, + "loss": 0.0158, + "step": 6984 + }, + { + "epoch": 0.9228126961059551, + "grad_norm": 0.1833777129650116, + "learning_rate": 2.931120302825785e-06, + "loss": 0.0132, + "step": 6985 + }, + { + "epoch": 0.9229448095914391, + "grad_norm": 0.1599016785621643, + "learning_rate": 2.9211399312548684e-06, + "loss": 0.0163, + "step": 6986 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.16944639384746552, + "learning_rate": 2.911176328241927e-06, + "loss": 0.0187, + "step": 6987 + }, + { + "epoch": 0.9232090365624072, + "grad_norm": 0.30180472135543823, + "learning_rate": 2.901229495507962e-06, + "loss": 0.0431, + "step": 6988 + }, + { + "epoch": 0.9233411500478912, + "grad_norm": 0.13991716504096985, + "learning_rate": 2.891299434771133e-06, + "loss": 0.0087, + "step": 6989 + }, + { + "epoch": 0.9234732635333752, + "grad_norm": 0.09995570778846741, + "learning_rate": 2.8813861477466455e-06, + "loss": 0.0063, + "step": 6990 + }, + { + "epoch": 0.9236053770188593, + "grad_norm": 0.17359285056591034, + "learning_rate": 2.8714896361468624e-06, + "loss": 0.0149, + "step": 6991 + }, + { + "epoch": 0.9237374905043433, + "grad_norm": 0.1824413388967514, + "learning_rate": 2.861609901681195e-06, + "loss": 0.0159, + "step": 6992 + }, + { + "epoch": 0.9238696039898273, + "grad_norm": 0.1265413910150528, + "learning_rate": 2.8517469460561998e-06, + "loss": 0.0136, + "step": 6993 + }, + { + "epoch": 0.9240017174753113, + "grad_norm": 0.17025385797023773, + "learning_rate": 2.841900770975514e-06, + "loss": 0.0156, + "step": 6994 + }, + { + "epoch": 0.9241338309607954, + "grad_norm": 0.1325269490480423, + "learning_rate": 2.832071378139878e-06, + "loss": 0.0145, + "step": 6995 + }, + { + "epoch": 0.9242659444462794, + "grad_norm": 0.10881488770246506, + "learning_rate": 2.8222587692471213e-06, + "loss": 0.0095, + "step": 6996 + }, + { + "epoch": 0.9243980579317634, + "grad_norm": 0.12182066589593887, + "learning_rate": 2.8124629459922224e-06, + "loss": 0.0161, + "step": 6997 + }, + { + "epoch": 0.9245301714172475, + "grad_norm": 0.12361127883195877, + "learning_rate": 2.8026839100672054e-06, + "loss": 0.0143, + "step": 6998 + }, + { + "epoch": 0.9246622849027315, + "grad_norm": 0.11965993791818619, + "learning_rate": 2.7929216631612076e-06, + "loss": 0.01, + "step": 6999 + }, + { + "epoch": 0.9247943983882154, + "grad_norm": 0.2763892412185669, + "learning_rate": 2.7831762069604915e-06, + "loss": 0.0245, + "step": 7000 + }, + { + "epoch": 0.9249265118736995, + "grad_norm": 0.29099127650260925, + "learning_rate": 2.7734475431483995e-06, + "loss": 0.017, + "step": 7001 + }, + { + "epoch": 0.9250586253591835, + "grad_norm": 0.13824976980686188, + "learning_rate": 2.7637356734053764e-06, + "loss": 0.0145, + "step": 7002 + }, + { + "epoch": 0.9251907388446675, + "grad_norm": 0.11890945583581924, + "learning_rate": 2.754040599408947e-06, + "loss": 0.0117, + "step": 7003 + }, + { + "epoch": 0.9253228523301515, + "grad_norm": 0.20134928822517395, + "learning_rate": 2.7443623228337822e-06, + "loss": 0.0276, + "step": 7004 + }, + { + "epoch": 0.9254549658156356, + "grad_norm": 0.0993884950876236, + "learning_rate": 2.734700845351612e-06, + "loss": 0.008, + "step": 7005 + }, + { + "epoch": 0.9255870793011196, + "grad_norm": 0.09369389712810516, + "learning_rate": 2.725056168631268e-06, + "loss": 0.0066, + "step": 7006 + }, + { + "epoch": 0.9257191927866036, + "grad_norm": 0.20960983633995056, + "learning_rate": 2.7154282943386957e-06, + "loss": 0.0244, + "step": 7007 + }, + { + "epoch": 0.9258513062720877, + "grad_norm": 0.15154051780700684, + "learning_rate": 2.7058172241369417e-06, + "loss": 0.0113, + "step": 7008 + }, + { + "epoch": 0.9259834197575717, + "grad_norm": 0.09699692577123642, + "learning_rate": 2.696222959686112e-06, + "loss": 0.0081, + "step": 7009 + }, + { + "epoch": 0.9261155332430557, + "grad_norm": 0.2003384381532669, + "learning_rate": 2.6866455026434477e-06, + "loss": 0.0159, + "step": 7010 + }, + { + "epoch": 0.9262476467285398, + "grad_norm": 0.1282658725976944, + "learning_rate": 2.677084854663292e-06, + "loss": 0.0147, + "step": 7011 + }, + { + "epoch": 0.9263797602140238, + "grad_norm": 0.2649095356464386, + "learning_rate": 2.667541017397057e-06, + "loss": 0.0205, + "step": 7012 + }, + { + "epoch": 0.9265118736995078, + "grad_norm": 0.1102382019162178, + "learning_rate": 2.6580139924932355e-06, + "loss": 0.0092, + "step": 7013 + }, + { + "epoch": 0.9266439871849919, + "grad_norm": 0.2010749727487564, + "learning_rate": 2.6485037815974778e-06, + "loss": 0.0156, + "step": 7014 + }, + { + "epoch": 0.9267761006704759, + "grad_norm": 0.38905027508735657, + "learning_rate": 2.6390103863525028e-06, + "loss": 0.0141, + "step": 7015 + }, + { + "epoch": 0.9269082141559599, + "grad_norm": 0.18634909391403198, + "learning_rate": 2.629533808398077e-06, + "loss": 0.0177, + "step": 7016 + }, + { + "epoch": 0.927040327641444, + "grad_norm": 0.13378973305225372, + "learning_rate": 2.620074049371135e-06, + "loss": 0.0137, + "step": 7017 + }, + { + "epoch": 0.927172441126928, + "grad_norm": 0.16449542343616486, + "learning_rate": 2.610631110905659e-06, + "loss": 0.013, + "step": 7018 + }, + { + "epoch": 0.927304554612412, + "grad_norm": 0.10755720734596252, + "learning_rate": 2.6012049946327445e-06, + "loss": 0.0114, + "step": 7019 + }, + { + "epoch": 0.927436668097896, + "grad_norm": 0.11967146396636963, + "learning_rate": 2.591795702180577e-06, + "loss": 0.016, + "step": 7020 + }, + { + "epoch": 0.9275687815833801, + "grad_norm": 0.18971772491931915, + "learning_rate": 2.5824032351744244e-06, + "loss": 0.0113, + "step": 7021 + }, + { + "epoch": 0.9277008950688641, + "grad_norm": 0.13666923344135284, + "learning_rate": 2.5730275952366877e-06, + "loss": 0.0145, + "step": 7022 + }, + { + "epoch": 0.9278330085543481, + "grad_norm": 0.21552050113677979, + "learning_rate": 2.5636687839867947e-06, + "loss": 0.0193, + "step": 7023 + }, + { + "epoch": 0.9279651220398322, + "grad_norm": 0.08234214037656784, + "learning_rate": 2.5543268030413405e-06, + "loss": 0.0082, + "step": 7024 + }, + { + "epoch": 0.9280972355253162, + "grad_norm": 0.2322162389755249, + "learning_rate": 2.5450016540139566e-06, + "loss": 0.0286, + "step": 7025 + }, + { + "epoch": 0.9282293490108002, + "grad_norm": 0.1154404729604721, + "learning_rate": 2.535693338515399e-06, + "loss": 0.0069, + "step": 7026 + }, + { + "epoch": 0.9283614624962843, + "grad_norm": 0.10801186412572861, + "learning_rate": 2.526401858153493e-06, + "loss": 0.0096, + "step": 7027 + }, + { + "epoch": 0.9284935759817683, + "grad_norm": 0.13217969238758087, + "learning_rate": 2.517127214533177e-06, + "loss": 0.0185, + "step": 7028 + }, + { + "epoch": 0.9286256894672523, + "grad_norm": 0.11214915663003922, + "learning_rate": 2.5078694092564804e-06, + "loss": 0.0086, + "step": 7029 + }, + { + "epoch": 0.9287578029527364, + "grad_norm": 0.133506640791893, + "learning_rate": 2.4986284439225015e-06, + "loss": 0.0137, + "step": 7030 + }, + { + "epoch": 0.9288899164382204, + "grad_norm": 0.1064433827996254, + "learning_rate": 2.489404320127442e-06, + "loss": 0.0161, + "step": 7031 + }, + { + "epoch": 0.9290220299237044, + "grad_norm": 0.14859448373317719, + "learning_rate": 2.4801970394646157e-06, + "loss": 0.0124, + "step": 7032 + }, + { + "epoch": 0.9291541434091884, + "grad_norm": 0.160023033618927, + "learning_rate": 2.4710066035243838e-06, + "loss": 0.0103, + "step": 7033 + }, + { + "epoch": 0.9292862568946725, + "grad_norm": 0.1463107019662857, + "learning_rate": 2.4618330138942437e-06, + "loss": 0.017, + "step": 7034 + }, + { + "epoch": 0.9294183703801565, + "grad_norm": 0.12404201179742813, + "learning_rate": 2.4526762721587494e-06, + "loss": 0.0149, + "step": 7035 + }, + { + "epoch": 0.9295504838656405, + "grad_norm": 0.19135360419750214, + "learning_rate": 2.4435363798995692e-06, + "loss": 0.0201, + "step": 7036 + }, + { + "epoch": 0.9296825973511246, + "grad_norm": 0.11712557822465897, + "learning_rate": 2.434413338695429e-06, + "loss": 0.0146, + "step": 7037 + }, + { + "epoch": 0.9298147108366086, + "grad_norm": 0.09654711931943893, + "learning_rate": 2.425307150122169e-06, + "loss": 0.0065, + "step": 7038 + }, + { + "epoch": 0.9299468243220926, + "grad_norm": 0.13926123082637787, + "learning_rate": 2.4162178157527304e-06, + "loss": 0.0128, + "step": 7039 + }, + { + "epoch": 0.9300789378075767, + "grad_norm": 0.10085835307836533, + "learning_rate": 2.407145337157113e-06, + "loss": 0.009, + "step": 7040 + }, + { + "epoch": 0.9302110512930607, + "grad_norm": 0.09652812778949738, + "learning_rate": 2.3980897159024073e-06, + "loss": 0.0087, + "step": 7041 + }, + { + "epoch": 0.9303431647785447, + "grad_norm": 0.1405116319656372, + "learning_rate": 2.389050953552818e-06, + "loss": 0.0146, + "step": 7042 + }, + { + "epoch": 0.9304752782640288, + "grad_norm": 0.21289518475532532, + "learning_rate": 2.38002905166963e-06, + "loss": 0.0181, + "step": 7043 + }, + { + "epoch": 0.9306073917495128, + "grad_norm": 0.2870105504989624, + "learning_rate": 2.371024011811185e-06, + "loss": 0.0197, + "step": 7044 + }, + { + "epoch": 0.9307395052349968, + "grad_norm": 0.14215752482414246, + "learning_rate": 2.36203583553295e-06, + "loss": 0.0142, + "step": 7045 + }, + { + "epoch": 0.9308716187204809, + "grad_norm": 0.0909138098359108, + "learning_rate": 2.3530645243874604e-06, + "loss": 0.0137, + "step": 7046 + }, + { + "epoch": 0.9310037322059649, + "grad_norm": 0.18109942972660065, + "learning_rate": 2.344110079924344e-06, + "loss": 0.0152, + "step": 7047 + }, + { + "epoch": 0.9311358456914489, + "grad_norm": 0.10473036766052246, + "learning_rate": 2.335172503690308e-06, + "loss": 0.0075, + "step": 7048 + }, + { + "epoch": 0.931267959176933, + "grad_norm": 0.18801212310791016, + "learning_rate": 2.326251797229162e-06, + "loss": 0.0266, + "step": 7049 + }, + { + "epoch": 0.931400072662417, + "grad_norm": 0.2324409782886505, + "learning_rate": 2.317347962081784e-06, + "loss": 0.0188, + "step": 7050 + }, + { + "epoch": 0.931532186147901, + "grad_norm": 0.18094132840633392, + "learning_rate": 2.308460999786144e-06, + "loss": 0.0155, + "step": 7051 + }, + { + "epoch": 0.931664299633385, + "grad_norm": 0.2537223696708679, + "learning_rate": 2.299590911877303e-06, + "loss": 0.0144, + "step": 7052 + }, + { + "epoch": 0.9317964131188691, + "grad_norm": 0.16453498601913452, + "learning_rate": 2.290737699887402e-06, + "loss": 0.0284, + "step": 7053 + }, + { + "epoch": 0.9319285266043531, + "grad_norm": 0.15937440097332, + "learning_rate": 2.281901365345662e-06, + "loss": 0.0226, + "step": 7054 + }, + { + "epoch": 0.9320606400898371, + "grad_norm": 0.14340832829475403, + "learning_rate": 2.2730819097783964e-06, + "loss": 0.0124, + "step": 7055 + }, + { + "epoch": 0.9321927535753212, + "grad_norm": 0.09002848714590073, + "learning_rate": 2.2642793347090075e-06, + "loss": 0.0067, + "step": 7056 + }, + { + "epoch": 0.9323248670608052, + "grad_norm": 0.12743428349494934, + "learning_rate": 2.25549364165798e-06, + "loss": 0.0136, + "step": 7057 + }, + { + "epoch": 0.9324569805462892, + "grad_norm": 0.15904666483402252, + "learning_rate": 2.246724832142866e-06, + "loss": 0.0163, + "step": 7058 + }, + { + "epoch": 0.9325890940317733, + "grad_norm": 0.14966551959514618, + "learning_rate": 2.2379729076783096e-06, + "loss": 0.0178, + "step": 7059 + }, + { + "epoch": 0.9327212075172573, + "grad_norm": 0.11228898167610168, + "learning_rate": 2.2292378697760683e-06, + "loss": 0.0124, + "step": 7060 + }, + { + "epoch": 0.9328533210027413, + "grad_norm": 0.30276328325271606, + "learning_rate": 2.2205197199449248e-06, + "loss": 0.0145, + "step": 7061 + }, + { + "epoch": 0.9329854344882254, + "grad_norm": 0.15492114424705505, + "learning_rate": 2.2118184596907845e-06, + "loss": 0.0154, + "step": 7062 + }, + { + "epoch": 0.9331175479737094, + "grad_norm": 0.19936561584472656, + "learning_rate": 2.203134090516634e-06, + "loss": 0.0213, + "step": 7063 + }, + { + "epoch": 0.9332496614591934, + "grad_norm": 0.19477428495883942, + "learning_rate": 2.194466613922552e-06, + "loss": 0.0182, + "step": 7064 + }, + { + "epoch": 0.9333817749446774, + "grad_norm": 0.22519803047180176, + "learning_rate": 2.185816031405652e-06, + "loss": 0.0234, + "step": 7065 + }, + { + "epoch": 0.9335138884301615, + "grad_norm": 0.10661128163337708, + "learning_rate": 2.1771823444601714e-06, + "loss": 0.0099, + "step": 7066 + }, + { + "epoch": 0.9336460019156455, + "grad_norm": 0.21245616674423218, + "learning_rate": 2.16856555457744e-06, + "loss": 0.0136, + "step": 7067 + }, + { + "epoch": 0.9337781154011295, + "grad_norm": 0.1352900117635727, + "learning_rate": 2.159965663245811e-06, + "loss": 0.0127, + "step": 7068 + }, + { + "epoch": 0.9339102288866136, + "grad_norm": 0.2113463431596756, + "learning_rate": 2.1513826719507748e-06, + "loss": 0.0263, + "step": 7069 + }, + { + "epoch": 0.9340423423720976, + "grad_norm": 0.14445137977600098, + "learning_rate": 2.142816582174878e-06, + "loss": 0.0154, + "step": 7070 + }, + { + "epoch": 0.9341744558575816, + "grad_norm": 0.2524077892303467, + "learning_rate": 2.134267395397749e-06, + "loss": 0.0174, + "step": 7071 + }, + { + "epoch": 0.9343065693430657, + "grad_norm": 0.12157204002141953, + "learning_rate": 2.1257351130961167e-06, + "loss": 0.0117, + "step": 7072 + }, + { + "epoch": 0.9344386828285497, + "grad_norm": 0.20304414629936218, + "learning_rate": 2.1172197367437587e-06, + "loss": 0.0198, + "step": 7073 + }, + { + "epoch": 0.9345707963140337, + "grad_norm": 0.20831957459449768, + "learning_rate": 2.1087212678115533e-06, + "loss": 0.0214, + "step": 7074 + }, + { + "epoch": 0.9347029097995178, + "grad_norm": 0.11911209672689438, + "learning_rate": 2.1002397077674372e-06, + "loss": 0.0111, + "step": 7075 + }, + { + "epoch": 0.9348350232850018, + "grad_norm": 0.16563266515731812, + "learning_rate": 2.0917750580764616e-06, + "loss": 0.0216, + "step": 7076 + }, + { + "epoch": 0.9349671367704858, + "grad_norm": 0.1556750386953354, + "learning_rate": 2.083327320200734e-06, + "loss": 0.0092, + "step": 7077 + }, + { + "epoch": 0.9350992502559698, + "grad_norm": 0.17585864663124084, + "learning_rate": 2.074896495599432e-06, + "loss": 0.0206, + "step": 7078 + }, + { + "epoch": 0.9352313637414539, + "grad_norm": 0.13946789503097534, + "learning_rate": 2.066482585728824e-06, + "loss": 0.015, + "step": 7079 + }, + { + "epoch": 0.9353634772269379, + "grad_norm": 0.1683516651391983, + "learning_rate": 2.058085592042269e-06, + "loss": 0.0232, + "step": 7080 + }, + { + "epoch": 0.9354955907124219, + "grad_norm": 0.16836416721343994, + "learning_rate": 2.0497055159901746e-06, + "loss": 0.0152, + "step": 7081 + }, + { + "epoch": 0.935627704197906, + "grad_norm": 0.18335430324077606, + "learning_rate": 2.0413423590200377e-06, + "loss": 0.0251, + "step": 7082 + }, + { + "epoch": 0.93575981768339, + "grad_norm": 0.12575763463974, + "learning_rate": 2.0329961225764584e-06, + "loss": 0.0109, + "step": 7083 + }, + { + "epoch": 0.935891931168874, + "grad_norm": 0.09663445502519608, + "learning_rate": 2.0246668081010944e-06, + "loss": 0.0057, + "step": 7084 + }, + { + "epoch": 0.9360240446543581, + "grad_norm": 0.2766578197479248, + "learning_rate": 2.016354417032662e-06, + "loss": 0.0127, + "step": 7085 + }, + { + "epoch": 0.9361561581398421, + "grad_norm": 0.20240576565265656, + "learning_rate": 2.008058950806968e-06, + "loss": 0.0253, + "step": 7086 + }, + { + "epoch": 0.9362882716253261, + "grad_norm": 0.12679430842399597, + "learning_rate": 1.999780410856922e-06, + "loss": 0.0108, + "step": 7087 + }, + { + "epoch": 0.9364203851108102, + "grad_norm": 0.16213077306747437, + "learning_rate": 1.9915187986124575e-06, + "loss": 0.0087, + "step": 7088 + }, + { + "epoch": 0.9365524985962942, + "grad_norm": 0.1392461359500885, + "learning_rate": 1.983274115500633e-06, + "loss": 0.0136, + "step": 7089 + }, + { + "epoch": 0.9366846120817782, + "grad_norm": 0.15210972726345062, + "learning_rate": 1.9750463629455653e-06, + "loss": 0.0228, + "step": 7090 + }, + { + "epoch": 0.9368167255672623, + "grad_norm": 0.14509055018424988, + "learning_rate": 1.9668355423684504e-06, + "loss": 0.0117, + "step": 7091 + }, + { + "epoch": 0.9369488390527463, + "grad_norm": 0.11387002468109131, + "learning_rate": 1.958641655187521e-06, + "loss": 0.0167, + "step": 7092 + }, + { + "epoch": 0.9370809525382303, + "grad_norm": 0.20874197781085968, + "learning_rate": 1.9504647028181443e-06, + "loss": 0.0209, + "step": 7093 + }, + { + "epoch": 0.9372130660237143, + "grad_norm": 0.2966103255748749, + "learning_rate": 1.942304686672747e-06, + "loss": 0.0172, + "step": 7094 + }, + { + "epoch": 0.9373451795091984, + "grad_norm": 0.20808520913124084, + "learning_rate": 1.9341616081607894e-06, + "loss": 0.0231, + "step": 7095 + }, + { + "epoch": 0.9374772929946824, + "grad_norm": 0.19017285108566284, + "learning_rate": 1.926035468688847e-06, + "loss": 0.0158, + "step": 7096 + }, + { + "epoch": 0.9376094064801664, + "grad_norm": 0.2927365303039551, + "learning_rate": 1.917926269660575e-06, + "loss": 0.0185, + "step": 7097 + }, + { + "epoch": 0.9377415199656505, + "grad_norm": 0.2043789178133011, + "learning_rate": 1.909834012476663e-06, + "loss": 0.0167, + "step": 7098 + }, + { + "epoch": 0.9378736334511345, + "grad_norm": 0.09357764571905136, + "learning_rate": 1.9017586985349168e-06, + "loss": 0.0057, + "step": 7099 + }, + { + "epoch": 0.9380057469366185, + "grad_norm": 0.1728450506925583, + "learning_rate": 1.8937003292301746e-06, + "loss": 0.0167, + "step": 7100 + }, + { + "epoch": 0.9381378604221026, + "grad_norm": 0.11996671557426453, + "learning_rate": 1.8856589059543905e-06, + "loss": 0.0126, + "step": 7101 + }, + { + "epoch": 0.9382699739075866, + "grad_norm": 0.13449335098266602, + "learning_rate": 1.877634430096553e-06, + "loss": 0.0149, + "step": 7102 + }, + { + "epoch": 0.9384020873930706, + "grad_norm": 0.6450221538543701, + "learning_rate": 1.8696269030427427e-06, + "loss": 0.0181, + "step": 7103 + }, + { + "epoch": 0.9385342008785547, + "grad_norm": 0.23661655187606812, + "learning_rate": 1.8616363261761195e-06, + "loss": 0.0138, + "step": 7104 + }, + { + "epoch": 0.9386663143640387, + "grad_norm": 0.1648440659046173, + "learning_rate": 1.8536627008769014e-06, + "loss": 0.0172, + "step": 7105 + }, + { + "epoch": 0.9387984278495227, + "grad_norm": 0.11088185757398605, + "learning_rate": 1.845706028522387e-06, + "loss": 0.0163, + "step": 7106 + }, + { + "epoch": 0.9389305413350068, + "grad_norm": 0.3062111735343933, + "learning_rate": 1.8377663104869325e-06, + "loss": 0.0222, + "step": 7107 + }, + { + "epoch": 0.9390626548204908, + "grad_norm": 0.13327403366565704, + "learning_rate": 1.8298435481419852e-06, + "loss": 0.0121, + "step": 7108 + }, + { + "epoch": 0.9391947683059748, + "grad_norm": 0.22051618993282318, + "learning_rate": 1.8219377428560502e-06, + "loss": 0.0215, + "step": 7109 + }, + { + "epoch": 0.9393268817914588, + "grad_norm": 0.15959028899669647, + "learning_rate": 1.8140488959947023e-06, + "loss": 0.0171, + "step": 7110 + }, + { + "epoch": 0.9394589952769429, + "grad_norm": 0.12574240565299988, + "learning_rate": 1.8061770089206064e-06, + "loss": 0.0117, + "step": 7111 + }, + { + "epoch": 0.9395911087624269, + "grad_norm": 0.15989792346954346, + "learning_rate": 1.7983220829934755e-06, + "loss": 0.0167, + "step": 7112 + }, + { + "epoch": 0.9397232222479109, + "grad_norm": 0.12491068989038467, + "learning_rate": 1.7904841195701016e-06, + "loss": 0.0071, + "step": 7113 + }, + { + "epoch": 0.939855335733395, + "grad_norm": 0.18081387877464294, + "learning_rate": 1.7826631200043353e-06, + "loss": 0.016, + "step": 7114 + }, + { + "epoch": 0.939987449218879, + "grad_norm": 0.19697430729866028, + "learning_rate": 1.7748590856471402e-06, + "loss": 0.0145, + "step": 7115 + }, + { + "epoch": 0.940119562704363, + "grad_norm": 0.18398360908031464, + "learning_rate": 1.7670720178464716e-06, + "loss": 0.0117, + "step": 7116 + }, + { + "epoch": 0.9402516761898471, + "grad_norm": 0.12235773354768753, + "learning_rate": 1.7593019179474312e-06, + "loss": 0.0156, + "step": 7117 + }, + { + "epoch": 0.9403837896753311, + "grad_norm": 0.2906453013420105, + "learning_rate": 1.7515487872921566e-06, + "loss": 0.0212, + "step": 7118 + }, + { + "epoch": 0.9405159031608151, + "grad_norm": 0.17571309208869934, + "learning_rate": 1.743812627219854e-06, + "loss": 0.012, + "step": 7119 + }, + { + "epoch": 0.9406480166462992, + "grad_norm": 0.08763639628887177, + "learning_rate": 1.7360934390667881e-06, + "loss": 0.0128, + "step": 7120 + }, + { + "epoch": 0.9407801301317832, + "grad_norm": 0.1028781607747078, + "learning_rate": 1.728391224166326e-06, + "loss": 0.0083, + "step": 7121 + }, + { + "epoch": 0.9409122436172672, + "grad_norm": 0.16966675221920013, + "learning_rate": 1.7207059838488582e-06, + "loss": 0.0172, + "step": 7122 + }, + { + "epoch": 0.9410443571027512, + "grad_norm": 0.17335392534732819, + "learning_rate": 1.713037719441879e-06, + "loss": 0.0103, + "step": 7123 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.13844364881515503, + "learning_rate": 1.7053864322699398e-06, + "loss": 0.0145, + "step": 7124 + }, + { + "epoch": 0.9413085840737193, + "grad_norm": 0.1319582760334015, + "learning_rate": 1.69775212365465e-06, + "loss": 0.0076, + "step": 7125 + }, + { + "epoch": 0.9414406975592033, + "grad_norm": 0.12818774580955505, + "learning_rate": 1.6901347949146996e-06, + "loss": 0.014, + "step": 7126 + }, + { + "epoch": 0.9415728110446874, + "grad_norm": 0.20910701155662537, + "learning_rate": 1.6825344473658355e-06, + "loss": 0.0234, + "step": 7127 + }, + { + "epoch": 0.9417049245301714, + "grad_norm": 0.17381620407104492, + "learning_rate": 1.6749510823208748e-06, + "loss": 0.0236, + "step": 7128 + }, + { + "epoch": 0.9418370380156554, + "grad_norm": 0.1289311945438385, + "learning_rate": 1.6673847010897025e-06, + "loss": 0.0159, + "step": 7129 + }, + { + "epoch": 0.9419691515011395, + "grad_norm": 0.09960731118917465, + "learning_rate": 1.6598353049792736e-06, + "loss": 0.0044, + "step": 7130 + }, + { + "epoch": 0.9421012649866235, + "grad_norm": 0.12561599910259247, + "learning_rate": 1.6523028952936003e-06, + "loss": 0.0155, + "step": 7131 + }, + { + "epoch": 0.9422333784721075, + "grad_norm": 0.15394242107868195, + "learning_rate": 1.6447874733337753e-06, + "loss": 0.0131, + "step": 7132 + }, + { + "epoch": 0.9423654919575916, + "grad_norm": 0.1523938775062561, + "learning_rate": 1.637289040397938e-06, + "loss": 0.0175, + "step": 7133 + }, + { + "epoch": 0.9424976054430756, + "grad_norm": 0.18024346232414246, + "learning_rate": 1.6298075977812966e-06, + "loss": 0.0217, + "step": 7134 + }, + { + "epoch": 0.9426297189285596, + "grad_norm": 0.22924868762493134, + "learning_rate": 1.6223431467761396e-06, + "loss": 0.0294, + "step": 7135 + }, + { + "epoch": 0.9427618324140437, + "grad_norm": 0.13143298029899597, + "learning_rate": 1.6148956886718246e-06, + "loss": 0.0139, + "step": 7136 + }, + { + "epoch": 0.9428939458995277, + "grad_norm": 0.12749572098255157, + "learning_rate": 1.607465224754734e-06, + "loss": 0.0143, + "step": 7137 + }, + { + "epoch": 0.9430260593850117, + "grad_norm": 0.18467861413955688, + "learning_rate": 1.6000517563083628e-06, + "loss": 0.0176, + "step": 7138 + }, + { + "epoch": 0.9431581728704957, + "grad_norm": 0.25325989723205566, + "learning_rate": 1.5926552846132315e-06, + "loss": 0.015, + "step": 7139 + }, + { + "epoch": 0.9432902863559798, + "grad_norm": 0.2042577713727951, + "learning_rate": 1.5852758109469623e-06, + "loss": 0.0126, + "step": 7140 + }, + { + "epoch": 0.9434223998414638, + "grad_norm": 0.10952811688184738, + "learning_rate": 1.5779133365841915e-06, + "loss": 0.0081, + "step": 7141 + }, + { + "epoch": 0.9435545133269478, + "grad_norm": 0.13215503096580505, + "learning_rate": 1.570567862796679e-06, + "loss": 0.012, + "step": 7142 + }, + { + "epoch": 0.9436866268124319, + "grad_norm": 0.13652601838111877, + "learning_rate": 1.5632393908532106e-06, + "loss": 0.0116, + "step": 7143 + }, + { + "epoch": 0.9438187402979159, + "grad_norm": 0.3904765546321869, + "learning_rate": 1.5559279220196288e-06, + "loss": 0.0062, + "step": 7144 + }, + { + "epoch": 0.9439508537833999, + "grad_norm": 0.13622334599494934, + "learning_rate": 1.548633457558868e-06, + "loss": 0.0093, + "step": 7145 + }, + { + "epoch": 0.944082967268884, + "grad_norm": 0.13196682929992676, + "learning_rate": 1.541355998730909e-06, + "loss": 0.0114, + "step": 7146 + }, + { + "epoch": 0.944215080754368, + "grad_norm": 0.14724312722682953, + "learning_rate": 1.5340955467927909e-06, + "loss": 0.0174, + "step": 7147 + }, + { + "epoch": 0.944347194239852, + "grad_norm": 0.15435270965099335, + "learning_rate": 1.5268521029986104e-06, + "loss": 0.0167, + "step": 7148 + }, + { + "epoch": 0.9444793077253361, + "grad_norm": 0.15787175297737122, + "learning_rate": 1.5196256685995557e-06, + "loss": 0.011, + "step": 7149 + }, + { + "epoch": 0.9446114212108201, + "grad_norm": 0.14526408910751343, + "learning_rate": 1.5124162448438728e-06, + "loss": 0.0161, + "step": 7150 + }, + { + "epoch": 0.9447435346963041, + "grad_norm": 0.23147417604923248, + "learning_rate": 1.5052238329768099e-06, + "loss": 0.0166, + "step": 7151 + }, + { + "epoch": 0.9448756481817882, + "grad_norm": 0.07740113884210587, + "learning_rate": 1.4980484342407507e-06, + "loss": 0.0056, + "step": 7152 + }, + { + "epoch": 0.9450077616672722, + "grad_norm": 0.14792416989803314, + "learning_rate": 1.4908900498751155e-06, + "loss": 0.0118, + "step": 7153 + }, + { + "epoch": 0.9451398751527562, + "grad_norm": 0.1700582653284073, + "learning_rate": 1.4837486811163704e-06, + "loss": 0.0117, + "step": 7154 + }, + { + "epoch": 0.9452719886382402, + "grad_norm": 0.33778321743011475, + "learning_rate": 1.4766243291980507e-06, + "loss": 0.015, + "step": 7155 + }, + { + "epoch": 0.9454041021237243, + "grad_norm": 0.30752959847450256, + "learning_rate": 1.4695169953507614e-06, + "loss": 0.012, + "step": 7156 + }, + { + "epoch": 0.9455362156092083, + "grad_norm": 0.15177850425243378, + "learning_rate": 1.4624266808021647e-06, + "loss": 0.0217, + "step": 7157 + }, + { + "epoch": 0.9456683290946923, + "grad_norm": 0.22389481961727142, + "learning_rate": 1.4553533867769697e-06, + "loss": 0.0226, + "step": 7158 + }, + { + "epoch": 0.9458004425801764, + "grad_norm": 0.21998845040798187, + "learning_rate": 1.4482971144969547e-06, + "loss": 0.0143, + "step": 7159 + }, + { + "epoch": 0.9459325560656604, + "grad_norm": 0.16826555132865906, + "learning_rate": 1.4412578651809894e-06, + "loss": 0.0191, + "step": 7160 + }, + { + "epoch": 0.9460646695511444, + "grad_norm": 0.122434601187706, + "learning_rate": 1.434235640044923e-06, + "loss": 0.0108, + "step": 7161 + }, + { + "epoch": 0.9461967830366285, + "grad_norm": 0.13651913404464722, + "learning_rate": 1.4272304403017523e-06, + "loss": 0.0163, + "step": 7162 + }, + { + "epoch": 0.9463288965221125, + "grad_norm": 0.20436081290245056, + "learning_rate": 1.4202422671614647e-06, + "loss": 0.0211, + "step": 7163 + }, + { + "epoch": 0.9464610100075965, + "grad_norm": 0.25356408953666687, + "learning_rate": 1.4132711218311723e-06, + "loss": 0.023, + "step": 7164 + }, + { + "epoch": 0.9465931234930806, + "grad_norm": 0.23628415167331696, + "learning_rate": 1.4063170055149788e-06, + "loss": 0.0172, + "step": 7165 + }, + { + "epoch": 0.9467252369785646, + "grad_norm": 0.16951411962509155, + "learning_rate": 1.399379919414101e-06, + "loss": 0.0148, + "step": 7166 + }, + { + "epoch": 0.9468573504640486, + "grad_norm": 0.11124689131975174, + "learning_rate": 1.3924598647267694e-06, + "loss": 0.013, + "step": 7167 + }, + { + "epoch": 0.9469894639495327, + "grad_norm": 0.1717250794172287, + "learning_rate": 1.3855568426483057e-06, + "loss": 0.0145, + "step": 7168 + }, + { + "epoch": 0.9471215774350167, + "grad_norm": 0.16048182547092438, + "learning_rate": 1.3786708543710781e-06, + "loss": 0.0175, + "step": 7169 + }, + { + "epoch": 0.9472536909205007, + "grad_norm": 0.09600821882486343, + "learning_rate": 1.3718019010845129e-06, + "loss": 0.0054, + "step": 7170 + }, + { + "epoch": 0.9473858044059847, + "grad_norm": 0.09572196006774902, + "learning_rate": 1.3649499839750946e-06, + "loss": 0.0147, + "step": 7171 + }, + { + "epoch": 0.9475179178914688, + "grad_norm": 0.08443471044301987, + "learning_rate": 1.358115104226343e-06, + "loss": 0.0039, + "step": 7172 + }, + { + "epoch": 0.9476500313769528, + "grad_norm": 0.15685054659843445, + "learning_rate": 1.3512972630188914e-06, + "loss": 0.0158, + "step": 7173 + }, + { + "epoch": 0.9477821448624368, + "grad_norm": 0.16958138346672058, + "learning_rate": 1.3444964615303646e-06, + "loss": 0.0189, + "step": 7174 + }, + { + "epoch": 0.9479142583479209, + "grad_norm": 0.08407772332429886, + "learning_rate": 1.3377127009354895e-06, + "loss": 0.0086, + "step": 7175 + }, + { + "epoch": 0.9480463718334049, + "grad_norm": 0.16646800935268402, + "learning_rate": 1.3309459824060288e-06, + "loss": 0.012, + "step": 7176 + }, + { + "epoch": 0.9481784853188889, + "grad_norm": 0.08961289376020432, + "learning_rate": 1.3241963071108031e-06, + "loss": 0.0064, + "step": 7177 + }, + { + "epoch": 0.948310598804373, + "grad_norm": 0.16918633878231049, + "learning_rate": 1.3174636762157133e-06, + "loss": 0.0162, + "step": 7178 + }, + { + "epoch": 0.948442712289857, + "grad_norm": 0.32409965991973877, + "learning_rate": 1.3107480908836622e-06, + "loss": 0.0322, + "step": 7179 + }, + { + "epoch": 0.948574825775341, + "grad_norm": 0.15334896743297577, + "learning_rate": 1.3040495522746664e-06, + "loss": 0.0117, + "step": 7180 + }, + { + "epoch": 0.948706939260825, + "grad_norm": 0.12855543196201324, + "learning_rate": 1.2973680615457672e-06, + "loss": 0.0073, + "step": 7181 + }, + { + "epoch": 0.9488390527463091, + "grad_norm": 0.11828119307756424, + "learning_rate": 1.2907036198510636e-06, + "loss": 0.0116, + "step": 7182 + }, + { + "epoch": 0.9489711662317931, + "grad_norm": 0.14522184431552887, + "learning_rate": 1.2840562283417013e-06, + "loss": 0.0156, + "step": 7183 + }, + { + "epoch": 0.9491032797172771, + "grad_norm": 0.2941734194755554, + "learning_rate": 1.2774258881659174e-06, + "loss": 0.0224, + "step": 7184 + }, + { + "epoch": 0.9492353932027612, + "grad_norm": 0.19937515258789062, + "learning_rate": 1.2708126004689735e-06, + "loss": 0.0257, + "step": 7185 + }, + { + "epoch": 0.9493675066882452, + "grad_norm": 0.1287490576505661, + "learning_rate": 1.2642163663931895e-06, + "loss": 0.0152, + "step": 7186 + }, + { + "epoch": 0.9494996201737292, + "grad_norm": 0.22921793162822723, + "learning_rate": 1.2576371870779202e-06, + "loss": 0.0188, + "step": 7187 + }, + { + "epoch": 0.9496317336592133, + "grad_norm": 0.289598286151886, + "learning_rate": 1.2510750636596346e-06, + "loss": 0.0096, + "step": 7188 + }, + { + "epoch": 0.9497638471446973, + "grad_norm": 0.12793676555156708, + "learning_rate": 1.2445299972717817e-06, + "loss": 0.0156, + "step": 7189 + }, + { + "epoch": 0.9498959606301813, + "grad_norm": 0.10357484966516495, + "learning_rate": 1.2380019890449124e-06, + "loss": 0.0083, + "step": 7190 + }, + { + "epoch": 0.9500280741156654, + "grad_norm": 0.18829475343227386, + "learning_rate": 1.231491040106636e-06, + "loss": 0.0146, + "step": 7191 + }, + { + "epoch": 0.9501601876011494, + "grad_norm": 0.25576236844062805, + "learning_rate": 1.224997151581564e-06, + "loss": 0.0165, + "step": 7192 + }, + { + "epoch": 0.9502923010866334, + "grad_norm": 0.15136855840682983, + "learning_rate": 1.2185203245914212e-06, + "loss": 0.0171, + "step": 7193 + }, + { + "epoch": 0.9504244145721175, + "grad_norm": 0.13044820725917816, + "learning_rate": 1.2120605602549462e-06, + "loss": 0.0137, + "step": 7194 + }, + { + "epoch": 0.9505565280576015, + "grad_norm": 0.1385304480791092, + "learning_rate": 1.2056178596879352e-06, + "loss": 0.0137, + "step": 7195 + }, + { + "epoch": 0.9506886415430855, + "grad_norm": 0.13936810195446014, + "learning_rate": 1.199192224003265e-06, + "loss": 0.0106, + "step": 7196 + }, + { + "epoch": 0.9508207550285696, + "grad_norm": 0.19669145345687866, + "learning_rate": 1.1927836543108251e-06, + "loss": 0.0145, + "step": 7197 + }, + { + "epoch": 0.9509528685140536, + "grad_norm": 0.4017046391963959, + "learning_rate": 1.1863921517175968e-06, + "loss": 0.0211, + "step": 7198 + }, + { + "epoch": 0.9510849819995376, + "grad_norm": 0.21765701472759247, + "learning_rate": 1.1800177173275639e-06, + "loss": 0.0172, + "step": 7199 + }, + { + "epoch": 0.9512170954850216, + "grad_norm": 0.22565911710262299, + "learning_rate": 1.173660352241812e-06, + "loss": 0.0187, + "step": 7200 + }, + { + "epoch": 0.9513492089705057, + "grad_norm": 0.33037158846855164, + "learning_rate": 1.167320057558452e-06, + "loss": 0.0255, + "step": 7201 + }, + { + "epoch": 0.9514813224559897, + "grad_norm": 0.16202698647975922, + "learning_rate": 1.1609968343726519e-06, + "loss": 0.0137, + "step": 7202 + }, + { + "epoch": 0.9516134359414737, + "grad_norm": 0.10075455904006958, + "learning_rate": 1.1546906837766268e-06, + "loss": 0.0079, + "step": 7203 + }, + { + "epoch": 0.9517455494269578, + "grad_norm": 0.24736060202121735, + "learning_rate": 1.1484016068596393e-06, + "loss": 0.0266, + "step": 7204 + }, + { + "epoch": 0.9518776629124418, + "grad_norm": 0.11703985184431076, + "learning_rate": 1.1421296047080421e-06, + "loss": 0.0125, + "step": 7205 + }, + { + "epoch": 0.9520097763979258, + "grad_norm": 0.1756127029657364, + "learning_rate": 1.1358746784051687e-06, + "loss": 0.0136, + "step": 7206 + }, + { + "epoch": 0.9521418898834099, + "grad_norm": 0.10668937116861343, + "learning_rate": 1.129636829031444e-06, + "loss": 0.013, + "step": 7207 + }, + { + "epoch": 0.9522740033688939, + "grad_norm": 0.11571614444255829, + "learning_rate": 1.1234160576643726e-06, + "loss": 0.0141, + "step": 7208 + }, + { + "epoch": 0.9524061168543779, + "grad_norm": 0.12110844999551773, + "learning_rate": 1.1172123653784394e-06, + "loss": 0.012, + "step": 7209 + }, + { + "epoch": 0.952538230339862, + "grad_norm": 0.1357802003622055, + "learning_rate": 1.111025753245243e-06, + "loss": 0.0107, + "step": 7210 + }, + { + "epoch": 0.952670343825346, + "grad_norm": 0.1572360098361969, + "learning_rate": 1.1048562223333835e-06, + "loss": 0.0132, + "step": 7211 + }, + { + "epoch": 0.95280245731083, + "grad_norm": 0.28157058358192444, + "learning_rate": 1.0987037737085536e-06, + "loss": 0.0285, + "step": 7212 + }, + { + "epoch": 0.952934570796314, + "grad_norm": 0.18760952353477478, + "learning_rate": 1.0925684084334476e-06, + "loss": 0.0232, + "step": 7213 + }, + { + "epoch": 0.9530666842817981, + "grad_norm": 0.15009918808937073, + "learning_rate": 1.0864501275678618e-06, + "loss": 0.0149, + "step": 7214 + }, + { + "epoch": 0.9531987977672821, + "grad_norm": 0.2483643740415573, + "learning_rate": 1.0803489321685957e-06, + "loss": 0.0253, + "step": 7215 + }, + { + "epoch": 0.9533309112527661, + "grad_norm": 0.1848488599061966, + "learning_rate": 1.074264823289528e-06, + "loss": 0.0155, + "step": 7216 + }, + { + "epoch": 0.9534630247382502, + "grad_norm": 0.14977069199085236, + "learning_rate": 1.0681978019815741e-06, + "loss": 0.0173, + "step": 7217 + }, + { + "epoch": 0.9535951382237342, + "grad_norm": 0.1266462504863739, + "learning_rate": 1.0621478692926845e-06, + "loss": 0.0125, + "step": 7218 + }, + { + "epoch": 0.9537272517092182, + "grad_norm": 0.20076677203178406, + "learning_rate": 1.0561150262678899e-06, + "loss": 0.0125, + "step": 7219 + }, + { + "epoch": 0.9538593651947023, + "grad_norm": 0.13278064131736755, + "learning_rate": 1.0500992739492454e-06, + "loss": 0.0137, + "step": 7220 + }, + { + "epoch": 0.9539914786801863, + "grad_norm": 0.10803290456533432, + "learning_rate": 1.0441006133758536e-06, + "loss": 0.0094, + "step": 7221 + }, + { + "epoch": 0.9541235921656703, + "grad_norm": 0.11469370126724243, + "learning_rate": 1.0381190455838852e-06, + "loss": 0.0134, + "step": 7222 + }, + { + "epoch": 0.9542557056511544, + "grad_norm": 0.13785609602928162, + "learning_rate": 1.032154571606525e-06, + "loss": 0.0166, + "step": 7223 + }, + { + "epoch": 0.9543878191366384, + "grad_norm": 0.15190643072128296, + "learning_rate": 1.0262071924740268e-06, + "loss": 0.0152, + "step": 7224 + }, + { + "epoch": 0.9545199326221224, + "grad_norm": 0.1632893681526184, + "learning_rate": 1.0202769092137022e-06, + "loss": 0.0179, + "step": 7225 + }, + { + "epoch": 0.9546520461076065, + "grad_norm": 0.1495528668165207, + "learning_rate": 1.0143637228498981e-06, + "loss": 0.0126, + "step": 7226 + }, + { + "epoch": 0.9547841595930905, + "grad_norm": 0.15436944365501404, + "learning_rate": 1.0084676344039977e-06, + "loss": 0.0095, + "step": 7227 + }, + { + "epoch": 0.9549162730785745, + "grad_norm": 0.1181582435965538, + "learning_rate": 1.002588644894431e-06, + "loss": 0.0157, + "step": 7228 + }, + { + "epoch": 0.9550483865640585, + "grad_norm": 0.09800876677036285, + "learning_rate": 9.967267553367078e-07, + "loss": 0.0116, + "step": 7229 + }, + { + "epoch": 0.9551805000495426, + "grad_norm": 0.21567225456237793, + "learning_rate": 9.9088196674334e-07, + "loss": 0.0196, + "step": 7230 + }, + { + "epoch": 0.9553126135350266, + "grad_norm": 0.13500306010246277, + "learning_rate": 9.850542801239093e-07, + "loss": 0.0104, + "step": 7231 + }, + { + "epoch": 0.9554447270205106, + "grad_norm": 0.1280946284532547, + "learning_rate": 9.792436964850439e-07, + "loss": 0.0076, + "step": 7232 + }, + { + "epoch": 0.9555768405059947, + "grad_norm": 0.11754422634840012, + "learning_rate": 9.734502168304183e-07, + "loss": 0.0148, + "step": 7233 + }, + { + "epoch": 0.9557089539914787, + "grad_norm": 0.14567334949970245, + "learning_rate": 9.676738421607434e-07, + "loss": 0.0134, + "step": 7234 + }, + { + "epoch": 0.9558410674769627, + "grad_norm": 0.11744000762701035, + "learning_rate": 9.619145734737655e-07, + "loss": 0.0132, + "step": 7235 + }, + { + "epoch": 0.9559731809624468, + "grad_norm": 0.26282408833503723, + "learning_rate": 9.56172411764311e-07, + "loss": 0.0333, + "step": 7236 + }, + { + "epoch": 0.9561052944479308, + "grad_norm": 0.11459115147590637, + "learning_rate": 9.504473580242202e-07, + "loss": 0.0089, + "step": 7237 + }, + { + "epoch": 0.9562374079334148, + "grad_norm": 0.09463014453649521, + "learning_rate": 9.447394132423903e-07, + "loss": 0.0066, + "step": 7238 + }, + { + "epoch": 0.9563695214188989, + "grad_norm": 0.14389050006866455, + "learning_rate": 9.390485784047664e-07, + "loss": 0.0131, + "step": 7239 + }, + { + "epoch": 0.9565016349043829, + "grad_norm": 0.12472982704639435, + "learning_rate": 9.333748544943288e-07, + "loss": 0.0179, + "step": 7240 + }, + { + "epoch": 0.9566337483898669, + "grad_norm": 0.170174241065979, + "learning_rate": 9.277182424911158e-07, + "loss": 0.0173, + "step": 7241 + }, + { + "epoch": 0.956765861875351, + "grad_norm": 0.19937077164649963, + "learning_rate": 9.2207874337219e-07, + "loss": 0.0218, + "step": 7242 + }, + { + "epoch": 0.956897975360835, + "grad_norm": 0.45541685819625854, + "learning_rate": 9.164563581116725e-07, + "loss": 0.0117, + "step": 7243 + }, + { + "epoch": 0.957030088846319, + "grad_norm": 0.15659266710281372, + "learning_rate": 9.108510876807308e-07, + "loss": 0.0149, + "step": 7244 + }, + { + "epoch": 0.957162202331803, + "grad_norm": 0.11466649919748306, + "learning_rate": 9.052629330475681e-07, + "loss": 0.0092, + "step": 7245 + }, + { + "epoch": 0.9572943158172871, + "grad_norm": 0.1386268436908722, + "learning_rate": 8.996918951774236e-07, + "loss": 0.0136, + "step": 7246 + }, + { + "epoch": 0.9574264293027711, + "grad_norm": 0.1966053545475006, + "learning_rate": 8.941379750326051e-07, + "loss": 0.0187, + "step": 7247 + }, + { + "epoch": 0.9575585427882551, + "grad_norm": 0.15060442686080933, + "learning_rate": 8.886011735724453e-07, + "loss": 0.0153, + "step": 7248 + }, + { + "epoch": 0.9576906562737392, + "grad_norm": 0.1787412166595459, + "learning_rate": 8.830814917533125e-07, + "loss": 0.0172, + "step": 7249 + }, + { + "epoch": 0.9578227697592232, + "grad_norm": 0.14520373940467834, + "learning_rate": 8.775789305286442e-07, + "loss": 0.0098, + "step": 7250 + }, + { + "epoch": 0.9579548832447072, + "grad_norm": 0.165212482213974, + "learning_rate": 8.720934908488909e-07, + "loss": 0.0159, + "step": 7251 + }, + { + "epoch": 0.9580869967301913, + "grad_norm": 0.12482784688472748, + "learning_rate": 8.666251736615616e-07, + "loss": 0.0045, + "step": 7252 + }, + { + "epoch": 0.9582191102156753, + "grad_norm": 0.1122523620724678, + "learning_rate": 8.611739799112229e-07, + "loss": 0.011, + "step": 7253 + }, + { + "epoch": 0.9583512237011593, + "grad_norm": 0.10343725234270096, + "learning_rate": 8.557399105394437e-07, + "loss": 0.0084, + "step": 7254 + }, + { + "epoch": 0.9584833371866434, + "grad_norm": 0.10364764928817749, + "learning_rate": 8.503229664848733e-07, + "loss": 0.0088, + "step": 7255 + }, + { + "epoch": 0.9586154506721274, + "grad_norm": 0.19793720543384552, + "learning_rate": 8.449231486831744e-07, + "loss": 0.0213, + "step": 7256 + }, + { + "epoch": 0.9587475641576114, + "grad_norm": 0.1351151317358017, + "learning_rate": 8.395404580670785e-07, + "loss": 0.0112, + "step": 7257 + }, + { + "epoch": 0.9588796776430955, + "grad_norm": 0.16011805832386017, + "learning_rate": 8.34174895566342e-07, + "loss": 0.0127, + "step": 7258 + }, + { + "epoch": 0.9590117911285795, + "grad_norm": 0.16073232889175415, + "learning_rate": 8.288264621077457e-07, + "loss": 0.0163, + "step": 7259 + }, + { + "epoch": 0.9591439046140635, + "grad_norm": 0.14454664289951324, + "learning_rate": 8.234951586151729e-07, + "loss": 0.0123, + "step": 7260 + }, + { + "epoch": 0.9592760180995475, + "grad_norm": 0.12177897244691849, + "learning_rate": 8.181809860094647e-07, + "loss": 0.0133, + "step": 7261 + }, + { + "epoch": 0.9594081315850316, + "grad_norm": 0.13024061918258667, + "learning_rate": 8.128839452085535e-07, + "loss": 0.0152, + "step": 7262 + }, + { + "epoch": 0.9595402450705156, + "grad_norm": 0.13621719181537628, + "learning_rate": 8.076040371274296e-07, + "loss": 0.0227, + "step": 7263 + }, + { + "epoch": 0.9596723585559996, + "grad_norm": 0.13376359641551971, + "learning_rate": 8.023412626780746e-07, + "loss": 0.0107, + "step": 7264 + }, + { + "epoch": 0.9598044720414837, + "grad_norm": 0.12808959186077118, + "learning_rate": 7.970956227695392e-07, + "loss": 0.0117, + "step": 7265 + }, + { + "epoch": 0.9599365855269677, + "grad_norm": 0.2006939947605133, + "learning_rate": 7.918671183079096e-07, + "loss": 0.0215, + "step": 7266 + }, + { + "epoch": 0.9600686990124517, + "grad_norm": 0.08638248592615128, + "learning_rate": 7.86655750196319e-07, + "loss": 0.0047, + "step": 7267 + }, + { + "epoch": 0.9602008124979358, + "grad_norm": 0.10825937241315842, + "learning_rate": 7.814615193349251e-07, + "loss": 0.0102, + "step": 7268 + }, + { + "epoch": 0.9603329259834198, + "grad_norm": 0.1509263962507248, + "learning_rate": 7.762844266209435e-07, + "loss": 0.0131, + "step": 7269 + }, + { + "epoch": 0.9604650394689038, + "grad_norm": 0.0950889065861702, + "learning_rate": 7.711244729486034e-07, + "loss": 0.0109, + "step": 7270 + }, + { + "epoch": 0.9605971529543879, + "grad_norm": 0.10586385428905487, + "learning_rate": 7.65981659209214e-07, + "loss": 0.0131, + "step": 7271 + }, + { + "epoch": 0.9607292664398719, + "grad_norm": 0.13581392168998718, + "learning_rate": 7.608559862910758e-07, + "loss": 0.0134, + "step": 7272 + }, + { + "epoch": 0.9608613799253559, + "grad_norm": 0.1681486815214157, + "learning_rate": 7.557474550795695e-07, + "loss": 0.0215, + "step": 7273 + }, + { + "epoch": 0.96099349341084, + "grad_norm": 0.09689151495695114, + "learning_rate": 7.506560664571005e-07, + "loss": 0.006, + "step": 7274 + }, + { + "epoch": 0.961125606896324, + "grad_norm": 0.11082078516483307, + "learning_rate": 7.455818213030985e-07, + "loss": 0.0089, + "step": 7275 + }, + { + "epoch": 0.961257720381808, + "grad_norm": 0.18737493455410004, + "learning_rate": 7.405247204940513e-07, + "loss": 0.0209, + "step": 7276 + }, + { + "epoch": 0.961389833867292, + "grad_norm": 0.18590763211250305, + "learning_rate": 7.354847649034713e-07, + "loss": 0.0171, + "step": 7277 + }, + { + "epoch": 0.9615219473527761, + "grad_norm": 0.2514604926109314, + "learning_rate": 7.304619554019288e-07, + "loss": 0.0095, + "step": 7278 + }, + { + "epoch": 0.9616540608382601, + "grad_norm": 0.10440555959939957, + "learning_rate": 7.254562928570074e-07, + "loss": 0.0129, + "step": 7279 + }, + { + "epoch": 0.9617861743237441, + "grad_norm": 0.13339920341968536, + "learning_rate": 7.204677781333602e-07, + "loss": 0.0116, + "step": 7280 + }, + { + "epoch": 0.9619182878092282, + "grad_norm": 0.19956082105636597, + "learning_rate": 7.154964120926422e-07, + "loss": 0.0078, + "step": 7281 + }, + { + "epoch": 0.9620504012947122, + "grad_norm": 0.1739201694726944, + "learning_rate": 7.105421955935665e-07, + "loss": 0.0224, + "step": 7282 + }, + { + "epoch": 0.9621825147801962, + "grad_norm": 0.25007346272468567, + "learning_rate": 7.056051294918819e-07, + "loss": 0.0255, + "step": 7283 + }, + { + "epoch": 0.9623146282656803, + "grad_norm": 0.1323181390762329, + "learning_rate": 7.006852146403842e-07, + "loss": 0.0201, + "step": 7284 + }, + { + "epoch": 0.9624467417511643, + "grad_norm": 0.13154102861881256, + "learning_rate": 6.957824518888822e-07, + "loss": 0.0172, + "step": 7285 + }, + { + "epoch": 0.9625788552366483, + "grad_norm": 0.15256892144680023, + "learning_rate": 6.908968420842433e-07, + "loss": 0.0268, + "step": 7286 + }, + { + "epoch": 0.9627109687221324, + "grad_norm": 0.17247919738292694, + "learning_rate": 6.860283860703698e-07, + "loss": 0.0081, + "step": 7287 + }, + { + "epoch": 0.9628430822076164, + "grad_norm": 0.22937926650047302, + "learning_rate": 6.811770846882004e-07, + "loss": 0.0141, + "step": 7288 + }, + { + "epoch": 0.9629751956931004, + "grad_norm": 0.14299964904785156, + "learning_rate": 6.76342938775687e-07, + "loss": 0.0107, + "step": 7289 + }, + { + "epoch": 0.9631073091785844, + "grad_norm": 0.156651571393013, + "learning_rate": 6.715259491678505e-07, + "loss": 0.0064, + "step": 7290 + }, + { + "epoch": 0.9632394226640685, + "grad_norm": 0.12845245003700256, + "learning_rate": 6.667261166967365e-07, + "loss": 0.0125, + "step": 7291 + }, + { + "epoch": 0.9633715361495525, + "grad_norm": 0.13767504692077637, + "learning_rate": 6.619434421914262e-07, + "loss": 0.0107, + "step": 7292 + }, + { + "epoch": 0.9635036496350365, + "grad_norm": 0.14422591030597687, + "learning_rate": 6.571779264780364e-07, + "loss": 0.0162, + "step": 7293 + }, + { + "epoch": 0.9636357631205206, + "grad_norm": 0.178952157497406, + "learning_rate": 6.524295703797201e-07, + "loss": 0.0137, + "step": 7294 + }, + { + "epoch": 0.9637678766060046, + "grad_norm": 0.1274910867214203, + "learning_rate": 6.476983747166654e-07, + "loss": 0.0126, + "step": 7295 + }, + { + "epoch": 0.9638999900914886, + "grad_norm": 0.07097754627466202, + "learning_rate": 6.429843403060964e-07, + "loss": 0.0037, + "step": 7296 + }, + { + "epoch": 0.9640321035769727, + "grad_norm": 0.12936192750930786, + "learning_rate": 6.382874679622841e-07, + "loss": 0.0225, + "step": 7297 + }, + { + "epoch": 0.9641642170624567, + "grad_norm": 0.11343681067228317, + "learning_rate": 6.336077584965128e-07, + "loss": 0.0085, + "step": 7298 + }, + { + "epoch": 0.9642963305479407, + "grad_norm": 0.15607260167598724, + "learning_rate": 6.289452127171247e-07, + "loss": 0.0163, + "step": 7299 + }, + { + "epoch": 0.9644284440334248, + "grad_norm": 0.13873155415058136, + "learning_rate": 6.242998314294757e-07, + "loss": 0.0114, + "step": 7300 + }, + { + "epoch": 0.9645605575189088, + "grad_norm": 0.14331649243831635, + "learning_rate": 6.196716154359794e-07, + "loss": 0.0181, + "step": 7301 + }, + { + "epoch": 0.9646926710043928, + "grad_norm": 0.18304325640201569, + "learning_rate": 6.150605655360853e-07, + "loss": 0.0147, + "step": 7302 + }, + { + "epoch": 0.9648247844898769, + "grad_norm": 0.137273371219635, + "learning_rate": 6.10466682526234e-07, + "loss": 0.014, + "step": 7303 + }, + { + "epoch": 0.9649568979753609, + "grad_norm": 0.13832210004329681, + "learning_rate": 6.058899671999574e-07, + "loss": 0.0137, + "step": 7304 + }, + { + "epoch": 0.9650890114608449, + "grad_norm": 0.19359838962554932, + "learning_rate": 6.013304203477788e-07, + "loss": 0.0155, + "step": 7305 + }, + { + "epoch": 0.965221124946329, + "grad_norm": 0.2560751736164093, + "learning_rate": 5.967880427573014e-07, + "loss": 0.0218, + "step": 7306 + }, + { + "epoch": 0.965353238431813, + "grad_norm": 0.33832573890686035, + "learning_rate": 5.922628352131087e-07, + "loss": 0.0129, + "step": 7307 + }, + { + "epoch": 0.965485351917297, + "grad_norm": 0.213003471493721, + "learning_rate": 5.877547984968646e-07, + "loss": 0.0139, + "step": 7308 + }, + { + "epoch": 0.965617465402781, + "grad_norm": 0.3071267008781433, + "learning_rate": 5.832639333872458e-07, + "loss": 0.0176, + "step": 7309 + }, + { + "epoch": 0.9657495788882651, + "grad_norm": 0.2311631441116333, + "learning_rate": 5.787902406599543e-07, + "loss": 0.0189, + "step": 7310 + }, + { + "epoch": 0.9658816923737491, + "grad_norm": 0.10062497109174728, + "learning_rate": 5.743337210877386e-07, + "loss": 0.0114, + "step": 7311 + }, + { + "epoch": 0.9660138058592331, + "grad_norm": 0.18390169739723206, + "learning_rate": 5.69894375440394e-07, + "loss": 0.0147, + "step": 7312 + }, + { + "epoch": 0.9661459193447172, + "grad_norm": 0.17044439911842346, + "learning_rate": 5.654722044847183e-07, + "loss": 0.0166, + "step": 7313 + }, + { + "epoch": 0.9662780328302012, + "grad_norm": 0.0919048935174942, + "learning_rate": 5.61067208984567e-07, + "loss": 0.0084, + "step": 7314 + }, + { + "epoch": 0.9664101463156852, + "grad_norm": 0.14148785173892975, + "learning_rate": 5.566793897008204e-07, + "loss": 0.0136, + "step": 7315 + }, + { + "epoch": 0.9665422598011693, + "grad_norm": 0.2157713621854782, + "learning_rate": 5.523087473913835e-07, + "loss": 0.0137, + "step": 7316 + }, + { + "epoch": 0.9666743732866533, + "grad_norm": 0.24128608405590057, + "learning_rate": 5.479552828112189e-07, + "loss": 0.0228, + "step": 7317 + }, + { + "epoch": 0.9668064867721373, + "grad_norm": 0.12320316582918167, + "learning_rate": 5.436189967122918e-07, + "loss": 0.0117, + "step": 7318 + }, + { + "epoch": 0.9669386002576214, + "grad_norm": 0.18456970155239105, + "learning_rate": 5.392998898436252e-07, + "loss": 0.0225, + "step": 7319 + }, + { + "epoch": 0.9670707137431054, + "grad_norm": 0.14801354706287384, + "learning_rate": 5.349979629512448e-07, + "loss": 0.018, + "step": 7320 + }, + { + "epoch": 0.9672028272285894, + "grad_norm": 0.13997134566307068, + "learning_rate": 5.307132167782558e-07, + "loss": 0.0202, + "step": 7321 + }, + { + "epoch": 0.9673349407140734, + "grad_norm": 0.22053630650043488, + "learning_rate": 5.264456520647554e-07, + "loss": 0.0282, + "step": 7322 + }, + { + "epoch": 0.9674670541995575, + "grad_norm": 0.1400284618139267, + "learning_rate": 5.22195269547876e-07, + "loss": 0.016, + "step": 7323 + }, + { + "epoch": 0.9675991676850415, + "grad_norm": 0.18816642463207245, + "learning_rate": 5.179620699617971e-07, + "loss": 0.0157, + "step": 7324 + }, + { + "epoch": 0.9677312811705255, + "grad_norm": 0.14045368134975433, + "learning_rate": 5.137460540377337e-07, + "loss": 0.0114, + "step": 7325 + }, + { + "epoch": 0.9678633946560095, + "grad_norm": 0.1565784364938736, + "learning_rate": 5.095472225039255e-07, + "loss": 0.0185, + "step": 7326 + }, + { + "epoch": 0.9679955081414935, + "grad_norm": 0.16238823533058167, + "learning_rate": 5.053655760856257e-07, + "loss": 0.0144, + "step": 7327 + }, + { + "epoch": 0.9681276216269775, + "grad_norm": 0.17575566470623016, + "learning_rate": 5.012011155051454e-07, + "loss": 0.0154, + "step": 7328 + }, + { + "epoch": 0.9682597351124616, + "grad_norm": 0.14301219582557678, + "learning_rate": 4.970538414818204e-07, + "loss": 0.0195, + "step": 7329 + }, + { + "epoch": 0.9683918485979456, + "grad_norm": 0.07853206992149353, + "learning_rate": 4.929237547320109e-07, + "loss": 0.0056, + "step": 7330 + }, + { + "epoch": 0.9685239620834296, + "grad_norm": 0.19317726790905, + "learning_rate": 4.888108559691018e-07, + "loss": 0.0185, + "step": 7331 + }, + { + "epoch": 0.9686560755689136, + "grad_norm": 0.20411786437034607, + "learning_rate": 4.84715145903536e-07, + "loss": 0.0211, + "step": 7332 + }, + { + "epoch": 0.9687881890543977, + "grad_norm": 0.17225292325019836, + "learning_rate": 4.806366252427697e-07, + "loss": 0.0199, + "step": 7333 + }, + { + "epoch": 0.9689203025398817, + "grad_norm": 0.10041002184152603, + "learning_rate": 4.7657529469128384e-07, + "loss": 0.0094, + "step": 7334 + }, + { + "epoch": 0.9690524160253657, + "grad_norm": 0.16155697405338287, + "learning_rate": 4.725311549505951e-07, + "loss": 0.0192, + "step": 7335 + }, + { + "epoch": 0.9691845295108498, + "grad_norm": 0.10414627939462662, + "learning_rate": 4.6850420671925575e-07, + "loss": 0.013, + "step": 7336 + }, + { + "epoch": 0.9693166429963338, + "grad_norm": 0.10860517621040344, + "learning_rate": 4.644944506928539e-07, + "loss": 0.0081, + "step": 7337 + }, + { + "epoch": 0.9694487564818178, + "grad_norm": 0.12890401482582092, + "learning_rate": 4.6050188756397994e-07, + "loss": 0.0102, + "step": 7338 + }, + { + "epoch": 0.9695808699673019, + "grad_norm": 0.21085602045059204, + "learning_rate": 4.565265180223044e-07, + "loss": 0.0153, + "step": 7339 + }, + { + "epoch": 0.9697129834527859, + "grad_norm": 0.1475398987531662, + "learning_rate": 4.525683427544669e-07, + "loss": 0.0129, + "step": 7340 + }, + { + "epoch": 0.9698450969382699, + "grad_norm": 0.184042826294899, + "learning_rate": 4.4862736244419835e-07, + "loss": 0.0088, + "step": 7341 + }, + { + "epoch": 0.969977210423754, + "grad_norm": 0.10526137053966522, + "learning_rate": 4.447035777721986e-07, + "loss": 0.0112, + "step": 7342 + }, + { + "epoch": 0.970109323909238, + "grad_norm": 0.12947724759578705, + "learning_rate": 4.407969894162589e-07, + "loss": 0.0129, + "step": 7343 + }, + { + "epoch": 0.970241437394722, + "grad_norm": 0.1230868250131607, + "learning_rate": 4.3690759805113944e-07, + "loss": 0.0107, + "step": 7344 + }, + { + "epoch": 0.970373550880206, + "grad_norm": 0.22553841769695282, + "learning_rate": 4.330354043486806e-07, + "loss": 0.0208, + "step": 7345 + }, + { + "epoch": 0.9705056643656901, + "grad_norm": 0.15463051199913025, + "learning_rate": 4.2918040897772513e-07, + "loss": 0.0145, + "step": 7346 + }, + { + "epoch": 0.9706377778511741, + "grad_norm": 0.18720471858978271, + "learning_rate": 4.253426126041515e-07, + "loss": 0.0074, + "step": 7347 + }, + { + "epoch": 0.9707698913366581, + "grad_norm": 0.1039755642414093, + "learning_rate": 4.215220158908628e-07, + "loss": 0.0122, + "step": 7348 + }, + { + "epoch": 0.9709020048221422, + "grad_norm": 0.14909744262695312, + "learning_rate": 4.177186194977978e-07, + "loss": 0.0144, + "step": 7349 + }, + { + "epoch": 0.9710341183076262, + "grad_norm": 0.1407427191734314, + "learning_rate": 4.139324240819309e-07, + "loss": 0.0137, + "step": 7350 + }, + { + "epoch": 0.9711662317931102, + "grad_norm": 0.16804887354373932, + "learning_rate": 4.1016343029725014e-07, + "loss": 0.0134, + "step": 7351 + }, + { + "epoch": 0.9712983452785943, + "grad_norm": 0.14374983310699463, + "learning_rate": 4.064116387947792e-07, + "loss": 0.0158, + "step": 7352 + }, + { + "epoch": 0.9714304587640783, + "grad_norm": 0.1527361273765564, + "learning_rate": 4.026770502225663e-07, + "loss": 0.016, + "step": 7353 + }, + { + "epoch": 0.9715625722495623, + "grad_norm": 0.1665458083152771, + "learning_rate": 3.989596652256955e-07, + "loss": 0.0139, + "step": 7354 + }, + { + "epoch": 0.9716946857350464, + "grad_norm": 0.12903252243995667, + "learning_rate": 3.9525948444627534e-07, + "loss": 0.0101, + "step": 7355 + }, + { + "epoch": 0.9718267992205304, + "grad_norm": 0.12322012335062027, + "learning_rate": 3.915765085234391e-07, + "loss": 0.0081, + "step": 7356 + }, + { + "epoch": 0.9719589127060144, + "grad_norm": 0.18705911934375763, + "learning_rate": 3.8791073809336666e-07, + "loss": 0.0124, + "step": 7357 + }, + { + "epoch": 0.9720910261914985, + "grad_norm": 0.12552902102470398, + "learning_rate": 3.842621737892294e-07, + "loss": 0.0141, + "step": 7358 + }, + { + "epoch": 0.9722231396769825, + "grad_norm": 0.17267630994319916, + "learning_rate": 3.806308162412564e-07, + "loss": 0.0108, + "step": 7359 + }, + { + "epoch": 0.9723552531624665, + "grad_norm": 0.11716732382774353, + "learning_rate": 3.770166660767016e-07, + "loss": 0.0121, + "step": 7360 + }, + { + "epoch": 0.9724873666479505, + "grad_norm": 0.2246181219816208, + "learning_rate": 3.734197239198434e-07, + "loss": 0.0234, + "step": 7361 + }, + { + "epoch": 0.9726194801334346, + "grad_norm": 0.11996744573116302, + "learning_rate": 3.698399903919847e-07, + "loss": 0.0098, + "step": 7362 + }, + { + "epoch": 0.9727515936189186, + "grad_norm": 0.1341700404882431, + "learning_rate": 3.662774661114421e-07, + "loss": 0.0111, + "step": 7363 + }, + { + "epoch": 0.9728837071044026, + "grad_norm": 0.11980053037405014, + "learning_rate": 3.6273215169360107e-07, + "loss": 0.0102, + "step": 7364 + }, + { + "epoch": 0.9730158205898867, + "grad_norm": 0.12592989206314087, + "learning_rate": 3.5920404775082737e-07, + "loss": 0.0093, + "step": 7365 + }, + { + "epoch": 0.9731479340753707, + "grad_norm": 0.17192436754703522, + "learning_rate": 3.556931548925557e-07, + "loss": 0.0098, + "step": 7366 + }, + { + "epoch": 0.9732800475608547, + "grad_norm": 0.09422129392623901, + "learning_rate": 3.521994737252121e-07, + "loss": 0.0081, + "step": 7367 + }, + { + "epoch": 0.9734121610463388, + "grad_norm": 0.1644677072763443, + "learning_rate": 3.487230048522583e-07, + "loss": 0.0129, + "step": 7368 + }, + { + "epoch": 0.9735442745318228, + "grad_norm": 0.13948781788349152, + "learning_rate": 3.4526374887420275e-07, + "loss": 0.0146, + "step": 7369 + }, + { + "epoch": 0.9736763880173068, + "grad_norm": 0.12497429549694061, + "learning_rate": 3.418217063885565e-07, + "loss": 0.0148, + "step": 7370 + }, + { + "epoch": 0.9738085015027909, + "grad_norm": 0.14855220913887024, + "learning_rate": 3.383968779898883e-07, + "loss": 0.0074, + "step": 7371 + }, + { + "epoch": 0.9739406149882749, + "grad_norm": 0.154222771525383, + "learning_rate": 3.349892642697472e-07, + "loss": 0.0111, + "step": 7372 + }, + { + "epoch": 0.9740727284737589, + "grad_norm": 0.13619877398014069, + "learning_rate": 3.3159886581675124e-07, + "loss": 0.0195, + "step": 7373 + }, + { + "epoch": 0.974204841959243, + "grad_norm": 0.27135902643203735, + "learning_rate": 3.2822568321653204e-07, + "loss": 0.0187, + "step": 7374 + }, + { + "epoch": 0.974336955444727, + "grad_norm": 0.17041723430156708, + "learning_rate": 3.2486971705172343e-07, + "loss": 0.0088, + "step": 7375 + }, + { + "epoch": 0.974469068930211, + "grad_norm": 0.2805335819721222, + "learning_rate": 3.215309679020284e-07, + "loss": 0.0272, + "step": 7376 + }, + { + "epoch": 0.974601182415695, + "grad_norm": 0.15251220762729645, + "learning_rate": 3.182094363441301e-07, + "loss": 0.0126, + "step": 7377 + }, + { + "epoch": 0.9747332959011791, + "grad_norm": 0.23265893757343292, + "learning_rate": 3.1490512295179186e-07, + "loss": 0.0208, + "step": 7378 + }, + { + "epoch": 0.9748654093866631, + "grad_norm": 0.1823471486568451, + "learning_rate": 3.1161802829573486e-07, + "loss": 0.0196, + "step": 7379 + }, + { + "epoch": 0.9749975228721471, + "grad_norm": 0.21355217695236206, + "learning_rate": 3.083481529437715e-07, + "loss": 0.0199, + "step": 7380 + }, + { + "epoch": 0.9751296363576312, + "grad_norm": 0.25127115845680237, + "learning_rate": 3.0509549746070564e-07, + "loss": 0.0244, + "step": 7381 + }, + { + "epoch": 0.9752617498431152, + "grad_norm": 0.1271241009235382, + "learning_rate": 3.018600624083767e-07, + "loss": 0.0113, + "step": 7382 + }, + { + "epoch": 0.9753938633285992, + "grad_norm": 0.20300577580928802, + "learning_rate": 2.9864184834562657e-07, + "loss": 0.0236, + "step": 7383 + }, + { + "epoch": 0.9755259768140833, + "grad_norm": 0.12326376140117645, + "learning_rate": 2.95440855828355e-07, + "loss": 0.0094, + "step": 7384 + }, + { + "epoch": 0.9756580902995673, + "grad_norm": 0.16099140048027039, + "learning_rate": 2.9225708540947527e-07, + "loss": 0.0117, + "step": 7385 + }, + { + "epoch": 0.9757902037850513, + "grad_norm": 0.11374569684267044, + "learning_rate": 2.8909053763891414e-07, + "loss": 0.0131, + "step": 7386 + }, + { + "epoch": 0.9759223172705354, + "grad_norm": 0.1675594598054886, + "learning_rate": 2.859412130636452e-07, + "loss": 0.0183, + "step": 7387 + }, + { + "epoch": 0.9760544307560194, + "grad_norm": 0.22921884059906006, + "learning_rate": 2.828091122276555e-07, + "loss": 0.0191, + "step": 7388 + }, + { + "epoch": 0.9761865442415034, + "grad_norm": 0.09144330769777298, + "learning_rate": 2.7969423567195674e-07, + "loss": 0.0056, + "step": 7389 + }, + { + "epoch": 0.9763186577269874, + "grad_norm": 0.14772000908851624, + "learning_rate": 2.76596583934563e-07, + "loss": 0.0117, + "step": 7390 + }, + { + "epoch": 0.9764507712124715, + "grad_norm": 0.1751900613307953, + "learning_rate": 2.7351615755056846e-07, + "loss": 0.018, + "step": 7391 + }, + { + "epoch": 0.9765828846979555, + "grad_norm": 0.14852789044380188, + "learning_rate": 2.7045295705203643e-07, + "loss": 0.0155, + "step": 7392 + }, + { + "epoch": 0.9767149981834395, + "grad_norm": 0.21934881806373596, + "learning_rate": 2.674069829680881e-07, + "loss": 0.014, + "step": 7393 + }, + { + "epoch": 0.9768471116689236, + "grad_norm": 0.17583468556404114, + "learning_rate": 2.643782358248581e-07, + "loss": 0.0123, + "step": 7394 + }, + { + "epoch": 0.9769792251544076, + "grad_norm": 0.12872935831546783, + "learning_rate": 2.6136671614550577e-07, + "loss": 0.0107, + "step": 7395 + }, + { + "epoch": 0.9771113386398916, + "grad_norm": 0.15072745084762573, + "learning_rate": 2.58372424450215e-07, + "loss": 0.0161, + "step": 7396 + }, + { + "epoch": 0.9772434521253757, + "grad_norm": 0.1387665718793869, + "learning_rate": 2.5539536125618324e-07, + "loss": 0.0127, + "step": 7397 + }, + { + "epoch": 0.9773755656108597, + "grad_norm": 0.15194547176361084, + "learning_rate": 2.5243552707765463e-07, + "loss": 0.0097, + "step": 7398 + }, + { + "epoch": 0.9775076790963437, + "grad_norm": 0.19213752448558807, + "learning_rate": 2.4949292242587573e-07, + "loss": 0.0199, + "step": 7399 + }, + { + "epoch": 0.9776397925818278, + "grad_norm": 0.18676453828811646, + "learning_rate": 2.4656754780914004e-07, + "loss": 0.0139, + "step": 7400 + }, + { + "epoch": 0.9777719060673118, + "grad_norm": 0.1353488266468048, + "learning_rate": 2.436594037327433e-07, + "loss": 0.0164, + "step": 7401 + }, + { + "epoch": 0.9779040195527958, + "grad_norm": 0.08223433792591095, + "learning_rate": 2.40768490699006e-07, + "loss": 0.0096, + "step": 7402 + }, + { + "epoch": 0.9780361330382799, + "grad_norm": 0.15485511720180511, + "learning_rate": 2.3789480920729524e-07, + "loss": 0.014, + "step": 7403 + }, + { + "epoch": 0.9781682465237639, + "grad_norm": 0.15698592364788055, + "learning_rate": 2.350383597539696e-07, + "loss": 0.0128, + "step": 7404 + }, + { + "epoch": 0.9783003600092479, + "grad_norm": 0.21309952437877655, + "learning_rate": 2.3219914283243437e-07, + "loss": 0.0365, + "step": 7405 + }, + { + "epoch": 0.978432473494732, + "grad_norm": 0.09512881934642792, + "learning_rate": 2.2937715893311952e-07, + "loss": 0.011, + "step": 7406 + }, + { + "epoch": 0.978564586980216, + "grad_norm": 0.1895619034767151, + "learning_rate": 2.265724085434573e-07, + "loss": 0.02, + "step": 7407 + }, + { + "epoch": 0.9786967004657, + "grad_norm": 0.2142200469970703, + "learning_rate": 2.2378489214791577e-07, + "loss": 0.0251, + "step": 7408 + }, + { + "epoch": 0.978828813951184, + "grad_norm": 0.13200215995311737, + "learning_rate": 2.2101461022799862e-07, + "loss": 0.0175, + "step": 7409 + }, + { + "epoch": 0.9789609274366681, + "grad_norm": 0.18622782826423645, + "learning_rate": 2.1826156326221202e-07, + "loss": 0.0234, + "step": 7410 + }, + { + "epoch": 0.9790930409221521, + "grad_norm": 0.34825804829597473, + "learning_rate": 2.155257517260867e-07, + "loss": 0.0239, + "step": 7411 + }, + { + "epoch": 0.9792251544076361, + "grad_norm": 0.11274074018001556, + "learning_rate": 2.1280717609220013e-07, + "loss": 0.0086, + "step": 7412 + }, + { + "epoch": 0.9793572678931202, + "grad_norm": 0.16548287868499756, + "learning_rate": 2.101058368301212e-07, + "loss": 0.0192, + "step": 7413 + }, + { + "epoch": 0.9794893813786042, + "grad_norm": 0.1395992487668991, + "learning_rate": 2.0742173440646552e-07, + "loss": 0.0112, + "step": 7414 + }, + { + "epoch": 0.9796214948640882, + "grad_norm": 0.1266821175813675, + "learning_rate": 2.0475486928484e-07, + "loss": 0.0115, + "step": 7415 + }, + { + "epoch": 0.9797536083495723, + "grad_norm": 0.17278282344341278, + "learning_rate": 2.0210524192593173e-07, + "loss": 0.016, + "step": 7416 + }, + { + "epoch": 0.9798857218350563, + "grad_norm": 0.15695269405841827, + "learning_rate": 1.994728527873857e-07, + "loss": 0.0088, + "step": 7417 + }, + { + "epoch": 0.9800178353205403, + "grad_norm": 0.1245427280664444, + "learning_rate": 1.9685770232390488e-07, + "loss": 0.0125, + "step": 7418 + }, + { + "epoch": 0.9801499488060244, + "grad_norm": 0.17886166274547577, + "learning_rate": 1.942597909872057e-07, + "loss": 0.0212, + "step": 7419 + }, + { + "epoch": 0.9802820622915084, + "grad_norm": 0.118898406624794, + "learning_rate": 1.916791192260403e-07, + "loss": 0.0174, + "step": 7420 + }, + { + "epoch": 0.9804141757769924, + "grad_norm": 0.13825471699237823, + "learning_rate": 1.8911568748616326e-07, + "loss": 0.0099, + "step": 7421 + }, + { + "epoch": 0.9805462892624764, + "grad_norm": 0.0875687450170517, + "learning_rate": 1.8656949621035368e-07, + "loss": 0.0102, + "step": 7422 + }, + { + "epoch": 0.9806784027479605, + "grad_norm": 0.19940754771232605, + "learning_rate": 1.8404054583842645e-07, + "loss": 0.0276, + "step": 7423 + }, + { + "epoch": 0.9808105162334445, + "grad_norm": 0.1369803100824356, + "learning_rate": 1.815288368072099e-07, + "loss": 0.0128, + "step": 7424 + }, + { + "epoch": 0.9809426297189285, + "grad_norm": 0.1722928136587143, + "learning_rate": 1.7903436955055697e-07, + "loss": 0.0142, + "step": 7425 + }, + { + "epoch": 0.9810747432044126, + "grad_norm": 0.20842589437961578, + "learning_rate": 1.7655714449933413e-07, + "loss": 0.0254, + "step": 7426 + }, + { + "epoch": 0.9812068566898966, + "grad_norm": 0.08559152483940125, + "learning_rate": 1.7409716208144355e-07, + "loss": 0.0085, + "step": 7427 + }, + { + "epoch": 0.9813389701753806, + "grad_norm": 0.1631077080965042, + "learning_rate": 1.716544227217898e-07, + "loss": 0.0158, + "step": 7428 + }, + { + "epoch": 0.9814710836608647, + "grad_norm": 0.09865134954452515, + "learning_rate": 1.6922892684232417e-07, + "loss": 0.0107, + "step": 7429 + }, + { + "epoch": 0.9816031971463487, + "grad_norm": 0.1299627423286438, + "learning_rate": 1.6682067486198937e-07, + "loss": 0.0188, + "step": 7430 + }, + { + "epoch": 0.9817353106318327, + "grad_norm": 0.21306796371936798, + "learning_rate": 1.644296671967749e-07, + "loss": 0.0155, + "step": 7431 + }, + { + "epoch": 0.9818674241173168, + "grad_norm": 0.2713109254837036, + "learning_rate": 1.6205590425969474e-07, + "loss": 0.0084, + "step": 7432 + }, + { + "epoch": 0.9819995376028008, + "grad_norm": 0.14543212950229645, + "learning_rate": 1.5969938646075432e-07, + "loss": 0.0202, + "step": 7433 + }, + { + "epoch": 0.9821316510882848, + "grad_norm": 0.12514202296733856, + "learning_rate": 1.573601142069947e-07, + "loss": 0.0106, + "step": 7434 + }, + { + "epoch": 0.9822637645737688, + "grad_norm": 0.2774215042591095, + "learning_rate": 1.5503808790249263e-07, + "loss": 0.0163, + "step": 7435 + }, + { + "epoch": 0.9823958780592529, + "grad_norm": 0.08826129138469696, + "learning_rate": 1.527333079483384e-07, + "loss": 0.0079, + "step": 7436 + }, + { + "epoch": 0.9825279915447369, + "grad_norm": 0.11560474336147308, + "learning_rate": 1.5044577474263576e-07, + "loss": 0.012, + "step": 7437 + }, + { + "epoch": 0.9826601050302209, + "grad_norm": 0.19101516902446747, + "learning_rate": 1.4817548868050202e-07, + "loss": 0.0245, + "step": 7438 + }, + { + "epoch": 0.982792218515705, + "grad_norm": 0.18168167769908905, + "learning_rate": 1.4592245015410123e-07, + "loss": 0.0234, + "step": 7439 + }, + { + "epoch": 0.982924332001189, + "grad_norm": 0.1375359743833542, + "learning_rate": 1.4368665955259986e-07, + "loss": 0.0144, + "step": 7440 + }, + { + "epoch": 0.983056445486673, + "grad_norm": 0.14244583249092102, + "learning_rate": 1.41468117262189e-07, + "loss": 0.0135, + "step": 7441 + }, + { + "epoch": 0.9831885589721571, + "grad_norm": 0.1177082434296608, + "learning_rate": 1.3926682366607324e-07, + "loss": 0.0163, + "step": 7442 + }, + { + "epoch": 0.9833206724576411, + "grad_norm": 0.13664676249027252, + "learning_rate": 1.3708277914449287e-07, + "loss": 0.0175, + "step": 7443 + }, + { + "epoch": 0.9834527859431251, + "grad_norm": 0.06507763266563416, + "learning_rate": 1.3491598407470162e-07, + "loss": 0.0047, + "step": 7444 + }, + { + "epoch": 0.9835848994286092, + "grad_norm": 0.17224647104740143, + "learning_rate": 1.3276643883096684e-07, + "loss": 0.0095, + "step": 7445 + }, + { + "epoch": 0.9837170129140932, + "grad_norm": 0.13159163296222687, + "learning_rate": 1.3063414378458038e-07, + "loss": 0.0155, + "step": 7446 + }, + { + "epoch": 0.9838491263995772, + "grad_norm": 0.18034628033638, + "learning_rate": 1.2851909930386984e-07, + "loss": 0.0228, + "step": 7447 + }, + { + "epoch": 0.9839812398850613, + "grad_norm": 0.19768694043159485, + "learning_rate": 1.2642130575415413e-07, + "loss": 0.0184, + "step": 7448 + }, + { + "epoch": 0.9841133533705453, + "grad_norm": 0.11957130581140518, + "learning_rate": 1.243407634977878e-07, + "loss": 0.0075, + "step": 7449 + }, + { + "epoch": 0.9842454668560293, + "grad_norm": 0.10868565738201141, + "learning_rate": 1.2227747289416114e-07, + "loss": 0.0097, + "step": 7450 + }, + { + "epoch": 0.9843775803415133, + "grad_norm": 0.11350014060735703, + "learning_rate": 1.2023143429965577e-07, + "loss": 0.012, + "step": 7451 + }, + { + "epoch": 0.9845096938269974, + "grad_norm": 0.10784092545509338, + "learning_rate": 1.182026480677001e-07, + "loss": 0.012, + "step": 7452 + }, + { + "epoch": 0.9846418073124814, + "grad_norm": 0.1857926845550537, + "learning_rate": 1.1619111454871378e-07, + "loss": 0.024, + "step": 7453 + }, + { + "epoch": 0.9847739207979654, + "grad_norm": 0.19607584178447723, + "learning_rate": 1.1419683409015225e-07, + "loss": 0.0086, + "step": 7454 + }, + { + "epoch": 0.9849060342834495, + "grad_norm": 0.1448315978050232, + "learning_rate": 1.1221980703650659e-07, + "loss": 0.0089, + "step": 7455 + }, + { + "epoch": 0.9850381477689335, + "grad_norm": 0.158442422747612, + "learning_rate": 1.1026003372924809e-07, + "loss": 0.0132, + "step": 7456 + }, + { + "epoch": 0.9851702612544175, + "grad_norm": 0.10124783962965012, + "learning_rate": 1.0831751450691707e-07, + "loss": 0.0064, + "step": 7457 + }, + { + "epoch": 0.9853023747399016, + "grad_norm": 0.1521049290895462, + "learning_rate": 1.0639224970502293e-07, + "loss": 0.0164, + "step": 7458 + }, + { + "epoch": 0.9854344882253856, + "grad_norm": 0.20458267629146576, + "learning_rate": 1.0448423965613297e-07, + "loss": 0.0144, + "step": 7459 + }, + { + "epoch": 0.9855666017108696, + "grad_norm": 0.1937059909105301, + "learning_rate": 1.0259348468981689e-07, + "loss": 0.0174, + "step": 7460 + }, + { + "epoch": 0.9856987151963537, + "grad_norm": 0.10319459438323975, + "learning_rate": 1.00719985132669e-07, + "loss": 0.0099, + "step": 7461 + }, + { + "epoch": 0.9858308286818377, + "grad_norm": 0.16178636252880096, + "learning_rate": 9.886374130829711e-08, + "loss": 0.0177, + "step": 7462 + }, + { + "epoch": 0.9859629421673217, + "grad_norm": 0.1864520162343979, + "learning_rate": 9.702475353733364e-08, + "loss": 0.025, + "step": 7463 + }, + { + "epoch": 0.9860950556528058, + "grad_norm": 0.15063433349132538, + "learning_rate": 9.520302213743559e-08, + "loss": 0.0154, + "step": 7464 + }, + { + "epoch": 0.9862271691382898, + "grad_norm": 0.10295557975769043, + "learning_rate": 9.339854742326238e-08, + "loss": 0.0127, + "step": 7465 + }, + { + "epoch": 0.9863592826237738, + "grad_norm": 0.10709338635206223, + "learning_rate": 9.16113297065202e-08, + "loss": 0.006, + "step": 7466 + }, + { + "epoch": 0.9864913961092578, + "grad_norm": 0.11552749574184418, + "learning_rate": 8.984136929589548e-08, + "loss": 0.0054, + "step": 7467 + }, + { + "epoch": 0.9866235095947419, + "grad_norm": 0.1398506462574005, + "learning_rate": 8.808866649713254e-08, + "loss": 0.0067, + "step": 7468 + }, + { + "epoch": 0.9867556230802259, + "grad_norm": 0.2075989991426468, + "learning_rate": 8.6353221612967e-08, + "loss": 0.0234, + "step": 7469 + }, + { + "epoch": 0.9868877365657099, + "grad_norm": 0.16684336960315704, + "learning_rate": 8.463503494317015e-08, + "loss": 0.0233, + "step": 7470 + }, + { + "epoch": 0.987019850051194, + "grad_norm": 0.15726900100708008, + "learning_rate": 8.293410678452685e-08, + "loss": 0.0125, + "step": 7471 + }, + { + "epoch": 0.987151963536678, + "grad_norm": 0.20710338652133942, + "learning_rate": 8.125043743084648e-08, + "loss": 0.0105, + "step": 7472 + }, + { + "epoch": 0.987284077022162, + "grad_norm": 0.14183491468429565, + "learning_rate": 7.958402717294089e-08, + "loss": 0.0151, + "step": 7473 + }, + { + "epoch": 0.9874161905076461, + "grad_norm": 0.26822736859321594, + "learning_rate": 7.793487629865759e-08, + "loss": 0.0245, + "step": 7474 + }, + { + "epoch": 0.9875483039931301, + "grad_norm": 0.1157698780298233, + "learning_rate": 7.63029850928465e-08, + "loss": 0.0082, + "step": 7475 + }, + { + "epoch": 0.9876804174786141, + "grad_norm": 0.12137190997600555, + "learning_rate": 7.468835383740436e-08, + "loss": 0.0115, + "step": 7476 + }, + { + "epoch": 0.9878125309640982, + "grad_norm": 0.1745070219039917, + "learning_rate": 7.309098281120808e-08, + "loss": 0.0104, + "step": 7477 + }, + { + "epoch": 0.9879446444495822, + "grad_norm": 0.1688745766878128, + "learning_rate": 7.151087229019249e-08, + "loss": 0.0143, + "step": 7478 + }, + { + "epoch": 0.9880767579350662, + "grad_norm": 0.1639687716960907, + "learning_rate": 6.994802254728372e-08, + "loss": 0.0112, + "step": 7479 + }, + { + "epoch": 0.9882088714205502, + "grad_norm": 0.26630374789237976, + "learning_rate": 6.84024338524325e-08, + "loss": 0.0197, + "step": 7480 + }, + { + "epoch": 0.9883409849060343, + "grad_norm": 0.41736364364624023, + "learning_rate": 6.687410647260306e-08, + "loss": 0.0486, + "step": 7481 + }, + { + "epoch": 0.9884730983915183, + "grad_norm": 0.11081980168819427, + "learning_rate": 6.536304067180643e-08, + "loss": 0.0095, + "step": 7482 + }, + { + "epoch": 0.9886052118770023, + "grad_norm": 0.12715992331504822, + "learning_rate": 6.386923671103384e-08, + "loss": 0.0074, + "step": 7483 + }, + { + "epoch": 0.9887373253624864, + "grad_norm": 0.12597718834877014, + "learning_rate": 6.239269484832333e-08, + "loss": 0.014, + "step": 7484 + }, + { + "epoch": 0.9888694388479704, + "grad_norm": 0.17149794101715088, + "learning_rate": 6.093341533870422e-08, + "loss": 0.0128, + "step": 7485 + }, + { + "epoch": 0.9890015523334544, + "grad_norm": 0.10480506718158722, + "learning_rate": 5.949139843426377e-08, + "loss": 0.0168, + "step": 7486 + }, + { + "epoch": 0.9891336658189385, + "grad_norm": 0.20579087734222412, + "learning_rate": 5.806664438405829e-08, + "loss": 0.0089, + "step": 7487 + }, + { + "epoch": 0.9892657793044225, + "grad_norm": 0.10642731189727783, + "learning_rate": 5.665915343420203e-08, + "loss": 0.0041, + "step": 7488 + }, + { + "epoch": 0.9893978927899065, + "grad_norm": 0.19486381113529205, + "learning_rate": 5.526892582781162e-08, + "loss": 0.0136, + "step": 7489 + }, + { + "epoch": 0.9895300062753906, + "grad_norm": 0.15670417249202728, + "learning_rate": 5.3895961805017214e-08, + "loss": 0.013, + "step": 7490 + }, + { + "epoch": 0.9896621197608746, + "grad_norm": 0.14204590022563934, + "learning_rate": 5.254026160297354e-08, + "loss": 0.0164, + "step": 7491 + }, + { + "epoch": 0.9897942332463586, + "grad_norm": 0.10373309999704361, + "learning_rate": 5.120182545585994e-08, + "loss": 0.0087, + "step": 7492 + }, + { + "epoch": 0.9899263467318427, + "grad_norm": 0.2570636570453644, + "learning_rate": 4.988065359485816e-08, + "loss": 0.0165, + "step": 7493 + }, + { + "epoch": 0.9900584602173267, + "grad_norm": 0.23979882895946503, + "learning_rate": 4.857674624818565e-08, + "loss": 0.023, + "step": 7494 + }, + { + "epoch": 0.9901905737028107, + "grad_norm": 0.12361893057823181, + "learning_rate": 4.729010364105113e-08, + "loss": 0.0128, + "step": 7495 + }, + { + "epoch": 0.9903226871882947, + "grad_norm": 0.16618198156356812, + "learning_rate": 4.6020725995710166e-08, + "loss": 0.0086, + "step": 7496 + }, + { + "epoch": 0.9904548006737788, + "grad_norm": 0.10690437257289886, + "learning_rate": 4.47686135314318e-08, + "loss": 0.0098, + "step": 7497 + }, + { + "epoch": 0.9905869141592628, + "grad_norm": 0.1558026224374771, + "learning_rate": 4.3533766464476376e-08, + "loss": 0.0118, + "step": 7498 + }, + { + "epoch": 0.9907190276447468, + "grad_norm": 0.22771494090557098, + "learning_rate": 4.231618500815104e-08, + "loss": 0.014, + "step": 7499 + }, + { + "epoch": 0.9908511411302309, + "grad_norm": 0.24591323733329773, + "learning_rate": 4.111586937276535e-08, + "loss": 0.0182, + "step": 7500 + }, + { + "epoch": 0.9909832546157149, + "grad_norm": 0.14383232593536377, + "learning_rate": 3.993281976566454e-08, + "loss": 0.0074, + "step": 7501 + }, + { + "epoch": 0.9911153681011989, + "grad_norm": 0.2168605774641037, + "learning_rate": 3.876703639117407e-08, + "loss": 0.0164, + "step": 7502 + }, + { + "epoch": 0.991247481586683, + "grad_norm": 0.1999928057193756, + "learning_rate": 3.7618519450688394e-08, + "loss": 0.0175, + "step": 7503 + }, + { + "epoch": 0.991379595072167, + "grad_norm": 0.09430862218141556, + "learning_rate": 3.6487269142571055e-08, + "loss": 0.0096, + "step": 7504 + }, + { + "epoch": 0.991511708557651, + "grad_norm": 0.1702381670475006, + "learning_rate": 3.5373285662243515e-08, + "loss": 0.0193, + "step": 7505 + }, + { + "epoch": 0.9916438220431351, + "grad_norm": 0.08822032064199448, + "learning_rate": 3.427656920210742e-08, + "loss": 0.0082, + "step": 7506 + }, + { + "epoch": 0.9917759355286191, + "grad_norm": 0.1684400737285614, + "learning_rate": 3.319711995161123e-08, + "loss": 0.0167, + "step": 7507 + }, + { + "epoch": 0.9919080490141031, + "grad_norm": 0.1515914797782898, + "learning_rate": 3.21349380971947e-08, + "loss": 0.0108, + "step": 7508 + }, + { + "epoch": 0.9920401624995872, + "grad_norm": 0.0880783349275589, + "learning_rate": 3.109002382235548e-08, + "loss": 0.0074, + "step": 7509 + }, + { + "epoch": 0.9921722759850712, + "grad_norm": 0.15489071607589722, + "learning_rate": 3.006237730756034e-08, + "loss": 0.0123, + "step": 7510 + }, + { + "epoch": 0.9923043894705552, + "grad_norm": 0.07852049171924591, + "learning_rate": 2.905199873033393e-08, + "loss": 0.009, + "step": 7511 + }, + { + "epoch": 0.9924365029560392, + "grad_norm": 0.17368489503860474, + "learning_rate": 2.8058888265181105e-08, + "loss": 0.0199, + "step": 7512 + }, + { + "epoch": 0.9925686164415233, + "grad_norm": 0.09940238296985626, + "learning_rate": 2.708304608365353e-08, + "loss": 0.0063, + "step": 7513 + }, + { + "epoch": 0.9927007299270073, + "grad_norm": 0.1165870726108551, + "learning_rate": 2.6124472354316364e-08, + "loss": 0.0082, + "step": 7514 + }, + { + "epoch": 0.9928328434124913, + "grad_norm": 0.2655845582485199, + "learning_rate": 2.518316724272607e-08, + "loss": 0.0209, + "step": 7515 + }, + { + "epoch": 0.9929649568979754, + "grad_norm": 0.07835333049297333, + "learning_rate": 2.425913091149701e-08, + "loss": 0.0067, + "step": 7516 + }, + { + "epoch": 0.9930970703834594, + "grad_norm": 0.15666654706001282, + "learning_rate": 2.335236352022374e-08, + "loss": 0.0079, + "step": 7517 + }, + { + "epoch": 0.9932291838689434, + "grad_norm": 0.11625221371650696, + "learning_rate": 2.2462865225547636e-08, + "loss": 0.0092, + "step": 7518 + }, + { + "epoch": 0.9933612973544275, + "grad_norm": 0.11382373422384262, + "learning_rate": 2.1590636181090252e-08, + "loss": 0.0136, + "step": 7519 + }, + { + "epoch": 0.9934934108399115, + "grad_norm": 0.13483434915542603, + "learning_rate": 2.073567653754216e-08, + "loss": 0.0167, + "step": 7520 + }, + { + "epoch": 0.9936255243253955, + "grad_norm": 0.25301748514175415, + "learning_rate": 1.989798644255192e-08, + "loss": 0.0122, + "step": 7521 + }, + { + "epoch": 0.9937576378108796, + "grad_norm": 0.1368926465511322, + "learning_rate": 1.9077566040837104e-08, + "loss": 0.0115, + "step": 7522 + }, + { + "epoch": 0.9938897512963636, + "grad_norm": 0.11503051221370697, + "learning_rate": 1.8274415474106577e-08, + "loss": 0.0114, + "step": 7523 + }, + { + "epoch": 0.9940218647818476, + "grad_norm": 0.19685514271259308, + "learning_rate": 1.7488534881082708e-08, + "loss": 0.0163, + "step": 7524 + }, + { + "epoch": 0.9941539782673317, + "grad_norm": 0.199909046292305, + "learning_rate": 1.6719924397512465e-08, + "loss": 0.0142, + "step": 7525 + }, + { + "epoch": 0.9942860917528157, + "grad_norm": 0.10640694200992584, + "learning_rate": 1.596858415615632e-08, + "loss": 0.0097, + "step": 7526 + }, + { + "epoch": 0.9944182052382997, + "grad_norm": 0.07238924503326416, + "learning_rate": 1.5234514286810442e-08, + "loss": 0.0044, + "step": 7527 + }, + { + "epoch": 0.9945503187237837, + "grad_norm": 0.15096305310726166, + "learning_rate": 1.4517714916251202e-08, + "loss": 0.0108, + "step": 7528 + }, + { + "epoch": 0.9946824322092678, + "grad_norm": 0.16533154249191284, + "learning_rate": 1.3818186168301772e-08, + "loss": 0.0149, + "step": 7529 + }, + { + "epoch": 0.9948145456947518, + "grad_norm": 0.19295816123485565, + "learning_rate": 1.3135928163787725e-08, + "loss": 0.0393, + "step": 7530 + }, + { + "epoch": 0.9949466591802358, + "grad_norm": 0.23089705407619476, + "learning_rate": 1.2470941020570336e-08, + "loss": 0.0238, + "step": 7531 + }, + { + "epoch": 0.9950787726657199, + "grad_norm": 0.3397480547428131, + "learning_rate": 1.1823224853491077e-08, + "loss": 0.0283, + "step": 7532 + }, + { + "epoch": 0.9952108861512039, + "grad_norm": 0.17770127952098846, + "learning_rate": 1.1192779774449325e-08, + "loss": 0.0163, + "step": 7533 + }, + { + "epoch": 0.9953429996366879, + "grad_norm": 0.16790403425693512, + "learning_rate": 1.0579605892346855e-08, + "loss": 0.0209, + "step": 7534 + }, + { + "epoch": 0.995475113122172, + "grad_norm": 0.18203333020210266, + "learning_rate": 9.983703313076743e-09, + "loss": 0.021, + "step": 7535 + }, + { + "epoch": 0.995607226607656, + "grad_norm": 0.14092746376991272, + "learning_rate": 9.405072139578864e-09, + "loss": 0.0116, + "step": 7536 + }, + { + "epoch": 0.99573934009314, + "grad_norm": 0.11230488121509552, + "learning_rate": 8.843712471806598e-09, + "loss": 0.0073, + "step": 7537 + }, + { + "epoch": 0.995871453578624, + "grad_norm": 0.13066421449184418, + "learning_rate": 8.299624406726824e-09, + "loss": 0.0199, + "step": 7538 + }, + { + "epoch": 0.9960035670641081, + "grad_norm": 0.1082078292965889, + "learning_rate": 7.772808038308822e-09, + "loss": 0.005, + "step": 7539 + }, + { + "epoch": 0.9961356805495921, + "grad_norm": 0.3292711079120636, + "learning_rate": 7.263263457557568e-09, + "loss": 0.0082, + "step": 7540 + }, + { + "epoch": 0.9962677940350761, + "grad_norm": 0.1794513761997223, + "learning_rate": 6.770990752491546e-09, + "loss": 0.0157, + "step": 7541 + }, + { + "epoch": 0.9963999075205602, + "grad_norm": 0.07902058213949203, + "learning_rate": 6.295990008131636e-09, + "loss": 0.0041, + "step": 7542 + }, + { + "epoch": 0.9965320210060442, + "grad_norm": 0.2639276385307312, + "learning_rate": 5.8382613065344204e-09, + "loss": 0.0269, + "step": 7543 + }, + { + "epoch": 0.9966641344915282, + "grad_norm": 0.11883040517568588, + "learning_rate": 5.39780472674778e-09, + "loss": 0.0203, + "step": 7544 + }, + { + "epoch": 0.9967962479770123, + "grad_norm": 0.1411103755235672, + "learning_rate": 4.974620344877501e-09, + "loss": 0.013, + "step": 7545 + }, + { + "epoch": 0.9969283614624963, + "grad_norm": 0.10590819269418716, + "learning_rate": 4.568708233998465e-09, + "loss": 0.0061, + "step": 7546 + }, + { + "epoch": 0.9970604749479803, + "grad_norm": 0.14374881982803345, + "learning_rate": 4.180068464243458e-09, + "loss": 0.0105, + "step": 7547 + }, + { + "epoch": 0.9971925884334644, + "grad_norm": 0.13679523766040802, + "learning_rate": 3.808701102725465e-09, + "loss": 0.0181, + "step": 7548 + }, + { + "epoch": 0.9973247019189484, + "grad_norm": 0.19184891879558563, + "learning_rate": 3.4546062135931702e-09, + "loss": 0.0256, + "step": 7549 + }, + { + "epoch": 0.9974568154044324, + "grad_norm": 0.1876770406961441, + "learning_rate": 3.1177838580198626e-09, + "loss": 0.0209, + "step": 7550 + }, + { + "epoch": 0.9975889288899165, + "grad_norm": 0.2564197778701782, + "learning_rate": 2.7982340941812292e-09, + "loss": 0.0245, + "step": 7551 + }, + { + "epoch": 0.9977210423754005, + "grad_norm": 0.14303652942180634, + "learning_rate": 2.4959569772775583e-09, + "loss": 0.0105, + "step": 7552 + }, + { + "epoch": 0.9978531558608845, + "grad_norm": 0.18371181190013885, + "learning_rate": 2.2109525595115365e-09, + "loss": 0.0211, + "step": 7553 + }, + { + "epoch": 0.9979852693463686, + "grad_norm": 0.16563263535499573, + "learning_rate": 1.9432208901104533e-09, + "loss": 0.0168, + "step": 7554 + }, + { + "epoch": 0.9981173828318526, + "grad_norm": 0.19452422857284546, + "learning_rate": 1.6927620153373013e-09, + "loss": 0.0116, + "step": 7555 + }, + { + "epoch": 0.9982494963173366, + "grad_norm": 0.07965493947267532, + "learning_rate": 1.4595759784463703e-09, + "loss": 0.0047, + "step": 7556 + }, + { + "epoch": 0.9983816098028206, + "grad_norm": 0.14059865474700928, + "learning_rate": 1.243662819705449e-09, + "loss": 0.0098, + "step": 7557 + }, + { + "epoch": 0.9985137232883047, + "grad_norm": 0.11752263456583023, + "learning_rate": 1.045022576418031e-09, + "loss": 0.0104, + "step": 7558 + }, + { + "epoch": 0.9986458367737887, + "grad_norm": 0.09485765546560287, + "learning_rate": 8.636552829011102e-10, + "loss": 0.0087, + "step": 7559 + }, + { + "epoch": 0.9987779502592727, + "grad_norm": 0.10121173411607742, + "learning_rate": 6.99560970474078e-10, + "loss": 0.0118, + "step": 7560 + }, + { + "epoch": 0.9989100637447568, + "grad_norm": 0.1641887128353119, + "learning_rate": 5.527396674809282e-10, + "loss": 0.013, + "step": 7561 + }, + { + "epoch": 0.9990421772302408, + "grad_norm": 0.09525637328624725, + "learning_rate": 4.2319139929025695e-10, + "loss": 0.0092, + "step": 7562 + }, + { + "epoch": 0.9991742907157248, + "grad_norm": 0.1413845717906952, + "learning_rate": 3.1091618827305826e-10, + "loss": 0.0127, + "step": 7563 + }, + { + "epoch": 0.9993064042012089, + "grad_norm": 0.1747020184993744, + "learning_rate": 2.1591405382492824e-10, + "loss": 0.0154, + "step": 7564 + }, + { + "epoch": 0.9994385176866929, + "grad_norm": 0.19588789343833923, + "learning_rate": 1.3818501234386105e-10, + "loss": 0.021, + "step": 7565 + }, + { + "epoch": 0.9995706311721769, + "grad_norm": 0.14852724969387054, + "learning_rate": 7.77290772746575e-11, + "loss": 0.0124, + "step": 7566 + }, + { + "epoch": 0.999702744657661, + "grad_norm": 0.17045824229717255, + "learning_rate": 3.4546259053414023e-11, + "loss": 0.0256, + "step": 7567 + }, + { + "epoch": 0.999834858143145, + "grad_norm": 0.15149812400341034, + "learning_rate": 8.636565140829333e-12, + "loss": 0.0186, + "step": 7568 + }, + { + "epoch": 0.999966971628629, + "grad_norm": 0.1452350914478302, + "learning_rate": 0.0, + "loss": 0.0206, + "step": 7569 + } + ], + "logging_steps": 1, + "max_steps": 7569, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.1252272381910057e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}