diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,32508 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.7002356226546818, + "eval_steps": 500, + "global_step": 92826, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005817843325479244, + "grad_norm": 14.125, + "learning_rate": 9.998063627249894e-06, + "loss": 1.8693, + "step": 20 + }, + { + "epoch": 0.0011635686650958489, + "grad_norm": 11.625, + "learning_rate": 9.996124345577279e-06, + "loss": 1.767, + "step": 40 + }, + { + "epoch": 0.0017453529976437735, + "grad_norm": 13.125, + "learning_rate": 9.994185063904664e-06, + "loss": 1.7496, + "step": 60 + }, + { + "epoch": 0.0023271373301916977, + "grad_norm": 13.3125, + "learning_rate": 9.992245782232049e-06, + "loss": 1.7907, + "step": 80 + }, + { + "epoch": 0.0029089216627396224, + "grad_norm": 12.125, + "learning_rate": 9.990306500559434e-06, + "loss": 1.6286, + "step": 100 + }, + { + "epoch": 0.003490705995287547, + "grad_norm": 12.4375, + "learning_rate": 9.98836721888682e-06, + "loss": 1.8037, + "step": 120 + }, + { + "epoch": 0.004072490327835471, + "grad_norm": 12.5625, + "learning_rate": 9.986427937214204e-06, + "loss": 1.7154, + "step": 140 + }, + { + "epoch": 0.0046542746603833954, + "grad_norm": 14.4375, + "learning_rate": 9.98448865554159e-06, + "loss": 1.7231, + "step": 160 + }, + { + "epoch": 0.0052360589929313205, + "grad_norm": 9.5625, + "learning_rate": 9.982549373868974e-06, + "loss": 1.701, + "step": 180 + }, + { + "epoch": 0.005817843325479245, + "grad_norm": 12.625, + "learning_rate": 9.98061009219636e-06, + "loss": 1.7652, + "step": 200 + }, + { + "epoch": 0.006399627658027169, + "grad_norm": 12.6875, + "learning_rate": 9.978670810523745e-06, + "loss": 1.7116, + "step": 220 + }, + { + "epoch": 0.006981411990575094, + "grad_norm": 12.6875, + "learning_rate": 9.97673152885113e-06, + "loss": 1.6567, + "step": 240 + }, + { + "epoch": 0.007563196323123018, + "grad_norm": 14.8125, + "learning_rate": 9.974792247178515e-06, + "loss": 1.7739, + "step": 260 + }, + { + "epoch": 0.008144980655670942, + "grad_norm": 13.0625, + "learning_rate": 9.9728529655059e-06, + "loss": 1.9183, + "step": 280 + }, + { + "epoch": 0.008726764988218868, + "grad_norm": 14.375, + "learning_rate": 9.970913683833285e-06, + "loss": 1.7885, + "step": 300 + }, + { + "epoch": 0.009308549320766791, + "grad_norm": 13.3125, + "learning_rate": 9.96897440216067e-06, + "loss": 1.7945, + "step": 320 + }, + { + "epoch": 0.009890333653314716, + "grad_norm": 12.5625, + "learning_rate": 9.967035120488055e-06, + "loss": 1.7623, + "step": 340 + }, + { + "epoch": 0.010472117985862641, + "grad_norm": 13.375, + "learning_rate": 9.96509583881544e-06, + "loss": 1.6728, + "step": 360 + }, + { + "epoch": 0.011053902318410564, + "grad_norm": 13.3125, + "learning_rate": 9.963156557142825e-06, + "loss": 1.7254, + "step": 380 + }, + { + "epoch": 0.01163568665095849, + "grad_norm": 15.5, + "learning_rate": 9.96121727547021e-06, + "loss": 1.6798, + "step": 400 + }, + { + "epoch": 0.012217470983506415, + "grad_norm": 12.6875, + "learning_rate": 9.959277993797596e-06, + "loss": 1.7626, + "step": 420 + }, + { + "epoch": 0.012799255316054338, + "grad_norm": 13.0625, + "learning_rate": 9.95733871212498e-06, + "loss": 1.823, + "step": 440 + }, + { + "epoch": 0.013381039648602263, + "grad_norm": 11.75, + "learning_rate": 9.955399430452366e-06, + "loss": 1.688, + "step": 460 + }, + { + "epoch": 0.013962823981150188, + "grad_norm": 13.3125, + "learning_rate": 9.953460148779751e-06, + "loss": 1.762, + "step": 480 + }, + { + "epoch": 0.014544608313698111, + "grad_norm": 12.1875, + "learning_rate": 9.951520867107136e-06, + "loss": 1.6487, + "step": 500 + }, + { + "epoch": 0.015126392646246037, + "grad_norm": 15.0625, + "learning_rate": 9.949581585434521e-06, + "loss": 1.8255, + "step": 520 + }, + { + "epoch": 0.01570817697879396, + "grad_norm": 13.4375, + "learning_rate": 9.947642303761906e-06, + "loss": 1.7325, + "step": 540 + }, + { + "epoch": 0.016289961311341885, + "grad_norm": 15.25, + "learning_rate": 9.945703022089291e-06, + "loss": 1.8285, + "step": 560 + }, + { + "epoch": 0.01687174564388981, + "grad_norm": 12.375, + "learning_rate": 9.943763740416676e-06, + "loss": 1.7398, + "step": 580 + }, + { + "epoch": 0.017453529976437735, + "grad_norm": 16.625, + "learning_rate": 9.941824458744062e-06, + "loss": 1.7851, + "step": 600 + }, + { + "epoch": 0.01803531430898566, + "grad_norm": 11.875, + "learning_rate": 9.939885177071447e-06, + "loss": 1.7592, + "step": 620 + }, + { + "epoch": 0.018617098641533582, + "grad_norm": 16.875, + "learning_rate": 9.937945895398832e-06, + "loss": 1.7966, + "step": 640 + }, + { + "epoch": 0.019198882974081507, + "grad_norm": 10.9375, + "learning_rate": 9.936006613726217e-06, + "loss": 1.704, + "step": 660 + }, + { + "epoch": 0.019780667306629432, + "grad_norm": 18.5, + "learning_rate": 9.934067332053602e-06, + "loss": 1.7116, + "step": 680 + }, + { + "epoch": 0.020362451639177357, + "grad_norm": 14.6875, + "learning_rate": 9.932128050380987e-06, + "loss": 1.7695, + "step": 700 + }, + { + "epoch": 0.020944235971725282, + "grad_norm": 12.625, + "learning_rate": 9.930188768708372e-06, + "loss": 1.7622, + "step": 720 + }, + { + "epoch": 0.021526020304273207, + "grad_norm": 13.3125, + "learning_rate": 9.928249487035757e-06, + "loss": 1.7275, + "step": 740 + }, + { + "epoch": 0.02210780463682113, + "grad_norm": 12.5, + "learning_rate": 9.926310205363142e-06, + "loss": 1.6935, + "step": 760 + }, + { + "epoch": 0.022689588969369054, + "grad_norm": 12.75, + "learning_rate": 9.924370923690527e-06, + "loss": 1.8071, + "step": 780 + }, + { + "epoch": 0.02327137330191698, + "grad_norm": 13.9375, + "learning_rate": 9.922431642017912e-06, + "loss": 1.7624, + "step": 800 + }, + { + "epoch": 0.023853157634464904, + "grad_norm": 12.375, + "learning_rate": 9.920492360345298e-06, + "loss": 1.7581, + "step": 820 + }, + { + "epoch": 0.02443494196701283, + "grad_norm": 11.625, + "learning_rate": 9.918553078672683e-06, + "loss": 1.7039, + "step": 840 + }, + { + "epoch": 0.025016726299560754, + "grad_norm": 12.125, + "learning_rate": 9.916613797000068e-06, + "loss": 1.7564, + "step": 860 + }, + { + "epoch": 0.025598510632108676, + "grad_norm": 13.875, + "learning_rate": 9.914674515327453e-06, + "loss": 1.7778, + "step": 880 + }, + { + "epoch": 0.0261802949646566, + "grad_norm": 12.25, + "learning_rate": 9.912735233654838e-06, + "loss": 1.7182, + "step": 900 + }, + { + "epoch": 0.026762079297204526, + "grad_norm": 14.4375, + "learning_rate": 9.910795951982223e-06, + "loss": 1.6581, + "step": 920 + }, + { + "epoch": 0.02734386362975245, + "grad_norm": 13.5625, + "learning_rate": 9.908856670309608e-06, + "loss": 1.7495, + "step": 940 + }, + { + "epoch": 0.027925647962300376, + "grad_norm": 13.125, + "learning_rate": 9.906917388636993e-06, + "loss": 1.75, + "step": 960 + }, + { + "epoch": 0.0285074322948483, + "grad_norm": 13.4375, + "learning_rate": 9.904978106964378e-06, + "loss": 1.8321, + "step": 980 + }, + { + "epoch": 0.029089216627396223, + "grad_norm": 12.0625, + "learning_rate": 9.903038825291763e-06, + "loss": 1.7237, + "step": 1000 + }, + { + "epoch": 0.029671000959944148, + "grad_norm": 14.0625, + "learning_rate": 9.901099543619149e-06, + "loss": 1.7602, + "step": 1020 + }, + { + "epoch": 0.030252785292492073, + "grad_norm": 10.375, + "learning_rate": 9.899160261946534e-06, + "loss": 1.7526, + "step": 1040 + }, + { + "epoch": 0.030834569625039998, + "grad_norm": 12.0625, + "learning_rate": 9.897220980273919e-06, + "loss": 1.6564, + "step": 1060 + }, + { + "epoch": 0.03141635395758792, + "grad_norm": 14.0, + "learning_rate": 9.895281698601304e-06, + "loss": 1.7071, + "step": 1080 + }, + { + "epoch": 0.03199813829013585, + "grad_norm": 13.25, + "learning_rate": 9.893342416928689e-06, + "loss": 1.7021, + "step": 1100 + }, + { + "epoch": 0.03257992262268377, + "grad_norm": 14.5625, + "learning_rate": 9.891403135256074e-06, + "loss": 1.6877, + "step": 1120 + }, + { + "epoch": 0.0331617069552317, + "grad_norm": 12.5625, + "learning_rate": 9.88946385358346e-06, + "loss": 1.6961, + "step": 1140 + }, + { + "epoch": 0.03374349128777962, + "grad_norm": 13.0625, + "learning_rate": 9.887524571910844e-06, + "loss": 1.7913, + "step": 1160 + }, + { + "epoch": 0.03432527562032754, + "grad_norm": 13.125, + "learning_rate": 9.88558529023823e-06, + "loss": 1.7257, + "step": 1180 + }, + { + "epoch": 0.03490705995287547, + "grad_norm": 14.125, + "learning_rate": 9.883646008565614e-06, + "loss": 1.7231, + "step": 1200 + }, + { + "epoch": 0.03548884428542339, + "grad_norm": 12.75, + "learning_rate": 9.881706726893e-06, + "loss": 1.744, + "step": 1220 + }, + { + "epoch": 0.03607062861797132, + "grad_norm": 12.25, + "learning_rate": 9.879767445220385e-06, + "loss": 1.6365, + "step": 1240 + }, + { + "epoch": 0.03665241295051924, + "grad_norm": 9.375, + "learning_rate": 9.87782816354777e-06, + "loss": 1.7705, + "step": 1260 + }, + { + "epoch": 0.037234197283067164, + "grad_norm": 10.8125, + "learning_rate": 9.875888881875155e-06, + "loss": 1.7309, + "step": 1280 + }, + { + "epoch": 0.03781598161561509, + "grad_norm": 11.0, + "learning_rate": 9.87394960020254e-06, + "loss": 1.7818, + "step": 1300 + }, + { + "epoch": 0.038397765948163014, + "grad_norm": 13.6875, + "learning_rate": 9.872010318529925e-06, + "loss": 1.7461, + "step": 1320 + }, + { + "epoch": 0.03897955028071094, + "grad_norm": 13.8125, + "learning_rate": 9.87007103685731e-06, + "loss": 1.7633, + "step": 1340 + }, + { + "epoch": 0.039561334613258864, + "grad_norm": 12.25, + "learning_rate": 9.868131755184695e-06, + "loss": 1.7222, + "step": 1360 + }, + { + "epoch": 0.04014311894580679, + "grad_norm": 13.3125, + "learning_rate": 9.86619247351208e-06, + "loss": 1.7887, + "step": 1380 + }, + { + "epoch": 0.040724903278354714, + "grad_norm": 12.9375, + "learning_rate": 9.864253191839465e-06, + "loss": 1.7775, + "step": 1400 + }, + { + "epoch": 0.041306687610902636, + "grad_norm": 12.25, + "learning_rate": 9.86231391016685e-06, + "loss": 1.724, + "step": 1420 + }, + { + "epoch": 0.041888471943450564, + "grad_norm": 9.125, + "learning_rate": 9.860374628494236e-06, + "loss": 1.7322, + "step": 1440 + }, + { + "epoch": 0.042470256275998486, + "grad_norm": 13.125, + "learning_rate": 9.858435346821619e-06, + "loss": 1.7101, + "step": 1460 + }, + { + "epoch": 0.043052040608546414, + "grad_norm": 14.0, + "learning_rate": 9.856496065149004e-06, + "loss": 1.7176, + "step": 1480 + }, + { + "epoch": 0.043633824941094336, + "grad_norm": 14.6875, + "learning_rate": 9.85455678347639e-06, + "loss": 1.6656, + "step": 1500 + }, + { + "epoch": 0.04421560927364226, + "grad_norm": 12.5, + "learning_rate": 9.852617501803774e-06, + "loss": 1.7662, + "step": 1520 + }, + { + "epoch": 0.044797393606190186, + "grad_norm": 13.4375, + "learning_rate": 9.85067822013116e-06, + "loss": 1.6732, + "step": 1540 + }, + { + "epoch": 0.04537917793873811, + "grad_norm": 13.0, + "learning_rate": 9.848738938458545e-06, + "loss": 1.7837, + "step": 1560 + }, + { + "epoch": 0.045960962271286036, + "grad_norm": 11.8125, + "learning_rate": 9.84679965678593e-06, + "loss": 1.703, + "step": 1580 + }, + { + "epoch": 0.04654274660383396, + "grad_norm": 9.3125, + "learning_rate": 9.844860375113315e-06, + "loss": 1.7084, + "step": 1600 + }, + { + "epoch": 0.047124530936381887, + "grad_norm": 10.8125, + "learning_rate": 9.8429210934407e-06, + "loss": 1.795, + "step": 1620 + }, + { + "epoch": 0.04770631526892981, + "grad_norm": 10.75, + "learning_rate": 9.840981811768085e-06, + "loss": 1.8042, + "step": 1640 + }, + { + "epoch": 0.04828809960147773, + "grad_norm": 12.4375, + "learning_rate": 9.83904253009547e-06, + "loss": 1.6848, + "step": 1660 + }, + { + "epoch": 0.04886988393402566, + "grad_norm": 14.0, + "learning_rate": 9.837103248422855e-06, + "loss": 1.6867, + "step": 1680 + }, + { + "epoch": 0.04945166826657358, + "grad_norm": 13.9375, + "learning_rate": 9.83516396675024e-06, + "loss": 1.7605, + "step": 1700 + }, + { + "epoch": 0.05003345259912151, + "grad_norm": 13.4375, + "learning_rate": 9.833224685077625e-06, + "loss": 1.6898, + "step": 1720 + }, + { + "epoch": 0.05061523693166943, + "grad_norm": 11.5, + "learning_rate": 9.83128540340501e-06, + "loss": 1.723, + "step": 1740 + }, + { + "epoch": 0.05119702126421735, + "grad_norm": 14.5625, + "learning_rate": 9.829346121732395e-06, + "loss": 1.7505, + "step": 1760 + }, + { + "epoch": 0.05177880559676528, + "grad_norm": 12.1875, + "learning_rate": 9.82740684005978e-06, + "loss": 1.7424, + "step": 1780 + }, + { + "epoch": 0.0523605899293132, + "grad_norm": 13.0625, + "learning_rate": 9.825467558387166e-06, + "loss": 1.7796, + "step": 1800 + }, + { + "epoch": 0.05294237426186113, + "grad_norm": 11.3125, + "learning_rate": 9.82352827671455e-06, + "loss": 1.6866, + "step": 1820 + }, + { + "epoch": 0.05352415859440905, + "grad_norm": 11.375, + "learning_rate": 9.821588995041936e-06, + "loss": 1.812, + "step": 1840 + }, + { + "epoch": 0.054105942926956974, + "grad_norm": 13.625, + "learning_rate": 9.819649713369321e-06, + "loss": 1.7411, + "step": 1860 + }, + { + "epoch": 0.0546877272595049, + "grad_norm": 12.5625, + "learning_rate": 9.817710431696706e-06, + "loss": 1.7386, + "step": 1880 + }, + { + "epoch": 0.055269511592052824, + "grad_norm": 13.9375, + "learning_rate": 9.815771150024091e-06, + "loss": 1.7493, + "step": 1900 + }, + { + "epoch": 0.05585129592460075, + "grad_norm": 16.625, + "learning_rate": 9.813831868351476e-06, + "loss": 1.7573, + "step": 1920 + }, + { + "epoch": 0.056433080257148674, + "grad_norm": 15.3125, + "learning_rate": 9.811892586678861e-06, + "loss": 1.7363, + "step": 1940 + }, + { + "epoch": 0.0570148645896966, + "grad_norm": 13.75, + "learning_rate": 9.809953305006246e-06, + "loss": 1.7155, + "step": 1960 + }, + { + "epoch": 0.057596648922244524, + "grad_norm": 13.5, + "learning_rate": 9.808014023333632e-06, + "loss": 1.7193, + "step": 1980 + }, + { + "epoch": 0.058178433254792446, + "grad_norm": 14.5625, + "learning_rate": 9.806074741661017e-06, + "loss": 1.6673, + "step": 2000 + }, + { + "epoch": 0.058760217587340374, + "grad_norm": 12.375, + "learning_rate": 9.804135459988402e-06, + "loss": 1.7952, + "step": 2020 + }, + { + "epoch": 0.059342001919888296, + "grad_norm": 13.1875, + "learning_rate": 9.802196178315787e-06, + "loss": 1.7525, + "step": 2040 + }, + { + "epoch": 0.059923786252436224, + "grad_norm": 12.6875, + "learning_rate": 9.800256896643172e-06, + "loss": 1.7017, + "step": 2060 + }, + { + "epoch": 0.060505570584984146, + "grad_norm": 14.0625, + "learning_rate": 9.798317614970557e-06, + "loss": 1.6701, + "step": 2080 + }, + { + "epoch": 0.06108735491753207, + "grad_norm": 12.8125, + "learning_rate": 9.796378333297942e-06, + "loss": 1.7619, + "step": 2100 + }, + { + "epoch": 0.061669139250079996, + "grad_norm": 12.875, + "learning_rate": 9.794439051625327e-06, + "loss": 1.6994, + "step": 2120 + }, + { + "epoch": 0.06225092358262792, + "grad_norm": 13.6875, + "learning_rate": 9.792499769952712e-06, + "loss": 1.6774, + "step": 2140 + }, + { + "epoch": 0.06283270791517584, + "grad_norm": 10.6875, + "learning_rate": 9.790560488280097e-06, + "loss": 1.7265, + "step": 2160 + }, + { + "epoch": 0.06341449224772377, + "grad_norm": 11.375, + "learning_rate": 9.788621206607483e-06, + "loss": 1.749, + "step": 2180 + }, + { + "epoch": 0.0639962765802717, + "grad_norm": 11.0, + "learning_rate": 9.786681924934868e-06, + "loss": 1.7317, + "step": 2200 + }, + { + "epoch": 0.06457806091281962, + "grad_norm": 12.4375, + "learning_rate": 9.784742643262253e-06, + "loss": 1.7559, + "step": 2220 + }, + { + "epoch": 0.06515984524536754, + "grad_norm": 10.375, + "learning_rate": 9.782803361589638e-06, + "loss": 1.7727, + "step": 2240 + }, + { + "epoch": 0.06574162957791546, + "grad_norm": 14.0625, + "learning_rate": 9.780864079917023e-06, + "loss": 1.6845, + "step": 2260 + }, + { + "epoch": 0.0663234139104634, + "grad_norm": 12.8125, + "learning_rate": 9.778924798244408e-06, + "loss": 1.762, + "step": 2280 + }, + { + "epoch": 0.06690519824301132, + "grad_norm": 11.9375, + "learning_rate": 9.776985516571793e-06, + "loss": 1.7802, + "step": 2300 + }, + { + "epoch": 0.06748698257555924, + "grad_norm": 10.125, + "learning_rate": 9.775046234899178e-06, + "loss": 1.6035, + "step": 2320 + }, + { + "epoch": 0.06806876690810716, + "grad_norm": 11.4375, + "learning_rate": 9.773106953226563e-06, + "loss": 1.7954, + "step": 2340 + }, + { + "epoch": 0.06865055124065508, + "grad_norm": 13.4375, + "learning_rate": 9.771167671553948e-06, + "loss": 1.7277, + "step": 2360 + }, + { + "epoch": 0.06923233557320302, + "grad_norm": 14.0, + "learning_rate": 9.769228389881334e-06, + "loss": 1.7112, + "step": 2380 + }, + { + "epoch": 0.06981411990575094, + "grad_norm": 11.875, + "learning_rate": 9.767289108208719e-06, + "loss": 1.7412, + "step": 2400 + }, + { + "epoch": 0.07039590423829886, + "grad_norm": 14.6875, + "learning_rate": 9.765349826536104e-06, + "loss": 1.6549, + "step": 2420 + }, + { + "epoch": 0.07097768857084678, + "grad_norm": 13.125, + "learning_rate": 9.763410544863489e-06, + "loss": 1.7308, + "step": 2440 + }, + { + "epoch": 0.0715594729033947, + "grad_norm": 10.25, + "learning_rate": 9.761471263190874e-06, + "loss": 1.7989, + "step": 2460 + }, + { + "epoch": 0.07214125723594264, + "grad_norm": 12.8125, + "learning_rate": 9.759531981518259e-06, + "loss": 1.6674, + "step": 2480 + }, + { + "epoch": 0.07272304156849056, + "grad_norm": 12.125, + "learning_rate": 9.757592699845644e-06, + "loss": 1.7343, + "step": 2500 + }, + { + "epoch": 0.07330482590103848, + "grad_norm": 13.1875, + "learning_rate": 9.75565341817303e-06, + "loss": 1.7163, + "step": 2520 + }, + { + "epoch": 0.0738866102335864, + "grad_norm": 13.3125, + "learning_rate": 9.753714136500414e-06, + "loss": 1.7604, + "step": 2540 + }, + { + "epoch": 0.07446839456613433, + "grad_norm": 12.9375, + "learning_rate": 9.7517748548278e-06, + "loss": 1.73, + "step": 2560 + }, + { + "epoch": 0.07505017889868226, + "grad_norm": 12.625, + "learning_rate": 9.749835573155184e-06, + "loss": 1.721, + "step": 2580 + }, + { + "epoch": 0.07563196323123018, + "grad_norm": 11.6875, + "learning_rate": 9.74789629148257e-06, + "loss": 1.7146, + "step": 2600 + }, + { + "epoch": 0.0762137475637781, + "grad_norm": 15.3125, + "learning_rate": 9.745957009809955e-06, + "loss": 1.7791, + "step": 2620 + }, + { + "epoch": 0.07679553189632603, + "grad_norm": 16.0, + "learning_rate": 9.74401772813734e-06, + "loss": 1.7371, + "step": 2640 + }, + { + "epoch": 0.07737731622887396, + "grad_norm": 11.5625, + "learning_rate": 9.742078446464725e-06, + "loss": 1.7157, + "step": 2660 + }, + { + "epoch": 0.07795910056142188, + "grad_norm": 12.375, + "learning_rate": 9.74013916479211e-06, + "loss": 1.6902, + "step": 2680 + }, + { + "epoch": 0.0785408848939698, + "grad_norm": 13.1875, + "learning_rate": 9.738199883119495e-06, + "loss": 1.7734, + "step": 2700 + }, + { + "epoch": 0.07912266922651773, + "grad_norm": 13.5625, + "learning_rate": 9.73626060144688e-06, + "loss": 1.7238, + "step": 2720 + }, + { + "epoch": 0.07970445355906565, + "grad_norm": 10.9375, + "learning_rate": 9.734321319774265e-06, + "loss": 1.7213, + "step": 2740 + }, + { + "epoch": 0.08028623789161358, + "grad_norm": 12.5625, + "learning_rate": 9.73238203810165e-06, + "loss": 1.647, + "step": 2760 + }, + { + "epoch": 0.0808680222241615, + "grad_norm": 14.875, + "learning_rate": 9.730442756429035e-06, + "loss": 1.6843, + "step": 2780 + }, + { + "epoch": 0.08144980655670943, + "grad_norm": 13.0625, + "learning_rate": 9.72850347475642e-06, + "loss": 1.734, + "step": 2800 + }, + { + "epoch": 0.08203159088925735, + "grad_norm": 11.0, + "learning_rate": 9.726564193083806e-06, + "loss": 1.7499, + "step": 2820 + }, + { + "epoch": 0.08261337522180527, + "grad_norm": 16.75, + "learning_rate": 9.72462491141119e-06, + "loss": 1.7646, + "step": 2840 + }, + { + "epoch": 0.0831951595543532, + "grad_norm": 11.8125, + "learning_rate": 9.722685629738576e-06, + "loss": 1.7599, + "step": 2860 + }, + { + "epoch": 0.08377694388690113, + "grad_norm": 11.5, + "learning_rate": 9.720746348065961e-06, + "loss": 1.7405, + "step": 2880 + }, + { + "epoch": 0.08435872821944905, + "grad_norm": 11.375, + "learning_rate": 9.718807066393346e-06, + "loss": 1.6979, + "step": 2900 + }, + { + "epoch": 0.08494051255199697, + "grad_norm": 11.8125, + "learning_rate": 9.716867784720731e-06, + "loss": 1.7044, + "step": 2920 + }, + { + "epoch": 0.0855222968845449, + "grad_norm": 12.25, + "learning_rate": 9.714928503048116e-06, + "loss": 1.7057, + "step": 2940 + }, + { + "epoch": 0.08610408121709283, + "grad_norm": 12.3125, + "learning_rate": 9.712989221375501e-06, + "loss": 1.6687, + "step": 2960 + }, + { + "epoch": 0.08668586554964075, + "grad_norm": 12.5625, + "learning_rate": 9.711049939702886e-06, + "loss": 1.7973, + "step": 2980 + }, + { + "epoch": 0.08726764988218867, + "grad_norm": 11.4375, + "learning_rate": 9.709110658030272e-06, + "loss": 1.7279, + "step": 3000 + }, + { + "epoch": 0.0878494342147366, + "grad_norm": 12.4375, + "learning_rate": 9.707171376357657e-06, + "loss": 1.6971, + "step": 3020 + }, + { + "epoch": 0.08843121854728452, + "grad_norm": 11.125, + "learning_rate": 9.705232094685042e-06, + "loss": 1.679, + "step": 3040 + }, + { + "epoch": 0.08901300287983245, + "grad_norm": 14.875, + "learning_rate": 9.703292813012427e-06, + "loss": 1.699, + "step": 3060 + }, + { + "epoch": 0.08959478721238037, + "grad_norm": 11.875, + "learning_rate": 9.70135353133981e-06, + "loss": 1.803, + "step": 3080 + }, + { + "epoch": 0.0901765715449283, + "grad_norm": 13.75, + "learning_rate": 9.699414249667195e-06, + "loss": 1.6699, + "step": 3100 + }, + { + "epoch": 0.09075835587747622, + "grad_norm": 11.9375, + "learning_rate": 9.69747496799458e-06, + "loss": 1.7585, + "step": 3120 + }, + { + "epoch": 0.09134014021002414, + "grad_norm": 15.1875, + "learning_rate": 9.695535686321966e-06, + "loss": 1.7541, + "step": 3140 + }, + { + "epoch": 0.09192192454257207, + "grad_norm": 14.125, + "learning_rate": 9.69359640464935e-06, + "loss": 1.7145, + "step": 3160 + }, + { + "epoch": 0.09250370887512, + "grad_norm": 12.9375, + "learning_rate": 9.691657122976736e-06, + "loss": 1.7008, + "step": 3180 + }, + { + "epoch": 0.09308549320766792, + "grad_norm": 11.8125, + "learning_rate": 9.68971784130412e-06, + "loss": 1.6591, + "step": 3200 + }, + { + "epoch": 0.09366727754021584, + "grad_norm": 11.9375, + "learning_rate": 9.687778559631506e-06, + "loss": 1.6305, + "step": 3220 + }, + { + "epoch": 0.09424906187276377, + "grad_norm": 12.0625, + "learning_rate": 9.685839277958891e-06, + "loss": 1.7425, + "step": 3240 + }, + { + "epoch": 0.0948308462053117, + "grad_norm": 12.3125, + "learning_rate": 9.683899996286276e-06, + "loss": 1.7959, + "step": 3260 + }, + { + "epoch": 0.09541263053785962, + "grad_norm": 12.875, + "learning_rate": 9.681960714613661e-06, + "loss": 1.765, + "step": 3280 + }, + { + "epoch": 0.09599441487040754, + "grad_norm": 14.0625, + "learning_rate": 9.680021432941046e-06, + "loss": 1.7248, + "step": 3300 + }, + { + "epoch": 0.09657619920295546, + "grad_norm": 14.875, + "learning_rate": 9.678082151268431e-06, + "loss": 1.6914, + "step": 3320 + }, + { + "epoch": 0.0971579835355034, + "grad_norm": 12.4375, + "learning_rate": 9.676142869595816e-06, + "loss": 1.7039, + "step": 3340 + }, + { + "epoch": 0.09773976786805132, + "grad_norm": 11.8125, + "learning_rate": 9.674203587923202e-06, + "loss": 1.5978, + "step": 3360 + }, + { + "epoch": 0.09832155220059924, + "grad_norm": 12.625, + "learning_rate": 9.672264306250587e-06, + "loss": 1.6436, + "step": 3380 + }, + { + "epoch": 0.09890333653314716, + "grad_norm": 10.0625, + "learning_rate": 9.670325024577972e-06, + "loss": 1.7001, + "step": 3400 + }, + { + "epoch": 0.09948512086569508, + "grad_norm": 12.3125, + "learning_rate": 9.668385742905357e-06, + "loss": 1.7009, + "step": 3420 + }, + { + "epoch": 0.10006690519824302, + "grad_norm": 13.1875, + "learning_rate": 9.666446461232742e-06, + "loss": 1.7319, + "step": 3440 + }, + { + "epoch": 0.10064868953079094, + "grad_norm": 17.75, + "learning_rate": 9.664507179560127e-06, + "loss": 1.7174, + "step": 3460 + }, + { + "epoch": 0.10123047386333886, + "grad_norm": 10.5, + "learning_rate": 9.662567897887512e-06, + "loss": 1.7301, + "step": 3480 + }, + { + "epoch": 0.10181225819588678, + "grad_norm": 10.75, + "learning_rate": 9.660628616214897e-06, + "loss": 1.6914, + "step": 3500 + }, + { + "epoch": 0.1023940425284347, + "grad_norm": 12.75, + "learning_rate": 9.658689334542282e-06, + "loss": 1.6277, + "step": 3520 + }, + { + "epoch": 0.10297582686098264, + "grad_norm": 14.4375, + "learning_rate": 9.656750052869667e-06, + "loss": 1.6839, + "step": 3540 + }, + { + "epoch": 0.10355761119353056, + "grad_norm": 12.5625, + "learning_rate": 9.654810771197053e-06, + "loss": 1.7156, + "step": 3560 + }, + { + "epoch": 0.10413939552607848, + "grad_norm": 13.6875, + "learning_rate": 9.652871489524438e-06, + "loss": 1.6836, + "step": 3580 + }, + { + "epoch": 0.1047211798586264, + "grad_norm": 13.3125, + "learning_rate": 9.650932207851823e-06, + "loss": 1.7005, + "step": 3600 + }, + { + "epoch": 0.10530296419117433, + "grad_norm": 10.6875, + "learning_rate": 9.648992926179208e-06, + "loss": 1.7016, + "step": 3620 + }, + { + "epoch": 0.10588474852372226, + "grad_norm": 13.4375, + "learning_rate": 9.647053644506591e-06, + "loss": 1.6735, + "step": 3640 + }, + { + "epoch": 0.10646653285627018, + "grad_norm": 13.75, + "learning_rate": 9.645114362833976e-06, + "loss": 1.7366, + "step": 3660 + }, + { + "epoch": 0.1070483171888181, + "grad_norm": 12.75, + "learning_rate": 9.643175081161361e-06, + "loss": 1.6118, + "step": 3680 + }, + { + "epoch": 0.10763010152136603, + "grad_norm": 14.75, + "learning_rate": 9.641235799488747e-06, + "loss": 1.7359, + "step": 3700 + }, + { + "epoch": 0.10821188585391395, + "grad_norm": 13.0625, + "learning_rate": 9.639296517816132e-06, + "loss": 1.687, + "step": 3720 + }, + { + "epoch": 0.10879367018646188, + "grad_norm": 12.75, + "learning_rate": 9.637357236143517e-06, + "loss": 1.6533, + "step": 3740 + }, + { + "epoch": 0.1093754545190098, + "grad_norm": 13.9375, + "learning_rate": 9.635417954470902e-06, + "loss": 1.742, + "step": 3760 + }, + { + "epoch": 0.10995723885155773, + "grad_norm": 10.8125, + "learning_rate": 9.633478672798287e-06, + "loss": 1.6269, + "step": 3780 + }, + { + "epoch": 0.11053902318410565, + "grad_norm": 11.625, + "learning_rate": 9.631539391125672e-06, + "loss": 1.7434, + "step": 3800 + }, + { + "epoch": 0.11112080751665358, + "grad_norm": 12.1875, + "learning_rate": 9.629600109453057e-06, + "loss": 1.6067, + "step": 3820 + }, + { + "epoch": 0.1117025918492015, + "grad_norm": 14.375, + "learning_rate": 9.627660827780442e-06, + "loss": 1.6903, + "step": 3840 + }, + { + "epoch": 0.11228437618174943, + "grad_norm": 13.9375, + "learning_rate": 9.625721546107827e-06, + "loss": 1.6103, + "step": 3860 + }, + { + "epoch": 0.11286616051429735, + "grad_norm": 13.1875, + "learning_rate": 9.623782264435212e-06, + "loss": 1.7284, + "step": 3880 + }, + { + "epoch": 0.11344794484684527, + "grad_norm": 14.375, + "learning_rate": 9.621842982762598e-06, + "loss": 1.6879, + "step": 3900 + }, + { + "epoch": 0.1140297291793932, + "grad_norm": 13.9375, + "learning_rate": 9.619903701089983e-06, + "loss": 1.7217, + "step": 3920 + }, + { + "epoch": 0.11461151351194113, + "grad_norm": 10.125, + "learning_rate": 9.617964419417368e-06, + "loss": 1.6572, + "step": 3940 + }, + { + "epoch": 0.11519329784448905, + "grad_norm": 12.6875, + "learning_rate": 9.616025137744753e-06, + "loss": 1.7197, + "step": 3960 + }, + { + "epoch": 0.11577508217703697, + "grad_norm": 11.8125, + "learning_rate": 9.614085856072138e-06, + "loss": 1.7059, + "step": 3980 + }, + { + "epoch": 0.11635686650958489, + "grad_norm": 10.4375, + "learning_rate": 9.612146574399523e-06, + "loss": 1.6629, + "step": 4000 + }, + { + "epoch": 0.11693865084213283, + "grad_norm": 15.75, + "learning_rate": 9.610207292726908e-06, + "loss": 1.5862, + "step": 4020 + }, + { + "epoch": 0.11752043517468075, + "grad_norm": 12.9375, + "learning_rate": 9.608268011054293e-06, + "loss": 1.7666, + "step": 4040 + }, + { + "epoch": 0.11810221950722867, + "grad_norm": 11.1875, + "learning_rate": 9.606328729381678e-06, + "loss": 1.7658, + "step": 4060 + }, + { + "epoch": 0.11868400383977659, + "grad_norm": 12.1875, + "learning_rate": 9.604389447709063e-06, + "loss": 1.6754, + "step": 4080 + }, + { + "epoch": 0.11926578817232451, + "grad_norm": 12.625, + "learning_rate": 9.602450166036448e-06, + "loss": 1.7163, + "step": 4100 + }, + { + "epoch": 0.11984757250487245, + "grad_norm": 12.0, + "learning_rate": 9.600510884363834e-06, + "loss": 1.7055, + "step": 4120 + }, + { + "epoch": 0.12042935683742037, + "grad_norm": 13.8125, + "learning_rate": 9.598571602691219e-06, + "loss": 1.6927, + "step": 4140 + }, + { + "epoch": 0.12101114116996829, + "grad_norm": 11.375, + "learning_rate": 9.596632321018604e-06, + "loss": 1.6708, + "step": 4160 + }, + { + "epoch": 0.12159292550251621, + "grad_norm": 12.3125, + "learning_rate": 9.594693039345989e-06, + "loss": 1.6681, + "step": 4180 + }, + { + "epoch": 0.12217470983506414, + "grad_norm": 13.125, + "learning_rate": 9.592753757673374e-06, + "loss": 1.6725, + "step": 4200 + }, + { + "epoch": 0.12275649416761207, + "grad_norm": 15.0625, + "learning_rate": 9.590814476000759e-06, + "loss": 1.6429, + "step": 4220 + }, + { + "epoch": 0.12333827850015999, + "grad_norm": 12.625, + "learning_rate": 9.588875194328144e-06, + "loss": 1.67, + "step": 4240 + }, + { + "epoch": 0.12392006283270791, + "grad_norm": 10.9375, + "learning_rate": 9.58693591265553e-06, + "loss": 1.6725, + "step": 4260 + }, + { + "epoch": 0.12450184716525584, + "grad_norm": 12.1875, + "learning_rate": 9.584996630982914e-06, + "loss": 1.6875, + "step": 4280 + }, + { + "epoch": 0.12508363149780377, + "grad_norm": 12.125, + "learning_rate": 9.5830573493103e-06, + "loss": 1.6594, + "step": 4300 + }, + { + "epoch": 0.12566541583035168, + "grad_norm": 12.6875, + "learning_rate": 9.581118067637685e-06, + "loss": 1.6841, + "step": 4320 + }, + { + "epoch": 0.12624720016289961, + "grad_norm": 12.75, + "learning_rate": 9.57917878596507e-06, + "loss": 1.6427, + "step": 4340 + }, + { + "epoch": 0.12682898449544755, + "grad_norm": 13.625, + "learning_rate": 9.577239504292455e-06, + "loss": 1.6997, + "step": 4360 + }, + { + "epoch": 0.12741076882799546, + "grad_norm": 14.75, + "learning_rate": 9.57530022261984e-06, + "loss": 1.732, + "step": 4380 + }, + { + "epoch": 0.1279925531605434, + "grad_norm": 15.125, + "learning_rate": 9.573360940947225e-06, + "loss": 1.8135, + "step": 4400 + }, + { + "epoch": 0.1285743374930913, + "grad_norm": 11.75, + "learning_rate": 9.57142165927461e-06, + "loss": 1.6542, + "step": 4420 + }, + { + "epoch": 0.12915612182563924, + "grad_norm": 11.5, + "learning_rate": 9.569482377601995e-06, + "loss": 1.6981, + "step": 4440 + }, + { + "epoch": 0.12973790615818717, + "grad_norm": 11.5, + "learning_rate": 9.56754309592938e-06, + "loss": 1.6306, + "step": 4460 + }, + { + "epoch": 0.13031969049073508, + "grad_norm": 10.75, + "learning_rate": 9.565603814256765e-06, + "loss": 1.6223, + "step": 4480 + }, + { + "epoch": 0.13090147482328301, + "grad_norm": 13.1875, + "learning_rate": 9.56366453258415e-06, + "loss": 1.6891, + "step": 4500 + }, + { + "epoch": 0.13148325915583092, + "grad_norm": 17.125, + "learning_rate": 9.561725250911536e-06, + "loss": 1.691, + "step": 4520 + }, + { + "epoch": 0.13206504348837886, + "grad_norm": 13.6875, + "learning_rate": 9.55978596923892e-06, + "loss": 1.5271, + "step": 4540 + }, + { + "epoch": 0.1326468278209268, + "grad_norm": 11.375, + "learning_rate": 9.557846687566306e-06, + "loss": 1.6127, + "step": 4560 + }, + { + "epoch": 0.1332286121534747, + "grad_norm": 15.1875, + "learning_rate": 9.55590740589369e-06, + "loss": 1.7001, + "step": 4580 + }, + { + "epoch": 0.13381039648602264, + "grad_norm": 15.4375, + "learning_rate": 9.553968124221076e-06, + "loss": 1.6584, + "step": 4600 + }, + { + "epoch": 0.13439218081857054, + "grad_norm": 11.8125, + "learning_rate": 9.552028842548461e-06, + "loss": 1.6233, + "step": 4620 + }, + { + "epoch": 0.13497396515111848, + "grad_norm": 15.625, + "learning_rate": 9.550089560875846e-06, + "loss": 1.7007, + "step": 4640 + }, + { + "epoch": 0.13555574948366642, + "grad_norm": 12.9375, + "learning_rate": 9.548150279203231e-06, + "loss": 1.7278, + "step": 4660 + }, + { + "epoch": 0.13613753381621432, + "grad_norm": 12.6875, + "learning_rate": 9.546210997530616e-06, + "loss": 1.7739, + "step": 4680 + }, + { + "epoch": 0.13671931814876226, + "grad_norm": 13.4375, + "learning_rate": 9.544271715858001e-06, + "loss": 1.6601, + "step": 4700 + }, + { + "epoch": 0.13730110248131017, + "grad_norm": 15.0625, + "learning_rate": 9.542332434185387e-06, + "loss": 1.6969, + "step": 4720 + }, + { + "epoch": 0.1378828868138581, + "grad_norm": 14.4375, + "learning_rate": 9.540393152512772e-06, + "loss": 1.6679, + "step": 4740 + }, + { + "epoch": 0.13846467114640604, + "grad_norm": 12.375, + "learning_rate": 9.538453870840157e-06, + "loss": 1.6858, + "step": 4760 + }, + { + "epoch": 0.13904645547895395, + "grad_norm": 11.3125, + "learning_rate": 9.536514589167542e-06, + "loss": 1.6907, + "step": 4780 + }, + { + "epoch": 0.13962823981150188, + "grad_norm": 11.8125, + "learning_rate": 9.534575307494927e-06, + "loss": 1.6233, + "step": 4800 + }, + { + "epoch": 0.1402100241440498, + "grad_norm": 12.75, + "learning_rate": 9.532636025822312e-06, + "loss": 1.689, + "step": 4820 + }, + { + "epoch": 0.14079180847659772, + "grad_norm": 13.1875, + "learning_rate": 9.530696744149697e-06, + "loss": 1.6551, + "step": 4840 + }, + { + "epoch": 0.14137359280914566, + "grad_norm": 13.8125, + "learning_rate": 9.528757462477082e-06, + "loss": 1.6108, + "step": 4860 + }, + { + "epoch": 0.14195537714169357, + "grad_norm": 11.625, + "learning_rate": 9.526818180804467e-06, + "loss": 1.7142, + "step": 4880 + }, + { + "epoch": 0.1425371614742415, + "grad_norm": 12.75, + "learning_rate": 9.524878899131852e-06, + "loss": 1.6855, + "step": 4900 + }, + { + "epoch": 0.1431189458067894, + "grad_norm": 10.0, + "learning_rate": 9.522939617459237e-06, + "loss": 1.7138, + "step": 4920 + }, + { + "epoch": 0.14370073013933735, + "grad_norm": 11.75, + "learning_rate": 9.521000335786623e-06, + "loss": 1.7594, + "step": 4940 + }, + { + "epoch": 0.14428251447188528, + "grad_norm": 12.0625, + "learning_rate": 9.519061054114008e-06, + "loss": 1.6585, + "step": 4960 + }, + { + "epoch": 0.1448642988044332, + "grad_norm": 14.5625, + "learning_rate": 9.517121772441393e-06, + "loss": 1.7204, + "step": 4980 + }, + { + "epoch": 0.14544608313698112, + "grad_norm": 12.5, + "learning_rate": 9.515182490768778e-06, + "loss": 1.6766, + "step": 5000 + }, + { + "epoch": 0.14602786746952903, + "grad_norm": 13.125, + "learning_rate": 9.513243209096163e-06, + "loss": 1.7624, + "step": 5020 + }, + { + "epoch": 0.14660965180207697, + "grad_norm": 12.1875, + "learning_rate": 9.511303927423548e-06, + "loss": 1.7071, + "step": 5040 + }, + { + "epoch": 0.1471914361346249, + "grad_norm": 12.8125, + "learning_rate": 9.509364645750933e-06, + "loss": 1.7172, + "step": 5060 + }, + { + "epoch": 0.1477732204671728, + "grad_norm": 10.9375, + "learning_rate": 9.507425364078318e-06, + "loss": 1.7156, + "step": 5080 + }, + { + "epoch": 0.14835500479972075, + "grad_norm": 12.9375, + "learning_rate": 9.505486082405703e-06, + "loss": 1.6806, + "step": 5100 + }, + { + "epoch": 0.14893678913226865, + "grad_norm": 13.0, + "learning_rate": 9.503546800733088e-06, + "loss": 1.652, + "step": 5120 + }, + { + "epoch": 0.1495185734648166, + "grad_norm": 14.0625, + "learning_rate": 9.501607519060474e-06, + "loss": 1.6509, + "step": 5140 + }, + { + "epoch": 0.15010035779736453, + "grad_norm": 12.0, + "learning_rate": 9.499668237387859e-06, + "loss": 1.7384, + "step": 5160 + }, + { + "epoch": 0.15068214212991243, + "grad_norm": 13.25, + "learning_rate": 9.497728955715244e-06, + "loss": 1.6139, + "step": 5180 + }, + { + "epoch": 0.15126392646246037, + "grad_norm": 13.3125, + "learning_rate": 9.495789674042629e-06, + "loss": 1.708, + "step": 5200 + }, + { + "epoch": 0.1518457107950083, + "grad_norm": 15.6875, + "learning_rate": 9.493850392370014e-06, + "loss": 1.6423, + "step": 5220 + }, + { + "epoch": 0.1524274951275562, + "grad_norm": 12.9375, + "learning_rate": 9.491911110697399e-06, + "loss": 1.7107, + "step": 5240 + }, + { + "epoch": 0.15300927946010415, + "grad_norm": 12.9375, + "learning_rate": 9.489971829024784e-06, + "loss": 1.6481, + "step": 5260 + }, + { + "epoch": 0.15359106379265205, + "grad_norm": 12.875, + "learning_rate": 9.488032547352168e-06, + "loss": 1.5836, + "step": 5280 + }, + { + "epoch": 0.1541728481252, + "grad_norm": 10.6875, + "learning_rate": 9.486093265679553e-06, + "loss": 1.6645, + "step": 5300 + }, + { + "epoch": 0.15475463245774793, + "grad_norm": 11.875, + "learning_rate": 9.484153984006938e-06, + "loss": 1.7185, + "step": 5320 + }, + { + "epoch": 0.15533641679029583, + "grad_norm": 13.0625, + "learning_rate": 9.482214702334323e-06, + "loss": 1.5641, + "step": 5340 + }, + { + "epoch": 0.15591820112284377, + "grad_norm": 12.875, + "learning_rate": 9.480275420661708e-06, + "loss": 1.7268, + "step": 5360 + }, + { + "epoch": 0.15649998545539168, + "grad_norm": 9.9375, + "learning_rate": 9.478336138989093e-06, + "loss": 1.6341, + "step": 5380 + }, + { + "epoch": 0.1570817697879396, + "grad_norm": 15.8125, + "learning_rate": 9.476396857316478e-06, + "loss": 1.7263, + "step": 5400 + }, + { + "epoch": 0.15766355412048755, + "grad_norm": 12.9375, + "learning_rate": 9.474457575643863e-06, + "loss": 1.7495, + "step": 5420 + }, + { + "epoch": 0.15824533845303546, + "grad_norm": 14.9375, + "learning_rate": 9.472518293971248e-06, + "loss": 1.7751, + "step": 5440 + }, + { + "epoch": 0.1588271227855834, + "grad_norm": 12.375, + "learning_rate": 9.470579012298633e-06, + "loss": 1.6839, + "step": 5460 + }, + { + "epoch": 0.1594089071181313, + "grad_norm": 13.0, + "learning_rate": 9.468639730626019e-06, + "loss": 1.6549, + "step": 5480 + }, + { + "epoch": 0.15999069145067923, + "grad_norm": 12.125, + "learning_rate": 9.466700448953404e-06, + "loss": 1.6218, + "step": 5500 + }, + { + "epoch": 0.16057247578322717, + "grad_norm": 10.125, + "learning_rate": 9.464761167280789e-06, + "loss": 1.7006, + "step": 5520 + }, + { + "epoch": 0.16115426011577508, + "grad_norm": 14.25, + "learning_rate": 9.462821885608174e-06, + "loss": 1.7311, + "step": 5540 + }, + { + "epoch": 0.161736044448323, + "grad_norm": 11.125, + "learning_rate": 9.460882603935559e-06, + "loss": 1.6975, + "step": 5560 + }, + { + "epoch": 0.16231782878087092, + "grad_norm": 10.0625, + "learning_rate": 9.458943322262944e-06, + "loss": 1.6798, + "step": 5580 + }, + { + "epoch": 0.16289961311341886, + "grad_norm": 11.5625, + "learning_rate": 9.457004040590329e-06, + "loss": 1.7398, + "step": 5600 + }, + { + "epoch": 0.1634813974459668, + "grad_norm": 12.125, + "learning_rate": 9.455064758917714e-06, + "loss": 1.6346, + "step": 5620 + }, + { + "epoch": 0.1640631817785147, + "grad_norm": 16.375, + "learning_rate": 9.4531254772451e-06, + "loss": 1.6982, + "step": 5640 + }, + { + "epoch": 0.16464496611106264, + "grad_norm": 13.625, + "learning_rate": 9.451186195572484e-06, + "loss": 1.6089, + "step": 5660 + }, + { + "epoch": 0.16522675044361054, + "grad_norm": 12.125, + "learning_rate": 9.44924691389987e-06, + "loss": 1.5636, + "step": 5680 + }, + { + "epoch": 0.16580853477615848, + "grad_norm": 11.75, + "learning_rate": 9.447307632227255e-06, + "loss": 1.6292, + "step": 5700 + }, + { + "epoch": 0.1663903191087064, + "grad_norm": 11.8125, + "learning_rate": 9.44536835055464e-06, + "loss": 1.6882, + "step": 5720 + }, + { + "epoch": 0.16697210344125432, + "grad_norm": 12.125, + "learning_rate": 9.443429068882025e-06, + "loss": 1.5953, + "step": 5740 + }, + { + "epoch": 0.16755388777380226, + "grad_norm": 11.0, + "learning_rate": 9.44148978720941e-06, + "loss": 1.6087, + "step": 5760 + }, + { + "epoch": 0.16813567210635016, + "grad_norm": 12.5, + "learning_rate": 9.439550505536795e-06, + "loss": 1.6753, + "step": 5780 + }, + { + "epoch": 0.1687174564388981, + "grad_norm": 12.875, + "learning_rate": 9.43761122386418e-06, + "loss": 1.6255, + "step": 5800 + }, + { + "epoch": 0.16929924077144604, + "grad_norm": 13.0625, + "learning_rate": 9.435671942191565e-06, + "loss": 1.6765, + "step": 5820 + }, + { + "epoch": 0.16988102510399394, + "grad_norm": 9.6875, + "learning_rate": 9.43373266051895e-06, + "loss": 1.7139, + "step": 5840 + }, + { + "epoch": 0.17046280943654188, + "grad_norm": 10.6875, + "learning_rate": 9.431793378846335e-06, + "loss": 1.7313, + "step": 5860 + }, + { + "epoch": 0.1710445937690898, + "grad_norm": 12.8125, + "learning_rate": 9.42985409717372e-06, + "loss": 1.6744, + "step": 5880 + }, + { + "epoch": 0.17162637810163772, + "grad_norm": 11.875, + "learning_rate": 9.427914815501106e-06, + "loss": 1.6938, + "step": 5900 + }, + { + "epoch": 0.17220816243418566, + "grad_norm": 13.125, + "learning_rate": 9.42597553382849e-06, + "loss": 1.7012, + "step": 5920 + }, + { + "epoch": 0.17278994676673357, + "grad_norm": 10.625, + "learning_rate": 9.424036252155876e-06, + "loss": 1.6517, + "step": 5940 + }, + { + "epoch": 0.1733717310992815, + "grad_norm": 13.5625, + "learning_rate": 9.422096970483261e-06, + "loss": 1.7294, + "step": 5960 + }, + { + "epoch": 0.1739535154318294, + "grad_norm": 9.25, + "learning_rate": 9.420157688810646e-06, + "loss": 1.761, + "step": 5980 + }, + { + "epoch": 0.17453529976437734, + "grad_norm": 12.9375, + "learning_rate": 9.418218407138031e-06, + "loss": 1.7016, + "step": 6000 + }, + { + "epoch": 0.17511708409692528, + "grad_norm": 13.875, + "learning_rate": 9.416279125465416e-06, + "loss": 1.6872, + "step": 6020 + }, + { + "epoch": 0.1756988684294732, + "grad_norm": 12.0625, + "learning_rate": 9.414339843792801e-06, + "loss": 1.6448, + "step": 6040 + }, + { + "epoch": 0.17628065276202112, + "grad_norm": 14.6875, + "learning_rate": 9.412400562120186e-06, + "loss": 1.7946, + "step": 6060 + }, + { + "epoch": 0.17686243709456903, + "grad_norm": 11.875, + "learning_rate": 9.410461280447571e-06, + "loss": 1.5854, + "step": 6080 + }, + { + "epoch": 0.17744422142711697, + "grad_norm": 15.75, + "learning_rate": 9.408521998774957e-06, + "loss": 1.6513, + "step": 6100 + }, + { + "epoch": 0.1780260057596649, + "grad_norm": 12.0625, + "learning_rate": 9.406582717102342e-06, + "loss": 1.6085, + "step": 6120 + }, + { + "epoch": 0.1786077900922128, + "grad_norm": 14.25, + "learning_rate": 9.404643435429727e-06, + "loss": 1.7134, + "step": 6140 + }, + { + "epoch": 0.17918957442476074, + "grad_norm": 9.25, + "learning_rate": 9.402704153757112e-06, + "loss": 1.6351, + "step": 6160 + }, + { + "epoch": 0.17977135875730865, + "grad_norm": 12.375, + "learning_rate": 9.400764872084497e-06, + "loss": 1.6563, + "step": 6180 + }, + { + "epoch": 0.1803531430898566, + "grad_norm": 15.125, + "learning_rate": 9.398825590411882e-06, + "loss": 1.6515, + "step": 6200 + }, + { + "epoch": 0.18093492742240452, + "grad_norm": 12.8125, + "learning_rate": 9.396886308739267e-06, + "loss": 1.593, + "step": 6220 + }, + { + "epoch": 0.18151671175495243, + "grad_norm": 11.0625, + "learning_rate": 9.394947027066652e-06, + "loss": 1.6267, + "step": 6240 + }, + { + "epoch": 0.18209849608750037, + "grad_norm": 13.875, + "learning_rate": 9.393007745394037e-06, + "loss": 1.6304, + "step": 6260 + }, + { + "epoch": 0.18268028042004827, + "grad_norm": 11.375, + "learning_rate": 9.391068463721422e-06, + "loss": 1.7136, + "step": 6280 + }, + { + "epoch": 0.1832620647525962, + "grad_norm": 12.8125, + "learning_rate": 9.389129182048808e-06, + "loss": 1.7938, + "step": 6300 + }, + { + "epoch": 0.18384384908514415, + "grad_norm": 11.625, + "learning_rate": 9.387189900376193e-06, + "loss": 1.7028, + "step": 6320 + }, + { + "epoch": 0.18442563341769205, + "grad_norm": 11.25, + "learning_rate": 9.385250618703578e-06, + "loss": 1.6818, + "step": 6340 + }, + { + "epoch": 0.18500741775024, + "grad_norm": 12.25, + "learning_rate": 9.383311337030963e-06, + "loss": 1.668, + "step": 6360 + }, + { + "epoch": 0.18558920208278792, + "grad_norm": 19.125, + "learning_rate": 9.381372055358348e-06, + "loss": 1.6622, + "step": 6380 + }, + { + "epoch": 0.18617098641533583, + "grad_norm": 12.125, + "learning_rate": 9.379432773685733e-06, + "loss": 1.6649, + "step": 6400 + }, + { + "epoch": 0.18675277074788377, + "grad_norm": 12.3125, + "learning_rate": 9.377493492013118e-06, + "loss": 1.6025, + "step": 6420 + }, + { + "epoch": 0.18733455508043168, + "grad_norm": 11.4375, + "learning_rate": 9.375554210340503e-06, + "loss": 1.7254, + "step": 6440 + }, + { + "epoch": 0.1879163394129796, + "grad_norm": 13.9375, + "learning_rate": 9.373614928667888e-06, + "loss": 1.6749, + "step": 6460 + }, + { + "epoch": 0.18849812374552755, + "grad_norm": 16.375, + "learning_rate": 9.371675646995273e-06, + "loss": 1.7615, + "step": 6480 + }, + { + "epoch": 0.18907990807807545, + "grad_norm": 11.1875, + "learning_rate": 9.369736365322659e-06, + "loss": 1.582, + "step": 6500 + }, + { + "epoch": 0.1896616924106234, + "grad_norm": 14.4375, + "learning_rate": 9.367797083650044e-06, + "loss": 1.6986, + "step": 6520 + }, + { + "epoch": 0.1902434767431713, + "grad_norm": 15.25, + "learning_rate": 9.365857801977429e-06, + "loss": 1.6597, + "step": 6540 + }, + { + "epoch": 0.19082526107571923, + "grad_norm": 12.5, + "learning_rate": 9.363918520304814e-06, + "loss": 1.6914, + "step": 6560 + }, + { + "epoch": 0.19140704540826717, + "grad_norm": 13.8125, + "learning_rate": 9.361979238632199e-06, + "loss": 1.6375, + "step": 6580 + }, + { + "epoch": 0.19198882974081508, + "grad_norm": 12.125, + "learning_rate": 9.360039956959584e-06, + "loss": 1.6513, + "step": 6600 + }, + { + "epoch": 0.192570614073363, + "grad_norm": 13.625, + "learning_rate": 9.358100675286969e-06, + "loss": 1.6101, + "step": 6620 + }, + { + "epoch": 0.19315239840591092, + "grad_norm": 10.875, + "learning_rate": 9.356161393614354e-06, + "loss": 1.686, + "step": 6640 + }, + { + "epoch": 0.19373418273845885, + "grad_norm": 11.4375, + "learning_rate": 9.35422211194174e-06, + "loss": 1.7196, + "step": 6660 + }, + { + "epoch": 0.1943159670710068, + "grad_norm": 12.5, + "learning_rate": 9.352282830269124e-06, + "loss": 1.6264, + "step": 6680 + }, + { + "epoch": 0.1948977514035547, + "grad_norm": 12.375, + "learning_rate": 9.35034354859651e-06, + "loss": 1.6006, + "step": 6700 + }, + { + "epoch": 0.19547953573610263, + "grad_norm": 13.0625, + "learning_rate": 9.348404266923895e-06, + "loss": 1.6414, + "step": 6720 + }, + { + "epoch": 0.19606132006865054, + "grad_norm": 12.3125, + "learning_rate": 9.34646498525128e-06, + "loss": 1.622, + "step": 6740 + }, + { + "epoch": 0.19664310440119848, + "grad_norm": 15.9375, + "learning_rate": 9.344525703578665e-06, + "loss": 1.6411, + "step": 6760 + }, + { + "epoch": 0.1972248887337464, + "grad_norm": 13.0625, + "learning_rate": 9.34258642190605e-06, + "loss": 1.6437, + "step": 6780 + }, + { + "epoch": 0.19780667306629432, + "grad_norm": 14.75, + "learning_rate": 9.340647140233435e-06, + "loss": 1.7252, + "step": 6800 + }, + { + "epoch": 0.19838845739884226, + "grad_norm": 14.25, + "learning_rate": 9.33870785856082e-06, + "loss": 1.6502, + "step": 6820 + }, + { + "epoch": 0.19897024173139016, + "grad_norm": 13.0, + "learning_rate": 9.336768576888205e-06, + "loss": 1.7225, + "step": 6840 + }, + { + "epoch": 0.1995520260639381, + "grad_norm": 17.0, + "learning_rate": 9.33482929521559e-06, + "loss": 1.6959, + "step": 6860 + }, + { + "epoch": 0.20013381039648603, + "grad_norm": 12.1875, + "learning_rate": 9.332890013542975e-06, + "loss": 1.6273, + "step": 6880 + }, + { + "epoch": 0.20071559472903394, + "grad_norm": 11.875, + "learning_rate": 9.330950731870359e-06, + "loss": 1.7678, + "step": 6900 + }, + { + "epoch": 0.20129737906158188, + "grad_norm": 10.375, + "learning_rate": 9.329011450197744e-06, + "loss": 1.7142, + "step": 6920 + }, + { + "epoch": 0.20187916339412978, + "grad_norm": 13.375, + "learning_rate": 9.327072168525129e-06, + "loss": 1.6364, + "step": 6940 + }, + { + "epoch": 0.20246094772667772, + "grad_norm": 14.25, + "learning_rate": 9.325132886852514e-06, + "loss": 1.7133, + "step": 6960 + }, + { + "epoch": 0.20304273205922566, + "grad_norm": 13.375, + "learning_rate": 9.323193605179899e-06, + "loss": 1.7386, + "step": 6980 + }, + { + "epoch": 0.20362451639177356, + "grad_norm": 10.625, + "learning_rate": 9.321254323507284e-06, + "loss": 1.6479, + "step": 7000 + }, + { + "epoch": 0.2042063007243215, + "grad_norm": 12.8125, + "learning_rate": 9.31931504183467e-06, + "loss": 1.7419, + "step": 7020 + }, + { + "epoch": 0.2047880850568694, + "grad_norm": 14.5625, + "learning_rate": 9.317375760162054e-06, + "loss": 1.6332, + "step": 7040 + }, + { + "epoch": 0.20536986938941734, + "grad_norm": 15.3125, + "learning_rate": 9.31543647848944e-06, + "loss": 1.7959, + "step": 7060 + }, + { + "epoch": 0.20595165372196528, + "grad_norm": 13.9375, + "learning_rate": 9.313497196816825e-06, + "loss": 1.5956, + "step": 7080 + }, + { + "epoch": 0.20653343805451319, + "grad_norm": 9.4375, + "learning_rate": 9.31155791514421e-06, + "loss": 1.5998, + "step": 7100 + }, + { + "epoch": 0.20711522238706112, + "grad_norm": 12.5625, + "learning_rate": 9.309618633471595e-06, + "loss": 1.6106, + "step": 7120 + }, + { + "epoch": 0.20769700671960903, + "grad_norm": 11.375, + "learning_rate": 9.30767935179898e-06, + "loss": 1.6298, + "step": 7140 + }, + { + "epoch": 0.20827879105215696, + "grad_norm": 12.75, + "learning_rate": 9.305740070126365e-06, + "loss": 1.7399, + "step": 7160 + }, + { + "epoch": 0.2088605753847049, + "grad_norm": 14.3125, + "learning_rate": 9.30380078845375e-06, + "loss": 1.649, + "step": 7180 + }, + { + "epoch": 0.2094423597172528, + "grad_norm": 9.1875, + "learning_rate": 9.301861506781135e-06, + "loss": 1.6379, + "step": 7200 + }, + { + "epoch": 0.21002414404980074, + "grad_norm": 11.0, + "learning_rate": 9.29992222510852e-06, + "loss": 1.7574, + "step": 7220 + }, + { + "epoch": 0.21060592838234865, + "grad_norm": 10.9375, + "learning_rate": 9.297982943435905e-06, + "loss": 1.7173, + "step": 7240 + }, + { + "epoch": 0.21118771271489659, + "grad_norm": 13.0625, + "learning_rate": 9.29604366176329e-06, + "loss": 1.6358, + "step": 7260 + }, + { + "epoch": 0.21176949704744452, + "grad_norm": 11.375, + "learning_rate": 9.294104380090676e-06, + "loss": 1.677, + "step": 7280 + }, + { + "epoch": 0.21235128137999243, + "grad_norm": 12.8125, + "learning_rate": 9.29216509841806e-06, + "loss": 1.6471, + "step": 7300 + }, + { + "epoch": 0.21293306571254036, + "grad_norm": 11.625, + "learning_rate": 9.290225816745446e-06, + "loss": 1.7079, + "step": 7320 + }, + { + "epoch": 0.21351485004508827, + "grad_norm": 13.0, + "learning_rate": 9.288286535072831e-06, + "loss": 1.6672, + "step": 7340 + }, + { + "epoch": 0.2140966343776362, + "grad_norm": 12.75, + "learning_rate": 9.286347253400216e-06, + "loss": 1.512, + "step": 7360 + }, + { + "epoch": 0.21467841871018414, + "grad_norm": 13.1875, + "learning_rate": 9.284407971727601e-06, + "loss": 1.6741, + "step": 7380 + }, + { + "epoch": 0.21526020304273205, + "grad_norm": 11.8125, + "learning_rate": 9.282468690054986e-06, + "loss": 1.7275, + "step": 7400 + }, + { + "epoch": 0.21584198737528, + "grad_norm": 12.4375, + "learning_rate": 9.280529408382371e-06, + "loss": 1.6383, + "step": 7420 + }, + { + "epoch": 0.2164237717078279, + "grad_norm": 11.125, + "learning_rate": 9.278590126709756e-06, + "loss": 1.6778, + "step": 7440 + }, + { + "epoch": 0.21700555604037583, + "grad_norm": 12.875, + "learning_rate": 9.276650845037141e-06, + "loss": 1.6757, + "step": 7460 + }, + { + "epoch": 0.21758734037292377, + "grad_norm": 15.6875, + "learning_rate": 9.274711563364527e-06, + "loss": 1.7122, + "step": 7480 + }, + { + "epoch": 0.21816912470547167, + "grad_norm": 15.0625, + "learning_rate": 9.272772281691912e-06, + "loss": 1.7075, + "step": 7500 + }, + { + "epoch": 0.2187509090380196, + "grad_norm": 12.25, + "learning_rate": 9.270833000019297e-06, + "loss": 1.7087, + "step": 7520 + }, + { + "epoch": 0.21933269337056754, + "grad_norm": 12.375, + "learning_rate": 9.268893718346682e-06, + "loss": 1.6896, + "step": 7540 + }, + { + "epoch": 0.21991447770311545, + "grad_norm": 13.75, + "learning_rate": 9.266954436674067e-06, + "loss": 1.7717, + "step": 7560 + }, + { + "epoch": 0.2204962620356634, + "grad_norm": 12.25, + "learning_rate": 9.265015155001452e-06, + "loss": 1.5941, + "step": 7580 + }, + { + "epoch": 0.2210780463682113, + "grad_norm": 12.75, + "learning_rate": 9.263075873328837e-06, + "loss": 1.6637, + "step": 7600 + }, + { + "epoch": 0.22165983070075923, + "grad_norm": 13.6875, + "learning_rate": 9.261136591656222e-06, + "loss": 1.6377, + "step": 7620 + }, + { + "epoch": 0.22224161503330717, + "grad_norm": 12.3125, + "learning_rate": 9.259197309983607e-06, + "loss": 1.6165, + "step": 7640 + }, + { + "epoch": 0.22282339936585507, + "grad_norm": 11.875, + "learning_rate": 9.257258028310992e-06, + "loss": 1.6967, + "step": 7660 + }, + { + "epoch": 0.223405183698403, + "grad_norm": 13.8125, + "learning_rate": 9.255318746638378e-06, + "loss": 1.6089, + "step": 7680 + }, + { + "epoch": 0.22398696803095092, + "grad_norm": 12.75, + "learning_rate": 9.253379464965763e-06, + "loss": 1.6017, + "step": 7700 + }, + { + "epoch": 0.22456875236349885, + "grad_norm": 15.4375, + "learning_rate": 9.251440183293148e-06, + "loss": 1.6056, + "step": 7720 + }, + { + "epoch": 0.2251505366960468, + "grad_norm": 11.9375, + "learning_rate": 9.249500901620533e-06, + "loss": 1.614, + "step": 7740 + }, + { + "epoch": 0.2257323210285947, + "grad_norm": 13.8125, + "learning_rate": 9.247561619947918e-06, + "loss": 1.6657, + "step": 7760 + }, + { + "epoch": 0.22631410536114263, + "grad_norm": 12.5625, + "learning_rate": 9.245622338275303e-06, + "loss": 1.699, + "step": 7780 + }, + { + "epoch": 0.22689588969369054, + "grad_norm": 15.5, + "learning_rate": 9.243683056602688e-06, + "loss": 1.6335, + "step": 7800 + }, + { + "epoch": 0.22747767402623847, + "grad_norm": 14.6875, + "learning_rate": 9.241743774930073e-06, + "loss": 1.7117, + "step": 7820 + }, + { + "epoch": 0.2280594583587864, + "grad_norm": 10.375, + "learning_rate": 9.239804493257457e-06, + "loss": 1.6218, + "step": 7840 + }, + { + "epoch": 0.22864124269133432, + "grad_norm": 12.5, + "learning_rate": 9.237865211584842e-06, + "loss": 1.6866, + "step": 7860 + }, + { + "epoch": 0.22922302702388225, + "grad_norm": 12.5, + "learning_rate": 9.235925929912227e-06, + "loss": 1.6505, + "step": 7880 + }, + { + "epoch": 0.22980481135643016, + "grad_norm": 12.5625, + "learning_rate": 9.233986648239612e-06, + "loss": 1.6149, + "step": 7900 + }, + { + "epoch": 0.2303865956889781, + "grad_norm": 11.3125, + "learning_rate": 9.232047366566997e-06, + "loss": 1.7142, + "step": 7920 + }, + { + "epoch": 0.23096838002152603, + "grad_norm": 13.875, + "learning_rate": 9.230108084894382e-06, + "loss": 1.6732, + "step": 7940 + }, + { + "epoch": 0.23155016435407394, + "grad_norm": 12.25, + "learning_rate": 9.228168803221767e-06, + "loss": 1.797, + "step": 7960 + }, + { + "epoch": 0.23213194868662188, + "grad_norm": 11.8125, + "learning_rate": 9.226229521549152e-06, + "loss": 1.779, + "step": 7980 + }, + { + "epoch": 0.23271373301916978, + "grad_norm": 14.875, + "learning_rate": 9.224290239876537e-06, + "loss": 1.6473, + "step": 8000 + }, + { + "epoch": 0.23329551735171772, + "grad_norm": 10.625, + "learning_rate": 9.222350958203923e-06, + "loss": 1.6575, + "step": 8020 + }, + { + "epoch": 0.23387730168426565, + "grad_norm": 12.8125, + "learning_rate": 9.220411676531308e-06, + "loss": 1.6458, + "step": 8040 + }, + { + "epoch": 0.23445908601681356, + "grad_norm": 13.5, + "learning_rate": 9.218472394858693e-06, + "loss": 1.6488, + "step": 8060 + }, + { + "epoch": 0.2350408703493615, + "grad_norm": 11.5625, + "learning_rate": 9.216533113186078e-06, + "loss": 1.681, + "step": 8080 + }, + { + "epoch": 0.2356226546819094, + "grad_norm": 12.125, + "learning_rate": 9.214593831513463e-06, + "loss": 1.6335, + "step": 8100 + }, + { + "epoch": 0.23620443901445734, + "grad_norm": 8.625, + "learning_rate": 9.212654549840848e-06, + "loss": 1.6555, + "step": 8120 + }, + { + "epoch": 0.23678622334700528, + "grad_norm": 11.5625, + "learning_rate": 9.210715268168233e-06, + "loss": 1.6875, + "step": 8140 + }, + { + "epoch": 0.23736800767955318, + "grad_norm": 13.125, + "learning_rate": 9.208775986495618e-06, + "loss": 1.6405, + "step": 8160 + }, + { + "epoch": 0.23794979201210112, + "grad_norm": 14.1875, + "learning_rate": 9.206836704823003e-06, + "loss": 1.629, + "step": 8180 + }, + { + "epoch": 0.23853157634464903, + "grad_norm": 13.0625, + "learning_rate": 9.204897423150388e-06, + "loss": 1.6058, + "step": 8200 + }, + { + "epoch": 0.23911336067719696, + "grad_norm": 15.6875, + "learning_rate": 9.202958141477773e-06, + "loss": 1.6622, + "step": 8220 + }, + { + "epoch": 0.2396951450097449, + "grad_norm": 13.5, + "learning_rate": 9.201018859805159e-06, + "loss": 1.6367, + "step": 8240 + }, + { + "epoch": 0.2402769293422928, + "grad_norm": 15.375, + "learning_rate": 9.199079578132544e-06, + "loss": 1.6016, + "step": 8260 + }, + { + "epoch": 0.24085871367484074, + "grad_norm": 13.8125, + "learning_rate": 9.197140296459929e-06, + "loss": 1.6196, + "step": 8280 + }, + { + "epoch": 0.24144049800738865, + "grad_norm": 13.0625, + "learning_rate": 9.195201014787314e-06, + "loss": 1.6289, + "step": 8300 + }, + { + "epoch": 0.24202228233993658, + "grad_norm": 12.0, + "learning_rate": 9.193261733114699e-06, + "loss": 1.6877, + "step": 8320 + }, + { + "epoch": 0.24260406667248452, + "grad_norm": 13.25, + "learning_rate": 9.191322451442084e-06, + "loss": 1.7558, + "step": 8340 + }, + { + "epoch": 0.24318585100503243, + "grad_norm": 12.0, + "learning_rate": 9.18938316976947e-06, + "loss": 1.7199, + "step": 8360 + }, + { + "epoch": 0.24376763533758036, + "grad_norm": 12.25, + "learning_rate": 9.187443888096854e-06, + "loss": 1.6592, + "step": 8380 + }, + { + "epoch": 0.24434941967012827, + "grad_norm": 10.25, + "learning_rate": 9.18550460642424e-06, + "loss": 1.6278, + "step": 8400 + }, + { + "epoch": 0.2449312040026762, + "grad_norm": 13.8125, + "learning_rate": 9.183565324751624e-06, + "loss": 1.6542, + "step": 8420 + }, + { + "epoch": 0.24551298833522414, + "grad_norm": 13.8125, + "learning_rate": 9.18162604307901e-06, + "loss": 1.6557, + "step": 8440 + }, + { + "epoch": 0.24609477266777205, + "grad_norm": 12.625, + "learning_rate": 9.179686761406395e-06, + "loss": 1.7821, + "step": 8460 + }, + { + "epoch": 0.24667655700031998, + "grad_norm": 13.5625, + "learning_rate": 9.17774747973378e-06, + "loss": 1.5931, + "step": 8480 + }, + { + "epoch": 0.2472583413328679, + "grad_norm": 12.0, + "learning_rate": 9.175808198061165e-06, + "loss": 1.6858, + "step": 8500 + }, + { + "epoch": 0.24784012566541583, + "grad_norm": 12.375, + "learning_rate": 9.17386891638855e-06, + "loss": 1.667, + "step": 8520 + }, + { + "epoch": 0.24842190999796376, + "grad_norm": 12.875, + "learning_rate": 9.171929634715935e-06, + "loss": 1.6255, + "step": 8540 + }, + { + "epoch": 0.24900369433051167, + "grad_norm": 16.375, + "learning_rate": 9.16999035304332e-06, + "loss": 1.62, + "step": 8560 + }, + { + "epoch": 0.2495854786630596, + "grad_norm": 13.3125, + "learning_rate": 9.168051071370705e-06, + "loss": 1.6483, + "step": 8580 + }, + { + "epoch": 0.25016726299560754, + "grad_norm": 14.0, + "learning_rate": 9.16611178969809e-06, + "loss": 1.7872, + "step": 8600 + }, + { + "epoch": 0.25074904732815545, + "grad_norm": 14.0, + "learning_rate": 9.164172508025475e-06, + "loss": 1.6429, + "step": 8620 + }, + { + "epoch": 0.25133083166070336, + "grad_norm": 13.0, + "learning_rate": 9.16223322635286e-06, + "loss": 1.6614, + "step": 8640 + }, + { + "epoch": 0.2519126159932513, + "grad_norm": 12.25, + "learning_rate": 9.160293944680246e-06, + "loss": 1.7051, + "step": 8660 + }, + { + "epoch": 0.25249440032579923, + "grad_norm": 12.1875, + "learning_rate": 9.15835466300763e-06, + "loss": 1.6515, + "step": 8680 + }, + { + "epoch": 0.25307618465834714, + "grad_norm": 11.25, + "learning_rate": 9.156415381335016e-06, + "loss": 1.6462, + "step": 8700 + }, + { + "epoch": 0.2536579689908951, + "grad_norm": 12.0, + "learning_rate": 9.154476099662401e-06, + "loss": 1.6229, + "step": 8720 + }, + { + "epoch": 0.254239753323443, + "grad_norm": 13.625, + "learning_rate": 9.152536817989786e-06, + "loss": 1.6951, + "step": 8740 + }, + { + "epoch": 0.2548215376559909, + "grad_norm": 11.3125, + "learning_rate": 9.150597536317171e-06, + "loss": 1.6354, + "step": 8760 + }, + { + "epoch": 0.2554033219885388, + "grad_norm": 12.9375, + "learning_rate": 9.148658254644556e-06, + "loss": 1.6501, + "step": 8780 + }, + { + "epoch": 0.2559851063210868, + "grad_norm": 10.5, + "learning_rate": 9.146718972971941e-06, + "loss": 1.7109, + "step": 8800 + }, + { + "epoch": 0.2565668906536347, + "grad_norm": 12.6875, + "learning_rate": 9.144779691299326e-06, + "loss": 1.6589, + "step": 8820 + }, + { + "epoch": 0.2571486749861826, + "grad_norm": 13.625, + "learning_rate": 9.142840409626712e-06, + "loss": 1.7272, + "step": 8840 + }, + { + "epoch": 0.25773045931873056, + "grad_norm": 14.6875, + "learning_rate": 9.140901127954097e-06, + "loss": 1.6892, + "step": 8860 + }, + { + "epoch": 0.2583122436512785, + "grad_norm": 13.1875, + "learning_rate": 9.138961846281482e-06, + "loss": 1.6675, + "step": 8880 + }, + { + "epoch": 0.2588940279838264, + "grad_norm": 11.25, + "learning_rate": 9.137022564608867e-06, + "loss": 1.5748, + "step": 8900 + }, + { + "epoch": 0.25947581231637434, + "grad_norm": 13.6875, + "learning_rate": 9.135083282936252e-06, + "loss": 1.6906, + "step": 8920 + }, + { + "epoch": 0.26005759664892225, + "grad_norm": 13.4375, + "learning_rate": 9.133144001263637e-06, + "loss": 1.7507, + "step": 8940 + }, + { + "epoch": 0.26063938098147016, + "grad_norm": 13.25, + "learning_rate": 9.131204719591022e-06, + "loss": 1.7374, + "step": 8960 + }, + { + "epoch": 0.26122116531401807, + "grad_norm": 13.125, + "learning_rate": 9.129265437918407e-06, + "loss": 1.6141, + "step": 8980 + }, + { + "epoch": 0.26180294964656603, + "grad_norm": 11.375, + "learning_rate": 9.127326156245792e-06, + "loss": 1.6063, + "step": 9000 + }, + { + "epoch": 0.26238473397911394, + "grad_norm": 13.375, + "learning_rate": 9.125386874573177e-06, + "loss": 1.7222, + "step": 9020 + }, + { + "epoch": 0.26296651831166185, + "grad_norm": 14.0625, + "learning_rate": 9.123447592900562e-06, + "loss": 1.7228, + "step": 9040 + }, + { + "epoch": 0.2635483026442098, + "grad_norm": 12.5, + "learning_rate": 9.121508311227948e-06, + "loss": 1.6867, + "step": 9060 + }, + { + "epoch": 0.2641300869767577, + "grad_norm": 11.25, + "learning_rate": 9.119569029555333e-06, + "loss": 1.7106, + "step": 9080 + }, + { + "epoch": 0.2647118713093056, + "grad_norm": 13.0625, + "learning_rate": 9.117629747882716e-06, + "loss": 1.6625, + "step": 9100 + }, + { + "epoch": 0.2652936556418536, + "grad_norm": 12.625, + "learning_rate": 9.115690466210101e-06, + "loss": 1.6357, + "step": 9120 + }, + { + "epoch": 0.2658754399744015, + "grad_norm": 14.3125, + "learning_rate": 9.113751184537486e-06, + "loss": 1.6298, + "step": 9140 + }, + { + "epoch": 0.2664572243069494, + "grad_norm": 12.0, + "learning_rate": 9.111811902864871e-06, + "loss": 1.7246, + "step": 9160 + }, + { + "epoch": 0.2670390086394973, + "grad_norm": 10.625, + "learning_rate": 9.109872621192256e-06, + "loss": 1.658, + "step": 9180 + }, + { + "epoch": 0.2676207929720453, + "grad_norm": 12.875, + "learning_rate": 9.107933339519642e-06, + "loss": 1.6617, + "step": 9200 + }, + { + "epoch": 0.2682025773045932, + "grad_norm": 12.125, + "learning_rate": 9.105994057847027e-06, + "loss": 1.7339, + "step": 9220 + }, + { + "epoch": 0.2687843616371411, + "grad_norm": 15.3125, + "learning_rate": 9.104054776174412e-06, + "loss": 1.7189, + "step": 9240 + }, + { + "epoch": 0.26936614596968905, + "grad_norm": 12.6875, + "learning_rate": 9.102115494501797e-06, + "loss": 1.6779, + "step": 9260 + }, + { + "epoch": 0.26994793030223696, + "grad_norm": 10.75, + "learning_rate": 9.100176212829182e-06, + "loss": 1.6748, + "step": 9280 + }, + { + "epoch": 0.27052971463478487, + "grad_norm": 13.875, + "learning_rate": 9.098236931156567e-06, + "loss": 1.6947, + "step": 9300 + }, + { + "epoch": 0.27111149896733283, + "grad_norm": 13.8125, + "learning_rate": 9.096297649483952e-06, + "loss": 1.6742, + "step": 9320 + }, + { + "epoch": 0.27169328329988074, + "grad_norm": 12.625, + "learning_rate": 9.094358367811337e-06, + "loss": 1.7339, + "step": 9340 + }, + { + "epoch": 0.27227506763242865, + "grad_norm": 14.1875, + "learning_rate": 9.092419086138722e-06, + "loss": 1.6263, + "step": 9360 + }, + { + "epoch": 0.2728568519649766, + "grad_norm": 11.75, + "learning_rate": 9.090479804466107e-06, + "loss": 1.6642, + "step": 9380 + }, + { + "epoch": 0.2734386362975245, + "grad_norm": 11.6875, + "learning_rate": 9.088540522793493e-06, + "loss": 1.684, + "step": 9400 + }, + { + "epoch": 0.2740204206300724, + "grad_norm": 13.125, + "learning_rate": 9.086601241120878e-06, + "loss": 1.6405, + "step": 9420 + }, + { + "epoch": 0.27460220496262033, + "grad_norm": 19.375, + "learning_rate": 9.084661959448263e-06, + "loss": 1.712, + "step": 9440 + }, + { + "epoch": 0.2751839892951683, + "grad_norm": 9.6875, + "learning_rate": 9.082722677775648e-06, + "loss": 1.7061, + "step": 9460 + }, + { + "epoch": 0.2757657736277162, + "grad_norm": 11.875, + "learning_rate": 9.080783396103033e-06, + "loss": 1.7192, + "step": 9480 + }, + { + "epoch": 0.2763475579602641, + "grad_norm": 11.875, + "learning_rate": 9.078844114430418e-06, + "loss": 1.6492, + "step": 9500 + }, + { + "epoch": 0.2769293422928121, + "grad_norm": 11.6875, + "learning_rate": 9.076904832757803e-06, + "loss": 1.714, + "step": 9520 + }, + { + "epoch": 0.27751112662536, + "grad_norm": 12.5, + "learning_rate": 9.074965551085188e-06, + "loss": 1.6707, + "step": 9540 + }, + { + "epoch": 0.2780929109579079, + "grad_norm": 12.8125, + "learning_rate": 9.073026269412573e-06, + "loss": 1.748, + "step": 9560 + }, + { + "epoch": 0.27867469529045585, + "grad_norm": 13.6875, + "learning_rate": 9.071086987739958e-06, + "loss": 1.6314, + "step": 9580 + }, + { + "epoch": 0.27925647962300376, + "grad_norm": 14.6875, + "learning_rate": 9.069147706067344e-06, + "loss": 1.7624, + "step": 9600 + }, + { + "epoch": 0.27983826395555167, + "grad_norm": 14.125, + "learning_rate": 9.067208424394729e-06, + "loss": 1.5784, + "step": 9620 + }, + { + "epoch": 0.2804200482880996, + "grad_norm": 13.75, + "learning_rate": 9.065269142722114e-06, + "loss": 1.6444, + "step": 9640 + }, + { + "epoch": 0.28100183262064754, + "grad_norm": 13.75, + "learning_rate": 9.063329861049499e-06, + "loss": 1.6215, + "step": 9660 + }, + { + "epoch": 0.28158361695319545, + "grad_norm": 12.3125, + "learning_rate": 9.061390579376884e-06, + "loss": 1.7362, + "step": 9680 + }, + { + "epoch": 0.28216540128574336, + "grad_norm": 11.125, + "learning_rate": 9.059451297704269e-06, + "loss": 1.6616, + "step": 9700 + }, + { + "epoch": 0.2827471856182913, + "grad_norm": 15.5, + "learning_rate": 9.057512016031654e-06, + "loss": 1.5841, + "step": 9720 + }, + { + "epoch": 0.2833289699508392, + "grad_norm": 13.0625, + "learning_rate": 9.05557273435904e-06, + "loss": 1.5656, + "step": 9740 + }, + { + "epoch": 0.28391075428338713, + "grad_norm": 14.375, + "learning_rate": 9.053633452686424e-06, + "loss": 1.6562, + "step": 9760 + }, + { + "epoch": 0.2844925386159351, + "grad_norm": 11.5625, + "learning_rate": 9.05169417101381e-06, + "loss": 1.7046, + "step": 9780 + }, + { + "epoch": 0.285074322948483, + "grad_norm": 9.875, + "learning_rate": 9.049754889341195e-06, + "loss": 1.6812, + "step": 9800 + }, + { + "epoch": 0.2856561072810309, + "grad_norm": 12.6875, + "learning_rate": 9.04781560766858e-06, + "loss": 1.6193, + "step": 9820 + }, + { + "epoch": 0.2862378916135788, + "grad_norm": 10.875, + "learning_rate": 9.045876325995965e-06, + "loss": 1.5845, + "step": 9840 + }, + { + "epoch": 0.2868196759461268, + "grad_norm": 10.625, + "learning_rate": 9.04393704432335e-06, + "loss": 1.7287, + "step": 9860 + }, + { + "epoch": 0.2874014602786747, + "grad_norm": 12.9375, + "learning_rate": 9.041997762650735e-06, + "loss": 1.6522, + "step": 9880 + }, + { + "epoch": 0.2879832446112226, + "grad_norm": 13.25, + "learning_rate": 9.04005848097812e-06, + "loss": 1.638, + "step": 9900 + }, + { + "epoch": 0.28856502894377056, + "grad_norm": 13.4375, + "learning_rate": 9.038119199305505e-06, + "loss": 1.726, + "step": 9920 + }, + { + "epoch": 0.28914681327631847, + "grad_norm": 14.4375, + "learning_rate": 9.03617991763289e-06, + "loss": 1.7284, + "step": 9940 + }, + { + "epoch": 0.2897285976088664, + "grad_norm": 14.3125, + "learning_rate": 9.034240635960275e-06, + "loss": 1.6854, + "step": 9960 + }, + { + "epoch": 0.29031038194141434, + "grad_norm": 15.125, + "learning_rate": 9.03230135428766e-06, + "loss": 1.7404, + "step": 9980 + }, + { + "epoch": 0.29089216627396225, + "grad_norm": 12.125, + "learning_rate": 9.030362072615045e-06, + "loss": 1.6527, + "step": 10000 + }, + { + "epoch": 0.29147395060651016, + "grad_norm": 13.0625, + "learning_rate": 9.02842279094243e-06, + "loss": 1.708, + "step": 10020 + }, + { + "epoch": 0.29205573493905806, + "grad_norm": 16.125, + "learning_rate": 9.026483509269816e-06, + "loss": 1.6867, + "step": 10040 + }, + { + "epoch": 0.29263751927160603, + "grad_norm": 13.5, + "learning_rate": 9.0245442275972e-06, + "loss": 1.6858, + "step": 10060 + }, + { + "epoch": 0.29321930360415394, + "grad_norm": 16.375, + "learning_rate": 9.022604945924586e-06, + "loss": 1.6886, + "step": 10080 + }, + { + "epoch": 0.29380108793670184, + "grad_norm": 13.3125, + "learning_rate": 9.020665664251971e-06, + "loss": 1.6543, + "step": 10100 + }, + { + "epoch": 0.2943828722692498, + "grad_norm": 13.25, + "learning_rate": 9.018726382579356e-06, + "loss": 1.6289, + "step": 10120 + }, + { + "epoch": 0.2949646566017977, + "grad_norm": 10.75, + "learning_rate": 9.016787100906741e-06, + "loss": 1.6614, + "step": 10140 + }, + { + "epoch": 0.2955464409343456, + "grad_norm": 13.3125, + "learning_rate": 9.014847819234126e-06, + "loss": 1.6518, + "step": 10160 + }, + { + "epoch": 0.2961282252668936, + "grad_norm": 12.75, + "learning_rate": 9.012908537561511e-06, + "loss": 1.6559, + "step": 10180 + }, + { + "epoch": 0.2967100095994415, + "grad_norm": 14.75, + "learning_rate": 9.010969255888896e-06, + "loss": 1.6365, + "step": 10200 + }, + { + "epoch": 0.2972917939319894, + "grad_norm": 12.5, + "learning_rate": 9.009029974216282e-06, + "loss": 1.7192, + "step": 10220 + }, + { + "epoch": 0.2978735782645373, + "grad_norm": 13.25, + "learning_rate": 9.007090692543667e-06, + "loss": 1.6718, + "step": 10240 + }, + { + "epoch": 0.29845536259708527, + "grad_norm": 10.5625, + "learning_rate": 9.005151410871052e-06, + "loss": 1.6284, + "step": 10260 + }, + { + "epoch": 0.2990371469296332, + "grad_norm": 13.3125, + "learning_rate": 9.003212129198437e-06, + "loss": 1.6932, + "step": 10280 + }, + { + "epoch": 0.2996189312621811, + "grad_norm": 11.0625, + "learning_rate": 9.001272847525822e-06, + "loss": 1.6853, + "step": 10300 + }, + { + "epoch": 0.30020071559472905, + "grad_norm": 12.25, + "learning_rate": 8.999333565853207e-06, + "loss": 1.6984, + "step": 10320 + }, + { + "epoch": 0.30078249992727696, + "grad_norm": 10.75, + "learning_rate": 8.997394284180592e-06, + "loss": 1.6524, + "step": 10340 + }, + { + "epoch": 0.30136428425982487, + "grad_norm": 12.75, + "learning_rate": 8.995455002507977e-06, + "loss": 1.6859, + "step": 10360 + }, + { + "epoch": 0.30194606859237283, + "grad_norm": 13.5, + "learning_rate": 8.993515720835362e-06, + "loss": 1.5609, + "step": 10380 + }, + { + "epoch": 0.30252785292492074, + "grad_norm": 11.8125, + "learning_rate": 8.991576439162747e-06, + "loss": 1.6768, + "step": 10400 + }, + { + "epoch": 0.30310963725746864, + "grad_norm": 12.5625, + "learning_rate": 8.989637157490133e-06, + "loss": 1.6377, + "step": 10420 + }, + { + "epoch": 0.3036914215900166, + "grad_norm": 11.0625, + "learning_rate": 8.987697875817518e-06, + "loss": 1.7098, + "step": 10440 + }, + { + "epoch": 0.3042732059225645, + "grad_norm": 11.6875, + "learning_rate": 8.985758594144903e-06, + "loss": 1.6467, + "step": 10460 + }, + { + "epoch": 0.3048549902551124, + "grad_norm": 15.5, + "learning_rate": 8.983819312472288e-06, + "loss": 1.6654, + "step": 10480 + }, + { + "epoch": 0.30543677458766033, + "grad_norm": 12.625, + "learning_rate": 8.981880030799673e-06, + "loss": 1.7498, + "step": 10500 + }, + { + "epoch": 0.3060185589202083, + "grad_norm": 15.75, + "learning_rate": 8.979940749127058e-06, + "loss": 1.6435, + "step": 10520 + }, + { + "epoch": 0.3066003432527562, + "grad_norm": 12.0625, + "learning_rate": 8.978001467454443e-06, + "loss": 1.6964, + "step": 10540 + }, + { + "epoch": 0.3071821275853041, + "grad_norm": 13.875, + "learning_rate": 8.976062185781828e-06, + "loss": 1.6594, + "step": 10560 + }, + { + "epoch": 0.3077639119178521, + "grad_norm": 13.1875, + "learning_rate": 8.974122904109213e-06, + "loss": 1.6804, + "step": 10580 + }, + { + "epoch": 0.3083456962504, + "grad_norm": 10.25, + "learning_rate": 8.972183622436598e-06, + "loss": 1.735, + "step": 10600 + }, + { + "epoch": 0.3089274805829479, + "grad_norm": 13.5625, + "learning_rate": 8.970244340763983e-06, + "loss": 1.6542, + "step": 10620 + }, + { + "epoch": 0.30950926491549585, + "grad_norm": 9.75, + "learning_rate": 8.968305059091369e-06, + "loss": 1.5944, + "step": 10640 + }, + { + "epoch": 0.31009104924804376, + "grad_norm": 14.125, + "learning_rate": 8.966365777418754e-06, + "loss": 1.653, + "step": 10660 + }, + { + "epoch": 0.31067283358059167, + "grad_norm": 13.125, + "learning_rate": 8.964426495746139e-06, + "loss": 1.6451, + "step": 10680 + }, + { + "epoch": 0.3112546179131396, + "grad_norm": 16.125, + "learning_rate": 8.962487214073524e-06, + "loss": 1.7036, + "step": 10700 + }, + { + "epoch": 0.31183640224568754, + "grad_norm": 11.875, + "learning_rate": 8.960547932400909e-06, + "loss": 1.6025, + "step": 10720 + }, + { + "epoch": 0.31241818657823545, + "grad_norm": 13.5, + "learning_rate": 8.958608650728292e-06, + "loss": 1.6561, + "step": 10740 + }, + { + "epoch": 0.31299997091078335, + "grad_norm": 11.9375, + "learning_rate": 8.956669369055677e-06, + "loss": 1.5292, + "step": 10760 + }, + { + "epoch": 0.3135817552433313, + "grad_norm": 10.5625, + "learning_rate": 8.954730087383063e-06, + "loss": 1.6855, + "step": 10780 + }, + { + "epoch": 0.3141635395758792, + "grad_norm": 11.4375, + "learning_rate": 8.952790805710448e-06, + "loss": 1.6713, + "step": 10800 + }, + { + "epoch": 0.31474532390842713, + "grad_norm": 16.625, + "learning_rate": 8.950851524037833e-06, + "loss": 1.7154, + "step": 10820 + }, + { + "epoch": 0.3153271082409751, + "grad_norm": 13.875, + "learning_rate": 8.948912242365218e-06, + "loss": 1.6844, + "step": 10840 + }, + { + "epoch": 0.315908892573523, + "grad_norm": 12.1875, + "learning_rate": 8.946972960692603e-06, + "loss": 1.6695, + "step": 10860 + }, + { + "epoch": 0.3164906769060709, + "grad_norm": 12.1875, + "learning_rate": 8.945033679019988e-06, + "loss": 1.7592, + "step": 10880 + }, + { + "epoch": 0.3170724612386188, + "grad_norm": 14.5625, + "learning_rate": 8.943094397347373e-06, + "loss": 1.5866, + "step": 10900 + }, + { + "epoch": 0.3176542455711668, + "grad_norm": 14.0, + "learning_rate": 8.941155115674758e-06, + "loss": 1.7119, + "step": 10920 + }, + { + "epoch": 0.3182360299037147, + "grad_norm": 12.375, + "learning_rate": 8.939215834002143e-06, + "loss": 1.6872, + "step": 10940 + }, + { + "epoch": 0.3188178142362626, + "grad_norm": 13.875, + "learning_rate": 8.937276552329528e-06, + "loss": 1.736, + "step": 10960 + }, + { + "epoch": 0.31939959856881056, + "grad_norm": 12.9375, + "learning_rate": 8.935337270656914e-06, + "loss": 1.7081, + "step": 10980 + }, + { + "epoch": 0.31998138290135847, + "grad_norm": 12.6875, + "learning_rate": 8.933397988984299e-06, + "loss": 1.7345, + "step": 11000 + }, + { + "epoch": 0.3205631672339064, + "grad_norm": 12.4375, + "learning_rate": 8.931458707311684e-06, + "loss": 1.7188, + "step": 11020 + }, + { + "epoch": 0.32114495156645434, + "grad_norm": 12.0, + "learning_rate": 8.929519425639069e-06, + "loss": 1.6173, + "step": 11040 + }, + { + "epoch": 0.32172673589900225, + "grad_norm": 15.75, + "learning_rate": 8.927580143966454e-06, + "loss": 1.6647, + "step": 11060 + }, + { + "epoch": 0.32230852023155016, + "grad_norm": 11.6875, + "learning_rate": 8.925640862293839e-06, + "loss": 1.6596, + "step": 11080 + }, + { + "epoch": 0.32289030456409806, + "grad_norm": 10.875, + "learning_rate": 8.923701580621224e-06, + "loss": 1.6738, + "step": 11100 + }, + { + "epoch": 0.323472088896646, + "grad_norm": 12.6875, + "learning_rate": 8.92176229894861e-06, + "loss": 1.7109, + "step": 11120 + }, + { + "epoch": 0.32405387322919393, + "grad_norm": 13.625, + "learning_rate": 8.919823017275994e-06, + "loss": 1.6099, + "step": 11140 + }, + { + "epoch": 0.32463565756174184, + "grad_norm": 13.125, + "learning_rate": 8.91788373560338e-06, + "loss": 1.5996, + "step": 11160 + }, + { + "epoch": 0.3252174418942898, + "grad_norm": 11.875, + "learning_rate": 8.915944453930765e-06, + "loss": 1.6787, + "step": 11180 + }, + { + "epoch": 0.3257992262268377, + "grad_norm": 11.9375, + "learning_rate": 8.91400517225815e-06, + "loss": 1.5997, + "step": 11200 + }, + { + "epoch": 0.3263810105593856, + "grad_norm": 12.5, + "learning_rate": 8.912065890585535e-06, + "loss": 1.7873, + "step": 11220 + }, + { + "epoch": 0.3269627948919336, + "grad_norm": 11.5, + "learning_rate": 8.91012660891292e-06, + "loss": 1.7129, + "step": 11240 + }, + { + "epoch": 0.3275445792244815, + "grad_norm": 15.4375, + "learning_rate": 8.908187327240305e-06, + "loss": 1.6616, + "step": 11260 + }, + { + "epoch": 0.3281263635570294, + "grad_norm": 13.0625, + "learning_rate": 8.90624804556769e-06, + "loss": 1.6324, + "step": 11280 + }, + { + "epoch": 0.3287081478895773, + "grad_norm": 11.875, + "learning_rate": 8.904308763895075e-06, + "loss": 1.62, + "step": 11300 + }, + { + "epoch": 0.32928993222212527, + "grad_norm": 21.75, + "learning_rate": 8.90236948222246e-06, + "loss": 1.7055, + "step": 11320 + }, + { + "epoch": 0.3298717165546732, + "grad_norm": 11.75, + "learning_rate": 8.900430200549845e-06, + "loss": 1.6479, + "step": 11340 + }, + { + "epoch": 0.3304535008872211, + "grad_norm": 11.875, + "learning_rate": 8.89849091887723e-06, + "loss": 1.7365, + "step": 11360 + }, + { + "epoch": 0.33103528521976905, + "grad_norm": 12.375, + "learning_rate": 8.896551637204616e-06, + "loss": 1.7232, + "step": 11380 + }, + { + "epoch": 0.33161706955231696, + "grad_norm": 9.5, + "learning_rate": 8.894612355532e-06, + "loss": 1.6588, + "step": 11400 + }, + { + "epoch": 0.33219885388486486, + "grad_norm": 13.0625, + "learning_rate": 8.892673073859386e-06, + "loss": 1.6273, + "step": 11420 + }, + { + "epoch": 0.3327806382174128, + "grad_norm": 12.0, + "learning_rate": 8.89073379218677e-06, + "loss": 1.6455, + "step": 11440 + }, + { + "epoch": 0.33336242254996074, + "grad_norm": 10.6875, + "learning_rate": 8.888794510514156e-06, + "loss": 1.8009, + "step": 11460 + }, + { + "epoch": 0.33394420688250864, + "grad_norm": 12.125, + "learning_rate": 8.886855228841541e-06, + "loss": 1.7571, + "step": 11480 + }, + { + "epoch": 0.3345259912150566, + "grad_norm": 11.0, + "learning_rate": 8.884915947168926e-06, + "loss": 1.7168, + "step": 11500 + }, + { + "epoch": 0.3351077755476045, + "grad_norm": 16.5, + "learning_rate": 8.882976665496311e-06, + "loss": 1.607, + "step": 11520 + }, + { + "epoch": 0.3356895598801524, + "grad_norm": 12.75, + "learning_rate": 8.881037383823696e-06, + "loss": 1.65, + "step": 11540 + }, + { + "epoch": 0.33627134421270033, + "grad_norm": 13.75, + "learning_rate": 8.879098102151081e-06, + "loss": 1.5926, + "step": 11560 + }, + { + "epoch": 0.3368531285452483, + "grad_norm": 13.625, + "learning_rate": 8.877158820478466e-06, + "loss": 1.6459, + "step": 11580 + }, + { + "epoch": 0.3374349128777962, + "grad_norm": 12.4375, + "learning_rate": 8.875219538805852e-06, + "loss": 1.5444, + "step": 11600 + }, + { + "epoch": 0.3380166972103441, + "grad_norm": 11.75, + "learning_rate": 8.873280257133237e-06, + "loss": 1.6608, + "step": 11620 + }, + { + "epoch": 0.33859848154289207, + "grad_norm": 13.5625, + "learning_rate": 8.871340975460622e-06, + "loss": 1.642, + "step": 11640 + }, + { + "epoch": 0.33918026587544, + "grad_norm": 13.1875, + "learning_rate": 8.869401693788007e-06, + "loss": 1.7346, + "step": 11660 + }, + { + "epoch": 0.3397620502079879, + "grad_norm": 12.6875, + "learning_rate": 8.867462412115392e-06, + "loss": 1.5846, + "step": 11680 + }, + { + "epoch": 0.34034383454053585, + "grad_norm": 13.1875, + "learning_rate": 8.865523130442777e-06, + "loss": 1.5694, + "step": 11700 + }, + { + "epoch": 0.34092561887308376, + "grad_norm": 13.875, + "learning_rate": 8.863583848770162e-06, + "loss": 1.5998, + "step": 11720 + }, + { + "epoch": 0.34150740320563167, + "grad_norm": 13.25, + "learning_rate": 8.861644567097547e-06, + "loss": 1.7103, + "step": 11740 + }, + { + "epoch": 0.3420891875381796, + "grad_norm": 12.125, + "learning_rate": 8.859705285424932e-06, + "loss": 1.616, + "step": 11760 + }, + { + "epoch": 0.34267097187072754, + "grad_norm": 12.3125, + "learning_rate": 8.857766003752317e-06, + "loss": 1.7097, + "step": 11780 + }, + { + "epoch": 0.34325275620327544, + "grad_norm": 15.3125, + "learning_rate": 8.855826722079703e-06, + "loss": 1.733, + "step": 11800 + }, + { + "epoch": 0.34383454053582335, + "grad_norm": 15.25, + "learning_rate": 8.853887440407088e-06, + "loss": 1.5843, + "step": 11820 + }, + { + "epoch": 0.3444163248683713, + "grad_norm": 11.9375, + "learning_rate": 8.851948158734473e-06, + "loss": 1.685, + "step": 11840 + }, + { + "epoch": 0.3449981092009192, + "grad_norm": 11.9375, + "learning_rate": 8.850008877061858e-06, + "loss": 1.5462, + "step": 11860 + }, + { + "epoch": 0.34557989353346713, + "grad_norm": 12.5, + "learning_rate": 8.848069595389243e-06, + "loss": 1.7058, + "step": 11880 + }, + { + "epoch": 0.3461616778660151, + "grad_norm": 10.8125, + "learning_rate": 8.846130313716628e-06, + "loss": 1.6387, + "step": 11900 + }, + { + "epoch": 0.346743462198563, + "grad_norm": 12.8125, + "learning_rate": 8.844191032044013e-06, + "loss": 1.6074, + "step": 11920 + }, + { + "epoch": 0.3473252465311109, + "grad_norm": 13.3125, + "learning_rate": 8.842251750371398e-06, + "loss": 1.6532, + "step": 11940 + }, + { + "epoch": 0.3479070308636588, + "grad_norm": 11.5625, + "learning_rate": 8.840312468698783e-06, + "loss": 1.6338, + "step": 11960 + }, + { + "epoch": 0.3484888151962068, + "grad_norm": 14.3125, + "learning_rate": 8.838373187026168e-06, + "loss": 1.7099, + "step": 11980 + }, + { + "epoch": 0.3490705995287547, + "grad_norm": 12.1875, + "learning_rate": 8.836433905353554e-06, + "loss": 1.6162, + "step": 12000 + }, + { + "epoch": 0.3496523838613026, + "grad_norm": 10.75, + "learning_rate": 8.834494623680939e-06, + "loss": 1.6681, + "step": 12020 + }, + { + "epoch": 0.35023416819385056, + "grad_norm": 9.8125, + "learning_rate": 8.832555342008324e-06, + "loss": 1.6138, + "step": 12040 + }, + { + "epoch": 0.35081595252639847, + "grad_norm": 14.9375, + "learning_rate": 8.830616060335709e-06, + "loss": 1.5801, + "step": 12060 + }, + { + "epoch": 0.3513977368589464, + "grad_norm": 13.6875, + "learning_rate": 8.828676778663092e-06, + "loss": 1.6548, + "step": 12080 + }, + { + "epoch": 0.35197952119149434, + "grad_norm": 12.3125, + "learning_rate": 8.826737496990477e-06, + "loss": 1.6574, + "step": 12100 + }, + { + "epoch": 0.35256130552404225, + "grad_norm": 12.125, + "learning_rate": 8.824798215317862e-06, + "loss": 1.6967, + "step": 12120 + }, + { + "epoch": 0.35314308985659015, + "grad_norm": 16.125, + "learning_rate": 8.822858933645248e-06, + "loss": 1.6683, + "step": 12140 + }, + { + "epoch": 0.35372487418913806, + "grad_norm": 12.0625, + "learning_rate": 8.820919651972633e-06, + "loss": 1.7342, + "step": 12160 + }, + { + "epoch": 0.354306658521686, + "grad_norm": 13.0625, + "learning_rate": 8.818980370300018e-06, + "loss": 1.7129, + "step": 12180 + }, + { + "epoch": 0.35488844285423393, + "grad_norm": 13.0, + "learning_rate": 8.817041088627403e-06, + "loss": 1.6718, + "step": 12200 + }, + { + "epoch": 0.35547022718678184, + "grad_norm": 15.125, + "learning_rate": 8.815101806954788e-06, + "loss": 1.6067, + "step": 12220 + }, + { + "epoch": 0.3560520115193298, + "grad_norm": 10.9375, + "learning_rate": 8.813162525282173e-06, + "loss": 1.6761, + "step": 12240 + }, + { + "epoch": 0.3566337958518777, + "grad_norm": 11.25, + "learning_rate": 8.811223243609558e-06, + "loss": 1.7204, + "step": 12260 + }, + { + "epoch": 0.3572155801844256, + "grad_norm": 14.4375, + "learning_rate": 8.809283961936943e-06, + "loss": 1.6675, + "step": 12280 + }, + { + "epoch": 0.3577973645169736, + "grad_norm": 12.0, + "learning_rate": 8.807344680264328e-06, + "loss": 1.6827, + "step": 12300 + }, + { + "epoch": 0.3583791488495215, + "grad_norm": 12.625, + "learning_rate": 8.805405398591713e-06, + "loss": 1.7405, + "step": 12320 + }, + { + "epoch": 0.3589609331820694, + "grad_norm": 14.5625, + "learning_rate": 8.803466116919098e-06, + "loss": 1.6716, + "step": 12340 + }, + { + "epoch": 0.3595427175146173, + "grad_norm": 12.4375, + "learning_rate": 8.801526835246484e-06, + "loss": 1.6382, + "step": 12360 + }, + { + "epoch": 0.36012450184716527, + "grad_norm": 11.375, + "learning_rate": 8.799587553573869e-06, + "loss": 1.7131, + "step": 12380 + }, + { + "epoch": 0.3607062861797132, + "grad_norm": 12.875, + "learning_rate": 8.797648271901254e-06, + "loss": 1.607, + "step": 12400 + }, + { + "epoch": 0.3612880705122611, + "grad_norm": 12.0625, + "learning_rate": 8.795708990228639e-06, + "loss": 1.7249, + "step": 12420 + }, + { + "epoch": 0.36186985484480905, + "grad_norm": 10.0625, + "learning_rate": 8.793769708556024e-06, + "loss": 1.7111, + "step": 12440 + }, + { + "epoch": 0.36245163917735695, + "grad_norm": 13.375, + "learning_rate": 8.791830426883409e-06, + "loss": 1.6574, + "step": 12460 + }, + { + "epoch": 0.36303342350990486, + "grad_norm": 11.5625, + "learning_rate": 8.789891145210794e-06, + "loss": 1.7416, + "step": 12480 + }, + { + "epoch": 0.3636152078424528, + "grad_norm": 11.0, + "learning_rate": 8.78795186353818e-06, + "loss": 1.6661, + "step": 12500 + }, + { + "epoch": 0.36419699217500073, + "grad_norm": 11.625, + "learning_rate": 8.786012581865564e-06, + "loss": 1.7245, + "step": 12520 + }, + { + "epoch": 0.36477877650754864, + "grad_norm": 15.25, + "learning_rate": 8.78407330019295e-06, + "loss": 1.6578, + "step": 12540 + }, + { + "epoch": 0.36536056084009655, + "grad_norm": 15.1875, + "learning_rate": 8.782134018520335e-06, + "loss": 1.708, + "step": 12560 + }, + { + "epoch": 0.3659423451726445, + "grad_norm": 12.375, + "learning_rate": 8.78019473684772e-06, + "loss": 1.7078, + "step": 12580 + }, + { + "epoch": 0.3665241295051924, + "grad_norm": 12.1875, + "learning_rate": 8.778255455175105e-06, + "loss": 1.6425, + "step": 12600 + }, + { + "epoch": 0.3671059138377403, + "grad_norm": 11.5, + "learning_rate": 8.77631617350249e-06, + "loss": 1.8068, + "step": 12620 + }, + { + "epoch": 0.3676876981702883, + "grad_norm": 12.4375, + "learning_rate": 8.774376891829875e-06, + "loss": 1.6618, + "step": 12640 + }, + { + "epoch": 0.3682694825028362, + "grad_norm": 12.25, + "learning_rate": 8.77243761015726e-06, + "loss": 1.5911, + "step": 12660 + }, + { + "epoch": 0.3688512668353841, + "grad_norm": 13.75, + "learning_rate": 8.770498328484645e-06, + "loss": 1.5994, + "step": 12680 + }, + { + "epoch": 0.36943305116793207, + "grad_norm": 12.75, + "learning_rate": 8.76855904681203e-06, + "loss": 1.6917, + "step": 12700 + }, + { + "epoch": 0.37001483550048, + "grad_norm": 10.0, + "learning_rate": 8.766619765139415e-06, + "loss": 1.6336, + "step": 12720 + }, + { + "epoch": 0.3705966198330279, + "grad_norm": 13.625, + "learning_rate": 8.7646804834668e-06, + "loss": 1.7117, + "step": 12740 + }, + { + "epoch": 0.37117840416557585, + "grad_norm": 12.5, + "learning_rate": 8.762741201794186e-06, + "loss": 1.6783, + "step": 12760 + }, + { + "epoch": 0.37176018849812376, + "grad_norm": 12.9375, + "learning_rate": 8.76080192012157e-06, + "loss": 1.6464, + "step": 12780 + }, + { + "epoch": 0.37234197283067166, + "grad_norm": 11.3125, + "learning_rate": 8.758862638448956e-06, + "loss": 1.7115, + "step": 12800 + }, + { + "epoch": 0.37292375716321957, + "grad_norm": 11.0625, + "learning_rate": 8.75692335677634e-06, + "loss": 1.6268, + "step": 12820 + }, + { + "epoch": 0.37350554149576753, + "grad_norm": 11.1875, + "learning_rate": 8.754984075103726e-06, + "loss": 1.5796, + "step": 12840 + }, + { + "epoch": 0.37408732582831544, + "grad_norm": 13.3125, + "learning_rate": 8.753044793431111e-06, + "loss": 1.6056, + "step": 12860 + }, + { + "epoch": 0.37466911016086335, + "grad_norm": 15.9375, + "learning_rate": 8.751105511758496e-06, + "loss": 1.6311, + "step": 12880 + }, + { + "epoch": 0.3752508944934113, + "grad_norm": 14.8125, + "learning_rate": 8.749166230085881e-06, + "loss": 1.6125, + "step": 12900 + }, + { + "epoch": 0.3758326788259592, + "grad_norm": 11.4375, + "learning_rate": 8.747226948413266e-06, + "loss": 1.6119, + "step": 12920 + }, + { + "epoch": 0.37641446315850713, + "grad_norm": 14.5625, + "learning_rate": 8.74528766674065e-06, + "loss": 1.6739, + "step": 12940 + }, + { + "epoch": 0.3769962474910551, + "grad_norm": 9.25, + "learning_rate": 8.743348385068035e-06, + "loss": 1.7119, + "step": 12960 + }, + { + "epoch": 0.377578031823603, + "grad_norm": 13.0, + "learning_rate": 8.74140910339542e-06, + "loss": 1.6802, + "step": 12980 + }, + { + "epoch": 0.3781598161561509, + "grad_norm": 12.1875, + "learning_rate": 8.739469821722805e-06, + "loss": 1.5522, + "step": 13000 + }, + { + "epoch": 0.3787416004886988, + "grad_norm": 16.375, + "learning_rate": 8.73753054005019e-06, + "loss": 1.6771, + "step": 13020 + }, + { + "epoch": 0.3793233848212468, + "grad_norm": 10.5625, + "learning_rate": 8.735591258377575e-06, + "loss": 1.6635, + "step": 13040 + }, + { + "epoch": 0.3799051691537947, + "grad_norm": 10.8125, + "learning_rate": 8.73365197670496e-06, + "loss": 1.6157, + "step": 13060 + }, + { + "epoch": 0.3804869534863426, + "grad_norm": 11.125, + "learning_rate": 8.731712695032345e-06, + "loss": 1.6574, + "step": 13080 + }, + { + "epoch": 0.38106873781889056, + "grad_norm": 15.8125, + "learning_rate": 8.72977341335973e-06, + "loss": 1.6298, + "step": 13100 + }, + { + "epoch": 0.38165052215143846, + "grad_norm": 13.0, + "learning_rate": 8.727834131687116e-06, + "loss": 1.749, + "step": 13120 + }, + { + "epoch": 0.3822323064839864, + "grad_norm": 16.625, + "learning_rate": 8.7258948500145e-06, + "loss": 1.736, + "step": 13140 + }, + { + "epoch": 0.38281409081653434, + "grad_norm": 11.875, + "learning_rate": 8.723955568341886e-06, + "loss": 1.6702, + "step": 13160 + }, + { + "epoch": 0.38339587514908224, + "grad_norm": 12.75, + "learning_rate": 8.722016286669271e-06, + "loss": 1.6749, + "step": 13180 + }, + { + "epoch": 0.38397765948163015, + "grad_norm": 12.875, + "learning_rate": 8.720077004996656e-06, + "loss": 1.7487, + "step": 13200 + }, + { + "epoch": 0.38455944381417806, + "grad_norm": 12.5625, + "learning_rate": 8.718137723324041e-06, + "loss": 1.6517, + "step": 13220 + }, + { + "epoch": 0.385141228146726, + "grad_norm": 11.5, + "learning_rate": 8.716198441651426e-06, + "loss": 1.6829, + "step": 13240 + }, + { + "epoch": 0.38572301247927393, + "grad_norm": 12.875, + "learning_rate": 8.714259159978811e-06, + "loss": 1.6287, + "step": 13260 + }, + { + "epoch": 0.38630479681182184, + "grad_norm": 13.1875, + "learning_rate": 8.712319878306196e-06, + "loss": 1.7027, + "step": 13280 + }, + { + "epoch": 0.3868865811443698, + "grad_norm": 12.75, + "learning_rate": 8.710380596633581e-06, + "loss": 1.762, + "step": 13300 + }, + { + "epoch": 0.3874683654769177, + "grad_norm": 15.3125, + "learning_rate": 8.708441314960967e-06, + "loss": 1.6645, + "step": 13320 + }, + { + "epoch": 0.3880501498094656, + "grad_norm": 14.0, + "learning_rate": 8.706502033288352e-06, + "loss": 1.6303, + "step": 13340 + }, + { + "epoch": 0.3886319341420136, + "grad_norm": 11.4375, + "learning_rate": 8.704562751615737e-06, + "loss": 1.6327, + "step": 13360 + }, + { + "epoch": 0.3892137184745615, + "grad_norm": 10.8125, + "learning_rate": 8.702623469943122e-06, + "loss": 1.7711, + "step": 13380 + }, + { + "epoch": 0.3897955028071094, + "grad_norm": 12.75, + "learning_rate": 8.700684188270507e-06, + "loss": 1.697, + "step": 13400 + }, + { + "epoch": 0.3903772871396573, + "grad_norm": 12.1875, + "learning_rate": 8.698744906597892e-06, + "loss": 1.6974, + "step": 13420 + }, + { + "epoch": 0.39095907147220527, + "grad_norm": 10.875, + "learning_rate": 8.696805624925277e-06, + "loss": 1.6802, + "step": 13440 + }, + { + "epoch": 0.3915408558047532, + "grad_norm": 7.59375, + "learning_rate": 8.694866343252662e-06, + "loss": 1.6845, + "step": 13460 + }, + { + "epoch": 0.3921226401373011, + "grad_norm": 12.9375, + "learning_rate": 8.692927061580047e-06, + "loss": 1.7635, + "step": 13480 + }, + { + "epoch": 0.39270442446984904, + "grad_norm": 13.5, + "learning_rate": 8.690987779907432e-06, + "loss": 1.6645, + "step": 13500 + }, + { + "epoch": 0.39328620880239695, + "grad_norm": 13.375, + "learning_rate": 8.689048498234818e-06, + "loss": 1.6436, + "step": 13520 + }, + { + "epoch": 0.39386799313494486, + "grad_norm": 11.75, + "learning_rate": 8.687109216562203e-06, + "loss": 1.6567, + "step": 13540 + }, + { + "epoch": 0.3944497774674928, + "grad_norm": 11.0625, + "learning_rate": 8.685169934889588e-06, + "loss": 1.6634, + "step": 13560 + }, + { + "epoch": 0.39503156180004073, + "grad_norm": 11.3125, + "learning_rate": 8.683230653216973e-06, + "loss": 1.6337, + "step": 13580 + }, + { + "epoch": 0.39561334613258864, + "grad_norm": 10.75, + "learning_rate": 8.681291371544358e-06, + "loss": 1.6324, + "step": 13600 + }, + { + "epoch": 0.39619513046513655, + "grad_norm": 12.0625, + "learning_rate": 8.679352089871743e-06, + "loss": 1.7301, + "step": 13620 + }, + { + "epoch": 0.3967769147976845, + "grad_norm": 12.1875, + "learning_rate": 8.677412808199128e-06, + "loss": 1.6491, + "step": 13640 + }, + { + "epoch": 0.3973586991302324, + "grad_norm": 10.5625, + "learning_rate": 8.675473526526513e-06, + "loss": 1.6356, + "step": 13660 + }, + { + "epoch": 0.3979404834627803, + "grad_norm": 13.5625, + "learning_rate": 8.673534244853898e-06, + "loss": 1.6638, + "step": 13680 + }, + { + "epoch": 0.3985222677953283, + "grad_norm": 12.5, + "learning_rate": 8.671594963181283e-06, + "loss": 1.6306, + "step": 13700 + }, + { + "epoch": 0.3991040521278762, + "grad_norm": 11.875, + "learning_rate": 8.669655681508669e-06, + "loss": 1.6401, + "step": 13720 + }, + { + "epoch": 0.3996858364604241, + "grad_norm": 14.1875, + "learning_rate": 8.667716399836054e-06, + "loss": 1.7083, + "step": 13740 + }, + { + "epoch": 0.40026762079297207, + "grad_norm": 11.0, + "learning_rate": 8.665777118163439e-06, + "loss": 1.6409, + "step": 13760 + }, + { + "epoch": 0.40084940512552, + "grad_norm": 12.625, + "learning_rate": 8.663837836490824e-06, + "loss": 1.6309, + "step": 13780 + }, + { + "epoch": 0.4014311894580679, + "grad_norm": 14.0625, + "learning_rate": 8.661898554818209e-06, + "loss": 1.6464, + "step": 13800 + }, + { + "epoch": 0.40201297379061585, + "grad_norm": 12.5625, + "learning_rate": 8.659959273145594e-06, + "loss": 1.6834, + "step": 13820 + }, + { + "epoch": 0.40259475812316375, + "grad_norm": 10.9375, + "learning_rate": 8.658019991472979e-06, + "loss": 1.5307, + "step": 13840 + }, + { + "epoch": 0.40317654245571166, + "grad_norm": 12.0, + "learning_rate": 8.656080709800364e-06, + "loss": 1.6924, + "step": 13860 + }, + { + "epoch": 0.40375832678825957, + "grad_norm": 14.0, + "learning_rate": 8.65414142812775e-06, + "loss": 1.687, + "step": 13880 + }, + { + "epoch": 0.40434011112080753, + "grad_norm": 12.5625, + "learning_rate": 8.652202146455134e-06, + "loss": 1.6729, + "step": 13900 + }, + { + "epoch": 0.40492189545335544, + "grad_norm": 10.6875, + "learning_rate": 8.65026286478252e-06, + "loss": 1.6525, + "step": 13920 + }, + { + "epoch": 0.40550367978590335, + "grad_norm": 11.8125, + "learning_rate": 8.648323583109905e-06, + "loss": 1.6706, + "step": 13940 + }, + { + "epoch": 0.4060854641184513, + "grad_norm": 16.25, + "learning_rate": 8.64638430143729e-06, + "loss": 1.7039, + "step": 13960 + }, + { + "epoch": 0.4066672484509992, + "grad_norm": 13.9375, + "learning_rate": 8.644445019764675e-06, + "loss": 1.6011, + "step": 13980 + }, + { + "epoch": 0.4072490327835471, + "grad_norm": 10.5625, + "learning_rate": 8.64250573809206e-06, + "loss": 1.6004, + "step": 14000 + }, + { + "epoch": 0.4078308171160951, + "grad_norm": 13.875, + "learning_rate": 8.640566456419445e-06, + "loss": 1.6513, + "step": 14020 + }, + { + "epoch": 0.408412601448643, + "grad_norm": 13.125, + "learning_rate": 8.63862717474683e-06, + "loss": 1.637, + "step": 14040 + }, + { + "epoch": 0.4089943857811909, + "grad_norm": 11.375, + "learning_rate": 8.636687893074215e-06, + "loss": 1.6971, + "step": 14060 + }, + { + "epoch": 0.4095761701137388, + "grad_norm": 11.375, + "learning_rate": 8.6347486114016e-06, + "loss": 1.6934, + "step": 14080 + }, + { + "epoch": 0.4101579544462868, + "grad_norm": 14.1875, + "learning_rate": 8.632809329728985e-06, + "loss": 1.6761, + "step": 14100 + }, + { + "epoch": 0.4107397387788347, + "grad_norm": 11.125, + "learning_rate": 8.63087004805637e-06, + "loss": 1.7236, + "step": 14120 + }, + { + "epoch": 0.4113215231113826, + "grad_norm": 12.8125, + "learning_rate": 8.628930766383756e-06, + "loss": 1.7004, + "step": 14140 + }, + { + "epoch": 0.41190330744393056, + "grad_norm": 11.625, + "learning_rate": 8.62699148471114e-06, + "loss": 1.5457, + "step": 14160 + }, + { + "epoch": 0.41248509177647846, + "grad_norm": 14.375, + "learning_rate": 8.625052203038526e-06, + "loss": 1.5415, + "step": 14180 + }, + { + "epoch": 0.41306687610902637, + "grad_norm": 10.5625, + "learning_rate": 8.623112921365911e-06, + "loss": 1.7406, + "step": 14200 + }, + { + "epoch": 0.41364866044157433, + "grad_norm": 19.625, + "learning_rate": 8.621173639693296e-06, + "loss": 1.675, + "step": 14220 + }, + { + "epoch": 0.41423044477412224, + "grad_norm": 12.875, + "learning_rate": 8.619234358020681e-06, + "loss": 1.7145, + "step": 14240 + }, + { + "epoch": 0.41481222910667015, + "grad_norm": 10.75, + "learning_rate": 8.617295076348066e-06, + "loss": 1.6529, + "step": 14260 + }, + { + "epoch": 0.41539401343921806, + "grad_norm": 11.625, + "learning_rate": 8.615355794675451e-06, + "loss": 1.644, + "step": 14280 + }, + { + "epoch": 0.415975797771766, + "grad_norm": 13.3125, + "learning_rate": 8.613416513002836e-06, + "loss": 1.6978, + "step": 14300 + }, + { + "epoch": 0.41655758210431393, + "grad_norm": 13.3125, + "learning_rate": 8.611477231330221e-06, + "loss": 1.683, + "step": 14320 + }, + { + "epoch": 0.41713936643686184, + "grad_norm": 11.25, + "learning_rate": 8.609537949657607e-06, + "loss": 1.6661, + "step": 14340 + }, + { + "epoch": 0.4177211507694098, + "grad_norm": 12.75, + "learning_rate": 8.607598667984992e-06, + "loss": 1.6891, + "step": 14360 + }, + { + "epoch": 0.4183029351019577, + "grad_norm": 13.5, + "learning_rate": 8.605659386312377e-06, + "loss": 1.6421, + "step": 14380 + }, + { + "epoch": 0.4188847194345056, + "grad_norm": 10.8125, + "learning_rate": 8.603720104639762e-06, + "loss": 1.6044, + "step": 14400 + }, + { + "epoch": 0.4194665037670536, + "grad_norm": 13.3125, + "learning_rate": 8.601780822967147e-06, + "loss": 1.6135, + "step": 14420 + }, + { + "epoch": 0.4200482880996015, + "grad_norm": 12.0, + "learning_rate": 8.599841541294532e-06, + "loss": 1.6754, + "step": 14440 + }, + { + "epoch": 0.4206300724321494, + "grad_norm": 13.25, + "learning_rate": 8.597902259621917e-06, + "loss": 1.6302, + "step": 14460 + }, + { + "epoch": 0.4212118567646973, + "grad_norm": 11.0625, + "learning_rate": 8.595962977949302e-06, + "loss": 1.5996, + "step": 14480 + }, + { + "epoch": 0.42179364109724526, + "grad_norm": 12.25, + "learning_rate": 8.594023696276687e-06, + "loss": 1.6721, + "step": 14500 + }, + { + "epoch": 0.42237542542979317, + "grad_norm": 11.8125, + "learning_rate": 8.592084414604072e-06, + "loss": 1.6192, + "step": 14520 + }, + { + "epoch": 0.4229572097623411, + "grad_norm": 14.0, + "learning_rate": 8.590145132931458e-06, + "loss": 1.5439, + "step": 14540 + }, + { + "epoch": 0.42353899409488904, + "grad_norm": 12.5, + "learning_rate": 8.588205851258841e-06, + "loss": 1.661, + "step": 14560 + }, + { + "epoch": 0.42412077842743695, + "grad_norm": 11.4375, + "learning_rate": 8.586266569586226e-06, + "loss": 1.6487, + "step": 14580 + }, + { + "epoch": 0.42470256275998486, + "grad_norm": 15.6875, + "learning_rate": 8.584327287913611e-06, + "loss": 1.6176, + "step": 14600 + }, + { + "epoch": 0.4252843470925328, + "grad_norm": 12.5, + "learning_rate": 8.582388006240996e-06, + "loss": 1.6658, + "step": 14620 + }, + { + "epoch": 0.42586613142508073, + "grad_norm": 12.4375, + "learning_rate": 8.580448724568381e-06, + "loss": 1.6275, + "step": 14640 + }, + { + "epoch": 0.42644791575762864, + "grad_norm": 13.4375, + "learning_rate": 8.578509442895766e-06, + "loss": 1.6152, + "step": 14660 + }, + { + "epoch": 0.42702970009017654, + "grad_norm": 11.5, + "learning_rate": 8.576570161223152e-06, + "loss": 1.6167, + "step": 14680 + }, + { + "epoch": 0.4276114844227245, + "grad_norm": 13.5, + "learning_rate": 8.574630879550537e-06, + "loss": 1.605, + "step": 14700 + }, + { + "epoch": 0.4281932687552724, + "grad_norm": 16.0, + "learning_rate": 8.572691597877922e-06, + "loss": 1.6955, + "step": 14720 + }, + { + "epoch": 0.4287750530878203, + "grad_norm": 13.6875, + "learning_rate": 8.570752316205307e-06, + "loss": 1.5291, + "step": 14740 + }, + { + "epoch": 0.4293568374203683, + "grad_norm": 12.125, + "learning_rate": 8.568813034532692e-06, + "loss": 1.7469, + "step": 14760 + }, + { + "epoch": 0.4299386217529162, + "grad_norm": 11.5625, + "learning_rate": 8.566873752860077e-06, + "loss": 1.7228, + "step": 14780 + }, + { + "epoch": 0.4305204060854641, + "grad_norm": 14.75, + "learning_rate": 8.564934471187462e-06, + "loss": 1.6517, + "step": 14800 + }, + { + "epoch": 0.43110219041801207, + "grad_norm": 11.0, + "learning_rate": 8.562995189514847e-06, + "loss": 1.5406, + "step": 14820 + }, + { + "epoch": 0.43168397475056, + "grad_norm": 12.25, + "learning_rate": 8.561055907842232e-06, + "loss": 1.6401, + "step": 14840 + }, + { + "epoch": 0.4322657590831079, + "grad_norm": 12.0625, + "learning_rate": 8.559116626169617e-06, + "loss": 1.6032, + "step": 14860 + }, + { + "epoch": 0.4328475434156558, + "grad_norm": 12.5, + "learning_rate": 8.557177344497002e-06, + "loss": 1.7027, + "step": 14880 + }, + { + "epoch": 0.43342932774820375, + "grad_norm": 11.8125, + "learning_rate": 8.555238062824388e-06, + "loss": 1.6566, + "step": 14900 + }, + { + "epoch": 0.43401111208075166, + "grad_norm": 13.1875, + "learning_rate": 8.553298781151773e-06, + "loss": 1.6975, + "step": 14920 + }, + { + "epoch": 0.43459289641329957, + "grad_norm": 11.8125, + "learning_rate": 8.551359499479158e-06, + "loss": 1.6332, + "step": 14940 + }, + { + "epoch": 0.43517468074584753, + "grad_norm": 12.5, + "learning_rate": 8.549420217806543e-06, + "loss": 1.6447, + "step": 14960 + }, + { + "epoch": 0.43575646507839544, + "grad_norm": 13.0625, + "learning_rate": 8.547480936133928e-06, + "loss": 1.7399, + "step": 14980 + }, + { + "epoch": 0.43633824941094335, + "grad_norm": 12.6875, + "learning_rate": 8.545541654461313e-06, + "loss": 1.7293, + "step": 15000 + }, + { + "epoch": 0.4369200337434913, + "grad_norm": 13.0625, + "learning_rate": 8.543602372788698e-06, + "loss": 1.6866, + "step": 15020 + }, + { + "epoch": 0.4375018180760392, + "grad_norm": 13.9375, + "learning_rate": 8.541663091116083e-06, + "loss": 1.6139, + "step": 15040 + }, + { + "epoch": 0.4380836024085871, + "grad_norm": 12.9375, + "learning_rate": 8.539723809443468e-06, + "loss": 1.6182, + "step": 15060 + }, + { + "epoch": 0.4386653867411351, + "grad_norm": 13.375, + "learning_rate": 8.537784527770853e-06, + "loss": 1.6955, + "step": 15080 + }, + { + "epoch": 0.439247171073683, + "grad_norm": 12.6875, + "learning_rate": 8.535845246098239e-06, + "loss": 1.5942, + "step": 15100 + }, + { + "epoch": 0.4398289554062309, + "grad_norm": 12.5, + "learning_rate": 8.533905964425624e-06, + "loss": 1.6648, + "step": 15120 + }, + { + "epoch": 0.4404107397387788, + "grad_norm": 12.25, + "learning_rate": 8.531966682753009e-06, + "loss": 1.6914, + "step": 15140 + }, + { + "epoch": 0.4409925240713268, + "grad_norm": 11.9375, + "learning_rate": 8.530027401080394e-06, + "loss": 1.6168, + "step": 15160 + }, + { + "epoch": 0.4415743084038747, + "grad_norm": 11.5625, + "learning_rate": 8.528088119407779e-06, + "loss": 1.7224, + "step": 15180 + }, + { + "epoch": 0.4421560927364226, + "grad_norm": 13.6875, + "learning_rate": 8.526148837735164e-06, + "loss": 1.6296, + "step": 15200 + }, + { + "epoch": 0.44273787706897055, + "grad_norm": 12.25, + "learning_rate": 8.524209556062549e-06, + "loss": 1.5624, + "step": 15220 + }, + { + "epoch": 0.44331966140151846, + "grad_norm": 12.5625, + "learning_rate": 8.522270274389934e-06, + "loss": 1.6387, + "step": 15240 + }, + { + "epoch": 0.44390144573406637, + "grad_norm": 14.375, + "learning_rate": 8.52033099271732e-06, + "loss": 1.6439, + "step": 15260 + }, + { + "epoch": 0.44448323006661433, + "grad_norm": 11.1875, + "learning_rate": 8.518391711044704e-06, + "loss": 1.697, + "step": 15280 + }, + { + "epoch": 0.44506501439916224, + "grad_norm": 13.4375, + "learning_rate": 8.51645242937209e-06, + "loss": 1.7002, + "step": 15300 + }, + { + "epoch": 0.44564679873171015, + "grad_norm": 13.875, + "learning_rate": 8.514513147699475e-06, + "loss": 1.6779, + "step": 15320 + }, + { + "epoch": 0.44622858306425806, + "grad_norm": 12.8125, + "learning_rate": 8.51257386602686e-06, + "loss": 1.6922, + "step": 15340 + }, + { + "epoch": 0.446810367396806, + "grad_norm": 11.3125, + "learning_rate": 8.510634584354245e-06, + "loss": 1.6771, + "step": 15360 + }, + { + "epoch": 0.4473921517293539, + "grad_norm": 14.1875, + "learning_rate": 8.50869530268163e-06, + "loss": 1.7192, + "step": 15380 + }, + { + "epoch": 0.44797393606190183, + "grad_norm": 15.25, + "learning_rate": 8.506756021009015e-06, + "loss": 1.6994, + "step": 15400 + }, + { + "epoch": 0.4485557203944498, + "grad_norm": 11.125, + "learning_rate": 8.5048167393364e-06, + "loss": 1.7224, + "step": 15420 + }, + { + "epoch": 0.4491375047269977, + "grad_norm": 13.5625, + "learning_rate": 8.502877457663785e-06, + "loss": 1.596, + "step": 15440 + }, + { + "epoch": 0.4497192890595456, + "grad_norm": 13.375, + "learning_rate": 8.50093817599117e-06, + "loss": 1.626, + "step": 15460 + }, + { + "epoch": 0.4503010733920936, + "grad_norm": 11.125, + "learning_rate": 8.498998894318555e-06, + "loss": 1.6653, + "step": 15480 + }, + { + "epoch": 0.4508828577246415, + "grad_norm": 11.375, + "learning_rate": 8.49705961264594e-06, + "loss": 1.6906, + "step": 15500 + }, + { + "epoch": 0.4514646420571894, + "grad_norm": 12.1875, + "learning_rate": 8.495120330973326e-06, + "loss": 1.5862, + "step": 15520 + }, + { + "epoch": 0.4520464263897373, + "grad_norm": 12.125, + "learning_rate": 8.49318104930071e-06, + "loss": 1.624, + "step": 15540 + }, + { + "epoch": 0.45262821072228526, + "grad_norm": 10.875, + "learning_rate": 8.491241767628096e-06, + "loss": 1.6233, + "step": 15560 + }, + { + "epoch": 0.45320999505483317, + "grad_norm": 11.0, + "learning_rate": 8.489302485955481e-06, + "loss": 1.6777, + "step": 15580 + }, + { + "epoch": 0.4537917793873811, + "grad_norm": 12.0, + "learning_rate": 8.487363204282866e-06, + "loss": 1.6539, + "step": 15600 + }, + { + "epoch": 0.45437356371992904, + "grad_norm": 12.25, + "learning_rate": 8.485423922610251e-06, + "loss": 1.6355, + "step": 15620 + }, + { + "epoch": 0.45495534805247695, + "grad_norm": 13.3125, + "learning_rate": 8.483484640937636e-06, + "loss": 1.7094, + "step": 15640 + }, + { + "epoch": 0.45553713238502486, + "grad_norm": 12.625, + "learning_rate": 8.481545359265021e-06, + "loss": 1.6191, + "step": 15660 + }, + { + "epoch": 0.4561189167175728, + "grad_norm": 12.75, + "learning_rate": 8.479606077592406e-06, + "loss": 1.6599, + "step": 15680 + }, + { + "epoch": 0.4567007010501207, + "grad_norm": 11.4375, + "learning_rate": 8.477666795919791e-06, + "loss": 1.6194, + "step": 15700 + }, + { + "epoch": 0.45728248538266864, + "grad_norm": 12.375, + "learning_rate": 8.475727514247177e-06, + "loss": 1.7066, + "step": 15720 + }, + { + "epoch": 0.45786426971521654, + "grad_norm": 13.5625, + "learning_rate": 8.473788232574562e-06, + "loss": 1.6937, + "step": 15740 + }, + { + "epoch": 0.4584460540477645, + "grad_norm": 10.625, + "learning_rate": 8.471848950901947e-06, + "loss": 1.6028, + "step": 15760 + }, + { + "epoch": 0.4590278383803124, + "grad_norm": 12.6875, + "learning_rate": 8.469909669229332e-06, + "loss": 1.7428, + "step": 15780 + }, + { + "epoch": 0.4596096227128603, + "grad_norm": 12.75, + "learning_rate": 8.467970387556717e-06, + "loss": 1.6265, + "step": 15800 + }, + { + "epoch": 0.4601914070454083, + "grad_norm": 13.5, + "learning_rate": 8.466031105884102e-06, + "loss": 1.7238, + "step": 15820 + }, + { + "epoch": 0.4607731913779562, + "grad_norm": 17.5, + "learning_rate": 8.464091824211487e-06, + "loss": 1.6624, + "step": 15840 + }, + { + "epoch": 0.4613549757105041, + "grad_norm": 8.875, + "learning_rate": 8.462152542538872e-06, + "loss": 1.6134, + "step": 15860 + }, + { + "epoch": 0.46193676004305206, + "grad_norm": 11.4375, + "learning_rate": 8.460213260866257e-06, + "loss": 1.6501, + "step": 15880 + }, + { + "epoch": 0.46251854437559997, + "grad_norm": 15.25, + "learning_rate": 8.458273979193642e-06, + "loss": 1.6335, + "step": 15900 + }, + { + "epoch": 0.4631003287081479, + "grad_norm": 13.25, + "learning_rate": 8.456334697521028e-06, + "loss": 1.6504, + "step": 15920 + }, + { + "epoch": 0.4636821130406958, + "grad_norm": 13.4375, + "learning_rate": 8.454395415848413e-06, + "loss": 1.5638, + "step": 15940 + }, + { + "epoch": 0.46426389737324375, + "grad_norm": 13.5625, + "learning_rate": 8.452456134175798e-06, + "loss": 1.7835, + "step": 15960 + }, + { + "epoch": 0.46484568170579166, + "grad_norm": 13.5625, + "learning_rate": 8.450516852503183e-06, + "loss": 1.6515, + "step": 15980 + }, + { + "epoch": 0.46542746603833957, + "grad_norm": 12.0, + "learning_rate": 8.448577570830568e-06, + "loss": 1.6011, + "step": 16000 + }, + { + "epoch": 0.46600925037088753, + "grad_norm": 14.6875, + "learning_rate": 8.446638289157953e-06, + "loss": 1.637, + "step": 16020 + }, + { + "epoch": 0.46659103470343544, + "grad_norm": 12.125, + "learning_rate": 8.444699007485338e-06, + "loss": 1.7068, + "step": 16040 + }, + { + "epoch": 0.46717281903598334, + "grad_norm": 12.875, + "learning_rate": 8.442759725812723e-06, + "loss": 1.6211, + "step": 16060 + }, + { + "epoch": 0.4677546033685313, + "grad_norm": 13.6875, + "learning_rate": 8.440820444140108e-06, + "loss": 1.5866, + "step": 16080 + }, + { + "epoch": 0.4683363877010792, + "grad_norm": 11.8125, + "learning_rate": 8.438881162467493e-06, + "loss": 1.721, + "step": 16100 + }, + { + "epoch": 0.4689181720336271, + "grad_norm": 10.9375, + "learning_rate": 8.436941880794879e-06, + "loss": 1.6362, + "step": 16120 + }, + { + "epoch": 0.4694999563661751, + "grad_norm": 12.1875, + "learning_rate": 8.435002599122264e-06, + "loss": 1.6794, + "step": 16140 + }, + { + "epoch": 0.470081740698723, + "grad_norm": 14.375, + "learning_rate": 8.433063317449649e-06, + "loss": 1.6712, + "step": 16160 + }, + { + "epoch": 0.4706635250312709, + "grad_norm": 14.5625, + "learning_rate": 8.431124035777034e-06, + "loss": 1.6059, + "step": 16180 + }, + { + "epoch": 0.4712453093638188, + "grad_norm": 10.8125, + "learning_rate": 8.429184754104417e-06, + "loss": 1.6212, + "step": 16200 + }, + { + "epoch": 0.4718270936963668, + "grad_norm": 13.9375, + "learning_rate": 8.427245472431802e-06, + "loss": 1.6766, + "step": 16220 + }, + { + "epoch": 0.4724088780289147, + "grad_norm": 11.625, + "learning_rate": 8.425306190759187e-06, + "loss": 1.6919, + "step": 16240 + }, + { + "epoch": 0.4729906623614626, + "grad_norm": 12.0, + "learning_rate": 8.423366909086573e-06, + "loss": 1.6914, + "step": 16260 + }, + { + "epoch": 0.47357244669401055, + "grad_norm": 11.5, + "learning_rate": 8.421427627413958e-06, + "loss": 1.7106, + "step": 16280 + }, + { + "epoch": 0.47415423102655846, + "grad_norm": 12.3125, + "learning_rate": 8.419488345741343e-06, + "loss": 1.6553, + "step": 16300 + }, + { + "epoch": 0.47473601535910637, + "grad_norm": 11.9375, + "learning_rate": 8.417549064068728e-06, + "loss": 1.6824, + "step": 16320 + }, + { + "epoch": 0.47531779969165433, + "grad_norm": 14.0, + "learning_rate": 8.415609782396113e-06, + "loss": 1.6005, + "step": 16340 + }, + { + "epoch": 0.47589958402420224, + "grad_norm": 12.8125, + "learning_rate": 8.413670500723498e-06, + "loss": 1.6092, + "step": 16360 + }, + { + "epoch": 0.47648136835675015, + "grad_norm": 21.25, + "learning_rate": 8.411731219050883e-06, + "loss": 1.6701, + "step": 16380 + }, + { + "epoch": 0.47706315268929805, + "grad_norm": 14.625, + "learning_rate": 8.409791937378268e-06, + "loss": 1.6229, + "step": 16400 + }, + { + "epoch": 0.477644937021846, + "grad_norm": 11.375, + "learning_rate": 8.407852655705653e-06, + "loss": 1.5859, + "step": 16420 + }, + { + "epoch": 0.4782267213543939, + "grad_norm": 12.0625, + "learning_rate": 8.405913374033038e-06, + "loss": 1.6195, + "step": 16440 + }, + { + "epoch": 0.47880850568694183, + "grad_norm": 10.6875, + "learning_rate": 8.403974092360423e-06, + "loss": 1.7007, + "step": 16460 + }, + { + "epoch": 0.4793902900194898, + "grad_norm": 14.125, + "learning_rate": 8.402034810687809e-06, + "loss": 1.6735, + "step": 16480 + }, + { + "epoch": 0.4799720743520377, + "grad_norm": 12.5, + "learning_rate": 8.400095529015194e-06, + "loss": 1.6816, + "step": 16500 + }, + { + "epoch": 0.4805538586845856, + "grad_norm": 10.875, + "learning_rate": 8.398156247342579e-06, + "loss": 1.6985, + "step": 16520 + }, + { + "epoch": 0.4811356430171336, + "grad_norm": 11.75, + "learning_rate": 8.396216965669964e-06, + "loss": 1.7448, + "step": 16540 + }, + { + "epoch": 0.4817174273496815, + "grad_norm": 13.375, + "learning_rate": 8.394277683997349e-06, + "loss": 1.6185, + "step": 16560 + }, + { + "epoch": 0.4822992116822294, + "grad_norm": 14.875, + "learning_rate": 8.392338402324734e-06, + "loss": 1.6197, + "step": 16580 + }, + { + "epoch": 0.4828809960147773, + "grad_norm": 13.75, + "learning_rate": 8.39039912065212e-06, + "loss": 1.6744, + "step": 16600 + }, + { + "epoch": 0.48346278034732526, + "grad_norm": 11.875, + "learning_rate": 8.388459838979504e-06, + "loss": 1.6772, + "step": 16620 + }, + { + "epoch": 0.48404456467987317, + "grad_norm": 13.625, + "learning_rate": 8.38652055730689e-06, + "loss": 1.67, + "step": 16640 + }, + { + "epoch": 0.4846263490124211, + "grad_norm": 13.0625, + "learning_rate": 8.384581275634274e-06, + "loss": 1.6667, + "step": 16660 + }, + { + "epoch": 0.48520813334496904, + "grad_norm": 14.3125, + "learning_rate": 8.38264199396166e-06, + "loss": 1.7014, + "step": 16680 + }, + { + "epoch": 0.48578991767751695, + "grad_norm": 14.375, + "learning_rate": 8.380702712289045e-06, + "loss": 1.6499, + "step": 16700 + }, + { + "epoch": 0.48637170201006485, + "grad_norm": 14.0625, + "learning_rate": 8.37876343061643e-06, + "loss": 1.6073, + "step": 16720 + }, + { + "epoch": 0.4869534863426128, + "grad_norm": 13.8125, + "learning_rate": 8.376824148943815e-06, + "loss": 1.6416, + "step": 16740 + }, + { + "epoch": 0.4875352706751607, + "grad_norm": 15.5625, + "learning_rate": 8.374884867271198e-06, + "loss": 1.6485, + "step": 16760 + }, + { + "epoch": 0.48811705500770863, + "grad_norm": 12.3125, + "learning_rate": 8.372945585598583e-06, + "loss": 1.7104, + "step": 16780 + }, + { + "epoch": 0.48869883934025654, + "grad_norm": 10.6875, + "learning_rate": 8.371006303925968e-06, + "loss": 1.5394, + "step": 16800 + }, + { + "epoch": 0.4892806236728045, + "grad_norm": 14.0, + "learning_rate": 8.369067022253354e-06, + "loss": 1.5488, + "step": 16820 + }, + { + "epoch": 0.4898624080053524, + "grad_norm": 12.625, + "learning_rate": 8.367127740580739e-06, + "loss": 1.5977, + "step": 16840 + }, + { + "epoch": 0.4904441923379003, + "grad_norm": 11.875, + "learning_rate": 8.365188458908124e-06, + "loss": 1.5608, + "step": 16860 + }, + { + "epoch": 0.4910259766704483, + "grad_norm": 13.3125, + "learning_rate": 8.363249177235509e-06, + "loss": 1.6741, + "step": 16880 + }, + { + "epoch": 0.4916077610029962, + "grad_norm": 13.3125, + "learning_rate": 8.361309895562894e-06, + "loss": 1.6442, + "step": 16900 + }, + { + "epoch": 0.4921895453355441, + "grad_norm": 12.25, + "learning_rate": 8.359370613890279e-06, + "loss": 1.6331, + "step": 16920 + }, + { + "epoch": 0.49277132966809206, + "grad_norm": 11.75, + "learning_rate": 8.357431332217664e-06, + "loss": 1.627, + "step": 16940 + }, + { + "epoch": 0.49335311400063997, + "grad_norm": 12.6875, + "learning_rate": 8.35549205054505e-06, + "loss": 1.6895, + "step": 16960 + }, + { + "epoch": 0.4939348983331879, + "grad_norm": 10.0625, + "learning_rate": 8.353552768872434e-06, + "loss": 1.6495, + "step": 16980 + }, + { + "epoch": 0.4945166826657358, + "grad_norm": 12.25, + "learning_rate": 8.35161348719982e-06, + "loss": 1.6889, + "step": 17000 + }, + { + "epoch": 0.49509846699828375, + "grad_norm": 14.25, + "learning_rate": 8.349674205527205e-06, + "loss": 1.6643, + "step": 17020 + }, + { + "epoch": 0.49568025133083166, + "grad_norm": 13.625, + "learning_rate": 8.34773492385459e-06, + "loss": 1.572, + "step": 17040 + }, + { + "epoch": 0.49626203566337956, + "grad_norm": 11.4375, + "learning_rate": 8.345795642181975e-06, + "loss": 1.6003, + "step": 17060 + }, + { + "epoch": 0.4968438199959275, + "grad_norm": 13.25, + "learning_rate": 8.34385636050936e-06, + "loss": 1.7059, + "step": 17080 + }, + { + "epoch": 0.49742560432847543, + "grad_norm": 12.75, + "learning_rate": 8.341917078836745e-06, + "loss": 1.5876, + "step": 17100 + }, + { + "epoch": 0.49800738866102334, + "grad_norm": 12.0, + "learning_rate": 8.33997779716413e-06, + "loss": 1.6523, + "step": 17120 + }, + { + "epoch": 0.4985891729935713, + "grad_norm": 12.8125, + "learning_rate": 8.338038515491515e-06, + "loss": 1.7046, + "step": 17140 + }, + { + "epoch": 0.4991709573261192, + "grad_norm": 11.4375, + "learning_rate": 8.3360992338189e-06, + "loss": 1.5631, + "step": 17160 + }, + { + "epoch": 0.4997527416586671, + "grad_norm": 13.3125, + "learning_rate": 8.334159952146285e-06, + "loss": 1.585, + "step": 17180 + }, + { + "epoch": 0.5003345259912151, + "grad_norm": 13.5, + "learning_rate": 8.33222067047367e-06, + "loss": 1.7361, + "step": 17200 + }, + { + "epoch": 0.5009163103237629, + "grad_norm": 14.875, + "learning_rate": 8.330281388801056e-06, + "loss": 1.6168, + "step": 17220 + }, + { + "epoch": 0.5014980946563109, + "grad_norm": 11.6875, + "learning_rate": 8.32834210712844e-06, + "loss": 1.6471, + "step": 17240 + }, + { + "epoch": 0.5020798789888589, + "grad_norm": 12.3125, + "learning_rate": 8.326402825455826e-06, + "loss": 1.6724, + "step": 17260 + }, + { + "epoch": 0.5026616633214067, + "grad_norm": 11.1875, + "learning_rate": 8.32446354378321e-06, + "loss": 1.6533, + "step": 17280 + }, + { + "epoch": 0.5032434476539547, + "grad_norm": 10.9375, + "learning_rate": 8.322524262110596e-06, + "loss": 1.5692, + "step": 17300 + }, + { + "epoch": 0.5038252319865026, + "grad_norm": 13.75, + "learning_rate": 8.320584980437981e-06, + "loss": 1.6431, + "step": 17320 + }, + { + "epoch": 0.5044070163190505, + "grad_norm": 12.1875, + "learning_rate": 8.318645698765366e-06, + "loss": 1.6197, + "step": 17340 + }, + { + "epoch": 0.5049888006515985, + "grad_norm": 14.5, + "learning_rate": 8.316706417092751e-06, + "loss": 1.6078, + "step": 17360 + }, + { + "epoch": 0.5055705849841464, + "grad_norm": 12.75, + "learning_rate": 8.314767135420136e-06, + "loss": 1.7085, + "step": 17380 + }, + { + "epoch": 0.5061523693166943, + "grad_norm": 14.9375, + "learning_rate": 8.312827853747521e-06, + "loss": 1.5984, + "step": 17400 + }, + { + "epoch": 0.5067341536492422, + "grad_norm": 10.75, + "learning_rate": 8.310888572074906e-06, + "loss": 1.5446, + "step": 17420 + }, + { + "epoch": 0.5073159379817902, + "grad_norm": 10.6875, + "learning_rate": 8.308949290402292e-06, + "loss": 1.6235, + "step": 17440 + }, + { + "epoch": 0.507897722314338, + "grad_norm": 14.4375, + "learning_rate": 8.307010008729677e-06, + "loss": 1.6053, + "step": 17460 + }, + { + "epoch": 0.508479506646886, + "grad_norm": 12.5625, + "learning_rate": 8.305070727057062e-06, + "loss": 1.5673, + "step": 17480 + }, + { + "epoch": 0.509061290979434, + "grad_norm": 12.125, + "learning_rate": 8.303131445384447e-06, + "loss": 1.6572, + "step": 17500 + }, + { + "epoch": 0.5096430753119818, + "grad_norm": 14.0625, + "learning_rate": 8.301192163711832e-06, + "loss": 1.6572, + "step": 17520 + }, + { + "epoch": 0.5102248596445298, + "grad_norm": 12.9375, + "learning_rate": 8.299252882039217e-06, + "loss": 1.6663, + "step": 17540 + }, + { + "epoch": 0.5108066439770776, + "grad_norm": 11.8125, + "learning_rate": 8.297313600366602e-06, + "loss": 1.6452, + "step": 17560 + }, + { + "epoch": 0.5113884283096256, + "grad_norm": 14.625, + "learning_rate": 8.295374318693987e-06, + "loss": 1.6668, + "step": 17580 + }, + { + "epoch": 0.5119702126421736, + "grad_norm": 10.8125, + "learning_rate": 8.293435037021372e-06, + "loss": 1.5655, + "step": 17600 + }, + { + "epoch": 0.5125519969747214, + "grad_norm": 12.6875, + "learning_rate": 8.291495755348757e-06, + "loss": 1.761, + "step": 17620 + }, + { + "epoch": 0.5131337813072694, + "grad_norm": 12.75, + "learning_rate": 8.289556473676143e-06, + "loss": 1.6808, + "step": 17640 + }, + { + "epoch": 0.5137155656398174, + "grad_norm": 16.0, + "learning_rate": 8.287617192003528e-06, + "loss": 1.6404, + "step": 17660 + }, + { + "epoch": 0.5142973499723652, + "grad_norm": 11.4375, + "learning_rate": 8.285677910330913e-06, + "loss": 1.7102, + "step": 17680 + }, + { + "epoch": 0.5148791343049132, + "grad_norm": 10.75, + "learning_rate": 8.283738628658298e-06, + "loss": 1.5917, + "step": 17700 + }, + { + "epoch": 0.5154609186374611, + "grad_norm": 13.375, + "learning_rate": 8.281799346985683e-06, + "loss": 1.6896, + "step": 17720 + }, + { + "epoch": 0.516042702970009, + "grad_norm": 12.3125, + "learning_rate": 8.279860065313068e-06, + "loss": 1.6722, + "step": 17740 + }, + { + "epoch": 0.516624487302557, + "grad_norm": 12.75, + "learning_rate": 8.277920783640453e-06, + "loss": 1.6643, + "step": 17760 + }, + { + "epoch": 0.5172062716351049, + "grad_norm": 10.25, + "learning_rate": 8.275981501967838e-06, + "loss": 1.6292, + "step": 17780 + }, + { + "epoch": 0.5177880559676528, + "grad_norm": 13.0, + "learning_rate": 8.274042220295223e-06, + "loss": 1.664, + "step": 17800 + }, + { + "epoch": 0.5183698403002007, + "grad_norm": 12.25, + "learning_rate": 8.272102938622608e-06, + "loss": 1.558, + "step": 17820 + }, + { + "epoch": 0.5189516246327487, + "grad_norm": 13.0625, + "learning_rate": 8.270163656949994e-06, + "loss": 1.6129, + "step": 17840 + }, + { + "epoch": 0.5195334089652965, + "grad_norm": 14.3125, + "learning_rate": 8.268224375277379e-06, + "loss": 1.6074, + "step": 17860 + }, + { + "epoch": 0.5201151932978445, + "grad_norm": 9.625, + "learning_rate": 8.266285093604764e-06, + "loss": 1.6244, + "step": 17880 + }, + { + "epoch": 0.5206969776303925, + "grad_norm": 12.625, + "learning_rate": 8.264345811932149e-06, + "loss": 1.595, + "step": 17900 + }, + { + "epoch": 0.5212787619629403, + "grad_norm": 11.0625, + "learning_rate": 8.262406530259534e-06, + "loss": 1.5696, + "step": 17920 + }, + { + "epoch": 0.5218605462954883, + "grad_norm": 11.1875, + "learning_rate": 8.260467248586919e-06, + "loss": 1.6314, + "step": 17940 + }, + { + "epoch": 0.5224423306280361, + "grad_norm": 11.75, + "learning_rate": 8.258527966914304e-06, + "loss": 1.6623, + "step": 17960 + }, + { + "epoch": 0.5230241149605841, + "grad_norm": 10.5, + "learning_rate": 8.25658868524169e-06, + "loss": 1.6871, + "step": 17980 + }, + { + "epoch": 0.5236058992931321, + "grad_norm": 12.375, + "learning_rate": 8.254649403569074e-06, + "loss": 1.6504, + "step": 18000 + }, + { + "epoch": 0.5241876836256799, + "grad_norm": 12.1875, + "learning_rate": 8.25271012189646e-06, + "loss": 1.5599, + "step": 18020 + }, + { + "epoch": 0.5247694679582279, + "grad_norm": 15.25, + "learning_rate": 8.250770840223844e-06, + "loss": 1.6048, + "step": 18040 + }, + { + "epoch": 0.5253512522907758, + "grad_norm": 14.3125, + "learning_rate": 8.24883155855123e-06, + "loss": 1.6068, + "step": 18060 + }, + { + "epoch": 0.5259330366233237, + "grad_norm": 11.625, + "learning_rate": 8.246892276878615e-06, + "loss": 1.6132, + "step": 18080 + }, + { + "epoch": 0.5265148209558717, + "grad_norm": 12.875, + "learning_rate": 8.244952995206e-06, + "loss": 1.5648, + "step": 18100 + }, + { + "epoch": 0.5270966052884196, + "grad_norm": 13.1875, + "learning_rate": 8.243013713533385e-06, + "loss": 1.6553, + "step": 18120 + }, + { + "epoch": 0.5276783896209675, + "grad_norm": 11.5, + "learning_rate": 8.24107443186077e-06, + "loss": 1.5852, + "step": 18140 + }, + { + "epoch": 0.5282601739535154, + "grad_norm": 13.625, + "learning_rate": 8.239135150188155e-06, + "loss": 1.6632, + "step": 18160 + }, + { + "epoch": 0.5288419582860634, + "grad_norm": 14.5625, + "learning_rate": 8.23719586851554e-06, + "loss": 1.6971, + "step": 18180 + }, + { + "epoch": 0.5294237426186112, + "grad_norm": 17.75, + "learning_rate": 8.235256586842925e-06, + "loss": 1.7631, + "step": 18200 + }, + { + "epoch": 0.5300055269511592, + "grad_norm": 14.625, + "learning_rate": 8.23331730517031e-06, + "loss": 1.6325, + "step": 18220 + }, + { + "epoch": 0.5305873112837072, + "grad_norm": 13.3125, + "learning_rate": 8.231378023497695e-06, + "loss": 1.6736, + "step": 18240 + }, + { + "epoch": 0.531169095616255, + "grad_norm": 12.625, + "learning_rate": 8.22943874182508e-06, + "loss": 1.6393, + "step": 18260 + }, + { + "epoch": 0.531750879948803, + "grad_norm": 14.875, + "learning_rate": 8.227499460152466e-06, + "loss": 1.6378, + "step": 18280 + }, + { + "epoch": 0.532332664281351, + "grad_norm": 14.75, + "learning_rate": 8.22556017847985e-06, + "loss": 1.6205, + "step": 18300 + }, + { + "epoch": 0.5329144486138988, + "grad_norm": 12.625, + "learning_rate": 8.223620896807236e-06, + "loss": 1.6166, + "step": 18320 + }, + { + "epoch": 0.5334962329464468, + "grad_norm": 13.5625, + "learning_rate": 8.221681615134621e-06, + "loss": 1.6742, + "step": 18340 + }, + { + "epoch": 0.5340780172789946, + "grad_norm": 13.25, + "learning_rate": 8.219742333462006e-06, + "loss": 1.6505, + "step": 18360 + }, + { + "epoch": 0.5346598016115426, + "grad_norm": 12.375, + "learning_rate": 8.21780305178939e-06, + "loss": 1.6826, + "step": 18380 + }, + { + "epoch": 0.5352415859440905, + "grad_norm": 12.125, + "learning_rate": 8.215863770116775e-06, + "loss": 1.657, + "step": 18400 + }, + { + "epoch": 0.5358233702766384, + "grad_norm": 14.0625, + "learning_rate": 8.21392448844416e-06, + "loss": 1.5712, + "step": 18420 + }, + { + "epoch": 0.5364051546091864, + "grad_norm": 11.6875, + "learning_rate": 8.211985206771545e-06, + "loss": 1.7179, + "step": 18440 + }, + { + "epoch": 0.5369869389417343, + "grad_norm": 13.5625, + "learning_rate": 8.21004592509893e-06, + "loss": 1.6749, + "step": 18460 + }, + { + "epoch": 0.5375687232742822, + "grad_norm": 13.0625, + "learning_rate": 8.208106643426315e-06, + "loss": 1.6932, + "step": 18480 + }, + { + "epoch": 0.5381505076068301, + "grad_norm": 8.8125, + "learning_rate": 8.2061673617537e-06, + "loss": 1.6385, + "step": 18500 + }, + { + "epoch": 0.5387322919393781, + "grad_norm": 12.125, + "learning_rate": 8.204228080081085e-06, + "loss": 1.683, + "step": 18520 + }, + { + "epoch": 0.539314076271926, + "grad_norm": 13.125, + "learning_rate": 8.20228879840847e-06, + "loss": 1.59, + "step": 18540 + }, + { + "epoch": 0.5398958606044739, + "grad_norm": 12.4375, + "learning_rate": 8.200349516735855e-06, + "loss": 1.5987, + "step": 18560 + }, + { + "epoch": 0.5404776449370219, + "grad_norm": 12.625, + "learning_rate": 8.19841023506324e-06, + "loss": 1.63, + "step": 18580 + }, + { + "epoch": 0.5410594292695697, + "grad_norm": 11.6875, + "learning_rate": 8.196470953390626e-06, + "loss": 1.698, + "step": 18600 + }, + { + "epoch": 0.5416412136021177, + "grad_norm": 12.1875, + "learning_rate": 8.19453167171801e-06, + "loss": 1.6323, + "step": 18620 + }, + { + "epoch": 0.5422229979346657, + "grad_norm": 11.5625, + "learning_rate": 8.192592390045396e-06, + "loss": 1.6039, + "step": 18640 + }, + { + "epoch": 0.5428047822672135, + "grad_norm": 10.3125, + "learning_rate": 8.19065310837278e-06, + "loss": 1.6931, + "step": 18660 + }, + { + "epoch": 0.5433865665997615, + "grad_norm": 10.875, + "learning_rate": 8.188713826700166e-06, + "loss": 1.6977, + "step": 18680 + }, + { + "epoch": 0.5439683509323094, + "grad_norm": 9.8125, + "learning_rate": 8.186774545027551e-06, + "loss": 1.5836, + "step": 18700 + }, + { + "epoch": 0.5445501352648573, + "grad_norm": 12.6875, + "learning_rate": 8.184835263354936e-06, + "loss": 1.6545, + "step": 18720 + }, + { + "epoch": 0.5451319195974053, + "grad_norm": 10.0, + "learning_rate": 8.182895981682321e-06, + "loss": 1.5713, + "step": 18740 + }, + { + "epoch": 0.5457137039299532, + "grad_norm": 11.25, + "learning_rate": 8.180956700009706e-06, + "loss": 1.5724, + "step": 18760 + }, + { + "epoch": 0.5462954882625011, + "grad_norm": 11.5, + "learning_rate": 8.179017418337091e-06, + "loss": 1.7166, + "step": 18780 + }, + { + "epoch": 0.546877272595049, + "grad_norm": 11.9375, + "learning_rate": 8.177078136664477e-06, + "loss": 1.735, + "step": 18800 + }, + { + "epoch": 0.5474590569275969, + "grad_norm": 12.0625, + "learning_rate": 8.175138854991862e-06, + "loss": 1.6193, + "step": 18820 + }, + { + "epoch": 0.5480408412601449, + "grad_norm": 10.875, + "learning_rate": 8.173199573319247e-06, + "loss": 1.6175, + "step": 18840 + }, + { + "epoch": 0.5486226255926928, + "grad_norm": 12.0625, + "learning_rate": 8.171260291646632e-06, + "loss": 1.748, + "step": 18860 + }, + { + "epoch": 0.5492044099252407, + "grad_norm": 15.9375, + "learning_rate": 8.169321009974017e-06, + "loss": 1.66, + "step": 18880 + }, + { + "epoch": 0.5497861942577886, + "grad_norm": 10.6875, + "learning_rate": 8.167381728301402e-06, + "loss": 1.7462, + "step": 18900 + }, + { + "epoch": 0.5503679785903366, + "grad_norm": 12.125, + "learning_rate": 8.165442446628787e-06, + "loss": 1.6313, + "step": 18920 + }, + { + "epoch": 0.5509497629228844, + "grad_norm": 10.8125, + "learning_rate": 8.163503164956172e-06, + "loss": 1.6117, + "step": 18940 + }, + { + "epoch": 0.5515315472554324, + "grad_norm": 10.875, + "learning_rate": 8.161563883283557e-06, + "loss": 1.56, + "step": 18960 + }, + { + "epoch": 0.5521133315879804, + "grad_norm": 12.875, + "learning_rate": 8.159624601610942e-06, + "loss": 1.6744, + "step": 18980 + }, + { + "epoch": 0.5526951159205282, + "grad_norm": 13.25, + "learning_rate": 8.157685319938327e-06, + "loss": 1.6503, + "step": 19000 + }, + { + "epoch": 0.5532769002530762, + "grad_norm": 10.125, + "learning_rate": 8.155746038265713e-06, + "loss": 1.6612, + "step": 19020 + }, + { + "epoch": 0.5538586845856242, + "grad_norm": 14.6875, + "learning_rate": 8.153806756593098e-06, + "loss": 1.6828, + "step": 19040 + }, + { + "epoch": 0.554440468918172, + "grad_norm": 14.75, + "learning_rate": 8.151867474920483e-06, + "loss": 1.6411, + "step": 19060 + }, + { + "epoch": 0.55502225325072, + "grad_norm": 10.75, + "learning_rate": 8.149928193247868e-06, + "loss": 1.614, + "step": 19080 + }, + { + "epoch": 0.5556040375832679, + "grad_norm": 12.25, + "learning_rate": 8.147988911575253e-06, + "loss": 1.5938, + "step": 19100 + }, + { + "epoch": 0.5561858219158158, + "grad_norm": 13.0, + "learning_rate": 8.146049629902638e-06, + "loss": 1.6811, + "step": 19120 + }, + { + "epoch": 0.5567676062483637, + "grad_norm": 11.375, + "learning_rate": 8.144110348230023e-06, + "loss": 1.6226, + "step": 19140 + }, + { + "epoch": 0.5573493905809117, + "grad_norm": 10.5625, + "learning_rate": 8.142171066557408e-06, + "loss": 1.6727, + "step": 19160 + }, + { + "epoch": 0.5579311749134596, + "grad_norm": 15.3125, + "learning_rate": 8.140231784884793e-06, + "loss": 1.6248, + "step": 19180 + }, + { + "epoch": 0.5585129592460075, + "grad_norm": 10.1875, + "learning_rate": 8.138292503212178e-06, + "loss": 1.6383, + "step": 19200 + }, + { + "epoch": 0.5590947435785554, + "grad_norm": 10.4375, + "learning_rate": 8.136353221539564e-06, + "loss": 1.6951, + "step": 19220 + }, + { + "epoch": 0.5596765279111033, + "grad_norm": 12.0625, + "learning_rate": 8.134413939866949e-06, + "loss": 1.722, + "step": 19240 + }, + { + "epoch": 0.5602583122436513, + "grad_norm": 14.625, + "learning_rate": 8.132474658194334e-06, + "loss": 1.6129, + "step": 19260 + }, + { + "epoch": 0.5608400965761992, + "grad_norm": 13.0, + "learning_rate": 8.130535376521719e-06, + "loss": 1.6803, + "step": 19280 + }, + { + "epoch": 0.5614218809087471, + "grad_norm": 12.75, + "learning_rate": 8.128596094849104e-06, + "loss": 1.6305, + "step": 19300 + }, + { + "epoch": 0.5620036652412951, + "grad_norm": 13.3125, + "learning_rate": 8.126656813176489e-06, + "loss": 1.5954, + "step": 19320 + }, + { + "epoch": 0.5625854495738429, + "grad_norm": 11.375, + "learning_rate": 8.124717531503874e-06, + "loss": 1.5627, + "step": 19340 + }, + { + "epoch": 0.5631672339063909, + "grad_norm": 12.0625, + "learning_rate": 8.12277824983126e-06, + "loss": 1.6935, + "step": 19360 + }, + { + "epoch": 0.5637490182389389, + "grad_norm": 11.25, + "learning_rate": 8.120838968158644e-06, + "loss": 1.6396, + "step": 19380 + }, + { + "epoch": 0.5643308025714867, + "grad_norm": 13.0625, + "learning_rate": 8.11889968648603e-06, + "loss": 1.594, + "step": 19400 + }, + { + "epoch": 0.5649125869040347, + "grad_norm": 10.8125, + "learning_rate": 8.116960404813415e-06, + "loss": 1.6131, + "step": 19420 + }, + { + "epoch": 0.5654943712365826, + "grad_norm": 12.125, + "learning_rate": 8.1150211231408e-06, + "loss": 1.6273, + "step": 19440 + }, + { + "epoch": 0.5660761555691305, + "grad_norm": 15.8125, + "learning_rate": 8.113081841468185e-06, + "loss": 1.7437, + "step": 19460 + }, + { + "epoch": 0.5666579399016785, + "grad_norm": 11.8125, + "learning_rate": 8.11114255979557e-06, + "loss": 1.7092, + "step": 19480 + }, + { + "epoch": 0.5672397242342264, + "grad_norm": 16.75, + "learning_rate": 8.109203278122955e-06, + "loss": 1.6279, + "step": 19500 + }, + { + "epoch": 0.5678215085667743, + "grad_norm": 13.75, + "learning_rate": 8.10726399645034e-06, + "loss": 1.7001, + "step": 19520 + }, + { + "epoch": 0.5684032928993222, + "grad_norm": 13.9375, + "learning_rate": 8.105324714777725e-06, + "loss": 1.694, + "step": 19540 + }, + { + "epoch": 0.5689850772318702, + "grad_norm": 14.6875, + "learning_rate": 8.10338543310511e-06, + "loss": 1.6262, + "step": 19560 + }, + { + "epoch": 0.569566861564418, + "grad_norm": 12.75, + "learning_rate": 8.101446151432495e-06, + "loss": 1.6449, + "step": 19580 + }, + { + "epoch": 0.570148645896966, + "grad_norm": 12.9375, + "learning_rate": 8.09950686975988e-06, + "loss": 1.6502, + "step": 19600 + }, + { + "epoch": 0.570730430229514, + "grad_norm": 16.0, + "learning_rate": 8.097567588087266e-06, + "loss": 1.6979, + "step": 19620 + }, + { + "epoch": 0.5713122145620618, + "grad_norm": 14.25, + "learning_rate": 8.09562830641465e-06, + "loss": 1.6853, + "step": 19640 + }, + { + "epoch": 0.5718939988946098, + "grad_norm": 11.75, + "learning_rate": 8.093689024742036e-06, + "loss": 1.7246, + "step": 19660 + }, + { + "epoch": 0.5724757832271576, + "grad_norm": 12.8125, + "learning_rate": 8.09174974306942e-06, + "loss": 1.6622, + "step": 19680 + }, + { + "epoch": 0.5730575675597056, + "grad_norm": 13.25, + "learning_rate": 8.089810461396806e-06, + "loss": 1.6474, + "step": 19700 + }, + { + "epoch": 0.5736393518922536, + "grad_norm": 13.125, + "learning_rate": 8.087871179724191e-06, + "loss": 1.6399, + "step": 19720 + }, + { + "epoch": 0.5742211362248014, + "grad_norm": 8.6875, + "learning_rate": 8.085931898051576e-06, + "loss": 1.5597, + "step": 19740 + }, + { + "epoch": 0.5748029205573494, + "grad_norm": 13.9375, + "learning_rate": 8.083992616378961e-06, + "loss": 1.6188, + "step": 19760 + }, + { + "epoch": 0.5753847048898973, + "grad_norm": 11.3125, + "learning_rate": 8.082053334706346e-06, + "loss": 1.6458, + "step": 19780 + }, + { + "epoch": 0.5759664892224452, + "grad_norm": 14.6875, + "learning_rate": 8.080114053033731e-06, + "loss": 1.6713, + "step": 19800 + }, + { + "epoch": 0.5765482735549932, + "grad_norm": 12.9375, + "learning_rate": 8.078174771361116e-06, + "loss": 1.6076, + "step": 19820 + }, + { + "epoch": 0.5771300578875411, + "grad_norm": 12.875, + "learning_rate": 8.076235489688502e-06, + "loss": 1.6367, + "step": 19840 + }, + { + "epoch": 0.577711842220089, + "grad_norm": 13.375, + "learning_rate": 8.074296208015887e-06, + "loss": 1.6454, + "step": 19860 + }, + { + "epoch": 0.5782936265526369, + "grad_norm": 14.625, + "learning_rate": 8.072356926343272e-06, + "loss": 1.7048, + "step": 19880 + }, + { + "epoch": 0.5788754108851849, + "grad_norm": 13.9375, + "learning_rate": 8.070417644670657e-06, + "loss": 1.5943, + "step": 19900 + }, + { + "epoch": 0.5794571952177328, + "grad_norm": 14.5625, + "learning_rate": 8.068478362998042e-06, + "loss": 1.6203, + "step": 19920 + }, + { + "epoch": 0.5800389795502807, + "grad_norm": 14.4375, + "learning_rate": 8.066539081325427e-06, + "loss": 1.712, + "step": 19940 + }, + { + "epoch": 0.5806207638828287, + "grad_norm": 13.4375, + "learning_rate": 8.064599799652812e-06, + "loss": 1.6642, + "step": 19960 + }, + { + "epoch": 0.5812025482153765, + "grad_norm": 11.75, + "learning_rate": 8.062660517980197e-06, + "loss": 1.5492, + "step": 19980 + }, + { + "epoch": 0.5817843325479245, + "grad_norm": 11.75, + "learning_rate": 8.060721236307582e-06, + "loss": 1.5491, + "step": 20000 + }, + { + "epoch": 0.5823661168804725, + "grad_norm": 10.0, + "learning_rate": 8.058781954634966e-06, + "loss": 1.6416, + "step": 20020 + }, + { + "epoch": 0.5829479012130203, + "grad_norm": 13.6875, + "learning_rate": 8.056842672962351e-06, + "loss": 1.6852, + "step": 20040 + }, + { + "epoch": 0.5835296855455683, + "grad_norm": 17.0, + "learning_rate": 8.054903391289736e-06, + "loss": 1.648, + "step": 20060 + }, + { + "epoch": 0.5841114698781161, + "grad_norm": 12.9375, + "learning_rate": 8.052964109617121e-06, + "loss": 1.7016, + "step": 20080 + }, + { + "epoch": 0.5846932542106641, + "grad_norm": 13.5, + "learning_rate": 8.051024827944506e-06, + "loss": 1.6324, + "step": 20100 + }, + { + "epoch": 0.5852750385432121, + "grad_norm": 13.0, + "learning_rate": 8.049085546271891e-06, + "loss": 1.649, + "step": 20120 + }, + { + "epoch": 0.5858568228757599, + "grad_norm": 10.8125, + "learning_rate": 8.047146264599276e-06, + "loss": 1.6909, + "step": 20140 + }, + { + "epoch": 0.5864386072083079, + "grad_norm": 14.0, + "learning_rate": 8.045206982926661e-06, + "loss": 1.6395, + "step": 20160 + }, + { + "epoch": 0.5870203915408558, + "grad_norm": 12.0625, + "learning_rate": 8.043267701254047e-06, + "loss": 1.713, + "step": 20180 + }, + { + "epoch": 0.5876021758734037, + "grad_norm": 12.75, + "learning_rate": 8.041328419581432e-06, + "loss": 1.606, + "step": 20200 + }, + { + "epoch": 0.5881839602059517, + "grad_norm": 12.8125, + "learning_rate": 8.039389137908817e-06, + "loss": 1.6471, + "step": 20220 + }, + { + "epoch": 0.5887657445384996, + "grad_norm": 14.75, + "learning_rate": 8.037449856236202e-06, + "loss": 1.7117, + "step": 20240 + }, + { + "epoch": 0.5893475288710475, + "grad_norm": 12.4375, + "learning_rate": 8.035510574563587e-06, + "loss": 1.5581, + "step": 20260 + }, + { + "epoch": 0.5899293132035954, + "grad_norm": 12.0, + "learning_rate": 8.033571292890972e-06, + "loss": 1.6135, + "step": 20280 + }, + { + "epoch": 0.5905110975361434, + "grad_norm": 11.25, + "learning_rate": 8.031632011218357e-06, + "loss": 1.6054, + "step": 20300 + }, + { + "epoch": 0.5910928818686912, + "grad_norm": 13.4375, + "learning_rate": 8.029692729545742e-06, + "loss": 1.771, + "step": 20320 + }, + { + "epoch": 0.5916746662012392, + "grad_norm": 11.25, + "learning_rate": 8.027753447873127e-06, + "loss": 1.6584, + "step": 20340 + }, + { + "epoch": 0.5922564505337872, + "grad_norm": 9.875, + "learning_rate": 8.025814166200512e-06, + "loss": 1.6999, + "step": 20360 + }, + { + "epoch": 0.592838234866335, + "grad_norm": 11.4375, + "learning_rate": 8.023874884527898e-06, + "loss": 1.6357, + "step": 20380 + }, + { + "epoch": 0.593420019198883, + "grad_norm": 11.8125, + "learning_rate": 8.021935602855283e-06, + "loss": 1.5729, + "step": 20400 + }, + { + "epoch": 0.594001803531431, + "grad_norm": 13.5, + "learning_rate": 8.019996321182668e-06, + "loss": 1.6684, + "step": 20420 + }, + { + "epoch": 0.5945835878639788, + "grad_norm": 11.875, + "learning_rate": 8.018057039510053e-06, + "loss": 1.6585, + "step": 20440 + }, + { + "epoch": 0.5951653721965268, + "grad_norm": 12.625, + "learning_rate": 8.016117757837438e-06, + "loss": 1.6137, + "step": 20460 + }, + { + "epoch": 0.5957471565290746, + "grad_norm": 10.75, + "learning_rate": 8.014178476164823e-06, + "loss": 1.647, + "step": 20480 + }, + { + "epoch": 0.5963289408616226, + "grad_norm": 12.5625, + "learning_rate": 8.012239194492208e-06, + "loss": 1.6484, + "step": 20500 + }, + { + "epoch": 0.5969107251941705, + "grad_norm": 14.0625, + "learning_rate": 8.010299912819593e-06, + "loss": 1.7112, + "step": 20520 + }, + { + "epoch": 0.5974925095267184, + "grad_norm": 9.75, + "learning_rate": 8.008360631146978e-06, + "loss": 1.6521, + "step": 20540 + }, + { + "epoch": 0.5980742938592664, + "grad_norm": 11.5, + "learning_rate": 8.006421349474363e-06, + "loss": 1.6941, + "step": 20560 + }, + { + "epoch": 0.5986560781918143, + "grad_norm": 11.125, + "learning_rate": 8.004482067801747e-06, + "loss": 1.7052, + "step": 20580 + }, + { + "epoch": 0.5992378625243622, + "grad_norm": 11.5, + "learning_rate": 8.002542786129132e-06, + "loss": 1.5777, + "step": 20600 + }, + { + "epoch": 0.5998196468569101, + "grad_norm": 9.8125, + "learning_rate": 8.000603504456517e-06, + "loss": 1.7117, + "step": 20620 + }, + { + "epoch": 0.6004014311894581, + "grad_norm": 13.1875, + "learning_rate": 7.998664222783902e-06, + "loss": 1.6213, + "step": 20640 + }, + { + "epoch": 0.600983215522006, + "grad_norm": 13.375, + "learning_rate": 7.996724941111287e-06, + "loss": 1.6577, + "step": 20660 + }, + { + "epoch": 0.6015649998545539, + "grad_norm": 13.4375, + "learning_rate": 7.994785659438672e-06, + "loss": 1.6816, + "step": 20680 + }, + { + "epoch": 0.6021467841871019, + "grad_norm": 12.5, + "learning_rate": 7.992846377766057e-06, + "loss": 1.536, + "step": 20700 + }, + { + "epoch": 0.6027285685196497, + "grad_norm": 11.5, + "learning_rate": 7.990907096093442e-06, + "loss": 1.5839, + "step": 20720 + }, + { + "epoch": 0.6033103528521977, + "grad_norm": 10.625, + "learning_rate": 7.988967814420828e-06, + "loss": 1.7227, + "step": 20740 + }, + { + "epoch": 0.6038921371847457, + "grad_norm": 14.0625, + "learning_rate": 7.987028532748213e-06, + "loss": 1.6775, + "step": 20760 + }, + { + "epoch": 0.6044739215172935, + "grad_norm": 12.8125, + "learning_rate": 7.985089251075598e-06, + "loss": 1.6013, + "step": 20780 + }, + { + "epoch": 0.6050557058498415, + "grad_norm": 14.3125, + "learning_rate": 7.983149969402983e-06, + "loss": 1.5694, + "step": 20800 + }, + { + "epoch": 0.6056374901823894, + "grad_norm": 13.875, + "learning_rate": 7.981210687730368e-06, + "loss": 1.5985, + "step": 20820 + }, + { + "epoch": 0.6062192745149373, + "grad_norm": 10.375, + "learning_rate": 7.979271406057753e-06, + "loss": 1.586, + "step": 20840 + }, + { + "epoch": 0.6068010588474853, + "grad_norm": 11.25, + "learning_rate": 7.977332124385138e-06, + "loss": 1.5524, + "step": 20860 + }, + { + "epoch": 0.6073828431800332, + "grad_norm": 12.4375, + "learning_rate": 7.975392842712523e-06, + "loss": 1.5534, + "step": 20880 + }, + { + "epoch": 0.6079646275125811, + "grad_norm": 12.75, + "learning_rate": 7.973453561039908e-06, + "loss": 1.649, + "step": 20900 + }, + { + "epoch": 0.608546411845129, + "grad_norm": 11.8125, + "learning_rate": 7.971514279367293e-06, + "loss": 1.6024, + "step": 20920 + }, + { + "epoch": 0.6091281961776769, + "grad_norm": 15.875, + "learning_rate": 7.969574997694679e-06, + "loss": 1.6998, + "step": 20940 + }, + { + "epoch": 0.6097099805102248, + "grad_norm": 15.0625, + "learning_rate": 7.967635716022064e-06, + "loss": 1.6045, + "step": 20960 + }, + { + "epoch": 0.6102917648427728, + "grad_norm": 11.0625, + "learning_rate": 7.965696434349449e-06, + "loss": 1.6288, + "step": 20980 + }, + { + "epoch": 0.6108735491753207, + "grad_norm": 13.5625, + "learning_rate": 7.963757152676834e-06, + "loss": 1.5685, + "step": 21000 + }, + { + "epoch": 0.6114553335078686, + "grad_norm": 12.0625, + "learning_rate": 7.961817871004219e-06, + "loss": 1.7005, + "step": 21020 + }, + { + "epoch": 0.6120371178404166, + "grad_norm": 16.125, + "learning_rate": 7.959878589331604e-06, + "loss": 1.58, + "step": 21040 + }, + { + "epoch": 0.6126189021729644, + "grad_norm": 11.5, + "learning_rate": 7.957939307658989e-06, + "loss": 1.6479, + "step": 21060 + }, + { + "epoch": 0.6132006865055124, + "grad_norm": 12.375, + "learning_rate": 7.956000025986374e-06, + "loss": 1.5816, + "step": 21080 + }, + { + "epoch": 0.6137824708380604, + "grad_norm": 11.1875, + "learning_rate": 7.95406074431376e-06, + "loss": 1.6428, + "step": 21100 + }, + { + "epoch": 0.6143642551706082, + "grad_norm": 11.25, + "learning_rate": 7.952121462641144e-06, + "loss": 1.6375, + "step": 21120 + }, + { + "epoch": 0.6149460395031562, + "grad_norm": 12.4375, + "learning_rate": 7.95018218096853e-06, + "loss": 1.6391, + "step": 21140 + }, + { + "epoch": 0.6155278238357041, + "grad_norm": 13.875, + "learning_rate": 7.948242899295915e-06, + "loss": 1.6711, + "step": 21160 + }, + { + "epoch": 0.616109608168252, + "grad_norm": 14.875, + "learning_rate": 7.9463036176233e-06, + "loss": 1.6958, + "step": 21180 + }, + { + "epoch": 0.6166913925008, + "grad_norm": 13.125, + "learning_rate": 7.944364335950685e-06, + "loss": 1.5907, + "step": 21200 + }, + { + "epoch": 0.6172731768333479, + "grad_norm": 12.4375, + "learning_rate": 7.94242505427807e-06, + "loss": 1.6955, + "step": 21220 + }, + { + "epoch": 0.6178549611658958, + "grad_norm": 12.1875, + "learning_rate": 7.940485772605455e-06, + "loss": 1.646, + "step": 21240 + }, + { + "epoch": 0.6184367454984437, + "grad_norm": 13.1875, + "learning_rate": 7.93854649093284e-06, + "loss": 1.7521, + "step": 21260 + }, + { + "epoch": 0.6190185298309917, + "grad_norm": 12.625, + "learning_rate": 7.936607209260225e-06, + "loss": 1.6426, + "step": 21280 + }, + { + "epoch": 0.6196003141635396, + "grad_norm": 13.125, + "learning_rate": 7.93466792758761e-06, + "loss": 1.5326, + "step": 21300 + }, + { + "epoch": 0.6201820984960875, + "grad_norm": 12.5, + "learning_rate": 7.932728645914995e-06, + "loss": 1.5717, + "step": 21320 + }, + { + "epoch": 0.6207638828286354, + "grad_norm": 12.0, + "learning_rate": 7.93078936424238e-06, + "loss": 1.5635, + "step": 21340 + }, + { + "epoch": 0.6213456671611833, + "grad_norm": 11.8125, + "learning_rate": 7.928850082569766e-06, + "loss": 1.7181, + "step": 21360 + }, + { + "epoch": 0.6219274514937313, + "grad_norm": 14.3125, + "learning_rate": 7.92691080089715e-06, + "loss": 1.6385, + "step": 21380 + }, + { + "epoch": 0.6225092358262792, + "grad_norm": 13.0, + "learning_rate": 7.924971519224536e-06, + "loss": 1.6784, + "step": 21400 + }, + { + "epoch": 0.6230910201588271, + "grad_norm": 15.5, + "learning_rate": 7.923032237551921e-06, + "loss": 1.7064, + "step": 21420 + }, + { + "epoch": 0.6236728044913751, + "grad_norm": 12.125, + "learning_rate": 7.921092955879306e-06, + "loss": 1.6372, + "step": 21440 + }, + { + "epoch": 0.6242545888239229, + "grad_norm": 11.75, + "learning_rate": 7.919153674206691e-06, + "loss": 1.7176, + "step": 21460 + }, + { + "epoch": 0.6248363731564709, + "grad_norm": 11.875, + "learning_rate": 7.917214392534076e-06, + "loss": 1.6417, + "step": 21480 + }, + { + "epoch": 0.6254181574890189, + "grad_norm": 14.375, + "learning_rate": 7.915275110861461e-06, + "loss": 1.5704, + "step": 21500 + }, + { + "epoch": 0.6259999418215667, + "grad_norm": 10.0625, + "learning_rate": 7.913335829188846e-06, + "loss": 1.5801, + "step": 21520 + }, + { + "epoch": 0.6265817261541147, + "grad_norm": 11.5625, + "learning_rate": 7.911396547516231e-06, + "loss": 1.6788, + "step": 21540 + }, + { + "epoch": 0.6271635104866626, + "grad_norm": 12.625, + "learning_rate": 7.909457265843617e-06, + "loss": 1.6491, + "step": 21560 + }, + { + "epoch": 0.6277452948192105, + "grad_norm": 13.3125, + "learning_rate": 7.907517984171002e-06, + "loss": 1.6589, + "step": 21580 + }, + { + "epoch": 0.6283270791517584, + "grad_norm": 11.9375, + "learning_rate": 7.905578702498387e-06, + "loss": 1.6245, + "step": 21600 + }, + { + "epoch": 0.6289088634843064, + "grad_norm": 11.5, + "learning_rate": 7.903639420825772e-06, + "loss": 1.6734, + "step": 21620 + }, + { + "epoch": 0.6294906478168543, + "grad_norm": 14.6875, + "learning_rate": 7.901700139153157e-06, + "loss": 1.6331, + "step": 21640 + }, + { + "epoch": 0.6300724321494022, + "grad_norm": 9.5625, + "learning_rate": 7.899760857480542e-06, + "loss": 1.6249, + "step": 21660 + }, + { + "epoch": 0.6306542164819502, + "grad_norm": 10.1875, + "learning_rate": 7.897821575807927e-06, + "loss": 1.6278, + "step": 21680 + }, + { + "epoch": 0.631236000814498, + "grad_norm": 13.5625, + "learning_rate": 7.895882294135312e-06, + "loss": 1.689, + "step": 21700 + }, + { + "epoch": 0.631817785147046, + "grad_norm": 11.9375, + "learning_rate": 7.893943012462697e-06, + "loss": 1.6893, + "step": 21720 + }, + { + "epoch": 0.6323995694795939, + "grad_norm": 11.5625, + "learning_rate": 7.892003730790082e-06, + "loss": 1.6971, + "step": 21740 + }, + { + "epoch": 0.6329813538121418, + "grad_norm": 11.6875, + "learning_rate": 7.890064449117468e-06, + "loss": 1.5925, + "step": 21760 + }, + { + "epoch": 0.6335631381446898, + "grad_norm": 12.125, + "learning_rate": 7.888125167444853e-06, + "loss": 1.6094, + "step": 21780 + }, + { + "epoch": 0.6341449224772376, + "grad_norm": 11.4375, + "learning_rate": 7.886185885772238e-06, + "loss": 1.6971, + "step": 21800 + }, + { + "epoch": 0.6347267068097856, + "grad_norm": 12.8125, + "learning_rate": 7.884246604099623e-06, + "loss": 1.6558, + "step": 21820 + }, + { + "epoch": 0.6353084911423336, + "grad_norm": 13.6875, + "learning_rate": 7.882307322427008e-06, + "loss": 1.6819, + "step": 21840 + }, + { + "epoch": 0.6358902754748814, + "grad_norm": 13.5625, + "learning_rate": 7.880368040754393e-06, + "loss": 1.5455, + "step": 21860 + }, + { + "epoch": 0.6364720598074294, + "grad_norm": 12.1875, + "learning_rate": 7.878428759081778e-06, + "loss": 1.6307, + "step": 21880 + }, + { + "epoch": 0.6370538441399773, + "grad_norm": 11.5625, + "learning_rate": 7.876489477409163e-06, + "loss": 1.6101, + "step": 21900 + }, + { + "epoch": 0.6376356284725252, + "grad_norm": 12.6875, + "learning_rate": 7.874550195736548e-06, + "loss": 1.5181, + "step": 21920 + }, + { + "epoch": 0.6382174128050732, + "grad_norm": 12.4375, + "learning_rate": 7.872610914063933e-06, + "loss": 1.6125, + "step": 21940 + }, + { + "epoch": 0.6387991971376211, + "grad_norm": 13.1875, + "learning_rate": 7.870671632391319e-06, + "loss": 1.6586, + "step": 21960 + }, + { + "epoch": 0.639380981470169, + "grad_norm": 12.375, + "learning_rate": 7.868732350718704e-06, + "loss": 1.6246, + "step": 21980 + }, + { + "epoch": 0.6399627658027169, + "grad_norm": 11.875, + "learning_rate": 7.866793069046089e-06, + "loss": 1.5835, + "step": 22000 + }, + { + "epoch": 0.6405445501352649, + "grad_norm": 12.5, + "learning_rate": 7.864853787373474e-06, + "loss": 1.6767, + "step": 22020 + }, + { + "epoch": 0.6411263344678128, + "grad_norm": 12.25, + "learning_rate": 7.862914505700859e-06, + "loss": 1.5982, + "step": 22040 + }, + { + "epoch": 0.6417081188003607, + "grad_norm": 10.1875, + "learning_rate": 7.860975224028244e-06, + "loss": 1.6858, + "step": 22060 + }, + { + "epoch": 0.6422899031329087, + "grad_norm": 13.75, + "learning_rate": 7.859035942355629e-06, + "loss": 1.6265, + "step": 22080 + }, + { + "epoch": 0.6428716874654565, + "grad_norm": 13.5625, + "learning_rate": 7.857096660683014e-06, + "loss": 1.6599, + "step": 22100 + }, + { + "epoch": 0.6434534717980045, + "grad_norm": 12.3125, + "learning_rate": 7.8551573790104e-06, + "loss": 1.6791, + "step": 22120 + }, + { + "epoch": 0.6440352561305525, + "grad_norm": 13.625, + "learning_rate": 7.853218097337784e-06, + "loss": 1.6365, + "step": 22140 + }, + { + "epoch": 0.6446170404631003, + "grad_norm": 13.8125, + "learning_rate": 7.85127881566517e-06, + "loss": 1.6622, + "step": 22160 + }, + { + "epoch": 0.6451988247956483, + "grad_norm": 12.625, + "learning_rate": 7.849339533992555e-06, + "loss": 1.6112, + "step": 22180 + }, + { + "epoch": 0.6457806091281961, + "grad_norm": 12.0625, + "learning_rate": 7.84740025231994e-06, + "loss": 1.7412, + "step": 22200 + }, + { + "epoch": 0.6463623934607441, + "grad_norm": 12.5625, + "learning_rate": 7.845460970647323e-06, + "loss": 1.6402, + "step": 22220 + }, + { + "epoch": 0.646944177793292, + "grad_norm": 13.5625, + "learning_rate": 7.843521688974708e-06, + "loss": 1.6437, + "step": 22240 + }, + { + "epoch": 0.6475259621258399, + "grad_norm": 10.9375, + "learning_rate": 7.841582407302093e-06, + "loss": 1.5291, + "step": 22260 + }, + { + "epoch": 0.6481077464583879, + "grad_norm": 13.125, + "learning_rate": 7.839643125629478e-06, + "loss": 1.6853, + "step": 22280 + }, + { + "epoch": 0.6486895307909358, + "grad_norm": 11.375, + "learning_rate": 7.837703843956863e-06, + "loss": 1.6956, + "step": 22300 + }, + { + "epoch": 0.6492713151234837, + "grad_norm": 15.875, + "learning_rate": 7.835764562284249e-06, + "loss": 1.5831, + "step": 22320 + }, + { + "epoch": 0.6498530994560316, + "grad_norm": 13.1875, + "learning_rate": 7.833825280611634e-06, + "loss": 1.6551, + "step": 22340 + }, + { + "epoch": 0.6504348837885796, + "grad_norm": 11.875, + "learning_rate": 7.831885998939019e-06, + "loss": 1.5802, + "step": 22360 + }, + { + "epoch": 0.6510166681211275, + "grad_norm": 13.125, + "learning_rate": 7.829946717266404e-06, + "loss": 1.6445, + "step": 22380 + }, + { + "epoch": 0.6515984524536754, + "grad_norm": 12.3125, + "learning_rate": 7.828007435593789e-06, + "loss": 1.624, + "step": 22400 + }, + { + "epoch": 0.6521802367862234, + "grad_norm": 10.5, + "learning_rate": 7.826068153921174e-06, + "loss": 1.5483, + "step": 22420 + }, + { + "epoch": 0.6527620211187712, + "grad_norm": 11.5, + "learning_rate": 7.82412887224856e-06, + "loss": 1.6606, + "step": 22440 + }, + { + "epoch": 0.6533438054513192, + "grad_norm": 13.0625, + "learning_rate": 7.822189590575944e-06, + "loss": 1.6794, + "step": 22460 + }, + { + "epoch": 0.6539255897838672, + "grad_norm": 12.5625, + "learning_rate": 7.82025030890333e-06, + "loss": 1.6568, + "step": 22480 + }, + { + "epoch": 0.654507374116415, + "grad_norm": 12.0, + "learning_rate": 7.818311027230714e-06, + "loss": 1.6155, + "step": 22500 + }, + { + "epoch": 0.655089158448963, + "grad_norm": 10.875, + "learning_rate": 7.8163717455581e-06, + "loss": 1.5812, + "step": 22520 + }, + { + "epoch": 0.655670942781511, + "grad_norm": 10.125, + "learning_rate": 7.814432463885485e-06, + "loss": 1.7043, + "step": 22540 + }, + { + "epoch": 0.6562527271140588, + "grad_norm": 10.8125, + "learning_rate": 7.81249318221287e-06, + "loss": 1.6885, + "step": 22560 + }, + { + "epoch": 0.6568345114466068, + "grad_norm": 13.5625, + "learning_rate": 7.810553900540255e-06, + "loss": 1.6724, + "step": 22580 + }, + { + "epoch": 0.6574162957791546, + "grad_norm": 14.3125, + "learning_rate": 7.80861461886764e-06, + "loss": 1.6988, + "step": 22600 + }, + { + "epoch": 0.6579980801117026, + "grad_norm": 14.0625, + "learning_rate": 7.806675337195025e-06, + "loss": 1.5984, + "step": 22620 + }, + { + "epoch": 0.6585798644442505, + "grad_norm": 13.6875, + "learning_rate": 7.80473605552241e-06, + "loss": 1.625, + "step": 22640 + }, + { + "epoch": 0.6591616487767984, + "grad_norm": 11.5625, + "learning_rate": 7.802796773849795e-06, + "loss": 1.6485, + "step": 22660 + }, + { + "epoch": 0.6597434331093464, + "grad_norm": 11.625, + "learning_rate": 7.80085749217718e-06, + "loss": 1.6211, + "step": 22680 + }, + { + "epoch": 0.6603252174418943, + "grad_norm": 11.9375, + "learning_rate": 7.798918210504565e-06, + "loss": 1.6608, + "step": 22700 + }, + { + "epoch": 0.6609070017744422, + "grad_norm": 12.0, + "learning_rate": 7.79697892883195e-06, + "loss": 1.7853, + "step": 22720 + }, + { + "epoch": 0.6614887861069901, + "grad_norm": 12.9375, + "learning_rate": 7.795039647159336e-06, + "loss": 1.6535, + "step": 22740 + }, + { + "epoch": 0.6620705704395381, + "grad_norm": 11.9375, + "learning_rate": 7.79310036548672e-06, + "loss": 1.6026, + "step": 22760 + }, + { + "epoch": 0.662652354772086, + "grad_norm": 11.5, + "learning_rate": 7.791161083814106e-06, + "loss": 1.6255, + "step": 22780 + }, + { + "epoch": 0.6632341391046339, + "grad_norm": 11.875, + "learning_rate": 7.789221802141491e-06, + "loss": 1.6588, + "step": 22800 + }, + { + "epoch": 0.6638159234371819, + "grad_norm": 11.875, + "learning_rate": 7.787282520468876e-06, + "loss": 1.5569, + "step": 22820 + }, + { + "epoch": 0.6643977077697297, + "grad_norm": 12.125, + "learning_rate": 7.785343238796261e-06, + "loss": 1.6181, + "step": 22840 + }, + { + "epoch": 0.6649794921022777, + "grad_norm": 15.0, + "learning_rate": 7.783403957123646e-06, + "loss": 1.5772, + "step": 22860 + }, + { + "epoch": 0.6655612764348257, + "grad_norm": 11.5, + "learning_rate": 7.781464675451031e-06, + "loss": 1.5771, + "step": 22880 + }, + { + "epoch": 0.6661430607673735, + "grad_norm": 13.8125, + "learning_rate": 7.779525393778416e-06, + "loss": 1.704, + "step": 22900 + }, + { + "epoch": 0.6667248450999215, + "grad_norm": 13.125, + "learning_rate": 7.777586112105802e-06, + "loss": 1.6354, + "step": 22920 + }, + { + "epoch": 0.6673066294324694, + "grad_norm": 12.0625, + "learning_rate": 7.775646830433187e-06, + "loss": 1.6523, + "step": 22940 + }, + { + "epoch": 0.6678884137650173, + "grad_norm": 12.5625, + "learning_rate": 7.773707548760572e-06, + "loss": 1.6739, + "step": 22960 + }, + { + "epoch": 0.6684701980975652, + "grad_norm": 11.0, + "learning_rate": 7.771768267087957e-06, + "loss": 1.5769, + "step": 22980 + }, + { + "epoch": 0.6690519824301132, + "grad_norm": 10.8125, + "learning_rate": 7.769828985415342e-06, + "loss": 1.6732, + "step": 23000 + }, + { + "epoch": 0.6696337667626611, + "grad_norm": 12.9375, + "learning_rate": 7.767889703742727e-06, + "loss": 1.601, + "step": 23020 + }, + { + "epoch": 0.670215551095209, + "grad_norm": 19.125, + "learning_rate": 7.765950422070112e-06, + "loss": 1.7081, + "step": 23040 + }, + { + "epoch": 0.6707973354277569, + "grad_norm": 10.0625, + "learning_rate": 7.764011140397497e-06, + "loss": 1.5929, + "step": 23060 + }, + { + "epoch": 0.6713791197603048, + "grad_norm": 11.6875, + "learning_rate": 7.762071858724882e-06, + "loss": 1.6029, + "step": 23080 + }, + { + "epoch": 0.6719609040928528, + "grad_norm": 11.9375, + "learning_rate": 7.760132577052267e-06, + "loss": 1.5392, + "step": 23100 + }, + { + "epoch": 0.6725426884254007, + "grad_norm": 12.125, + "learning_rate": 7.758193295379652e-06, + "loss": 1.5838, + "step": 23120 + }, + { + "epoch": 0.6731244727579486, + "grad_norm": 12.125, + "learning_rate": 7.756254013707038e-06, + "loss": 1.6709, + "step": 23140 + }, + { + "epoch": 0.6737062570904966, + "grad_norm": 11.0625, + "learning_rate": 7.754314732034423e-06, + "loss": 1.6044, + "step": 23160 + }, + { + "epoch": 0.6742880414230444, + "grad_norm": 14.4375, + "learning_rate": 7.752375450361808e-06, + "loss": 1.6394, + "step": 23180 + }, + { + "epoch": 0.6748698257555924, + "grad_norm": 12.4375, + "learning_rate": 7.750436168689193e-06, + "loss": 1.6326, + "step": 23200 + }, + { + "epoch": 0.6754516100881404, + "grad_norm": 14.1875, + "learning_rate": 7.748496887016578e-06, + "loss": 1.6891, + "step": 23220 + }, + { + "epoch": 0.6760333944206882, + "grad_norm": 12.25, + "learning_rate": 7.746557605343963e-06, + "loss": 1.6324, + "step": 23240 + }, + { + "epoch": 0.6766151787532362, + "grad_norm": 13.25, + "learning_rate": 7.744618323671348e-06, + "loss": 1.6257, + "step": 23260 + }, + { + "epoch": 0.6771969630857841, + "grad_norm": 16.0, + "learning_rate": 7.742679041998733e-06, + "loss": 1.6609, + "step": 23280 + }, + { + "epoch": 0.677778747418332, + "grad_norm": 12.875, + "learning_rate": 7.740739760326118e-06, + "loss": 1.6568, + "step": 23300 + }, + { + "epoch": 0.67836053175088, + "grad_norm": 12.25, + "learning_rate": 7.738800478653503e-06, + "loss": 1.6479, + "step": 23320 + }, + { + "epoch": 0.6789423160834279, + "grad_norm": 13.125, + "learning_rate": 7.736861196980889e-06, + "loss": 1.6478, + "step": 23340 + }, + { + "epoch": 0.6795241004159758, + "grad_norm": 14.375, + "learning_rate": 7.734921915308274e-06, + "loss": 1.6291, + "step": 23360 + }, + { + "epoch": 0.6801058847485237, + "grad_norm": 12.3125, + "learning_rate": 7.732982633635659e-06, + "loss": 1.5804, + "step": 23380 + }, + { + "epoch": 0.6806876690810717, + "grad_norm": 9.875, + "learning_rate": 7.731043351963044e-06, + "loss": 1.6415, + "step": 23400 + }, + { + "epoch": 0.6812694534136196, + "grad_norm": 14.8125, + "learning_rate": 7.729104070290429e-06, + "loss": 1.5866, + "step": 23420 + }, + { + "epoch": 0.6818512377461675, + "grad_norm": 13.25, + "learning_rate": 7.727164788617814e-06, + "loss": 1.583, + "step": 23440 + }, + { + "epoch": 0.6824330220787154, + "grad_norm": 13.6875, + "learning_rate": 7.725225506945199e-06, + "loss": 1.6845, + "step": 23460 + }, + { + "epoch": 0.6830148064112633, + "grad_norm": 11.75, + "learning_rate": 7.723286225272584e-06, + "loss": 1.7366, + "step": 23480 + }, + { + "epoch": 0.6835965907438113, + "grad_norm": 8.6875, + "learning_rate": 7.72134694359997e-06, + "loss": 1.6201, + "step": 23500 + }, + { + "epoch": 0.6841783750763591, + "grad_norm": 13.3125, + "learning_rate": 7.719407661927354e-06, + "loss": 1.613, + "step": 23520 + }, + { + "epoch": 0.6847601594089071, + "grad_norm": 12.375, + "learning_rate": 7.71746838025474e-06, + "loss": 1.6545, + "step": 23540 + }, + { + "epoch": 0.6853419437414551, + "grad_norm": 10.875, + "learning_rate": 7.715529098582125e-06, + "loss": 1.5382, + "step": 23560 + }, + { + "epoch": 0.6859237280740029, + "grad_norm": 14.25, + "learning_rate": 7.71358981690951e-06, + "loss": 1.6518, + "step": 23580 + }, + { + "epoch": 0.6865055124065509, + "grad_norm": 12.1875, + "learning_rate": 7.711650535236895e-06, + "loss": 1.6836, + "step": 23600 + }, + { + "epoch": 0.6870872967390989, + "grad_norm": 9.25, + "learning_rate": 7.70971125356428e-06, + "loss": 1.6406, + "step": 23620 + }, + { + "epoch": 0.6876690810716467, + "grad_norm": 11.6875, + "learning_rate": 7.707771971891665e-06, + "loss": 1.6658, + "step": 23640 + }, + { + "epoch": 0.6882508654041947, + "grad_norm": 14.6875, + "learning_rate": 7.70583269021905e-06, + "loss": 1.67, + "step": 23660 + }, + { + "epoch": 0.6888326497367426, + "grad_norm": 12.25, + "learning_rate": 7.703893408546435e-06, + "loss": 1.5416, + "step": 23680 + }, + { + "epoch": 0.6894144340692905, + "grad_norm": 13.3125, + "learning_rate": 7.70195412687382e-06, + "loss": 1.6719, + "step": 23700 + }, + { + "epoch": 0.6899962184018384, + "grad_norm": 13.0625, + "learning_rate": 7.700014845201205e-06, + "loss": 1.6852, + "step": 23720 + }, + { + "epoch": 0.6905780027343864, + "grad_norm": 15.0, + "learning_rate": 7.69807556352859e-06, + "loss": 1.5557, + "step": 23740 + }, + { + "epoch": 0.6911597870669343, + "grad_norm": 13.9375, + "learning_rate": 7.696136281855976e-06, + "loss": 1.6628, + "step": 23760 + }, + { + "epoch": 0.6917415713994822, + "grad_norm": 11.125, + "learning_rate": 7.69419700018336e-06, + "loss": 1.6792, + "step": 23780 + }, + { + "epoch": 0.6923233557320302, + "grad_norm": 12.75, + "learning_rate": 7.692257718510746e-06, + "loss": 1.6234, + "step": 23800 + }, + { + "epoch": 0.692905140064578, + "grad_norm": 16.5, + "learning_rate": 7.690318436838131e-06, + "loss": 1.5968, + "step": 23820 + }, + { + "epoch": 0.693486924397126, + "grad_norm": 13.5, + "learning_rate": 7.688379155165514e-06, + "loss": 1.658, + "step": 23840 + }, + { + "epoch": 0.6940687087296739, + "grad_norm": 11.1875, + "learning_rate": 7.6864398734929e-06, + "loss": 1.6739, + "step": 23860 + }, + { + "epoch": 0.6946504930622218, + "grad_norm": 15.125, + "learning_rate": 7.684500591820284e-06, + "loss": 1.6477, + "step": 23880 + }, + { + "epoch": 0.6952322773947698, + "grad_norm": 12.3125, + "learning_rate": 7.68256131014767e-06, + "loss": 1.5933, + "step": 23900 + }, + { + "epoch": 0.6958140617273176, + "grad_norm": 12.625, + "learning_rate": 7.680622028475055e-06, + "loss": 1.642, + "step": 23920 + }, + { + "epoch": 0.6963958460598656, + "grad_norm": 13.375, + "learning_rate": 7.67868274680244e-06, + "loss": 1.6607, + "step": 23940 + }, + { + "epoch": 0.6969776303924136, + "grad_norm": 15.625, + "learning_rate": 7.676743465129825e-06, + "loss": 1.6606, + "step": 23960 + }, + { + "epoch": 0.6975594147249614, + "grad_norm": 13.625, + "learning_rate": 7.67480418345721e-06, + "loss": 1.6539, + "step": 23980 + }, + { + "epoch": 0.6981411990575094, + "grad_norm": 14.0, + "learning_rate": 7.672864901784595e-06, + "loss": 1.5874, + "step": 24000 + }, + { + "epoch": 0.6987229833900573, + "grad_norm": 9.625, + "learning_rate": 7.67092562011198e-06, + "loss": 1.5782, + "step": 24020 + }, + { + "epoch": 0.6993047677226052, + "grad_norm": 15.625, + "learning_rate": 7.668986338439365e-06, + "loss": 1.6254, + "step": 24040 + }, + { + "epoch": 0.6998865520551532, + "grad_norm": 12.25, + "learning_rate": 7.66704705676675e-06, + "loss": 1.671, + "step": 24060 + }, + { + "epoch": 0.7004683363877011, + "grad_norm": 10.3125, + "learning_rate": 7.665107775094135e-06, + "loss": 1.6397, + "step": 24080 + }, + { + "epoch": 0.701050120720249, + "grad_norm": 11.0, + "learning_rate": 7.66316849342152e-06, + "loss": 1.741, + "step": 24100 + }, + { + "epoch": 0.7016319050527969, + "grad_norm": 12.0, + "learning_rate": 7.661229211748906e-06, + "loss": 1.5878, + "step": 24120 + }, + { + "epoch": 0.7022136893853449, + "grad_norm": 8.375, + "learning_rate": 7.65928993007629e-06, + "loss": 1.6567, + "step": 24140 + }, + { + "epoch": 0.7027954737178927, + "grad_norm": 11.8125, + "learning_rate": 7.657350648403676e-06, + "loss": 1.56, + "step": 24160 + }, + { + "epoch": 0.7033772580504407, + "grad_norm": 12.4375, + "learning_rate": 7.655411366731061e-06, + "loss": 1.6866, + "step": 24180 + }, + { + "epoch": 0.7039590423829887, + "grad_norm": 14.0625, + "learning_rate": 7.653472085058446e-06, + "loss": 1.7478, + "step": 24200 + }, + { + "epoch": 0.7045408267155365, + "grad_norm": 9.9375, + "learning_rate": 7.651532803385831e-06, + "loss": 1.6249, + "step": 24220 + }, + { + "epoch": 0.7051226110480845, + "grad_norm": 12.25, + "learning_rate": 7.649593521713216e-06, + "loss": 1.6672, + "step": 24240 + }, + { + "epoch": 0.7057043953806325, + "grad_norm": 9.8125, + "learning_rate": 7.647654240040601e-06, + "loss": 1.6609, + "step": 24260 + }, + { + "epoch": 0.7062861797131803, + "grad_norm": 12.6875, + "learning_rate": 7.645714958367986e-06, + "loss": 1.6397, + "step": 24280 + }, + { + "epoch": 0.7068679640457283, + "grad_norm": 10.75, + "learning_rate": 7.643775676695372e-06, + "loss": 1.5696, + "step": 24300 + }, + { + "epoch": 0.7074497483782761, + "grad_norm": 7.09375, + "learning_rate": 7.641836395022757e-06, + "loss": 1.6747, + "step": 24320 + }, + { + "epoch": 0.7080315327108241, + "grad_norm": 12.75, + "learning_rate": 7.639897113350142e-06, + "loss": 1.5875, + "step": 24340 + }, + { + "epoch": 0.708613317043372, + "grad_norm": 13.125, + "learning_rate": 7.637957831677527e-06, + "loss": 1.6379, + "step": 24360 + }, + { + "epoch": 0.7091951013759199, + "grad_norm": 10.125, + "learning_rate": 7.636018550004912e-06, + "loss": 1.6031, + "step": 24380 + }, + { + "epoch": 0.7097768857084679, + "grad_norm": 10.375, + "learning_rate": 7.634079268332297e-06, + "loss": 1.6378, + "step": 24400 + }, + { + "epoch": 0.7103586700410158, + "grad_norm": 12.8125, + "learning_rate": 7.632139986659682e-06, + "loss": 1.703, + "step": 24420 + }, + { + "epoch": 0.7109404543735637, + "grad_norm": 13.625, + "learning_rate": 7.630200704987067e-06, + "loss": 1.5996, + "step": 24440 + }, + { + "epoch": 0.7115222387061116, + "grad_norm": 10.3125, + "learning_rate": 7.628261423314452e-06, + "loss": 1.6742, + "step": 24460 + }, + { + "epoch": 0.7121040230386596, + "grad_norm": 11.3125, + "learning_rate": 7.626322141641837e-06, + "loss": 1.6109, + "step": 24480 + }, + { + "epoch": 0.7126858073712075, + "grad_norm": 14.125, + "learning_rate": 7.6243828599692225e-06, + "loss": 1.6273, + "step": 24500 + }, + { + "epoch": 0.7132675917037554, + "grad_norm": 11.875, + "learning_rate": 7.622443578296608e-06, + "loss": 1.5985, + "step": 24520 + }, + { + "epoch": 0.7138493760363034, + "grad_norm": 12.5, + "learning_rate": 7.620504296623993e-06, + "loss": 1.5652, + "step": 24540 + }, + { + "epoch": 0.7144311603688512, + "grad_norm": 13.3125, + "learning_rate": 7.618565014951378e-06, + "loss": 1.604, + "step": 24560 + }, + { + "epoch": 0.7150129447013992, + "grad_norm": 13.125, + "learning_rate": 7.616625733278763e-06, + "loss": 1.5017, + "step": 24580 + }, + { + "epoch": 0.7155947290339472, + "grad_norm": 14.0, + "learning_rate": 7.614686451606148e-06, + "loss": 1.7284, + "step": 24600 + }, + { + "epoch": 0.716176513366495, + "grad_norm": 11.3125, + "learning_rate": 7.612747169933533e-06, + "loss": 1.6873, + "step": 24620 + }, + { + "epoch": 0.716758297699043, + "grad_norm": 14.1875, + "learning_rate": 7.610807888260918e-06, + "loss": 1.7274, + "step": 24640 + }, + { + "epoch": 0.7173400820315909, + "grad_norm": 12.875, + "learning_rate": 7.608868606588303e-06, + "loss": 1.5654, + "step": 24660 + }, + { + "epoch": 0.7179218663641388, + "grad_norm": 10.8125, + "learning_rate": 7.606929324915688e-06, + "loss": 1.5104, + "step": 24680 + }, + { + "epoch": 0.7185036506966868, + "grad_norm": 12.3125, + "learning_rate": 7.6049900432430735e-06, + "loss": 1.6304, + "step": 24700 + }, + { + "epoch": 0.7190854350292346, + "grad_norm": 13.625, + "learning_rate": 7.603050761570459e-06, + "loss": 1.5513, + "step": 24720 + }, + { + "epoch": 0.7196672193617826, + "grad_norm": 11.25, + "learning_rate": 7.601111479897844e-06, + "loss": 1.6296, + "step": 24740 + }, + { + "epoch": 0.7202490036943305, + "grad_norm": 11.375, + "learning_rate": 7.599172198225229e-06, + "loss": 1.6554, + "step": 24760 + }, + { + "epoch": 0.7208307880268784, + "grad_norm": 13.4375, + "learning_rate": 7.597232916552614e-06, + "loss": 1.689, + "step": 24780 + }, + { + "epoch": 0.7214125723594264, + "grad_norm": 13.0625, + "learning_rate": 7.595293634879999e-06, + "loss": 1.6671, + "step": 24800 + }, + { + "epoch": 0.7219943566919743, + "grad_norm": 13.25, + "learning_rate": 7.593354353207382e-06, + "loss": 1.6171, + "step": 24820 + }, + { + "epoch": 0.7225761410245222, + "grad_norm": 13.375, + "learning_rate": 7.5914150715347675e-06, + "loss": 1.6282, + "step": 24840 + }, + { + "epoch": 0.7231579253570701, + "grad_norm": 12.1875, + "learning_rate": 7.5894757898621526e-06, + "loss": 1.6428, + "step": 24860 + }, + { + "epoch": 0.7237397096896181, + "grad_norm": 12.5, + "learning_rate": 7.587536508189538e-06, + "loss": 1.7236, + "step": 24880 + }, + { + "epoch": 0.724321494022166, + "grad_norm": 11.625, + "learning_rate": 7.585597226516923e-06, + "loss": 1.7371, + "step": 24900 + }, + { + "epoch": 0.7249032783547139, + "grad_norm": 11.5625, + "learning_rate": 7.583657944844308e-06, + "loss": 1.6998, + "step": 24920 + }, + { + "epoch": 0.7254850626872619, + "grad_norm": 13.1875, + "learning_rate": 7.581718663171693e-06, + "loss": 1.5971, + "step": 24940 + }, + { + "epoch": 0.7260668470198097, + "grad_norm": 17.25, + "learning_rate": 7.579779381499078e-06, + "loss": 1.7575, + "step": 24960 + }, + { + "epoch": 0.7266486313523577, + "grad_norm": 13.25, + "learning_rate": 7.577840099826463e-06, + "loss": 1.6709, + "step": 24980 + }, + { + "epoch": 0.7272304156849057, + "grad_norm": 14.4375, + "learning_rate": 7.575900818153848e-06, + "loss": 1.5984, + "step": 25000 + }, + { + "epoch": 0.7278122000174535, + "grad_norm": 15.0, + "learning_rate": 7.573961536481233e-06, + "loss": 1.7063, + "step": 25020 + }, + { + "epoch": 0.7283939843500015, + "grad_norm": 11.125, + "learning_rate": 7.5720222548086184e-06, + "loss": 1.5983, + "step": 25040 + }, + { + "epoch": 0.7289757686825494, + "grad_norm": 13.375, + "learning_rate": 7.5700829731360035e-06, + "loss": 1.516, + "step": 25060 + }, + { + "epoch": 0.7295575530150973, + "grad_norm": 12.0, + "learning_rate": 7.568143691463389e-06, + "loss": 1.6036, + "step": 25080 + }, + { + "epoch": 0.7301393373476452, + "grad_norm": 15.9375, + "learning_rate": 7.566204409790774e-06, + "loss": 1.6088, + "step": 25100 + }, + { + "epoch": 0.7307211216801931, + "grad_norm": 12.0, + "learning_rate": 7.564265128118159e-06, + "loss": 1.7613, + "step": 25120 + }, + { + "epoch": 0.7313029060127411, + "grad_norm": 12.125, + "learning_rate": 7.562325846445544e-06, + "loss": 1.638, + "step": 25140 + }, + { + "epoch": 0.731884690345289, + "grad_norm": 13.375, + "learning_rate": 7.560386564772929e-06, + "loss": 1.644, + "step": 25160 + }, + { + "epoch": 0.7324664746778369, + "grad_norm": 12.1875, + "learning_rate": 7.558447283100314e-06, + "loss": 1.6405, + "step": 25180 + }, + { + "epoch": 0.7330482590103848, + "grad_norm": 10.625, + "learning_rate": 7.556508001427699e-06, + "loss": 1.6012, + "step": 25200 + }, + { + "epoch": 0.7336300433429328, + "grad_norm": 10.9375, + "learning_rate": 7.554568719755084e-06, + "loss": 1.6877, + "step": 25220 + }, + { + "epoch": 0.7342118276754807, + "grad_norm": 12.0, + "learning_rate": 7.5526294380824694e-06, + "loss": 1.6231, + "step": 25240 + }, + { + "epoch": 0.7347936120080286, + "grad_norm": 13.6875, + "learning_rate": 7.5506901564098545e-06, + "loss": 1.6582, + "step": 25260 + }, + { + "epoch": 0.7353753963405766, + "grad_norm": 14.1875, + "learning_rate": 7.54875087473724e-06, + "loss": 1.7438, + "step": 25280 + }, + { + "epoch": 0.7359571806731244, + "grad_norm": 12.25, + "learning_rate": 7.546811593064625e-06, + "loss": 1.6458, + "step": 25300 + }, + { + "epoch": 0.7365389650056724, + "grad_norm": 15.5, + "learning_rate": 7.54487231139201e-06, + "loss": 1.5911, + "step": 25320 + }, + { + "epoch": 0.7371207493382204, + "grad_norm": 8.25, + "learning_rate": 7.542933029719395e-06, + "loss": 1.5868, + "step": 25340 + }, + { + "epoch": 0.7377025336707682, + "grad_norm": 15.8125, + "learning_rate": 7.54099374804678e-06, + "loss": 1.583, + "step": 25360 + }, + { + "epoch": 0.7382843180033162, + "grad_norm": 10.3125, + "learning_rate": 7.539054466374165e-06, + "loss": 1.6235, + "step": 25380 + }, + { + "epoch": 0.7388661023358641, + "grad_norm": 11.8125, + "learning_rate": 7.53711518470155e-06, + "loss": 1.5862, + "step": 25400 + }, + { + "epoch": 0.739447886668412, + "grad_norm": 13.125, + "learning_rate": 7.535175903028935e-06, + "loss": 1.6101, + "step": 25420 + }, + { + "epoch": 0.74002967100096, + "grad_norm": 12.875, + "learning_rate": 7.53323662135632e-06, + "loss": 1.6228, + "step": 25440 + }, + { + "epoch": 0.7406114553335079, + "grad_norm": 11.3125, + "learning_rate": 7.5312973396837055e-06, + "loss": 1.5888, + "step": 25460 + }, + { + "epoch": 0.7411932396660558, + "grad_norm": 14.0625, + "learning_rate": 7.529358058011091e-06, + "loss": 1.6804, + "step": 25480 + }, + { + "epoch": 0.7417750239986037, + "grad_norm": 13.375, + "learning_rate": 7.527418776338476e-06, + "loss": 1.6769, + "step": 25500 + }, + { + "epoch": 0.7423568083311517, + "grad_norm": 12.4375, + "learning_rate": 7.525479494665861e-06, + "loss": 1.5462, + "step": 25520 + }, + { + "epoch": 0.7429385926636995, + "grad_norm": 14.1875, + "learning_rate": 7.523540212993246e-06, + "loss": 1.7738, + "step": 25540 + }, + { + "epoch": 0.7435203769962475, + "grad_norm": 8.8125, + "learning_rate": 7.521600931320631e-06, + "loss": 1.6262, + "step": 25560 + }, + { + "epoch": 0.7441021613287954, + "grad_norm": 14.0, + "learning_rate": 7.519661649648016e-06, + "loss": 1.5742, + "step": 25580 + }, + { + "epoch": 0.7446839456613433, + "grad_norm": 11.0625, + "learning_rate": 7.517722367975401e-06, + "loss": 1.7015, + "step": 25600 + }, + { + "epoch": 0.7452657299938913, + "grad_norm": 12.125, + "learning_rate": 7.515783086302786e-06, + "loss": 1.7003, + "step": 25620 + }, + { + "epoch": 0.7458475143264391, + "grad_norm": 13.25, + "learning_rate": 7.5138438046301705e-06, + "loss": 1.6813, + "step": 25640 + }, + { + "epoch": 0.7464292986589871, + "grad_norm": 12.25, + "learning_rate": 7.511904522957556e-06, + "loss": 1.6657, + "step": 25660 + }, + { + "epoch": 0.7470110829915351, + "grad_norm": 13.625, + "learning_rate": 7.509965241284941e-06, + "loss": 1.6187, + "step": 25680 + }, + { + "epoch": 0.7475928673240829, + "grad_norm": 11.375, + "learning_rate": 7.508025959612326e-06, + "loss": 1.6316, + "step": 25700 + }, + { + "epoch": 0.7481746516566309, + "grad_norm": 12.0625, + "learning_rate": 7.506086677939711e-06, + "loss": 1.6577, + "step": 25720 + }, + { + "epoch": 0.7487564359891788, + "grad_norm": 12.5625, + "learning_rate": 7.504147396267096e-06, + "loss": 1.6205, + "step": 25740 + }, + { + "epoch": 0.7493382203217267, + "grad_norm": 15.125, + "learning_rate": 7.502208114594481e-06, + "loss": 1.6341, + "step": 25760 + }, + { + "epoch": 0.7499200046542747, + "grad_norm": 12.9375, + "learning_rate": 7.500268832921866e-06, + "loss": 1.6506, + "step": 25780 + }, + { + "epoch": 0.7505017889868226, + "grad_norm": 13.5625, + "learning_rate": 7.498329551249251e-06, + "loss": 1.7797, + "step": 25800 + }, + { + "epoch": 0.7510835733193705, + "grad_norm": 14.125, + "learning_rate": 7.496390269576636e-06, + "loss": 1.6324, + "step": 25820 + }, + { + "epoch": 0.7516653576519184, + "grad_norm": 11.5625, + "learning_rate": 7.4944509879040215e-06, + "loss": 1.5381, + "step": 25840 + }, + { + "epoch": 0.7522471419844664, + "grad_norm": 12.0, + "learning_rate": 7.492511706231407e-06, + "loss": 1.6465, + "step": 25860 + }, + { + "epoch": 0.7528289263170143, + "grad_norm": 12.3125, + "learning_rate": 7.490572424558792e-06, + "loss": 1.6397, + "step": 25880 + }, + { + "epoch": 0.7534107106495622, + "grad_norm": 12.5, + "learning_rate": 7.488633142886177e-06, + "loss": 1.6028, + "step": 25900 + }, + { + "epoch": 0.7539924949821102, + "grad_norm": 12.125, + "learning_rate": 7.486693861213562e-06, + "loss": 1.6078, + "step": 25920 + }, + { + "epoch": 0.754574279314658, + "grad_norm": 12.75, + "learning_rate": 7.484754579540947e-06, + "loss": 1.5693, + "step": 25940 + }, + { + "epoch": 0.755156063647206, + "grad_norm": 11.5625, + "learning_rate": 7.482815297868332e-06, + "loss": 1.656, + "step": 25960 + }, + { + "epoch": 0.7557378479797539, + "grad_norm": 10.5, + "learning_rate": 7.480876016195717e-06, + "loss": 1.6052, + "step": 25980 + }, + { + "epoch": 0.7563196323123018, + "grad_norm": 14.5625, + "learning_rate": 7.478936734523102e-06, + "loss": 1.5874, + "step": 26000 + }, + { + "epoch": 0.7569014166448498, + "grad_norm": 15.25, + "learning_rate": 7.476997452850487e-06, + "loss": 1.6457, + "step": 26020 + }, + { + "epoch": 0.7574832009773976, + "grad_norm": 11.0625, + "learning_rate": 7.4750581711778725e-06, + "loss": 1.5725, + "step": 26040 + }, + { + "epoch": 0.7580649853099456, + "grad_norm": 12.0, + "learning_rate": 7.4731188895052576e-06, + "loss": 1.6549, + "step": 26060 + }, + { + "epoch": 0.7586467696424936, + "grad_norm": 12.3125, + "learning_rate": 7.471179607832643e-06, + "loss": 1.6379, + "step": 26080 + }, + { + "epoch": 0.7592285539750414, + "grad_norm": 11.625, + "learning_rate": 7.469240326160028e-06, + "loss": 1.5308, + "step": 26100 + }, + { + "epoch": 0.7598103383075894, + "grad_norm": 11.25, + "learning_rate": 7.467301044487413e-06, + "loss": 1.6131, + "step": 26120 + }, + { + "epoch": 0.7603921226401373, + "grad_norm": 16.875, + "learning_rate": 7.465361762814798e-06, + "loss": 1.5696, + "step": 26140 + }, + { + "epoch": 0.7609739069726852, + "grad_norm": 10.625, + "learning_rate": 7.463422481142183e-06, + "loss": 1.635, + "step": 26160 + }, + { + "epoch": 0.7615556913052332, + "grad_norm": 10.8125, + "learning_rate": 7.461483199469568e-06, + "loss": 1.5774, + "step": 26180 + }, + { + "epoch": 0.7621374756377811, + "grad_norm": 12.8125, + "learning_rate": 7.459543917796953e-06, + "loss": 1.5712, + "step": 26200 + }, + { + "epoch": 0.762719259970329, + "grad_norm": 13.375, + "learning_rate": 7.457604636124338e-06, + "loss": 1.5626, + "step": 26220 + }, + { + "epoch": 0.7633010443028769, + "grad_norm": 12.3125, + "learning_rate": 7.4556653544517235e-06, + "loss": 1.6614, + "step": 26240 + }, + { + "epoch": 0.7638828286354249, + "grad_norm": 13.0, + "learning_rate": 7.4537260727791085e-06, + "loss": 1.5953, + "step": 26260 + }, + { + "epoch": 0.7644646129679727, + "grad_norm": 12.125, + "learning_rate": 7.451786791106494e-06, + "loss": 1.6212, + "step": 26280 + }, + { + "epoch": 0.7650463973005207, + "grad_norm": 14.8125, + "learning_rate": 7.449847509433879e-06, + "loss": 1.5182, + "step": 26300 + }, + { + "epoch": 0.7656281816330687, + "grad_norm": 11.6875, + "learning_rate": 7.447908227761264e-06, + "loss": 1.6819, + "step": 26320 + }, + { + "epoch": 0.7662099659656165, + "grad_norm": 12.6875, + "learning_rate": 7.445968946088649e-06, + "loss": 1.5642, + "step": 26340 + }, + { + "epoch": 0.7667917502981645, + "grad_norm": 15.0, + "learning_rate": 7.444029664416034e-06, + "loss": 1.6226, + "step": 26360 + }, + { + "epoch": 0.7673735346307123, + "grad_norm": 14.1875, + "learning_rate": 7.442090382743419e-06, + "loss": 1.611, + "step": 26380 + }, + { + "epoch": 0.7679553189632603, + "grad_norm": 13.6875, + "learning_rate": 7.440151101070804e-06, + "loss": 1.6904, + "step": 26400 + }, + { + "epoch": 0.7685371032958083, + "grad_norm": 12.625, + "learning_rate": 7.438211819398189e-06, + "loss": 1.6579, + "step": 26420 + }, + { + "epoch": 0.7691188876283561, + "grad_norm": 11.125, + "learning_rate": 7.4362725377255744e-06, + "loss": 1.584, + "step": 26440 + }, + { + "epoch": 0.7697006719609041, + "grad_norm": 13.8125, + "learning_rate": 7.434333256052959e-06, + "loss": 1.6818, + "step": 26460 + }, + { + "epoch": 0.770282456293452, + "grad_norm": 15.75, + "learning_rate": 7.432393974380344e-06, + "loss": 1.6551, + "step": 26480 + }, + { + "epoch": 0.7708642406259999, + "grad_norm": 12.75, + "learning_rate": 7.430454692707729e-06, + "loss": 1.672, + "step": 26500 + }, + { + "epoch": 0.7714460249585479, + "grad_norm": 11.875, + "learning_rate": 7.428515411035114e-06, + "loss": 1.6364, + "step": 26520 + }, + { + "epoch": 0.7720278092910958, + "grad_norm": 11.25, + "learning_rate": 7.426576129362499e-06, + "loss": 1.6867, + "step": 26540 + }, + { + "epoch": 0.7726095936236437, + "grad_norm": 12.375, + "learning_rate": 7.424636847689884e-06, + "loss": 1.6134, + "step": 26560 + }, + { + "epoch": 0.7731913779561916, + "grad_norm": 14.1875, + "learning_rate": 7.422697566017269e-06, + "loss": 1.5726, + "step": 26580 + }, + { + "epoch": 0.7737731622887396, + "grad_norm": 14.25, + "learning_rate": 7.420758284344654e-06, + "loss": 1.6205, + "step": 26600 + }, + { + "epoch": 0.7743549466212875, + "grad_norm": 10.875, + "learning_rate": 7.4188190026720395e-06, + "loss": 1.6047, + "step": 26620 + }, + { + "epoch": 0.7749367309538354, + "grad_norm": 13.0, + "learning_rate": 7.4168797209994246e-06, + "loss": 1.6801, + "step": 26640 + }, + { + "epoch": 0.7755185152863834, + "grad_norm": 12.75, + "learning_rate": 7.41494043932681e-06, + "loss": 1.7161, + "step": 26660 + }, + { + "epoch": 0.7761002996189312, + "grad_norm": 12.1875, + "learning_rate": 7.413001157654195e-06, + "loss": 1.6308, + "step": 26680 + }, + { + "epoch": 0.7766820839514792, + "grad_norm": 14.4375, + "learning_rate": 7.41106187598158e-06, + "loss": 1.5738, + "step": 26700 + }, + { + "epoch": 0.7772638682840272, + "grad_norm": 11.5, + "learning_rate": 7.409122594308965e-06, + "loss": 1.6099, + "step": 26720 + }, + { + "epoch": 0.777845652616575, + "grad_norm": 11.375, + "learning_rate": 7.40718331263635e-06, + "loss": 1.6528, + "step": 26740 + }, + { + "epoch": 0.778427436949123, + "grad_norm": 12.75, + "learning_rate": 7.405244030963735e-06, + "loss": 1.6658, + "step": 26760 + }, + { + "epoch": 0.7790092212816709, + "grad_norm": 11.5, + "learning_rate": 7.40330474929112e-06, + "loss": 1.6278, + "step": 26780 + }, + { + "epoch": 0.7795910056142188, + "grad_norm": 11.25, + "learning_rate": 7.401365467618505e-06, + "loss": 1.6695, + "step": 26800 + }, + { + "epoch": 0.7801727899467668, + "grad_norm": 15.0, + "learning_rate": 7.3994261859458904e-06, + "loss": 1.6491, + "step": 26820 + }, + { + "epoch": 0.7807545742793146, + "grad_norm": 13.25, + "learning_rate": 7.3974869042732755e-06, + "loss": 1.6186, + "step": 26840 + }, + { + "epoch": 0.7813363586118626, + "grad_norm": 12.4375, + "learning_rate": 7.395547622600661e-06, + "loss": 1.5884, + "step": 26860 + }, + { + "epoch": 0.7819181429444105, + "grad_norm": 13.25, + "learning_rate": 7.393608340928046e-06, + "loss": 1.6544, + "step": 26880 + }, + { + "epoch": 0.7824999272769584, + "grad_norm": 13.5, + "learning_rate": 7.391669059255431e-06, + "loss": 1.6644, + "step": 26900 + }, + { + "epoch": 0.7830817116095063, + "grad_norm": 9.5, + "learning_rate": 7.389729777582816e-06, + "loss": 1.6021, + "step": 26920 + }, + { + "epoch": 0.7836634959420543, + "grad_norm": 14.0625, + "learning_rate": 7.387790495910201e-06, + "loss": 1.5777, + "step": 26940 + }, + { + "epoch": 0.7842452802746022, + "grad_norm": 10.5, + "learning_rate": 7.385851214237586e-06, + "loss": 1.5951, + "step": 26960 + }, + { + "epoch": 0.7848270646071501, + "grad_norm": 12.6875, + "learning_rate": 7.383911932564971e-06, + "loss": 1.5828, + "step": 26980 + }, + { + "epoch": 0.7854088489396981, + "grad_norm": 14.875, + "learning_rate": 7.381972650892356e-06, + "loss": 1.7095, + "step": 27000 + }, + { + "epoch": 0.7859906332722459, + "grad_norm": 12.25, + "learning_rate": 7.380033369219741e-06, + "loss": 1.642, + "step": 27020 + }, + { + "epoch": 0.7865724176047939, + "grad_norm": 12.5625, + "learning_rate": 7.3780940875471265e-06, + "loss": 1.6102, + "step": 27040 + }, + { + "epoch": 0.7871542019373419, + "grad_norm": 13.1875, + "learning_rate": 7.376154805874512e-06, + "loss": 1.6857, + "step": 27060 + }, + { + "epoch": 0.7877359862698897, + "grad_norm": 11.9375, + "learning_rate": 7.374215524201897e-06, + "loss": 1.6778, + "step": 27080 + }, + { + "epoch": 0.7883177706024377, + "grad_norm": 15.375, + "learning_rate": 7.372276242529282e-06, + "loss": 1.6085, + "step": 27100 + }, + { + "epoch": 0.7888995549349856, + "grad_norm": 13.8125, + "learning_rate": 7.370336960856667e-06, + "loss": 1.6804, + "step": 27120 + }, + { + "epoch": 0.7894813392675335, + "grad_norm": 10.0625, + "learning_rate": 7.368397679184052e-06, + "loss": 1.6082, + "step": 27140 + }, + { + "epoch": 0.7900631236000815, + "grad_norm": 10.4375, + "learning_rate": 7.366458397511437e-06, + "loss": 1.6325, + "step": 27160 + }, + { + "epoch": 0.7906449079326294, + "grad_norm": 12.25, + "learning_rate": 7.364519115838822e-06, + "loss": 1.6147, + "step": 27180 + }, + { + "epoch": 0.7912266922651773, + "grad_norm": 11.0, + "learning_rate": 7.362579834166207e-06, + "loss": 1.6164, + "step": 27200 + }, + { + "epoch": 0.7918084765977252, + "grad_norm": 10.125, + "learning_rate": 7.360640552493592e-06, + "loss": 1.7442, + "step": 27220 + }, + { + "epoch": 0.7923902609302731, + "grad_norm": 12.125, + "learning_rate": 7.3587012708209775e-06, + "loss": 1.6424, + "step": 27240 + }, + { + "epoch": 0.7929720452628211, + "grad_norm": 11.5625, + "learning_rate": 7.356761989148362e-06, + "loss": 1.6828, + "step": 27260 + }, + { + "epoch": 0.793553829595369, + "grad_norm": 11.8125, + "learning_rate": 7.354822707475747e-06, + "loss": 1.6025, + "step": 27280 + }, + { + "epoch": 0.7941356139279169, + "grad_norm": 12.6875, + "learning_rate": 7.352883425803132e-06, + "loss": 1.6738, + "step": 27300 + }, + { + "epoch": 0.7947173982604648, + "grad_norm": 13.875, + "learning_rate": 7.350944144130517e-06, + "loss": 1.5983, + "step": 27320 + }, + { + "epoch": 0.7952991825930128, + "grad_norm": 10.75, + "learning_rate": 7.349004862457902e-06, + "loss": 1.6961, + "step": 27340 + }, + { + "epoch": 0.7958809669255607, + "grad_norm": 12.5625, + "learning_rate": 7.347065580785287e-06, + "loss": 1.6454, + "step": 27360 + }, + { + "epoch": 0.7964627512581086, + "grad_norm": 14.25, + "learning_rate": 7.345126299112672e-06, + "loss": 1.6563, + "step": 27380 + }, + { + "epoch": 0.7970445355906566, + "grad_norm": 14.125, + "learning_rate": 7.343187017440057e-06, + "loss": 1.6725, + "step": 27400 + }, + { + "epoch": 0.7976263199232044, + "grad_norm": 9.8125, + "learning_rate": 7.3412477357674425e-06, + "loss": 1.7038, + "step": 27420 + }, + { + "epoch": 0.7982081042557524, + "grad_norm": 12.125, + "learning_rate": 7.339308454094828e-06, + "loss": 1.6789, + "step": 27440 + }, + { + "epoch": 0.7987898885883004, + "grad_norm": 11.625, + "learning_rate": 7.337369172422213e-06, + "loss": 1.5602, + "step": 27460 + }, + { + "epoch": 0.7993716729208482, + "grad_norm": 11.125, + "learning_rate": 7.335429890749598e-06, + "loss": 1.6001, + "step": 27480 + }, + { + "epoch": 0.7999534572533962, + "grad_norm": 12.1875, + "learning_rate": 7.333490609076983e-06, + "loss": 1.6053, + "step": 27500 + }, + { + "epoch": 0.8005352415859441, + "grad_norm": 12.8125, + "learning_rate": 7.331551327404368e-06, + "loss": 1.5731, + "step": 27520 + }, + { + "epoch": 0.801117025918492, + "grad_norm": 18.25, + "learning_rate": 7.329612045731753e-06, + "loss": 1.6737, + "step": 27540 + }, + { + "epoch": 0.80169881025104, + "grad_norm": 13.8125, + "learning_rate": 7.327672764059138e-06, + "loss": 1.7213, + "step": 27560 + }, + { + "epoch": 0.8022805945835879, + "grad_norm": 11.25, + "learning_rate": 7.325733482386523e-06, + "loss": 1.699, + "step": 27580 + }, + { + "epoch": 0.8028623789161358, + "grad_norm": 16.75, + "learning_rate": 7.323794200713908e-06, + "loss": 1.655, + "step": 27600 + }, + { + "epoch": 0.8034441632486837, + "grad_norm": 13.125, + "learning_rate": 7.3218549190412935e-06, + "loss": 1.6234, + "step": 27620 + }, + { + "epoch": 0.8040259475812317, + "grad_norm": 14.4375, + "learning_rate": 7.319915637368679e-06, + "loss": 1.6227, + "step": 27640 + }, + { + "epoch": 0.8046077319137795, + "grad_norm": 10.25, + "learning_rate": 7.317976355696064e-06, + "loss": 1.6854, + "step": 27660 + }, + { + "epoch": 0.8051895162463275, + "grad_norm": 10.875, + "learning_rate": 7.316037074023449e-06, + "loss": 1.6491, + "step": 27680 + }, + { + "epoch": 0.8057713005788754, + "grad_norm": 11.75, + "learning_rate": 7.314097792350834e-06, + "loss": 1.5881, + "step": 27700 + }, + { + "epoch": 0.8063530849114233, + "grad_norm": 14.5, + "learning_rate": 7.312158510678219e-06, + "loss": 1.6704, + "step": 27720 + }, + { + "epoch": 0.8069348692439713, + "grad_norm": 13.5625, + "learning_rate": 7.310219229005604e-06, + "loss": 1.5579, + "step": 27740 + }, + { + "epoch": 0.8075166535765191, + "grad_norm": 14.6875, + "learning_rate": 7.308279947332989e-06, + "loss": 1.4641, + "step": 27760 + }, + { + "epoch": 0.8080984379090671, + "grad_norm": 14.25, + "learning_rate": 7.306340665660374e-06, + "loss": 1.565, + "step": 27780 + }, + { + "epoch": 0.8086802222416151, + "grad_norm": 13.125, + "learning_rate": 7.304401383987759e-06, + "loss": 1.6324, + "step": 27800 + }, + { + "epoch": 0.8092620065741629, + "grad_norm": 13.0, + "learning_rate": 7.3024621023151445e-06, + "loss": 1.6607, + "step": 27820 + }, + { + "epoch": 0.8098437909067109, + "grad_norm": 12.75, + "learning_rate": 7.3005228206425296e-06, + "loss": 1.6168, + "step": 27840 + }, + { + "epoch": 0.8104255752392588, + "grad_norm": 12.125, + "learning_rate": 7.298583538969915e-06, + "loss": 1.6173, + "step": 27860 + }, + { + "epoch": 0.8110073595718067, + "grad_norm": 10.6875, + "learning_rate": 7.2966442572973e-06, + "loss": 1.6901, + "step": 27880 + }, + { + "epoch": 0.8115891439043547, + "grad_norm": 12.4375, + "learning_rate": 7.294704975624685e-06, + "loss": 1.6655, + "step": 27900 + }, + { + "epoch": 0.8121709282369026, + "grad_norm": 12.8125, + "learning_rate": 7.29276569395207e-06, + "loss": 1.5395, + "step": 27920 + }, + { + "epoch": 0.8127527125694505, + "grad_norm": 13.25, + "learning_rate": 7.290826412279455e-06, + "loss": 1.4986, + "step": 27940 + }, + { + "epoch": 0.8133344969019984, + "grad_norm": 11.6875, + "learning_rate": 7.28888713060684e-06, + "loss": 1.6638, + "step": 27960 + }, + { + "epoch": 0.8139162812345464, + "grad_norm": 11.5, + "learning_rate": 7.286947848934225e-06, + "loss": 1.5842, + "step": 27980 + }, + { + "epoch": 0.8144980655670943, + "grad_norm": 12.75, + "learning_rate": 7.28500856726161e-06, + "loss": 1.6284, + "step": 28000 + }, + { + "epoch": 0.8150798498996422, + "grad_norm": 10.25, + "learning_rate": 7.2830692855889954e-06, + "loss": 1.6638, + "step": 28020 + }, + { + "epoch": 0.8156616342321902, + "grad_norm": 13.125, + "learning_rate": 7.2811300039163805e-06, + "loss": 1.7014, + "step": 28040 + }, + { + "epoch": 0.816243418564738, + "grad_norm": 12.125, + "learning_rate": 7.279190722243766e-06, + "loss": 1.6751, + "step": 28060 + }, + { + "epoch": 0.816825202897286, + "grad_norm": 14.0, + "learning_rate": 7.27725144057115e-06, + "loss": 1.651, + "step": 28080 + }, + { + "epoch": 0.8174069872298338, + "grad_norm": 13.25, + "learning_rate": 7.275312158898535e-06, + "loss": 1.5778, + "step": 28100 + }, + { + "epoch": 0.8179887715623818, + "grad_norm": 15.1875, + "learning_rate": 7.27337287722592e-06, + "loss": 1.6771, + "step": 28120 + }, + { + "epoch": 0.8185705558949298, + "grad_norm": 12.375, + "learning_rate": 7.271433595553305e-06, + "loss": 1.6015, + "step": 28140 + }, + { + "epoch": 0.8191523402274776, + "grad_norm": 12.625, + "learning_rate": 7.26949431388069e-06, + "loss": 1.6831, + "step": 28160 + }, + { + "epoch": 0.8197341245600256, + "grad_norm": 9.8125, + "learning_rate": 7.267555032208075e-06, + "loss": 1.6934, + "step": 28180 + }, + { + "epoch": 0.8203159088925736, + "grad_norm": 13.9375, + "learning_rate": 7.2656157505354605e-06, + "loss": 1.6941, + "step": 28200 + }, + { + "epoch": 0.8208976932251214, + "grad_norm": 11.875, + "learning_rate": 7.2636764688628456e-06, + "loss": 1.5997, + "step": 28220 + }, + { + "epoch": 0.8214794775576694, + "grad_norm": 16.5, + "learning_rate": 7.261737187190231e-06, + "loss": 1.6822, + "step": 28240 + }, + { + "epoch": 0.8220612618902173, + "grad_norm": 11.375, + "learning_rate": 7.259797905517616e-06, + "loss": 1.4695, + "step": 28260 + }, + { + "epoch": 0.8226430462227652, + "grad_norm": 17.0, + "learning_rate": 7.257858623845001e-06, + "loss": 1.6311, + "step": 28280 + }, + { + "epoch": 0.8232248305553131, + "grad_norm": 10.5, + "learning_rate": 7.255919342172386e-06, + "loss": 1.6257, + "step": 28300 + }, + { + "epoch": 0.8238066148878611, + "grad_norm": 13.5, + "learning_rate": 7.253980060499771e-06, + "loss": 1.6623, + "step": 28320 + }, + { + "epoch": 0.824388399220409, + "grad_norm": 12.4375, + "learning_rate": 7.252040778827156e-06, + "loss": 1.7535, + "step": 28340 + }, + { + "epoch": 0.8249701835529569, + "grad_norm": 11.5, + "learning_rate": 7.250101497154541e-06, + "loss": 1.6129, + "step": 28360 + }, + { + "epoch": 0.8255519678855049, + "grad_norm": 13.375, + "learning_rate": 7.248162215481926e-06, + "loss": 1.6641, + "step": 28380 + }, + { + "epoch": 0.8261337522180527, + "grad_norm": 14.0625, + "learning_rate": 7.2462229338093114e-06, + "loss": 1.6582, + "step": 28400 + }, + { + "epoch": 0.8267155365506007, + "grad_norm": 10.75, + "learning_rate": 7.2442836521366965e-06, + "loss": 1.5584, + "step": 28420 + }, + { + "epoch": 0.8272973208831487, + "grad_norm": 12.625, + "learning_rate": 7.242344370464082e-06, + "loss": 1.6738, + "step": 28440 + }, + { + "epoch": 0.8278791052156965, + "grad_norm": 13.375, + "learning_rate": 7.240405088791467e-06, + "loss": 1.6583, + "step": 28460 + }, + { + "epoch": 0.8284608895482445, + "grad_norm": 13.75, + "learning_rate": 7.238465807118852e-06, + "loss": 1.5918, + "step": 28480 + }, + { + "epoch": 0.8290426738807923, + "grad_norm": 16.0, + "learning_rate": 7.236526525446237e-06, + "loss": 1.6061, + "step": 28500 + }, + { + "epoch": 0.8296244582133403, + "grad_norm": 12.0625, + "learning_rate": 7.234587243773622e-06, + "loss": 1.6719, + "step": 28520 + }, + { + "epoch": 0.8302062425458883, + "grad_norm": 13.0, + "learning_rate": 7.232647962101007e-06, + "loss": 1.5677, + "step": 28540 + }, + { + "epoch": 0.8307880268784361, + "grad_norm": 13.1875, + "learning_rate": 7.230708680428392e-06, + "loss": 1.6761, + "step": 28560 + }, + { + "epoch": 0.8313698112109841, + "grad_norm": 15.5, + "learning_rate": 7.228769398755777e-06, + "loss": 1.6747, + "step": 28580 + }, + { + "epoch": 0.831951595543532, + "grad_norm": 11.375, + "learning_rate": 7.226830117083162e-06, + "loss": 1.683, + "step": 28600 + }, + { + "epoch": 0.8325333798760799, + "grad_norm": 12.4375, + "learning_rate": 7.2248908354105475e-06, + "loss": 1.6348, + "step": 28620 + }, + { + "epoch": 0.8331151642086279, + "grad_norm": 21.625, + "learning_rate": 7.222951553737933e-06, + "loss": 1.5586, + "step": 28640 + }, + { + "epoch": 0.8336969485411758, + "grad_norm": 11.5, + "learning_rate": 7.221012272065318e-06, + "loss": 1.6519, + "step": 28660 + }, + { + "epoch": 0.8342787328737237, + "grad_norm": 11.0, + "learning_rate": 7.219072990392703e-06, + "loss": 1.6271, + "step": 28680 + }, + { + "epoch": 0.8348605172062716, + "grad_norm": 12.375, + "learning_rate": 7.217133708720088e-06, + "loss": 1.6389, + "step": 28700 + }, + { + "epoch": 0.8354423015388196, + "grad_norm": 11.9375, + "learning_rate": 7.215194427047473e-06, + "loss": 1.7004, + "step": 28720 + }, + { + "epoch": 0.8360240858713675, + "grad_norm": 13.0625, + "learning_rate": 7.213255145374858e-06, + "loss": 1.6365, + "step": 28740 + }, + { + "epoch": 0.8366058702039154, + "grad_norm": 12.9375, + "learning_rate": 7.211315863702243e-06, + "loss": 1.6538, + "step": 28760 + }, + { + "epoch": 0.8371876545364634, + "grad_norm": 10.625, + "learning_rate": 7.209376582029628e-06, + "loss": 1.6667, + "step": 28780 + }, + { + "epoch": 0.8377694388690112, + "grad_norm": 13.25, + "learning_rate": 7.207437300357013e-06, + "loss": 1.5666, + "step": 28800 + }, + { + "epoch": 0.8383512232015592, + "grad_norm": 14.4375, + "learning_rate": 7.2054980186843985e-06, + "loss": 1.6309, + "step": 28820 + }, + { + "epoch": 0.8389330075341072, + "grad_norm": 13.3125, + "learning_rate": 7.203558737011784e-06, + "loss": 1.622, + "step": 28840 + }, + { + "epoch": 0.839514791866655, + "grad_norm": 13.625, + "learning_rate": 7.201619455339169e-06, + "loss": 1.5475, + "step": 28860 + }, + { + "epoch": 0.840096576199203, + "grad_norm": 12.5, + "learning_rate": 7.199680173666554e-06, + "loss": 1.6515, + "step": 28880 + }, + { + "epoch": 0.8406783605317509, + "grad_norm": 10.25, + "learning_rate": 7.197740891993938e-06, + "loss": 1.5814, + "step": 28900 + }, + { + "epoch": 0.8412601448642988, + "grad_norm": 9.1875, + "learning_rate": 7.195801610321323e-06, + "loss": 1.6171, + "step": 28920 + }, + { + "epoch": 0.8418419291968467, + "grad_norm": 13.0, + "learning_rate": 7.193862328648708e-06, + "loss": 1.6931, + "step": 28940 + }, + { + "epoch": 0.8424237135293946, + "grad_norm": 13.875, + "learning_rate": 7.191923046976093e-06, + "loss": 1.6066, + "step": 28960 + }, + { + "epoch": 0.8430054978619426, + "grad_norm": 11.75, + "learning_rate": 7.189983765303478e-06, + "loss": 1.5293, + "step": 28980 + }, + { + "epoch": 0.8435872821944905, + "grad_norm": 13.5, + "learning_rate": 7.1880444836308635e-06, + "loss": 1.5974, + "step": 29000 + }, + { + "epoch": 0.8441690665270384, + "grad_norm": 12.25, + "learning_rate": 7.186105201958249e-06, + "loss": 1.6792, + "step": 29020 + }, + { + "epoch": 0.8447508508595863, + "grad_norm": 11.0625, + "learning_rate": 7.184165920285634e-06, + "loss": 1.6707, + "step": 29040 + }, + { + "epoch": 0.8453326351921343, + "grad_norm": 11.5, + "learning_rate": 7.182226638613018e-06, + "loss": 1.5823, + "step": 29060 + }, + { + "epoch": 0.8459144195246822, + "grad_norm": 14.875, + "learning_rate": 7.180287356940403e-06, + "loss": 1.6895, + "step": 29080 + }, + { + "epoch": 0.8464962038572301, + "grad_norm": 13.8125, + "learning_rate": 7.178348075267788e-06, + "loss": 1.5561, + "step": 29100 + }, + { + "epoch": 0.8470779881897781, + "grad_norm": 14.3125, + "learning_rate": 7.176408793595173e-06, + "loss": 1.6273, + "step": 29120 + }, + { + "epoch": 0.8476597725223259, + "grad_norm": 12.8125, + "learning_rate": 7.174469511922558e-06, + "loss": 1.6924, + "step": 29140 + }, + { + "epoch": 0.8482415568548739, + "grad_norm": 11.6875, + "learning_rate": 7.1725302302499434e-06, + "loss": 1.5519, + "step": 29160 + }, + { + "epoch": 0.8488233411874219, + "grad_norm": 12.0625, + "learning_rate": 7.1705909485773285e-06, + "loss": 1.7582, + "step": 29180 + }, + { + "epoch": 0.8494051255199697, + "grad_norm": 12.4375, + "learning_rate": 7.168651666904714e-06, + "loss": 1.6532, + "step": 29200 + }, + { + "epoch": 0.8499869098525177, + "grad_norm": 11.875, + "learning_rate": 7.166712385232099e-06, + "loss": 1.6266, + "step": 29220 + }, + { + "epoch": 0.8505686941850656, + "grad_norm": 13.1875, + "learning_rate": 7.164773103559484e-06, + "loss": 1.6186, + "step": 29240 + }, + { + "epoch": 0.8511504785176135, + "grad_norm": 11.0625, + "learning_rate": 7.162833821886869e-06, + "loss": 1.5315, + "step": 29260 + }, + { + "epoch": 0.8517322628501615, + "grad_norm": 12.375, + "learning_rate": 7.160894540214254e-06, + "loss": 1.5611, + "step": 29280 + }, + { + "epoch": 0.8523140471827094, + "grad_norm": 12.125, + "learning_rate": 7.158955258541639e-06, + "loss": 1.56, + "step": 29300 + }, + { + "epoch": 0.8528958315152573, + "grad_norm": 12.5625, + "learning_rate": 7.157015976869024e-06, + "loss": 1.6365, + "step": 29320 + }, + { + "epoch": 0.8534776158478052, + "grad_norm": 13.5, + "learning_rate": 7.155076695196409e-06, + "loss": 1.7178, + "step": 29340 + }, + { + "epoch": 0.8540594001803531, + "grad_norm": 15.1875, + "learning_rate": 7.153137413523794e-06, + "loss": 1.7602, + "step": 29360 + }, + { + "epoch": 0.854641184512901, + "grad_norm": 12.6875, + "learning_rate": 7.1511981318511795e-06, + "loss": 1.6393, + "step": 29380 + }, + { + "epoch": 0.855222968845449, + "grad_norm": 10.6875, + "learning_rate": 7.149258850178565e-06, + "loss": 1.7082, + "step": 29400 + }, + { + "epoch": 0.8558047531779969, + "grad_norm": 11.75, + "learning_rate": 7.14731956850595e-06, + "loss": 1.6048, + "step": 29420 + }, + { + "epoch": 0.8563865375105448, + "grad_norm": 15.1875, + "learning_rate": 7.145380286833335e-06, + "loss": 1.601, + "step": 29440 + }, + { + "epoch": 0.8569683218430928, + "grad_norm": 12.125, + "learning_rate": 7.143441005160719e-06, + "loss": 1.6411, + "step": 29460 + }, + { + "epoch": 0.8575501061756406, + "grad_norm": 14.9375, + "learning_rate": 7.141501723488104e-06, + "loss": 1.6465, + "step": 29480 + }, + { + "epoch": 0.8581318905081886, + "grad_norm": 12.4375, + "learning_rate": 7.139562441815489e-06, + "loss": 1.6785, + "step": 29500 + }, + { + "epoch": 0.8587136748407366, + "grad_norm": 12.5625, + "learning_rate": 7.137623160142874e-06, + "loss": 1.6321, + "step": 29520 + }, + { + "epoch": 0.8592954591732844, + "grad_norm": 12.9375, + "learning_rate": 7.1356838784702595e-06, + "loss": 1.6339, + "step": 29540 + }, + { + "epoch": 0.8598772435058324, + "grad_norm": 14.375, + "learning_rate": 7.1337445967976445e-06, + "loss": 1.6174, + "step": 29560 + }, + { + "epoch": 0.8604590278383804, + "grad_norm": 11.5, + "learning_rate": 7.13180531512503e-06, + "loss": 1.5598, + "step": 29580 + }, + { + "epoch": 0.8610408121709282, + "grad_norm": 8.625, + "learning_rate": 7.129866033452415e-06, + "loss": 1.6395, + "step": 29600 + }, + { + "epoch": 0.8616225965034762, + "grad_norm": 14.0625, + "learning_rate": 7.1279267517798e-06, + "loss": 1.779, + "step": 29620 + }, + { + "epoch": 0.8622043808360241, + "grad_norm": 12.375, + "learning_rate": 7.125987470107185e-06, + "loss": 1.6451, + "step": 29640 + }, + { + "epoch": 0.862786165168572, + "grad_norm": 14.0625, + "learning_rate": 7.12404818843457e-06, + "loss": 1.523, + "step": 29660 + }, + { + "epoch": 0.86336794950112, + "grad_norm": 12.4375, + "learning_rate": 7.122108906761955e-06, + "loss": 1.6644, + "step": 29680 + }, + { + "epoch": 0.8639497338336679, + "grad_norm": 13.75, + "learning_rate": 7.12016962508934e-06, + "loss": 1.6535, + "step": 29700 + }, + { + "epoch": 0.8645315181662158, + "grad_norm": 11.8125, + "learning_rate": 7.118230343416725e-06, + "loss": 1.6443, + "step": 29720 + }, + { + "epoch": 0.8651133024987637, + "grad_norm": 13.75, + "learning_rate": 7.1162910617441104e-06, + "loss": 1.5852, + "step": 29740 + }, + { + "epoch": 0.8656950868313116, + "grad_norm": 9.8125, + "learning_rate": 7.1143517800714955e-06, + "loss": 1.5577, + "step": 29760 + }, + { + "epoch": 0.8662768711638595, + "grad_norm": 14.8125, + "learning_rate": 7.112412498398881e-06, + "loss": 1.5347, + "step": 29780 + }, + { + "epoch": 0.8668586554964075, + "grad_norm": 11.1875, + "learning_rate": 7.110473216726266e-06, + "loss": 1.6938, + "step": 29800 + }, + { + "epoch": 0.8674404398289554, + "grad_norm": 12.1875, + "learning_rate": 7.108533935053651e-06, + "loss": 1.5838, + "step": 29820 + }, + { + "epoch": 0.8680222241615033, + "grad_norm": 12.1875, + "learning_rate": 7.106594653381036e-06, + "loss": 1.654, + "step": 29840 + }, + { + "epoch": 0.8686040084940513, + "grad_norm": 10.8125, + "learning_rate": 7.104655371708421e-06, + "loss": 1.6771, + "step": 29860 + }, + { + "epoch": 0.8691857928265991, + "grad_norm": 14.25, + "learning_rate": 7.102716090035806e-06, + "loss": 1.7066, + "step": 29880 + }, + { + "epoch": 0.8697675771591471, + "grad_norm": 12.1875, + "learning_rate": 7.100776808363191e-06, + "loss": 1.6345, + "step": 29900 + }, + { + "epoch": 0.8703493614916951, + "grad_norm": 13.375, + "learning_rate": 7.098837526690576e-06, + "loss": 1.5612, + "step": 29920 + }, + { + "epoch": 0.8709311458242429, + "grad_norm": 12.0625, + "learning_rate": 7.096898245017961e-06, + "loss": 1.7013, + "step": 29940 + }, + { + "epoch": 0.8715129301567909, + "grad_norm": 13.8125, + "learning_rate": 7.0949589633453465e-06, + "loss": 1.6588, + "step": 29960 + }, + { + "epoch": 0.8720947144893388, + "grad_norm": 11.125, + "learning_rate": 7.093019681672732e-06, + "loss": 1.6883, + "step": 29980 + }, + { + "epoch": 0.8726764988218867, + "grad_norm": 14.875, + "learning_rate": 7.091080400000117e-06, + "loss": 1.657, + "step": 30000 + }, + { + "epoch": 0.8732582831544347, + "grad_norm": 10.375, + "learning_rate": 7.089141118327502e-06, + "loss": 1.6111, + "step": 30020 + }, + { + "epoch": 0.8738400674869826, + "grad_norm": 10.5625, + "learning_rate": 7.087201836654887e-06, + "loss": 1.6375, + "step": 30040 + }, + { + "epoch": 0.8744218518195305, + "grad_norm": 11.0625, + "learning_rate": 7.085262554982272e-06, + "loss": 1.5596, + "step": 30060 + }, + { + "epoch": 0.8750036361520784, + "grad_norm": 9.6875, + "learning_rate": 7.083323273309657e-06, + "loss": 1.6095, + "step": 30080 + }, + { + "epoch": 0.8755854204846264, + "grad_norm": 14.4375, + "learning_rate": 7.081383991637042e-06, + "loss": 1.5589, + "step": 30100 + }, + { + "epoch": 0.8761672048171742, + "grad_norm": 12.125, + "learning_rate": 7.079444709964427e-06, + "loss": 1.5975, + "step": 30120 + }, + { + "epoch": 0.8767489891497222, + "grad_norm": 12.375, + "learning_rate": 7.077505428291812e-06, + "loss": 1.6044, + "step": 30140 + }, + { + "epoch": 0.8773307734822702, + "grad_norm": 10.6875, + "learning_rate": 7.0755661466191975e-06, + "loss": 1.6167, + "step": 30160 + }, + { + "epoch": 0.877912557814818, + "grad_norm": 12.0, + "learning_rate": 7.0736268649465826e-06, + "loss": 1.6748, + "step": 30180 + }, + { + "epoch": 0.878494342147366, + "grad_norm": 10.75, + "learning_rate": 7.071687583273968e-06, + "loss": 1.651, + "step": 30200 + }, + { + "epoch": 0.8790761264799138, + "grad_norm": 13.25, + "learning_rate": 7.069748301601353e-06, + "loss": 1.7351, + "step": 30220 + }, + { + "epoch": 0.8796579108124618, + "grad_norm": 12.5625, + "learning_rate": 7.067809019928738e-06, + "loss": 1.5913, + "step": 30240 + }, + { + "epoch": 0.8802396951450098, + "grad_norm": 16.5, + "learning_rate": 7.065869738256123e-06, + "loss": 1.5976, + "step": 30260 + }, + { + "epoch": 0.8808214794775576, + "grad_norm": 10.5625, + "learning_rate": 7.063930456583507e-06, + "loss": 1.712, + "step": 30280 + }, + { + "epoch": 0.8814032638101056, + "grad_norm": 8.625, + "learning_rate": 7.061991174910892e-06, + "loss": 1.7202, + "step": 30300 + }, + { + "epoch": 0.8819850481426535, + "grad_norm": 11.875, + "learning_rate": 7.060051893238277e-06, + "loss": 1.6646, + "step": 30320 + }, + { + "epoch": 0.8825668324752014, + "grad_norm": 11.4375, + "learning_rate": 7.0581126115656625e-06, + "loss": 1.6245, + "step": 30340 + }, + { + "epoch": 0.8831486168077494, + "grad_norm": 13.75, + "learning_rate": 7.056173329893048e-06, + "loss": 1.6251, + "step": 30360 + }, + { + "epoch": 0.8837304011402973, + "grad_norm": 12.625, + "learning_rate": 7.054234048220433e-06, + "loss": 1.6073, + "step": 30380 + }, + { + "epoch": 0.8843121854728452, + "grad_norm": 12.875, + "learning_rate": 7.052294766547818e-06, + "loss": 1.6597, + "step": 30400 + }, + { + "epoch": 0.8848939698053931, + "grad_norm": 13.875, + "learning_rate": 7.050355484875203e-06, + "loss": 1.5641, + "step": 30420 + }, + { + "epoch": 0.8854757541379411, + "grad_norm": 13.0625, + "learning_rate": 7.048416203202588e-06, + "loss": 1.5914, + "step": 30440 + }, + { + "epoch": 0.886057538470489, + "grad_norm": 11.875, + "learning_rate": 7.046476921529973e-06, + "loss": 1.6748, + "step": 30460 + }, + { + "epoch": 0.8866393228030369, + "grad_norm": 12.0625, + "learning_rate": 7.044537639857358e-06, + "loss": 1.6939, + "step": 30480 + }, + { + "epoch": 0.8872211071355849, + "grad_norm": 11.0, + "learning_rate": 7.042598358184743e-06, + "loss": 1.5961, + "step": 30500 + }, + { + "epoch": 0.8878028914681327, + "grad_norm": 12.625, + "learning_rate": 7.040659076512128e-06, + "loss": 1.6494, + "step": 30520 + }, + { + "epoch": 0.8883846758006807, + "grad_norm": 11.25, + "learning_rate": 7.0387197948395135e-06, + "loss": 1.6075, + "step": 30540 + }, + { + "epoch": 0.8889664601332287, + "grad_norm": 10.875, + "learning_rate": 7.0367805131668986e-06, + "loss": 1.6193, + "step": 30560 + }, + { + "epoch": 0.8895482444657765, + "grad_norm": 15.5625, + "learning_rate": 7.034841231494284e-06, + "loss": 1.5741, + "step": 30580 + }, + { + "epoch": 0.8901300287983245, + "grad_norm": 13.375, + "learning_rate": 7.032901949821669e-06, + "loss": 1.6317, + "step": 30600 + }, + { + "epoch": 0.8907118131308723, + "grad_norm": 12.1875, + "learning_rate": 7.030962668149054e-06, + "loss": 1.6391, + "step": 30620 + }, + { + "epoch": 0.8912935974634203, + "grad_norm": 12.25, + "learning_rate": 7.029023386476439e-06, + "loss": 1.6586, + "step": 30640 + }, + { + "epoch": 0.8918753817959683, + "grad_norm": 11.75, + "learning_rate": 7.027084104803824e-06, + "loss": 1.7167, + "step": 30660 + }, + { + "epoch": 0.8924571661285161, + "grad_norm": 14.0625, + "learning_rate": 7.025144823131209e-06, + "loss": 1.6195, + "step": 30680 + }, + { + "epoch": 0.8930389504610641, + "grad_norm": 11.1875, + "learning_rate": 7.023205541458594e-06, + "loss": 1.6202, + "step": 30700 + }, + { + "epoch": 0.893620734793612, + "grad_norm": 11.1875, + "learning_rate": 7.021266259785979e-06, + "loss": 1.7246, + "step": 30720 + }, + { + "epoch": 0.8942025191261599, + "grad_norm": 14.5, + "learning_rate": 7.0193269781133645e-06, + "loss": 1.6541, + "step": 30740 + }, + { + "epoch": 0.8947843034587079, + "grad_norm": 14.125, + "learning_rate": 7.0173876964407496e-06, + "loss": 1.7151, + "step": 30760 + }, + { + "epoch": 0.8953660877912558, + "grad_norm": 12.5625, + "learning_rate": 7.015448414768135e-06, + "loss": 1.6388, + "step": 30780 + }, + { + "epoch": 0.8959478721238037, + "grad_norm": 12.9375, + "learning_rate": 7.01350913309552e-06, + "loss": 1.6886, + "step": 30800 + }, + { + "epoch": 0.8965296564563516, + "grad_norm": 14.5, + "learning_rate": 7.011569851422905e-06, + "loss": 1.7274, + "step": 30820 + }, + { + "epoch": 0.8971114407888996, + "grad_norm": 11.375, + "learning_rate": 7.00963056975029e-06, + "loss": 1.5491, + "step": 30840 + }, + { + "epoch": 0.8976932251214474, + "grad_norm": 11.75, + "learning_rate": 7.007691288077675e-06, + "loss": 1.7166, + "step": 30860 + }, + { + "epoch": 0.8982750094539954, + "grad_norm": 11.0, + "learning_rate": 7.00575200640506e-06, + "loss": 1.4878, + "step": 30880 + }, + { + "epoch": 0.8988567937865434, + "grad_norm": 14.5, + "learning_rate": 7.003812724732445e-06, + "loss": 1.4887, + "step": 30900 + }, + { + "epoch": 0.8994385781190912, + "grad_norm": 12.375, + "learning_rate": 7.00187344305983e-06, + "loss": 1.6791, + "step": 30920 + }, + { + "epoch": 0.9000203624516392, + "grad_norm": 14.0, + "learning_rate": 6.9999341613872154e-06, + "loss": 1.6454, + "step": 30940 + }, + { + "epoch": 0.9006021467841872, + "grad_norm": 12.375, + "learning_rate": 6.9979948797146005e-06, + "loss": 1.5783, + "step": 30960 + }, + { + "epoch": 0.901183931116735, + "grad_norm": 13.5625, + "learning_rate": 6.996055598041986e-06, + "loss": 1.6508, + "step": 30980 + }, + { + "epoch": 0.901765715449283, + "grad_norm": 15.4375, + "learning_rate": 6.994116316369371e-06, + "loss": 1.6058, + "step": 31000 + }, + { + "epoch": 0.9023474997818309, + "grad_norm": 11.4375, + "learning_rate": 6.992177034696756e-06, + "loss": 1.6019, + "step": 31020 + }, + { + "epoch": 0.9029292841143788, + "grad_norm": 11.125, + "learning_rate": 6.990237753024141e-06, + "loss": 1.7452, + "step": 31040 + }, + { + "epoch": 0.9035110684469267, + "grad_norm": 11.6875, + "learning_rate": 6.988298471351526e-06, + "loss": 1.6808, + "step": 31060 + }, + { + "epoch": 0.9040928527794746, + "grad_norm": 13.125, + "learning_rate": 6.986359189678911e-06, + "loss": 1.6858, + "step": 31080 + }, + { + "epoch": 0.9046746371120226, + "grad_norm": 13.625, + "learning_rate": 6.984419908006295e-06, + "loss": 1.7046, + "step": 31100 + }, + { + "epoch": 0.9052564214445705, + "grad_norm": 12.1875, + "learning_rate": 6.9824806263336805e-06, + "loss": 1.6436, + "step": 31120 + }, + { + "epoch": 0.9058382057771184, + "grad_norm": 11.0, + "learning_rate": 6.9805413446610656e-06, + "loss": 1.7363, + "step": 31140 + }, + { + "epoch": 0.9064199901096663, + "grad_norm": 12.125, + "learning_rate": 6.978602062988451e-06, + "loss": 1.6176, + "step": 31160 + }, + { + "epoch": 0.9070017744422143, + "grad_norm": 12.3125, + "learning_rate": 6.976662781315836e-06, + "loss": 1.6335, + "step": 31180 + }, + { + "epoch": 0.9075835587747622, + "grad_norm": 11.8125, + "learning_rate": 6.974723499643221e-06, + "loss": 1.5611, + "step": 31200 + }, + { + "epoch": 0.9081653431073101, + "grad_norm": 10.6875, + "learning_rate": 6.972784217970606e-06, + "loss": 1.6538, + "step": 31220 + }, + { + "epoch": 0.9087471274398581, + "grad_norm": 12.3125, + "learning_rate": 6.970844936297991e-06, + "loss": 1.6281, + "step": 31240 + }, + { + "epoch": 0.9093289117724059, + "grad_norm": 12.1875, + "learning_rate": 6.968905654625376e-06, + "loss": 1.7004, + "step": 31260 + }, + { + "epoch": 0.9099106961049539, + "grad_norm": 12.6875, + "learning_rate": 6.966966372952761e-06, + "loss": 1.7177, + "step": 31280 + }, + { + "epoch": 0.9104924804375019, + "grad_norm": 11.1875, + "learning_rate": 6.965027091280146e-06, + "loss": 1.591, + "step": 31300 + }, + { + "epoch": 0.9110742647700497, + "grad_norm": 8.375, + "learning_rate": 6.9630878096075314e-06, + "loss": 1.604, + "step": 31320 + }, + { + "epoch": 0.9116560491025977, + "grad_norm": 12.125, + "learning_rate": 6.9611485279349165e-06, + "loss": 1.5641, + "step": 31340 + }, + { + "epoch": 0.9122378334351456, + "grad_norm": 14.125, + "learning_rate": 6.959209246262302e-06, + "loss": 1.6305, + "step": 31360 + }, + { + "epoch": 0.9128196177676935, + "grad_norm": 12.375, + "learning_rate": 6.957269964589687e-06, + "loss": 1.6007, + "step": 31380 + }, + { + "epoch": 0.9134014021002415, + "grad_norm": 14.0625, + "learning_rate": 6.955330682917072e-06, + "loss": 1.7345, + "step": 31400 + }, + { + "epoch": 0.9139831864327894, + "grad_norm": 14.3125, + "learning_rate": 6.953391401244457e-06, + "loss": 1.6524, + "step": 31420 + }, + { + "epoch": 0.9145649707653373, + "grad_norm": 12.875, + "learning_rate": 6.951452119571842e-06, + "loss": 1.5854, + "step": 31440 + }, + { + "epoch": 0.9151467550978852, + "grad_norm": 10.9375, + "learning_rate": 6.949512837899227e-06, + "loss": 1.5793, + "step": 31460 + }, + { + "epoch": 0.9157285394304331, + "grad_norm": 11.625, + "learning_rate": 6.947573556226612e-06, + "loss": 1.6801, + "step": 31480 + }, + { + "epoch": 0.916310323762981, + "grad_norm": 9.5, + "learning_rate": 6.945634274553997e-06, + "loss": 1.64, + "step": 31500 + }, + { + "epoch": 0.916892108095529, + "grad_norm": 11.8125, + "learning_rate": 6.943694992881382e-06, + "loss": 1.5614, + "step": 31520 + }, + { + "epoch": 0.9174738924280769, + "grad_norm": 14.1875, + "learning_rate": 6.9417557112087675e-06, + "loss": 1.6406, + "step": 31540 + }, + { + "epoch": 0.9180556767606248, + "grad_norm": 14.1875, + "learning_rate": 6.939816429536153e-06, + "loss": 1.6144, + "step": 31560 + }, + { + "epoch": 0.9186374610931728, + "grad_norm": 13.875, + "learning_rate": 6.937877147863538e-06, + "loss": 1.6121, + "step": 31580 + }, + { + "epoch": 0.9192192454257206, + "grad_norm": 11.5, + "learning_rate": 6.935937866190923e-06, + "loss": 1.5784, + "step": 31600 + }, + { + "epoch": 0.9198010297582686, + "grad_norm": 15.5, + "learning_rate": 6.933998584518308e-06, + "loss": 1.506, + "step": 31620 + }, + { + "epoch": 0.9203828140908166, + "grad_norm": 11.9375, + "learning_rate": 6.932059302845693e-06, + "loss": 1.6829, + "step": 31640 + }, + { + "epoch": 0.9209645984233644, + "grad_norm": 14.625, + "learning_rate": 6.930120021173078e-06, + "loss": 1.7056, + "step": 31660 + }, + { + "epoch": 0.9215463827559124, + "grad_norm": 11.9375, + "learning_rate": 6.928180739500463e-06, + "loss": 1.6337, + "step": 31680 + }, + { + "epoch": 0.9221281670884603, + "grad_norm": 10.0, + "learning_rate": 6.926241457827848e-06, + "loss": 1.6357, + "step": 31700 + }, + { + "epoch": 0.9227099514210082, + "grad_norm": 12.5, + "learning_rate": 6.924302176155233e-06, + "loss": 1.6074, + "step": 31720 + }, + { + "epoch": 0.9232917357535562, + "grad_norm": 12.625, + "learning_rate": 6.9223628944826185e-06, + "loss": 1.7342, + "step": 31740 + }, + { + "epoch": 0.9238735200861041, + "grad_norm": 14.0, + "learning_rate": 6.920423612810004e-06, + "loss": 1.712, + "step": 31760 + }, + { + "epoch": 0.924455304418652, + "grad_norm": 24.375, + "learning_rate": 6.918484331137389e-06, + "loss": 1.6576, + "step": 31780 + }, + { + "epoch": 0.9250370887511999, + "grad_norm": 10.875, + "learning_rate": 6.916545049464774e-06, + "loss": 1.6502, + "step": 31800 + }, + { + "epoch": 0.9256188730837479, + "grad_norm": 10.625, + "learning_rate": 6.914605767792159e-06, + "loss": 1.5984, + "step": 31820 + }, + { + "epoch": 0.9262006574162958, + "grad_norm": 11.5, + "learning_rate": 6.912666486119544e-06, + "loss": 1.5611, + "step": 31840 + }, + { + "epoch": 0.9267824417488437, + "grad_norm": 12.5625, + "learning_rate": 6.910727204446929e-06, + "loss": 1.5556, + "step": 31860 + }, + { + "epoch": 0.9273642260813916, + "grad_norm": 12.0, + "learning_rate": 6.908787922774314e-06, + "loss": 1.591, + "step": 31880 + }, + { + "epoch": 0.9279460104139395, + "grad_norm": 13.4375, + "learning_rate": 6.906848641101699e-06, + "loss": 1.6348, + "step": 31900 + }, + { + "epoch": 0.9285277947464875, + "grad_norm": 12.25, + "learning_rate": 6.9049093594290835e-06, + "loss": 1.6385, + "step": 31920 + }, + { + "epoch": 0.9291095790790354, + "grad_norm": 12.5625, + "learning_rate": 6.902970077756469e-06, + "loss": 1.7238, + "step": 31940 + }, + { + "epoch": 0.9296913634115833, + "grad_norm": 12.8125, + "learning_rate": 6.901030796083854e-06, + "loss": 1.5489, + "step": 31960 + }, + { + "epoch": 0.9302731477441313, + "grad_norm": 13.75, + "learning_rate": 6.899091514411239e-06, + "loss": 1.647, + "step": 31980 + }, + { + "epoch": 0.9308549320766791, + "grad_norm": 13.75, + "learning_rate": 6.897152232738624e-06, + "loss": 1.6397, + "step": 32000 + }, + { + "epoch": 0.9314367164092271, + "grad_norm": 13.25, + "learning_rate": 6.895212951066009e-06, + "loss": 1.6137, + "step": 32020 + }, + { + "epoch": 0.9320185007417751, + "grad_norm": 10.0, + "learning_rate": 6.893273669393394e-06, + "loss": 1.6826, + "step": 32040 + }, + { + "epoch": 0.9326002850743229, + "grad_norm": 13.9375, + "learning_rate": 6.891334387720779e-06, + "loss": 1.6708, + "step": 32060 + }, + { + "epoch": 0.9331820694068709, + "grad_norm": 10.9375, + "learning_rate": 6.889395106048164e-06, + "loss": 1.5909, + "step": 32080 + }, + { + "epoch": 0.9337638537394188, + "grad_norm": 13.0, + "learning_rate": 6.887455824375549e-06, + "loss": 1.5761, + "step": 32100 + }, + { + "epoch": 0.9343456380719667, + "grad_norm": 11.0625, + "learning_rate": 6.8855165427029345e-06, + "loss": 1.642, + "step": 32120 + }, + { + "epoch": 0.9349274224045147, + "grad_norm": 11.625, + "learning_rate": 6.88357726103032e-06, + "loss": 1.6688, + "step": 32140 + }, + { + "epoch": 0.9355092067370626, + "grad_norm": 13.125, + "learning_rate": 6.881637979357705e-06, + "loss": 1.6207, + "step": 32160 + }, + { + "epoch": 0.9360909910696105, + "grad_norm": 11.125, + "learning_rate": 6.87969869768509e-06, + "loss": 1.6017, + "step": 32180 + }, + { + "epoch": 0.9366727754021584, + "grad_norm": 11.625, + "learning_rate": 6.877759416012475e-06, + "loss": 1.6321, + "step": 32200 + }, + { + "epoch": 0.9372545597347064, + "grad_norm": 12.875, + "learning_rate": 6.87582013433986e-06, + "loss": 1.6127, + "step": 32220 + }, + { + "epoch": 0.9378363440672542, + "grad_norm": 14.875, + "learning_rate": 6.873880852667245e-06, + "loss": 1.7372, + "step": 32240 + }, + { + "epoch": 0.9384181283998022, + "grad_norm": 15.9375, + "learning_rate": 6.87194157099463e-06, + "loss": 1.6319, + "step": 32260 + }, + { + "epoch": 0.9389999127323502, + "grad_norm": 13.625, + "learning_rate": 6.870002289322015e-06, + "loss": 1.5551, + "step": 32280 + }, + { + "epoch": 0.939581697064898, + "grad_norm": 10.6875, + "learning_rate": 6.8680630076494e-06, + "loss": 1.6711, + "step": 32300 + }, + { + "epoch": 0.940163481397446, + "grad_norm": 12.1875, + "learning_rate": 6.8661237259767855e-06, + "loss": 1.5174, + "step": 32320 + }, + { + "epoch": 0.9407452657299938, + "grad_norm": 12.3125, + "learning_rate": 6.8641844443041706e-06, + "loss": 1.7003, + "step": 32340 + }, + { + "epoch": 0.9413270500625418, + "grad_norm": 12.0625, + "learning_rate": 6.862245162631556e-06, + "loss": 1.6688, + "step": 32360 + }, + { + "epoch": 0.9419088343950898, + "grad_norm": 11.8125, + "learning_rate": 6.860305880958941e-06, + "loss": 1.5936, + "step": 32380 + }, + { + "epoch": 0.9424906187276376, + "grad_norm": 10.0625, + "learning_rate": 6.858366599286326e-06, + "loss": 1.67, + "step": 32400 + }, + { + "epoch": 0.9430724030601856, + "grad_norm": 16.75, + "learning_rate": 6.856427317613711e-06, + "loss": 1.6339, + "step": 32420 + }, + { + "epoch": 0.9436541873927335, + "grad_norm": 13.875, + "learning_rate": 6.854488035941096e-06, + "loss": 1.6749, + "step": 32440 + }, + { + "epoch": 0.9442359717252814, + "grad_norm": 14.25, + "learning_rate": 6.852548754268481e-06, + "loss": 1.6833, + "step": 32460 + }, + { + "epoch": 0.9448177560578294, + "grad_norm": 11.1875, + "learning_rate": 6.850609472595866e-06, + "loss": 1.6442, + "step": 32480 + }, + { + "epoch": 0.9453995403903773, + "grad_norm": 12.875, + "learning_rate": 6.848670190923251e-06, + "loss": 1.5534, + "step": 32500 + }, + { + "epoch": 0.9459813247229252, + "grad_norm": 12.5, + "learning_rate": 6.8467309092506364e-06, + "loss": 1.5668, + "step": 32520 + }, + { + "epoch": 0.9465631090554731, + "grad_norm": 13.8125, + "learning_rate": 6.8447916275780215e-06, + "loss": 1.6358, + "step": 32540 + }, + { + "epoch": 0.9471448933880211, + "grad_norm": 13.5, + "learning_rate": 6.842852345905407e-06, + "loss": 1.6178, + "step": 32560 + }, + { + "epoch": 0.947726677720569, + "grad_norm": 12.9375, + "learning_rate": 6.840913064232792e-06, + "loss": 1.5604, + "step": 32580 + }, + { + "epoch": 0.9483084620531169, + "grad_norm": 12.3125, + "learning_rate": 6.838973782560177e-06, + "loss": 1.6273, + "step": 32600 + }, + { + "epoch": 0.9488902463856649, + "grad_norm": 11.25, + "learning_rate": 6.837034500887562e-06, + "loss": 1.629, + "step": 32620 + }, + { + "epoch": 0.9494720307182127, + "grad_norm": 11.1875, + "learning_rate": 6.835095219214947e-06, + "loss": 1.621, + "step": 32640 + }, + { + "epoch": 0.9500538150507607, + "grad_norm": 12.8125, + "learning_rate": 6.833155937542332e-06, + "loss": 1.6799, + "step": 32660 + }, + { + "epoch": 0.9506355993833087, + "grad_norm": 11.25, + "learning_rate": 6.831216655869717e-06, + "loss": 1.5411, + "step": 32680 + }, + { + "epoch": 0.9512173837158565, + "grad_norm": 12.1875, + "learning_rate": 6.829277374197102e-06, + "loss": 1.6182, + "step": 32700 + }, + { + "epoch": 0.9517991680484045, + "grad_norm": 11.1875, + "learning_rate": 6.8273380925244866e-06, + "loss": 1.5881, + "step": 32720 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 12.625, + "learning_rate": 6.825398810851872e-06, + "loss": 1.6739, + "step": 32740 + }, + { + "epoch": 0.9529627367135003, + "grad_norm": 12.125, + "learning_rate": 6.823459529179257e-06, + "loss": 1.6474, + "step": 32760 + }, + { + "epoch": 0.9535445210460483, + "grad_norm": 12.125, + "learning_rate": 6.821520247506642e-06, + "loss": 1.652, + "step": 32780 + }, + { + "epoch": 0.9541263053785961, + "grad_norm": 12.4375, + "learning_rate": 6.819580965834027e-06, + "loss": 1.6006, + "step": 32800 + }, + { + "epoch": 0.9547080897111441, + "grad_norm": 10.8125, + "learning_rate": 6.817641684161412e-06, + "loss": 1.5544, + "step": 32820 + }, + { + "epoch": 0.955289874043692, + "grad_norm": 11.1875, + "learning_rate": 6.815702402488797e-06, + "loss": 1.6969, + "step": 32840 + }, + { + "epoch": 0.9558716583762399, + "grad_norm": 12.25, + "learning_rate": 6.813763120816182e-06, + "loss": 1.5689, + "step": 32860 + }, + { + "epoch": 0.9564534427087878, + "grad_norm": 11.9375, + "learning_rate": 6.811823839143567e-06, + "loss": 1.6163, + "step": 32880 + }, + { + "epoch": 0.9570352270413358, + "grad_norm": 11.625, + "learning_rate": 6.8098845574709524e-06, + "loss": 1.5934, + "step": 32900 + }, + { + "epoch": 0.9576170113738837, + "grad_norm": 11.25, + "learning_rate": 6.8079452757983375e-06, + "loss": 1.6005, + "step": 32920 + }, + { + "epoch": 0.9581987957064316, + "grad_norm": 13.3125, + "learning_rate": 6.806005994125723e-06, + "loss": 1.6646, + "step": 32940 + }, + { + "epoch": 0.9587805800389796, + "grad_norm": 12.4375, + "learning_rate": 6.804066712453108e-06, + "loss": 1.5621, + "step": 32960 + }, + { + "epoch": 0.9593623643715274, + "grad_norm": 12.1875, + "learning_rate": 6.802127430780493e-06, + "loss": 1.5968, + "step": 32980 + }, + { + "epoch": 0.9599441487040754, + "grad_norm": 13.25, + "learning_rate": 6.800188149107878e-06, + "loss": 1.5637, + "step": 33000 + }, + { + "epoch": 0.9605259330366234, + "grad_norm": 16.125, + "learning_rate": 6.798248867435263e-06, + "loss": 1.5478, + "step": 33020 + }, + { + "epoch": 0.9611077173691712, + "grad_norm": 14.5, + "learning_rate": 6.796309585762648e-06, + "loss": 1.4745, + "step": 33040 + }, + { + "epoch": 0.9616895017017192, + "grad_norm": 11.0625, + "learning_rate": 6.794370304090033e-06, + "loss": 1.6644, + "step": 33060 + }, + { + "epoch": 0.9622712860342671, + "grad_norm": 12.3125, + "learning_rate": 6.792431022417418e-06, + "loss": 1.6051, + "step": 33080 + }, + { + "epoch": 0.962853070366815, + "grad_norm": 12.6875, + "learning_rate": 6.790491740744803e-06, + "loss": 1.5699, + "step": 33100 + }, + { + "epoch": 0.963434854699363, + "grad_norm": 12.8125, + "learning_rate": 6.7885524590721885e-06, + "loss": 1.6223, + "step": 33120 + }, + { + "epoch": 0.9640166390319108, + "grad_norm": 12.0, + "learning_rate": 6.786613177399574e-06, + "loss": 1.6234, + "step": 33140 + }, + { + "epoch": 0.9645984233644588, + "grad_norm": 12.625, + "learning_rate": 6.784673895726959e-06, + "loss": 1.6072, + "step": 33160 + }, + { + "epoch": 0.9651802076970067, + "grad_norm": 12.375, + "learning_rate": 6.782734614054344e-06, + "loss": 1.6224, + "step": 33180 + }, + { + "epoch": 0.9657619920295546, + "grad_norm": 12.0625, + "learning_rate": 6.780795332381729e-06, + "loss": 1.6087, + "step": 33200 + }, + { + "epoch": 0.9663437763621026, + "grad_norm": 12.9375, + "learning_rate": 6.778856050709114e-06, + "loss": 1.6673, + "step": 33220 + }, + { + "epoch": 0.9669255606946505, + "grad_norm": 14.0, + "learning_rate": 6.776916769036499e-06, + "loss": 1.6716, + "step": 33240 + }, + { + "epoch": 0.9675073450271984, + "grad_norm": 11.4375, + "learning_rate": 6.774977487363884e-06, + "loss": 1.6317, + "step": 33260 + }, + { + "epoch": 0.9680891293597463, + "grad_norm": 11.875, + "learning_rate": 6.773038205691269e-06, + "loss": 1.6376, + "step": 33280 + }, + { + "epoch": 0.9686709136922943, + "grad_norm": 13.75, + "learning_rate": 6.771098924018654e-06, + "loss": 1.5793, + "step": 33300 + }, + { + "epoch": 0.9692526980248422, + "grad_norm": 11.6875, + "learning_rate": 6.769159642346038e-06, + "loss": 1.5495, + "step": 33320 + }, + { + "epoch": 0.9698344823573901, + "grad_norm": 14.1875, + "learning_rate": 6.767220360673423e-06, + "loss": 1.6489, + "step": 33340 + }, + { + "epoch": 0.9704162666899381, + "grad_norm": 13.5, + "learning_rate": 6.765281079000808e-06, + "loss": 1.6244, + "step": 33360 + }, + { + "epoch": 0.9709980510224859, + "grad_norm": 12.1875, + "learning_rate": 6.763341797328193e-06, + "loss": 1.6047, + "step": 33380 + }, + { + "epoch": 0.9715798353550339, + "grad_norm": 14.125, + "learning_rate": 6.761402515655578e-06, + "loss": 1.6346, + "step": 33400 + }, + { + "epoch": 0.9721616196875819, + "grad_norm": 11.75, + "learning_rate": 6.759463233982963e-06, + "loss": 1.5593, + "step": 33420 + }, + { + "epoch": 0.9727434040201297, + "grad_norm": 13.4375, + "learning_rate": 6.757523952310348e-06, + "loss": 1.4955, + "step": 33440 + }, + { + "epoch": 0.9733251883526777, + "grad_norm": 13.1875, + "learning_rate": 6.7555846706377335e-06, + "loss": 1.583, + "step": 33460 + }, + { + "epoch": 0.9739069726852256, + "grad_norm": 13.625, + "learning_rate": 6.7536453889651186e-06, + "loss": 1.6102, + "step": 33480 + }, + { + "epoch": 0.9744887570177735, + "grad_norm": 12.125, + "learning_rate": 6.751706107292504e-06, + "loss": 1.6518, + "step": 33500 + }, + { + "epoch": 0.9750705413503215, + "grad_norm": 12.625, + "learning_rate": 6.749766825619889e-06, + "loss": 1.6665, + "step": 33520 + }, + { + "epoch": 0.9756523256828694, + "grad_norm": 12.9375, + "learning_rate": 6.747827543947274e-06, + "loss": 1.7361, + "step": 33540 + }, + { + "epoch": 0.9762341100154173, + "grad_norm": 10.9375, + "learning_rate": 6.745888262274659e-06, + "loss": 1.6369, + "step": 33560 + }, + { + "epoch": 0.9768158943479652, + "grad_norm": 16.75, + "learning_rate": 6.743948980602044e-06, + "loss": 1.5685, + "step": 33580 + }, + { + "epoch": 0.9773976786805131, + "grad_norm": 12.9375, + "learning_rate": 6.742009698929429e-06, + "loss": 1.6494, + "step": 33600 + }, + { + "epoch": 0.977979463013061, + "grad_norm": 11.625, + "learning_rate": 6.740070417256814e-06, + "loss": 1.6524, + "step": 33620 + }, + { + "epoch": 0.978561247345609, + "grad_norm": 13.0625, + "learning_rate": 6.738131135584199e-06, + "loss": 1.5863, + "step": 33640 + }, + { + "epoch": 0.9791430316781569, + "grad_norm": 11.25, + "learning_rate": 6.7361918539115844e-06, + "loss": 1.6313, + "step": 33660 + }, + { + "epoch": 0.9797248160107048, + "grad_norm": 16.125, + "learning_rate": 6.7342525722389695e-06, + "loss": 1.6042, + "step": 33680 + }, + { + "epoch": 0.9803066003432528, + "grad_norm": 13.3125, + "learning_rate": 6.732313290566355e-06, + "loss": 1.5964, + "step": 33700 + }, + { + "epoch": 0.9808883846758006, + "grad_norm": 12.5625, + "learning_rate": 6.73037400889374e-06, + "loss": 1.5968, + "step": 33720 + }, + { + "epoch": 0.9814701690083486, + "grad_norm": 13.4375, + "learning_rate": 6.728434727221125e-06, + "loss": 1.6331, + "step": 33740 + }, + { + "epoch": 0.9820519533408966, + "grad_norm": 11.6875, + "learning_rate": 6.72649544554851e-06, + "loss": 1.6312, + "step": 33760 + }, + { + "epoch": 0.9826337376734444, + "grad_norm": 13.375, + "learning_rate": 6.724556163875895e-06, + "loss": 1.6851, + "step": 33780 + }, + { + "epoch": 0.9832155220059924, + "grad_norm": 14.25, + "learning_rate": 6.72261688220328e-06, + "loss": 1.6674, + "step": 33800 + }, + { + "epoch": 0.9837973063385403, + "grad_norm": 12.75, + "learning_rate": 6.720677600530665e-06, + "loss": 1.5453, + "step": 33820 + }, + { + "epoch": 0.9843790906710882, + "grad_norm": 12.25, + "learning_rate": 6.71873831885805e-06, + "loss": 1.7205, + "step": 33840 + }, + { + "epoch": 0.9849608750036362, + "grad_norm": 11.4375, + "learning_rate": 6.7167990371854354e-06, + "loss": 1.672, + "step": 33860 + }, + { + "epoch": 0.9855426593361841, + "grad_norm": 12.125, + "learning_rate": 6.7148597555128205e-06, + "loss": 1.5611, + "step": 33880 + }, + { + "epoch": 0.986124443668732, + "grad_norm": 11.75, + "learning_rate": 6.712920473840206e-06, + "loss": 1.6928, + "step": 33900 + }, + { + "epoch": 0.9867062280012799, + "grad_norm": 11.25, + "learning_rate": 6.710981192167591e-06, + "loss": 1.6471, + "step": 33920 + }, + { + "epoch": 0.9872880123338279, + "grad_norm": 13.5, + "learning_rate": 6.709041910494976e-06, + "loss": 1.6177, + "step": 33940 + }, + { + "epoch": 0.9878697966663758, + "grad_norm": 15.125, + "learning_rate": 6.707102628822361e-06, + "loss": 1.6374, + "step": 33960 + }, + { + "epoch": 0.9884515809989237, + "grad_norm": 13.6875, + "learning_rate": 6.705163347149746e-06, + "loss": 1.5153, + "step": 33980 + }, + { + "epoch": 0.9890333653314716, + "grad_norm": 13.0, + "learning_rate": 6.703224065477131e-06, + "loss": 1.6144, + "step": 34000 + }, + { + "epoch": 0.9896151496640195, + "grad_norm": 11.875, + "learning_rate": 6.701284783804516e-06, + "loss": 1.5558, + "step": 34020 + }, + { + "epoch": 0.9901969339965675, + "grad_norm": 10.6875, + "learning_rate": 6.699345502131901e-06, + "loss": 1.6159, + "step": 34040 + }, + { + "epoch": 0.9907787183291153, + "grad_norm": 14.4375, + "learning_rate": 6.697406220459286e-06, + "loss": 1.6065, + "step": 34060 + }, + { + "epoch": 0.9913605026616633, + "grad_norm": 13.5, + "learning_rate": 6.6954669387866715e-06, + "loss": 1.5929, + "step": 34080 + }, + { + "epoch": 0.9919422869942113, + "grad_norm": 11.8125, + "learning_rate": 6.693527657114056e-06, + "loss": 1.6878, + "step": 34100 + }, + { + "epoch": 0.9925240713267591, + "grad_norm": 12.3125, + "learning_rate": 6.691588375441441e-06, + "loss": 1.5968, + "step": 34120 + }, + { + "epoch": 0.9931058556593071, + "grad_norm": 13.8125, + "learning_rate": 6.689649093768826e-06, + "loss": 1.6993, + "step": 34140 + }, + { + "epoch": 0.993687639991855, + "grad_norm": 14.0, + "learning_rate": 6.687709812096211e-06, + "loss": 1.6291, + "step": 34160 + }, + { + "epoch": 0.9942694243244029, + "grad_norm": 14.5, + "learning_rate": 6.685770530423596e-06, + "loss": 1.5992, + "step": 34180 + }, + { + "epoch": 0.9948512086569509, + "grad_norm": 13.4375, + "learning_rate": 6.683831248750981e-06, + "loss": 1.6653, + "step": 34200 + }, + { + "epoch": 0.9954329929894988, + "grad_norm": 12.75, + "learning_rate": 6.681891967078366e-06, + "loss": 1.6667, + "step": 34220 + }, + { + "epoch": 0.9960147773220467, + "grad_norm": 13.1875, + "learning_rate": 6.6799526854057514e-06, + "loss": 1.6231, + "step": 34240 + }, + { + "epoch": 0.9965965616545946, + "grad_norm": 11.1875, + "learning_rate": 6.6780134037331365e-06, + "loss": 1.7432, + "step": 34260 + }, + { + "epoch": 0.9971783459871426, + "grad_norm": 13.875, + "learning_rate": 6.676074122060522e-06, + "loss": 1.6409, + "step": 34280 + }, + { + "epoch": 0.9977601303196905, + "grad_norm": 12.8125, + "learning_rate": 6.674134840387907e-06, + "loss": 1.6375, + "step": 34300 + }, + { + "epoch": 0.9983419146522384, + "grad_norm": 12.0, + "learning_rate": 6.672195558715292e-06, + "loss": 1.6822, + "step": 34320 + }, + { + "epoch": 0.9989236989847864, + "grad_norm": 15.5625, + "learning_rate": 6.670256277042677e-06, + "loss": 1.5642, + "step": 34340 + }, + { + "epoch": 0.9995054833173342, + "grad_norm": 9.75, + "learning_rate": 6.668316995370062e-06, + "loss": 1.7238, + "step": 34360 + }, + { + "epoch": 1.0000872676498822, + "grad_norm": 10.125, + "learning_rate": 6.666377713697447e-06, + "loss": 1.6699, + "step": 34380 + }, + { + "epoch": 1.0006690519824302, + "grad_norm": 12.5, + "learning_rate": 6.664438432024832e-06, + "loss": 1.5624, + "step": 34400 + }, + { + "epoch": 1.0012508363149781, + "grad_norm": 10.5, + "learning_rate": 6.662499150352217e-06, + "loss": 1.5326, + "step": 34420 + }, + { + "epoch": 1.0018326206475259, + "grad_norm": 14.5, + "learning_rate": 6.660559868679602e-06, + "loss": 1.5176, + "step": 34440 + }, + { + "epoch": 1.0024144049800738, + "grad_norm": 10.6875, + "learning_rate": 6.6586205870069875e-06, + "loss": 1.518, + "step": 34460 + }, + { + "epoch": 1.0029961893126218, + "grad_norm": 13.5625, + "learning_rate": 6.656681305334373e-06, + "loss": 1.519, + "step": 34480 + }, + { + "epoch": 1.0035779736451698, + "grad_norm": 11.625, + "learning_rate": 6.654742023661758e-06, + "loss": 1.5261, + "step": 34500 + }, + { + "epoch": 1.0041597579777177, + "grad_norm": 18.625, + "learning_rate": 6.652802741989143e-06, + "loss": 1.5199, + "step": 34520 + }, + { + "epoch": 1.0047415423102657, + "grad_norm": 14.0625, + "learning_rate": 6.650863460316528e-06, + "loss": 1.4781, + "step": 34540 + }, + { + "epoch": 1.0053233266428134, + "grad_norm": 12.5625, + "learning_rate": 6.648924178643913e-06, + "loss": 1.5739, + "step": 34560 + }, + { + "epoch": 1.0059051109753614, + "grad_norm": 13.9375, + "learning_rate": 6.646984896971298e-06, + "loss": 1.4837, + "step": 34580 + }, + { + "epoch": 1.0064868953079094, + "grad_norm": 12.5, + "learning_rate": 6.645045615298683e-06, + "loss": 1.546, + "step": 34600 + }, + { + "epoch": 1.0070686796404573, + "grad_norm": 14.9375, + "learning_rate": 6.643106333626068e-06, + "loss": 1.4559, + "step": 34620 + }, + { + "epoch": 1.0076504639730053, + "grad_norm": 10.75, + "learning_rate": 6.641167051953453e-06, + "loss": 1.5901, + "step": 34640 + }, + { + "epoch": 1.008232248305553, + "grad_norm": 13.375, + "learning_rate": 6.6392277702808385e-06, + "loss": 1.5853, + "step": 34660 + }, + { + "epoch": 1.008814032638101, + "grad_norm": 10.75, + "learning_rate": 6.6372884886082236e-06, + "loss": 1.5529, + "step": 34680 + }, + { + "epoch": 1.009395816970649, + "grad_norm": 13.4375, + "learning_rate": 6.635349206935609e-06, + "loss": 1.5142, + "step": 34700 + }, + { + "epoch": 1.009977601303197, + "grad_norm": 16.375, + "learning_rate": 6.633409925262994e-06, + "loss": 1.5687, + "step": 34720 + }, + { + "epoch": 1.0105593856357449, + "grad_norm": 15.0, + "learning_rate": 6.631470643590379e-06, + "loss": 1.5669, + "step": 34740 + }, + { + "epoch": 1.0111411699682928, + "grad_norm": 17.625, + "learning_rate": 6.629531361917764e-06, + "loss": 1.4651, + "step": 34760 + }, + { + "epoch": 1.0117229543008406, + "grad_norm": 11.125, + "learning_rate": 6.627592080245149e-06, + "loss": 1.4856, + "step": 34780 + }, + { + "epoch": 1.0123047386333885, + "grad_norm": 11.8125, + "learning_rate": 6.625652798572534e-06, + "loss": 1.6137, + "step": 34800 + }, + { + "epoch": 1.0128865229659365, + "grad_norm": 12.8125, + "learning_rate": 6.623713516899919e-06, + "loss": 1.5333, + "step": 34820 + }, + { + "epoch": 1.0134683072984845, + "grad_norm": 13.1875, + "learning_rate": 6.621774235227304e-06, + "loss": 1.4606, + "step": 34840 + }, + { + "epoch": 1.0140500916310324, + "grad_norm": 13.9375, + "learning_rate": 6.6198349535546895e-06, + "loss": 1.4368, + "step": 34860 + }, + { + "epoch": 1.0146318759635804, + "grad_norm": 12.5625, + "learning_rate": 6.6178956718820745e-06, + "loss": 1.4579, + "step": 34880 + }, + { + "epoch": 1.0152136602961281, + "grad_norm": 11.625, + "learning_rate": 6.61595639020946e-06, + "loss": 1.4373, + "step": 34900 + }, + { + "epoch": 1.015795444628676, + "grad_norm": 14.9375, + "learning_rate": 6.614017108536844e-06, + "loss": 1.5626, + "step": 34920 + }, + { + "epoch": 1.016377228961224, + "grad_norm": 12.25, + "learning_rate": 6.612077826864229e-06, + "loss": 1.5653, + "step": 34940 + }, + { + "epoch": 1.016959013293772, + "grad_norm": 13.0, + "learning_rate": 6.610138545191614e-06, + "loss": 1.54, + "step": 34960 + }, + { + "epoch": 1.01754079762632, + "grad_norm": 11.375, + "learning_rate": 6.608199263518999e-06, + "loss": 1.4087, + "step": 34980 + }, + { + "epoch": 1.018122581958868, + "grad_norm": 12.5, + "learning_rate": 6.606259981846384e-06, + "loss": 1.5305, + "step": 35000 + }, + { + "epoch": 1.0187043662914157, + "grad_norm": 12.75, + "learning_rate": 6.604320700173769e-06, + "loss": 1.5814, + "step": 35020 + }, + { + "epoch": 1.0192861506239637, + "grad_norm": 11.3125, + "learning_rate": 6.6023814185011545e-06, + "loss": 1.4229, + "step": 35040 + }, + { + "epoch": 1.0198679349565116, + "grad_norm": 11.625, + "learning_rate": 6.60044213682854e-06, + "loss": 1.5016, + "step": 35060 + }, + { + "epoch": 1.0204497192890596, + "grad_norm": 13.125, + "learning_rate": 6.598502855155925e-06, + "loss": 1.4902, + "step": 35080 + }, + { + "epoch": 1.0210315036216076, + "grad_norm": 11.4375, + "learning_rate": 6.59656357348331e-06, + "loss": 1.5648, + "step": 35100 + }, + { + "epoch": 1.0216132879541553, + "grad_norm": 13.0, + "learning_rate": 6.594624291810695e-06, + "loss": 1.5088, + "step": 35120 + }, + { + "epoch": 1.0221950722867033, + "grad_norm": 13.125, + "learning_rate": 6.59268501013808e-06, + "loss": 1.5048, + "step": 35140 + }, + { + "epoch": 1.0227768566192512, + "grad_norm": 12.8125, + "learning_rate": 6.590745728465465e-06, + "loss": 1.5046, + "step": 35160 + }, + { + "epoch": 1.0233586409517992, + "grad_norm": 12.375, + "learning_rate": 6.58880644679285e-06, + "loss": 1.5097, + "step": 35180 + }, + { + "epoch": 1.0239404252843471, + "grad_norm": 12.625, + "learning_rate": 6.586867165120235e-06, + "loss": 1.5104, + "step": 35200 + }, + { + "epoch": 1.024522209616895, + "grad_norm": 14.25, + "learning_rate": 6.58492788344762e-06, + "loss": 1.5456, + "step": 35220 + }, + { + "epoch": 1.0251039939494428, + "grad_norm": 12.625, + "learning_rate": 6.5829886017750055e-06, + "loss": 1.5434, + "step": 35240 + }, + { + "epoch": 1.0256857782819908, + "grad_norm": 12.625, + "learning_rate": 6.5810493201023906e-06, + "loss": 1.6952, + "step": 35260 + }, + { + "epoch": 1.0262675626145388, + "grad_norm": 12.5, + "learning_rate": 6.579110038429776e-06, + "loss": 1.5075, + "step": 35280 + }, + { + "epoch": 1.0268493469470867, + "grad_norm": 14.0, + "learning_rate": 6.577170756757161e-06, + "loss": 1.5463, + "step": 35300 + }, + { + "epoch": 1.0274311312796347, + "grad_norm": 14.1875, + "learning_rate": 6.575231475084546e-06, + "loss": 1.5216, + "step": 35320 + }, + { + "epoch": 1.0280129156121827, + "grad_norm": 11.8125, + "learning_rate": 6.573292193411931e-06, + "loss": 1.4992, + "step": 35340 + }, + { + "epoch": 1.0285946999447304, + "grad_norm": 14.125, + "learning_rate": 6.571352911739316e-06, + "loss": 1.5322, + "step": 35360 + }, + { + "epoch": 1.0291764842772784, + "grad_norm": 12.75, + "learning_rate": 6.569413630066701e-06, + "loss": 1.5707, + "step": 35380 + }, + { + "epoch": 1.0297582686098263, + "grad_norm": 12.875, + "learning_rate": 6.567474348394086e-06, + "loss": 1.428, + "step": 35400 + }, + { + "epoch": 1.0303400529423743, + "grad_norm": 13.4375, + "learning_rate": 6.565535066721471e-06, + "loss": 1.5032, + "step": 35420 + }, + { + "epoch": 1.0309218372749223, + "grad_norm": 14.8125, + "learning_rate": 6.5635957850488564e-06, + "loss": 1.5676, + "step": 35440 + }, + { + "epoch": 1.0315036216074702, + "grad_norm": 13.375, + "learning_rate": 6.5616565033762415e-06, + "loss": 1.5432, + "step": 35460 + }, + { + "epoch": 1.032085405940018, + "grad_norm": 11.6875, + "learning_rate": 6.559717221703627e-06, + "loss": 1.5117, + "step": 35480 + }, + { + "epoch": 1.032667190272566, + "grad_norm": 12.625, + "learning_rate": 6.557777940031012e-06, + "loss": 1.4937, + "step": 35500 + }, + { + "epoch": 1.033248974605114, + "grad_norm": 11.75, + "learning_rate": 6.555838658358397e-06, + "loss": 1.5287, + "step": 35520 + }, + { + "epoch": 1.0338307589376619, + "grad_norm": 12.125, + "learning_rate": 6.553899376685782e-06, + "loss": 1.4799, + "step": 35540 + }, + { + "epoch": 1.0344125432702098, + "grad_norm": 13.0, + "learning_rate": 6.551960095013167e-06, + "loss": 1.5311, + "step": 35560 + }, + { + "epoch": 1.0349943276027576, + "grad_norm": 13.625, + "learning_rate": 6.550020813340552e-06, + "loss": 1.4129, + "step": 35580 + }, + { + "epoch": 1.0355761119353055, + "grad_norm": 10.625, + "learning_rate": 6.548081531667937e-06, + "loss": 1.4979, + "step": 35600 + }, + { + "epoch": 1.0361578962678535, + "grad_norm": 13.25, + "learning_rate": 6.546142249995322e-06, + "loss": 1.5552, + "step": 35620 + }, + { + "epoch": 1.0367396806004014, + "grad_norm": 12.375, + "learning_rate": 6.544202968322707e-06, + "loss": 1.5192, + "step": 35640 + }, + { + "epoch": 1.0373214649329494, + "grad_norm": 13.125, + "learning_rate": 6.5422636866500925e-06, + "loss": 1.5752, + "step": 35660 + }, + { + "epoch": 1.0379032492654974, + "grad_norm": 14.125, + "learning_rate": 6.540324404977478e-06, + "loss": 1.4708, + "step": 35680 + }, + { + "epoch": 1.0384850335980451, + "grad_norm": 12.625, + "learning_rate": 6.538385123304863e-06, + "loss": 1.5739, + "step": 35700 + }, + { + "epoch": 1.039066817930593, + "grad_norm": 12.1875, + "learning_rate": 6.536445841632248e-06, + "loss": 1.5387, + "step": 35720 + }, + { + "epoch": 1.039648602263141, + "grad_norm": 15.1875, + "learning_rate": 6.534506559959632e-06, + "loss": 1.4135, + "step": 35740 + }, + { + "epoch": 1.040230386595689, + "grad_norm": 12.5625, + "learning_rate": 6.532567278287017e-06, + "loss": 1.5351, + "step": 35760 + }, + { + "epoch": 1.040812170928237, + "grad_norm": 13.9375, + "learning_rate": 6.530627996614402e-06, + "loss": 1.5075, + "step": 35780 + }, + { + "epoch": 1.041393955260785, + "grad_norm": 11.8125, + "learning_rate": 6.528688714941787e-06, + "loss": 1.5084, + "step": 35800 + }, + { + "epoch": 1.0419757395933327, + "grad_norm": 11.25, + "learning_rate": 6.5267494332691724e-06, + "loss": 1.5374, + "step": 35820 + }, + { + "epoch": 1.0425575239258806, + "grad_norm": 13.4375, + "learning_rate": 6.5248101515965575e-06, + "loss": 1.5538, + "step": 35840 + }, + { + "epoch": 1.0431393082584286, + "grad_norm": 13.3125, + "learning_rate": 6.522870869923943e-06, + "loss": 1.5906, + "step": 35860 + }, + { + "epoch": 1.0437210925909766, + "grad_norm": 9.5625, + "learning_rate": 6.520931588251328e-06, + "loss": 1.4507, + "step": 35880 + }, + { + "epoch": 1.0443028769235245, + "grad_norm": 12.75, + "learning_rate": 6.518992306578713e-06, + "loss": 1.4307, + "step": 35900 + }, + { + "epoch": 1.0448846612560723, + "grad_norm": 11.375, + "learning_rate": 6.517053024906098e-06, + "loss": 1.6122, + "step": 35920 + }, + { + "epoch": 1.0454664455886202, + "grad_norm": 13.25, + "learning_rate": 6.515113743233483e-06, + "loss": 1.5159, + "step": 35940 + }, + { + "epoch": 1.0460482299211682, + "grad_norm": 13.4375, + "learning_rate": 6.513174461560868e-06, + "loss": 1.4978, + "step": 35960 + }, + { + "epoch": 1.0466300142537162, + "grad_norm": 10.9375, + "learning_rate": 6.511235179888253e-06, + "loss": 1.547, + "step": 35980 + }, + { + "epoch": 1.0472117985862641, + "grad_norm": 14.0, + "learning_rate": 6.509295898215638e-06, + "loss": 1.5661, + "step": 36000 + }, + { + "epoch": 1.047793582918812, + "grad_norm": 13.3125, + "learning_rate": 6.507356616543023e-06, + "loss": 1.5157, + "step": 36020 + }, + { + "epoch": 1.0483753672513598, + "grad_norm": 11.5625, + "learning_rate": 6.5054173348704085e-06, + "loss": 1.5278, + "step": 36040 + }, + { + "epoch": 1.0489571515839078, + "grad_norm": 11.125, + "learning_rate": 6.503478053197794e-06, + "loss": 1.4772, + "step": 36060 + }, + { + "epoch": 1.0495389359164558, + "grad_norm": 14.75, + "learning_rate": 6.501538771525179e-06, + "loss": 1.5065, + "step": 36080 + }, + { + "epoch": 1.0501207202490037, + "grad_norm": 11.625, + "learning_rate": 6.499599489852564e-06, + "loss": 1.49, + "step": 36100 + }, + { + "epoch": 1.0507025045815517, + "grad_norm": 12.8125, + "learning_rate": 6.497660208179949e-06, + "loss": 1.5775, + "step": 36120 + }, + { + "epoch": 1.0512842889140996, + "grad_norm": 11.8125, + "learning_rate": 6.495720926507334e-06, + "loss": 1.5412, + "step": 36140 + }, + { + "epoch": 1.0518660732466474, + "grad_norm": 15.125, + "learning_rate": 6.493781644834719e-06, + "loss": 1.5165, + "step": 36160 + }, + { + "epoch": 1.0524478575791953, + "grad_norm": 13.875, + "learning_rate": 6.491842363162104e-06, + "loss": 1.5471, + "step": 36180 + }, + { + "epoch": 1.0530296419117433, + "grad_norm": 15.5, + "learning_rate": 6.489903081489489e-06, + "loss": 1.4839, + "step": 36200 + }, + { + "epoch": 1.0536114262442913, + "grad_norm": 13.875, + "learning_rate": 6.487963799816874e-06, + "loss": 1.5373, + "step": 36220 + }, + { + "epoch": 1.0541932105768392, + "grad_norm": 12.8125, + "learning_rate": 6.4860245181442595e-06, + "loss": 1.5859, + "step": 36240 + }, + { + "epoch": 1.054774994909387, + "grad_norm": 14.5625, + "learning_rate": 6.484085236471645e-06, + "loss": 1.5399, + "step": 36260 + }, + { + "epoch": 1.055356779241935, + "grad_norm": 12.3125, + "learning_rate": 6.48214595479903e-06, + "loss": 1.5187, + "step": 36280 + }, + { + "epoch": 1.055938563574483, + "grad_norm": 11.1875, + "learning_rate": 6.480206673126415e-06, + "loss": 1.503, + "step": 36300 + }, + { + "epoch": 1.0565203479070309, + "grad_norm": 12.875, + "learning_rate": 6.4782673914538e-06, + "loss": 1.5074, + "step": 36320 + }, + { + "epoch": 1.0571021322395788, + "grad_norm": 13.875, + "learning_rate": 6.476328109781185e-06, + "loss": 1.5128, + "step": 36340 + }, + { + "epoch": 1.0576839165721268, + "grad_norm": 12.0, + "learning_rate": 6.47438882810857e-06, + "loss": 1.5213, + "step": 36360 + }, + { + "epoch": 1.0582657009046745, + "grad_norm": 11.6875, + "learning_rate": 6.472449546435955e-06, + "loss": 1.5648, + "step": 36380 + }, + { + "epoch": 1.0588474852372225, + "grad_norm": 13.5625, + "learning_rate": 6.47051026476334e-06, + "loss": 1.5782, + "step": 36400 + }, + { + "epoch": 1.0594292695697705, + "grad_norm": 14.625, + "learning_rate": 6.468570983090725e-06, + "loss": 1.6054, + "step": 36420 + }, + { + "epoch": 1.0600110539023184, + "grad_norm": 14.8125, + "learning_rate": 6.4666317014181105e-06, + "loss": 1.5262, + "step": 36440 + }, + { + "epoch": 1.0605928382348664, + "grad_norm": 15.6875, + "learning_rate": 6.4646924197454956e-06, + "loss": 1.5041, + "step": 36460 + }, + { + "epoch": 1.0611746225674143, + "grad_norm": 12.0, + "learning_rate": 6.462753138072881e-06, + "loss": 1.4741, + "step": 36480 + }, + { + "epoch": 1.061756406899962, + "grad_norm": 13.6875, + "learning_rate": 6.460813856400266e-06, + "loss": 1.4637, + "step": 36500 + }, + { + "epoch": 1.06233819123251, + "grad_norm": 12.25, + "learning_rate": 6.458874574727651e-06, + "loss": 1.5187, + "step": 36520 + }, + { + "epoch": 1.062919975565058, + "grad_norm": 12.9375, + "learning_rate": 6.456935293055036e-06, + "loss": 1.4297, + "step": 36540 + }, + { + "epoch": 1.063501759897606, + "grad_norm": 10.25, + "learning_rate": 6.45499601138242e-06, + "loss": 1.499, + "step": 36560 + }, + { + "epoch": 1.064083544230154, + "grad_norm": 11.125, + "learning_rate": 6.453056729709805e-06, + "loss": 1.5329, + "step": 36580 + }, + { + "epoch": 1.064665328562702, + "grad_norm": 12.0625, + "learning_rate": 6.45111744803719e-06, + "loss": 1.589, + "step": 36600 + }, + { + "epoch": 1.0652471128952496, + "grad_norm": 12.875, + "learning_rate": 6.4491781663645755e-06, + "loss": 1.4859, + "step": 36620 + }, + { + "epoch": 1.0658288972277976, + "grad_norm": 12.5625, + "learning_rate": 6.447238884691961e-06, + "loss": 1.4984, + "step": 36640 + }, + { + "epoch": 1.0664106815603456, + "grad_norm": 12.375, + "learning_rate": 6.445299603019346e-06, + "loss": 1.5674, + "step": 36660 + }, + { + "epoch": 1.0669924658928935, + "grad_norm": 12.3125, + "learning_rate": 6.443360321346731e-06, + "loss": 1.5434, + "step": 36680 + }, + { + "epoch": 1.0675742502254415, + "grad_norm": 10.4375, + "learning_rate": 6.441421039674116e-06, + "loss": 1.4657, + "step": 36700 + }, + { + "epoch": 1.0681560345579895, + "grad_norm": 11.9375, + "learning_rate": 6.439481758001501e-06, + "loss": 1.4707, + "step": 36720 + }, + { + "epoch": 1.0687378188905372, + "grad_norm": 14.6875, + "learning_rate": 6.437542476328886e-06, + "loss": 1.5764, + "step": 36740 + }, + { + "epoch": 1.0693196032230852, + "grad_norm": 12.6875, + "learning_rate": 6.435603194656271e-06, + "loss": 1.525, + "step": 36760 + }, + { + "epoch": 1.0699013875556331, + "grad_norm": 16.125, + "learning_rate": 6.433663912983656e-06, + "loss": 1.5138, + "step": 36780 + }, + { + "epoch": 1.070483171888181, + "grad_norm": 12.0625, + "learning_rate": 6.431724631311041e-06, + "loss": 1.506, + "step": 36800 + }, + { + "epoch": 1.071064956220729, + "grad_norm": 15.5625, + "learning_rate": 6.4297853496384265e-06, + "loss": 1.5367, + "step": 36820 + }, + { + "epoch": 1.0716467405532768, + "grad_norm": 15.4375, + "learning_rate": 6.4278460679658116e-06, + "loss": 1.4997, + "step": 36840 + }, + { + "epoch": 1.0722285248858248, + "grad_norm": 12.5, + "learning_rate": 6.425906786293197e-06, + "loss": 1.5785, + "step": 36860 + }, + { + "epoch": 1.0728103092183727, + "grad_norm": 14.0625, + "learning_rate": 6.423967504620582e-06, + "loss": 1.4386, + "step": 36880 + }, + { + "epoch": 1.0733920935509207, + "grad_norm": 14.75, + "learning_rate": 6.422028222947967e-06, + "loss": 1.4992, + "step": 36900 + }, + { + "epoch": 1.0739738778834687, + "grad_norm": 11.8125, + "learning_rate": 6.420088941275352e-06, + "loss": 1.5329, + "step": 36920 + }, + { + "epoch": 1.0745556622160166, + "grad_norm": 11.125, + "learning_rate": 6.418149659602737e-06, + "loss": 1.5295, + "step": 36940 + }, + { + "epoch": 1.0751374465485644, + "grad_norm": 11.625, + "learning_rate": 6.416210377930122e-06, + "loss": 1.6297, + "step": 36960 + }, + { + "epoch": 1.0757192308811123, + "grad_norm": 14.1875, + "learning_rate": 6.414271096257507e-06, + "loss": 1.5128, + "step": 36980 + }, + { + "epoch": 1.0763010152136603, + "grad_norm": 13.625, + "learning_rate": 6.412331814584892e-06, + "loss": 1.4886, + "step": 37000 + }, + { + "epoch": 1.0768827995462082, + "grad_norm": 14.125, + "learning_rate": 6.4103925329122774e-06, + "loss": 1.5572, + "step": 37020 + }, + { + "epoch": 1.0774645838787562, + "grad_norm": 11.875, + "learning_rate": 6.4084532512396625e-06, + "loss": 1.4906, + "step": 37040 + }, + { + "epoch": 1.0780463682113042, + "grad_norm": 12.8125, + "learning_rate": 6.406513969567048e-06, + "loss": 1.5079, + "step": 37060 + }, + { + "epoch": 1.078628152543852, + "grad_norm": 11.5, + "learning_rate": 6.404574687894433e-06, + "loss": 1.5033, + "step": 37080 + }, + { + "epoch": 1.0792099368763999, + "grad_norm": 15.875, + "learning_rate": 6.402635406221818e-06, + "loss": 1.5184, + "step": 37100 + }, + { + "epoch": 1.0797917212089478, + "grad_norm": 13.75, + "learning_rate": 6.400696124549203e-06, + "loss": 1.5089, + "step": 37120 + }, + { + "epoch": 1.0803735055414958, + "grad_norm": 13.3125, + "learning_rate": 6.398756842876588e-06, + "loss": 1.5207, + "step": 37140 + }, + { + "epoch": 1.0809552898740438, + "grad_norm": 13.25, + "learning_rate": 6.396817561203973e-06, + "loss": 1.5109, + "step": 37160 + }, + { + "epoch": 1.0815370742065915, + "grad_norm": 12.5, + "learning_rate": 6.394878279531358e-06, + "loss": 1.5187, + "step": 37180 + }, + { + "epoch": 1.0821188585391395, + "grad_norm": 13.9375, + "learning_rate": 6.392938997858743e-06, + "loss": 1.494, + "step": 37200 + }, + { + "epoch": 1.0827006428716874, + "grad_norm": 11.875, + "learning_rate": 6.390999716186128e-06, + "loss": 1.4747, + "step": 37220 + }, + { + "epoch": 1.0832824272042354, + "grad_norm": 10.8125, + "learning_rate": 6.3890604345135135e-06, + "loss": 1.4979, + "step": 37240 + }, + { + "epoch": 1.0838642115367834, + "grad_norm": 14.5, + "learning_rate": 6.387121152840899e-06, + "loss": 1.5181, + "step": 37260 + }, + { + "epoch": 1.0844459958693313, + "grad_norm": 19.75, + "learning_rate": 6.385181871168284e-06, + "loss": 1.5811, + "step": 37280 + }, + { + "epoch": 1.085027780201879, + "grad_norm": 10.9375, + "learning_rate": 6.383242589495669e-06, + "loss": 1.5239, + "step": 37300 + }, + { + "epoch": 1.085609564534427, + "grad_norm": 14.1875, + "learning_rate": 6.381303307823054e-06, + "loss": 1.4841, + "step": 37320 + }, + { + "epoch": 1.086191348866975, + "grad_norm": 12.5, + "learning_rate": 6.379364026150439e-06, + "loss": 1.539, + "step": 37340 + }, + { + "epoch": 1.086773133199523, + "grad_norm": 16.0, + "learning_rate": 6.377424744477823e-06, + "loss": 1.495, + "step": 37360 + }, + { + "epoch": 1.087354917532071, + "grad_norm": 13.8125, + "learning_rate": 6.375485462805208e-06, + "loss": 1.5655, + "step": 37380 + }, + { + "epoch": 1.0879367018646189, + "grad_norm": 13.5, + "learning_rate": 6.3735461811325934e-06, + "loss": 1.5101, + "step": 37400 + }, + { + "epoch": 1.0885184861971666, + "grad_norm": 12.5, + "learning_rate": 6.3716068994599785e-06, + "loss": 1.446, + "step": 37420 + }, + { + "epoch": 1.0891002705297146, + "grad_norm": 9.9375, + "learning_rate": 6.369667617787364e-06, + "loss": 1.4292, + "step": 37440 + }, + { + "epoch": 1.0896820548622626, + "grad_norm": 12.125, + "learning_rate": 6.367728336114749e-06, + "loss": 1.4819, + "step": 37460 + }, + { + "epoch": 1.0902638391948105, + "grad_norm": 13.75, + "learning_rate": 6.365789054442134e-06, + "loss": 1.4845, + "step": 37480 + }, + { + "epoch": 1.0908456235273585, + "grad_norm": 12.8125, + "learning_rate": 6.363849772769519e-06, + "loss": 1.5017, + "step": 37500 + }, + { + "epoch": 1.0914274078599062, + "grad_norm": 13.0, + "learning_rate": 6.361910491096904e-06, + "loss": 1.4634, + "step": 37520 + }, + { + "epoch": 1.0920091921924542, + "grad_norm": 12.0, + "learning_rate": 6.359971209424289e-06, + "loss": 1.4176, + "step": 37540 + }, + { + "epoch": 1.0925909765250021, + "grad_norm": 10.5, + "learning_rate": 6.358031927751673e-06, + "loss": 1.5057, + "step": 37560 + }, + { + "epoch": 1.09317276085755, + "grad_norm": 14.25, + "learning_rate": 6.3560926460790585e-06, + "loss": 1.4842, + "step": 37580 + }, + { + "epoch": 1.093754545190098, + "grad_norm": 12.1875, + "learning_rate": 6.3541533644064436e-06, + "loss": 1.5763, + "step": 37600 + }, + { + "epoch": 1.094336329522646, + "grad_norm": 14.6875, + "learning_rate": 6.352214082733829e-06, + "loss": 1.6831, + "step": 37620 + }, + { + "epoch": 1.094918113855194, + "grad_norm": 11.375, + "learning_rate": 6.350274801061214e-06, + "loss": 1.5598, + "step": 37640 + }, + { + "epoch": 1.0954998981877417, + "grad_norm": 13.375, + "learning_rate": 6.348335519388599e-06, + "loss": 1.4496, + "step": 37660 + }, + { + "epoch": 1.0960816825202897, + "grad_norm": 13.1875, + "learning_rate": 6.346396237715984e-06, + "loss": 1.5174, + "step": 37680 + }, + { + "epoch": 1.0966634668528377, + "grad_norm": 14.375, + "learning_rate": 6.344456956043369e-06, + "loss": 1.5247, + "step": 37700 + }, + { + "epoch": 1.0972452511853856, + "grad_norm": 13.5, + "learning_rate": 6.342517674370754e-06, + "loss": 1.5636, + "step": 37720 + }, + { + "epoch": 1.0978270355179336, + "grad_norm": 14.8125, + "learning_rate": 6.340578392698139e-06, + "loss": 1.5537, + "step": 37740 + }, + { + "epoch": 1.0984088198504813, + "grad_norm": 13.5, + "learning_rate": 6.338639111025524e-06, + "loss": 1.5261, + "step": 37760 + }, + { + "epoch": 1.0989906041830293, + "grad_norm": 12.0625, + "learning_rate": 6.3366998293529094e-06, + "loss": 1.5312, + "step": 37780 + }, + { + "epoch": 1.0995723885155773, + "grad_norm": 11.9375, + "learning_rate": 6.3347605476802945e-06, + "loss": 1.5345, + "step": 37800 + }, + { + "epoch": 1.1001541728481252, + "grad_norm": 12.6875, + "learning_rate": 6.33282126600768e-06, + "loss": 1.6277, + "step": 37820 + }, + { + "epoch": 1.1007359571806732, + "grad_norm": 13.625, + "learning_rate": 6.330881984335065e-06, + "loss": 1.5504, + "step": 37840 + }, + { + "epoch": 1.1013177415132211, + "grad_norm": 11.5625, + "learning_rate": 6.32894270266245e-06, + "loss": 1.4967, + "step": 37860 + }, + { + "epoch": 1.101899525845769, + "grad_norm": 12.125, + "learning_rate": 6.327003420989835e-06, + "loss": 1.5639, + "step": 37880 + }, + { + "epoch": 1.1024813101783169, + "grad_norm": 14.375, + "learning_rate": 6.32506413931722e-06, + "loss": 1.5901, + "step": 37900 + }, + { + "epoch": 1.1030630945108648, + "grad_norm": 13.375, + "learning_rate": 6.323124857644605e-06, + "loss": 1.5309, + "step": 37920 + }, + { + "epoch": 1.1036448788434128, + "grad_norm": 13.5, + "learning_rate": 6.321185575971989e-06, + "loss": 1.5142, + "step": 37940 + }, + { + "epoch": 1.1042266631759607, + "grad_norm": 12.875, + "learning_rate": 6.3192462942993745e-06, + "loss": 1.5386, + "step": 37960 + }, + { + "epoch": 1.1048084475085087, + "grad_norm": 12.5, + "learning_rate": 6.3173070126267596e-06, + "loss": 1.5329, + "step": 37980 + }, + { + "epoch": 1.1053902318410564, + "grad_norm": 12.125, + "learning_rate": 6.315367730954145e-06, + "loss": 1.5415, + "step": 38000 + }, + { + "epoch": 1.1059720161736044, + "grad_norm": 13.9375, + "learning_rate": 6.31342844928153e-06, + "loss": 1.5508, + "step": 38020 + }, + { + "epoch": 1.1065538005061524, + "grad_norm": 16.0, + "learning_rate": 6.311489167608915e-06, + "loss": 1.5365, + "step": 38040 + }, + { + "epoch": 1.1071355848387003, + "grad_norm": 12.0, + "learning_rate": 6.3095498859363e-06, + "loss": 1.5612, + "step": 38060 + }, + { + "epoch": 1.1077173691712483, + "grad_norm": 12.0, + "learning_rate": 6.307610604263685e-06, + "loss": 1.46, + "step": 38080 + }, + { + "epoch": 1.108299153503796, + "grad_norm": 12.4375, + "learning_rate": 6.30567132259107e-06, + "loss": 1.5787, + "step": 38100 + }, + { + "epoch": 1.108880937836344, + "grad_norm": 13.25, + "learning_rate": 6.303732040918455e-06, + "loss": 1.543, + "step": 38120 + }, + { + "epoch": 1.109462722168892, + "grad_norm": 12.375, + "learning_rate": 6.30179275924584e-06, + "loss": 1.4542, + "step": 38140 + }, + { + "epoch": 1.11004450650144, + "grad_norm": 12.625, + "learning_rate": 6.2998534775732255e-06, + "loss": 1.4819, + "step": 38160 + }, + { + "epoch": 1.110626290833988, + "grad_norm": 13.0, + "learning_rate": 6.2979141959006105e-06, + "loss": 1.5394, + "step": 38180 + }, + { + "epoch": 1.1112080751665359, + "grad_norm": 12.9375, + "learning_rate": 6.295974914227996e-06, + "loss": 1.6128, + "step": 38200 + }, + { + "epoch": 1.1117898594990836, + "grad_norm": 12.0, + "learning_rate": 6.294035632555381e-06, + "loss": 1.4606, + "step": 38220 + }, + { + "epoch": 1.1123716438316316, + "grad_norm": 14.75, + "learning_rate": 6.292096350882766e-06, + "loss": 1.4817, + "step": 38240 + }, + { + "epoch": 1.1129534281641795, + "grad_norm": 13.75, + "learning_rate": 6.290157069210151e-06, + "loss": 1.5162, + "step": 38260 + }, + { + "epoch": 1.1135352124967275, + "grad_norm": 12.4375, + "learning_rate": 6.288217787537536e-06, + "loss": 1.4722, + "step": 38280 + }, + { + "epoch": 1.1141169968292755, + "grad_norm": 11.75, + "learning_rate": 6.286278505864921e-06, + "loss": 1.4585, + "step": 38300 + }, + { + "epoch": 1.1146987811618234, + "grad_norm": 11.9375, + "learning_rate": 6.284339224192306e-06, + "loss": 1.5375, + "step": 38320 + }, + { + "epoch": 1.1152805654943712, + "grad_norm": 13.1875, + "learning_rate": 6.282399942519691e-06, + "loss": 1.5263, + "step": 38340 + }, + { + "epoch": 1.1158623498269191, + "grad_norm": 11.5625, + "learning_rate": 6.2804606608470764e-06, + "loss": 1.5259, + "step": 38360 + }, + { + "epoch": 1.116444134159467, + "grad_norm": 12.5, + "learning_rate": 6.2785213791744615e-06, + "loss": 1.4978, + "step": 38380 + }, + { + "epoch": 1.117025918492015, + "grad_norm": 13.4375, + "learning_rate": 6.276582097501847e-06, + "loss": 1.4903, + "step": 38400 + }, + { + "epoch": 1.117607702824563, + "grad_norm": 12.0, + "learning_rate": 6.274642815829232e-06, + "loss": 1.5991, + "step": 38420 + }, + { + "epoch": 1.1181894871571108, + "grad_norm": 16.625, + "learning_rate": 6.272703534156617e-06, + "loss": 1.5308, + "step": 38440 + }, + { + "epoch": 1.1187712714896587, + "grad_norm": 17.125, + "learning_rate": 6.270764252484002e-06, + "loss": 1.4371, + "step": 38460 + }, + { + "epoch": 1.1193530558222067, + "grad_norm": 13.25, + "learning_rate": 6.268824970811387e-06, + "loss": 1.5051, + "step": 38480 + }, + { + "epoch": 1.1199348401547546, + "grad_norm": 12.0, + "learning_rate": 6.266885689138772e-06, + "loss": 1.5281, + "step": 38500 + }, + { + "epoch": 1.1205166244873026, + "grad_norm": 14.125, + "learning_rate": 6.264946407466157e-06, + "loss": 1.6, + "step": 38520 + }, + { + "epoch": 1.1210984088198506, + "grad_norm": 12.5, + "learning_rate": 6.263007125793542e-06, + "loss": 1.4649, + "step": 38540 + }, + { + "epoch": 1.1216801931523983, + "grad_norm": 15.5625, + "learning_rate": 6.261067844120927e-06, + "loss": 1.5477, + "step": 38560 + }, + { + "epoch": 1.1222619774849463, + "grad_norm": 11.8125, + "learning_rate": 6.2591285624483125e-06, + "loss": 1.5681, + "step": 38580 + }, + { + "epoch": 1.1228437618174942, + "grad_norm": 12.0625, + "learning_rate": 6.257189280775698e-06, + "loss": 1.4782, + "step": 38600 + }, + { + "epoch": 1.1234255461500422, + "grad_norm": 14.625, + "learning_rate": 6.255249999103083e-06, + "loss": 1.5647, + "step": 38620 + }, + { + "epoch": 1.1240073304825902, + "grad_norm": 12.6875, + "learning_rate": 6.253310717430468e-06, + "loss": 1.4799, + "step": 38640 + }, + { + "epoch": 1.1245891148151381, + "grad_norm": 11.25, + "learning_rate": 6.251371435757853e-06, + "loss": 1.4928, + "step": 38660 + }, + { + "epoch": 1.1251708991476859, + "grad_norm": 12.6875, + "learning_rate": 6.249432154085238e-06, + "loss": 1.5306, + "step": 38680 + }, + { + "epoch": 1.1257526834802338, + "grad_norm": 13.75, + "learning_rate": 6.247492872412623e-06, + "loss": 1.4904, + "step": 38700 + }, + { + "epoch": 1.1263344678127818, + "grad_norm": 12.875, + "learning_rate": 6.245553590740008e-06, + "loss": 1.5334, + "step": 38720 + }, + { + "epoch": 1.1269162521453298, + "grad_norm": 13.375, + "learning_rate": 6.2436143090673924e-06, + "loss": 1.5545, + "step": 38740 + }, + { + "epoch": 1.1274980364778777, + "grad_norm": 14.3125, + "learning_rate": 6.2416750273947775e-06, + "loss": 1.5655, + "step": 38760 + }, + { + "epoch": 1.1280798208104255, + "grad_norm": 17.0, + "learning_rate": 6.239735745722163e-06, + "loss": 1.4399, + "step": 38780 + }, + { + "epoch": 1.1286616051429734, + "grad_norm": 13.125, + "learning_rate": 6.237796464049548e-06, + "loss": 1.5477, + "step": 38800 + }, + { + "epoch": 1.1292433894755214, + "grad_norm": 12.8125, + "learning_rate": 6.235857182376933e-06, + "loss": 1.5395, + "step": 38820 + }, + { + "epoch": 1.1298251738080693, + "grad_norm": 12.0, + "learning_rate": 6.233917900704318e-06, + "loss": 1.5085, + "step": 38840 + }, + { + "epoch": 1.1304069581406173, + "grad_norm": 17.375, + "learning_rate": 6.231978619031703e-06, + "loss": 1.5742, + "step": 38860 + }, + { + "epoch": 1.1309887424731653, + "grad_norm": 12.6875, + "learning_rate": 6.230039337359088e-06, + "loss": 1.4685, + "step": 38880 + }, + { + "epoch": 1.1315705268057132, + "grad_norm": 12.4375, + "learning_rate": 6.228100055686473e-06, + "loss": 1.593, + "step": 38900 + }, + { + "epoch": 1.132152311138261, + "grad_norm": 12.8125, + "learning_rate": 6.226160774013858e-06, + "loss": 1.5004, + "step": 38920 + }, + { + "epoch": 1.132734095470809, + "grad_norm": 13.9375, + "learning_rate": 6.224221492341243e-06, + "loss": 1.5268, + "step": 38940 + }, + { + "epoch": 1.133315879803357, + "grad_norm": 12.625, + "learning_rate": 6.2222822106686285e-06, + "loss": 1.5868, + "step": 38960 + }, + { + "epoch": 1.1338976641359049, + "grad_norm": 17.125, + "learning_rate": 6.220342928996014e-06, + "loss": 1.538, + "step": 38980 + }, + { + "epoch": 1.1344794484684528, + "grad_norm": 12.5625, + "learning_rate": 6.218403647323399e-06, + "loss": 1.4494, + "step": 39000 + }, + { + "epoch": 1.1350612328010006, + "grad_norm": 13.5, + "learning_rate": 6.216464365650784e-06, + "loss": 1.5108, + "step": 39020 + }, + { + "epoch": 1.1356430171335485, + "grad_norm": 14.25, + "learning_rate": 6.214525083978169e-06, + "loss": 1.4658, + "step": 39040 + }, + { + "epoch": 1.1362248014660965, + "grad_norm": 12.25, + "learning_rate": 6.212585802305554e-06, + "loss": 1.5474, + "step": 39060 + }, + { + "epoch": 1.1368065857986445, + "grad_norm": 13.75, + "learning_rate": 6.210646520632939e-06, + "loss": 1.5372, + "step": 39080 + }, + { + "epoch": 1.1373883701311924, + "grad_norm": 13.125, + "learning_rate": 6.208707238960324e-06, + "loss": 1.3963, + "step": 39100 + }, + { + "epoch": 1.1379701544637402, + "grad_norm": 14.0, + "learning_rate": 6.206767957287709e-06, + "loss": 1.527, + "step": 39120 + }, + { + "epoch": 1.1385519387962881, + "grad_norm": 12.4375, + "learning_rate": 6.204828675615094e-06, + "loss": 1.5576, + "step": 39140 + }, + { + "epoch": 1.139133723128836, + "grad_norm": 12.5625, + "learning_rate": 6.2028893939424795e-06, + "loss": 1.5017, + "step": 39160 + }, + { + "epoch": 1.139715507461384, + "grad_norm": 13.4375, + "learning_rate": 6.2009501122698646e-06, + "loss": 1.5192, + "step": 39180 + }, + { + "epoch": 1.140297291793932, + "grad_norm": 11.75, + "learning_rate": 6.19901083059725e-06, + "loss": 1.4632, + "step": 39200 + }, + { + "epoch": 1.14087907612648, + "grad_norm": 14.5625, + "learning_rate": 6.197071548924635e-06, + "loss": 1.5165, + "step": 39220 + }, + { + "epoch": 1.141460860459028, + "grad_norm": 13.5, + "learning_rate": 6.19513226725202e-06, + "loss": 1.5024, + "step": 39240 + }, + { + "epoch": 1.1420426447915757, + "grad_norm": 15.8125, + "learning_rate": 6.193192985579405e-06, + "loss": 1.4939, + "step": 39260 + }, + { + "epoch": 1.1426244291241237, + "grad_norm": 12.125, + "learning_rate": 6.19125370390679e-06, + "loss": 1.5697, + "step": 39280 + }, + { + "epoch": 1.1432062134566716, + "grad_norm": 14.4375, + "learning_rate": 6.189314422234175e-06, + "loss": 1.4966, + "step": 39300 + }, + { + "epoch": 1.1437879977892196, + "grad_norm": 13.0625, + "learning_rate": 6.18737514056156e-06, + "loss": 1.4836, + "step": 39320 + }, + { + "epoch": 1.1443697821217675, + "grad_norm": 10.4375, + "learning_rate": 6.185435858888945e-06, + "loss": 1.5408, + "step": 39340 + }, + { + "epoch": 1.1449515664543153, + "grad_norm": 13.6875, + "learning_rate": 6.1834965772163305e-06, + "loss": 1.3676, + "step": 39360 + }, + { + "epoch": 1.1455333507868632, + "grad_norm": 11.1875, + "learning_rate": 6.1815572955437156e-06, + "loss": 1.5173, + "step": 39380 + }, + { + "epoch": 1.1461151351194112, + "grad_norm": 10.6875, + "learning_rate": 6.179618013871101e-06, + "loss": 1.4561, + "step": 39400 + }, + { + "epoch": 1.1466969194519592, + "grad_norm": 11.1875, + "learning_rate": 6.177678732198486e-06, + "loss": 1.4594, + "step": 39420 + }, + { + "epoch": 1.1472787037845071, + "grad_norm": 15.375, + "learning_rate": 6.175739450525871e-06, + "loss": 1.5095, + "step": 39440 + }, + { + "epoch": 1.147860488117055, + "grad_norm": 13.5625, + "learning_rate": 6.173800168853256e-06, + "loss": 1.5298, + "step": 39460 + }, + { + "epoch": 1.1484422724496028, + "grad_norm": 12.75, + "learning_rate": 6.171860887180641e-06, + "loss": 1.4985, + "step": 39480 + }, + { + "epoch": 1.1490240567821508, + "grad_norm": 11.4375, + "learning_rate": 6.169921605508026e-06, + "loss": 1.5058, + "step": 39500 + }, + { + "epoch": 1.1496058411146988, + "grad_norm": 14.4375, + "learning_rate": 6.167982323835411e-06, + "loss": 1.5532, + "step": 39520 + }, + { + "epoch": 1.1501876254472467, + "grad_norm": 13.25, + "learning_rate": 6.166043042162796e-06, + "loss": 1.5766, + "step": 39540 + }, + { + "epoch": 1.1507694097797947, + "grad_norm": 15.5, + "learning_rate": 6.164103760490181e-06, + "loss": 1.5411, + "step": 39560 + }, + { + "epoch": 1.1513511941123427, + "grad_norm": 15.5625, + "learning_rate": 6.162164478817566e-06, + "loss": 1.6242, + "step": 39580 + }, + { + "epoch": 1.1519329784448904, + "grad_norm": 10.4375, + "learning_rate": 6.160225197144951e-06, + "loss": 1.4599, + "step": 39600 + }, + { + "epoch": 1.1525147627774384, + "grad_norm": 10.5, + "learning_rate": 6.158285915472336e-06, + "loss": 1.5049, + "step": 39620 + }, + { + "epoch": 1.1530965471099863, + "grad_norm": 13.875, + "learning_rate": 6.156346633799721e-06, + "loss": 1.5083, + "step": 39640 + }, + { + "epoch": 1.1536783314425343, + "grad_norm": 15.5625, + "learning_rate": 6.154407352127106e-06, + "loss": 1.578, + "step": 39660 + }, + { + "epoch": 1.1542601157750823, + "grad_norm": 13.3125, + "learning_rate": 6.152468070454491e-06, + "loss": 1.5239, + "step": 39680 + }, + { + "epoch": 1.15484190010763, + "grad_norm": 16.25, + "learning_rate": 6.150528788781876e-06, + "loss": 1.5221, + "step": 39700 + }, + { + "epoch": 1.155423684440178, + "grad_norm": 12.25, + "learning_rate": 6.148589507109261e-06, + "loss": 1.5173, + "step": 39720 + }, + { + "epoch": 1.156005468772726, + "grad_norm": 12.25, + "learning_rate": 6.1466502254366465e-06, + "loss": 1.5572, + "step": 39740 + }, + { + "epoch": 1.1565872531052739, + "grad_norm": 12.5625, + "learning_rate": 6.1447109437640316e-06, + "loss": 1.5101, + "step": 39760 + }, + { + "epoch": 1.1571690374378218, + "grad_norm": 12.6875, + "learning_rate": 6.142771662091417e-06, + "loss": 1.4834, + "step": 39780 + }, + { + "epoch": 1.1577508217703698, + "grad_norm": 11.0, + "learning_rate": 6.140832380418802e-06, + "loss": 1.42, + "step": 39800 + }, + { + "epoch": 1.1583326061029178, + "grad_norm": 12.875, + "learning_rate": 6.138893098746187e-06, + "loss": 1.5027, + "step": 39820 + }, + { + "epoch": 1.1589143904354655, + "grad_norm": 13.0625, + "learning_rate": 6.136953817073572e-06, + "loss": 1.4606, + "step": 39840 + }, + { + "epoch": 1.1594961747680135, + "grad_norm": 15.1875, + "learning_rate": 6.135014535400957e-06, + "loss": 1.4864, + "step": 39860 + }, + { + "epoch": 1.1600779591005614, + "grad_norm": 12.4375, + "learning_rate": 6.133075253728342e-06, + "loss": 1.5325, + "step": 39880 + }, + { + "epoch": 1.1606597434331094, + "grad_norm": 13.625, + "learning_rate": 6.131135972055727e-06, + "loss": 1.6125, + "step": 39900 + }, + { + "epoch": 1.1612415277656574, + "grad_norm": 13.5, + "learning_rate": 6.129196690383112e-06, + "loss": 1.4932, + "step": 39920 + }, + { + "epoch": 1.161823312098205, + "grad_norm": 15.75, + "learning_rate": 6.1272574087104974e-06, + "loss": 1.5064, + "step": 39940 + }, + { + "epoch": 1.162405096430753, + "grad_norm": 13.6875, + "learning_rate": 6.1253181270378825e-06, + "loss": 1.4966, + "step": 39960 + }, + { + "epoch": 1.162986880763301, + "grad_norm": 14.75, + "learning_rate": 6.123378845365268e-06, + "loss": 1.5413, + "step": 39980 + }, + { + "epoch": 1.163568665095849, + "grad_norm": 16.75, + "learning_rate": 6.121439563692653e-06, + "loss": 1.4682, + "step": 40000 + }, + { + "epoch": 1.164150449428397, + "grad_norm": 13.5625, + "learning_rate": 6.119500282020038e-06, + "loss": 1.4839, + "step": 40020 + }, + { + "epoch": 1.1647322337609447, + "grad_norm": 14.25, + "learning_rate": 6.117561000347423e-06, + "loss": 1.5623, + "step": 40040 + }, + { + "epoch": 1.1653140180934927, + "grad_norm": 12.25, + "learning_rate": 6.115621718674808e-06, + "loss": 1.4892, + "step": 40060 + }, + { + "epoch": 1.1658958024260406, + "grad_norm": 12.3125, + "learning_rate": 6.113682437002193e-06, + "loss": 1.5352, + "step": 40080 + }, + { + "epoch": 1.1664775867585886, + "grad_norm": 12.375, + "learning_rate": 6.111743155329578e-06, + "loss": 1.5495, + "step": 40100 + }, + { + "epoch": 1.1670593710911366, + "grad_norm": 13.375, + "learning_rate": 6.109803873656963e-06, + "loss": 1.4987, + "step": 40120 + }, + { + "epoch": 1.1676411554236845, + "grad_norm": 14.0625, + "learning_rate": 6.107864591984348e-06, + "loss": 1.5625, + "step": 40140 + }, + { + "epoch": 1.1682229397562325, + "grad_norm": 14.125, + "learning_rate": 6.1059253103117335e-06, + "loss": 1.5876, + "step": 40160 + }, + { + "epoch": 1.1688047240887802, + "grad_norm": 11.75, + "learning_rate": 6.103986028639119e-06, + "loss": 1.5188, + "step": 40180 + }, + { + "epoch": 1.1693865084213282, + "grad_norm": 14.4375, + "learning_rate": 6.102046746966504e-06, + "loss": 1.5524, + "step": 40200 + }, + { + "epoch": 1.1699682927538761, + "grad_norm": 16.375, + "learning_rate": 6.100107465293889e-06, + "loss": 1.488, + "step": 40220 + }, + { + "epoch": 1.1705500770864241, + "grad_norm": 12.5, + "learning_rate": 6.098168183621274e-06, + "loss": 1.4927, + "step": 40240 + }, + { + "epoch": 1.171131861418972, + "grad_norm": 10.6875, + "learning_rate": 6.096228901948659e-06, + "loss": 1.493, + "step": 40260 + }, + { + "epoch": 1.1717136457515198, + "grad_norm": 11.1875, + "learning_rate": 6.094289620276044e-06, + "loss": 1.4916, + "step": 40280 + }, + { + "epoch": 1.1722954300840678, + "grad_norm": 13.4375, + "learning_rate": 6.092350338603429e-06, + "loss": 1.5093, + "step": 40300 + }, + { + "epoch": 1.1728772144166157, + "grad_norm": 12.5625, + "learning_rate": 6.090411056930814e-06, + "loss": 1.4595, + "step": 40320 + }, + { + "epoch": 1.1734589987491637, + "grad_norm": 15.875, + "learning_rate": 6.088471775258199e-06, + "loss": 1.6267, + "step": 40340 + }, + { + "epoch": 1.1740407830817117, + "grad_norm": 13.875, + "learning_rate": 6.0865324935855845e-06, + "loss": 1.5344, + "step": 40360 + }, + { + "epoch": 1.1746225674142594, + "grad_norm": 14.3125, + "learning_rate": 6.084593211912969e-06, + "loss": 1.5923, + "step": 40380 + }, + { + "epoch": 1.1752043517468074, + "grad_norm": 13.5, + "learning_rate": 6.082653930240354e-06, + "loss": 1.4998, + "step": 40400 + }, + { + "epoch": 1.1757861360793553, + "grad_norm": 13.5625, + "learning_rate": 6.080714648567739e-06, + "loss": 1.4985, + "step": 40420 + }, + { + "epoch": 1.1763679204119033, + "grad_norm": 14.875, + "learning_rate": 6.078775366895124e-06, + "loss": 1.4674, + "step": 40440 + }, + { + "epoch": 1.1769497047444513, + "grad_norm": 13.6875, + "learning_rate": 6.076836085222509e-06, + "loss": 1.5184, + "step": 40460 + }, + { + "epoch": 1.1775314890769992, + "grad_norm": 14.5, + "learning_rate": 6.074896803549894e-06, + "loss": 1.4749, + "step": 40480 + }, + { + "epoch": 1.1781132734095472, + "grad_norm": 13.375, + "learning_rate": 6.072957521877279e-06, + "loss": 1.6339, + "step": 40500 + }, + { + "epoch": 1.178695057742095, + "grad_norm": 12.5625, + "learning_rate": 6.071018240204664e-06, + "loss": 1.4874, + "step": 40520 + }, + { + "epoch": 1.179276842074643, + "grad_norm": 12.0625, + "learning_rate": 6.0690789585320495e-06, + "loss": 1.5409, + "step": 40540 + }, + { + "epoch": 1.1798586264071909, + "grad_norm": 10.75, + "learning_rate": 6.067139676859435e-06, + "loss": 1.6283, + "step": 40560 + }, + { + "epoch": 1.1804404107397388, + "grad_norm": 12.0, + "learning_rate": 6.06520039518682e-06, + "loss": 1.5583, + "step": 40580 + }, + { + "epoch": 1.1810221950722868, + "grad_norm": 11.25, + "learning_rate": 6.063261113514205e-06, + "loss": 1.5938, + "step": 40600 + }, + { + "epoch": 1.1816039794048345, + "grad_norm": 12.4375, + "learning_rate": 6.06132183184159e-06, + "loss": 1.5019, + "step": 40620 + }, + { + "epoch": 1.1821857637373825, + "grad_norm": 13.875, + "learning_rate": 6.059382550168975e-06, + "loss": 1.585, + "step": 40640 + }, + { + "epoch": 1.1827675480699305, + "grad_norm": 11.0625, + "learning_rate": 6.05744326849636e-06, + "loss": 1.5273, + "step": 40660 + }, + { + "epoch": 1.1833493324024784, + "grad_norm": 10.5625, + "learning_rate": 6.055503986823745e-06, + "loss": 1.5024, + "step": 40680 + }, + { + "epoch": 1.1839311167350264, + "grad_norm": 14.8125, + "learning_rate": 6.05356470515113e-06, + "loss": 1.5544, + "step": 40700 + }, + { + "epoch": 1.1845129010675743, + "grad_norm": 12.9375, + "learning_rate": 6.051625423478515e-06, + "loss": 1.4653, + "step": 40720 + }, + { + "epoch": 1.185094685400122, + "grad_norm": 12.125, + "learning_rate": 6.0496861418059005e-06, + "loss": 1.5179, + "step": 40740 + }, + { + "epoch": 1.18567646973267, + "grad_norm": 11.4375, + "learning_rate": 6.047746860133286e-06, + "loss": 1.5144, + "step": 40760 + }, + { + "epoch": 1.186258254065218, + "grad_norm": 8.75, + "learning_rate": 6.045807578460671e-06, + "loss": 1.4804, + "step": 40780 + }, + { + "epoch": 1.186840038397766, + "grad_norm": 11.125, + "learning_rate": 6.043868296788056e-06, + "loss": 1.5792, + "step": 40800 + }, + { + "epoch": 1.187421822730314, + "grad_norm": 13.875, + "learning_rate": 6.041929015115441e-06, + "loss": 1.4975, + "step": 40820 + }, + { + "epoch": 1.188003607062862, + "grad_norm": 13.1875, + "learning_rate": 6.039989733442826e-06, + "loss": 1.5481, + "step": 40840 + }, + { + "epoch": 1.1885853913954096, + "grad_norm": 13.5, + "learning_rate": 6.038050451770211e-06, + "loss": 1.4825, + "step": 40860 + }, + { + "epoch": 1.1891671757279576, + "grad_norm": 12.8125, + "learning_rate": 6.036111170097596e-06, + "loss": 1.4976, + "step": 40880 + }, + { + "epoch": 1.1897489600605056, + "grad_norm": 13.5, + "learning_rate": 6.034171888424981e-06, + "loss": 1.4153, + "step": 40900 + }, + { + "epoch": 1.1903307443930535, + "grad_norm": 12.875, + "learning_rate": 6.032232606752366e-06, + "loss": 1.5186, + "step": 40920 + }, + { + "epoch": 1.1909125287256015, + "grad_norm": 14.6875, + "learning_rate": 6.0302933250797515e-06, + "loss": 1.5677, + "step": 40940 + }, + { + "epoch": 1.1914943130581492, + "grad_norm": 13.5625, + "learning_rate": 6.0283540434071366e-06, + "loss": 1.5399, + "step": 40960 + }, + { + "epoch": 1.1920760973906972, + "grad_norm": 11.875, + "learning_rate": 6.026414761734522e-06, + "loss": 1.543, + "step": 40980 + }, + { + "epoch": 1.1926578817232452, + "grad_norm": 12.625, + "learning_rate": 6.024475480061907e-06, + "loss": 1.5075, + "step": 41000 + }, + { + "epoch": 1.1932396660557931, + "grad_norm": 15.3125, + "learning_rate": 6.022536198389292e-06, + "loss": 1.519, + "step": 41020 + }, + { + "epoch": 1.193821450388341, + "grad_norm": 12.6875, + "learning_rate": 6.020596916716677e-06, + "loss": 1.4778, + "step": 41040 + }, + { + "epoch": 1.194403234720889, + "grad_norm": 11.4375, + "learning_rate": 6.018657635044062e-06, + "loss": 1.4882, + "step": 41060 + }, + { + "epoch": 1.194985019053437, + "grad_norm": 15.0625, + "learning_rate": 6.016718353371447e-06, + "loss": 1.5328, + "step": 41080 + }, + { + "epoch": 1.1955668033859848, + "grad_norm": 13.375, + "learning_rate": 6.014779071698832e-06, + "loss": 1.4973, + "step": 41100 + }, + { + "epoch": 1.1961485877185327, + "grad_norm": 14.875, + "learning_rate": 6.012839790026217e-06, + "loss": 1.5371, + "step": 41120 + }, + { + "epoch": 1.1967303720510807, + "grad_norm": 13.1875, + "learning_rate": 6.0109005083536024e-06, + "loss": 1.5427, + "step": 41140 + }, + { + "epoch": 1.1973121563836286, + "grad_norm": 13.0625, + "learning_rate": 6.0089612266809875e-06, + "loss": 1.4804, + "step": 41160 + }, + { + "epoch": 1.1978939407161766, + "grad_norm": 15.5625, + "learning_rate": 6.007021945008373e-06, + "loss": 1.5384, + "step": 41180 + }, + { + "epoch": 1.1984757250487243, + "grad_norm": 12.625, + "learning_rate": 6.005082663335757e-06, + "loss": 1.5363, + "step": 41200 + }, + { + "epoch": 1.1990575093812723, + "grad_norm": 13.0, + "learning_rate": 6.003143381663142e-06, + "loss": 1.5551, + "step": 41220 + }, + { + "epoch": 1.1996392937138203, + "grad_norm": 12.5, + "learning_rate": 6.001204099990527e-06, + "loss": 1.5575, + "step": 41240 + }, + { + "epoch": 1.2002210780463682, + "grad_norm": 13.0, + "learning_rate": 5.999264818317912e-06, + "loss": 1.586, + "step": 41260 + }, + { + "epoch": 1.2008028623789162, + "grad_norm": 12.8125, + "learning_rate": 5.997325536645297e-06, + "loss": 1.569, + "step": 41280 + }, + { + "epoch": 1.201384646711464, + "grad_norm": 12.1875, + "learning_rate": 5.995386254972682e-06, + "loss": 1.5458, + "step": 41300 + }, + { + "epoch": 1.201966431044012, + "grad_norm": 13.4375, + "learning_rate": 5.9934469733000675e-06, + "loss": 1.523, + "step": 41320 + }, + { + "epoch": 1.2025482153765599, + "grad_norm": 13.9375, + "learning_rate": 5.9915076916274526e-06, + "loss": 1.5145, + "step": 41340 + }, + { + "epoch": 1.2031299997091078, + "grad_norm": 12.875, + "learning_rate": 5.989568409954838e-06, + "loss": 1.5263, + "step": 41360 + }, + { + "epoch": 1.2037117840416558, + "grad_norm": 13.0, + "learning_rate": 5.987629128282223e-06, + "loss": 1.606, + "step": 41380 + }, + { + "epoch": 1.2042935683742038, + "grad_norm": 12.5, + "learning_rate": 5.985689846609608e-06, + "loss": 1.5428, + "step": 41400 + }, + { + "epoch": 1.2048753527067517, + "grad_norm": 12.75, + "learning_rate": 5.983750564936993e-06, + "loss": 1.4788, + "step": 41420 + }, + { + "epoch": 1.2054571370392995, + "grad_norm": 13.875, + "learning_rate": 5.981811283264378e-06, + "loss": 1.5216, + "step": 41440 + }, + { + "epoch": 1.2060389213718474, + "grad_norm": 15.0625, + "learning_rate": 5.979872001591763e-06, + "loss": 1.606, + "step": 41460 + }, + { + "epoch": 1.2066207057043954, + "grad_norm": 12.875, + "learning_rate": 5.977932719919148e-06, + "loss": 1.4665, + "step": 41480 + }, + { + "epoch": 1.2072024900369434, + "grad_norm": 13.5, + "learning_rate": 5.975993438246533e-06, + "loss": 1.5042, + "step": 41500 + }, + { + "epoch": 1.2077842743694913, + "grad_norm": 14.4375, + "learning_rate": 5.9740541565739184e-06, + "loss": 1.5428, + "step": 41520 + }, + { + "epoch": 1.208366058702039, + "grad_norm": 14.625, + "learning_rate": 5.9721148749013035e-06, + "loss": 1.5344, + "step": 41540 + }, + { + "epoch": 1.208947843034587, + "grad_norm": 14.4375, + "learning_rate": 5.970175593228689e-06, + "loss": 1.5693, + "step": 41560 + }, + { + "epoch": 1.209529627367135, + "grad_norm": 14.875, + "learning_rate": 5.968236311556074e-06, + "loss": 1.4785, + "step": 41580 + }, + { + "epoch": 1.210111411699683, + "grad_norm": 14.375, + "learning_rate": 5.966297029883459e-06, + "loss": 1.6008, + "step": 41600 + }, + { + "epoch": 1.210693196032231, + "grad_norm": 14.3125, + "learning_rate": 5.964357748210844e-06, + "loss": 1.4983, + "step": 41620 + }, + { + "epoch": 1.2112749803647787, + "grad_norm": 14.8125, + "learning_rate": 5.962418466538229e-06, + "loss": 1.5593, + "step": 41640 + }, + { + "epoch": 1.2118567646973266, + "grad_norm": 12.875, + "learning_rate": 5.960479184865614e-06, + "loss": 1.4954, + "step": 41660 + }, + { + "epoch": 1.2124385490298746, + "grad_norm": 13.1875, + "learning_rate": 5.958539903192999e-06, + "loss": 1.4851, + "step": 41680 + }, + { + "epoch": 1.2130203333624225, + "grad_norm": 13.5, + "learning_rate": 5.956600621520384e-06, + "loss": 1.4913, + "step": 41700 + }, + { + "epoch": 1.2136021176949705, + "grad_norm": 12.625, + "learning_rate": 5.954661339847769e-06, + "loss": 1.5791, + "step": 41720 + }, + { + "epoch": 1.2141839020275185, + "grad_norm": 12.625, + "learning_rate": 5.9527220581751545e-06, + "loss": 1.5684, + "step": 41740 + }, + { + "epoch": 1.2147656863600664, + "grad_norm": 12.75, + "learning_rate": 5.95078277650254e-06, + "loss": 1.5621, + "step": 41760 + }, + { + "epoch": 1.2153474706926142, + "grad_norm": 13.5, + "learning_rate": 5.948843494829925e-06, + "loss": 1.5606, + "step": 41780 + }, + { + "epoch": 1.2159292550251621, + "grad_norm": 13.3125, + "learning_rate": 5.946904213157308e-06, + "loss": 1.5274, + "step": 41800 + }, + { + "epoch": 1.21651103935771, + "grad_norm": 14.375, + "learning_rate": 5.944964931484693e-06, + "loss": 1.4346, + "step": 41820 + }, + { + "epoch": 1.217092823690258, + "grad_norm": 10.0, + "learning_rate": 5.943025649812078e-06, + "loss": 1.4874, + "step": 41840 + }, + { + "epoch": 1.217674608022806, + "grad_norm": 13.3125, + "learning_rate": 5.941086368139463e-06, + "loss": 1.5017, + "step": 41860 + }, + { + "epoch": 1.2182563923553538, + "grad_norm": 16.125, + "learning_rate": 5.9391470864668485e-06, + "loss": 1.4754, + "step": 41880 + }, + { + "epoch": 1.2188381766879017, + "grad_norm": 14.0625, + "learning_rate": 5.937207804794234e-06, + "loss": 1.5927, + "step": 41900 + }, + { + "epoch": 1.2194199610204497, + "grad_norm": 16.125, + "learning_rate": 5.935268523121619e-06, + "loss": 1.4845, + "step": 41920 + }, + { + "epoch": 1.2200017453529977, + "grad_norm": 13.3125, + "learning_rate": 5.933329241449004e-06, + "loss": 1.53, + "step": 41940 + }, + { + "epoch": 1.2205835296855456, + "grad_norm": 14.0625, + "learning_rate": 5.931389959776389e-06, + "loss": 1.5277, + "step": 41960 + }, + { + "epoch": 1.2211653140180936, + "grad_norm": 12.75, + "learning_rate": 5.929450678103774e-06, + "loss": 1.5369, + "step": 41980 + }, + { + "epoch": 1.2217470983506413, + "grad_norm": 12.3125, + "learning_rate": 5.927511396431159e-06, + "loss": 1.5688, + "step": 42000 + }, + { + "epoch": 1.2223288826831893, + "grad_norm": 12.3125, + "learning_rate": 5.925572114758544e-06, + "loss": 1.5271, + "step": 42020 + }, + { + "epoch": 1.2229106670157373, + "grad_norm": 11.9375, + "learning_rate": 5.923632833085929e-06, + "loss": 1.5179, + "step": 42040 + }, + { + "epoch": 1.2234924513482852, + "grad_norm": 14.9375, + "learning_rate": 5.921693551413314e-06, + "loss": 1.5587, + "step": 42060 + }, + { + "epoch": 1.2240742356808332, + "grad_norm": 13.1875, + "learning_rate": 5.9197542697406995e-06, + "loss": 1.5901, + "step": 42080 + }, + { + "epoch": 1.2246560200133811, + "grad_norm": 11.625, + "learning_rate": 5.9178149880680846e-06, + "loss": 1.5209, + "step": 42100 + }, + { + "epoch": 1.2252378043459289, + "grad_norm": 12.9375, + "learning_rate": 5.91587570639547e-06, + "loss": 1.5068, + "step": 42120 + }, + { + "epoch": 1.2258195886784768, + "grad_norm": 15.375, + "learning_rate": 5.913936424722855e-06, + "loss": 1.4771, + "step": 42140 + }, + { + "epoch": 1.2264013730110248, + "grad_norm": 13.8125, + "learning_rate": 5.91199714305024e-06, + "loss": 1.532, + "step": 42160 + }, + { + "epoch": 1.2269831573435728, + "grad_norm": 13.0625, + "learning_rate": 5.910057861377625e-06, + "loss": 1.5224, + "step": 42180 + }, + { + "epoch": 1.2275649416761207, + "grad_norm": 14.375, + "learning_rate": 5.90811857970501e-06, + "loss": 1.5713, + "step": 42200 + }, + { + "epoch": 1.2281467260086685, + "grad_norm": 13.125, + "learning_rate": 5.906179298032395e-06, + "loss": 1.6152, + "step": 42220 + }, + { + "epoch": 1.2287285103412164, + "grad_norm": 10.125, + "learning_rate": 5.90424001635978e-06, + "loss": 1.5322, + "step": 42240 + }, + { + "epoch": 1.2293102946737644, + "grad_norm": 11.9375, + "learning_rate": 5.902300734687165e-06, + "loss": 1.4988, + "step": 42260 + }, + { + "epoch": 1.2298920790063124, + "grad_norm": 17.375, + "learning_rate": 5.9003614530145504e-06, + "loss": 1.5966, + "step": 42280 + }, + { + "epoch": 1.2304738633388603, + "grad_norm": 10.5, + "learning_rate": 5.8984221713419355e-06, + "loss": 1.5006, + "step": 42300 + }, + { + "epoch": 1.2310556476714083, + "grad_norm": 12.6875, + "learning_rate": 5.896482889669321e-06, + "loss": 1.5911, + "step": 42320 + }, + { + "epoch": 1.2316374320039563, + "grad_norm": 14.9375, + "learning_rate": 5.894543607996706e-06, + "loss": 1.5205, + "step": 42340 + }, + { + "epoch": 1.232219216336504, + "grad_norm": 12.0625, + "learning_rate": 5.892604326324091e-06, + "loss": 1.4876, + "step": 42360 + }, + { + "epoch": 1.232801000669052, + "grad_norm": 13.5, + "learning_rate": 5.890665044651476e-06, + "loss": 1.5186, + "step": 42380 + }, + { + "epoch": 1.2333827850016, + "grad_norm": 12.875, + "learning_rate": 5.888725762978861e-06, + "loss": 1.4763, + "step": 42400 + }, + { + "epoch": 1.2339645693341479, + "grad_norm": 14.125, + "learning_rate": 5.886786481306246e-06, + "loss": 1.5486, + "step": 42420 + }, + { + "epoch": 1.2345463536666959, + "grad_norm": 12.875, + "learning_rate": 5.884847199633631e-06, + "loss": 1.4882, + "step": 42440 + }, + { + "epoch": 1.2351281379992436, + "grad_norm": 12.6875, + "learning_rate": 5.882907917961016e-06, + "loss": 1.5763, + "step": 42460 + }, + { + "epoch": 1.2357099223317916, + "grad_norm": 13.4375, + "learning_rate": 5.8809686362884014e-06, + "loss": 1.452, + "step": 42480 + }, + { + "epoch": 1.2362917066643395, + "grad_norm": 14.5625, + "learning_rate": 5.8790293546157865e-06, + "loss": 1.5535, + "step": 42500 + }, + { + "epoch": 1.2368734909968875, + "grad_norm": 14.8125, + "learning_rate": 5.877090072943172e-06, + "loss": 1.5531, + "step": 42520 + }, + { + "epoch": 1.2374552753294354, + "grad_norm": 12.375, + "learning_rate": 5.875150791270557e-06, + "loss": 1.4908, + "step": 42540 + }, + { + "epoch": 1.2380370596619832, + "grad_norm": 14.0, + "learning_rate": 5.873211509597942e-06, + "loss": 1.4919, + "step": 42560 + }, + { + "epoch": 1.2386188439945311, + "grad_norm": 12.9375, + "learning_rate": 5.871272227925326e-06, + "loss": 1.4404, + "step": 42580 + }, + { + "epoch": 1.2392006283270791, + "grad_norm": 13.9375, + "learning_rate": 5.869332946252711e-06, + "loss": 1.4845, + "step": 42600 + }, + { + "epoch": 1.239782412659627, + "grad_norm": 12.1875, + "learning_rate": 5.867393664580096e-06, + "loss": 1.5165, + "step": 42620 + }, + { + "epoch": 1.240364196992175, + "grad_norm": 13.0, + "learning_rate": 5.865454382907481e-06, + "loss": 1.5542, + "step": 42640 + }, + { + "epoch": 1.240945981324723, + "grad_norm": 12.375, + "learning_rate": 5.8635151012348665e-06, + "loss": 1.4909, + "step": 42660 + }, + { + "epoch": 1.241527765657271, + "grad_norm": 12.375, + "learning_rate": 5.8615758195622516e-06, + "loss": 1.5737, + "step": 42680 + }, + { + "epoch": 1.2421095499898187, + "grad_norm": 12.1875, + "learning_rate": 5.859636537889637e-06, + "loss": 1.5644, + "step": 42700 + }, + { + "epoch": 1.2426913343223667, + "grad_norm": 13.625, + "learning_rate": 5.857697256217022e-06, + "loss": 1.596, + "step": 42720 + }, + { + "epoch": 1.2432731186549146, + "grad_norm": 11.5625, + "learning_rate": 5.855757974544407e-06, + "loss": 1.4895, + "step": 42740 + }, + { + "epoch": 1.2438549029874626, + "grad_norm": 14.5, + "learning_rate": 5.853818692871792e-06, + "loss": 1.5701, + "step": 42760 + }, + { + "epoch": 1.2444366873200106, + "grad_norm": 14.9375, + "learning_rate": 5.851879411199177e-06, + "loss": 1.5158, + "step": 42780 + }, + { + "epoch": 1.2450184716525583, + "grad_norm": 14.8125, + "learning_rate": 5.849940129526562e-06, + "loss": 1.5914, + "step": 42800 + }, + { + "epoch": 1.2456002559851063, + "grad_norm": 11.5625, + "learning_rate": 5.848000847853947e-06, + "loss": 1.5527, + "step": 42820 + }, + { + "epoch": 1.2461820403176542, + "grad_norm": 13.625, + "learning_rate": 5.846061566181332e-06, + "loss": 1.5446, + "step": 42840 + }, + { + "epoch": 1.2467638246502022, + "grad_norm": 12.625, + "learning_rate": 5.8441222845087174e-06, + "loss": 1.4778, + "step": 42860 + }, + { + "epoch": 1.2473456089827502, + "grad_norm": 15.875, + "learning_rate": 5.8421830028361025e-06, + "loss": 1.6255, + "step": 42880 + }, + { + "epoch": 1.247927393315298, + "grad_norm": 14.125, + "learning_rate": 5.840243721163488e-06, + "loss": 1.5588, + "step": 42900 + }, + { + "epoch": 1.2485091776478459, + "grad_norm": 13.125, + "learning_rate": 5.838304439490873e-06, + "loss": 1.5103, + "step": 42920 + }, + { + "epoch": 1.2490909619803938, + "grad_norm": 14.9375, + "learning_rate": 5.836365157818258e-06, + "loss": 1.5243, + "step": 42940 + }, + { + "epoch": 1.2496727463129418, + "grad_norm": 13.125, + "learning_rate": 5.834425876145643e-06, + "loss": 1.5647, + "step": 42960 + }, + { + "epoch": 1.2502545306454897, + "grad_norm": 12.375, + "learning_rate": 5.832486594473028e-06, + "loss": 1.5204, + "step": 42980 + }, + { + "epoch": 1.2508363149780377, + "grad_norm": 11.1875, + "learning_rate": 5.830547312800413e-06, + "loss": 1.5199, + "step": 43000 + }, + { + "epoch": 1.2514180993105857, + "grad_norm": 13.1875, + "learning_rate": 5.828608031127798e-06, + "loss": 1.5245, + "step": 43020 + }, + { + "epoch": 1.2519998836431334, + "grad_norm": 13.3125, + "learning_rate": 5.826668749455183e-06, + "loss": 1.5144, + "step": 43040 + }, + { + "epoch": 1.2525816679756814, + "grad_norm": 13.4375, + "learning_rate": 5.824729467782568e-06, + "loss": 1.5476, + "step": 43060 + }, + { + "epoch": 1.2531634523082293, + "grad_norm": 13.625, + "learning_rate": 5.8227901861099535e-06, + "loss": 1.4843, + "step": 43080 + }, + { + "epoch": 1.2537452366407773, + "grad_norm": 14.125, + "learning_rate": 5.820850904437339e-06, + "loss": 1.4631, + "step": 43100 + }, + { + "epoch": 1.2543270209733253, + "grad_norm": 13.25, + "learning_rate": 5.818911622764724e-06, + "loss": 1.5837, + "step": 43120 + }, + { + "epoch": 1.254908805305873, + "grad_norm": 15.0625, + "learning_rate": 5.816972341092109e-06, + "loss": 1.4783, + "step": 43140 + }, + { + "epoch": 1.255490589638421, + "grad_norm": 11.625, + "learning_rate": 5.815033059419494e-06, + "loss": 1.5691, + "step": 43160 + }, + { + "epoch": 1.256072373970969, + "grad_norm": 12.8125, + "learning_rate": 5.813093777746879e-06, + "loss": 1.5289, + "step": 43180 + }, + { + "epoch": 1.256654158303517, + "grad_norm": 15.625, + "learning_rate": 5.811154496074264e-06, + "loss": 1.5228, + "step": 43200 + }, + { + "epoch": 1.2572359426360649, + "grad_norm": 15.9375, + "learning_rate": 5.809215214401649e-06, + "loss": 1.5698, + "step": 43220 + }, + { + "epoch": 1.2578177269686126, + "grad_norm": 13.6875, + "learning_rate": 5.807275932729034e-06, + "loss": 1.511, + "step": 43240 + }, + { + "epoch": 1.2583995113011608, + "grad_norm": 13.375, + "learning_rate": 5.805336651056419e-06, + "loss": 1.599, + "step": 43260 + }, + { + "epoch": 1.2589812956337085, + "grad_norm": 11.875, + "learning_rate": 5.8033973693838045e-06, + "loss": 1.4926, + "step": 43280 + }, + { + "epoch": 1.2595630799662565, + "grad_norm": 13.9375, + "learning_rate": 5.8014580877111896e-06, + "loss": 1.537, + "step": 43300 + }, + { + "epoch": 1.2601448642988045, + "grad_norm": 16.125, + "learning_rate": 5.799518806038575e-06, + "loss": 1.4502, + "step": 43320 + }, + { + "epoch": 1.2607266486313524, + "grad_norm": 14.25, + "learning_rate": 5.79757952436596e-06, + "loss": 1.4721, + "step": 43340 + }, + { + "epoch": 1.2613084329639004, + "grad_norm": 12.5, + "learning_rate": 5.795640242693345e-06, + "loss": 1.537, + "step": 43360 + }, + { + "epoch": 1.2618902172964481, + "grad_norm": 15.125, + "learning_rate": 5.79370096102073e-06, + "loss": 1.538, + "step": 43380 + }, + { + "epoch": 1.262472001628996, + "grad_norm": 14.0625, + "learning_rate": 5.791761679348114e-06, + "loss": 1.5307, + "step": 43400 + }, + { + "epoch": 1.263053785961544, + "grad_norm": 14.3125, + "learning_rate": 5.789822397675499e-06, + "loss": 1.5407, + "step": 43420 + }, + { + "epoch": 1.263635570294092, + "grad_norm": 11.5, + "learning_rate": 5.787883116002884e-06, + "loss": 1.5532, + "step": 43440 + }, + { + "epoch": 1.26421735462664, + "grad_norm": 11.5, + "learning_rate": 5.7859438343302695e-06, + "loss": 1.6063, + "step": 43460 + }, + { + "epoch": 1.2647991389591877, + "grad_norm": 13.5, + "learning_rate": 5.784004552657655e-06, + "loss": 1.5031, + "step": 43480 + }, + { + "epoch": 1.2653809232917357, + "grad_norm": 13.25, + "learning_rate": 5.78206527098504e-06, + "loss": 1.5268, + "step": 43500 + }, + { + "epoch": 1.2659627076242836, + "grad_norm": 12.0625, + "learning_rate": 5.780125989312425e-06, + "loss": 1.4692, + "step": 43520 + }, + { + "epoch": 1.2665444919568316, + "grad_norm": 14.25, + "learning_rate": 5.77818670763981e-06, + "loss": 1.4345, + "step": 43540 + }, + { + "epoch": 1.2671262762893796, + "grad_norm": 13.375, + "learning_rate": 5.776247425967195e-06, + "loss": 1.4452, + "step": 43560 + }, + { + "epoch": 1.2677080606219273, + "grad_norm": 9.8125, + "learning_rate": 5.77430814429458e-06, + "loss": 1.5405, + "step": 43580 + }, + { + "epoch": 1.2682898449544755, + "grad_norm": 13.5625, + "learning_rate": 5.772368862621965e-06, + "loss": 1.5249, + "step": 43600 + }, + { + "epoch": 1.2688716292870232, + "grad_norm": 15.5, + "learning_rate": 5.77042958094935e-06, + "loss": 1.499, + "step": 43620 + }, + { + "epoch": 1.2694534136195712, + "grad_norm": 13.0625, + "learning_rate": 5.768490299276735e-06, + "loss": 1.4628, + "step": 43640 + }, + { + "epoch": 1.2700351979521192, + "grad_norm": 13.125, + "learning_rate": 5.7665510176041205e-06, + "loss": 1.464, + "step": 43660 + }, + { + "epoch": 1.2706169822846671, + "grad_norm": 11.375, + "learning_rate": 5.764611735931506e-06, + "loss": 1.5615, + "step": 43680 + }, + { + "epoch": 1.271198766617215, + "grad_norm": 13.625, + "learning_rate": 5.762672454258891e-06, + "loss": 1.5503, + "step": 43700 + }, + { + "epoch": 1.2717805509497628, + "grad_norm": 11.25, + "learning_rate": 5.760733172586276e-06, + "loss": 1.3915, + "step": 43720 + }, + { + "epoch": 1.2723623352823108, + "grad_norm": 13.625, + "learning_rate": 5.758793890913661e-06, + "loss": 1.6215, + "step": 43740 + }, + { + "epoch": 1.2729441196148588, + "grad_norm": 16.5, + "learning_rate": 5.756854609241046e-06, + "loss": 1.5395, + "step": 43760 + }, + { + "epoch": 1.2735259039474067, + "grad_norm": 17.125, + "learning_rate": 5.754915327568431e-06, + "loss": 1.5404, + "step": 43780 + }, + { + "epoch": 1.2741076882799547, + "grad_norm": 13.5, + "learning_rate": 5.752976045895816e-06, + "loss": 1.4823, + "step": 43800 + }, + { + "epoch": 1.2746894726125024, + "grad_norm": 12.0625, + "learning_rate": 5.751036764223201e-06, + "loss": 1.5202, + "step": 43820 + }, + { + "epoch": 1.2752712569450504, + "grad_norm": 14.875, + "learning_rate": 5.749097482550586e-06, + "loss": 1.5736, + "step": 43840 + }, + { + "epoch": 1.2758530412775984, + "grad_norm": 12.5625, + "learning_rate": 5.7471582008779715e-06, + "loss": 1.4768, + "step": 43860 + }, + { + "epoch": 1.2764348256101463, + "grad_norm": 16.875, + "learning_rate": 5.7452189192053566e-06, + "loss": 1.5277, + "step": 43880 + }, + { + "epoch": 1.2770166099426943, + "grad_norm": 11.0625, + "learning_rate": 5.743279637532742e-06, + "loss": 1.4631, + "step": 43900 + }, + { + "epoch": 1.2775983942752422, + "grad_norm": 14.875, + "learning_rate": 5.741340355860127e-06, + "loss": 1.5205, + "step": 43920 + }, + { + "epoch": 1.2781801786077902, + "grad_norm": 13.0, + "learning_rate": 5.739401074187512e-06, + "loss": 1.5229, + "step": 43940 + }, + { + "epoch": 1.278761962940338, + "grad_norm": 12.5, + "learning_rate": 5.737461792514897e-06, + "loss": 1.5677, + "step": 43960 + }, + { + "epoch": 1.279343747272886, + "grad_norm": 13.5, + "learning_rate": 5.735522510842282e-06, + "loss": 1.4604, + "step": 43980 + }, + { + "epoch": 1.2799255316054339, + "grad_norm": 13.125, + "learning_rate": 5.733583229169667e-06, + "loss": 1.5695, + "step": 44000 + }, + { + "epoch": 1.2805073159379818, + "grad_norm": 14.375, + "learning_rate": 5.731643947497052e-06, + "loss": 1.5317, + "step": 44020 + }, + { + "epoch": 1.2810891002705298, + "grad_norm": 14.5, + "learning_rate": 5.729704665824437e-06, + "loss": 1.5426, + "step": 44040 + }, + { + "epoch": 1.2816708846030775, + "grad_norm": 15.5625, + "learning_rate": 5.7277653841518224e-06, + "loss": 1.6236, + "step": 44060 + }, + { + "epoch": 1.2822526689356255, + "grad_norm": 12.4375, + "learning_rate": 5.7258261024792075e-06, + "loss": 1.5514, + "step": 44080 + }, + { + "epoch": 1.2828344532681735, + "grad_norm": 13.4375, + "learning_rate": 5.723886820806593e-06, + "loss": 1.5181, + "step": 44100 + }, + { + "epoch": 1.2834162376007214, + "grad_norm": 13.3125, + "learning_rate": 5.721947539133978e-06, + "loss": 1.5044, + "step": 44120 + }, + { + "epoch": 1.2839980219332694, + "grad_norm": 11.75, + "learning_rate": 5.720008257461363e-06, + "loss": 1.5696, + "step": 44140 + }, + { + "epoch": 1.2845798062658171, + "grad_norm": 14.1875, + "learning_rate": 5.718068975788748e-06, + "loss": 1.4719, + "step": 44160 + }, + { + "epoch": 1.2851615905983653, + "grad_norm": 13.375, + "learning_rate": 5.716129694116133e-06, + "loss": 1.4986, + "step": 44180 + }, + { + "epoch": 1.285743374930913, + "grad_norm": 12.8125, + "learning_rate": 5.714190412443517e-06, + "loss": 1.5602, + "step": 44200 + }, + { + "epoch": 1.286325159263461, + "grad_norm": 10.875, + "learning_rate": 5.712251130770902e-06, + "loss": 1.5185, + "step": 44220 + }, + { + "epoch": 1.286906943596009, + "grad_norm": 12.0625, + "learning_rate": 5.7103118490982875e-06, + "loss": 1.4907, + "step": 44240 + }, + { + "epoch": 1.287488727928557, + "grad_norm": 12.0625, + "learning_rate": 5.7083725674256726e-06, + "loss": 1.4145, + "step": 44260 + }, + { + "epoch": 1.288070512261105, + "grad_norm": 14.375, + "learning_rate": 5.706433285753058e-06, + "loss": 1.5275, + "step": 44280 + }, + { + "epoch": 1.2886522965936527, + "grad_norm": 15.3125, + "learning_rate": 5.704494004080443e-06, + "loss": 1.5402, + "step": 44300 + }, + { + "epoch": 1.2892340809262006, + "grad_norm": 12.375, + "learning_rate": 5.702554722407828e-06, + "loss": 1.6014, + "step": 44320 + }, + { + "epoch": 1.2898158652587486, + "grad_norm": 13.75, + "learning_rate": 5.700615440735213e-06, + "loss": 1.6052, + "step": 44340 + }, + { + "epoch": 1.2903976495912965, + "grad_norm": 12.5625, + "learning_rate": 5.698676159062598e-06, + "loss": 1.4923, + "step": 44360 + }, + { + "epoch": 1.2909794339238445, + "grad_norm": 11.5625, + "learning_rate": 5.696736877389983e-06, + "loss": 1.4916, + "step": 44380 + }, + { + "epoch": 1.2915612182563923, + "grad_norm": 13.4375, + "learning_rate": 5.694797595717368e-06, + "loss": 1.5162, + "step": 44400 + }, + { + "epoch": 1.2921430025889402, + "grad_norm": 11.875, + "learning_rate": 5.692858314044753e-06, + "loss": 1.6063, + "step": 44420 + }, + { + "epoch": 1.2927247869214882, + "grad_norm": 11.6875, + "learning_rate": 5.6909190323721384e-06, + "loss": 1.6026, + "step": 44440 + }, + { + "epoch": 1.2933065712540361, + "grad_norm": 12.3125, + "learning_rate": 5.6889797506995235e-06, + "loss": 1.3727, + "step": 44460 + }, + { + "epoch": 1.293888355586584, + "grad_norm": 12.4375, + "learning_rate": 5.687040469026909e-06, + "loss": 1.517, + "step": 44480 + }, + { + "epoch": 1.2944701399191318, + "grad_norm": 11.3125, + "learning_rate": 5.685101187354294e-06, + "loss": 1.4526, + "step": 44500 + }, + { + "epoch": 1.29505192425168, + "grad_norm": 17.375, + "learning_rate": 5.683161905681679e-06, + "loss": 1.6621, + "step": 44520 + }, + { + "epoch": 1.2956337085842278, + "grad_norm": 14.9375, + "learning_rate": 5.681222624009064e-06, + "loss": 1.5943, + "step": 44540 + }, + { + "epoch": 1.2962154929167757, + "grad_norm": 12.4375, + "learning_rate": 5.679283342336449e-06, + "loss": 1.5046, + "step": 44560 + }, + { + "epoch": 1.2967972772493237, + "grad_norm": 14.75, + "learning_rate": 5.677344060663834e-06, + "loss": 1.5444, + "step": 44580 + }, + { + "epoch": 1.2973790615818717, + "grad_norm": 13.8125, + "learning_rate": 5.675404778991219e-06, + "loss": 1.5583, + "step": 44600 + }, + { + "epoch": 1.2979608459144196, + "grad_norm": 14.625, + "learning_rate": 5.673465497318604e-06, + "loss": 1.4507, + "step": 44620 + }, + { + "epoch": 1.2985426302469674, + "grad_norm": 11.0625, + "learning_rate": 5.671526215645989e-06, + "loss": 1.5596, + "step": 44640 + }, + { + "epoch": 1.2991244145795153, + "grad_norm": 13.25, + "learning_rate": 5.6695869339733745e-06, + "loss": 1.5583, + "step": 44660 + }, + { + "epoch": 1.2997061989120633, + "grad_norm": 38.75, + "learning_rate": 5.66764765230076e-06, + "loss": 1.5247, + "step": 44680 + }, + { + "epoch": 1.3002879832446113, + "grad_norm": 7.75, + "learning_rate": 5.665708370628145e-06, + "loss": 1.3917, + "step": 44700 + }, + { + "epoch": 1.3008697675771592, + "grad_norm": 15.0625, + "learning_rate": 5.66376908895553e-06, + "loss": 1.5193, + "step": 44720 + }, + { + "epoch": 1.301451551909707, + "grad_norm": 13.3125, + "learning_rate": 5.661829807282915e-06, + "loss": 1.5081, + "step": 44740 + }, + { + "epoch": 1.302033336242255, + "grad_norm": 14.75, + "learning_rate": 5.6598905256103e-06, + "loss": 1.4606, + "step": 44760 + }, + { + "epoch": 1.3026151205748029, + "grad_norm": 11.4375, + "learning_rate": 5.657951243937685e-06, + "loss": 1.5302, + "step": 44780 + }, + { + "epoch": 1.3031969049073509, + "grad_norm": 10.875, + "learning_rate": 5.65601196226507e-06, + "loss": 1.53, + "step": 44800 + }, + { + "epoch": 1.3037786892398988, + "grad_norm": 12.6875, + "learning_rate": 5.654072680592455e-06, + "loss": 1.5268, + "step": 44820 + }, + { + "epoch": 1.3043604735724468, + "grad_norm": 12.625, + "learning_rate": 5.65213339891984e-06, + "loss": 1.5246, + "step": 44840 + }, + { + "epoch": 1.3049422579049947, + "grad_norm": 13.9375, + "learning_rate": 5.6501941172472255e-06, + "loss": 1.5355, + "step": 44860 + }, + { + "epoch": 1.3055240422375425, + "grad_norm": 12.4375, + "learning_rate": 5.648254835574611e-06, + "loss": 1.5432, + "step": 44880 + }, + { + "epoch": 1.3061058265700904, + "grad_norm": 13.3125, + "learning_rate": 5.646315553901996e-06, + "loss": 1.5031, + "step": 44900 + }, + { + "epoch": 1.3066876109026384, + "grad_norm": 12.75, + "learning_rate": 5.644376272229381e-06, + "loss": 1.4694, + "step": 44920 + }, + { + "epoch": 1.3072693952351864, + "grad_norm": 13.5, + "learning_rate": 5.642436990556766e-06, + "loss": 1.5157, + "step": 44940 + }, + { + "epoch": 1.3078511795677343, + "grad_norm": 12.0625, + "learning_rate": 5.640497708884151e-06, + "loss": 1.461, + "step": 44960 + }, + { + "epoch": 1.308432963900282, + "grad_norm": 12.6875, + "learning_rate": 5.638558427211536e-06, + "loss": 1.5395, + "step": 44980 + }, + { + "epoch": 1.30901474823283, + "grad_norm": 12.0, + "learning_rate": 5.636619145538921e-06, + "loss": 1.5547, + "step": 45000 + }, + { + "epoch": 1.309596532565378, + "grad_norm": 10.1875, + "learning_rate": 5.634679863866305e-06, + "loss": 1.5409, + "step": 45020 + }, + { + "epoch": 1.310178316897926, + "grad_norm": 13.4375, + "learning_rate": 5.6327405821936905e-06, + "loss": 1.4595, + "step": 45040 + }, + { + "epoch": 1.310760101230474, + "grad_norm": 11.3125, + "learning_rate": 5.630801300521076e-06, + "loss": 1.4294, + "step": 45060 + }, + { + "epoch": 1.3113418855630217, + "grad_norm": 15.625, + "learning_rate": 5.628862018848461e-06, + "loss": 1.5729, + "step": 45080 + }, + { + "epoch": 1.3119236698955696, + "grad_norm": 12.875, + "learning_rate": 5.626922737175846e-06, + "loss": 1.4949, + "step": 45100 + }, + { + "epoch": 1.3125054542281176, + "grad_norm": 15.4375, + "learning_rate": 5.624983455503231e-06, + "loss": 1.5148, + "step": 45120 + }, + { + "epoch": 1.3130872385606656, + "grad_norm": 12.5, + "learning_rate": 5.623044173830616e-06, + "loss": 1.5115, + "step": 45140 + }, + { + "epoch": 1.3136690228932135, + "grad_norm": 14.25, + "learning_rate": 5.621104892158001e-06, + "loss": 1.4664, + "step": 45160 + }, + { + "epoch": 1.3142508072257615, + "grad_norm": 10.4375, + "learning_rate": 5.619165610485386e-06, + "loss": 1.5492, + "step": 45180 + }, + { + "epoch": 1.3148325915583094, + "grad_norm": 11.625, + "learning_rate": 5.617226328812771e-06, + "loss": 1.4522, + "step": 45200 + }, + { + "epoch": 1.3154143758908572, + "grad_norm": 12.1875, + "learning_rate": 5.615287047140156e-06, + "loss": 1.582, + "step": 45220 + }, + { + "epoch": 1.3159961602234052, + "grad_norm": 14.0625, + "learning_rate": 5.6133477654675415e-06, + "loss": 1.4966, + "step": 45240 + }, + { + "epoch": 1.3165779445559531, + "grad_norm": 11.9375, + "learning_rate": 5.611408483794927e-06, + "loss": 1.5052, + "step": 45260 + }, + { + "epoch": 1.317159728888501, + "grad_norm": 11.9375, + "learning_rate": 5.609469202122312e-06, + "loss": 1.558, + "step": 45280 + }, + { + "epoch": 1.317741513221049, + "grad_norm": 12.9375, + "learning_rate": 5.607529920449697e-06, + "loss": 1.4795, + "step": 45300 + }, + { + "epoch": 1.3183232975535968, + "grad_norm": 11.8125, + "learning_rate": 5.605590638777082e-06, + "loss": 1.4875, + "step": 45320 + }, + { + "epoch": 1.3189050818861447, + "grad_norm": 14.0625, + "learning_rate": 5.603651357104467e-06, + "loss": 1.4081, + "step": 45340 + }, + { + "epoch": 1.3194868662186927, + "grad_norm": 13.0625, + "learning_rate": 5.601712075431852e-06, + "loss": 1.6053, + "step": 45360 + }, + { + "epoch": 1.3200686505512407, + "grad_norm": 17.0, + "learning_rate": 5.599772793759237e-06, + "loss": 1.5608, + "step": 45380 + }, + { + "epoch": 1.3206504348837886, + "grad_norm": 12.375, + "learning_rate": 5.597833512086622e-06, + "loss": 1.5341, + "step": 45400 + }, + { + "epoch": 1.3212322192163364, + "grad_norm": 15.6875, + "learning_rate": 5.595894230414007e-06, + "loss": 1.5038, + "step": 45420 + }, + { + "epoch": 1.3218140035488846, + "grad_norm": 13.6875, + "learning_rate": 5.5939549487413925e-06, + "loss": 1.4798, + "step": 45440 + }, + { + "epoch": 1.3223957878814323, + "grad_norm": 14.375, + "learning_rate": 5.5920156670687776e-06, + "loss": 1.5253, + "step": 45460 + }, + { + "epoch": 1.3229775722139803, + "grad_norm": 16.125, + "learning_rate": 5.590076385396163e-06, + "loss": 1.4623, + "step": 45480 + }, + { + "epoch": 1.3235593565465282, + "grad_norm": 12.0, + "learning_rate": 5.588137103723548e-06, + "loss": 1.5533, + "step": 45500 + }, + { + "epoch": 1.3241411408790762, + "grad_norm": 11.375, + "learning_rate": 5.586197822050933e-06, + "loss": 1.5577, + "step": 45520 + }, + { + "epoch": 1.3247229252116242, + "grad_norm": 11.125, + "learning_rate": 5.584258540378318e-06, + "loss": 1.5427, + "step": 45540 + }, + { + "epoch": 1.325304709544172, + "grad_norm": 13.9375, + "learning_rate": 5.582319258705703e-06, + "loss": 1.5284, + "step": 45560 + }, + { + "epoch": 1.3258864938767199, + "grad_norm": 12.375, + "learning_rate": 5.580379977033088e-06, + "loss": 1.5029, + "step": 45580 + }, + { + "epoch": 1.3264682782092678, + "grad_norm": 12.1875, + "learning_rate": 5.578440695360473e-06, + "loss": 1.5527, + "step": 45600 + }, + { + "epoch": 1.3270500625418158, + "grad_norm": 12.6875, + "learning_rate": 5.576501413687858e-06, + "loss": 1.5544, + "step": 45620 + }, + { + "epoch": 1.3276318468743638, + "grad_norm": 14.1875, + "learning_rate": 5.5745621320152434e-06, + "loss": 1.5591, + "step": 45640 + }, + { + "epoch": 1.3282136312069115, + "grad_norm": 12.8125, + "learning_rate": 5.5726228503426285e-06, + "loss": 1.4167, + "step": 45660 + }, + { + "epoch": 1.3287954155394595, + "grad_norm": 13.375, + "learning_rate": 5.570683568670014e-06, + "loss": 1.4603, + "step": 45680 + }, + { + "epoch": 1.3293771998720074, + "grad_norm": 15.0625, + "learning_rate": 5.568744286997399e-06, + "loss": 1.5703, + "step": 45700 + }, + { + "epoch": 1.3299589842045554, + "grad_norm": 14.6875, + "learning_rate": 5.566805005324784e-06, + "loss": 1.5153, + "step": 45720 + }, + { + "epoch": 1.3305407685371033, + "grad_norm": 14.375, + "learning_rate": 5.564865723652169e-06, + "loss": 1.4395, + "step": 45740 + }, + { + "epoch": 1.331122552869651, + "grad_norm": 14.875, + "learning_rate": 5.562926441979554e-06, + "loss": 1.394, + "step": 45760 + }, + { + "epoch": 1.3317043372021993, + "grad_norm": 13.5625, + "learning_rate": 5.560987160306939e-06, + "loss": 1.501, + "step": 45780 + }, + { + "epoch": 1.332286121534747, + "grad_norm": 12.8125, + "learning_rate": 5.559047878634324e-06, + "loss": 1.4852, + "step": 45800 + }, + { + "epoch": 1.332867905867295, + "grad_norm": 12.25, + "learning_rate": 5.557108596961709e-06, + "loss": 1.5356, + "step": 45820 + }, + { + "epoch": 1.333449690199843, + "grad_norm": 13.25, + "learning_rate": 5.5551693152890936e-06, + "loss": 1.4918, + "step": 45840 + }, + { + "epoch": 1.334031474532391, + "grad_norm": 15.5, + "learning_rate": 5.553230033616479e-06, + "loss": 1.4625, + "step": 45860 + }, + { + "epoch": 1.3346132588649389, + "grad_norm": 13.0, + "learning_rate": 5.551290751943864e-06, + "loss": 1.4486, + "step": 45880 + }, + { + "epoch": 1.3351950431974866, + "grad_norm": 12.1875, + "learning_rate": 5.549351470271249e-06, + "loss": 1.4094, + "step": 45900 + }, + { + "epoch": 1.3357768275300346, + "grad_norm": 12.875, + "learning_rate": 5.547412188598634e-06, + "loss": 1.5693, + "step": 45920 + }, + { + "epoch": 1.3363586118625825, + "grad_norm": 11.0625, + "learning_rate": 5.545472906926019e-06, + "loss": 1.5614, + "step": 45940 + }, + { + "epoch": 1.3369403961951305, + "grad_norm": 14.25, + "learning_rate": 5.543533625253404e-06, + "loss": 1.5938, + "step": 45960 + }, + { + "epoch": 1.3375221805276785, + "grad_norm": 11.625, + "learning_rate": 5.541594343580789e-06, + "loss": 1.4579, + "step": 45980 + }, + { + "epoch": 1.3381039648602262, + "grad_norm": 14.875, + "learning_rate": 5.539655061908174e-06, + "loss": 1.5772, + "step": 46000 + }, + { + "epoch": 1.3386857491927742, + "grad_norm": 14.0, + "learning_rate": 5.5377157802355594e-06, + "loss": 1.6044, + "step": 46020 + }, + { + "epoch": 1.3392675335253221, + "grad_norm": 13.6875, + "learning_rate": 5.535776498562944e-06, + "loss": 1.5251, + "step": 46040 + }, + { + "epoch": 1.33984931785787, + "grad_norm": 13.125, + "learning_rate": 5.533837216890329e-06, + "loss": 1.5302, + "step": 46060 + }, + { + "epoch": 1.340431102190418, + "grad_norm": 11.9375, + "learning_rate": 5.531897935217714e-06, + "loss": 1.5406, + "step": 46080 + }, + { + "epoch": 1.341012886522966, + "grad_norm": 12.5, + "learning_rate": 5.529958653545099e-06, + "loss": 1.5536, + "step": 46100 + }, + { + "epoch": 1.341594670855514, + "grad_norm": 17.875, + "learning_rate": 5.528019371872484e-06, + "loss": 1.5169, + "step": 46120 + }, + { + "epoch": 1.3421764551880617, + "grad_norm": 16.125, + "learning_rate": 5.526080090199869e-06, + "loss": 1.5308, + "step": 46140 + }, + { + "epoch": 1.3427582395206097, + "grad_norm": 11.3125, + "learning_rate": 5.524140808527254e-06, + "loss": 1.61, + "step": 46160 + }, + { + "epoch": 1.3433400238531576, + "grad_norm": 13.875, + "learning_rate": 5.522201526854639e-06, + "loss": 1.5143, + "step": 46180 + }, + { + "epoch": 1.3439218081857056, + "grad_norm": 13.5, + "learning_rate": 5.5202622451820245e-06, + "loss": 1.5182, + "step": 46200 + }, + { + "epoch": 1.3445035925182536, + "grad_norm": 13.25, + "learning_rate": 5.5183229635094096e-06, + "loss": 1.49, + "step": 46220 + }, + { + "epoch": 1.3450853768508013, + "grad_norm": 13.75, + "learning_rate": 5.516383681836795e-06, + "loss": 1.4638, + "step": 46240 + }, + { + "epoch": 1.3456671611833493, + "grad_norm": 15.5, + "learning_rate": 5.51444440016418e-06, + "loss": 1.4815, + "step": 46260 + }, + { + "epoch": 1.3462489455158972, + "grad_norm": 13.0, + "learning_rate": 5.512505118491565e-06, + "loss": 1.4858, + "step": 46280 + }, + { + "epoch": 1.3468307298484452, + "grad_norm": 15.25, + "learning_rate": 5.51056583681895e-06, + "loss": 1.5412, + "step": 46300 + }, + { + "epoch": 1.3474125141809932, + "grad_norm": 13.375, + "learning_rate": 5.508626555146335e-06, + "loss": 1.5732, + "step": 46320 + }, + { + "epoch": 1.347994298513541, + "grad_norm": 14.1875, + "learning_rate": 5.50668727347372e-06, + "loss": 1.548, + "step": 46340 + }, + { + "epoch": 1.3485760828460889, + "grad_norm": 13.8125, + "learning_rate": 5.504747991801105e-06, + "loss": 1.4905, + "step": 46360 + }, + { + "epoch": 1.3491578671786368, + "grad_norm": 14.25, + "learning_rate": 5.50280871012849e-06, + "loss": 1.4526, + "step": 46380 + }, + { + "epoch": 1.3497396515111848, + "grad_norm": 11.0, + "learning_rate": 5.500869428455875e-06, + "loss": 1.4639, + "step": 46400 + }, + { + "epoch": 1.3503214358437328, + "grad_norm": 13.5625, + "learning_rate": 5.49893014678326e-06, + "loss": 1.5546, + "step": 46420 + }, + { + "epoch": 1.3509032201762807, + "grad_norm": 13.8125, + "learning_rate": 5.496990865110645e-06, + "loss": 1.5003, + "step": 46440 + }, + { + "epoch": 1.3514850045088287, + "grad_norm": 13.3125, + "learning_rate": 5.49505158343803e-06, + "loss": 1.5832, + "step": 46460 + }, + { + "epoch": 1.3520667888413764, + "grad_norm": 13.75, + "learning_rate": 5.493112301765415e-06, + "loss": 1.5994, + "step": 46480 + }, + { + "epoch": 1.3526485731739244, + "grad_norm": 13.625, + "learning_rate": 5.4911730200928e-06, + "loss": 1.4468, + "step": 46500 + }, + { + "epoch": 1.3532303575064724, + "grad_norm": 14.9375, + "learning_rate": 5.489233738420185e-06, + "loss": 1.4975, + "step": 46520 + }, + { + "epoch": 1.3538121418390203, + "grad_norm": 12.375, + "learning_rate": 5.48729445674757e-06, + "loss": 1.4866, + "step": 46540 + }, + { + "epoch": 1.3543939261715683, + "grad_norm": 13.375, + "learning_rate": 5.485355175074955e-06, + "loss": 1.5129, + "step": 46560 + }, + { + "epoch": 1.354975710504116, + "grad_norm": 12.9375, + "learning_rate": 5.4834158934023405e-06, + "loss": 1.5391, + "step": 46580 + }, + { + "epoch": 1.355557494836664, + "grad_norm": 12.8125, + "learning_rate": 5.4814766117297256e-06, + "loss": 1.5308, + "step": 46600 + }, + { + "epoch": 1.356139279169212, + "grad_norm": 13.25, + "learning_rate": 5.479537330057111e-06, + "loss": 1.5014, + "step": 46620 + }, + { + "epoch": 1.35672106350176, + "grad_norm": 13.5, + "learning_rate": 5.477598048384496e-06, + "loss": 1.5595, + "step": 46640 + }, + { + "epoch": 1.3573028478343079, + "grad_norm": 13.5, + "learning_rate": 5.475658766711881e-06, + "loss": 1.4568, + "step": 46660 + }, + { + "epoch": 1.3578846321668556, + "grad_norm": 13.3125, + "learning_rate": 5.473719485039266e-06, + "loss": 1.5577, + "step": 46680 + }, + { + "epoch": 1.3584664164994038, + "grad_norm": 12.6875, + "learning_rate": 5.471780203366651e-06, + "loss": 1.5837, + "step": 46700 + }, + { + "epoch": 1.3590482008319515, + "grad_norm": 12.6875, + "learning_rate": 5.469840921694036e-06, + "loss": 1.5158, + "step": 46720 + }, + { + "epoch": 1.3596299851644995, + "grad_norm": 12.4375, + "learning_rate": 5.467901640021421e-06, + "loss": 1.5101, + "step": 46740 + }, + { + "epoch": 1.3602117694970475, + "grad_norm": 12.75, + "learning_rate": 5.465962358348806e-06, + "loss": 1.488, + "step": 46760 + }, + { + "epoch": 1.3607935538295954, + "grad_norm": 22.5, + "learning_rate": 5.4640230766761915e-06, + "loss": 1.5384, + "step": 46780 + }, + { + "epoch": 1.3613753381621434, + "grad_norm": 13.0625, + "learning_rate": 5.4620837950035766e-06, + "loss": 1.5471, + "step": 46800 + }, + { + "epoch": 1.3619571224946911, + "grad_norm": 14.625, + "learning_rate": 5.460144513330962e-06, + "loss": 1.5273, + "step": 46820 + }, + { + "epoch": 1.362538906827239, + "grad_norm": 13.6875, + "learning_rate": 5.458205231658347e-06, + "loss": 1.5492, + "step": 46840 + }, + { + "epoch": 1.363120691159787, + "grad_norm": 11.625, + "learning_rate": 5.456265949985732e-06, + "loss": 1.4581, + "step": 46860 + }, + { + "epoch": 1.363702475492335, + "grad_norm": 14.6875, + "learning_rate": 5.454326668313117e-06, + "loss": 1.4906, + "step": 46880 + }, + { + "epoch": 1.364284259824883, + "grad_norm": 14.6875, + "learning_rate": 5.452387386640502e-06, + "loss": 1.399, + "step": 46900 + }, + { + "epoch": 1.3648660441574307, + "grad_norm": 13.0, + "learning_rate": 5.450448104967887e-06, + "loss": 1.4955, + "step": 46920 + }, + { + "epoch": 1.3654478284899787, + "grad_norm": 13.0625, + "learning_rate": 5.448508823295272e-06, + "loss": 1.5175, + "step": 46940 + }, + { + "epoch": 1.3660296128225267, + "grad_norm": 13.0, + "learning_rate": 5.446569541622657e-06, + "loss": 1.5245, + "step": 46960 + }, + { + "epoch": 1.3666113971550746, + "grad_norm": 13.0, + "learning_rate": 5.4446302599500424e-06, + "loss": 1.6267, + "step": 46980 + }, + { + "epoch": 1.3671931814876226, + "grad_norm": 12.5625, + "learning_rate": 5.4426909782774275e-06, + "loss": 1.5281, + "step": 47000 + }, + { + "epoch": 1.3677749658201703, + "grad_norm": 13.375, + "learning_rate": 5.440751696604813e-06, + "loss": 1.5513, + "step": 47020 + }, + { + "epoch": 1.3683567501527185, + "grad_norm": 14.125, + "learning_rate": 5.438812414932198e-06, + "loss": 1.483, + "step": 47040 + }, + { + "epoch": 1.3689385344852663, + "grad_norm": 14.8125, + "learning_rate": 5.436873133259583e-06, + "loss": 1.4988, + "step": 47060 + }, + { + "epoch": 1.3695203188178142, + "grad_norm": 12.0625, + "learning_rate": 5.434933851586968e-06, + "loss": 1.532, + "step": 47080 + }, + { + "epoch": 1.3701021031503622, + "grad_norm": 11.125, + "learning_rate": 5.432994569914353e-06, + "loss": 1.4573, + "step": 47100 + }, + { + "epoch": 1.3706838874829101, + "grad_norm": 13.9375, + "learning_rate": 5.431055288241738e-06, + "loss": 1.494, + "step": 47120 + }, + { + "epoch": 1.371265671815458, + "grad_norm": 15.125, + "learning_rate": 5.429116006569123e-06, + "loss": 1.4989, + "step": 47140 + }, + { + "epoch": 1.3718474561480059, + "grad_norm": 12.375, + "learning_rate": 5.427176724896508e-06, + "loss": 1.5388, + "step": 47160 + }, + { + "epoch": 1.3724292404805538, + "grad_norm": 14.625, + "learning_rate": 5.425237443223893e-06, + "loss": 1.4658, + "step": 47180 + }, + { + "epoch": 1.3730110248131018, + "grad_norm": 14.1875, + "learning_rate": 5.4232981615512785e-06, + "loss": 1.4219, + "step": 47200 + }, + { + "epoch": 1.3735928091456497, + "grad_norm": 15.875, + "learning_rate": 5.421358879878663e-06, + "loss": 1.555, + "step": 47220 + }, + { + "epoch": 1.3741745934781977, + "grad_norm": 17.25, + "learning_rate": 5.419419598206048e-06, + "loss": 1.4773, + "step": 47240 + }, + { + "epoch": 1.3747563778107454, + "grad_norm": 13.9375, + "learning_rate": 5.417480316533433e-06, + "loss": 1.5697, + "step": 47260 + }, + { + "epoch": 1.3753381621432934, + "grad_norm": 15.125, + "learning_rate": 5.415541034860818e-06, + "loss": 1.564, + "step": 47280 + }, + { + "epoch": 1.3759199464758414, + "grad_norm": 15.4375, + "learning_rate": 5.413601753188203e-06, + "loss": 1.4945, + "step": 47300 + }, + { + "epoch": 1.3765017308083893, + "grad_norm": 13.9375, + "learning_rate": 5.411662471515588e-06, + "loss": 1.4841, + "step": 47320 + }, + { + "epoch": 1.3770835151409373, + "grad_norm": 12.9375, + "learning_rate": 5.409723189842973e-06, + "loss": 1.512, + "step": 47340 + }, + { + "epoch": 1.3776652994734853, + "grad_norm": 12.125, + "learning_rate": 5.4077839081703584e-06, + "loss": 1.4335, + "step": 47360 + }, + { + "epoch": 1.3782470838060332, + "grad_norm": 11.9375, + "learning_rate": 5.4058446264977435e-06, + "loss": 1.5114, + "step": 47380 + }, + { + "epoch": 1.378828868138581, + "grad_norm": 10.9375, + "learning_rate": 5.403905344825129e-06, + "loss": 1.452, + "step": 47400 + }, + { + "epoch": 1.379410652471129, + "grad_norm": 11.375, + "learning_rate": 5.401966063152514e-06, + "loss": 1.5196, + "step": 47420 + }, + { + "epoch": 1.379992436803677, + "grad_norm": 12.25, + "learning_rate": 5.400026781479899e-06, + "loss": 1.6073, + "step": 47440 + }, + { + "epoch": 1.3805742211362249, + "grad_norm": 11.1875, + "learning_rate": 5.398087499807284e-06, + "loss": 1.5454, + "step": 47460 + }, + { + "epoch": 1.3811560054687728, + "grad_norm": 11.5625, + "learning_rate": 5.396148218134669e-06, + "loss": 1.5641, + "step": 47480 + }, + { + "epoch": 1.3817377898013206, + "grad_norm": 14.0, + "learning_rate": 5.394208936462054e-06, + "loss": 1.6132, + "step": 47500 + }, + { + "epoch": 1.3823195741338685, + "grad_norm": 13.0, + "learning_rate": 5.392269654789439e-06, + "loss": 1.4819, + "step": 47520 + }, + { + "epoch": 1.3829013584664165, + "grad_norm": 13.125, + "learning_rate": 5.390330373116824e-06, + "loss": 1.6146, + "step": 47540 + }, + { + "epoch": 1.3834831427989644, + "grad_norm": 13.875, + "learning_rate": 5.388391091444209e-06, + "loss": 1.5331, + "step": 47560 + }, + { + "epoch": 1.3840649271315124, + "grad_norm": 12.625, + "learning_rate": 5.3864518097715945e-06, + "loss": 1.6131, + "step": 47580 + }, + { + "epoch": 1.3846467114640602, + "grad_norm": 12.1875, + "learning_rate": 5.38451252809898e-06, + "loss": 1.4784, + "step": 47600 + }, + { + "epoch": 1.3852284957966081, + "grad_norm": 14.3125, + "learning_rate": 5.382573246426365e-06, + "loss": 1.5693, + "step": 47620 + }, + { + "epoch": 1.385810280129156, + "grad_norm": 13.6875, + "learning_rate": 5.38063396475375e-06, + "loss": 1.497, + "step": 47640 + }, + { + "epoch": 1.386392064461704, + "grad_norm": 12.875, + "learning_rate": 5.378694683081135e-06, + "loss": 1.4902, + "step": 47660 + }, + { + "epoch": 1.386973848794252, + "grad_norm": 13.875, + "learning_rate": 5.37675540140852e-06, + "loss": 1.4721, + "step": 47680 + }, + { + "epoch": 1.3875556331268, + "grad_norm": 13.3125, + "learning_rate": 5.374816119735905e-06, + "loss": 1.4737, + "step": 47700 + }, + { + "epoch": 1.388137417459348, + "grad_norm": 12.125, + "learning_rate": 5.37287683806329e-06, + "loss": 1.5667, + "step": 47720 + }, + { + "epoch": 1.3887192017918957, + "grad_norm": 11.9375, + "learning_rate": 5.370937556390675e-06, + "loss": 1.5925, + "step": 47740 + }, + { + "epoch": 1.3893009861244436, + "grad_norm": 15.0, + "learning_rate": 5.36899827471806e-06, + "loss": 1.5047, + "step": 47760 + }, + { + "epoch": 1.3898827704569916, + "grad_norm": 13.375, + "learning_rate": 5.3670589930454455e-06, + "loss": 1.5228, + "step": 47780 + }, + { + "epoch": 1.3904645547895396, + "grad_norm": 14.1875, + "learning_rate": 5.365119711372831e-06, + "loss": 1.4962, + "step": 47800 + }, + { + "epoch": 1.3910463391220875, + "grad_norm": 13.5, + "learning_rate": 5.363180429700216e-06, + "loss": 1.5033, + "step": 47820 + }, + { + "epoch": 1.3916281234546353, + "grad_norm": 15.0625, + "learning_rate": 5.361241148027601e-06, + "loss": 1.5003, + "step": 47840 + }, + { + "epoch": 1.3922099077871832, + "grad_norm": 12.75, + "learning_rate": 5.359301866354986e-06, + "loss": 1.54, + "step": 47860 + }, + { + "epoch": 1.3927916921197312, + "grad_norm": 12.9375, + "learning_rate": 5.357362584682371e-06, + "loss": 1.4891, + "step": 47880 + }, + { + "epoch": 1.3933734764522792, + "grad_norm": 15.125, + "learning_rate": 5.355423303009756e-06, + "loss": 1.4649, + "step": 47900 + }, + { + "epoch": 1.3939552607848271, + "grad_norm": 13.5625, + "learning_rate": 5.353484021337141e-06, + "loss": 1.5445, + "step": 47920 + }, + { + "epoch": 1.3945370451173749, + "grad_norm": 13.125, + "learning_rate": 5.351544739664526e-06, + "loss": 1.5875, + "step": 47940 + }, + { + "epoch": 1.395118829449923, + "grad_norm": 12.8125, + "learning_rate": 5.349605457991911e-06, + "loss": 1.5797, + "step": 47960 + }, + { + "epoch": 1.3957006137824708, + "grad_norm": 12.625, + "learning_rate": 5.3476661763192965e-06, + "loss": 1.5652, + "step": 47980 + }, + { + "epoch": 1.3962823981150188, + "grad_norm": 15.375, + "learning_rate": 5.3457268946466816e-06, + "loss": 1.5665, + "step": 48000 + }, + { + "epoch": 1.3968641824475667, + "grad_norm": 16.375, + "learning_rate": 5.343787612974067e-06, + "loss": 1.4943, + "step": 48020 + }, + { + "epoch": 1.3974459667801147, + "grad_norm": 11.5625, + "learning_rate": 5.341848331301451e-06, + "loss": 1.5187, + "step": 48040 + }, + { + "epoch": 1.3980277511126626, + "grad_norm": 13.125, + "learning_rate": 5.339909049628836e-06, + "loss": 1.5284, + "step": 48060 + }, + { + "epoch": 1.3986095354452104, + "grad_norm": 9.375, + "learning_rate": 5.337969767956221e-06, + "loss": 1.4807, + "step": 48080 + }, + { + "epoch": 1.3991913197777583, + "grad_norm": 13.125, + "learning_rate": 5.336030486283606e-06, + "loss": 1.4139, + "step": 48100 + }, + { + "epoch": 1.3997731041103063, + "grad_norm": 13.25, + "learning_rate": 5.334091204610991e-06, + "loss": 1.5286, + "step": 48120 + }, + { + "epoch": 1.4003548884428543, + "grad_norm": 12.0, + "learning_rate": 5.332151922938376e-06, + "loss": 1.4749, + "step": 48140 + }, + { + "epoch": 1.4009366727754022, + "grad_norm": 14.375, + "learning_rate": 5.3302126412657615e-06, + "loss": 1.5266, + "step": 48160 + }, + { + "epoch": 1.40151845710795, + "grad_norm": 12.375, + "learning_rate": 5.328273359593147e-06, + "loss": 1.5522, + "step": 48180 + }, + { + "epoch": 1.402100241440498, + "grad_norm": 15.375, + "learning_rate": 5.326334077920532e-06, + "loss": 1.502, + "step": 48200 + }, + { + "epoch": 1.402682025773046, + "grad_norm": 13.0, + "learning_rate": 5.324394796247917e-06, + "loss": 1.4469, + "step": 48220 + }, + { + "epoch": 1.4032638101055939, + "grad_norm": 12.0, + "learning_rate": 5.322455514575302e-06, + "loss": 1.6373, + "step": 48240 + }, + { + "epoch": 1.4038455944381418, + "grad_norm": 12.9375, + "learning_rate": 5.320516232902687e-06, + "loss": 1.4664, + "step": 48260 + }, + { + "epoch": 1.4044273787706896, + "grad_norm": 13.125, + "learning_rate": 5.318576951230072e-06, + "loss": 1.5461, + "step": 48280 + }, + { + "epoch": 1.4050091631032378, + "grad_norm": 15.9375, + "learning_rate": 5.316637669557457e-06, + "loss": 1.5507, + "step": 48300 + }, + { + "epoch": 1.4055909474357855, + "grad_norm": 13.375, + "learning_rate": 5.314698387884842e-06, + "loss": 1.5017, + "step": 48320 + }, + { + "epoch": 1.4061727317683335, + "grad_norm": 14.6875, + "learning_rate": 5.312759106212227e-06, + "loss": 1.5485, + "step": 48340 + }, + { + "epoch": 1.4067545161008814, + "grad_norm": 14.9375, + "learning_rate": 5.3108198245396125e-06, + "loss": 1.576, + "step": 48360 + }, + { + "epoch": 1.4073363004334294, + "grad_norm": 14.9375, + "learning_rate": 5.3088805428669976e-06, + "loss": 1.5244, + "step": 48380 + }, + { + "epoch": 1.4079180847659774, + "grad_norm": 15.6875, + "learning_rate": 5.306941261194383e-06, + "loss": 1.539, + "step": 48400 + }, + { + "epoch": 1.408499869098525, + "grad_norm": 11.875, + "learning_rate": 5.305001979521768e-06, + "loss": 1.5648, + "step": 48420 + }, + { + "epoch": 1.409081653431073, + "grad_norm": 12.75, + "learning_rate": 5.303062697849153e-06, + "loss": 1.5098, + "step": 48440 + }, + { + "epoch": 1.409663437763621, + "grad_norm": 12.125, + "learning_rate": 5.301123416176538e-06, + "loss": 1.4494, + "step": 48460 + }, + { + "epoch": 1.410245222096169, + "grad_norm": 13.0, + "learning_rate": 5.299184134503923e-06, + "loss": 1.5425, + "step": 48480 + }, + { + "epoch": 1.410827006428717, + "grad_norm": 13.75, + "learning_rate": 5.297244852831308e-06, + "loss": 1.4474, + "step": 48500 + }, + { + "epoch": 1.4114087907612647, + "grad_norm": 11.9375, + "learning_rate": 5.295305571158693e-06, + "loss": 1.4395, + "step": 48520 + }, + { + "epoch": 1.4119905750938126, + "grad_norm": 13.8125, + "learning_rate": 5.293366289486078e-06, + "loss": 1.5865, + "step": 48540 + }, + { + "epoch": 1.4125723594263606, + "grad_norm": 16.375, + "learning_rate": 5.2914270078134634e-06, + "loss": 1.4994, + "step": 48560 + }, + { + "epoch": 1.4131541437589086, + "grad_norm": 11.3125, + "learning_rate": 5.2894877261408485e-06, + "loss": 1.4271, + "step": 48580 + }, + { + "epoch": 1.4137359280914565, + "grad_norm": 11.6875, + "learning_rate": 5.287548444468234e-06, + "loss": 1.5099, + "step": 48600 + }, + { + "epoch": 1.4143177124240045, + "grad_norm": 13.5, + "learning_rate": 5.285609162795619e-06, + "loss": 1.5398, + "step": 48620 + }, + { + "epoch": 1.4148994967565525, + "grad_norm": 12.75, + "learning_rate": 5.283669881123004e-06, + "loss": 1.5669, + "step": 48640 + }, + { + "epoch": 1.4154812810891002, + "grad_norm": 12.0, + "learning_rate": 5.281730599450389e-06, + "loss": 1.5473, + "step": 48660 + }, + { + "epoch": 1.4160630654216482, + "grad_norm": 15.25, + "learning_rate": 5.279791317777774e-06, + "loss": 1.5169, + "step": 48680 + }, + { + "epoch": 1.4166448497541961, + "grad_norm": 12.25, + "learning_rate": 5.277852036105159e-06, + "loss": 1.5115, + "step": 48700 + }, + { + "epoch": 1.417226634086744, + "grad_norm": 13.75, + "learning_rate": 5.275912754432544e-06, + "loss": 1.5089, + "step": 48720 + }, + { + "epoch": 1.417808418419292, + "grad_norm": 14.875, + "learning_rate": 5.273973472759929e-06, + "loss": 1.5885, + "step": 48740 + }, + { + "epoch": 1.4183902027518398, + "grad_norm": 13.6875, + "learning_rate": 5.272034191087314e-06, + "loss": 1.5541, + "step": 48760 + }, + { + "epoch": 1.4189719870843878, + "grad_norm": 11.0, + "learning_rate": 5.2700949094146995e-06, + "loss": 1.532, + "step": 48780 + }, + { + "epoch": 1.4195537714169357, + "grad_norm": 13.625, + "learning_rate": 5.268155627742085e-06, + "loss": 1.5522, + "step": 48800 + }, + { + "epoch": 1.4201355557494837, + "grad_norm": 11.8125, + "learning_rate": 5.26621634606947e-06, + "loss": 1.4616, + "step": 48820 + }, + { + "epoch": 1.4207173400820317, + "grad_norm": 12.3125, + "learning_rate": 5.264277064396855e-06, + "loss": 1.5783, + "step": 48840 + }, + { + "epoch": 1.4212991244145794, + "grad_norm": 12.125, + "learning_rate": 5.262337782724239e-06, + "loss": 1.4606, + "step": 48860 + }, + { + "epoch": 1.4218809087471274, + "grad_norm": 13.0, + "learning_rate": 5.260398501051624e-06, + "loss": 1.5216, + "step": 48880 + }, + { + "epoch": 1.4224626930796753, + "grad_norm": 11.5, + "learning_rate": 5.258459219379009e-06, + "loss": 1.552, + "step": 48900 + }, + { + "epoch": 1.4230444774122233, + "grad_norm": 12.875, + "learning_rate": 5.256519937706394e-06, + "loss": 1.5649, + "step": 48920 + }, + { + "epoch": 1.4236262617447712, + "grad_norm": 16.125, + "learning_rate": 5.2545806560337794e-06, + "loss": 1.4982, + "step": 48940 + }, + { + "epoch": 1.4242080460773192, + "grad_norm": 12.1875, + "learning_rate": 5.2526413743611645e-06, + "loss": 1.5495, + "step": 48960 + }, + { + "epoch": 1.4247898304098672, + "grad_norm": 12.1875, + "learning_rate": 5.25070209268855e-06, + "loss": 1.5104, + "step": 48980 + }, + { + "epoch": 1.425371614742415, + "grad_norm": 11.9375, + "learning_rate": 5.248762811015935e-06, + "loss": 1.4486, + "step": 49000 + }, + { + "epoch": 1.4259533990749629, + "grad_norm": 11.625, + "learning_rate": 5.24682352934332e-06, + "loss": 1.4121, + "step": 49020 + }, + { + "epoch": 1.4265351834075108, + "grad_norm": 13.25, + "learning_rate": 5.244884247670705e-06, + "loss": 1.5312, + "step": 49040 + }, + { + "epoch": 1.4271169677400588, + "grad_norm": 13.875, + "learning_rate": 5.24294496599809e-06, + "loss": 1.4839, + "step": 49060 + }, + { + "epoch": 1.4276987520726068, + "grad_norm": 14.625, + "learning_rate": 5.241005684325475e-06, + "loss": 1.5612, + "step": 49080 + }, + { + "epoch": 1.4282805364051545, + "grad_norm": 14.8125, + "learning_rate": 5.23906640265286e-06, + "loss": 1.5692, + "step": 49100 + }, + { + "epoch": 1.4288623207377025, + "grad_norm": 15.125, + "learning_rate": 5.237127120980245e-06, + "loss": 1.5126, + "step": 49120 + }, + { + "epoch": 1.4294441050702504, + "grad_norm": 13.375, + "learning_rate": 5.23518783930763e-06, + "loss": 1.6138, + "step": 49140 + }, + { + "epoch": 1.4300258894027984, + "grad_norm": 13.0, + "learning_rate": 5.2332485576350155e-06, + "loss": 1.4812, + "step": 49160 + }, + { + "epoch": 1.4306076737353464, + "grad_norm": 13.5, + "learning_rate": 5.231309275962401e-06, + "loss": 1.5094, + "step": 49180 + }, + { + "epoch": 1.431189458067894, + "grad_norm": 13.25, + "learning_rate": 5.229369994289786e-06, + "loss": 1.5418, + "step": 49200 + }, + { + "epoch": 1.4317712424004423, + "grad_norm": 13.875, + "learning_rate": 5.227430712617171e-06, + "loss": 1.7129, + "step": 49220 + }, + { + "epoch": 1.43235302673299, + "grad_norm": 12.5625, + "learning_rate": 5.225491430944556e-06, + "loss": 1.459, + "step": 49240 + }, + { + "epoch": 1.432934811065538, + "grad_norm": 12.625, + "learning_rate": 5.223552149271941e-06, + "loss": 1.4942, + "step": 49260 + }, + { + "epoch": 1.433516595398086, + "grad_norm": 12.3125, + "learning_rate": 5.221612867599326e-06, + "loss": 1.5423, + "step": 49280 + }, + { + "epoch": 1.434098379730634, + "grad_norm": 11.75, + "learning_rate": 5.219673585926711e-06, + "loss": 1.4238, + "step": 49300 + }, + { + "epoch": 1.4346801640631819, + "grad_norm": 13.0625, + "learning_rate": 5.217734304254096e-06, + "loss": 1.4657, + "step": 49320 + }, + { + "epoch": 1.4352619483957296, + "grad_norm": 13.3125, + "learning_rate": 5.215795022581481e-06, + "loss": 1.4548, + "step": 49340 + }, + { + "epoch": 1.4358437327282776, + "grad_norm": 17.125, + "learning_rate": 5.2138557409088665e-06, + "loss": 1.5447, + "step": 49360 + }, + { + "epoch": 1.4364255170608256, + "grad_norm": 13.1875, + "learning_rate": 5.211916459236252e-06, + "loss": 1.485, + "step": 49380 + }, + { + "epoch": 1.4370073013933735, + "grad_norm": 9.8125, + "learning_rate": 5.209977177563637e-06, + "loss": 1.59, + "step": 49400 + }, + { + "epoch": 1.4375890857259215, + "grad_norm": 12.3125, + "learning_rate": 5.208037895891022e-06, + "loss": 1.4542, + "step": 49420 + }, + { + "epoch": 1.4381708700584692, + "grad_norm": 13.875, + "learning_rate": 5.206098614218407e-06, + "loss": 1.5553, + "step": 49440 + }, + { + "epoch": 1.4387526543910172, + "grad_norm": 13.0, + "learning_rate": 5.204159332545792e-06, + "loss": 1.5352, + "step": 49460 + }, + { + "epoch": 1.4393344387235651, + "grad_norm": 12.75, + "learning_rate": 5.202220050873177e-06, + "loss": 1.498, + "step": 49480 + }, + { + "epoch": 1.439916223056113, + "grad_norm": 12.0625, + "learning_rate": 5.200280769200562e-06, + "loss": 1.509, + "step": 49500 + }, + { + "epoch": 1.440498007388661, + "grad_norm": 13.125, + "learning_rate": 5.198341487527947e-06, + "loss": 1.4639, + "step": 49520 + }, + { + "epoch": 1.4410797917212088, + "grad_norm": 10.875, + "learning_rate": 5.196402205855332e-06, + "loss": 1.516, + "step": 49540 + }, + { + "epoch": 1.441661576053757, + "grad_norm": 14.75, + "learning_rate": 5.1944629241827175e-06, + "loss": 1.5425, + "step": 49560 + }, + { + "epoch": 1.4422433603863047, + "grad_norm": 12.5, + "learning_rate": 5.1925236425101026e-06, + "loss": 1.5151, + "step": 49580 + }, + { + "epoch": 1.4428251447188527, + "grad_norm": 12.125, + "learning_rate": 5.190584360837488e-06, + "loss": 1.4453, + "step": 49600 + }, + { + "epoch": 1.4434069290514007, + "grad_norm": 14.8125, + "learning_rate": 5.188645079164873e-06, + "loss": 1.5709, + "step": 49620 + }, + { + "epoch": 1.4439887133839486, + "grad_norm": 11.3125, + "learning_rate": 5.186705797492258e-06, + "loss": 1.5566, + "step": 49640 + }, + { + "epoch": 1.4445704977164966, + "grad_norm": 10.6875, + "learning_rate": 5.184766515819642e-06, + "loss": 1.5536, + "step": 49660 + }, + { + "epoch": 1.4451522820490443, + "grad_norm": 13.375, + "learning_rate": 5.182827234147027e-06, + "loss": 1.57, + "step": 49680 + }, + { + "epoch": 1.4457340663815923, + "grad_norm": 11.875, + "learning_rate": 5.180887952474412e-06, + "loss": 1.5094, + "step": 49700 + }, + { + "epoch": 1.4463158507141403, + "grad_norm": 12.625, + "learning_rate": 5.178948670801797e-06, + "loss": 1.5041, + "step": 49720 + }, + { + "epoch": 1.4468976350466882, + "grad_norm": 12.9375, + "learning_rate": 5.1770093891291825e-06, + "loss": 1.5361, + "step": 49740 + }, + { + "epoch": 1.4474794193792362, + "grad_norm": 13.4375, + "learning_rate": 5.175070107456568e-06, + "loss": 1.51, + "step": 49760 + }, + { + "epoch": 1.448061203711784, + "grad_norm": 12.8125, + "learning_rate": 5.173130825783953e-06, + "loss": 1.4887, + "step": 49780 + }, + { + "epoch": 1.448642988044332, + "grad_norm": 12.8125, + "learning_rate": 5.171191544111338e-06, + "loss": 1.5794, + "step": 49800 + }, + { + "epoch": 1.4492247723768799, + "grad_norm": 12.75, + "learning_rate": 5.169252262438723e-06, + "loss": 1.5677, + "step": 49820 + }, + { + "epoch": 1.4498065567094278, + "grad_norm": 14.0, + "learning_rate": 5.167312980766108e-06, + "loss": 1.5516, + "step": 49840 + }, + { + "epoch": 1.4503883410419758, + "grad_norm": 11.875, + "learning_rate": 5.165373699093493e-06, + "loss": 1.5398, + "step": 49860 + }, + { + "epoch": 1.4509701253745237, + "grad_norm": 12.5, + "learning_rate": 5.163434417420878e-06, + "loss": 1.5098, + "step": 49880 + }, + { + "epoch": 1.4515519097070717, + "grad_norm": 11.625, + "learning_rate": 5.161495135748263e-06, + "loss": 1.5095, + "step": 49900 + }, + { + "epoch": 1.4521336940396194, + "grad_norm": 12.75, + "learning_rate": 5.159555854075648e-06, + "loss": 1.5098, + "step": 49920 + }, + { + "epoch": 1.4527154783721674, + "grad_norm": 14.0, + "learning_rate": 5.1576165724030335e-06, + "loss": 1.4726, + "step": 49940 + }, + { + "epoch": 1.4532972627047154, + "grad_norm": 15.6875, + "learning_rate": 5.1556772907304186e-06, + "loss": 1.5539, + "step": 49960 + }, + { + "epoch": 1.4538790470372633, + "grad_norm": 13.0, + "learning_rate": 5.153738009057804e-06, + "loss": 1.5476, + "step": 49980 + }, + { + "epoch": 1.4544608313698113, + "grad_norm": 10.6875, + "learning_rate": 5.151798727385189e-06, + "loss": 1.5702, + "step": 50000 + }, + { + "epoch": 1.455042615702359, + "grad_norm": 12.9375, + "learning_rate": 5.149859445712574e-06, + "loss": 1.5263, + "step": 50020 + }, + { + "epoch": 1.455624400034907, + "grad_norm": 12.5625, + "learning_rate": 5.147920164039959e-06, + "loss": 1.5251, + "step": 50040 + }, + { + "epoch": 1.456206184367455, + "grad_norm": 11.1875, + "learning_rate": 5.145980882367344e-06, + "loss": 1.5967, + "step": 50060 + }, + { + "epoch": 1.456787968700003, + "grad_norm": 13.0625, + "learning_rate": 5.144041600694729e-06, + "loss": 1.4547, + "step": 50080 + }, + { + "epoch": 1.457369753032551, + "grad_norm": 12.875, + "learning_rate": 5.142102319022114e-06, + "loss": 1.5841, + "step": 50100 + }, + { + "epoch": 1.4579515373650986, + "grad_norm": 10.875, + "learning_rate": 5.140163037349499e-06, + "loss": 1.5134, + "step": 50120 + }, + { + "epoch": 1.4585333216976466, + "grad_norm": 17.625, + "learning_rate": 5.1382237556768844e-06, + "loss": 1.5292, + "step": 50140 + }, + { + "epoch": 1.4591151060301946, + "grad_norm": 12.0625, + "learning_rate": 5.1362844740042695e-06, + "loss": 1.486, + "step": 50160 + }, + { + "epoch": 1.4596968903627425, + "grad_norm": 17.25, + "learning_rate": 5.134345192331655e-06, + "loss": 1.5385, + "step": 50180 + }, + { + "epoch": 1.4602786746952905, + "grad_norm": 10.0, + "learning_rate": 5.13240591065904e-06, + "loss": 1.5024, + "step": 50200 + }, + { + "epoch": 1.4608604590278385, + "grad_norm": 13.75, + "learning_rate": 5.130466628986425e-06, + "loss": 1.529, + "step": 50220 + }, + { + "epoch": 1.4614422433603864, + "grad_norm": 14.875, + "learning_rate": 5.12852734731381e-06, + "loss": 1.4843, + "step": 50240 + }, + { + "epoch": 1.4620240276929342, + "grad_norm": 12.0625, + "learning_rate": 5.126588065641195e-06, + "loss": 1.506, + "step": 50260 + }, + { + "epoch": 1.4626058120254821, + "grad_norm": 12.6875, + "learning_rate": 5.12464878396858e-06, + "loss": 1.4215, + "step": 50280 + }, + { + "epoch": 1.46318759635803, + "grad_norm": 12.375, + "learning_rate": 5.1227095022959635e-06, + "loss": 1.4734, + "step": 50300 + }, + { + "epoch": 1.463769380690578, + "grad_norm": 15.5, + "learning_rate": 5.120770220623349e-06, + "loss": 1.6313, + "step": 50320 + }, + { + "epoch": 1.464351165023126, + "grad_norm": 13.4375, + "learning_rate": 5.118830938950734e-06, + "loss": 1.4654, + "step": 50340 + }, + { + "epoch": 1.4649329493556738, + "grad_norm": 14.0, + "learning_rate": 5.116891657278119e-06, + "loss": 1.471, + "step": 50360 + }, + { + "epoch": 1.4655147336882217, + "grad_norm": 11.0, + "learning_rate": 5.114952375605504e-06, + "loss": 1.5427, + "step": 50380 + }, + { + "epoch": 1.4660965180207697, + "grad_norm": 14.625, + "learning_rate": 5.113013093932889e-06, + "loss": 1.572, + "step": 50400 + }, + { + "epoch": 1.4666783023533176, + "grad_norm": 16.75, + "learning_rate": 5.111073812260274e-06, + "loss": 1.5277, + "step": 50420 + }, + { + "epoch": 1.4672600866858656, + "grad_norm": 9.6875, + "learning_rate": 5.109134530587659e-06, + "loss": 1.471, + "step": 50440 + }, + { + "epoch": 1.4678418710184133, + "grad_norm": 12.8125, + "learning_rate": 5.107195248915044e-06, + "loss": 1.5825, + "step": 50460 + }, + { + "epoch": 1.4684236553509615, + "grad_norm": 14.0, + "learning_rate": 5.105255967242429e-06, + "loss": 1.5129, + "step": 50480 + }, + { + "epoch": 1.4690054396835093, + "grad_norm": 12.9375, + "learning_rate": 5.1033166855698145e-06, + "loss": 1.4023, + "step": 50500 + }, + { + "epoch": 1.4695872240160572, + "grad_norm": 13.375, + "learning_rate": 5.1013774038972e-06, + "loss": 1.5123, + "step": 50520 + }, + { + "epoch": 1.4701690083486052, + "grad_norm": 11.6875, + "learning_rate": 5.099438122224585e-06, + "loss": 1.5481, + "step": 50540 + }, + { + "epoch": 1.4707507926811532, + "grad_norm": 16.125, + "learning_rate": 5.09749884055197e-06, + "loss": 1.4437, + "step": 50560 + }, + { + "epoch": 1.4713325770137011, + "grad_norm": 12.25, + "learning_rate": 5.095559558879355e-06, + "loss": 1.4811, + "step": 50580 + }, + { + "epoch": 1.4719143613462489, + "grad_norm": 13.6875, + "learning_rate": 5.09362027720674e-06, + "loss": 1.5479, + "step": 50600 + }, + { + "epoch": 1.4724961456787968, + "grad_norm": 14.375, + "learning_rate": 5.091680995534125e-06, + "loss": 1.5808, + "step": 50620 + }, + { + "epoch": 1.4730779300113448, + "grad_norm": 13.0625, + "learning_rate": 5.08974171386151e-06, + "loss": 1.6277, + "step": 50640 + }, + { + "epoch": 1.4736597143438928, + "grad_norm": 14.6875, + "learning_rate": 5.087802432188895e-06, + "loss": 1.5107, + "step": 50660 + }, + { + "epoch": 1.4742414986764407, + "grad_norm": 14.5625, + "learning_rate": 5.08586315051628e-06, + "loss": 1.595, + "step": 50680 + }, + { + "epoch": 1.4748232830089885, + "grad_norm": 13.9375, + "learning_rate": 5.0839238688436655e-06, + "loss": 1.5751, + "step": 50700 + }, + { + "epoch": 1.4754050673415364, + "grad_norm": 13.125, + "learning_rate": 5.0819845871710506e-06, + "loss": 1.4475, + "step": 50720 + }, + { + "epoch": 1.4759868516740844, + "grad_norm": 13.5625, + "learning_rate": 5.080045305498436e-06, + "loss": 1.5289, + "step": 50740 + }, + { + "epoch": 1.4765686360066324, + "grad_norm": 15.9375, + "learning_rate": 5.078106023825821e-06, + "loss": 1.497, + "step": 50760 + }, + { + "epoch": 1.4771504203391803, + "grad_norm": 12.5625, + "learning_rate": 5.076166742153206e-06, + "loss": 1.4242, + "step": 50780 + }, + { + "epoch": 1.477732204671728, + "grad_norm": 12.4375, + "learning_rate": 5.074227460480591e-06, + "loss": 1.5828, + "step": 50800 + }, + { + "epoch": 1.4783139890042762, + "grad_norm": 12.125, + "learning_rate": 5.072288178807976e-06, + "loss": 1.4836, + "step": 50820 + }, + { + "epoch": 1.478895773336824, + "grad_norm": 14.0, + "learning_rate": 5.070348897135361e-06, + "loss": 1.5094, + "step": 50840 + }, + { + "epoch": 1.479477557669372, + "grad_norm": 13.5, + "learning_rate": 5.068409615462746e-06, + "loss": 1.5181, + "step": 50860 + }, + { + "epoch": 1.48005934200192, + "grad_norm": 12.625, + "learning_rate": 5.066470333790131e-06, + "loss": 1.4571, + "step": 50880 + }, + { + "epoch": 1.4806411263344679, + "grad_norm": 13.625, + "learning_rate": 5.0645310521175165e-06, + "loss": 1.4324, + "step": 50900 + }, + { + "epoch": 1.4812229106670158, + "grad_norm": 11.9375, + "learning_rate": 5.0625917704449015e-06, + "loss": 1.576, + "step": 50920 + }, + { + "epoch": 1.4818046949995636, + "grad_norm": 15.125, + "learning_rate": 5.060652488772287e-06, + "loss": 1.5361, + "step": 50940 + }, + { + "epoch": 1.4823864793321115, + "grad_norm": 11.8125, + "learning_rate": 5.058713207099672e-06, + "loss": 1.5862, + "step": 50960 + }, + { + "epoch": 1.4829682636646595, + "grad_norm": 13.0625, + "learning_rate": 5.056773925427057e-06, + "loss": 1.5856, + "step": 50980 + }, + { + "epoch": 1.4835500479972075, + "grad_norm": 15.5, + "learning_rate": 5.054834643754442e-06, + "loss": 1.5927, + "step": 51000 + }, + { + "epoch": 1.4841318323297554, + "grad_norm": 12.6875, + "learning_rate": 5.052895362081827e-06, + "loss": 1.5097, + "step": 51020 + }, + { + "epoch": 1.4847136166623032, + "grad_norm": 14.375, + "learning_rate": 5.050956080409211e-06, + "loss": 1.4944, + "step": 51040 + }, + { + "epoch": 1.4852954009948511, + "grad_norm": 12.5625, + "learning_rate": 5.049016798736596e-06, + "loss": 1.572, + "step": 51060 + }, + { + "epoch": 1.485877185327399, + "grad_norm": 16.25, + "learning_rate": 5.0470775170639815e-06, + "loss": 1.5845, + "step": 51080 + }, + { + "epoch": 1.486458969659947, + "grad_norm": 13.625, + "learning_rate": 5.045138235391367e-06, + "loss": 1.5383, + "step": 51100 + }, + { + "epoch": 1.487040753992495, + "grad_norm": 16.0, + "learning_rate": 5.043198953718752e-06, + "loss": 1.5037, + "step": 51120 + }, + { + "epoch": 1.487622538325043, + "grad_norm": 14.3125, + "learning_rate": 5.041259672046137e-06, + "loss": 1.4387, + "step": 51140 + }, + { + "epoch": 1.488204322657591, + "grad_norm": 13.4375, + "learning_rate": 5.039320390373522e-06, + "loss": 1.4998, + "step": 51160 + }, + { + "epoch": 1.4887861069901387, + "grad_norm": 20.75, + "learning_rate": 5.037381108700907e-06, + "loss": 1.5381, + "step": 51180 + }, + { + "epoch": 1.4893678913226867, + "grad_norm": 10.8125, + "learning_rate": 5.035441827028292e-06, + "loss": 1.5232, + "step": 51200 + }, + { + "epoch": 1.4899496756552346, + "grad_norm": 13.875, + "learning_rate": 5.033502545355677e-06, + "loss": 1.5357, + "step": 51220 + }, + { + "epoch": 1.4905314599877826, + "grad_norm": 13.375, + "learning_rate": 5.031563263683062e-06, + "loss": 1.4933, + "step": 51240 + }, + { + "epoch": 1.4911132443203305, + "grad_norm": 13.375, + "learning_rate": 5.029623982010447e-06, + "loss": 1.4801, + "step": 51260 + }, + { + "epoch": 1.4916950286528783, + "grad_norm": 15.25, + "learning_rate": 5.0276847003378325e-06, + "loss": 1.5051, + "step": 51280 + }, + { + "epoch": 1.4922768129854262, + "grad_norm": 17.375, + "learning_rate": 5.0257454186652176e-06, + "loss": 1.4838, + "step": 51300 + }, + { + "epoch": 1.4928585973179742, + "grad_norm": 12.625, + "learning_rate": 5.023806136992603e-06, + "loss": 1.5299, + "step": 51320 + }, + { + "epoch": 1.4934403816505222, + "grad_norm": 11.75, + "learning_rate": 5.021866855319988e-06, + "loss": 1.4594, + "step": 51340 + }, + { + "epoch": 1.4940221659830701, + "grad_norm": 14.3125, + "learning_rate": 5.019927573647373e-06, + "loss": 1.5199, + "step": 51360 + }, + { + "epoch": 1.4946039503156179, + "grad_norm": 13.0, + "learning_rate": 5.017988291974758e-06, + "loss": 1.5827, + "step": 51380 + }, + { + "epoch": 1.4951857346481658, + "grad_norm": 11.3125, + "learning_rate": 5.016049010302143e-06, + "loss": 1.5119, + "step": 51400 + }, + { + "epoch": 1.4957675189807138, + "grad_norm": 14.4375, + "learning_rate": 5.014109728629528e-06, + "loss": 1.5574, + "step": 51420 + }, + { + "epoch": 1.4963493033132618, + "grad_norm": 11.1875, + "learning_rate": 5.012170446956913e-06, + "loss": 1.4693, + "step": 51440 + }, + { + "epoch": 1.4969310876458097, + "grad_norm": 13.8125, + "learning_rate": 5.010231165284298e-06, + "loss": 1.6051, + "step": 51460 + }, + { + "epoch": 1.4975128719783577, + "grad_norm": 13.3125, + "learning_rate": 5.0082918836116834e-06, + "loss": 1.5523, + "step": 51480 + }, + { + "epoch": 1.4980946563109057, + "grad_norm": 13.125, + "learning_rate": 5.0063526019390685e-06, + "loss": 1.5153, + "step": 51500 + }, + { + "epoch": 1.4986764406434534, + "grad_norm": 14.0625, + "learning_rate": 5.004413320266454e-06, + "loss": 1.492, + "step": 51520 + }, + { + "epoch": 1.4992582249760014, + "grad_norm": 11.5, + "learning_rate": 5.002474038593839e-06, + "loss": 1.5005, + "step": 51540 + }, + { + "epoch": 1.4998400093085493, + "grad_norm": 13.9375, + "learning_rate": 5.000534756921224e-06, + "loss": 1.6143, + "step": 51560 + }, + { + "epoch": 1.5004217936410973, + "grad_norm": 15.9375, + "learning_rate": 4.998595475248609e-06, + "loss": 1.5416, + "step": 51580 + }, + { + "epoch": 1.5010035779736453, + "grad_norm": 14.75, + "learning_rate": 4.996656193575994e-06, + "loss": 1.5004, + "step": 51600 + }, + { + "epoch": 1.501585362306193, + "grad_norm": 14.875, + "learning_rate": 4.994716911903379e-06, + "loss": 1.4522, + "step": 51620 + }, + { + "epoch": 1.5021671466387412, + "grad_norm": 13.0, + "learning_rate": 4.992777630230764e-06, + "loss": 1.5349, + "step": 51640 + }, + { + "epoch": 1.502748930971289, + "grad_norm": 14.3125, + "learning_rate": 4.990838348558149e-06, + "loss": 1.4802, + "step": 51660 + }, + { + "epoch": 1.5033307153038369, + "grad_norm": 11.0, + "learning_rate": 4.988899066885534e-06, + "loss": 1.6301, + "step": 51680 + }, + { + "epoch": 1.5039124996363848, + "grad_norm": 11.25, + "learning_rate": 4.9869597852129195e-06, + "loss": 1.49, + "step": 51700 + }, + { + "epoch": 1.5044942839689326, + "grad_norm": 14.0625, + "learning_rate": 4.985020503540305e-06, + "loss": 1.5538, + "step": 51720 + }, + { + "epoch": 1.5050760683014808, + "grad_norm": 12.3125, + "learning_rate": 4.98308122186769e-06, + "loss": 1.5761, + "step": 51740 + }, + { + "epoch": 1.5056578526340285, + "grad_norm": 14.125, + "learning_rate": 4.981141940195075e-06, + "loss": 1.4782, + "step": 51760 + }, + { + "epoch": 1.5062396369665765, + "grad_norm": 15.1875, + "learning_rate": 4.97920265852246e-06, + "loss": 1.4338, + "step": 51780 + }, + { + "epoch": 1.5068214212991244, + "grad_norm": 14.6875, + "learning_rate": 4.977263376849845e-06, + "loss": 1.5249, + "step": 51800 + }, + { + "epoch": 1.5074032056316722, + "grad_norm": 13.3125, + "learning_rate": 4.97532409517723e-06, + "loss": 1.5055, + "step": 51820 + }, + { + "epoch": 1.5079849899642204, + "grad_norm": 13.5625, + "learning_rate": 4.973384813504615e-06, + "loss": 1.5288, + "step": 51840 + }, + { + "epoch": 1.508566774296768, + "grad_norm": 11.9375, + "learning_rate": 4.9714455318319994e-06, + "loss": 1.5375, + "step": 51860 + }, + { + "epoch": 1.509148558629316, + "grad_norm": 14.75, + "learning_rate": 4.9695062501593845e-06, + "loss": 1.4812, + "step": 51880 + }, + { + "epoch": 1.509730342961864, + "grad_norm": 12.8125, + "learning_rate": 4.96756696848677e-06, + "loss": 1.5579, + "step": 51900 + }, + { + "epoch": 1.510312127294412, + "grad_norm": 13.125, + "learning_rate": 4.965627686814155e-06, + "loss": 1.6508, + "step": 51920 + }, + { + "epoch": 1.51089391162696, + "grad_norm": 14.375, + "learning_rate": 4.96368840514154e-06, + "loss": 1.469, + "step": 51940 + }, + { + "epoch": 1.5114756959595077, + "grad_norm": 15.4375, + "learning_rate": 4.961749123468925e-06, + "loss": 1.6221, + "step": 51960 + }, + { + "epoch": 1.5120574802920559, + "grad_norm": 14.0625, + "learning_rate": 4.95980984179631e-06, + "loss": 1.5103, + "step": 51980 + }, + { + "epoch": 1.5126392646246036, + "grad_norm": 12.5625, + "learning_rate": 4.957870560123695e-06, + "loss": 1.5693, + "step": 52000 + }, + { + "epoch": 1.5132210489571516, + "grad_norm": 14.6875, + "learning_rate": 4.95593127845108e-06, + "loss": 1.4608, + "step": 52020 + }, + { + "epoch": 1.5138028332896996, + "grad_norm": 13.375, + "learning_rate": 4.953991996778465e-06, + "loss": 1.533, + "step": 52040 + }, + { + "epoch": 1.5143846176222473, + "grad_norm": 13.75, + "learning_rate": 4.95205271510585e-06, + "loss": 1.5669, + "step": 52060 + }, + { + "epoch": 1.5149664019547955, + "grad_norm": 15.8125, + "learning_rate": 4.9501134334332355e-06, + "loss": 1.519, + "step": 52080 + }, + { + "epoch": 1.5155481862873432, + "grad_norm": 12.375, + "learning_rate": 4.948174151760621e-06, + "loss": 1.5564, + "step": 52100 + }, + { + "epoch": 1.5161299706198912, + "grad_norm": 12.6875, + "learning_rate": 4.946234870088006e-06, + "loss": 1.5168, + "step": 52120 + }, + { + "epoch": 1.5167117549524392, + "grad_norm": 12.25, + "learning_rate": 4.944295588415391e-06, + "loss": 1.593, + "step": 52140 + }, + { + "epoch": 1.517293539284987, + "grad_norm": 12.6875, + "learning_rate": 4.942356306742776e-06, + "loss": 1.6088, + "step": 52160 + }, + { + "epoch": 1.517875323617535, + "grad_norm": 12.125, + "learning_rate": 4.940417025070161e-06, + "loss": 1.4852, + "step": 52180 + }, + { + "epoch": 1.5184571079500828, + "grad_norm": 11.25, + "learning_rate": 4.938477743397546e-06, + "loss": 1.5165, + "step": 52200 + }, + { + "epoch": 1.5190388922826308, + "grad_norm": 13.375, + "learning_rate": 4.936538461724931e-06, + "loss": 1.5314, + "step": 52220 + }, + { + "epoch": 1.5196206766151787, + "grad_norm": 14.125, + "learning_rate": 4.934599180052316e-06, + "loss": 1.5572, + "step": 52240 + }, + { + "epoch": 1.5202024609477267, + "grad_norm": 12.6875, + "learning_rate": 4.932659898379701e-06, + "loss": 1.5518, + "step": 52260 + }, + { + "epoch": 1.5207842452802747, + "grad_norm": 11.1875, + "learning_rate": 4.9307206167070865e-06, + "loss": 1.5268, + "step": 52280 + }, + { + "epoch": 1.5213660296128224, + "grad_norm": 16.625, + "learning_rate": 4.928781335034472e-06, + "loss": 1.5436, + "step": 52300 + }, + { + "epoch": 1.5219478139453706, + "grad_norm": 13.9375, + "learning_rate": 4.926842053361857e-06, + "loss": 1.559, + "step": 52320 + }, + { + "epoch": 1.5225295982779183, + "grad_norm": 11.625, + "learning_rate": 4.924902771689242e-06, + "loss": 1.5251, + "step": 52340 + }, + { + "epoch": 1.5231113826104663, + "grad_norm": 12.4375, + "learning_rate": 4.922963490016627e-06, + "loss": 1.6352, + "step": 52360 + }, + { + "epoch": 1.5236931669430143, + "grad_norm": 14.875, + "learning_rate": 4.921024208344012e-06, + "loss": 1.4764, + "step": 52380 + }, + { + "epoch": 1.524274951275562, + "grad_norm": 13.1875, + "learning_rate": 4.919084926671397e-06, + "loss": 1.5555, + "step": 52400 + }, + { + "epoch": 1.5248567356081102, + "grad_norm": 12.25, + "learning_rate": 4.917145644998782e-06, + "loss": 1.5648, + "step": 52420 + }, + { + "epoch": 1.525438519940658, + "grad_norm": 13.0625, + "learning_rate": 4.915206363326167e-06, + "loss": 1.4756, + "step": 52440 + }, + { + "epoch": 1.526020304273206, + "grad_norm": 11.0, + "learning_rate": 4.913267081653552e-06, + "loss": 1.5144, + "step": 52460 + }, + { + "epoch": 1.5266020886057539, + "grad_norm": 13.375, + "learning_rate": 4.9113277999809375e-06, + "loss": 1.47, + "step": 52480 + }, + { + "epoch": 1.5271838729383018, + "grad_norm": 12.125, + "learning_rate": 4.9093885183083226e-06, + "loss": 1.4436, + "step": 52500 + }, + { + "epoch": 1.5277656572708498, + "grad_norm": 12.0625, + "learning_rate": 4.907449236635708e-06, + "loss": 1.5564, + "step": 52520 + }, + { + "epoch": 1.5283474416033975, + "grad_norm": 13.5625, + "learning_rate": 4.905509954963093e-06, + "loss": 1.4803, + "step": 52540 + }, + { + "epoch": 1.5289292259359455, + "grad_norm": 11.9375, + "learning_rate": 4.903570673290478e-06, + "loss": 1.4645, + "step": 52560 + }, + { + "epoch": 1.5295110102684935, + "grad_norm": 13.25, + "learning_rate": 4.901631391617863e-06, + "loss": 1.5036, + "step": 52580 + }, + { + "epoch": 1.5300927946010414, + "grad_norm": 13.5625, + "learning_rate": 4.899692109945248e-06, + "loss": 1.5826, + "step": 52600 + }, + { + "epoch": 1.5306745789335894, + "grad_norm": 13.4375, + "learning_rate": 4.897752828272633e-06, + "loss": 1.5535, + "step": 52620 + }, + { + "epoch": 1.5312563632661371, + "grad_norm": 13.0, + "learning_rate": 4.895813546600018e-06, + "loss": 1.5745, + "step": 52640 + }, + { + "epoch": 1.5318381475986853, + "grad_norm": 12.1875, + "learning_rate": 4.893874264927403e-06, + "loss": 1.5237, + "step": 52660 + }, + { + "epoch": 1.532419931931233, + "grad_norm": 12.375, + "learning_rate": 4.891934983254788e-06, + "loss": 1.505, + "step": 52680 + }, + { + "epoch": 1.533001716263781, + "grad_norm": 11.9375, + "learning_rate": 4.889995701582173e-06, + "loss": 1.486, + "step": 52700 + }, + { + "epoch": 1.533583500596329, + "grad_norm": 13.25, + "learning_rate": 4.888056419909558e-06, + "loss": 1.5246, + "step": 52720 + }, + { + "epoch": 1.5341652849288767, + "grad_norm": 13.5625, + "learning_rate": 4.886117138236943e-06, + "loss": 1.5384, + "step": 52740 + }, + { + "epoch": 1.534747069261425, + "grad_norm": 14.5, + "learning_rate": 4.884177856564328e-06, + "loss": 1.5441, + "step": 52760 + }, + { + "epoch": 1.5353288535939726, + "grad_norm": 13.25, + "learning_rate": 4.882238574891713e-06, + "loss": 1.5209, + "step": 52780 + }, + { + "epoch": 1.5359106379265206, + "grad_norm": 11.4375, + "learning_rate": 4.880299293219098e-06, + "loss": 1.5664, + "step": 52800 + }, + { + "epoch": 1.5364924222590686, + "grad_norm": 12.6875, + "learning_rate": 4.878360011546483e-06, + "loss": 1.459, + "step": 52820 + }, + { + "epoch": 1.5370742065916165, + "grad_norm": 15.6875, + "learning_rate": 4.876420729873868e-06, + "loss": 1.5468, + "step": 52840 + }, + { + "epoch": 1.5376559909241645, + "grad_norm": 15.1875, + "learning_rate": 4.8744814482012535e-06, + "loss": 1.5809, + "step": 52860 + }, + { + "epoch": 1.5382377752567122, + "grad_norm": 11.625, + "learning_rate": 4.8725421665286386e-06, + "loss": 1.5441, + "step": 52880 + }, + { + "epoch": 1.5388195595892604, + "grad_norm": 17.25, + "learning_rate": 4.870602884856024e-06, + "loss": 1.588, + "step": 52900 + }, + { + "epoch": 1.5394013439218082, + "grad_norm": 13.3125, + "learning_rate": 4.868663603183409e-06, + "loss": 1.4962, + "step": 52920 + }, + { + "epoch": 1.5399831282543561, + "grad_norm": 14.0625, + "learning_rate": 4.866724321510794e-06, + "loss": 1.494, + "step": 52940 + }, + { + "epoch": 1.540564912586904, + "grad_norm": 14.1875, + "learning_rate": 4.864785039838179e-06, + "loss": 1.5254, + "step": 52960 + }, + { + "epoch": 1.5411466969194518, + "grad_norm": 15.9375, + "learning_rate": 4.862845758165564e-06, + "loss": 1.5036, + "step": 52980 + }, + { + "epoch": 1.541728481252, + "grad_norm": 13.125, + "learning_rate": 4.860906476492949e-06, + "loss": 1.4845, + "step": 53000 + }, + { + "epoch": 1.5423102655845478, + "grad_norm": 12.625, + "learning_rate": 4.858967194820334e-06, + "loss": 1.5102, + "step": 53020 + }, + { + "epoch": 1.5428920499170957, + "grad_norm": 10.5625, + "learning_rate": 4.857027913147719e-06, + "loss": 1.5368, + "step": 53040 + }, + { + "epoch": 1.5434738342496437, + "grad_norm": 13.875, + "learning_rate": 4.8550886314751044e-06, + "loss": 1.5532, + "step": 53060 + }, + { + "epoch": 1.5440556185821914, + "grad_norm": 13.75, + "learning_rate": 4.8531493498024895e-06, + "loss": 1.5391, + "step": 53080 + }, + { + "epoch": 1.5446374029147396, + "grad_norm": 14.0, + "learning_rate": 4.851210068129875e-06, + "loss": 1.5926, + "step": 53100 + }, + { + "epoch": 1.5452191872472874, + "grad_norm": 12.5625, + "learning_rate": 4.84927078645726e-06, + "loss": 1.528, + "step": 53120 + }, + { + "epoch": 1.5458009715798353, + "grad_norm": 15.0, + "learning_rate": 4.847331504784645e-06, + "loss": 1.481, + "step": 53140 + }, + { + "epoch": 1.5463827559123833, + "grad_norm": 14.5625, + "learning_rate": 4.84539222311203e-06, + "loss": 1.5169, + "step": 53160 + }, + { + "epoch": 1.5469645402449312, + "grad_norm": 9.125, + "learning_rate": 4.843452941439415e-06, + "loss": 1.537, + "step": 53180 + }, + { + "epoch": 1.5475463245774792, + "grad_norm": 12.0625, + "learning_rate": 4.8415136597668e-06, + "loss": 1.4713, + "step": 53200 + }, + { + "epoch": 1.548128108910027, + "grad_norm": 15.375, + "learning_rate": 4.839574378094185e-06, + "loss": 1.5724, + "step": 53220 + }, + { + "epoch": 1.5487098932425751, + "grad_norm": 11.25, + "learning_rate": 4.83763509642157e-06, + "loss": 1.5441, + "step": 53240 + }, + { + "epoch": 1.5492916775751229, + "grad_norm": 16.5, + "learning_rate": 4.835695814748955e-06, + "loss": 1.5224, + "step": 53260 + }, + { + "epoch": 1.5498734619076708, + "grad_norm": 11.9375, + "learning_rate": 4.8337565330763405e-06, + "loss": 1.4652, + "step": 53280 + }, + { + "epoch": 1.5504552462402188, + "grad_norm": 13.125, + "learning_rate": 4.831817251403726e-06, + "loss": 1.5336, + "step": 53300 + }, + { + "epoch": 1.5510370305727665, + "grad_norm": 10.375, + "learning_rate": 4.829877969731111e-06, + "loss": 1.5128, + "step": 53320 + }, + { + "epoch": 1.5516188149053147, + "grad_norm": 12.8125, + "learning_rate": 4.827938688058496e-06, + "loss": 1.4405, + "step": 53340 + }, + { + "epoch": 1.5522005992378625, + "grad_norm": 11.5, + "learning_rate": 4.825999406385881e-06, + "loss": 1.581, + "step": 53360 + }, + { + "epoch": 1.5527823835704104, + "grad_norm": 12.625, + "learning_rate": 4.824060124713266e-06, + "loss": 1.5412, + "step": 53380 + }, + { + "epoch": 1.5533641679029584, + "grad_norm": 13.6875, + "learning_rate": 4.822120843040651e-06, + "loss": 1.4463, + "step": 53400 + }, + { + "epoch": 1.5539459522355061, + "grad_norm": 13.3125, + "learning_rate": 4.820181561368036e-06, + "loss": 1.5782, + "step": 53420 + }, + { + "epoch": 1.5545277365680543, + "grad_norm": 16.75, + "learning_rate": 4.818242279695421e-06, + "loss": 1.5875, + "step": 53440 + }, + { + "epoch": 1.555109520900602, + "grad_norm": 12.0625, + "learning_rate": 4.816302998022806e-06, + "loss": 1.5127, + "step": 53460 + }, + { + "epoch": 1.55569130523315, + "grad_norm": 10.5625, + "learning_rate": 4.814363716350191e-06, + "loss": 1.5307, + "step": 53480 + }, + { + "epoch": 1.556273089565698, + "grad_norm": 14.625, + "learning_rate": 4.812424434677576e-06, + "loss": 1.4338, + "step": 53500 + }, + { + "epoch": 1.556854873898246, + "grad_norm": 14.5625, + "learning_rate": 4.810485153004961e-06, + "loss": 1.5595, + "step": 53520 + }, + { + "epoch": 1.557436658230794, + "grad_norm": 12.75, + "learning_rate": 4.808545871332346e-06, + "loss": 1.6282, + "step": 53540 + }, + { + "epoch": 1.5580184425633417, + "grad_norm": 13.8125, + "learning_rate": 4.806606589659731e-06, + "loss": 1.4419, + "step": 53560 + }, + { + "epoch": 1.5586002268958898, + "grad_norm": 13.3125, + "learning_rate": 4.804667307987116e-06, + "loss": 1.4877, + "step": 53580 + }, + { + "epoch": 1.5591820112284376, + "grad_norm": 10.75, + "learning_rate": 4.802728026314501e-06, + "loss": 1.4213, + "step": 53600 + }, + { + "epoch": 1.5597637955609855, + "grad_norm": 13.5625, + "learning_rate": 4.800788744641886e-06, + "loss": 1.5356, + "step": 53620 + }, + { + "epoch": 1.5603455798935335, + "grad_norm": 13.125, + "learning_rate": 4.798849462969271e-06, + "loss": 1.4539, + "step": 53640 + }, + { + "epoch": 1.5609273642260812, + "grad_norm": 12.6875, + "learning_rate": 4.7969101812966565e-06, + "loss": 1.4881, + "step": 53660 + }, + { + "epoch": 1.5615091485586294, + "grad_norm": 13.375, + "learning_rate": 4.794970899624042e-06, + "loss": 1.4572, + "step": 53680 + }, + { + "epoch": 1.5620909328911772, + "grad_norm": 14.5625, + "learning_rate": 4.793031617951427e-06, + "loss": 1.4704, + "step": 53700 + }, + { + "epoch": 1.5626727172237251, + "grad_norm": 12.5625, + "learning_rate": 4.791092336278812e-06, + "loss": 1.5089, + "step": 53720 + }, + { + "epoch": 1.563254501556273, + "grad_norm": 14.5625, + "learning_rate": 4.789153054606197e-06, + "loss": 1.4816, + "step": 53740 + }, + { + "epoch": 1.563836285888821, + "grad_norm": 12.4375, + "learning_rate": 4.787213772933582e-06, + "loss": 1.5167, + "step": 53760 + }, + { + "epoch": 1.564418070221369, + "grad_norm": 13.1875, + "learning_rate": 4.785274491260966e-06, + "loss": 1.5706, + "step": 53780 + }, + { + "epoch": 1.5649998545539168, + "grad_norm": 13.4375, + "learning_rate": 4.783335209588351e-06, + "loss": 1.4895, + "step": 53800 + }, + { + "epoch": 1.5655816388864647, + "grad_norm": 14.0, + "learning_rate": 4.7813959279157364e-06, + "loss": 1.5177, + "step": 53820 + }, + { + "epoch": 1.5661634232190127, + "grad_norm": 13.125, + "learning_rate": 4.7794566462431215e-06, + "loss": 1.5508, + "step": 53840 + }, + { + "epoch": 1.5667452075515607, + "grad_norm": 13.875, + "learning_rate": 4.777517364570507e-06, + "loss": 1.6271, + "step": 53860 + }, + { + "epoch": 1.5673269918841086, + "grad_norm": 12.375, + "learning_rate": 4.775578082897892e-06, + "loss": 1.5082, + "step": 53880 + }, + { + "epoch": 1.5679087762166564, + "grad_norm": 12.8125, + "learning_rate": 4.773638801225277e-06, + "loss": 1.5444, + "step": 53900 + }, + { + "epoch": 1.5684905605492045, + "grad_norm": 12.4375, + "learning_rate": 4.771699519552662e-06, + "loss": 1.5745, + "step": 53920 + }, + { + "epoch": 1.5690723448817523, + "grad_norm": 13.25, + "learning_rate": 4.769760237880047e-06, + "loss": 1.5928, + "step": 53940 + }, + { + "epoch": 1.5696541292143003, + "grad_norm": 12.875, + "learning_rate": 4.767820956207432e-06, + "loss": 1.5923, + "step": 53960 + }, + { + "epoch": 1.5702359135468482, + "grad_norm": 11.8125, + "learning_rate": 4.765881674534817e-06, + "loss": 1.5671, + "step": 53980 + }, + { + "epoch": 1.570817697879396, + "grad_norm": 15.1875, + "learning_rate": 4.763942392862202e-06, + "loss": 1.5224, + "step": 54000 + }, + { + "epoch": 1.5713994822119441, + "grad_norm": 14.25, + "learning_rate": 4.762003111189587e-06, + "loss": 1.4886, + "step": 54020 + }, + { + "epoch": 1.5719812665444919, + "grad_norm": 11.8125, + "learning_rate": 4.7600638295169725e-06, + "loss": 1.4678, + "step": 54040 + }, + { + "epoch": 1.5725630508770398, + "grad_norm": 13.75, + "learning_rate": 4.758124547844358e-06, + "loss": 1.4967, + "step": 54060 + }, + { + "epoch": 1.5731448352095878, + "grad_norm": 13.375, + "learning_rate": 4.756185266171743e-06, + "loss": 1.5169, + "step": 54080 + }, + { + "epoch": 1.5737266195421358, + "grad_norm": 12.6875, + "learning_rate": 4.754245984499128e-06, + "loss": 1.4806, + "step": 54100 + }, + { + "epoch": 1.5743084038746837, + "grad_norm": 14.5, + "learning_rate": 4.752306702826513e-06, + "loss": 1.5104, + "step": 54120 + }, + { + "epoch": 1.5748901882072315, + "grad_norm": 11.6875, + "learning_rate": 4.750367421153898e-06, + "loss": 1.5267, + "step": 54140 + }, + { + "epoch": 1.5754719725397797, + "grad_norm": 10.8125, + "learning_rate": 4.748428139481283e-06, + "loss": 1.4444, + "step": 54160 + }, + { + "epoch": 1.5760537568723274, + "grad_norm": 14.75, + "learning_rate": 4.746488857808668e-06, + "loss": 1.5271, + "step": 54180 + }, + { + "epoch": 1.5766355412048754, + "grad_norm": 14.0, + "learning_rate": 4.744549576136053e-06, + "loss": 1.4962, + "step": 54200 + }, + { + "epoch": 1.5772173255374233, + "grad_norm": 12.6875, + "learning_rate": 4.742610294463438e-06, + "loss": 1.5658, + "step": 54220 + }, + { + "epoch": 1.577799109869971, + "grad_norm": 11.3125, + "learning_rate": 4.7406710127908235e-06, + "loss": 1.5937, + "step": 54240 + }, + { + "epoch": 1.5783808942025193, + "grad_norm": 13.5625, + "learning_rate": 4.738731731118209e-06, + "loss": 1.52, + "step": 54260 + }, + { + "epoch": 1.578962678535067, + "grad_norm": 15.1875, + "learning_rate": 4.736792449445594e-06, + "loss": 1.4848, + "step": 54280 + }, + { + "epoch": 1.579544462867615, + "grad_norm": 9.9375, + "learning_rate": 4.734853167772979e-06, + "loss": 1.4543, + "step": 54300 + }, + { + "epoch": 1.580126247200163, + "grad_norm": 13.625, + "learning_rate": 4.732913886100364e-06, + "loss": 1.4826, + "step": 54320 + }, + { + "epoch": 1.5807080315327107, + "grad_norm": 12.1875, + "learning_rate": 4.730974604427749e-06, + "loss": 1.5062, + "step": 54340 + }, + { + "epoch": 1.5812898158652589, + "grad_norm": 10.8125, + "learning_rate": 4.729035322755134e-06, + "loss": 1.4645, + "step": 54360 + }, + { + "epoch": 1.5818716001978066, + "grad_norm": 13.3125, + "learning_rate": 4.727096041082519e-06, + "loss": 1.5349, + "step": 54380 + }, + { + "epoch": 1.5824533845303546, + "grad_norm": 11.6875, + "learning_rate": 4.725156759409904e-06, + "loss": 1.5129, + "step": 54400 + }, + { + "epoch": 1.5830351688629025, + "grad_norm": 15.125, + "learning_rate": 4.723217477737289e-06, + "loss": 1.5458, + "step": 54420 + }, + { + "epoch": 1.5836169531954505, + "grad_norm": 11.375, + "learning_rate": 4.7212781960646745e-06, + "loss": 1.5131, + "step": 54440 + }, + { + "epoch": 1.5841987375279984, + "grad_norm": 10.9375, + "learning_rate": 4.7193389143920596e-06, + "loss": 1.6478, + "step": 54460 + }, + { + "epoch": 1.5847805218605462, + "grad_norm": 14.875, + "learning_rate": 4.717399632719445e-06, + "loss": 1.5146, + "step": 54480 + }, + { + "epoch": 1.5853623061930944, + "grad_norm": 12.0, + "learning_rate": 4.71546035104683e-06, + "loss": 1.5281, + "step": 54500 + }, + { + "epoch": 1.5859440905256421, + "grad_norm": 13.8125, + "learning_rate": 4.713521069374215e-06, + "loss": 1.575, + "step": 54520 + }, + { + "epoch": 1.58652587485819, + "grad_norm": 16.875, + "learning_rate": 4.7115817877016e-06, + "loss": 1.5869, + "step": 54540 + }, + { + "epoch": 1.587107659190738, + "grad_norm": 12.375, + "learning_rate": 4.709642506028985e-06, + "loss": 1.4953, + "step": 54560 + }, + { + "epoch": 1.5876894435232858, + "grad_norm": 10.875, + "learning_rate": 4.70770322435637e-06, + "loss": 1.5689, + "step": 54580 + }, + { + "epoch": 1.588271227855834, + "grad_norm": 17.25, + "learning_rate": 4.705763942683754e-06, + "loss": 1.5589, + "step": 54600 + }, + { + "epoch": 1.5888530121883817, + "grad_norm": 13.0, + "learning_rate": 4.7038246610111395e-06, + "loss": 1.5302, + "step": 54620 + }, + { + "epoch": 1.5894347965209297, + "grad_norm": 13.5625, + "learning_rate": 4.701885379338525e-06, + "loss": 1.3932, + "step": 54640 + }, + { + "epoch": 1.5900165808534776, + "grad_norm": 11.875, + "learning_rate": 4.69994609766591e-06, + "loss": 1.4435, + "step": 54660 + }, + { + "epoch": 1.5905983651860254, + "grad_norm": 13.125, + "learning_rate": 4.698006815993295e-06, + "loss": 1.5546, + "step": 54680 + }, + { + "epoch": 1.5911801495185736, + "grad_norm": 11.625, + "learning_rate": 4.69606753432068e-06, + "loss": 1.5133, + "step": 54700 + }, + { + "epoch": 1.5917619338511213, + "grad_norm": 14.1875, + "learning_rate": 4.694128252648065e-06, + "loss": 1.4977, + "step": 54720 + }, + { + "epoch": 1.5923437181836693, + "grad_norm": 14.6875, + "learning_rate": 4.69218897097545e-06, + "loss": 1.6043, + "step": 54740 + }, + { + "epoch": 1.5929255025162172, + "grad_norm": 15.0625, + "learning_rate": 4.690249689302835e-06, + "loss": 1.5224, + "step": 54760 + }, + { + "epoch": 1.5935072868487652, + "grad_norm": 16.5, + "learning_rate": 4.68831040763022e-06, + "loss": 1.5228, + "step": 54780 + }, + { + "epoch": 1.5940890711813132, + "grad_norm": 16.125, + "learning_rate": 4.686371125957605e-06, + "loss": 1.5182, + "step": 54800 + }, + { + "epoch": 1.594670855513861, + "grad_norm": 16.375, + "learning_rate": 4.6844318442849905e-06, + "loss": 1.4713, + "step": 54820 + }, + { + "epoch": 1.595252639846409, + "grad_norm": 13.5625, + "learning_rate": 4.6824925626123756e-06, + "loss": 1.589, + "step": 54840 + }, + { + "epoch": 1.5958344241789568, + "grad_norm": 12.8125, + "learning_rate": 4.680553280939761e-06, + "loss": 1.5047, + "step": 54860 + }, + { + "epoch": 1.5964162085115048, + "grad_norm": 15.1875, + "learning_rate": 4.678613999267146e-06, + "loss": 1.5411, + "step": 54880 + }, + { + "epoch": 1.5969979928440527, + "grad_norm": 15.4375, + "learning_rate": 4.676674717594531e-06, + "loss": 1.4919, + "step": 54900 + }, + { + "epoch": 1.5975797771766005, + "grad_norm": 16.0, + "learning_rate": 4.674735435921916e-06, + "loss": 1.6321, + "step": 54920 + }, + { + "epoch": 1.5981615615091487, + "grad_norm": 14.25, + "learning_rate": 4.672796154249301e-06, + "loss": 1.4955, + "step": 54940 + }, + { + "epoch": 1.5987433458416964, + "grad_norm": 11.4375, + "learning_rate": 4.670856872576686e-06, + "loss": 1.5507, + "step": 54960 + }, + { + "epoch": 1.5993251301742444, + "grad_norm": 9.8125, + "learning_rate": 4.668917590904071e-06, + "loss": 1.514, + "step": 54980 + }, + { + "epoch": 1.5999069145067923, + "grad_norm": 12.6875, + "learning_rate": 4.666978309231456e-06, + "loss": 1.4391, + "step": 55000 + }, + { + "epoch": 1.6004886988393403, + "grad_norm": 13.0625, + "learning_rate": 4.6650390275588414e-06, + "loss": 1.5631, + "step": 55020 + }, + { + "epoch": 1.6010704831718883, + "grad_norm": 11.625, + "learning_rate": 4.6630997458862265e-06, + "loss": 1.4999, + "step": 55040 + }, + { + "epoch": 1.601652267504436, + "grad_norm": 11.125, + "learning_rate": 4.661160464213612e-06, + "loss": 1.5002, + "step": 55060 + }, + { + "epoch": 1.602234051836984, + "grad_norm": 13.125, + "learning_rate": 4.659221182540997e-06, + "loss": 1.451, + "step": 55080 + }, + { + "epoch": 1.602815836169532, + "grad_norm": 10.5625, + "learning_rate": 4.657281900868382e-06, + "loss": 1.5922, + "step": 55100 + }, + { + "epoch": 1.60339762050208, + "grad_norm": 15.125, + "learning_rate": 4.655342619195767e-06, + "loss": 1.5512, + "step": 55120 + }, + { + "epoch": 1.6039794048346279, + "grad_norm": 13.0, + "learning_rate": 4.653403337523152e-06, + "loss": 1.5206, + "step": 55140 + }, + { + "epoch": 1.6045611891671756, + "grad_norm": 13.9375, + "learning_rate": 4.651464055850537e-06, + "loss": 1.5631, + "step": 55160 + }, + { + "epoch": 1.6051429734997238, + "grad_norm": 13.375, + "learning_rate": 4.649524774177922e-06, + "loss": 1.5608, + "step": 55180 + }, + { + "epoch": 1.6057247578322715, + "grad_norm": 13.6875, + "learning_rate": 4.647585492505307e-06, + "loss": 1.5139, + "step": 55200 + }, + { + "epoch": 1.6063065421648195, + "grad_norm": 13.9375, + "learning_rate": 4.645646210832692e-06, + "loss": 1.5417, + "step": 55220 + }, + { + "epoch": 1.6068883264973675, + "grad_norm": 15.0, + "learning_rate": 4.6437069291600775e-06, + "loss": 1.4508, + "step": 55240 + }, + { + "epoch": 1.6074701108299152, + "grad_norm": 13.125, + "learning_rate": 4.641767647487463e-06, + "loss": 1.5182, + "step": 55260 + }, + { + "epoch": 1.6080518951624634, + "grad_norm": 12.125, + "learning_rate": 4.639828365814848e-06, + "loss": 1.5126, + "step": 55280 + }, + { + "epoch": 1.6086336794950111, + "grad_norm": 14.625, + "learning_rate": 4.637889084142233e-06, + "loss": 1.5812, + "step": 55300 + }, + { + "epoch": 1.609215463827559, + "grad_norm": 14.75, + "learning_rate": 4.635949802469618e-06, + "loss": 1.5074, + "step": 55320 + }, + { + "epoch": 1.609797248160107, + "grad_norm": 12.75, + "learning_rate": 4.634010520797003e-06, + "loss": 1.4601, + "step": 55340 + }, + { + "epoch": 1.610379032492655, + "grad_norm": 12.4375, + "learning_rate": 4.632071239124388e-06, + "loss": 1.5758, + "step": 55360 + }, + { + "epoch": 1.610960816825203, + "grad_norm": 10.25, + "learning_rate": 4.630131957451773e-06, + "loss": 1.5303, + "step": 55380 + }, + { + "epoch": 1.6115426011577507, + "grad_norm": 14.3125, + "learning_rate": 4.6281926757791575e-06, + "loss": 1.506, + "step": 55400 + }, + { + "epoch": 1.612124385490299, + "grad_norm": 14.75, + "learning_rate": 4.6262533941065426e-06, + "loss": 1.4768, + "step": 55420 + }, + { + "epoch": 1.6127061698228466, + "grad_norm": 13.75, + "learning_rate": 4.624314112433928e-06, + "loss": 1.5796, + "step": 55440 + }, + { + "epoch": 1.6132879541553946, + "grad_norm": 13.4375, + "learning_rate": 4.622374830761313e-06, + "loss": 1.5388, + "step": 55460 + }, + { + "epoch": 1.6138697384879426, + "grad_norm": 14.5, + "learning_rate": 4.620435549088698e-06, + "loss": 1.5302, + "step": 55480 + }, + { + "epoch": 1.6144515228204903, + "grad_norm": 11.875, + "learning_rate": 4.618496267416083e-06, + "loss": 1.5086, + "step": 55500 + }, + { + "epoch": 1.6150333071530385, + "grad_norm": 12.125, + "learning_rate": 4.616556985743468e-06, + "loss": 1.5663, + "step": 55520 + }, + { + "epoch": 1.6156150914855862, + "grad_norm": 14.5, + "learning_rate": 4.614617704070853e-06, + "loss": 1.4878, + "step": 55540 + }, + { + "epoch": 1.6161968758181342, + "grad_norm": 15.9375, + "learning_rate": 4.612678422398238e-06, + "loss": 1.5529, + "step": 55560 + }, + { + "epoch": 1.6167786601506822, + "grad_norm": 13.125, + "learning_rate": 4.610739140725623e-06, + "loss": 1.3758, + "step": 55580 + }, + { + "epoch": 1.61736044448323, + "grad_norm": 14.8125, + "learning_rate": 4.6087998590530084e-06, + "loss": 1.4997, + "step": 55600 + }, + { + "epoch": 1.617942228815778, + "grad_norm": 13.75, + "learning_rate": 4.6068605773803935e-06, + "loss": 1.4679, + "step": 55620 + }, + { + "epoch": 1.6185240131483258, + "grad_norm": 18.875, + "learning_rate": 4.604921295707779e-06, + "loss": 1.5344, + "step": 55640 + }, + { + "epoch": 1.6191057974808738, + "grad_norm": 13.375, + "learning_rate": 4.602982014035164e-06, + "loss": 1.4957, + "step": 55660 + }, + { + "epoch": 1.6196875818134218, + "grad_norm": 14.3125, + "learning_rate": 4.601042732362548e-06, + "loss": 1.5895, + "step": 55680 + }, + { + "epoch": 1.6202693661459697, + "grad_norm": 12.0625, + "learning_rate": 4.599103450689933e-06, + "loss": 1.5243, + "step": 55700 + }, + { + "epoch": 1.6208511504785177, + "grad_norm": 13.8125, + "learning_rate": 4.597164169017318e-06, + "loss": 1.5176, + "step": 55720 + }, + { + "epoch": 1.6214329348110654, + "grad_norm": 13.9375, + "learning_rate": 4.595224887344703e-06, + "loss": 1.5591, + "step": 55740 + }, + { + "epoch": 1.6220147191436136, + "grad_norm": 12.75, + "learning_rate": 4.593285605672088e-06, + "loss": 1.5138, + "step": 55760 + }, + { + "epoch": 1.6225965034761614, + "grad_norm": 11.6875, + "learning_rate": 4.5913463239994735e-06, + "loss": 1.5448, + "step": 55780 + }, + { + "epoch": 1.6231782878087093, + "grad_norm": 13.8125, + "learning_rate": 4.5894070423268586e-06, + "loss": 1.5806, + "step": 55800 + }, + { + "epoch": 1.6237600721412573, + "grad_norm": 13.75, + "learning_rate": 4.587467760654244e-06, + "loss": 1.5211, + "step": 55820 + }, + { + "epoch": 1.624341856473805, + "grad_norm": 12.0, + "learning_rate": 4.585528478981629e-06, + "loss": 1.5163, + "step": 55840 + }, + { + "epoch": 1.6249236408063532, + "grad_norm": 12.8125, + "learning_rate": 4.583589197309014e-06, + "loss": 1.5664, + "step": 55860 + }, + { + "epoch": 1.625505425138901, + "grad_norm": 14.125, + "learning_rate": 4.581649915636399e-06, + "loss": 1.464, + "step": 55880 + }, + { + "epoch": 1.626087209471449, + "grad_norm": 12.9375, + "learning_rate": 4.579710633963784e-06, + "loss": 1.5305, + "step": 55900 + }, + { + "epoch": 1.6266689938039969, + "grad_norm": 13.4375, + "learning_rate": 4.577771352291169e-06, + "loss": 1.5297, + "step": 55920 + }, + { + "epoch": 1.6272507781365446, + "grad_norm": 15.75, + "learning_rate": 4.575832070618554e-06, + "loss": 1.4527, + "step": 55940 + }, + { + "epoch": 1.6278325624690928, + "grad_norm": 12.3125, + "learning_rate": 4.573892788945939e-06, + "loss": 1.5322, + "step": 55960 + }, + { + "epoch": 1.6284143468016405, + "grad_norm": 12.4375, + "learning_rate": 4.5719535072733244e-06, + "loss": 1.5373, + "step": 55980 + }, + { + "epoch": 1.6289961311341885, + "grad_norm": 14.25, + "learning_rate": 4.5700142256007095e-06, + "loss": 1.5091, + "step": 56000 + }, + { + "epoch": 1.6295779154667365, + "grad_norm": 13.625, + "learning_rate": 4.568074943928095e-06, + "loss": 1.5388, + "step": 56020 + }, + { + "epoch": 1.6301596997992844, + "grad_norm": 16.625, + "learning_rate": 4.56613566225548e-06, + "loss": 1.5081, + "step": 56040 + }, + { + "epoch": 1.6307414841318324, + "grad_norm": 11.625, + "learning_rate": 4.564196380582865e-06, + "loss": 1.4889, + "step": 56060 + }, + { + "epoch": 1.6313232684643801, + "grad_norm": 11.875, + "learning_rate": 4.56225709891025e-06, + "loss": 1.5055, + "step": 56080 + }, + { + "epoch": 1.6319050527969283, + "grad_norm": 13.6875, + "learning_rate": 4.560317817237635e-06, + "loss": 1.5741, + "step": 56100 + }, + { + "epoch": 1.632486837129476, + "grad_norm": 13.1875, + "learning_rate": 4.55837853556502e-06, + "loss": 1.4833, + "step": 56120 + }, + { + "epoch": 1.633068621462024, + "grad_norm": 14.75, + "learning_rate": 4.556439253892405e-06, + "loss": 1.4771, + "step": 56140 + }, + { + "epoch": 1.633650405794572, + "grad_norm": 13.0, + "learning_rate": 4.55449997221979e-06, + "loss": 1.4479, + "step": 56160 + }, + { + "epoch": 1.6342321901271197, + "grad_norm": 12.6875, + "learning_rate": 4.552560690547175e-06, + "loss": 1.5089, + "step": 56180 + }, + { + "epoch": 1.634813974459668, + "grad_norm": 13.5625, + "learning_rate": 4.5506214088745605e-06, + "loss": 1.5075, + "step": 56200 + }, + { + "epoch": 1.6353957587922157, + "grad_norm": 13.625, + "learning_rate": 4.548682127201946e-06, + "loss": 1.4791, + "step": 56220 + }, + { + "epoch": 1.6359775431247636, + "grad_norm": 15.375, + "learning_rate": 4.546742845529331e-06, + "loss": 1.5792, + "step": 56240 + }, + { + "epoch": 1.6365593274573116, + "grad_norm": 14.375, + "learning_rate": 4.544803563856716e-06, + "loss": 1.5874, + "step": 56260 + }, + { + "epoch": 1.6371411117898595, + "grad_norm": 11.6875, + "learning_rate": 4.542864282184101e-06, + "loss": 1.5322, + "step": 56280 + }, + { + "epoch": 1.6377228961224075, + "grad_norm": 11.6875, + "learning_rate": 4.540925000511486e-06, + "loss": 1.5733, + "step": 56300 + }, + { + "epoch": 1.6383046804549553, + "grad_norm": 12.8125, + "learning_rate": 4.538985718838871e-06, + "loss": 1.4755, + "step": 56320 + }, + { + "epoch": 1.6388864647875034, + "grad_norm": 14.125, + "learning_rate": 4.537046437166256e-06, + "loss": 1.4399, + "step": 56340 + }, + { + "epoch": 1.6394682491200512, + "grad_norm": 16.125, + "learning_rate": 4.535107155493641e-06, + "loss": 1.5363, + "step": 56360 + }, + { + "epoch": 1.6400500334525991, + "grad_norm": 13.0, + "learning_rate": 4.533167873821026e-06, + "loss": 1.539, + "step": 56380 + }, + { + "epoch": 1.640631817785147, + "grad_norm": 11.8125, + "learning_rate": 4.5312285921484115e-06, + "loss": 1.4282, + "step": 56400 + }, + { + "epoch": 1.6412136021176948, + "grad_norm": 12.6875, + "learning_rate": 4.529289310475797e-06, + "loss": 1.5218, + "step": 56420 + }, + { + "epoch": 1.641795386450243, + "grad_norm": 13.3125, + "learning_rate": 4.527350028803182e-06, + "loss": 1.5561, + "step": 56440 + }, + { + "epoch": 1.6423771707827908, + "grad_norm": 10.875, + "learning_rate": 4.525410747130567e-06, + "loss": 1.4688, + "step": 56460 + }, + { + "epoch": 1.6429589551153387, + "grad_norm": 16.375, + "learning_rate": 4.523471465457952e-06, + "loss": 1.5251, + "step": 56480 + }, + { + "epoch": 1.6435407394478867, + "grad_norm": 12.3125, + "learning_rate": 4.521532183785336e-06, + "loss": 1.5564, + "step": 56500 + }, + { + "epoch": 1.6441225237804344, + "grad_norm": 11.3125, + "learning_rate": 4.519592902112721e-06, + "loss": 1.5388, + "step": 56520 + }, + { + "epoch": 1.6447043081129826, + "grad_norm": 9.9375, + "learning_rate": 4.517653620440106e-06, + "loss": 1.5414, + "step": 56540 + }, + { + "epoch": 1.6452860924455304, + "grad_norm": 13.1875, + "learning_rate": 4.515714338767491e-06, + "loss": 1.5193, + "step": 56560 + }, + { + "epoch": 1.6458678767780783, + "grad_norm": 14.375, + "learning_rate": 4.5137750570948765e-06, + "loss": 1.5994, + "step": 56580 + }, + { + "epoch": 1.6464496611106263, + "grad_norm": 12.625, + "learning_rate": 4.511835775422262e-06, + "loss": 1.5887, + "step": 56600 + }, + { + "epoch": 1.6470314454431743, + "grad_norm": 16.75, + "learning_rate": 4.509896493749647e-06, + "loss": 1.5113, + "step": 56620 + }, + { + "epoch": 1.6476132297757222, + "grad_norm": 13.3125, + "learning_rate": 4.507957212077032e-06, + "loss": 1.6203, + "step": 56640 + }, + { + "epoch": 1.64819501410827, + "grad_norm": 11.6875, + "learning_rate": 4.506017930404417e-06, + "loss": 1.4855, + "step": 56660 + }, + { + "epoch": 1.6487767984408181, + "grad_norm": 14.375, + "learning_rate": 4.504078648731802e-06, + "loss": 1.4891, + "step": 56680 + }, + { + "epoch": 1.6493585827733659, + "grad_norm": 12.5625, + "learning_rate": 4.502139367059187e-06, + "loss": 1.5076, + "step": 56700 + }, + { + "epoch": 1.6499403671059139, + "grad_norm": 11.75, + "learning_rate": 4.500200085386572e-06, + "loss": 1.5899, + "step": 56720 + }, + { + "epoch": 1.6505221514384618, + "grad_norm": 13.25, + "learning_rate": 4.498260803713957e-06, + "loss": 1.5738, + "step": 56740 + }, + { + "epoch": 1.6511039357710096, + "grad_norm": 11.9375, + "learning_rate": 4.496321522041342e-06, + "loss": 1.5436, + "step": 56760 + }, + { + "epoch": 1.6516857201035577, + "grad_norm": 14.0, + "learning_rate": 4.4943822403687275e-06, + "loss": 1.5207, + "step": 56780 + }, + { + "epoch": 1.6522675044361055, + "grad_norm": 11.5, + "learning_rate": 4.492442958696113e-06, + "loss": 1.5748, + "step": 56800 + }, + { + "epoch": 1.6528492887686534, + "grad_norm": 15.5625, + "learning_rate": 4.490503677023498e-06, + "loss": 1.5621, + "step": 56820 + }, + { + "epoch": 1.6534310731012014, + "grad_norm": 12.375, + "learning_rate": 4.488564395350883e-06, + "loss": 1.5014, + "step": 56840 + }, + { + "epoch": 1.6540128574337492, + "grad_norm": 12.75, + "learning_rate": 4.486625113678268e-06, + "loss": 1.5741, + "step": 56860 + }, + { + "epoch": 1.6545946417662973, + "grad_norm": 14.1875, + "learning_rate": 4.484685832005653e-06, + "loss": 1.6574, + "step": 56880 + }, + { + "epoch": 1.655176426098845, + "grad_norm": 12.875, + "learning_rate": 4.482746550333038e-06, + "loss": 1.4918, + "step": 56900 + }, + { + "epoch": 1.655758210431393, + "grad_norm": 11.875, + "learning_rate": 4.480807268660423e-06, + "loss": 1.4969, + "step": 56920 + }, + { + "epoch": 1.656339994763941, + "grad_norm": 14.5, + "learning_rate": 4.478867986987808e-06, + "loss": 1.5432, + "step": 56940 + }, + { + "epoch": 1.656921779096489, + "grad_norm": 13.6875, + "learning_rate": 4.476928705315193e-06, + "loss": 1.5645, + "step": 56960 + }, + { + "epoch": 1.657503563429037, + "grad_norm": 15.25, + "learning_rate": 4.4749894236425785e-06, + "loss": 1.4698, + "step": 56980 + }, + { + "epoch": 1.6580853477615847, + "grad_norm": 13.5625, + "learning_rate": 4.4730501419699636e-06, + "loss": 1.5622, + "step": 57000 + }, + { + "epoch": 1.6586671320941329, + "grad_norm": 12.5625, + "learning_rate": 4.471110860297349e-06, + "loss": 1.5406, + "step": 57020 + }, + { + "epoch": 1.6592489164266806, + "grad_norm": 15.25, + "learning_rate": 4.469171578624734e-06, + "loss": 1.5354, + "step": 57040 + }, + { + "epoch": 1.6598307007592286, + "grad_norm": 13.1875, + "learning_rate": 4.467232296952119e-06, + "loss": 1.4825, + "step": 57060 + }, + { + "epoch": 1.6604124850917765, + "grad_norm": 13.6875, + "learning_rate": 4.465293015279504e-06, + "loss": 1.4245, + "step": 57080 + }, + { + "epoch": 1.6609942694243243, + "grad_norm": 13.625, + "learning_rate": 4.463353733606889e-06, + "loss": 1.5259, + "step": 57100 + }, + { + "epoch": 1.6615760537568725, + "grad_norm": 12.375, + "learning_rate": 4.461414451934274e-06, + "loss": 1.562, + "step": 57120 + }, + { + "epoch": 1.6621578380894202, + "grad_norm": 9.0625, + "learning_rate": 4.459475170261659e-06, + "loss": 1.552, + "step": 57140 + }, + { + "epoch": 1.6627396224219682, + "grad_norm": 15.4375, + "learning_rate": 4.457535888589044e-06, + "loss": 1.4998, + "step": 57160 + }, + { + "epoch": 1.6633214067545161, + "grad_norm": 15.0625, + "learning_rate": 4.4555966069164294e-06, + "loss": 1.569, + "step": 57180 + }, + { + "epoch": 1.6639031910870639, + "grad_norm": 12.75, + "learning_rate": 4.4536573252438145e-06, + "loss": 1.4864, + "step": 57200 + }, + { + "epoch": 1.664484975419612, + "grad_norm": 13.1875, + "learning_rate": 4.4517180435712e-06, + "loss": 1.482, + "step": 57220 + }, + { + "epoch": 1.6650667597521598, + "grad_norm": 13.0625, + "learning_rate": 4.449778761898585e-06, + "loss": 1.5329, + "step": 57240 + }, + { + "epoch": 1.6656485440847077, + "grad_norm": 17.0, + "learning_rate": 4.44783948022597e-06, + "loss": 1.497, + "step": 57260 + }, + { + "epoch": 1.6662303284172557, + "grad_norm": 16.5, + "learning_rate": 4.445900198553355e-06, + "loss": 1.5104, + "step": 57280 + }, + { + "epoch": 1.6668121127498037, + "grad_norm": 13.0, + "learning_rate": 4.44396091688074e-06, + "loss": 1.5361, + "step": 57300 + }, + { + "epoch": 1.6673938970823516, + "grad_norm": 14.5, + "learning_rate": 4.442021635208124e-06, + "loss": 1.4874, + "step": 57320 + }, + { + "epoch": 1.6679756814148994, + "grad_norm": 13.625, + "learning_rate": 4.440082353535509e-06, + "loss": 1.4985, + "step": 57340 + }, + { + "epoch": 1.6685574657474476, + "grad_norm": 14.1875, + "learning_rate": 4.4381430718628945e-06, + "loss": 1.5271, + "step": 57360 + }, + { + "epoch": 1.6691392500799953, + "grad_norm": 11.8125, + "learning_rate": 4.4362037901902796e-06, + "loss": 1.5104, + "step": 57380 + }, + { + "epoch": 1.6697210344125433, + "grad_norm": 12.125, + "learning_rate": 4.434264508517665e-06, + "loss": 1.4442, + "step": 57400 + }, + { + "epoch": 1.6703028187450912, + "grad_norm": 14.6875, + "learning_rate": 4.43232522684505e-06, + "loss": 1.5881, + "step": 57420 + }, + { + "epoch": 1.670884603077639, + "grad_norm": 12.0, + "learning_rate": 4.430385945172435e-06, + "loss": 1.4902, + "step": 57440 + }, + { + "epoch": 1.6714663874101872, + "grad_norm": 12.4375, + "learning_rate": 4.42844666349982e-06, + "loss": 1.5443, + "step": 57460 + }, + { + "epoch": 1.672048171742735, + "grad_norm": 10.9375, + "learning_rate": 4.426507381827205e-06, + "loss": 1.5009, + "step": 57480 + }, + { + "epoch": 1.6726299560752829, + "grad_norm": 11.625, + "learning_rate": 4.42456810015459e-06, + "loss": 1.4822, + "step": 57500 + }, + { + "epoch": 1.6732117404078308, + "grad_norm": 12.25, + "learning_rate": 4.422628818481975e-06, + "loss": 1.5753, + "step": 57520 + }, + { + "epoch": 1.6737935247403788, + "grad_norm": 13.9375, + "learning_rate": 4.42068953680936e-06, + "loss": 1.4683, + "step": 57540 + }, + { + "epoch": 1.6743753090729268, + "grad_norm": 11.4375, + "learning_rate": 4.4187502551367454e-06, + "loss": 1.5026, + "step": 57560 + }, + { + "epoch": 1.6749570934054745, + "grad_norm": 16.25, + "learning_rate": 4.4168109734641305e-06, + "loss": 1.5711, + "step": 57580 + }, + { + "epoch": 1.6755388777380227, + "grad_norm": 12.75, + "learning_rate": 4.414871691791516e-06, + "loss": 1.5626, + "step": 57600 + }, + { + "epoch": 1.6761206620705704, + "grad_norm": 14.1875, + "learning_rate": 4.412932410118901e-06, + "loss": 1.5723, + "step": 57620 + }, + { + "epoch": 1.6767024464031184, + "grad_norm": 13.0, + "learning_rate": 4.410993128446286e-06, + "loss": 1.5563, + "step": 57640 + }, + { + "epoch": 1.6772842307356663, + "grad_norm": 15.875, + "learning_rate": 4.409053846773671e-06, + "loss": 1.5386, + "step": 57660 + }, + { + "epoch": 1.677866015068214, + "grad_norm": 15.9375, + "learning_rate": 4.407114565101056e-06, + "loss": 1.494, + "step": 57680 + }, + { + "epoch": 1.6784477994007623, + "grad_norm": 11.3125, + "learning_rate": 4.405175283428441e-06, + "loss": 1.5629, + "step": 57700 + }, + { + "epoch": 1.67902958373331, + "grad_norm": 12.5625, + "learning_rate": 4.403236001755825e-06, + "loss": 1.4474, + "step": 57720 + }, + { + "epoch": 1.679611368065858, + "grad_norm": 11.75, + "learning_rate": 4.4012967200832105e-06, + "loss": 1.6147, + "step": 57740 + }, + { + "epoch": 1.680193152398406, + "grad_norm": 14.4375, + "learning_rate": 4.3993574384105956e-06, + "loss": 1.4958, + "step": 57760 + }, + { + "epoch": 1.6807749367309537, + "grad_norm": 14.625, + "learning_rate": 4.397418156737981e-06, + "loss": 1.5962, + "step": 57780 + }, + { + "epoch": 1.6813567210635019, + "grad_norm": 15.4375, + "learning_rate": 4.395478875065366e-06, + "loss": 1.5277, + "step": 57800 + }, + { + "epoch": 1.6819385053960496, + "grad_norm": 13.5, + "learning_rate": 4.393539593392751e-06, + "loss": 1.5227, + "step": 57820 + }, + { + "epoch": 1.6825202897285976, + "grad_norm": 14.4375, + "learning_rate": 4.391600311720136e-06, + "loss": 1.5584, + "step": 57840 + }, + { + "epoch": 1.6831020740611455, + "grad_norm": 12.875, + "learning_rate": 4.389661030047521e-06, + "loss": 1.6491, + "step": 57860 + }, + { + "epoch": 1.6836838583936935, + "grad_norm": 12.4375, + "learning_rate": 4.387721748374906e-06, + "loss": 1.5882, + "step": 57880 + }, + { + "epoch": 1.6842656427262415, + "grad_norm": 13.3125, + "learning_rate": 4.385782466702291e-06, + "loss": 1.567, + "step": 57900 + }, + { + "epoch": 1.6848474270587892, + "grad_norm": 13.0625, + "learning_rate": 4.383843185029676e-06, + "loss": 1.5049, + "step": 57920 + }, + { + "epoch": 1.6854292113913374, + "grad_norm": 13.375, + "learning_rate": 4.3819039033570614e-06, + "loss": 1.5021, + "step": 57940 + }, + { + "epoch": 1.6860109957238851, + "grad_norm": 12.9375, + "learning_rate": 4.3799646216844465e-06, + "loss": 1.6091, + "step": 57960 + }, + { + "epoch": 1.686592780056433, + "grad_norm": 12.0625, + "learning_rate": 4.378025340011832e-06, + "loss": 1.5639, + "step": 57980 + }, + { + "epoch": 1.687174564388981, + "grad_norm": 14.875, + "learning_rate": 4.376086058339217e-06, + "loss": 1.5785, + "step": 58000 + }, + { + "epoch": 1.6877563487215288, + "grad_norm": 14.375, + "learning_rate": 4.374146776666602e-06, + "loss": 1.5313, + "step": 58020 + }, + { + "epoch": 1.688338133054077, + "grad_norm": 13.5, + "learning_rate": 4.372207494993987e-06, + "loss": 1.5673, + "step": 58040 + }, + { + "epoch": 1.6889199173866247, + "grad_norm": 11.9375, + "learning_rate": 4.370268213321372e-06, + "loss": 1.5271, + "step": 58060 + }, + { + "epoch": 1.6895017017191727, + "grad_norm": 12.375, + "learning_rate": 4.368328931648757e-06, + "loss": 1.5138, + "step": 58080 + }, + { + "epoch": 1.6900834860517207, + "grad_norm": 14.5625, + "learning_rate": 4.366389649976142e-06, + "loss": 1.5513, + "step": 58100 + }, + { + "epoch": 1.6906652703842684, + "grad_norm": 12.625, + "learning_rate": 4.364450368303527e-06, + "loss": 1.4659, + "step": 58120 + }, + { + "epoch": 1.6912470547168166, + "grad_norm": 17.125, + "learning_rate": 4.362511086630912e-06, + "loss": 1.5365, + "step": 58140 + }, + { + "epoch": 1.6918288390493643, + "grad_norm": 14.5625, + "learning_rate": 4.3605718049582975e-06, + "loss": 1.5765, + "step": 58160 + }, + { + "epoch": 1.6924106233819123, + "grad_norm": 14.8125, + "learning_rate": 4.358632523285683e-06, + "loss": 1.5635, + "step": 58180 + }, + { + "epoch": 1.6929924077144602, + "grad_norm": 16.25, + "learning_rate": 4.356693241613068e-06, + "loss": 1.4967, + "step": 58200 + }, + { + "epoch": 1.6935741920470082, + "grad_norm": 13.0, + "learning_rate": 4.354753959940453e-06, + "loss": 1.5344, + "step": 58220 + }, + { + "epoch": 1.6941559763795562, + "grad_norm": 10.25, + "learning_rate": 4.352814678267838e-06, + "loss": 1.5465, + "step": 58240 + }, + { + "epoch": 1.694737760712104, + "grad_norm": 12.0625, + "learning_rate": 4.350875396595223e-06, + "loss": 1.5161, + "step": 58260 + }, + { + "epoch": 1.695319545044652, + "grad_norm": 12.5625, + "learning_rate": 4.348936114922608e-06, + "loss": 1.5355, + "step": 58280 + }, + { + "epoch": 1.6959013293771998, + "grad_norm": 13.8125, + "learning_rate": 4.346996833249993e-06, + "loss": 1.4878, + "step": 58300 + }, + { + "epoch": 1.6964831137097478, + "grad_norm": 11.9375, + "learning_rate": 4.345057551577378e-06, + "loss": 1.5294, + "step": 58320 + }, + { + "epoch": 1.6970648980422958, + "grad_norm": 16.125, + "learning_rate": 4.343118269904763e-06, + "loss": 1.4918, + "step": 58340 + }, + { + "epoch": 1.6976466823748435, + "grad_norm": 14.6875, + "learning_rate": 4.3411789882321485e-06, + "loss": 1.4957, + "step": 58360 + }, + { + "epoch": 1.6982284667073917, + "grad_norm": 14.875, + "learning_rate": 4.339239706559534e-06, + "loss": 1.5475, + "step": 58380 + }, + { + "epoch": 1.6988102510399394, + "grad_norm": 14.75, + "learning_rate": 4.337300424886919e-06, + "loss": 1.4932, + "step": 58400 + }, + { + "epoch": 1.6993920353724874, + "grad_norm": 12.8125, + "learning_rate": 4.335361143214303e-06, + "loss": 1.5463, + "step": 58420 + }, + { + "epoch": 1.6999738197050354, + "grad_norm": 11.6875, + "learning_rate": 4.333421861541688e-06, + "loss": 1.5259, + "step": 58440 + }, + { + "epoch": 1.700555604037583, + "grad_norm": 10.8125, + "learning_rate": 4.331482579869073e-06, + "loss": 1.5044, + "step": 58460 + }, + { + "epoch": 1.7011373883701313, + "grad_norm": 17.875, + "learning_rate": 4.329543298196458e-06, + "loss": 1.5168, + "step": 58480 + }, + { + "epoch": 1.701719172702679, + "grad_norm": 11.5625, + "learning_rate": 4.327604016523843e-06, + "loss": 1.6243, + "step": 58500 + }, + { + "epoch": 1.702300957035227, + "grad_norm": 13.375, + "learning_rate": 4.3256647348512284e-06, + "loss": 1.4814, + "step": 58520 + }, + { + "epoch": 1.702882741367775, + "grad_norm": 13.125, + "learning_rate": 4.3237254531786135e-06, + "loss": 1.4382, + "step": 58540 + }, + { + "epoch": 1.703464525700323, + "grad_norm": 18.25, + "learning_rate": 4.321786171505999e-06, + "loss": 1.5173, + "step": 58560 + }, + { + "epoch": 1.7040463100328709, + "grad_norm": 16.0, + "learning_rate": 4.319846889833384e-06, + "loss": 1.54, + "step": 58580 + }, + { + "epoch": 1.7046280943654186, + "grad_norm": 12.5625, + "learning_rate": 4.317907608160769e-06, + "loss": 1.4458, + "step": 58600 + }, + { + "epoch": 1.7052098786979668, + "grad_norm": 11.8125, + "learning_rate": 4.315968326488154e-06, + "loss": 1.5298, + "step": 58620 + }, + { + "epoch": 1.7057916630305145, + "grad_norm": 13.625, + "learning_rate": 4.314029044815539e-06, + "loss": 1.4869, + "step": 58640 + }, + { + "epoch": 1.7063734473630625, + "grad_norm": 13.0625, + "learning_rate": 4.312089763142924e-06, + "loss": 1.5136, + "step": 58660 + }, + { + "epoch": 1.7069552316956105, + "grad_norm": 17.25, + "learning_rate": 4.310150481470309e-06, + "loss": 1.5782, + "step": 58680 + }, + { + "epoch": 1.7075370160281582, + "grad_norm": 12.25, + "learning_rate": 4.308211199797694e-06, + "loss": 1.5273, + "step": 58700 + }, + { + "epoch": 1.7081188003607064, + "grad_norm": 12.4375, + "learning_rate": 4.306271918125079e-06, + "loss": 1.5192, + "step": 58720 + }, + { + "epoch": 1.7087005846932541, + "grad_norm": 13.6875, + "learning_rate": 4.3043326364524645e-06, + "loss": 1.4448, + "step": 58740 + }, + { + "epoch": 1.709282369025802, + "grad_norm": 14.75, + "learning_rate": 4.30239335477985e-06, + "loss": 1.6084, + "step": 58760 + }, + { + "epoch": 1.70986415335835, + "grad_norm": 12.9375, + "learning_rate": 4.300454073107235e-06, + "loss": 1.497, + "step": 58780 + }, + { + "epoch": 1.710445937690898, + "grad_norm": 13.0625, + "learning_rate": 4.29851479143462e-06, + "loss": 1.5252, + "step": 58800 + }, + { + "epoch": 1.711027722023446, + "grad_norm": 12.8125, + "learning_rate": 4.296575509762005e-06, + "loss": 1.4394, + "step": 58820 + }, + { + "epoch": 1.7116095063559937, + "grad_norm": 14.0625, + "learning_rate": 4.29463622808939e-06, + "loss": 1.5077, + "step": 58840 + }, + { + "epoch": 1.712191290688542, + "grad_norm": 13.375, + "learning_rate": 4.292696946416775e-06, + "loss": 1.5944, + "step": 58860 + }, + { + "epoch": 1.7127730750210897, + "grad_norm": 12.4375, + "learning_rate": 4.29075766474416e-06, + "loss": 1.5142, + "step": 58880 + }, + { + "epoch": 1.7133548593536376, + "grad_norm": 14.0, + "learning_rate": 4.288818383071545e-06, + "loss": 1.5766, + "step": 58900 + }, + { + "epoch": 1.7139366436861856, + "grad_norm": 13.625, + "learning_rate": 4.28687910139893e-06, + "loss": 1.4333, + "step": 58920 + }, + { + "epoch": 1.7145184280187333, + "grad_norm": 14.125, + "learning_rate": 4.2849398197263155e-06, + "loss": 1.553, + "step": 58940 + }, + { + "epoch": 1.7151002123512815, + "grad_norm": 11.625, + "learning_rate": 4.2830005380537006e-06, + "loss": 1.5117, + "step": 58960 + }, + { + "epoch": 1.7156819966838293, + "grad_norm": 13.125, + "learning_rate": 4.281061256381086e-06, + "loss": 1.4692, + "step": 58980 + }, + { + "epoch": 1.7162637810163772, + "grad_norm": 13.3125, + "learning_rate": 4.279121974708471e-06, + "loss": 1.5303, + "step": 59000 + }, + { + "epoch": 1.7168455653489252, + "grad_norm": 13.5, + "learning_rate": 4.277182693035856e-06, + "loss": 1.503, + "step": 59020 + }, + { + "epoch": 1.717427349681473, + "grad_norm": 11.5625, + "learning_rate": 4.275243411363241e-06, + "loss": 1.5862, + "step": 59040 + }, + { + "epoch": 1.718009134014021, + "grad_norm": 12.4375, + "learning_rate": 4.273304129690626e-06, + "loss": 1.4501, + "step": 59060 + }, + { + "epoch": 1.7185909183465689, + "grad_norm": 13.25, + "learning_rate": 4.271364848018011e-06, + "loss": 1.4513, + "step": 59080 + }, + { + "epoch": 1.7191727026791168, + "grad_norm": 12.0, + "learning_rate": 4.269425566345396e-06, + "loss": 1.5399, + "step": 59100 + }, + { + "epoch": 1.7197544870116648, + "grad_norm": 14.125, + "learning_rate": 4.267486284672781e-06, + "loss": 1.395, + "step": 59120 + }, + { + "epoch": 1.7203362713442127, + "grad_norm": 15.3125, + "learning_rate": 4.2655470030001664e-06, + "loss": 1.5789, + "step": 59140 + }, + { + "epoch": 1.7209180556767607, + "grad_norm": 11.9375, + "learning_rate": 4.2636077213275515e-06, + "loss": 1.4762, + "step": 59160 + }, + { + "epoch": 1.7214998400093084, + "grad_norm": 10.375, + "learning_rate": 4.261668439654937e-06, + "loss": 1.5355, + "step": 59180 + }, + { + "epoch": 1.7220816243418566, + "grad_norm": 14.0, + "learning_rate": 4.259729157982322e-06, + "loss": 1.4571, + "step": 59200 + }, + { + "epoch": 1.7226634086744044, + "grad_norm": 12.625, + "learning_rate": 4.257789876309707e-06, + "loss": 1.4757, + "step": 59220 + }, + { + "epoch": 1.7232451930069523, + "grad_norm": 11.875, + "learning_rate": 4.255850594637091e-06, + "loss": 1.5687, + "step": 59240 + }, + { + "epoch": 1.7238269773395003, + "grad_norm": 13.625, + "learning_rate": 4.253911312964476e-06, + "loss": 1.5222, + "step": 59260 + }, + { + "epoch": 1.724408761672048, + "grad_norm": 12.9375, + "learning_rate": 4.251972031291861e-06, + "loss": 1.5058, + "step": 59280 + }, + { + "epoch": 1.7249905460045962, + "grad_norm": 13.125, + "learning_rate": 4.250032749619246e-06, + "loss": 1.4596, + "step": 59300 + }, + { + "epoch": 1.725572330337144, + "grad_norm": 14.5625, + "learning_rate": 4.2480934679466315e-06, + "loss": 1.5361, + "step": 59320 + }, + { + "epoch": 1.726154114669692, + "grad_norm": 15.0625, + "learning_rate": 4.2461541862740166e-06, + "loss": 1.4849, + "step": 59340 + }, + { + "epoch": 1.72673589900224, + "grad_norm": 13.3125, + "learning_rate": 4.244214904601402e-06, + "loss": 1.5622, + "step": 59360 + }, + { + "epoch": 1.7273176833347876, + "grad_norm": 13.25, + "learning_rate": 4.242275622928787e-06, + "loss": 1.5752, + "step": 59380 + }, + { + "epoch": 1.7278994676673358, + "grad_norm": 14.0625, + "learning_rate": 4.240336341256172e-06, + "loss": 1.5047, + "step": 59400 + }, + { + "epoch": 1.7284812519998836, + "grad_norm": 10.1875, + "learning_rate": 4.238397059583557e-06, + "loss": 1.4494, + "step": 59420 + }, + { + "epoch": 1.7290630363324315, + "grad_norm": 11.625, + "learning_rate": 4.236457777910942e-06, + "loss": 1.3919, + "step": 59440 + }, + { + "epoch": 1.7296448206649795, + "grad_norm": 11.125, + "learning_rate": 4.234518496238327e-06, + "loss": 1.4939, + "step": 59460 + }, + { + "epoch": 1.7302266049975275, + "grad_norm": 15.125, + "learning_rate": 4.232579214565712e-06, + "loss": 1.5392, + "step": 59480 + }, + { + "epoch": 1.7308083893300754, + "grad_norm": 17.25, + "learning_rate": 4.230639932893097e-06, + "loss": 1.5289, + "step": 59500 + }, + { + "epoch": 1.7313901736626232, + "grad_norm": 13.0, + "learning_rate": 4.2287006512204825e-06, + "loss": 1.5658, + "step": 59520 + }, + { + "epoch": 1.7319719579951713, + "grad_norm": 7.28125, + "learning_rate": 4.2267613695478675e-06, + "loss": 1.4307, + "step": 59540 + }, + { + "epoch": 1.732553742327719, + "grad_norm": 13.3125, + "learning_rate": 4.224822087875253e-06, + "loss": 1.5388, + "step": 59560 + }, + { + "epoch": 1.733135526660267, + "grad_norm": 15.5, + "learning_rate": 4.222882806202638e-06, + "loss": 1.5646, + "step": 59580 + }, + { + "epoch": 1.733717310992815, + "grad_norm": 14.75, + "learning_rate": 4.220943524530023e-06, + "loss": 1.5158, + "step": 59600 + }, + { + "epoch": 1.7342990953253627, + "grad_norm": 12.9375, + "learning_rate": 4.219004242857408e-06, + "loss": 1.5818, + "step": 59620 + }, + { + "epoch": 1.734880879657911, + "grad_norm": 14.875, + "learning_rate": 4.217064961184793e-06, + "loss": 1.4712, + "step": 59640 + }, + { + "epoch": 1.7354626639904587, + "grad_norm": 10.8125, + "learning_rate": 4.215125679512178e-06, + "loss": 1.5987, + "step": 59660 + }, + { + "epoch": 1.7360444483230066, + "grad_norm": 13.8125, + "learning_rate": 4.213186397839563e-06, + "loss": 1.5238, + "step": 59680 + }, + { + "epoch": 1.7366262326555546, + "grad_norm": 11.5625, + "learning_rate": 4.211247116166948e-06, + "loss": 1.4808, + "step": 59700 + }, + { + "epoch": 1.7372080169881026, + "grad_norm": 13.0, + "learning_rate": 4.2093078344943334e-06, + "loss": 1.5405, + "step": 59720 + }, + { + "epoch": 1.7377898013206505, + "grad_norm": 11.375, + "learning_rate": 4.2073685528217185e-06, + "loss": 1.49, + "step": 59740 + }, + { + "epoch": 1.7383715856531983, + "grad_norm": 14.0, + "learning_rate": 4.205429271149104e-06, + "loss": 1.5166, + "step": 59760 + }, + { + "epoch": 1.7389533699857462, + "grad_norm": 14.4375, + "learning_rate": 4.203489989476489e-06, + "loss": 1.5367, + "step": 59780 + }, + { + "epoch": 1.7395351543182942, + "grad_norm": 11.5, + "learning_rate": 4.201550707803874e-06, + "loss": 1.5738, + "step": 59800 + }, + { + "epoch": 1.7401169386508422, + "grad_norm": 15.75, + "learning_rate": 4.199611426131259e-06, + "loss": 1.4617, + "step": 59820 + }, + { + "epoch": 1.7406987229833901, + "grad_norm": 15.875, + "learning_rate": 4.197672144458643e-06, + "loss": 1.6239, + "step": 59840 + }, + { + "epoch": 1.7412805073159379, + "grad_norm": 12.5, + "learning_rate": 4.195732862786028e-06, + "loss": 1.5498, + "step": 59860 + }, + { + "epoch": 1.741862291648486, + "grad_norm": 12.25, + "learning_rate": 4.193793581113413e-06, + "loss": 1.5468, + "step": 59880 + }, + { + "epoch": 1.7424440759810338, + "grad_norm": 13.8125, + "learning_rate": 4.1918542994407985e-06, + "loss": 1.4817, + "step": 59900 + }, + { + "epoch": 1.7430258603135818, + "grad_norm": 14.75, + "learning_rate": 4.1899150177681836e-06, + "loss": 1.549, + "step": 59920 + }, + { + "epoch": 1.7436076446461297, + "grad_norm": 12.375, + "learning_rate": 4.187975736095569e-06, + "loss": 1.5096, + "step": 59940 + }, + { + "epoch": 1.7441894289786775, + "grad_norm": 14.3125, + "learning_rate": 4.186036454422954e-06, + "loss": 1.5118, + "step": 59960 + }, + { + "epoch": 1.7447712133112256, + "grad_norm": 14.8125, + "learning_rate": 4.184097172750339e-06, + "loss": 1.6053, + "step": 59980 + }, + { + "epoch": 1.7453529976437734, + "grad_norm": 15.4375, + "learning_rate": 4.182157891077724e-06, + "loss": 1.4679, + "step": 60000 + }, + { + "epoch": 1.7459347819763213, + "grad_norm": 10.4375, + "learning_rate": 4.180218609405109e-06, + "loss": 1.5594, + "step": 60020 + }, + { + "epoch": 1.7465165663088693, + "grad_norm": 14.0, + "learning_rate": 4.178279327732494e-06, + "loss": 1.5039, + "step": 60040 + }, + { + "epoch": 1.7470983506414173, + "grad_norm": 13.125, + "learning_rate": 4.176340046059879e-06, + "loss": 1.5033, + "step": 60060 + }, + { + "epoch": 1.7476801349739652, + "grad_norm": 14.1875, + "learning_rate": 4.174400764387264e-06, + "loss": 1.5387, + "step": 60080 + }, + { + "epoch": 1.748261919306513, + "grad_norm": 12.25, + "learning_rate": 4.1724614827146494e-06, + "loss": 1.5421, + "step": 60100 + }, + { + "epoch": 1.7488437036390612, + "grad_norm": 13.625, + "learning_rate": 4.1705222010420345e-06, + "loss": 1.5557, + "step": 60120 + }, + { + "epoch": 1.749425487971609, + "grad_norm": 13.0625, + "learning_rate": 4.16858291936942e-06, + "loss": 1.529, + "step": 60140 + }, + { + "epoch": 1.7500072723041569, + "grad_norm": 13.5625, + "learning_rate": 4.166643637696805e-06, + "loss": 1.59, + "step": 60160 + }, + { + "epoch": 1.7505890566367048, + "grad_norm": 14.75, + "learning_rate": 4.16470435602419e-06, + "loss": 1.5466, + "step": 60180 + }, + { + "epoch": 1.7511708409692526, + "grad_norm": 14.375, + "learning_rate": 4.162765074351575e-06, + "loss": 1.5484, + "step": 60200 + }, + { + "epoch": 1.7517526253018008, + "grad_norm": 13.1875, + "learning_rate": 4.16082579267896e-06, + "loss": 1.5077, + "step": 60220 + }, + { + "epoch": 1.7523344096343485, + "grad_norm": 14.6875, + "learning_rate": 4.158886511006345e-06, + "loss": 1.5449, + "step": 60240 + }, + { + "epoch": 1.7529161939668965, + "grad_norm": 12.9375, + "learning_rate": 4.15694722933373e-06, + "loss": 1.5163, + "step": 60260 + }, + { + "epoch": 1.7534979782994444, + "grad_norm": 12.0625, + "learning_rate": 4.155007947661115e-06, + "loss": 1.4794, + "step": 60280 + }, + { + "epoch": 1.7540797626319922, + "grad_norm": 13.625, + "learning_rate": 4.1530686659885e-06, + "loss": 1.4792, + "step": 60300 + }, + { + "epoch": 1.7546615469645404, + "grad_norm": 14.5625, + "learning_rate": 4.1511293843158855e-06, + "loss": 1.554, + "step": 60320 + }, + { + "epoch": 1.755243331297088, + "grad_norm": 13.625, + "learning_rate": 4.14919010264327e-06, + "loss": 1.5754, + "step": 60340 + }, + { + "epoch": 1.755825115629636, + "grad_norm": 12.8125, + "learning_rate": 4.147250820970655e-06, + "loss": 1.5289, + "step": 60360 + }, + { + "epoch": 1.756406899962184, + "grad_norm": 10.6875, + "learning_rate": 4.14531153929804e-06, + "loss": 1.3856, + "step": 60380 + }, + { + "epoch": 1.756988684294732, + "grad_norm": 13.4375, + "learning_rate": 4.143372257625425e-06, + "loss": 1.5795, + "step": 60400 + }, + { + "epoch": 1.75757046862728, + "grad_norm": 13.8125, + "learning_rate": 4.14143297595281e-06, + "loss": 1.5497, + "step": 60420 + }, + { + "epoch": 1.7581522529598277, + "grad_norm": 11.0625, + "learning_rate": 4.139493694280195e-06, + "loss": 1.5081, + "step": 60440 + }, + { + "epoch": 1.7587340372923759, + "grad_norm": 13.25, + "learning_rate": 4.13755441260758e-06, + "loss": 1.5566, + "step": 60460 + }, + { + "epoch": 1.7593158216249236, + "grad_norm": 12.8125, + "learning_rate": 4.1356151309349654e-06, + "loss": 1.4932, + "step": 60480 + }, + { + "epoch": 1.7598976059574716, + "grad_norm": 13.4375, + "learning_rate": 4.1336758492623505e-06, + "loss": 1.5285, + "step": 60500 + }, + { + "epoch": 1.7604793902900195, + "grad_norm": 14.4375, + "learning_rate": 4.131736567589736e-06, + "loss": 1.5651, + "step": 60520 + }, + { + "epoch": 1.7610611746225673, + "grad_norm": 13.8125, + "learning_rate": 4.129797285917121e-06, + "loss": 1.5124, + "step": 60540 + }, + { + "epoch": 1.7616429589551155, + "grad_norm": 13.9375, + "learning_rate": 4.127858004244506e-06, + "loss": 1.4658, + "step": 60560 + }, + { + "epoch": 1.7622247432876632, + "grad_norm": 17.5, + "learning_rate": 4.125918722571891e-06, + "loss": 1.4134, + "step": 60580 + }, + { + "epoch": 1.7628065276202112, + "grad_norm": 12.375, + "learning_rate": 4.123979440899276e-06, + "loss": 1.5309, + "step": 60600 + }, + { + "epoch": 1.7633883119527591, + "grad_norm": 13.4375, + "learning_rate": 4.122040159226661e-06, + "loss": 1.529, + "step": 60620 + }, + { + "epoch": 1.7639700962853069, + "grad_norm": 12.6875, + "learning_rate": 4.120100877554046e-06, + "loss": 1.5403, + "step": 60640 + }, + { + "epoch": 1.764551880617855, + "grad_norm": 13.1875, + "learning_rate": 4.118161595881431e-06, + "loss": 1.4753, + "step": 60660 + }, + { + "epoch": 1.7651336649504028, + "grad_norm": 18.125, + "learning_rate": 4.116222314208816e-06, + "loss": 1.5695, + "step": 60680 + }, + { + "epoch": 1.7657154492829508, + "grad_norm": 13.25, + "learning_rate": 4.1142830325362015e-06, + "loss": 1.5104, + "step": 60700 + }, + { + "epoch": 1.7662972336154987, + "grad_norm": 12.6875, + "learning_rate": 4.112343750863587e-06, + "loss": 1.5038, + "step": 60720 + }, + { + "epoch": 1.7668790179480467, + "grad_norm": 12.875, + "learning_rate": 4.110404469190972e-06, + "loss": 1.4563, + "step": 60740 + }, + { + "epoch": 1.7674608022805947, + "grad_norm": 13.0, + "learning_rate": 4.108465187518357e-06, + "loss": 1.5103, + "step": 60760 + }, + { + "epoch": 1.7680425866131424, + "grad_norm": 12.4375, + "learning_rate": 4.106525905845742e-06, + "loss": 1.4324, + "step": 60780 + }, + { + "epoch": 1.7686243709456906, + "grad_norm": 14.0625, + "learning_rate": 4.104586624173127e-06, + "loss": 1.4796, + "step": 60800 + }, + { + "epoch": 1.7692061552782383, + "grad_norm": 13.3125, + "learning_rate": 4.102647342500512e-06, + "loss": 1.5313, + "step": 60820 + }, + { + "epoch": 1.7697879396107863, + "grad_norm": 13.0, + "learning_rate": 4.100708060827897e-06, + "loss": 1.5113, + "step": 60840 + }, + { + "epoch": 1.7703697239433343, + "grad_norm": 13.5, + "learning_rate": 4.098768779155282e-06, + "loss": 1.5719, + "step": 60860 + }, + { + "epoch": 1.770951508275882, + "grad_norm": 15.25, + "learning_rate": 4.096829497482667e-06, + "loss": 1.5592, + "step": 60880 + }, + { + "epoch": 1.7715332926084302, + "grad_norm": 14.5, + "learning_rate": 4.0948902158100525e-06, + "loss": 1.5779, + "step": 60900 + }, + { + "epoch": 1.772115076940978, + "grad_norm": 13.5625, + "learning_rate": 4.092950934137438e-06, + "loss": 1.5096, + "step": 60920 + }, + { + "epoch": 1.7726968612735259, + "grad_norm": 14.0625, + "learning_rate": 4.091011652464823e-06, + "loss": 1.5509, + "step": 60940 + }, + { + "epoch": 1.7732786456060738, + "grad_norm": 14.3125, + "learning_rate": 4.089072370792208e-06, + "loss": 1.5349, + "step": 60960 + }, + { + "epoch": 1.7738604299386218, + "grad_norm": 14.4375, + "learning_rate": 4.087133089119593e-06, + "loss": 1.5832, + "step": 60980 + }, + { + "epoch": 1.7744422142711698, + "grad_norm": 13.5, + "learning_rate": 4.085193807446978e-06, + "loss": 1.5272, + "step": 61000 + }, + { + "epoch": 1.7750239986037175, + "grad_norm": 11.375, + "learning_rate": 4.083254525774363e-06, + "loss": 1.4585, + "step": 61020 + }, + { + "epoch": 1.7756057829362655, + "grad_norm": 13.125, + "learning_rate": 4.081315244101748e-06, + "loss": 1.3838, + "step": 61040 + }, + { + "epoch": 1.7761875672688134, + "grad_norm": 12.6875, + "learning_rate": 4.079375962429133e-06, + "loss": 1.6207, + "step": 61060 + }, + { + "epoch": 1.7767693516013614, + "grad_norm": 13.6875, + "learning_rate": 4.077436680756518e-06, + "loss": 1.4856, + "step": 61080 + }, + { + "epoch": 1.7773511359339094, + "grad_norm": 13.25, + "learning_rate": 4.0754973990839035e-06, + "loss": 1.5682, + "step": 61100 + }, + { + "epoch": 1.777932920266457, + "grad_norm": 14.0625, + "learning_rate": 4.0735581174112886e-06, + "loss": 1.577, + "step": 61120 + }, + { + "epoch": 1.7785147045990053, + "grad_norm": 14.8125, + "learning_rate": 4.071618835738673e-06, + "loss": 1.3515, + "step": 61140 + }, + { + "epoch": 1.779096488931553, + "grad_norm": 13.125, + "learning_rate": 4.069679554066058e-06, + "loss": 1.5357, + "step": 61160 + }, + { + "epoch": 1.779678273264101, + "grad_norm": 12.0, + "learning_rate": 4.067740272393443e-06, + "loss": 1.4878, + "step": 61180 + }, + { + "epoch": 1.780260057596649, + "grad_norm": 13.9375, + "learning_rate": 4.065800990720828e-06, + "loss": 1.5168, + "step": 61200 + }, + { + "epoch": 1.7808418419291967, + "grad_norm": 13.5625, + "learning_rate": 4.063861709048213e-06, + "loss": 1.5326, + "step": 61220 + }, + { + "epoch": 1.7814236262617449, + "grad_norm": 14.25, + "learning_rate": 4.061922427375598e-06, + "loss": 1.4947, + "step": 61240 + }, + { + "epoch": 1.7820054105942926, + "grad_norm": 13.25, + "learning_rate": 4.059983145702983e-06, + "loss": 1.559, + "step": 61260 + }, + { + "epoch": 1.7825871949268406, + "grad_norm": 14.6875, + "learning_rate": 4.0580438640303685e-06, + "loss": 1.5615, + "step": 61280 + }, + { + "epoch": 1.7831689792593886, + "grad_norm": 13.6875, + "learning_rate": 4.056104582357754e-06, + "loss": 1.6296, + "step": 61300 + }, + { + "epoch": 1.7837507635919365, + "grad_norm": 15.0, + "learning_rate": 4.054165300685139e-06, + "loss": 1.4333, + "step": 61320 + }, + { + "epoch": 1.7843325479244845, + "grad_norm": 13.1875, + "learning_rate": 4.052226019012524e-06, + "loss": 1.6036, + "step": 61340 + }, + { + "epoch": 1.7849143322570322, + "grad_norm": 12.625, + "learning_rate": 4.050286737339909e-06, + "loss": 1.5461, + "step": 61360 + }, + { + "epoch": 1.7854961165895804, + "grad_norm": 14.25, + "learning_rate": 4.048347455667294e-06, + "loss": 1.5809, + "step": 61380 + }, + { + "epoch": 1.7860779009221281, + "grad_norm": 12.75, + "learning_rate": 4.046408173994679e-06, + "loss": 1.523, + "step": 61400 + }, + { + "epoch": 1.786659685254676, + "grad_norm": 12.625, + "learning_rate": 4.044468892322064e-06, + "loss": 1.4993, + "step": 61420 + }, + { + "epoch": 1.787241469587224, + "grad_norm": 11.375, + "learning_rate": 4.042529610649449e-06, + "loss": 1.4793, + "step": 61440 + }, + { + "epoch": 1.7878232539197718, + "grad_norm": 13.625, + "learning_rate": 4.040590328976834e-06, + "loss": 1.5591, + "step": 61460 + }, + { + "epoch": 1.78840503825232, + "grad_norm": 14.6875, + "learning_rate": 4.0386510473042195e-06, + "loss": 1.6111, + "step": 61480 + }, + { + "epoch": 1.7889868225848677, + "grad_norm": 17.625, + "learning_rate": 4.0367117656316046e-06, + "loss": 1.503, + "step": 61500 + }, + { + "epoch": 1.7895686069174157, + "grad_norm": 15.625, + "learning_rate": 4.03477248395899e-06, + "loss": 1.5823, + "step": 61520 + }, + { + "epoch": 1.7901503912499637, + "grad_norm": 14.375, + "learning_rate": 4.032833202286375e-06, + "loss": 1.5218, + "step": 61540 + }, + { + "epoch": 1.7907321755825114, + "grad_norm": 12.875, + "learning_rate": 4.03089392061376e-06, + "loss": 1.4664, + "step": 61560 + }, + { + "epoch": 1.7913139599150596, + "grad_norm": 16.375, + "learning_rate": 4.028954638941145e-06, + "loss": 1.4916, + "step": 61580 + }, + { + "epoch": 1.7918957442476073, + "grad_norm": 14.75, + "learning_rate": 4.02701535726853e-06, + "loss": 1.5355, + "step": 61600 + }, + { + "epoch": 1.7924775285801553, + "grad_norm": 15.1875, + "learning_rate": 4.025076075595915e-06, + "loss": 1.5685, + "step": 61620 + }, + { + "epoch": 1.7930593129127033, + "grad_norm": 14.875, + "learning_rate": 4.0231367939233e-06, + "loss": 1.6289, + "step": 61640 + }, + { + "epoch": 1.7936410972452512, + "grad_norm": 12.8125, + "learning_rate": 4.021197512250685e-06, + "loss": 1.4561, + "step": 61660 + }, + { + "epoch": 1.7942228815777992, + "grad_norm": 12.0625, + "learning_rate": 4.0192582305780704e-06, + "loss": 1.4562, + "step": 61680 + }, + { + "epoch": 1.794804665910347, + "grad_norm": 13.5, + "learning_rate": 4.0173189489054555e-06, + "loss": 1.4413, + "step": 61700 + }, + { + "epoch": 1.7953864502428951, + "grad_norm": 15.25, + "learning_rate": 4.015379667232841e-06, + "loss": 1.4938, + "step": 61720 + }, + { + "epoch": 1.7959682345754429, + "grad_norm": 11.1875, + "learning_rate": 4.013440385560226e-06, + "loss": 1.5027, + "step": 61740 + }, + { + "epoch": 1.7965500189079908, + "grad_norm": 13.875, + "learning_rate": 4.011501103887611e-06, + "loss": 1.5781, + "step": 61760 + }, + { + "epoch": 1.7971318032405388, + "grad_norm": 13.8125, + "learning_rate": 4.009561822214996e-06, + "loss": 1.4809, + "step": 61780 + }, + { + "epoch": 1.7977135875730865, + "grad_norm": 12.1875, + "learning_rate": 4.007622540542381e-06, + "loss": 1.5166, + "step": 61800 + }, + { + "epoch": 1.7982953719056347, + "grad_norm": 7.1875, + "learning_rate": 4.005683258869766e-06, + "loss": 1.4717, + "step": 61820 + }, + { + "epoch": 1.7988771562381825, + "grad_norm": 11.0, + "learning_rate": 4.003743977197151e-06, + "loss": 1.5472, + "step": 61840 + }, + { + "epoch": 1.7994589405707304, + "grad_norm": 13.25, + "learning_rate": 4.001804695524536e-06, + "loss": 1.4978, + "step": 61860 + }, + { + "epoch": 1.8000407249032784, + "grad_norm": 11.0, + "learning_rate": 3.999865413851921e-06, + "loss": 1.505, + "step": 61880 + }, + { + "epoch": 1.8006225092358261, + "grad_norm": 17.0, + "learning_rate": 3.9979261321793065e-06, + "loss": 1.5679, + "step": 61900 + }, + { + "epoch": 1.8012042935683743, + "grad_norm": 11.375, + "learning_rate": 3.995986850506692e-06, + "loss": 1.4906, + "step": 61920 + }, + { + "epoch": 1.801786077900922, + "grad_norm": 12.6875, + "learning_rate": 3.994047568834077e-06, + "loss": 1.4992, + "step": 61940 + }, + { + "epoch": 1.80236786223347, + "grad_norm": 10.8125, + "learning_rate": 3.992108287161461e-06, + "loss": 1.5971, + "step": 61960 + }, + { + "epoch": 1.802949646566018, + "grad_norm": 12.875, + "learning_rate": 3.990169005488846e-06, + "loss": 1.4525, + "step": 61980 + }, + { + "epoch": 1.803531430898566, + "grad_norm": 12.3125, + "learning_rate": 3.988229723816231e-06, + "loss": 1.5309, + "step": 62000 + }, + { + "epoch": 1.804113215231114, + "grad_norm": 12.8125, + "learning_rate": 3.986290442143616e-06, + "loss": 1.5509, + "step": 62020 + }, + { + "epoch": 1.8046949995636616, + "grad_norm": 11.0, + "learning_rate": 3.984351160471001e-06, + "loss": 1.5824, + "step": 62040 + }, + { + "epoch": 1.8052767838962098, + "grad_norm": 14.0, + "learning_rate": 3.9824118787983864e-06, + "loss": 1.5651, + "step": 62060 + }, + { + "epoch": 1.8058585682287576, + "grad_norm": 12.1875, + "learning_rate": 3.9804725971257715e-06, + "loss": 1.5661, + "step": 62080 + }, + { + "epoch": 1.8064403525613055, + "grad_norm": 11.3125, + "learning_rate": 3.978533315453157e-06, + "loss": 1.5185, + "step": 62100 + }, + { + "epoch": 1.8070221368938535, + "grad_norm": 12.0, + "learning_rate": 3.976594033780542e-06, + "loss": 1.6406, + "step": 62120 + }, + { + "epoch": 1.8076039212264012, + "grad_norm": 13.25, + "learning_rate": 3.974654752107927e-06, + "loss": 1.5302, + "step": 62140 + }, + { + "epoch": 1.8081857055589494, + "grad_norm": 13.5625, + "learning_rate": 3.972715470435312e-06, + "loss": 1.5314, + "step": 62160 + }, + { + "epoch": 1.8087674898914972, + "grad_norm": 14.3125, + "learning_rate": 3.970776188762697e-06, + "loss": 1.6031, + "step": 62180 + }, + { + "epoch": 1.8093492742240451, + "grad_norm": 12.0, + "learning_rate": 3.968836907090082e-06, + "loss": 1.4817, + "step": 62200 + }, + { + "epoch": 1.809931058556593, + "grad_norm": 13.8125, + "learning_rate": 3.966897625417467e-06, + "loss": 1.585, + "step": 62220 + }, + { + "epoch": 1.810512842889141, + "grad_norm": 13.75, + "learning_rate": 3.9649583437448515e-06, + "loss": 1.513, + "step": 62240 + }, + { + "epoch": 1.811094627221689, + "grad_norm": 10.8125, + "learning_rate": 3.9630190620722366e-06, + "loss": 1.5597, + "step": 62260 + }, + { + "epoch": 1.8116764115542368, + "grad_norm": 11.375, + "learning_rate": 3.961079780399622e-06, + "loss": 1.5187, + "step": 62280 + }, + { + "epoch": 1.8122581958867847, + "grad_norm": 13.6875, + "learning_rate": 3.959140498727007e-06, + "loss": 1.4977, + "step": 62300 + }, + { + "epoch": 1.8128399802193327, + "grad_norm": 11.125, + "learning_rate": 3.957201217054392e-06, + "loss": 1.5414, + "step": 62320 + }, + { + "epoch": 1.8134217645518806, + "grad_norm": 13.0, + "learning_rate": 3.955261935381777e-06, + "loss": 1.5543, + "step": 62340 + }, + { + "epoch": 1.8140035488844286, + "grad_norm": 11.9375, + "learning_rate": 3.953322653709162e-06, + "loss": 1.4898, + "step": 62360 + }, + { + "epoch": 1.8145853332169763, + "grad_norm": 12.9375, + "learning_rate": 3.951383372036547e-06, + "loss": 1.4591, + "step": 62380 + }, + { + "epoch": 1.8151671175495245, + "grad_norm": 12.75, + "learning_rate": 3.949444090363932e-06, + "loss": 1.5138, + "step": 62400 + }, + { + "epoch": 1.8157489018820723, + "grad_norm": 14.0, + "learning_rate": 3.947504808691317e-06, + "loss": 1.5751, + "step": 62420 + }, + { + "epoch": 1.8163306862146202, + "grad_norm": 11.125, + "learning_rate": 3.9455655270187024e-06, + "loss": 1.4689, + "step": 62440 + }, + { + "epoch": 1.8169124705471682, + "grad_norm": 12.8125, + "learning_rate": 3.9436262453460875e-06, + "loss": 1.5339, + "step": 62460 + }, + { + "epoch": 1.817494254879716, + "grad_norm": 10.0, + "learning_rate": 3.941686963673473e-06, + "loss": 1.5708, + "step": 62480 + }, + { + "epoch": 1.8180760392122641, + "grad_norm": 12.6875, + "learning_rate": 3.939747682000858e-06, + "loss": 1.5428, + "step": 62500 + }, + { + "epoch": 1.8186578235448119, + "grad_norm": 13.0625, + "learning_rate": 3.937808400328243e-06, + "loss": 1.4831, + "step": 62520 + }, + { + "epoch": 1.8192396078773598, + "grad_norm": 13.3125, + "learning_rate": 3.935869118655628e-06, + "loss": 1.5751, + "step": 62540 + }, + { + "epoch": 1.8198213922099078, + "grad_norm": 13.0625, + "learning_rate": 3.933929836983013e-06, + "loss": 1.4526, + "step": 62560 + }, + { + "epoch": 1.8204031765424558, + "grad_norm": 13.25, + "learning_rate": 3.931990555310398e-06, + "loss": 1.4694, + "step": 62580 + }, + { + "epoch": 1.8209849608750037, + "grad_norm": 12.9375, + "learning_rate": 3.930051273637783e-06, + "loss": 1.5837, + "step": 62600 + }, + { + "epoch": 1.8215667452075515, + "grad_norm": 13.6875, + "learning_rate": 3.928111991965168e-06, + "loss": 1.5205, + "step": 62620 + }, + { + "epoch": 1.8221485295400996, + "grad_norm": 13.3125, + "learning_rate": 3.926172710292553e-06, + "loss": 1.4634, + "step": 62640 + }, + { + "epoch": 1.8227303138726474, + "grad_norm": 11.6875, + "learning_rate": 3.9242334286199385e-06, + "loss": 1.593, + "step": 62660 + }, + { + "epoch": 1.8233120982051954, + "grad_norm": 13.9375, + "learning_rate": 3.922294146947324e-06, + "loss": 1.4876, + "step": 62680 + }, + { + "epoch": 1.8238938825377433, + "grad_norm": 15.5625, + "learning_rate": 3.920354865274709e-06, + "loss": 1.5392, + "step": 62700 + }, + { + "epoch": 1.824475666870291, + "grad_norm": 9.25, + "learning_rate": 3.918415583602094e-06, + "loss": 1.5972, + "step": 62720 + }, + { + "epoch": 1.8250574512028392, + "grad_norm": 12.8125, + "learning_rate": 3.916476301929479e-06, + "loss": 1.4216, + "step": 62740 + }, + { + "epoch": 1.825639235535387, + "grad_norm": 14.3125, + "learning_rate": 3.914537020256864e-06, + "loss": 1.5259, + "step": 62760 + }, + { + "epoch": 1.826221019867935, + "grad_norm": 16.0, + "learning_rate": 3.912597738584249e-06, + "loss": 1.5084, + "step": 62780 + }, + { + "epoch": 1.826802804200483, + "grad_norm": 12.6875, + "learning_rate": 3.910658456911634e-06, + "loss": 1.583, + "step": 62800 + }, + { + "epoch": 1.8273845885330307, + "grad_norm": 13.375, + "learning_rate": 3.908719175239019e-06, + "loss": 1.5648, + "step": 62820 + }, + { + "epoch": 1.8279663728655788, + "grad_norm": 13.875, + "learning_rate": 3.906779893566404e-06, + "loss": 1.5481, + "step": 62840 + }, + { + "epoch": 1.8285481571981266, + "grad_norm": 13.3125, + "learning_rate": 3.9048406118937895e-06, + "loss": 1.6228, + "step": 62860 + }, + { + "epoch": 1.8291299415306745, + "grad_norm": 15.625, + "learning_rate": 3.902901330221175e-06, + "loss": 1.4922, + "step": 62880 + }, + { + "epoch": 1.8297117258632225, + "grad_norm": 15.75, + "learning_rate": 3.90096204854856e-06, + "loss": 1.5099, + "step": 62900 + }, + { + "epoch": 1.8302935101957705, + "grad_norm": 11.6875, + "learning_rate": 3.899022766875945e-06, + "loss": 1.5234, + "step": 62920 + }, + { + "epoch": 1.8308752945283184, + "grad_norm": 13.5625, + "learning_rate": 3.89708348520333e-06, + "loss": 1.5024, + "step": 62940 + }, + { + "epoch": 1.8314570788608662, + "grad_norm": 13.5625, + "learning_rate": 3.895144203530715e-06, + "loss": 1.5681, + "step": 62960 + }, + { + "epoch": 1.8320388631934144, + "grad_norm": 10.6875, + "learning_rate": 3.8932049218581e-06, + "loss": 1.5527, + "step": 62980 + }, + { + "epoch": 1.832620647525962, + "grad_norm": 13.6875, + "learning_rate": 3.891265640185485e-06, + "loss": 1.4973, + "step": 63000 + }, + { + "epoch": 1.83320243185851, + "grad_norm": 13.9375, + "learning_rate": 3.88932635851287e-06, + "loss": 1.5763, + "step": 63020 + }, + { + "epoch": 1.833784216191058, + "grad_norm": 12.3125, + "learning_rate": 3.887387076840255e-06, + "loss": 1.5511, + "step": 63040 + }, + { + "epoch": 1.8343660005236058, + "grad_norm": 14.0625, + "learning_rate": 3.88544779516764e-06, + "loss": 1.5727, + "step": 63060 + }, + { + "epoch": 1.834947784856154, + "grad_norm": 21.375, + "learning_rate": 3.883508513495025e-06, + "loss": 1.4631, + "step": 63080 + }, + { + "epoch": 1.8355295691887017, + "grad_norm": 15.0, + "learning_rate": 3.88156923182241e-06, + "loss": 1.5165, + "step": 63100 + }, + { + "epoch": 1.8361113535212497, + "grad_norm": 12.9375, + "learning_rate": 3.879629950149795e-06, + "loss": 1.5668, + "step": 63120 + }, + { + "epoch": 1.8366931378537976, + "grad_norm": 17.125, + "learning_rate": 3.87769066847718e-06, + "loss": 1.5744, + "step": 63140 + }, + { + "epoch": 1.8372749221863454, + "grad_norm": 13.9375, + "learning_rate": 3.875751386804565e-06, + "loss": 1.5037, + "step": 63160 + }, + { + "epoch": 1.8378567065188935, + "grad_norm": 14.0625, + "learning_rate": 3.87381210513195e-06, + "loss": 1.5699, + "step": 63180 + }, + { + "epoch": 1.8384384908514413, + "grad_norm": 15.875, + "learning_rate": 3.871872823459335e-06, + "loss": 1.5373, + "step": 63200 + }, + { + "epoch": 1.8390202751839893, + "grad_norm": 11.25, + "learning_rate": 3.86993354178672e-06, + "loss": 1.5129, + "step": 63220 + }, + { + "epoch": 1.8396020595165372, + "grad_norm": 14.5625, + "learning_rate": 3.8679942601141055e-06, + "loss": 1.499, + "step": 63240 + }, + { + "epoch": 1.8401838438490852, + "grad_norm": 13.625, + "learning_rate": 3.866054978441491e-06, + "loss": 1.5906, + "step": 63260 + }, + { + "epoch": 1.8407656281816331, + "grad_norm": 12.75, + "learning_rate": 3.864115696768876e-06, + "loss": 1.466, + "step": 63280 + }, + { + "epoch": 1.8413474125141809, + "grad_norm": 12.25, + "learning_rate": 3.862176415096261e-06, + "loss": 1.5761, + "step": 63300 + }, + { + "epoch": 1.841929196846729, + "grad_norm": 13.75, + "learning_rate": 3.860237133423646e-06, + "loss": 1.4375, + "step": 63320 + }, + { + "epoch": 1.8425109811792768, + "grad_norm": 11.5, + "learning_rate": 3.858297851751031e-06, + "loss": 1.4889, + "step": 63340 + }, + { + "epoch": 1.8430927655118248, + "grad_norm": 14.125, + "learning_rate": 3.856358570078416e-06, + "loss": 1.5614, + "step": 63360 + }, + { + "epoch": 1.8436745498443727, + "grad_norm": 13.6875, + "learning_rate": 3.854419288405801e-06, + "loss": 1.5795, + "step": 63380 + }, + { + "epoch": 1.8442563341769205, + "grad_norm": 16.375, + "learning_rate": 3.852480006733186e-06, + "loss": 1.5741, + "step": 63400 + }, + { + "epoch": 1.8448381185094687, + "grad_norm": 11.4375, + "learning_rate": 3.850540725060571e-06, + "loss": 1.5355, + "step": 63420 + }, + { + "epoch": 1.8454199028420164, + "grad_norm": 12.3125, + "learning_rate": 3.8486014433879565e-06, + "loss": 1.4689, + "step": 63440 + }, + { + "epoch": 1.8460016871745644, + "grad_norm": 12.3125, + "learning_rate": 3.8466621617153416e-06, + "loss": 1.5845, + "step": 63460 + }, + { + "epoch": 1.8465834715071123, + "grad_norm": 13.0625, + "learning_rate": 3.844722880042727e-06, + "loss": 1.5011, + "step": 63480 + }, + { + "epoch": 1.8471652558396603, + "grad_norm": 14.6875, + "learning_rate": 3.842783598370112e-06, + "loss": 1.5878, + "step": 63500 + }, + { + "epoch": 1.8477470401722083, + "grad_norm": 11.8125, + "learning_rate": 3.840844316697497e-06, + "loss": 1.536, + "step": 63520 + }, + { + "epoch": 1.848328824504756, + "grad_norm": 13.625, + "learning_rate": 3.838905035024882e-06, + "loss": 1.619, + "step": 63540 + }, + { + "epoch": 1.848910608837304, + "grad_norm": 14.8125, + "learning_rate": 3.836965753352267e-06, + "loss": 1.5491, + "step": 63560 + }, + { + "epoch": 1.849492393169852, + "grad_norm": 14.6875, + "learning_rate": 3.835026471679652e-06, + "loss": 1.4322, + "step": 63580 + }, + { + "epoch": 1.8500741775023999, + "grad_norm": 10.125, + "learning_rate": 3.833087190007037e-06, + "loss": 1.5235, + "step": 63600 + }, + { + "epoch": 1.8506559618349478, + "grad_norm": 10.375, + "learning_rate": 3.831147908334422e-06, + "loss": 1.4756, + "step": 63620 + }, + { + "epoch": 1.8512377461674956, + "grad_norm": 14.1875, + "learning_rate": 3.8292086266618074e-06, + "loss": 1.5428, + "step": 63640 + }, + { + "epoch": 1.8518195305000438, + "grad_norm": 12.8125, + "learning_rate": 3.8272693449891925e-06, + "loss": 1.4258, + "step": 63660 + }, + { + "epoch": 1.8524013148325915, + "grad_norm": 14.4375, + "learning_rate": 3.825330063316578e-06, + "loss": 1.5605, + "step": 63680 + }, + { + "epoch": 1.8529830991651395, + "grad_norm": 11.9375, + "learning_rate": 3.823390781643963e-06, + "loss": 1.6128, + "step": 63700 + }, + { + "epoch": 1.8535648834976874, + "grad_norm": 14.125, + "learning_rate": 3.821451499971348e-06, + "loss": 1.4939, + "step": 63720 + }, + { + "epoch": 1.8541466678302352, + "grad_norm": 14.125, + "learning_rate": 3.819512218298733e-06, + "loss": 1.5789, + "step": 63740 + }, + { + "epoch": 1.8547284521627834, + "grad_norm": 13.6875, + "learning_rate": 3.817572936626118e-06, + "loss": 1.5197, + "step": 63760 + }, + { + "epoch": 1.855310236495331, + "grad_norm": 13.9375, + "learning_rate": 3.815633654953503e-06, + "loss": 1.5028, + "step": 63780 + }, + { + "epoch": 1.855892020827879, + "grad_norm": 11.125, + "learning_rate": 3.813694373280888e-06, + "loss": 1.5733, + "step": 63800 + }, + { + "epoch": 1.856473805160427, + "grad_norm": 13.375, + "learning_rate": 3.811755091608273e-06, + "loss": 1.4623, + "step": 63820 + }, + { + "epoch": 1.857055589492975, + "grad_norm": 11.75, + "learning_rate": 3.809815809935658e-06, + "loss": 1.5711, + "step": 63840 + }, + { + "epoch": 1.857637373825523, + "grad_norm": 13.625, + "learning_rate": 3.807876528263043e-06, + "loss": 1.5969, + "step": 63860 + }, + { + "epoch": 1.8582191581580707, + "grad_norm": 13.0625, + "learning_rate": 3.805937246590428e-06, + "loss": 1.593, + "step": 63880 + }, + { + "epoch": 1.858800942490619, + "grad_norm": 14.375, + "learning_rate": 3.8039979649178133e-06, + "loss": 1.5419, + "step": 63900 + }, + { + "epoch": 1.8593827268231666, + "grad_norm": 12.375, + "learning_rate": 3.8020586832451984e-06, + "loss": 1.5262, + "step": 63920 + }, + { + "epoch": 1.8599645111557146, + "grad_norm": 14.125, + "learning_rate": 3.8001194015725835e-06, + "loss": 1.5379, + "step": 63940 + }, + { + "epoch": 1.8605462954882626, + "grad_norm": 14.0, + "learning_rate": 3.7981801198999686e-06, + "loss": 1.4929, + "step": 63960 + }, + { + "epoch": 1.8611280798208103, + "grad_norm": 12.5, + "learning_rate": 3.7962408382273537e-06, + "loss": 1.5153, + "step": 63980 + }, + { + "epoch": 1.8617098641533585, + "grad_norm": 14.3125, + "learning_rate": 3.7943015565547388e-06, + "loss": 1.4638, + "step": 64000 + }, + { + "epoch": 1.8622916484859062, + "grad_norm": 13.1875, + "learning_rate": 3.792362274882124e-06, + "loss": 1.5608, + "step": 64020 + }, + { + "epoch": 1.8628734328184542, + "grad_norm": 14.0, + "learning_rate": 3.790422993209509e-06, + "loss": 1.5902, + "step": 64040 + }, + { + "epoch": 1.8634552171510022, + "grad_norm": 13.125, + "learning_rate": 3.788483711536894e-06, + "loss": 1.4654, + "step": 64060 + }, + { + "epoch": 1.86403700148355, + "grad_norm": 13.625, + "learning_rate": 3.7865444298642783e-06, + "loss": 1.5492, + "step": 64080 + }, + { + "epoch": 1.864618785816098, + "grad_norm": 13.5625, + "learning_rate": 3.7846051481916634e-06, + "loss": 1.5703, + "step": 64100 + }, + { + "epoch": 1.8652005701486458, + "grad_norm": 12.4375, + "learning_rate": 3.7826658665190485e-06, + "loss": 1.4844, + "step": 64120 + }, + { + "epoch": 1.8657823544811938, + "grad_norm": 13.75, + "learning_rate": 3.7807265848464336e-06, + "loss": 1.5887, + "step": 64140 + }, + { + "epoch": 1.8663641388137417, + "grad_norm": 15.25, + "learning_rate": 3.7787873031738187e-06, + "loss": 1.5731, + "step": 64160 + }, + { + "epoch": 1.8669459231462897, + "grad_norm": 15.875, + "learning_rate": 3.776848021501204e-06, + "loss": 1.5551, + "step": 64180 + }, + { + "epoch": 1.8675277074788377, + "grad_norm": 14.125, + "learning_rate": 3.774908739828589e-06, + "loss": 1.5833, + "step": 64200 + }, + { + "epoch": 1.8681094918113854, + "grad_norm": 13.9375, + "learning_rate": 3.772969458155974e-06, + "loss": 1.5592, + "step": 64220 + }, + { + "epoch": 1.8686912761439336, + "grad_norm": 11.1875, + "learning_rate": 3.771030176483359e-06, + "loss": 1.4486, + "step": 64240 + }, + { + "epoch": 1.8692730604764813, + "grad_norm": 12.4375, + "learning_rate": 3.769090894810744e-06, + "loss": 1.5575, + "step": 64260 + }, + { + "epoch": 1.8698548448090293, + "grad_norm": 14.0625, + "learning_rate": 3.7671516131381293e-06, + "loss": 1.4613, + "step": 64280 + }, + { + "epoch": 1.8704366291415773, + "grad_norm": 13.375, + "learning_rate": 3.7652123314655144e-06, + "loss": 1.5542, + "step": 64300 + }, + { + "epoch": 1.871018413474125, + "grad_norm": 14.4375, + "learning_rate": 3.7632730497928995e-06, + "loss": 1.449, + "step": 64320 + }, + { + "epoch": 1.8716001978066732, + "grad_norm": 14.25, + "learning_rate": 3.7613337681202846e-06, + "loss": 1.5446, + "step": 64340 + }, + { + "epoch": 1.872181982139221, + "grad_norm": 12.0625, + "learning_rate": 3.7593944864476693e-06, + "loss": 1.5131, + "step": 64360 + }, + { + "epoch": 1.872763766471769, + "grad_norm": 13.375, + "learning_rate": 3.7574552047750544e-06, + "loss": 1.4918, + "step": 64380 + }, + { + "epoch": 1.8733455508043169, + "grad_norm": 14.25, + "learning_rate": 3.7555159231024395e-06, + "loss": 1.5377, + "step": 64400 + }, + { + "epoch": 1.8739273351368646, + "grad_norm": 14.125, + "learning_rate": 3.7535766414298246e-06, + "loss": 1.5263, + "step": 64420 + }, + { + "epoch": 1.8745091194694128, + "grad_norm": 12.75, + "learning_rate": 3.7516373597572097e-06, + "loss": 1.5512, + "step": 64440 + }, + { + "epoch": 1.8750909038019605, + "grad_norm": 12.5625, + "learning_rate": 3.7496980780845948e-06, + "loss": 1.5967, + "step": 64460 + }, + { + "epoch": 1.8756726881345085, + "grad_norm": 12.875, + "learning_rate": 3.74775879641198e-06, + "loss": 1.5346, + "step": 64480 + }, + { + "epoch": 1.8762544724670565, + "grad_norm": 12.1875, + "learning_rate": 3.745819514739365e-06, + "loss": 1.5279, + "step": 64500 + }, + { + "epoch": 1.8768362567996044, + "grad_norm": 14.3125, + "learning_rate": 3.74388023306675e-06, + "loss": 1.5636, + "step": 64520 + }, + { + "epoch": 1.8774180411321524, + "grad_norm": 13.625, + "learning_rate": 3.741940951394135e-06, + "loss": 1.5338, + "step": 64540 + }, + { + "epoch": 1.8779998254647001, + "grad_norm": 12.1875, + "learning_rate": 3.7400016697215202e-06, + "loss": 1.5769, + "step": 64560 + }, + { + "epoch": 1.8785816097972483, + "grad_norm": 13.5, + "learning_rate": 3.7380623880489053e-06, + "loss": 1.5317, + "step": 64580 + }, + { + "epoch": 1.879163394129796, + "grad_norm": 14.0625, + "learning_rate": 3.7361231063762904e-06, + "loss": 1.4931, + "step": 64600 + }, + { + "epoch": 1.879745178462344, + "grad_norm": 12.5625, + "learning_rate": 3.7341838247036755e-06, + "loss": 1.5744, + "step": 64620 + }, + { + "epoch": 1.880326962794892, + "grad_norm": 11.1875, + "learning_rate": 3.7322445430310606e-06, + "loss": 1.5326, + "step": 64640 + }, + { + "epoch": 1.8809087471274397, + "grad_norm": 12.5, + "learning_rate": 3.7303052613584457e-06, + "loss": 1.5492, + "step": 64660 + }, + { + "epoch": 1.881490531459988, + "grad_norm": 9.9375, + "learning_rate": 3.728365979685831e-06, + "loss": 1.4694, + "step": 64680 + }, + { + "epoch": 1.8820723157925356, + "grad_norm": 14.0, + "learning_rate": 3.726426698013216e-06, + "loss": 1.541, + "step": 64700 + }, + { + "epoch": 1.8826541001250836, + "grad_norm": 13.9375, + "learning_rate": 3.724487416340601e-06, + "loss": 1.5134, + "step": 64720 + }, + { + "epoch": 1.8832358844576316, + "grad_norm": 15.1875, + "learning_rate": 3.722548134667986e-06, + "loss": 1.5849, + "step": 64740 + }, + { + "epoch": 1.8838176687901795, + "grad_norm": 14.0, + "learning_rate": 3.720608852995371e-06, + "loss": 1.4249, + "step": 64760 + }, + { + "epoch": 1.8843994531227275, + "grad_norm": 13.0, + "learning_rate": 3.718669571322756e-06, + "loss": 1.5384, + "step": 64780 + }, + { + "epoch": 1.8849812374552752, + "grad_norm": 13.6875, + "learning_rate": 3.716730289650141e-06, + "loss": 1.5399, + "step": 64800 + }, + { + "epoch": 1.8855630217878232, + "grad_norm": 13.0, + "learning_rate": 3.714791007977526e-06, + "loss": 1.5114, + "step": 64820 + }, + { + "epoch": 1.8861448061203712, + "grad_norm": 11.4375, + "learning_rate": 3.712851726304911e-06, + "loss": 1.522, + "step": 64840 + }, + { + "epoch": 1.8867265904529191, + "grad_norm": 16.125, + "learning_rate": 3.7109124446322963e-06, + "loss": 1.4984, + "step": 64860 + }, + { + "epoch": 1.887308374785467, + "grad_norm": 14.5625, + "learning_rate": 3.7089731629596814e-06, + "loss": 1.4274, + "step": 64880 + }, + { + "epoch": 1.8878901591180148, + "grad_norm": 11.8125, + "learning_rate": 3.7070338812870665e-06, + "loss": 1.556, + "step": 64900 + }, + { + "epoch": 1.888471943450563, + "grad_norm": 13.0625, + "learning_rate": 3.7050945996144516e-06, + "loss": 1.5648, + "step": 64920 + }, + { + "epoch": 1.8890537277831108, + "grad_norm": 15.25, + "learning_rate": 3.7031553179418367e-06, + "loss": 1.6345, + "step": 64940 + }, + { + "epoch": 1.8896355121156587, + "grad_norm": 13.25, + "learning_rate": 3.7012160362692218e-06, + "loss": 1.5228, + "step": 64960 + }, + { + "epoch": 1.8902172964482067, + "grad_norm": 14.4375, + "learning_rate": 3.699276754596607e-06, + "loss": 1.4963, + "step": 64980 + }, + { + "epoch": 1.8907990807807544, + "grad_norm": 13.6875, + "learning_rate": 3.697337472923992e-06, + "loss": 1.5216, + "step": 65000 + }, + { + "epoch": 1.8913808651133026, + "grad_norm": 14.5625, + "learning_rate": 3.695398191251377e-06, + "loss": 1.4629, + "step": 65020 + }, + { + "epoch": 1.8919626494458504, + "grad_norm": 14.8125, + "learning_rate": 3.693458909578762e-06, + "loss": 1.4844, + "step": 65040 + }, + { + "epoch": 1.8925444337783983, + "grad_norm": 11.75, + "learning_rate": 3.6915196279061473e-06, + "loss": 1.6662, + "step": 65060 + }, + { + "epoch": 1.8931262181109463, + "grad_norm": 16.75, + "learning_rate": 3.6895803462335323e-06, + "loss": 1.5558, + "step": 65080 + }, + { + "epoch": 1.8937080024434942, + "grad_norm": 14.8125, + "learning_rate": 3.6876410645609174e-06, + "loss": 1.5334, + "step": 65100 + }, + { + "epoch": 1.8942897867760422, + "grad_norm": 13.0, + "learning_rate": 3.6857017828883025e-06, + "loss": 1.5759, + "step": 65120 + }, + { + "epoch": 1.89487157110859, + "grad_norm": 10.4375, + "learning_rate": 3.6837625012156876e-06, + "loss": 1.5059, + "step": 65140 + }, + { + "epoch": 1.8954533554411381, + "grad_norm": 11.8125, + "learning_rate": 3.6818232195430727e-06, + "loss": 1.5883, + "step": 65160 + }, + { + "epoch": 1.8960351397736859, + "grad_norm": 12.375, + "learning_rate": 3.6798839378704574e-06, + "loss": 1.5496, + "step": 65180 + }, + { + "epoch": 1.8966169241062338, + "grad_norm": 13.125, + "learning_rate": 3.6779446561978425e-06, + "loss": 1.4951, + "step": 65200 + }, + { + "epoch": 1.8971987084387818, + "grad_norm": 13.9375, + "learning_rate": 3.6760053745252276e-06, + "loss": 1.4773, + "step": 65220 + }, + { + "epoch": 1.8977804927713295, + "grad_norm": 14.25, + "learning_rate": 3.6740660928526127e-06, + "loss": 1.5271, + "step": 65240 + }, + { + "epoch": 1.8983622771038777, + "grad_norm": 12.6875, + "learning_rate": 3.672126811179998e-06, + "loss": 1.5227, + "step": 65260 + }, + { + "epoch": 1.8989440614364255, + "grad_norm": 16.5, + "learning_rate": 3.670187529507383e-06, + "loss": 1.567, + "step": 65280 + }, + { + "epoch": 1.8995258457689734, + "grad_norm": 14.625, + "learning_rate": 3.668248247834768e-06, + "loss": 1.5158, + "step": 65300 + }, + { + "epoch": 1.9001076301015214, + "grad_norm": 13.4375, + "learning_rate": 3.666308966162153e-06, + "loss": 1.5731, + "step": 65320 + }, + { + "epoch": 1.9006894144340691, + "grad_norm": 12.625, + "learning_rate": 3.664369684489538e-06, + "loss": 1.633, + "step": 65340 + }, + { + "epoch": 1.9012711987666173, + "grad_norm": 14.0625, + "learning_rate": 3.6624304028169233e-06, + "loss": 1.466, + "step": 65360 + }, + { + "epoch": 1.901852983099165, + "grad_norm": 13.6875, + "learning_rate": 3.6604911211443084e-06, + "loss": 1.5367, + "step": 65380 + }, + { + "epoch": 1.902434767431713, + "grad_norm": 11.125, + "learning_rate": 3.6585518394716935e-06, + "loss": 1.5736, + "step": 65400 + }, + { + "epoch": 1.903016551764261, + "grad_norm": 17.75, + "learning_rate": 3.6566125577990786e-06, + "loss": 1.5669, + "step": 65420 + }, + { + "epoch": 1.903598336096809, + "grad_norm": 14.6875, + "learning_rate": 3.6546732761264637e-06, + "loss": 1.5658, + "step": 65440 + }, + { + "epoch": 1.904180120429357, + "grad_norm": 12.6875, + "learning_rate": 3.6527339944538488e-06, + "loss": 1.5102, + "step": 65460 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 12.0, + "learning_rate": 3.650794712781234e-06, + "loss": 1.4242, + "step": 65480 + }, + { + "epoch": 1.9053436890944528, + "grad_norm": 14.8125, + "learning_rate": 3.648855431108619e-06, + "loss": 1.58, + "step": 65500 + }, + { + "epoch": 1.9059254734270006, + "grad_norm": 15.875, + "learning_rate": 3.646916149436004e-06, + "loss": 1.4414, + "step": 65520 + }, + { + "epoch": 1.9065072577595485, + "grad_norm": 12.1875, + "learning_rate": 3.644976867763389e-06, + "loss": 1.5314, + "step": 65540 + }, + { + "epoch": 1.9070890420920965, + "grad_norm": 14.625, + "learning_rate": 3.6430375860907743e-06, + "loss": 1.5462, + "step": 65560 + }, + { + "epoch": 1.9076708264246443, + "grad_norm": 12.75, + "learning_rate": 3.641098304418159e-06, + "loss": 1.4362, + "step": 65580 + }, + { + "epoch": 1.9082526107571924, + "grad_norm": 15.4375, + "learning_rate": 3.639159022745544e-06, + "loss": 1.6175, + "step": 65600 + }, + { + "epoch": 1.9088343950897402, + "grad_norm": 12.8125, + "learning_rate": 3.637219741072929e-06, + "loss": 1.4896, + "step": 65620 + }, + { + "epoch": 1.9094161794222881, + "grad_norm": 14.25, + "learning_rate": 3.6352804594003142e-06, + "loss": 1.5441, + "step": 65640 + }, + { + "epoch": 1.909997963754836, + "grad_norm": 12.625, + "learning_rate": 3.6333411777276993e-06, + "loss": 1.5906, + "step": 65660 + }, + { + "epoch": 1.9105797480873838, + "grad_norm": 13.6875, + "learning_rate": 3.6314018960550844e-06, + "loss": 1.5146, + "step": 65680 + }, + { + "epoch": 1.911161532419932, + "grad_norm": 14.0625, + "learning_rate": 3.6294626143824695e-06, + "loss": 1.4349, + "step": 65700 + }, + { + "epoch": 1.9117433167524798, + "grad_norm": 15.5625, + "learning_rate": 3.6275233327098546e-06, + "loss": 1.5319, + "step": 65720 + }, + { + "epoch": 1.9123251010850277, + "grad_norm": 14.5, + "learning_rate": 3.6255840510372397e-06, + "loss": 1.4693, + "step": 65740 + }, + { + "epoch": 1.9129068854175757, + "grad_norm": 15.5625, + "learning_rate": 3.623644769364625e-06, + "loss": 1.6036, + "step": 65760 + }, + { + "epoch": 1.9134886697501237, + "grad_norm": 13.8125, + "learning_rate": 3.62170548769201e-06, + "loss": 1.5009, + "step": 65780 + }, + { + "epoch": 1.9140704540826716, + "grad_norm": 15.3125, + "learning_rate": 3.619766206019395e-06, + "loss": 1.5315, + "step": 65800 + }, + { + "epoch": 1.9146522384152194, + "grad_norm": 11.875, + "learning_rate": 3.61782692434678e-06, + "loss": 1.6201, + "step": 65820 + }, + { + "epoch": 1.9152340227477676, + "grad_norm": 8.375, + "learning_rate": 3.615887642674165e-06, + "loss": 1.5182, + "step": 65840 + }, + { + "epoch": 1.9158158070803153, + "grad_norm": 13.9375, + "learning_rate": 3.6139483610015503e-06, + "loss": 1.5305, + "step": 65860 + }, + { + "epoch": 1.9163975914128633, + "grad_norm": 11.25, + "learning_rate": 3.6120090793289354e-06, + "loss": 1.4922, + "step": 65880 + }, + { + "epoch": 1.9169793757454112, + "grad_norm": 13.375, + "learning_rate": 3.6100697976563205e-06, + "loss": 1.5899, + "step": 65900 + }, + { + "epoch": 1.917561160077959, + "grad_norm": 14.4375, + "learning_rate": 3.6081305159837056e-06, + "loss": 1.4647, + "step": 65920 + }, + { + "epoch": 1.9181429444105071, + "grad_norm": 12.25, + "learning_rate": 3.6061912343110907e-06, + "loss": 1.6362, + "step": 65940 + }, + { + "epoch": 1.9187247287430549, + "grad_norm": 13.9375, + "learning_rate": 3.604251952638476e-06, + "loss": 1.5538, + "step": 65960 + }, + { + "epoch": 1.9193065130756028, + "grad_norm": 13.0, + "learning_rate": 3.602312670965861e-06, + "loss": 1.484, + "step": 65980 + }, + { + "epoch": 1.9198882974081508, + "grad_norm": 17.125, + "learning_rate": 3.6003733892932456e-06, + "loss": 1.4788, + "step": 66000 + }, + { + "epoch": 1.9204700817406988, + "grad_norm": 14.375, + "learning_rate": 3.5984341076206307e-06, + "loss": 1.5489, + "step": 66020 + }, + { + "epoch": 1.9210518660732467, + "grad_norm": 12.875, + "learning_rate": 3.5964948259480158e-06, + "loss": 1.461, + "step": 66040 + }, + { + "epoch": 1.9216336504057945, + "grad_norm": 13.25, + "learning_rate": 3.594555544275401e-06, + "loss": 1.5596, + "step": 66060 + }, + { + "epoch": 1.9222154347383424, + "grad_norm": 12.0625, + "learning_rate": 3.592616262602786e-06, + "loss": 1.5128, + "step": 66080 + }, + { + "epoch": 1.9227972190708904, + "grad_norm": 18.25, + "learning_rate": 3.590676980930171e-06, + "loss": 1.4885, + "step": 66100 + }, + { + "epoch": 1.9233790034034384, + "grad_norm": 12.375, + "learning_rate": 3.588737699257556e-06, + "loss": 1.496, + "step": 66120 + }, + { + "epoch": 1.9239607877359863, + "grad_norm": 15.25, + "learning_rate": 3.5867984175849412e-06, + "loss": 1.5943, + "step": 66140 + }, + { + "epoch": 1.924542572068534, + "grad_norm": 13.75, + "learning_rate": 3.5848591359123263e-06, + "loss": 1.5122, + "step": 66160 + }, + { + "epoch": 1.9251243564010823, + "grad_norm": 11.6875, + "learning_rate": 3.5829198542397114e-06, + "loss": 1.5545, + "step": 66180 + }, + { + "epoch": 1.92570614073363, + "grad_norm": 12.0, + "learning_rate": 3.580980572567096e-06, + "loss": 1.5628, + "step": 66200 + }, + { + "epoch": 1.926287925066178, + "grad_norm": 12.9375, + "learning_rate": 3.579041290894481e-06, + "loss": 1.4236, + "step": 66220 + }, + { + "epoch": 1.926869709398726, + "grad_norm": 15.3125, + "learning_rate": 3.5771020092218663e-06, + "loss": 1.5351, + "step": 66240 + }, + { + "epoch": 1.9274514937312737, + "grad_norm": 11.0625, + "learning_rate": 3.5751627275492514e-06, + "loss": 1.5499, + "step": 66260 + }, + { + "epoch": 1.9280332780638219, + "grad_norm": 13.1875, + "learning_rate": 3.573223445876636e-06, + "loss": 1.5883, + "step": 66280 + }, + { + "epoch": 1.9286150623963696, + "grad_norm": 12.0625, + "learning_rate": 3.571284164204021e-06, + "loss": 1.4578, + "step": 66300 + }, + { + "epoch": 1.9291968467289176, + "grad_norm": 11.5, + "learning_rate": 3.5693448825314063e-06, + "loss": 1.4527, + "step": 66320 + }, + { + "epoch": 1.9297786310614655, + "grad_norm": 13.8125, + "learning_rate": 3.5674056008587914e-06, + "loss": 1.5393, + "step": 66340 + }, + { + "epoch": 1.9303604153940135, + "grad_norm": 11.3125, + "learning_rate": 3.5654663191861765e-06, + "loss": 1.5419, + "step": 66360 + }, + { + "epoch": 1.9309421997265614, + "grad_norm": 13.875, + "learning_rate": 3.5635270375135616e-06, + "loss": 1.5344, + "step": 66380 + }, + { + "epoch": 1.9315239840591092, + "grad_norm": 12.625, + "learning_rate": 3.5615877558409467e-06, + "loss": 1.5528, + "step": 66400 + }, + { + "epoch": 1.9321057683916574, + "grad_norm": 13.5625, + "learning_rate": 3.5596484741683318e-06, + "loss": 1.4706, + "step": 66420 + }, + { + "epoch": 1.9326875527242051, + "grad_norm": 13.0, + "learning_rate": 3.557709192495717e-06, + "loss": 1.5307, + "step": 66440 + }, + { + "epoch": 1.933269337056753, + "grad_norm": 14.875, + "learning_rate": 3.555769910823102e-06, + "loss": 1.5119, + "step": 66460 + }, + { + "epoch": 1.933851121389301, + "grad_norm": 11.875, + "learning_rate": 3.553830629150487e-06, + "loss": 1.474, + "step": 66480 + }, + { + "epoch": 1.9344329057218488, + "grad_norm": 14.5, + "learning_rate": 3.551891347477872e-06, + "loss": 1.5309, + "step": 66500 + }, + { + "epoch": 1.935014690054397, + "grad_norm": 12.125, + "learning_rate": 3.5499520658052573e-06, + "loss": 1.509, + "step": 66520 + }, + { + "epoch": 1.9355964743869447, + "grad_norm": 14.5625, + "learning_rate": 3.5480127841326423e-06, + "loss": 1.5758, + "step": 66540 + }, + { + "epoch": 1.9361782587194927, + "grad_norm": 15.3125, + "learning_rate": 3.5460735024600274e-06, + "loss": 1.4817, + "step": 66560 + }, + { + "epoch": 1.9367600430520406, + "grad_norm": 15.125, + "learning_rate": 3.5441342207874125e-06, + "loss": 1.4302, + "step": 66580 + }, + { + "epoch": 1.9373418273845884, + "grad_norm": 13.5, + "learning_rate": 3.5421949391147976e-06, + "loss": 1.5171, + "step": 66600 + }, + { + "epoch": 1.9379236117171366, + "grad_norm": 13.5, + "learning_rate": 3.5402556574421827e-06, + "loss": 1.5666, + "step": 66620 + }, + { + "epoch": 1.9385053960496843, + "grad_norm": 13.5, + "learning_rate": 3.538316375769568e-06, + "loss": 1.4887, + "step": 66640 + }, + { + "epoch": 1.9390871803822323, + "grad_norm": 13.0625, + "learning_rate": 3.536377094096953e-06, + "loss": 1.4681, + "step": 66660 + }, + { + "epoch": 1.9396689647147802, + "grad_norm": 12.9375, + "learning_rate": 3.5344378124243376e-06, + "loss": 1.4435, + "step": 66680 + }, + { + "epoch": 1.9402507490473282, + "grad_norm": 15.0625, + "learning_rate": 3.5324985307517227e-06, + "loss": 1.4985, + "step": 66700 + }, + { + "epoch": 1.9408325333798762, + "grad_norm": 12.5, + "learning_rate": 3.530559249079108e-06, + "loss": 1.4723, + "step": 66720 + }, + { + "epoch": 1.941414317712424, + "grad_norm": 11.4375, + "learning_rate": 3.528619967406493e-06, + "loss": 1.5144, + "step": 66740 + }, + { + "epoch": 1.941996102044972, + "grad_norm": 15.3125, + "learning_rate": 3.526680685733878e-06, + "loss": 1.5583, + "step": 66760 + }, + { + "epoch": 1.9425778863775198, + "grad_norm": 15.0, + "learning_rate": 3.524741404061263e-06, + "loss": 1.5899, + "step": 66780 + }, + { + "epoch": 1.9431596707100678, + "grad_norm": 13.5, + "learning_rate": 3.522802122388648e-06, + "loss": 1.5051, + "step": 66800 + }, + { + "epoch": 1.9437414550426158, + "grad_norm": 14.6875, + "learning_rate": 3.5208628407160333e-06, + "loss": 1.5192, + "step": 66820 + }, + { + "epoch": 1.9443232393751635, + "grad_norm": 11.0, + "learning_rate": 3.5189235590434184e-06, + "loss": 1.5179, + "step": 66840 + }, + { + "epoch": 1.9449050237077117, + "grad_norm": 13.4375, + "learning_rate": 3.5169842773708035e-06, + "loss": 1.4844, + "step": 66860 + }, + { + "epoch": 1.9454868080402594, + "grad_norm": 13.4375, + "learning_rate": 3.5150449956981886e-06, + "loss": 1.4773, + "step": 66880 + }, + { + "epoch": 1.9460685923728074, + "grad_norm": 15.4375, + "learning_rate": 3.5131057140255737e-06, + "loss": 1.4834, + "step": 66900 + }, + { + "epoch": 1.9466503767053553, + "grad_norm": 13.4375, + "learning_rate": 3.5111664323529588e-06, + "loss": 1.4458, + "step": 66920 + }, + { + "epoch": 1.947232161037903, + "grad_norm": 13.6875, + "learning_rate": 3.509227150680344e-06, + "loss": 1.3961, + "step": 66940 + }, + { + "epoch": 1.9478139453704513, + "grad_norm": 18.625, + "learning_rate": 3.507287869007729e-06, + "loss": 1.4644, + "step": 66960 + }, + { + "epoch": 1.948395729702999, + "grad_norm": 15.125, + "learning_rate": 3.505348587335114e-06, + "loss": 1.4645, + "step": 66980 + }, + { + "epoch": 1.948977514035547, + "grad_norm": 12.25, + "learning_rate": 3.503409305662499e-06, + "loss": 1.5278, + "step": 67000 + }, + { + "epoch": 1.949559298368095, + "grad_norm": 13.5625, + "learning_rate": 3.5014700239898843e-06, + "loss": 1.5145, + "step": 67020 + }, + { + "epoch": 1.950141082700643, + "grad_norm": 14.25, + "learning_rate": 3.4995307423172694e-06, + "loss": 1.4483, + "step": 67040 + }, + { + "epoch": 1.9507228670331909, + "grad_norm": 14.8125, + "learning_rate": 3.4975914606446545e-06, + "loss": 1.5122, + "step": 67060 + }, + { + "epoch": 1.9513046513657386, + "grad_norm": 12.3125, + "learning_rate": 3.4956521789720396e-06, + "loss": 1.4802, + "step": 67080 + }, + { + "epoch": 1.9518864356982868, + "grad_norm": 12.625, + "learning_rate": 3.4937128972994242e-06, + "loss": 1.5416, + "step": 67100 + }, + { + "epoch": 1.9524682200308345, + "grad_norm": 12.875, + "learning_rate": 3.4917736156268093e-06, + "loss": 1.5825, + "step": 67120 + }, + { + "epoch": 1.9530500043633825, + "grad_norm": 11.1875, + "learning_rate": 3.4898343339541944e-06, + "loss": 1.4703, + "step": 67140 + }, + { + "epoch": 1.9536317886959305, + "grad_norm": 14.3125, + "learning_rate": 3.4878950522815795e-06, + "loss": 1.5021, + "step": 67160 + }, + { + "epoch": 1.9542135730284782, + "grad_norm": 13.125, + "learning_rate": 3.4859557706089646e-06, + "loss": 1.5426, + "step": 67180 + }, + { + "epoch": 1.9547953573610264, + "grad_norm": 13.125, + "learning_rate": 3.4840164889363497e-06, + "loss": 1.6451, + "step": 67200 + }, + { + "epoch": 1.9553771416935741, + "grad_norm": 11.9375, + "learning_rate": 3.482077207263735e-06, + "loss": 1.4643, + "step": 67220 + }, + { + "epoch": 1.955958926026122, + "grad_norm": 11.875, + "learning_rate": 3.48013792559112e-06, + "loss": 1.4222, + "step": 67240 + }, + { + "epoch": 1.95654071035867, + "grad_norm": 13.4375, + "learning_rate": 3.478198643918505e-06, + "loss": 1.5199, + "step": 67260 + }, + { + "epoch": 1.957122494691218, + "grad_norm": 14.75, + "learning_rate": 3.47625936224589e-06, + "loss": 1.52, + "step": 67280 + }, + { + "epoch": 1.957704279023766, + "grad_norm": 12.8125, + "learning_rate": 3.474320080573275e-06, + "loss": 1.5264, + "step": 67300 + }, + { + "epoch": 1.9582860633563137, + "grad_norm": 15.6875, + "learning_rate": 3.4723807989006603e-06, + "loss": 1.4517, + "step": 67320 + }, + { + "epoch": 1.9588678476888617, + "grad_norm": 13.3125, + "learning_rate": 3.4704415172280454e-06, + "loss": 1.4373, + "step": 67340 + }, + { + "epoch": 1.9594496320214096, + "grad_norm": 11.6875, + "learning_rate": 3.4685022355554305e-06, + "loss": 1.4871, + "step": 67360 + }, + { + "epoch": 1.9600314163539576, + "grad_norm": 13.625, + "learning_rate": 3.4665629538828156e-06, + "loss": 1.456, + "step": 67380 + }, + { + "epoch": 1.9606132006865056, + "grad_norm": 17.25, + "learning_rate": 3.4646236722102007e-06, + "loss": 1.6015, + "step": 67400 + }, + { + "epoch": 1.9611949850190533, + "grad_norm": 13.5, + "learning_rate": 3.462684390537586e-06, + "loss": 1.439, + "step": 67420 + }, + { + "epoch": 1.9617767693516015, + "grad_norm": 17.125, + "learning_rate": 3.460745108864971e-06, + "loss": 1.4945, + "step": 67440 + }, + { + "epoch": 1.9623585536841492, + "grad_norm": 14.0625, + "learning_rate": 3.458805827192356e-06, + "loss": 1.4674, + "step": 67460 + }, + { + "epoch": 1.9629403380166972, + "grad_norm": 12.3125, + "learning_rate": 3.456866545519741e-06, + "loss": 1.5322, + "step": 67480 + }, + { + "epoch": 1.9635221223492452, + "grad_norm": 15.0625, + "learning_rate": 3.4549272638471258e-06, + "loss": 1.472, + "step": 67500 + }, + { + "epoch": 1.964103906681793, + "grad_norm": 11.125, + "learning_rate": 3.452987982174511e-06, + "loss": 1.5724, + "step": 67520 + }, + { + "epoch": 1.964685691014341, + "grad_norm": 14.25, + "learning_rate": 3.451048700501896e-06, + "loss": 1.6187, + "step": 67540 + }, + { + "epoch": 1.9652674753468888, + "grad_norm": 12.625, + "learning_rate": 3.449109418829281e-06, + "loss": 1.4714, + "step": 67560 + }, + { + "epoch": 1.9658492596794368, + "grad_norm": 12.25, + "learning_rate": 3.447170137156666e-06, + "loss": 1.4059, + "step": 67580 + }, + { + "epoch": 1.9664310440119848, + "grad_norm": 13.0625, + "learning_rate": 3.4452308554840512e-06, + "loss": 1.5153, + "step": 67600 + }, + { + "epoch": 1.9670128283445327, + "grad_norm": 14.125, + "learning_rate": 3.4432915738114363e-06, + "loss": 1.4906, + "step": 67620 + }, + { + "epoch": 1.9675946126770807, + "grad_norm": 14.375, + "learning_rate": 3.4413522921388214e-06, + "loss": 1.5138, + "step": 67640 + }, + { + "epoch": 1.9681763970096284, + "grad_norm": 12.9375, + "learning_rate": 3.4394130104662065e-06, + "loss": 1.4947, + "step": 67660 + }, + { + "epoch": 1.9687581813421766, + "grad_norm": 13.75, + "learning_rate": 3.4374737287935916e-06, + "loss": 1.502, + "step": 67680 + }, + { + "epoch": 1.9693399656747244, + "grad_norm": 14.6875, + "learning_rate": 3.4355344471209767e-06, + "loss": 1.6162, + "step": 67700 + }, + { + "epoch": 1.9699217500072723, + "grad_norm": 14.9375, + "learning_rate": 3.433595165448362e-06, + "loss": 1.5033, + "step": 67720 + }, + { + "epoch": 1.9705035343398203, + "grad_norm": 13.8125, + "learning_rate": 3.431655883775747e-06, + "loss": 1.516, + "step": 67740 + }, + { + "epoch": 1.971085318672368, + "grad_norm": 13.125, + "learning_rate": 3.429716602103132e-06, + "loss": 1.5823, + "step": 67760 + }, + { + "epoch": 1.9716671030049162, + "grad_norm": 15.4375, + "learning_rate": 3.427777320430517e-06, + "loss": 1.5057, + "step": 67780 + }, + { + "epoch": 1.972248887337464, + "grad_norm": 13.75, + "learning_rate": 3.4258380387579022e-06, + "loss": 1.4956, + "step": 67800 + }, + { + "epoch": 1.972830671670012, + "grad_norm": 16.375, + "learning_rate": 3.4238987570852873e-06, + "loss": 1.4952, + "step": 67820 + }, + { + "epoch": 1.9734124560025599, + "grad_norm": 14.75, + "learning_rate": 3.4219594754126724e-06, + "loss": 1.5201, + "step": 67840 + }, + { + "epoch": 1.9739942403351076, + "grad_norm": 13.4375, + "learning_rate": 3.4200201937400575e-06, + "loss": 1.576, + "step": 67860 + }, + { + "epoch": 1.9745760246676558, + "grad_norm": 12.5625, + "learning_rate": 3.4180809120674426e-06, + "loss": 1.5281, + "step": 67880 + }, + { + "epoch": 1.9751578090002035, + "grad_norm": 12.1875, + "learning_rate": 3.4161416303948273e-06, + "loss": 1.4996, + "step": 67900 + }, + { + "epoch": 1.9757395933327515, + "grad_norm": 11.8125, + "learning_rate": 3.4142023487222124e-06, + "loss": 1.5158, + "step": 67920 + }, + { + "epoch": 1.9763213776652995, + "grad_norm": 14.3125, + "learning_rate": 3.4122630670495975e-06, + "loss": 1.4658, + "step": 67940 + }, + { + "epoch": 1.9769031619978474, + "grad_norm": 13.4375, + "learning_rate": 3.4103237853769826e-06, + "loss": 1.46, + "step": 67960 + }, + { + "epoch": 1.9774849463303954, + "grad_norm": 17.875, + "learning_rate": 3.4083845037043677e-06, + "loss": 1.5656, + "step": 67980 + }, + { + "epoch": 1.9780667306629431, + "grad_norm": 12.4375, + "learning_rate": 3.4064452220317528e-06, + "loss": 1.4976, + "step": 68000 + }, + { + "epoch": 1.9786485149954913, + "grad_norm": 14.1875, + "learning_rate": 3.404505940359138e-06, + "loss": 1.4997, + "step": 68020 + }, + { + "epoch": 1.979230299328039, + "grad_norm": 17.0, + "learning_rate": 3.402566658686523e-06, + "loss": 1.4584, + "step": 68040 + }, + { + "epoch": 1.979812083660587, + "grad_norm": 14.625, + "learning_rate": 3.400627377013908e-06, + "loss": 1.5321, + "step": 68060 + }, + { + "epoch": 1.980393867993135, + "grad_norm": 12.1875, + "learning_rate": 3.398688095341293e-06, + "loss": 1.3941, + "step": 68080 + }, + { + "epoch": 1.9809756523256827, + "grad_norm": 12.1875, + "learning_rate": 3.3967488136686783e-06, + "loss": 1.5388, + "step": 68100 + }, + { + "epoch": 1.981557436658231, + "grad_norm": 14.0, + "learning_rate": 3.3948095319960634e-06, + "loss": 1.5299, + "step": 68120 + }, + { + "epoch": 1.9821392209907787, + "grad_norm": 12.6875, + "learning_rate": 3.3928702503234485e-06, + "loss": 1.4172, + "step": 68140 + }, + { + "epoch": 1.9827210053233266, + "grad_norm": 15.5625, + "learning_rate": 3.3909309686508335e-06, + "loss": 1.4967, + "step": 68160 + }, + { + "epoch": 1.9833027896558746, + "grad_norm": 14.4375, + "learning_rate": 3.3889916869782186e-06, + "loss": 1.4214, + "step": 68180 + }, + { + "epoch": 1.9838845739884223, + "grad_norm": 12.75, + "learning_rate": 3.3870524053056037e-06, + "loss": 1.5038, + "step": 68200 + }, + { + "epoch": 1.9844663583209705, + "grad_norm": 13.75, + "learning_rate": 3.385113123632989e-06, + "loss": 1.568, + "step": 68220 + }, + { + "epoch": 1.9850481426535183, + "grad_norm": 14.3125, + "learning_rate": 3.383173841960374e-06, + "loss": 1.5062, + "step": 68240 + }, + { + "epoch": 1.9856299269860662, + "grad_norm": 11.6875, + "learning_rate": 3.381234560287759e-06, + "loss": 1.4475, + "step": 68260 + }, + { + "epoch": 1.9862117113186142, + "grad_norm": 13.0, + "learning_rate": 3.379295278615144e-06, + "loss": 1.4878, + "step": 68280 + }, + { + "epoch": 1.9867934956511621, + "grad_norm": 12.375, + "learning_rate": 3.3773559969425292e-06, + "loss": 1.5715, + "step": 68300 + }, + { + "epoch": 1.98737527998371, + "grad_norm": 15.0, + "learning_rate": 3.375416715269914e-06, + "loss": 1.4876, + "step": 68320 + }, + { + "epoch": 1.9879570643162578, + "grad_norm": 16.5, + "learning_rate": 3.3734774335972986e-06, + "loss": 1.4497, + "step": 68340 + }, + { + "epoch": 1.988538848648806, + "grad_norm": 15.4375, + "learning_rate": 3.3715381519246837e-06, + "loss": 1.4886, + "step": 68360 + }, + { + "epoch": 1.9891206329813538, + "grad_norm": 11.6875, + "learning_rate": 3.3695988702520688e-06, + "loss": 1.4935, + "step": 68380 + }, + { + "epoch": 1.9897024173139017, + "grad_norm": 13.5, + "learning_rate": 3.367659588579454e-06, + "loss": 1.4687, + "step": 68400 + }, + { + "epoch": 1.9902842016464497, + "grad_norm": 14.0, + "learning_rate": 3.365720306906839e-06, + "loss": 1.5841, + "step": 68420 + }, + { + "epoch": 1.9908659859789974, + "grad_norm": 12.125, + "learning_rate": 3.363781025234224e-06, + "loss": 1.5813, + "step": 68440 + }, + { + "epoch": 1.9914477703115456, + "grad_norm": 13.875, + "learning_rate": 3.361841743561609e-06, + "loss": 1.5345, + "step": 68460 + }, + { + "epoch": 1.9920295546440934, + "grad_norm": 13.0, + "learning_rate": 3.3599024618889943e-06, + "loss": 1.5137, + "step": 68480 + }, + { + "epoch": 1.9926113389766413, + "grad_norm": 13.4375, + "learning_rate": 3.3579631802163794e-06, + "loss": 1.4971, + "step": 68500 + }, + { + "epoch": 1.9931931233091893, + "grad_norm": 14.375, + "learning_rate": 3.3560238985437645e-06, + "loss": 1.4585, + "step": 68520 + }, + { + "epoch": 1.9937749076417373, + "grad_norm": 9.375, + "learning_rate": 3.3540846168711496e-06, + "loss": 1.4688, + "step": 68540 + }, + { + "epoch": 1.9943566919742852, + "grad_norm": 14.6875, + "learning_rate": 3.3521453351985347e-06, + "loss": 1.5335, + "step": 68560 + }, + { + "epoch": 1.994938476306833, + "grad_norm": 13.3125, + "learning_rate": 3.3502060535259197e-06, + "loss": 1.4753, + "step": 68580 + }, + { + "epoch": 1.995520260639381, + "grad_norm": 14.6875, + "learning_rate": 3.3482667718533044e-06, + "loss": 1.5276, + "step": 68600 + }, + { + "epoch": 1.996102044971929, + "grad_norm": 10.625, + "learning_rate": 3.3463274901806895e-06, + "loss": 1.589, + "step": 68620 + }, + { + "epoch": 1.9966838293044769, + "grad_norm": 12.3125, + "learning_rate": 3.3443882085080746e-06, + "loss": 1.448, + "step": 68640 + }, + { + "epoch": 1.9972656136370248, + "grad_norm": 11.3125, + "learning_rate": 3.3424489268354597e-06, + "loss": 1.5355, + "step": 68660 + }, + { + "epoch": 1.9978473979695726, + "grad_norm": 11.4375, + "learning_rate": 3.340509645162845e-06, + "loss": 1.462, + "step": 68680 + }, + { + "epoch": 1.9984291823021207, + "grad_norm": 14.375, + "learning_rate": 3.33857036349023e-06, + "loss": 1.47, + "step": 68700 + }, + { + "epoch": 1.9990109666346685, + "grad_norm": 12.6875, + "learning_rate": 3.336631081817615e-06, + "loss": 1.4833, + "step": 68720 + }, + { + "epoch": 1.9995927509672164, + "grad_norm": 12.6875, + "learning_rate": 3.334691800145e-06, + "loss": 1.5627, + "step": 68740 + }, + { + "epoch": 2.0001745352997644, + "grad_norm": 12.6875, + "learning_rate": 3.332752518472385e-06, + "loss": 1.4391, + "step": 68760 + }, + { + "epoch": 2.000756319632312, + "grad_norm": 16.375, + "learning_rate": 3.3308132367997703e-06, + "loss": 1.5977, + "step": 68780 + }, + { + "epoch": 2.0013381039648603, + "grad_norm": 12.875, + "learning_rate": 3.3288739551271554e-06, + "loss": 1.4505, + "step": 68800 + }, + { + "epoch": 2.001919888297408, + "grad_norm": 12.75, + "learning_rate": 3.3269346734545405e-06, + "loss": 1.5455, + "step": 68820 + }, + { + "epoch": 2.0025016726299563, + "grad_norm": 12.9375, + "learning_rate": 3.3249953917819256e-06, + "loss": 1.4766, + "step": 68840 + }, + { + "epoch": 2.003083456962504, + "grad_norm": 13.8125, + "learning_rate": 3.3230561101093107e-06, + "loss": 1.6074, + "step": 68860 + }, + { + "epoch": 2.0036652412950517, + "grad_norm": 14.3125, + "learning_rate": 3.3211168284366958e-06, + "loss": 1.4583, + "step": 68880 + }, + { + "epoch": 2.0042470256276, + "grad_norm": 14.8125, + "learning_rate": 3.319177546764081e-06, + "loss": 1.5819, + "step": 68900 + }, + { + "epoch": 2.0048288099601477, + "grad_norm": 12.5, + "learning_rate": 3.317238265091466e-06, + "loss": 1.4428, + "step": 68920 + }, + { + "epoch": 2.005410594292696, + "grad_norm": 13.0625, + "learning_rate": 3.315298983418851e-06, + "loss": 1.513, + "step": 68940 + }, + { + "epoch": 2.0059923786252436, + "grad_norm": 12.5, + "learning_rate": 3.313359701746236e-06, + "loss": 1.4663, + "step": 68960 + }, + { + "epoch": 2.0065741629577913, + "grad_norm": 11.375, + "learning_rate": 3.3114204200736213e-06, + "loss": 1.5026, + "step": 68980 + }, + { + "epoch": 2.0071559472903395, + "grad_norm": 14.9375, + "learning_rate": 3.309481138401006e-06, + "loss": 1.5613, + "step": 69000 + }, + { + "epoch": 2.0077377316228873, + "grad_norm": 13.625, + "learning_rate": 3.307541856728391e-06, + "loss": 1.4829, + "step": 69020 + }, + { + "epoch": 2.0083195159554355, + "grad_norm": 12.1875, + "learning_rate": 3.305602575055776e-06, + "loss": 1.4124, + "step": 69040 + }, + { + "epoch": 2.008901300287983, + "grad_norm": 18.625, + "learning_rate": 3.3036632933831612e-06, + "loss": 1.4377, + "step": 69060 + }, + { + "epoch": 2.0094830846205314, + "grad_norm": 14.3125, + "learning_rate": 3.3017240117105463e-06, + "loss": 1.5308, + "step": 69080 + }, + { + "epoch": 2.010064868953079, + "grad_norm": 16.625, + "learning_rate": 3.2997847300379314e-06, + "loss": 1.4723, + "step": 69100 + }, + { + "epoch": 2.010646653285627, + "grad_norm": 17.25, + "learning_rate": 3.2978454483653165e-06, + "loss": 1.5134, + "step": 69120 + }, + { + "epoch": 2.011228437618175, + "grad_norm": 15.125, + "learning_rate": 3.2959061666927016e-06, + "loss": 1.4609, + "step": 69140 + }, + { + "epoch": 2.011810221950723, + "grad_norm": 13.625, + "learning_rate": 3.2939668850200867e-06, + "loss": 1.5012, + "step": 69160 + }, + { + "epoch": 2.012392006283271, + "grad_norm": 16.75, + "learning_rate": 3.292027603347472e-06, + "loss": 1.5458, + "step": 69180 + }, + { + "epoch": 2.0129737906158187, + "grad_norm": 15.0, + "learning_rate": 3.290088321674857e-06, + "loss": 1.5486, + "step": 69200 + }, + { + "epoch": 2.0135555749483665, + "grad_norm": 17.625, + "learning_rate": 3.288149040002242e-06, + "loss": 1.4424, + "step": 69220 + }, + { + "epoch": 2.0141373592809146, + "grad_norm": 14.4375, + "learning_rate": 3.286209758329627e-06, + "loss": 1.5176, + "step": 69240 + }, + { + "epoch": 2.0147191436134624, + "grad_norm": 11.3125, + "learning_rate": 3.2842704766570122e-06, + "loss": 1.5829, + "step": 69260 + }, + { + "epoch": 2.0153009279460106, + "grad_norm": 16.625, + "learning_rate": 3.2823311949843973e-06, + "loss": 1.4862, + "step": 69280 + }, + { + "epoch": 2.0158827122785583, + "grad_norm": 15.4375, + "learning_rate": 3.2803919133117824e-06, + "loss": 1.4982, + "step": 69300 + }, + { + "epoch": 2.016464496611106, + "grad_norm": 11.8125, + "learning_rate": 3.2784526316391675e-06, + "loss": 1.5102, + "step": 69320 + }, + { + "epoch": 2.0170462809436542, + "grad_norm": 13.625, + "learning_rate": 3.2765133499665526e-06, + "loss": 1.488, + "step": 69340 + }, + { + "epoch": 2.017628065276202, + "grad_norm": 14.9375, + "learning_rate": 3.2745740682939377e-06, + "loss": 1.5736, + "step": 69360 + }, + { + "epoch": 2.01820984960875, + "grad_norm": 16.25, + "learning_rate": 3.272634786621323e-06, + "loss": 1.519, + "step": 69380 + }, + { + "epoch": 2.018791633941298, + "grad_norm": 14.25, + "learning_rate": 3.270695504948708e-06, + "loss": 1.4977, + "step": 69400 + }, + { + "epoch": 2.019373418273846, + "grad_norm": 15.25, + "learning_rate": 3.2687562232760926e-06, + "loss": 1.4406, + "step": 69420 + }, + { + "epoch": 2.019955202606394, + "grad_norm": 14.625, + "learning_rate": 3.2668169416034777e-06, + "loss": 1.5004, + "step": 69440 + }, + { + "epoch": 2.0205369869389416, + "grad_norm": 12.6875, + "learning_rate": 3.2648776599308628e-06, + "loss": 1.4982, + "step": 69460 + }, + { + "epoch": 2.0211187712714898, + "grad_norm": 15.3125, + "learning_rate": 3.262938378258248e-06, + "loss": 1.535, + "step": 69480 + }, + { + "epoch": 2.0217005556040375, + "grad_norm": 14.8125, + "learning_rate": 3.260999096585633e-06, + "loss": 1.5355, + "step": 69500 + }, + { + "epoch": 2.0222823399365857, + "grad_norm": 11.3125, + "learning_rate": 3.259059814913018e-06, + "loss": 1.5343, + "step": 69520 + }, + { + "epoch": 2.0228641242691334, + "grad_norm": 12.875, + "learning_rate": 3.257120533240403e-06, + "loss": 1.503, + "step": 69540 + }, + { + "epoch": 2.023445908601681, + "grad_norm": 14.1875, + "learning_rate": 3.2551812515677883e-06, + "loss": 1.5212, + "step": 69560 + }, + { + "epoch": 2.0240276929342293, + "grad_norm": 13.875, + "learning_rate": 3.2532419698951734e-06, + "loss": 1.4834, + "step": 69580 + }, + { + "epoch": 2.024609477266777, + "grad_norm": 12.8125, + "learning_rate": 3.2513026882225584e-06, + "loss": 1.5213, + "step": 69600 + }, + { + "epoch": 2.0251912615993253, + "grad_norm": 13.5, + "learning_rate": 3.2493634065499435e-06, + "loss": 1.5377, + "step": 69620 + }, + { + "epoch": 2.025773045931873, + "grad_norm": 13.5, + "learning_rate": 3.2474241248773286e-06, + "loss": 1.4808, + "step": 69640 + }, + { + "epoch": 2.026354830264421, + "grad_norm": 14.75, + "learning_rate": 3.2454848432047137e-06, + "loss": 1.4268, + "step": 69660 + }, + { + "epoch": 2.026936614596969, + "grad_norm": 13.125, + "learning_rate": 3.243545561532099e-06, + "loss": 1.5044, + "step": 69680 + }, + { + "epoch": 2.0275183989295167, + "grad_norm": 14.4375, + "learning_rate": 3.241606279859484e-06, + "loss": 1.52, + "step": 69700 + }, + { + "epoch": 2.028100183262065, + "grad_norm": 12.8125, + "learning_rate": 3.239666998186869e-06, + "loss": 1.4395, + "step": 69720 + }, + { + "epoch": 2.0286819675946126, + "grad_norm": 12.25, + "learning_rate": 3.237727716514254e-06, + "loss": 1.3982, + "step": 69740 + }, + { + "epoch": 2.029263751927161, + "grad_norm": 13.0, + "learning_rate": 3.2357884348416392e-06, + "loss": 1.4692, + "step": 69760 + }, + { + "epoch": 2.0298455362597085, + "grad_norm": 11.625, + "learning_rate": 3.2338491531690243e-06, + "loss": 1.412, + "step": 69780 + }, + { + "epoch": 2.0304273205922563, + "grad_norm": 13.875, + "learning_rate": 3.2319098714964094e-06, + "loss": 1.5024, + "step": 69800 + }, + { + "epoch": 2.0310091049248045, + "grad_norm": 10.6875, + "learning_rate": 3.229970589823794e-06, + "loss": 1.5416, + "step": 69820 + }, + { + "epoch": 2.031590889257352, + "grad_norm": 13.0, + "learning_rate": 3.228031308151179e-06, + "loss": 1.4766, + "step": 69840 + }, + { + "epoch": 2.0321726735899004, + "grad_norm": 15.4375, + "learning_rate": 3.2260920264785643e-06, + "loss": 1.4842, + "step": 69860 + }, + { + "epoch": 2.032754457922448, + "grad_norm": 12.25, + "learning_rate": 3.2241527448059494e-06, + "loss": 1.483, + "step": 69880 + }, + { + "epoch": 2.033336242254996, + "grad_norm": 10.75, + "learning_rate": 3.2222134631333345e-06, + "loss": 1.4638, + "step": 69900 + }, + { + "epoch": 2.033918026587544, + "grad_norm": 12.3125, + "learning_rate": 3.2202741814607196e-06, + "loss": 1.5076, + "step": 69920 + }, + { + "epoch": 2.034499810920092, + "grad_norm": 12.9375, + "learning_rate": 3.2183348997881047e-06, + "loss": 1.4741, + "step": 69940 + }, + { + "epoch": 2.03508159525264, + "grad_norm": 12.375, + "learning_rate": 3.2163956181154898e-06, + "loss": 1.4623, + "step": 69960 + }, + { + "epoch": 2.0356633795851877, + "grad_norm": 15.25, + "learning_rate": 3.214456336442875e-06, + "loss": 1.6044, + "step": 69980 + }, + { + "epoch": 2.036245163917736, + "grad_norm": 13.4375, + "learning_rate": 3.21251705477026e-06, + "loss": 1.4792, + "step": 70000 + }, + { + "epoch": 2.0368269482502837, + "grad_norm": 13.375, + "learning_rate": 3.210577773097645e-06, + "loss": 1.5712, + "step": 70020 + }, + { + "epoch": 2.0374087325828314, + "grad_norm": 11.25, + "learning_rate": 3.20863849142503e-06, + "loss": 1.4752, + "step": 70040 + }, + { + "epoch": 2.0379905169153796, + "grad_norm": 14.3125, + "learning_rate": 3.2066992097524153e-06, + "loss": 1.5129, + "step": 70060 + }, + { + "epoch": 2.0385723012479273, + "grad_norm": 13.25, + "learning_rate": 3.2047599280798004e-06, + "loss": 1.5771, + "step": 70080 + }, + { + "epoch": 2.0391540855804755, + "grad_norm": 12.4375, + "learning_rate": 3.2028206464071855e-06, + "loss": 1.4345, + "step": 70100 + }, + { + "epoch": 2.0397358699130232, + "grad_norm": 12.625, + "learning_rate": 3.2008813647345706e-06, + "loss": 1.4438, + "step": 70120 + }, + { + "epoch": 2.040317654245571, + "grad_norm": 12.75, + "learning_rate": 3.1989420830619557e-06, + "loss": 1.5879, + "step": 70140 + }, + { + "epoch": 2.040899438578119, + "grad_norm": 14.125, + "learning_rate": 3.1970028013893408e-06, + "loss": 1.5241, + "step": 70160 + }, + { + "epoch": 2.041481222910667, + "grad_norm": 14.3125, + "learning_rate": 3.195063519716726e-06, + "loss": 1.6103, + "step": 70180 + }, + { + "epoch": 2.042063007243215, + "grad_norm": 14.125, + "learning_rate": 3.193124238044111e-06, + "loss": 1.4937, + "step": 70200 + }, + { + "epoch": 2.042644791575763, + "grad_norm": 12.0, + "learning_rate": 3.1911849563714956e-06, + "loss": 1.47, + "step": 70220 + }, + { + "epoch": 2.0432265759083106, + "grad_norm": 13.75, + "learning_rate": 3.1892456746988807e-06, + "loss": 1.5478, + "step": 70240 + }, + { + "epoch": 2.0438083602408588, + "grad_norm": 14.6875, + "learning_rate": 3.187306393026266e-06, + "loss": 1.4608, + "step": 70260 + }, + { + "epoch": 2.0443901445734065, + "grad_norm": 15.8125, + "learning_rate": 3.185367111353651e-06, + "loss": 1.545, + "step": 70280 + }, + { + "epoch": 2.0449719289059547, + "grad_norm": 13.25, + "learning_rate": 3.183427829681036e-06, + "loss": 1.5346, + "step": 70300 + }, + { + "epoch": 2.0455537132385024, + "grad_norm": 13.875, + "learning_rate": 3.181488548008421e-06, + "loss": 1.4697, + "step": 70320 + }, + { + "epoch": 2.0461354975710506, + "grad_norm": 14.5625, + "learning_rate": 3.179549266335806e-06, + "loss": 1.4163, + "step": 70340 + }, + { + "epoch": 2.0467172819035984, + "grad_norm": 15.6875, + "learning_rate": 3.1776099846631913e-06, + "loss": 1.4977, + "step": 70360 + }, + { + "epoch": 2.047299066236146, + "grad_norm": 15.5, + "learning_rate": 3.1756707029905764e-06, + "loss": 1.6234, + "step": 70380 + }, + { + "epoch": 2.0478808505686943, + "grad_norm": 14.375, + "learning_rate": 3.1737314213179615e-06, + "loss": 1.4976, + "step": 70400 + }, + { + "epoch": 2.048462634901242, + "grad_norm": 13.875, + "learning_rate": 3.1717921396453466e-06, + "loss": 1.5693, + "step": 70420 + }, + { + "epoch": 2.04904441923379, + "grad_norm": 12.5625, + "learning_rate": 3.1698528579727317e-06, + "loss": 1.5112, + "step": 70440 + }, + { + "epoch": 2.049626203566338, + "grad_norm": 13.5, + "learning_rate": 3.1679135763001164e-06, + "loss": 1.4604, + "step": 70460 + }, + { + "epoch": 2.0502079878988857, + "grad_norm": 13.625, + "learning_rate": 3.1659742946275015e-06, + "loss": 1.4615, + "step": 70480 + }, + { + "epoch": 2.050789772231434, + "grad_norm": 13.875, + "learning_rate": 3.164035012954886e-06, + "loss": 1.5743, + "step": 70500 + }, + { + "epoch": 2.0513715565639816, + "grad_norm": 13.375, + "learning_rate": 3.1620957312822712e-06, + "loss": 1.4923, + "step": 70520 + }, + { + "epoch": 2.05195334089653, + "grad_norm": 13.0, + "learning_rate": 3.1601564496096563e-06, + "loss": 1.4632, + "step": 70540 + }, + { + "epoch": 2.0525351252290776, + "grad_norm": 14.625, + "learning_rate": 3.1582171679370414e-06, + "loss": 1.5149, + "step": 70560 + }, + { + "epoch": 2.0531169095616253, + "grad_norm": 14.0625, + "learning_rate": 3.1562778862644265e-06, + "loss": 1.525, + "step": 70580 + }, + { + "epoch": 2.0536986938941735, + "grad_norm": 14.0, + "learning_rate": 3.1543386045918116e-06, + "loss": 1.5286, + "step": 70600 + }, + { + "epoch": 2.054280478226721, + "grad_norm": 14.25, + "learning_rate": 3.1523993229191967e-06, + "loss": 1.4856, + "step": 70620 + }, + { + "epoch": 2.0548622625592694, + "grad_norm": 14.4375, + "learning_rate": 3.150460041246582e-06, + "loss": 1.4911, + "step": 70640 + }, + { + "epoch": 2.055444046891817, + "grad_norm": 13.5, + "learning_rate": 3.148520759573967e-06, + "loss": 1.5228, + "step": 70660 + }, + { + "epoch": 2.0560258312243653, + "grad_norm": 12.6875, + "learning_rate": 3.146581477901352e-06, + "loss": 1.4383, + "step": 70680 + }, + { + "epoch": 2.056607615556913, + "grad_norm": 12.8125, + "learning_rate": 3.144642196228737e-06, + "loss": 1.4915, + "step": 70700 + }, + { + "epoch": 2.057189399889461, + "grad_norm": 13.0, + "learning_rate": 3.1427029145561222e-06, + "loss": 1.4782, + "step": 70720 + }, + { + "epoch": 2.057771184222009, + "grad_norm": 11.4375, + "learning_rate": 3.1407636328835073e-06, + "loss": 1.485, + "step": 70740 + }, + { + "epoch": 2.0583529685545567, + "grad_norm": 14.25, + "learning_rate": 3.1388243512108924e-06, + "loss": 1.6041, + "step": 70760 + }, + { + "epoch": 2.058934752887105, + "grad_norm": 13.1875, + "learning_rate": 3.1368850695382775e-06, + "loss": 1.4376, + "step": 70780 + }, + { + "epoch": 2.0595165372196527, + "grad_norm": 13.8125, + "learning_rate": 3.1349457878656626e-06, + "loss": 1.4167, + "step": 70800 + }, + { + "epoch": 2.0600983215522004, + "grad_norm": 13.6875, + "learning_rate": 3.1330065061930477e-06, + "loss": 1.4912, + "step": 70820 + }, + { + "epoch": 2.0606801058847486, + "grad_norm": 12.9375, + "learning_rate": 3.131067224520433e-06, + "loss": 1.5176, + "step": 70840 + }, + { + "epoch": 2.0612618902172963, + "grad_norm": 13.1875, + "learning_rate": 3.129127942847818e-06, + "loss": 1.4853, + "step": 70860 + }, + { + "epoch": 2.0618436745498445, + "grad_norm": 13.4375, + "learning_rate": 3.127188661175203e-06, + "loss": 1.4644, + "step": 70880 + }, + { + "epoch": 2.0624254588823923, + "grad_norm": 11.0625, + "learning_rate": 3.125249379502588e-06, + "loss": 1.4847, + "step": 70900 + }, + { + "epoch": 2.0630072432149404, + "grad_norm": 12.5, + "learning_rate": 3.1233100978299728e-06, + "loss": 1.5436, + "step": 70920 + }, + { + "epoch": 2.063589027547488, + "grad_norm": 12.875, + "learning_rate": 3.121370816157358e-06, + "loss": 1.4474, + "step": 70940 + }, + { + "epoch": 2.064170811880036, + "grad_norm": 11.625, + "learning_rate": 3.119431534484743e-06, + "loss": 1.5005, + "step": 70960 + }, + { + "epoch": 2.064752596212584, + "grad_norm": 11.8125, + "learning_rate": 3.117492252812128e-06, + "loss": 1.5289, + "step": 70980 + }, + { + "epoch": 2.065334380545132, + "grad_norm": 12.625, + "learning_rate": 3.115552971139513e-06, + "loss": 1.4181, + "step": 71000 + }, + { + "epoch": 2.06591616487768, + "grad_norm": 10.5, + "learning_rate": 3.1136136894668983e-06, + "loss": 1.405, + "step": 71020 + }, + { + "epoch": 2.066497949210228, + "grad_norm": 12.8125, + "learning_rate": 3.1116744077942834e-06, + "loss": 1.522, + "step": 71040 + }, + { + "epoch": 2.0670797335427755, + "grad_norm": 15.8125, + "learning_rate": 3.1097351261216684e-06, + "loss": 1.5006, + "step": 71060 + }, + { + "epoch": 2.0676615178753237, + "grad_norm": 12.3125, + "learning_rate": 3.1077958444490535e-06, + "loss": 1.529, + "step": 71080 + }, + { + "epoch": 2.0682433022078714, + "grad_norm": 14.125, + "learning_rate": 3.1058565627764386e-06, + "loss": 1.4701, + "step": 71100 + }, + { + "epoch": 2.0688250865404196, + "grad_norm": 16.0, + "learning_rate": 3.1039172811038237e-06, + "loss": 1.5067, + "step": 71120 + }, + { + "epoch": 2.0694068708729674, + "grad_norm": 12.0, + "learning_rate": 3.101977999431209e-06, + "loss": 1.4489, + "step": 71140 + }, + { + "epoch": 2.069988655205515, + "grad_norm": 12.125, + "learning_rate": 3.100038717758594e-06, + "loss": 1.4791, + "step": 71160 + }, + { + "epoch": 2.0705704395380633, + "grad_norm": 14.25, + "learning_rate": 3.098099436085979e-06, + "loss": 1.4841, + "step": 71180 + }, + { + "epoch": 2.071152223870611, + "grad_norm": 11.1875, + "learning_rate": 3.096160154413364e-06, + "loss": 1.4563, + "step": 71200 + }, + { + "epoch": 2.0717340082031592, + "grad_norm": 14.125, + "learning_rate": 3.0942208727407492e-06, + "loss": 1.4879, + "step": 71220 + }, + { + "epoch": 2.072315792535707, + "grad_norm": 14.875, + "learning_rate": 3.0922815910681343e-06, + "loss": 1.4111, + "step": 71240 + }, + { + "epoch": 2.0728975768682547, + "grad_norm": 14.5625, + "learning_rate": 3.0903423093955194e-06, + "loss": 1.5508, + "step": 71260 + }, + { + "epoch": 2.073479361200803, + "grad_norm": 14.875, + "learning_rate": 3.0884030277229045e-06, + "loss": 1.4548, + "step": 71280 + }, + { + "epoch": 2.0740611455333506, + "grad_norm": 15.0625, + "learning_rate": 3.0864637460502896e-06, + "loss": 1.5113, + "step": 71300 + }, + { + "epoch": 2.074642929865899, + "grad_norm": 13.4375, + "learning_rate": 3.0845244643776743e-06, + "loss": 1.4911, + "step": 71320 + }, + { + "epoch": 2.0752247141984466, + "grad_norm": 12.1875, + "learning_rate": 3.0825851827050594e-06, + "loss": 1.5903, + "step": 71340 + }, + { + "epoch": 2.0758064985309947, + "grad_norm": 14.0, + "learning_rate": 3.0806459010324445e-06, + "loss": 1.4759, + "step": 71360 + }, + { + "epoch": 2.0763882828635425, + "grad_norm": 13.0, + "learning_rate": 3.0787066193598296e-06, + "loss": 1.493, + "step": 71380 + }, + { + "epoch": 2.0769700671960902, + "grad_norm": 14.125, + "learning_rate": 3.0767673376872147e-06, + "loss": 1.4959, + "step": 71400 + }, + { + "epoch": 2.0775518515286384, + "grad_norm": 14.4375, + "learning_rate": 3.0748280560145998e-06, + "loss": 1.4924, + "step": 71420 + }, + { + "epoch": 2.078133635861186, + "grad_norm": 12.625, + "learning_rate": 3.072888774341985e-06, + "loss": 1.4623, + "step": 71440 + }, + { + "epoch": 2.0787154201937343, + "grad_norm": 15.375, + "learning_rate": 3.07094949266937e-06, + "loss": 1.4677, + "step": 71460 + }, + { + "epoch": 2.079297204526282, + "grad_norm": 13.6875, + "learning_rate": 3.069010210996755e-06, + "loss": 1.4981, + "step": 71480 + }, + { + "epoch": 2.07987898885883, + "grad_norm": 13.875, + "learning_rate": 3.06707092932414e-06, + "loss": 1.4813, + "step": 71500 + }, + { + "epoch": 2.080460773191378, + "grad_norm": 12.1875, + "learning_rate": 3.0651316476515253e-06, + "loss": 1.454, + "step": 71520 + }, + { + "epoch": 2.0810425575239258, + "grad_norm": 11.5, + "learning_rate": 3.0631923659789104e-06, + "loss": 1.5398, + "step": 71540 + }, + { + "epoch": 2.081624341856474, + "grad_norm": 13.9375, + "learning_rate": 3.0612530843062955e-06, + "loss": 1.5193, + "step": 71560 + }, + { + "epoch": 2.0822061261890217, + "grad_norm": 11.875, + "learning_rate": 3.0593138026336806e-06, + "loss": 1.485, + "step": 71580 + }, + { + "epoch": 2.08278791052157, + "grad_norm": 12.5625, + "learning_rate": 3.0573745209610657e-06, + "loss": 1.4731, + "step": 71600 + }, + { + "epoch": 2.0833696948541176, + "grad_norm": 13.4375, + "learning_rate": 3.0554352392884508e-06, + "loss": 1.4991, + "step": 71620 + }, + { + "epoch": 2.0839514791866653, + "grad_norm": 12.8125, + "learning_rate": 3.053495957615836e-06, + "loss": 1.5984, + "step": 71640 + }, + { + "epoch": 2.0845332635192135, + "grad_norm": 18.25, + "learning_rate": 3.051556675943221e-06, + "loss": 1.4401, + "step": 71660 + }, + { + "epoch": 2.0851150478517613, + "grad_norm": 15.0, + "learning_rate": 3.049617394270606e-06, + "loss": 1.4982, + "step": 71680 + }, + { + "epoch": 2.0856968321843095, + "grad_norm": 12.125, + "learning_rate": 3.047678112597991e-06, + "loss": 1.442, + "step": 71700 + }, + { + "epoch": 2.086278616516857, + "grad_norm": 14.9375, + "learning_rate": 3.0457388309253762e-06, + "loss": 1.4435, + "step": 71720 + }, + { + "epoch": 2.086860400849405, + "grad_norm": 12.875, + "learning_rate": 3.043799549252761e-06, + "loss": 1.4565, + "step": 71740 + }, + { + "epoch": 2.087442185181953, + "grad_norm": 16.75, + "learning_rate": 3.041860267580146e-06, + "loss": 1.5337, + "step": 71760 + }, + { + "epoch": 2.088023969514501, + "grad_norm": 16.125, + "learning_rate": 3.039920985907531e-06, + "loss": 1.5205, + "step": 71780 + }, + { + "epoch": 2.088605753847049, + "grad_norm": 13.5, + "learning_rate": 3.037981704234916e-06, + "loss": 1.4813, + "step": 71800 + }, + { + "epoch": 2.089187538179597, + "grad_norm": 15.1875, + "learning_rate": 3.0360424225623013e-06, + "loss": 1.5243, + "step": 71820 + }, + { + "epoch": 2.0897693225121445, + "grad_norm": 14.875, + "learning_rate": 3.0341031408896864e-06, + "loss": 1.4947, + "step": 71840 + }, + { + "epoch": 2.0903511068446927, + "grad_norm": 14.5625, + "learning_rate": 3.0321638592170715e-06, + "loss": 1.4967, + "step": 71860 + }, + { + "epoch": 2.0909328911772405, + "grad_norm": 11.875, + "learning_rate": 3.0302245775444566e-06, + "loss": 1.527, + "step": 71880 + }, + { + "epoch": 2.0915146755097886, + "grad_norm": 13.625, + "learning_rate": 3.0282852958718417e-06, + "loss": 1.4975, + "step": 71900 + }, + { + "epoch": 2.0920964598423364, + "grad_norm": 14.875, + "learning_rate": 3.026346014199227e-06, + "loss": 1.535, + "step": 71920 + }, + { + "epoch": 2.0926782441748846, + "grad_norm": 16.375, + "learning_rate": 3.024406732526612e-06, + "loss": 1.5116, + "step": 71940 + }, + { + "epoch": 2.0932600285074323, + "grad_norm": 14.5625, + "learning_rate": 3.022467450853997e-06, + "loss": 1.5866, + "step": 71960 + }, + { + "epoch": 2.09384181283998, + "grad_norm": 12.75, + "learning_rate": 3.020528169181382e-06, + "loss": 1.468, + "step": 71980 + }, + { + "epoch": 2.0944235971725282, + "grad_norm": 11.3125, + "learning_rate": 3.018588887508767e-06, + "loss": 1.5545, + "step": 72000 + }, + { + "epoch": 2.095005381505076, + "grad_norm": 11.6875, + "learning_rate": 3.0166496058361523e-06, + "loss": 1.5561, + "step": 72020 + }, + { + "epoch": 2.095587165837624, + "grad_norm": 16.5, + "learning_rate": 3.0147103241635374e-06, + "loss": 1.4659, + "step": 72040 + }, + { + "epoch": 2.096168950170172, + "grad_norm": 12.875, + "learning_rate": 3.0127710424909225e-06, + "loss": 1.5107, + "step": 72060 + }, + { + "epoch": 2.0967507345027196, + "grad_norm": 12.375, + "learning_rate": 3.0108317608183076e-06, + "loss": 1.4283, + "step": 72080 + }, + { + "epoch": 2.097332518835268, + "grad_norm": 12.25, + "learning_rate": 3.0088924791456927e-06, + "loss": 1.5606, + "step": 72100 + }, + { + "epoch": 2.0979143031678156, + "grad_norm": 11.6875, + "learning_rate": 3.0069531974730778e-06, + "loss": 1.5268, + "step": 72120 + }, + { + "epoch": 2.0984960875003638, + "grad_norm": 13.375, + "learning_rate": 3.0050139158004624e-06, + "loss": 1.502, + "step": 72140 + }, + { + "epoch": 2.0990778718329115, + "grad_norm": 14.375, + "learning_rate": 3.0030746341278475e-06, + "loss": 1.5983, + "step": 72160 + }, + { + "epoch": 2.0996596561654597, + "grad_norm": 13.5, + "learning_rate": 3.0011353524552326e-06, + "loss": 1.586, + "step": 72180 + }, + { + "epoch": 2.1002414404980074, + "grad_norm": 14.625, + "learning_rate": 2.9991960707826177e-06, + "loss": 1.4499, + "step": 72200 + }, + { + "epoch": 2.100823224830555, + "grad_norm": 13.1875, + "learning_rate": 2.997256789110003e-06, + "loss": 1.5428, + "step": 72220 + }, + { + "epoch": 2.1014050091631034, + "grad_norm": 15.4375, + "learning_rate": 2.995317507437388e-06, + "loss": 1.4782, + "step": 72240 + }, + { + "epoch": 2.101986793495651, + "grad_norm": 12.875, + "learning_rate": 2.993378225764773e-06, + "loss": 1.4888, + "step": 72260 + }, + { + "epoch": 2.1025685778281993, + "grad_norm": 11.25, + "learning_rate": 2.991438944092158e-06, + "loss": 1.521, + "step": 72280 + }, + { + "epoch": 2.103150362160747, + "grad_norm": 14.625, + "learning_rate": 2.9894996624195432e-06, + "loss": 1.5041, + "step": 72300 + }, + { + "epoch": 2.1037321464932948, + "grad_norm": 12.6875, + "learning_rate": 2.9875603807469283e-06, + "loss": 1.4956, + "step": 72320 + }, + { + "epoch": 2.104313930825843, + "grad_norm": 12.75, + "learning_rate": 2.9856210990743134e-06, + "loss": 1.4933, + "step": 72340 + }, + { + "epoch": 2.1048957151583907, + "grad_norm": 12.8125, + "learning_rate": 2.9836818174016985e-06, + "loss": 1.5132, + "step": 72360 + }, + { + "epoch": 2.105477499490939, + "grad_norm": 11.75, + "learning_rate": 2.9817425357290836e-06, + "loss": 1.527, + "step": 72380 + }, + { + "epoch": 2.1060592838234866, + "grad_norm": 13.8125, + "learning_rate": 2.9798032540564687e-06, + "loss": 1.4684, + "step": 72400 + }, + { + "epoch": 2.1066410681560344, + "grad_norm": 12.0, + "learning_rate": 2.977863972383854e-06, + "loss": 1.5328, + "step": 72420 + }, + { + "epoch": 2.1072228524885825, + "grad_norm": 14.6875, + "learning_rate": 2.975924690711239e-06, + "loss": 1.4832, + "step": 72440 + }, + { + "epoch": 2.1078046368211303, + "grad_norm": 14.75, + "learning_rate": 2.973985409038624e-06, + "loss": 1.4761, + "step": 72460 + }, + { + "epoch": 2.1083864211536785, + "grad_norm": 14.125, + "learning_rate": 2.972046127366009e-06, + "loss": 1.5431, + "step": 72480 + }, + { + "epoch": 2.108968205486226, + "grad_norm": 15.5, + "learning_rate": 2.970106845693394e-06, + "loss": 1.5485, + "step": 72500 + }, + { + "epoch": 2.109549989818774, + "grad_norm": 11.5625, + "learning_rate": 2.9681675640207793e-06, + "loss": 1.4798, + "step": 72520 + }, + { + "epoch": 2.110131774151322, + "grad_norm": 15.75, + "learning_rate": 2.966228282348164e-06, + "loss": 1.5801, + "step": 72540 + }, + { + "epoch": 2.11071355848387, + "grad_norm": 13.0, + "learning_rate": 2.964289000675549e-06, + "loss": 1.4505, + "step": 72560 + }, + { + "epoch": 2.111295342816418, + "grad_norm": 13.5625, + "learning_rate": 2.9623497190029337e-06, + "loss": 1.4387, + "step": 72580 + }, + { + "epoch": 2.111877127148966, + "grad_norm": 14.75, + "learning_rate": 2.960410437330319e-06, + "loss": 1.4776, + "step": 72600 + }, + { + "epoch": 2.112458911481514, + "grad_norm": 14.75, + "learning_rate": 2.958471155657704e-06, + "loss": 1.4937, + "step": 72620 + }, + { + "epoch": 2.1130406958140617, + "grad_norm": 12.6875, + "learning_rate": 2.956531873985089e-06, + "loss": 1.5073, + "step": 72640 + }, + { + "epoch": 2.1136224801466095, + "grad_norm": 14.375, + "learning_rate": 2.954592592312474e-06, + "loss": 1.4216, + "step": 72660 + }, + { + "epoch": 2.1142042644791577, + "grad_norm": 14.875, + "learning_rate": 2.9526533106398592e-06, + "loss": 1.4707, + "step": 72680 + }, + { + "epoch": 2.1147860488117054, + "grad_norm": 15.0625, + "learning_rate": 2.9507140289672443e-06, + "loss": 1.4336, + "step": 72700 + }, + { + "epoch": 2.1153678331442536, + "grad_norm": 15.125, + "learning_rate": 2.9487747472946294e-06, + "loss": 1.4724, + "step": 72720 + }, + { + "epoch": 2.1159496174768013, + "grad_norm": 15.375, + "learning_rate": 2.9468354656220145e-06, + "loss": 1.4837, + "step": 72740 + }, + { + "epoch": 2.116531401809349, + "grad_norm": 15.0625, + "learning_rate": 2.9448961839493996e-06, + "loss": 1.433, + "step": 72760 + }, + { + "epoch": 2.1171131861418973, + "grad_norm": 13.5, + "learning_rate": 2.9429569022767847e-06, + "loss": 1.5166, + "step": 72780 + }, + { + "epoch": 2.117694970474445, + "grad_norm": 14.625, + "learning_rate": 2.94101762060417e-06, + "loss": 1.4642, + "step": 72800 + }, + { + "epoch": 2.118276754806993, + "grad_norm": 13.25, + "learning_rate": 2.939078338931555e-06, + "loss": 1.4825, + "step": 72820 + }, + { + "epoch": 2.118858539139541, + "grad_norm": 13.0625, + "learning_rate": 2.9371390572589396e-06, + "loss": 1.4745, + "step": 72840 + }, + { + "epoch": 2.119440323472089, + "grad_norm": 10.625, + "learning_rate": 2.9351997755863247e-06, + "loss": 1.4926, + "step": 72860 + }, + { + "epoch": 2.120022107804637, + "grad_norm": 12.3125, + "learning_rate": 2.9332604939137098e-06, + "loss": 1.4766, + "step": 72880 + }, + { + "epoch": 2.1206038921371846, + "grad_norm": 12.25, + "learning_rate": 2.931321212241095e-06, + "loss": 1.4237, + "step": 72900 + }, + { + "epoch": 2.1211856764697328, + "grad_norm": 13.625, + "learning_rate": 2.92938193056848e-06, + "loss": 1.4522, + "step": 72920 + }, + { + "epoch": 2.1217674608022805, + "grad_norm": 11.0625, + "learning_rate": 2.927442648895865e-06, + "loss": 1.4988, + "step": 72940 + }, + { + "epoch": 2.1223492451348287, + "grad_norm": 12.3125, + "learning_rate": 2.92550336722325e-06, + "loss": 1.5132, + "step": 72960 + }, + { + "epoch": 2.1229310294673764, + "grad_norm": 14.75, + "learning_rate": 2.9235640855506353e-06, + "loss": 1.4619, + "step": 72980 + }, + { + "epoch": 2.123512813799924, + "grad_norm": 12.1875, + "learning_rate": 2.9216248038780204e-06, + "loss": 1.4034, + "step": 73000 + }, + { + "epoch": 2.1240945981324724, + "grad_norm": 14.6875, + "learning_rate": 2.9196855222054055e-06, + "loss": 1.5266, + "step": 73020 + }, + { + "epoch": 2.12467638246502, + "grad_norm": 14.5625, + "learning_rate": 2.9177462405327906e-06, + "loss": 1.5114, + "step": 73040 + }, + { + "epoch": 2.1252581667975683, + "grad_norm": 14.3125, + "learning_rate": 2.9158069588601757e-06, + "loss": 1.4811, + "step": 73060 + }, + { + "epoch": 2.125839951130116, + "grad_norm": 17.25, + "learning_rate": 2.9138676771875608e-06, + "loss": 1.5085, + "step": 73080 + }, + { + "epoch": 2.1264217354626638, + "grad_norm": 14.1875, + "learning_rate": 2.911928395514946e-06, + "loss": 1.4951, + "step": 73100 + }, + { + "epoch": 2.127003519795212, + "grad_norm": 12.8125, + "learning_rate": 2.909989113842331e-06, + "loss": 1.4602, + "step": 73120 + }, + { + "epoch": 2.1275853041277597, + "grad_norm": 14.0, + "learning_rate": 2.908049832169716e-06, + "loss": 1.4742, + "step": 73140 + }, + { + "epoch": 2.128167088460308, + "grad_norm": 14.4375, + "learning_rate": 2.906110550497101e-06, + "loss": 1.5108, + "step": 73160 + }, + { + "epoch": 2.1287488727928556, + "grad_norm": 13.3125, + "learning_rate": 2.9041712688244862e-06, + "loss": 1.5393, + "step": 73180 + }, + { + "epoch": 2.129330657125404, + "grad_norm": 12.625, + "learning_rate": 2.9022319871518713e-06, + "loss": 1.4763, + "step": 73200 + }, + { + "epoch": 2.1299124414579516, + "grad_norm": 12.5, + "learning_rate": 2.9002927054792564e-06, + "loss": 1.4555, + "step": 73220 + }, + { + "epoch": 2.1304942257904993, + "grad_norm": 13.375, + "learning_rate": 2.898353423806641e-06, + "loss": 1.5222, + "step": 73240 + }, + { + "epoch": 2.1310760101230475, + "grad_norm": 15.5, + "learning_rate": 2.896414142134026e-06, + "loss": 1.5048, + "step": 73260 + }, + { + "epoch": 2.1316577944555952, + "grad_norm": 15.625, + "learning_rate": 2.8944748604614113e-06, + "loss": 1.5817, + "step": 73280 + }, + { + "epoch": 2.1322395787881434, + "grad_norm": 14.125, + "learning_rate": 2.8925355787887964e-06, + "loss": 1.4889, + "step": 73300 + }, + { + "epoch": 2.132821363120691, + "grad_norm": 14.1875, + "learning_rate": 2.8905962971161815e-06, + "loss": 1.5541, + "step": 73320 + }, + { + "epoch": 2.133403147453239, + "grad_norm": 13.875, + "learning_rate": 2.8886570154435666e-06, + "loss": 1.5182, + "step": 73340 + }, + { + "epoch": 2.133984931785787, + "grad_norm": 13.25, + "learning_rate": 2.8867177337709517e-06, + "loss": 1.5519, + "step": 73360 + }, + { + "epoch": 2.134566716118335, + "grad_norm": 14.375, + "learning_rate": 2.884778452098337e-06, + "loss": 1.4621, + "step": 73380 + }, + { + "epoch": 2.135148500450883, + "grad_norm": 10.0, + "learning_rate": 2.882839170425722e-06, + "loss": 1.441, + "step": 73400 + }, + { + "epoch": 2.1357302847834307, + "grad_norm": 12.9375, + "learning_rate": 2.880899888753107e-06, + "loss": 1.4363, + "step": 73420 + }, + { + "epoch": 2.136312069115979, + "grad_norm": 15.875, + "learning_rate": 2.878960607080492e-06, + "loss": 1.5074, + "step": 73440 + }, + { + "epoch": 2.1368938534485267, + "grad_norm": 14.5, + "learning_rate": 2.877021325407877e-06, + "loss": 1.4954, + "step": 73460 + }, + { + "epoch": 2.1374756377810744, + "grad_norm": 17.375, + "learning_rate": 2.8750820437352623e-06, + "loss": 1.6136, + "step": 73480 + }, + { + "epoch": 2.1380574221136226, + "grad_norm": 12.9375, + "learning_rate": 2.8731427620626474e-06, + "loss": 1.4908, + "step": 73500 + }, + { + "epoch": 2.1386392064461703, + "grad_norm": 13.8125, + "learning_rate": 2.8712034803900325e-06, + "loss": 1.4283, + "step": 73520 + }, + { + "epoch": 2.1392209907787185, + "grad_norm": 12.375, + "learning_rate": 2.8692641987174176e-06, + "loss": 1.4775, + "step": 73540 + }, + { + "epoch": 2.1398027751112663, + "grad_norm": 14.1875, + "learning_rate": 2.8673249170448027e-06, + "loss": 1.562, + "step": 73560 + }, + { + "epoch": 2.140384559443814, + "grad_norm": 14.0, + "learning_rate": 2.8653856353721878e-06, + "loss": 1.4407, + "step": 73580 + }, + { + "epoch": 2.140966343776362, + "grad_norm": 12.5625, + "learning_rate": 2.863446353699573e-06, + "loss": 1.5493, + "step": 73600 + }, + { + "epoch": 2.14154812810891, + "grad_norm": 12.25, + "learning_rate": 2.861507072026958e-06, + "loss": 1.4486, + "step": 73620 + }, + { + "epoch": 2.142129912441458, + "grad_norm": 14.75, + "learning_rate": 2.8595677903543426e-06, + "loss": 1.5861, + "step": 73640 + }, + { + "epoch": 2.142711696774006, + "grad_norm": 12.375, + "learning_rate": 2.8576285086817277e-06, + "loss": 1.5289, + "step": 73660 + }, + { + "epoch": 2.1432934811065536, + "grad_norm": 14.0, + "learning_rate": 2.855689227009113e-06, + "loss": 1.5793, + "step": 73680 + }, + { + "epoch": 2.143875265439102, + "grad_norm": 13.5625, + "learning_rate": 2.853749945336498e-06, + "loss": 1.481, + "step": 73700 + }, + { + "epoch": 2.1444570497716495, + "grad_norm": 15.9375, + "learning_rate": 2.851810663663883e-06, + "loss": 1.4722, + "step": 73720 + }, + { + "epoch": 2.1450388341041977, + "grad_norm": 12.1875, + "learning_rate": 2.849871381991268e-06, + "loss": 1.3964, + "step": 73740 + }, + { + "epoch": 2.1456206184367455, + "grad_norm": 17.0, + "learning_rate": 2.8479321003186532e-06, + "loss": 1.4954, + "step": 73760 + }, + { + "epoch": 2.146202402769293, + "grad_norm": 12.8125, + "learning_rate": 2.8459928186460383e-06, + "loss": 1.5448, + "step": 73780 + }, + { + "epoch": 2.1467841871018414, + "grad_norm": 13.125, + "learning_rate": 2.8440535369734234e-06, + "loss": 1.5075, + "step": 73800 + }, + { + "epoch": 2.147365971434389, + "grad_norm": 14.125, + "learning_rate": 2.8421142553008085e-06, + "loss": 1.485, + "step": 73820 + }, + { + "epoch": 2.1479477557669373, + "grad_norm": 12.9375, + "learning_rate": 2.8401749736281936e-06, + "loss": 1.5189, + "step": 73840 + }, + { + "epoch": 2.148529540099485, + "grad_norm": 13.5, + "learning_rate": 2.8382356919555787e-06, + "loss": 1.5812, + "step": 73860 + }, + { + "epoch": 2.1491113244320332, + "grad_norm": 13.125, + "learning_rate": 2.836296410282964e-06, + "loss": 1.499, + "step": 73880 + }, + { + "epoch": 2.149693108764581, + "grad_norm": 15.9375, + "learning_rate": 2.834357128610349e-06, + "loss": 1.4625, + "step": 73900 + }, + { + "epoch": 2.1502748930971287, + "grad_norm": 13.1875, + "learning_rate": 2.832417846937734e-06, + "loss": 1.4651, + "step": 73920 + }, + { + "epoch": 2.150856677429677, + "grad_norm": 10.4375, + "learning_rate": 2.830478565265119e-06, + "loss": 1.5115, + "step": 73940 + }, + { + "epoch": 2.1514384617622246, + "grad_norm": 12.1875, + "learning_rate": 2.828539283592504e-06, + "loss": 1.4763, + "step": 73960 + }, + { + "epoch": 2.152020246094773, + "grad_norm": 11.625, + "learning_rate": 2.8266000019198893e-06, + "loss": 1.5031, + "step": 73980 + }, + { + "epoch": 2.1526020304273206, + "grad_norm": 14.6875, + "learning_rate": 2.8246607202472744e-06, + "loss": 1.5051, + "step": 74000 + }, + { + "epoch": 2.1531838147598683, + "grad_norm": 14.875, + "learning_rate": 2.8227214385746595e-06, + "loss": 1.4666, + "step": 74020 + }, + { + "epoch": 2.1537655990924165, + "grad_norm": 12.25, + "learning_rate": 2.8207821569020446e-06, + "loss": 1.4348, + "step": 74040 + }, + { + "epoch": 2.1543473834249642, + "grad_norm": 13.0, + "learning_rate": 2.8188428752294293e-06, + "loss": 1.4955, + "step": 74060 + }, + { + "epoch": 2.1549291677575124, + "grad_norm": 16.25, + "learning_rate": 2.8169035935568144e-06, + "loss": 1.5198, + "step": 74080 + }, + { + "epoch": 2.15551095209006, + "grad_norm": 13.5, + "learning_rate": 2.8149643118841995e-06, + "loss": 1.4427, + "step": 74100 + }, + { + "epoch": 2.1560927364226083, + "grad_norm": 13.625, + "learning_rate": 2.8130250302115845e-06, + "loss": 1.4765, + "step": 74120 + }, + { + "epoch": 2.156674520755156, + "grad_norm": 11.125, + "learning_rate": 2.8110857485389696e-06, + "loss": 1.4595, + "step": 74140 + }, + { + "epoch": 2.157256305087704, + "grad_norm": 13.5625, + "learning_rate": 2.8091464668663547e-06, + "loss": 1.5147, + "step": 74160 + }, + { + "epoch": 2.157838089420252, + "grad_norm": 12.8125, + "learning_rate": 2.80720718519374e-06, + "loss": 1.5748, + "step": 74180 + }, + { + "epoch": 2.1584198737527998, + "grad_norm": 13.4375, + "learning_rate": 2.805267903521125e-06, + "loss": 1.5331, + "step": 74200 + }, + { + "epoch": 2.159001658085348, + "grad_norm": 14.4375, + "learning_rate": 2.80332862184851e-06, + "loss": 1.4877, + "step": 74220 + }, + { + "epoch": 2.1595834424178957, + "grad_norm": 13.25, + "learning_rate": 2.801389340175895e-06, + "loss": 1.4491, + "step": 74240 + }, + { + "epoch": 2.1601652267504434, + "grad_norm": 14.125, + "learning_rate": 2.7994500585032802e-06, + "loss": 1.4192, + "step": 74260 + }, + { + "epoch": 2.1607470110829916, + "grad_norm": 15.125, + "learning_rate": 2.7975107768306653e-06, + "loss": 1.5257, + "step": 74280 + }, + { + "epoch": 2.1613287954155393, + "grad_norm": 13.75, + "learning_rate": 2.7955714951580504e-06, + "loss": 1.4681, + "step": 74300 + }, + { + "epoch": 2.1619105797480875, + "grad_norm": 17.0, + "learning_rate": 2.7936322134854355e-06, + "loss": 1.5076, + "step": 74320 + }, + { + "epoch": 2.1624923640806353, + "grad_norm": 12.625, + "learning_rate": 2.7916929318128206e-06, + "loss": 1.5422, + "step": 74340 + }, + { + "epoch": 2.163074148413183, + "grad_norm": 13.9375, + "learning_rate": 2.7897536501402057e-06, + "loss": 1.5043, + "step": 74360 + }, + { + "epoch": 2.163655932745731, + "grad_norm": 13.5, + "learning_rate": 2.787814368467591e-06, + "loss": 1.5271, + "step": 74380 + }, + { + "epoch": 2.164237717078279, + "grad_norm": 13.875, + "learning_rate": 2.785875086794976e-06, + "loss": 1.3912, + "step": 74400 + }, + { + "epoch": 2.164819501410827, + "grad_norm": 20.125, + "learning_rate": 2.783935805122361e-06, + "loss": 1.5108, + "step": 74420 + }, + { + "epoch": 2.165401285743375, + "grad_norm": 15.6875, + "learning_rate": 2.781996523449746e-06, + "loss": 1.5373, + "step": 74440 + }, + { + "epoch": 2.165983070075923, + "grad_norm": 12.25, + "learning_rate": 2.7800572417771308e-06, + "loss": 1.5474, + "step": 74460 + }, + { + "epoch": 2.166564854408471, + "grad_norm": 13.6875, + "learning_rate": 2.778117960104516e-06, + "loss": 1.4762, + "step": 74480 + }, + { + "epoch": 2.1671466387410185, + "grad_norm": 13.1875, + "learning_rate": 2.776178678431901e-06, + "loss": 1.485, + "step": 74500 + }, + { + "epoch": 2.1677284230735667, + "grad_norm": 13.375, + "learning_rate": 2.774239396759286e-06, + "loss": 1.4827, + "step": 74520 + }, + { + "epoch": 2.1683102074061145, + "grad_norm": 15.0, + "learning_rate": 2.772300115086671e-06, + "loss": 1.4605, + "step": 74540 + }, + { + "epoch": 2.1688919917386627, + "grad_norm": 14.1875, + "learning_rate": 2.7703608334140563e-06, + "loss": 1.4337, + "step": 74560 + }, + { + "epoch": 2.1694737760712104, + "grad_norm": 13.625, + "learning_rate": 2.7684215517414414e-06, + "loss": 1.4334, + "step": 74580 + }, + { + "epoch": 2.170055560403758, + "grad_norm": 13.1875, + "learning_rate": 2.7664822700688265e-06, + "loss": 1.5381, + "step": 74600 + }, + { + "epoch": 2.1706373447363063, + "grad_norm": 14.0625, + "learning_rate": 2.7645429883962116e-06, + "loss": 1.4843, + "step": 74620 + }, + { + "epoch": 2.171219129068854, + "grad_norm": 14.5, + "learning_rate": 2.7626037067235967e-06, + "loss": 1.5029, + "step": 74640 + }, + { + "epoch": 2.1718009134014022, + "grad_norm": 11.8125, + "learning_rate": 2.7606644250509818e-06, + "loss": 1.4978, + "step": 74660 + }, + { + "epoch": 2.17238269773395, + "grad_norm": 13.625, + "learning_rate": 2.758725143378367e-06, + "loss": 1.537, + "step": 74680 + }, + { + "epoch": 2.172964482066498, + "grad_norm": 14.5, + "learning_rate": 2.7567858617057515e-06, + "loss": 1.5458, + "step": 74700 + }, + { + "epoch": 2.173546266399046, + "grad_norm": 15.875, + "learning_rate": 2.7548465800331366e-06, + "loss": 1.544, + "step": 74720 + }, + { + "epoch": 2.1741280507315937, + "grad_norm": 12.625, + "learning_rate": 2.7529072983605213e-06, + "loss": 1.4829, + "step": 74740 + }, + { + "epoch": 2.174709835064142, + "grad_norm": 15.5, + "learning_rate": 2.7509680166879064e-06, + "loss": 1.5048, + "step": 74760 + }, + { + "epoch": 2.1752916193966896, + "grad_norm": 13.3125, + "learning_rate": 2.7490287350152915e-06, + "loss": 1.5169, + "step": 74780 + }, + { + "epoch": 2.1758734037292378, + "grad_norm": 12.1875, + "learning_rate": 2.7470894533426766e-06, + "loss": 1.5291, + "step": 74800 + }, + { + "epoch": 2.1764551880617855, + "grad_norm": 16.125, + "learning_rate": 2.7451501716700617e-06, + "loss": 1.5344, + "step": 74820 + }, + { + "epoch": 2.1770369723943332, + "grad_norm": 11.375, + "learning_rate": 2.743210889997447e-06, + "loss": 1.558, + "step": 74840 + }, + { + "epoch": 2.1776187567268814, + "grad_norm": 10.875, + "learning_rate": 2.741271608324832e-06, + "loss": 1.4821, + "step": 74860 + }, + { + "epoch": 2.178200541059429, + "grad_norm": 13.4375, + "learning_rate": 2.739332326652217e-06, + "loss": 1.4888, + "step": 74880 + }, + { + "epoch": 2.1787823253919774, + "grad_norm": 14.8125, + "learning_rate": 2.737393044979602e-06, + "loss": 1.5181, + "step": 74900 + }, + { + "epoch": 2.179364109724525, + "grad_norm": 13.625, + "learning_rate": 2.735453763306987e-06, + "loss": 1.5534, + "step": 74920 + }, + { + "epoch": 2.179945894057073, + "grad_norm": 15.8125, + "learning_rate": 2.7335144816343723e-06, + "loss": 1.4914, + "step": 74940 + }, + { + "epoch": 2.180527678389621, + "grad_norm": 12.5, + "learning_rate": 2.7315751999617574e-06, + "loss": 1.4862, + "step": 74960 + }, + { + "epoch": 2.1811094627221688, + "grad_norm": 22.5, + "learning_rate": 2.7296359182891425e-06, + "loss": 1.5262, + "step": 74980 + }, + { + "epoch": 2.181691247054717, + "grad_norm": 13.8125, + "learning_rate": 2.7276966366165276e-06, + "loss": 1.4733, + "step": 75000 + }, + { + "epoch": 2.1822730313872647, + "grad_norm": 13.1875, + "learning_rate": 2.7257573549439127e-06, + "loss": 1.3873, + "step": 75020 + }, + { + "epoch": 2.1828548157198124, + "grad_norm": 11.375, + "learning_rate": 2.7238180732712978e-06, + "loss": 1.5585, + "step": 75040 + }, + { + "epoch": 2.1834366000523606, + "grad_norm": 18.25, + "learning_rate": 2.721878791598683e-06, + "loss": 1.5439, + "step": 75060 + }, + { + "epoch": 2.1840183843849084, + "grad_norm": 12.5, + "learning_rate": 2.719939509926068e-06, + "loss": 1.4403, + "step": 75080 + }, + { + "epoch": 2.1846001687174565, + "grad_norm": 12.625, + "learning_rate": 2.718000228253453e-06, + "loss": 1.4441, + "step": 75100 + }, + { + "epoch": 2.1851819530500043, + "grad_norm": 14.0625, + "learning_rate": 2.716060946580838e-06, + "loss": 1.6107, + "step": 75120 + }, + { + "epoch": 2.1857637373825525, + "grad_norm": 13.0, + "learning_rate": 2.7141216649082233e-06, + "loss": 1.4784, + "step": 75140 + }, + { + "epoch": 2.1863455217151, + "grad_norm": 15.6875, + "learning_rate": 2.712182383235608e-06, + "loss": 1.5381, + "step": 75160 + }, + { + "epoch": 2.186927306047648, + "grad_norm": 13.1875, + "learning_rate": 2.710243101562993e-06, + "loss": 1.4521, + "step": 75180 + }, + { + "epoch": 2.187509090380196, + "grad_norm": 11.9375, + "learning_rate": 2.708303819890378e-06, + "loss": 1.4395, + "step": 75200 + }, + { + "epoch": 2.188090874712744, + "grad_norm": 12.75, + "learning_rate": 2.7063645382177632e-06, + "loss": 1.5815, + "step": 75220 + }, + { + "epoch": 2.188672659045292, + "grad_norm": 10.625, + "learning_rate": 2.7044252565451483e-06, + "loss": 1.4308, + "step": 75240 + }, + { + "epoch": 2.18925444337784, + "grad_norm": 14.75, + "learning_rate": 2.7024859748725334e-06, + "loss": 1.5653, + "step": 75260 + }, + { + "epoch": 2.189836227710388, + "grad_norm": 12.25, + "learning_rate": 2.7005466931999185e-06, + "loss": 1.5253, + "step": 75280 + }, + { + "epoch": 2.1904180120429357, + "grad_norm": 14.25, + "learning_rate": 2.6986074115273036e-06, + "loss": 1.4762, + "step": 75300 + }, + { + "epoch": 2.1909997963754835, + "grad_norm": 11.1875, + "learning_rate": 2.6966681298546887e-06, + "loss": 1.5231, + "step": 75320 + }, + { + "epoch": 2.1915815807080317, + "grad_norm": 12.4375, + "learning_rate": 2.694728848182074e-06, + "loss": 1.566, + "step": 75340 + }, + { + "epoch": 2.1921633650405794, + "grad_norm": 11.75, + "learning_rate": 2.692789566509459e-06, + "loss": 1.5473, + "step": 75360 + }, + { + "epoch": 2.1927451493731276, + "grad_norm": 13.625, + "learning_rate": 2.690850284836844e-06, + "loss": 1.4593, + "step": 75380 + }, + { + "epoch": 2.1933269337056753, + "grad_norm": 14.3125, + "learning_rate": 2.688911003164229e-06, + "loss": 1.4661, + "step": 75400 + }, + { + "epoch": 2.193908718038223, + "grad_norm": 13.3125, + "learning_rate": 2.686971721491614e-06, + "loss": 1.4729, + "step": 75420 + }, + { + "epoch": 2.1944905023707713, + "grad_norm": 14.4375, + "learning_rate": 2.6850324398189993e-06, + "loss": 1.4975, + "step": 75440 + }, + { + "epoch": 2.195072286703319, + "grad_norm": 11.375, + "learning_rate": 2.6830931581463844e-06, + "loss": 1.4666, + "step": 75460 + }, + { + "epoch": 2.195654071035867, + "grad_norm": 14.4375, + "learning_rate": 2.6811538764737695e-06, + "loss": 1.4428, + "step": 75480 + }, + { + "epoch": 2.196235855368415, + "grad_norm": 14.0625, + "learning_rate": 2.6792145948011546e-06, + "loss": 1.5145, + "step": 75500 + }, + { + "epoch": 2.1968176397009627, + "grad_norm": 15.25, + "learning_rate": 2.6772753131285397e-06, + "loss": 1.5086, + "step": 75520 + }, + { + "epoch": 2.197399424033511, + "grad_norm": 13.5, + "learning_rate": 2.6753360314559248e-06, + "loss": 1.4997, + "step": 75540 + }, + { + "epoch": 2.1979812083660586, + "grad_norm": 14.3125, + "learning_rate": 2.6733967497833095e-06, + "loss": 1.4753, + "step": 75560 + }, + { + "epoch": 2.1985629926986068, + "grad_norm": 12.75, + "learning_rate": 2.6714574681106945e-06, + "loss": 1.5529, + "step": 75580 + }, + { + "epoch": 2.1991447770311545, + "grad_norm": 13.4375, + "learning_rate": 2.6695181864380796e-06, + "loss": 1.5082, + "step": 75600 + }, + { + "epoch": 2.1997265613637023, + "grad_norm": 12.0625, + "learning_rate": 2.6675789047654647e-06, + "loss": 1.6554, + "step": 75620 + }, + { + "epoch": 2.2003083456962504, + "grad_norm": 14.5, + "learning_rate": 2.66563962309285e-06, + "loss": 1.5155, + "step": 75640 + }, + { + "epoch": 2.200890130028798, + "grad_norm": 10.625, + "learning_rate": 2.663700341420235e-06, + "loss": 1.4859, + "step": 75660 + }, + { + "epoch": 2.2014719143613464, + "grad_norm": 13.0625, + "learning_rate": 2.66176105974762e-06, + "loss": 1.4973, + "step": 75680 + }, + { + "epoch": 2.202053698693894, + "grad_norm": 11.9375, + "learning_rate": 2.659821778075005e-06, + "loss": 1.4262, + "step": 75700 + }, + { + "epoch": 2.2026354830264423, + "grad_norm": 14.0625, + "learning_rate": 2.6578824964023902e-06, + "loss": 1.4614, + "step": 75720 + }, + { + "epoch": 2.20321726735899, + "grad_norm": 12.0, + "learning_rate": 2.6559432147297753e-06, + "loss": 1.5203, + "step": 75740 + }, + { + "epoch": 2.203799051691538, + "grad_norm": 13.0625, + "learning_rate": 2.6540039330571604e-06, + "loss": 1.5362, + "step": 75760 + }, + { + "epoch": 2.204380836024086, + "grad_norm": 12.8125, + "learning_rate": 2.6520646513845455e-06, + "loss": 1.5148, + "step": 75780 + }, + { + "epoch": 2.2049626203566337, + "grad_norm": 8.5625, + "learning_rate": 2.6501253697119306e-06, + "loss": 1.4596, + "step": 75800 + }, + { + "epoch": 2.205544404689182, + "grad_norm": 14.0, + "learning_rate": 2.6481860880393157e-06, + "loss": 1.5355, + "step": 75820 + }, + { + "epoch": 2.2061261890217296, + "grad_norm": 14.75, + "learning_rate": 2.646246806366701e-06, + "loss": 1.5136, + "step": 75840 + }, + { + "epoch": 2.2067079733542774, + "grad_norm": 11.4375, + "learning_rate": 2.644307524694086e-06, + "loss": 1.4803, + "step": 75860 + }, + { + "epoch": 2.2072897576868256, + "grad_norm": 12.9375, + "learning_rate": 2.642368243021471e-06, + "loss": 1.5108, + "step": 75880 + }, + { + "epoch": 2.2078715420193733, + "grad_norm": 13.1875, + "learning_rate": 2.640428961348856e-06, + "loss": 1.4315, + "step": 75900 + }, + { + "epoch": 2.2084533263519215, + "grad_norm": 12.3125, + "learning_rate": 2.638489679676241e-06, + "loss": 1.487, + "step": 75920 + }, + { + "epoch": 2.2090351106844692, + "grad_norm": 11.9375, + "learning_rate": 2.6365503980036263e-06, + "loss": 1.486, + "step": 75940 + }, + { + "epoch": 2.2096168950170174, + "grad_norm": 11.9375, + "learning_rate": 2.634611116331011e-06, + "loss": 1.574, + "step": 75960 + }, + { + "epoch": 2.210198679349565, + "grad_norm": 13.8125, + "learning_rate": 2.632671834658396e-06, + "loss": 1.4623, + "step": 75980 + }, + { + "epoch": 2.210780463682113, + "grad_norm": 13.125, + "learning_rate": 2.630732552985781e-06, + "loss": 1.4845, + "step": 76000 + }, + { + "epoch": 2.211362248014661, + "grad_norm": 16.125, + "learning_rate": 2.6287932713131663e-06, + "loss": 1.4634, + "step": 76020 + }, + { + "epoch": 2.211944032347209, + "grad_norm": 15.5625, + "learning_rate": 2.6268539896405514e-06, + "loss": 1.4881, + "step": 76040 + }, + { + "epoch": 2.212525816679757, + "grad_norm": 14.8125, + "learning_rate": 2.6249147079679365e-06, + "loss": 1.5763, + "step": 76060 + }, + { + "epoch": 2.2131076010123047, + "grad_norm": 10.8125, + "learning_rate": 2.6229754262953216e-06, + "loss": 1.3982, + "step": 76080 + }, + { + "epoch": 2.2136893853448525, + "grad_norm": 11.9375, + "learning_rate": 2.6210361446227067e-06, + "loss": 1.5016, + "step": 76100 + }, + { + "epoch": 2.2142711696774007, + "grad_norm": 13.5625, + "learning_rate": 2.6190968629500918e-06, + "loss": 1.5301, + "step": 76120 + }, + { + "epoch": 2.2148529540099484, + "grad_norm": 14.25, + "learning_rate": 2.617157581277477e-06, + "loss": 1.5179, + "step": 76140 + }, + { + "epoch": 2.2154347383424966, + "grad_norm": 10.9375, + "learning_rate": 2.615218299604862e-06, + "loss": 1.4687, + "step": 76160 + }, + { + "epoch": 2.2160165226750443, + "grad_norm": 12.75, + "learning_rate": 2.613279017932247e-06, + "loss": 1.4994, + "step": 76180 + }, + { + "epoch": 2.216598307007592, + "grad_norm": 12.4375, + "learning_rate": 2.611339736259632e-06, + "loss": 1.4692, + "step": 76200 + }, + { + "epoch": 2.2171800913401403, + "grad_norm": 13.875, + "learning_rate": 2.6094004545870172e-06, + "loss": 1.5165, + "step": 76220 + }, + { + "epoch": 2.217761875672688, + "grad_norm": 13.1875, + "learning_rate": 2.6074611729144023e-06, + "loss": 1.4748, + "step": 76240 + }, + { + "epoch": 2.218343660005236, + "grad_norm": 14.625, + "learning_rate": 2.6055218912417874e-06, + "loss": 1.4563, + "step": 76260 + }, + { + "epoch": 2.218925444337784, + "grad_norm": 13.0625, + "learning_rate": 2.6035826095691725e-06, + "loss": 1.5219, + "step": 76280 + }, + { + "epoch": 2.2195072286703317, + "grad_norm": 14.0, + "learning_rate": 2.6016433278965576e-06, + "loss": 1.5895, + "step": 76300 + }, + { + "epoch": 2.22008901300288, + "grad_norm": 14.5625, + "learning_rate": 2.5997040462239427e-06, + "loss": 1.534, + "step": 76320 + }, + { + "epoch": 2.2206707973354276, + "grad_norm": 13.3125, + "learning_rate": 2.597764764551328e-06, + "loss": 1.4324, + "step": 76340 + }, + { + "epoch": 2.221252581667976, + "grad_norm": 13.5, + "learning_rate": 2.595825482878713e-06, + "loss": 1.5725, + "step": 76360 + }, + { + "epoch": 2.2218343660005235, + "grad_norm": 15.125, + "learning_rate": 2.5938862012060976e-06, + "loss": 1.4665, + "step": 76380 + }, + { + "epoch": 2.2224161503330717, + "grad_norm": 15.875, + "learning_rate": 2.5919469195334827e-06, + "loss": 1.5412, + "step": 76400 + }, + { + "epoch": 2.2229979346656195, + "grad_norm": 12.875, + "learning_rate": 2.590007637860868e-06, + "loss": 1.452, + "step": 76420 + }, + { + "epoch": 2.223579718998167, + "grad_norm": 12.9375, + "learning_rate": 2.588068356188253e-06, + "loss": 1.5128, + "step": 76440 + }, + { + "epoch": 2.2241615033307154, + "grad_norm": 13.0625, + "learning_rate": 2.586129074515638e-06, + "loss": 1.5441, + "step": 76460 + }, + { + "epoch": 2.224743287663263, + "grad_norm": 11.625, + "learning_rate": 2.584189792843023e-06, + "loss": 1.5348, + "step": 76480 + }, + { + "epoch": 2.2253250719958113, + "grad_norm": 12.625, + "learning_rate": 2.582250511170408e-06, + "loss": 1.4825, + "step": 76500 + }, + { + "epoch": 2.225906856328359, + "grad_norm": 12.3125, + "learning_rate": 2.5803112294977933e-06, + "loss": 1.5192, + "step": 76520 + }, + { + "epoch": 2.2264886406609072, + "grad_norm": 15.9375, + "learning_rate": 2.5783719478251784e-06, + "loss": 1.5104, + "step": 76540 + }, + { + "epoch": 2.227070424993455, + "grad_norm": 12.4375, + "learning_rate": 2.5764326661525635e-06, + "loss": 1.4994, + "step": 76560 + }, + { + "epoch": 2.2276522093260027, + "grad_norm": 14.5, + "learning_rate": 2.5744933844799486e-06, + "loss": 1.5483, + "step": 76580 + }, + { + "epoch": 2.228233993658551, + "grad_norm": 16.75, + "learning_rate": 2.5725541028073337e-06, + "loss": 1.4975, + "step": 76600 + }, + { + "epoch": 2.2288157779910986, + "grad_norm": 13.0, + "learning_rate": 2.5706148211347188e-06, + "loss": 1.4466, + "step": 76620 + }, + { + "epoch": 2.229397562323647, + "grad_norm": 12.0, + "learning_rate": 2.568675539462104e-06, + "loss": 1.444, + "step": 76640 + }, + { + "epoch": 2.2299793466561946, + "grad_norm": 15.0, + "learning_rate": 2.566736257789489e-06, + "loss": 1.4896, + "step": 76660 + }, + { + "epoch": 2.2305611309887423, + "grad_norm": 13.5625, + "learning_rate": 2.564796976116874e-06, + "loss": 1.4873, + "step": 76680 + }, + { + "epoch": 2.2311429153212905, + "grad_norm": 15.0625, + "learning_rate": 2.562857694444259e-06, + "loss": 1.4281, + "step": 76700 + }, + { + "epoch": 2.2317246996538382, + "grad_norm": 13.8125, + "learning_rate": 2.5609184127716443e-06, + "loss": 1.5257, + "step": 76720 + }, + { + "epoch": 2.2323064839863864, + "grad_norm": 14.6875, + "learning_rate": 2.5589791310990294e-06, + "loss": 1.4407, + "step": 76740 + }, + { + "epoch": 2.232888268318934, + "grad_norm": 12.8125, + "learning_rate": 2.5570398494264145e-06, + "loss": 1.4499, + "step": 76760 + }, + { + "epoch": 2.233470052651482, + "grad_norm": 16.25, + "learning_rate": 2.555100567753799e-06, + "loss": 1.4619, + "step": 76780 + }, + { + "epoch": 2.23405183698403, + "grad_norm": 13.5625, + "learning_rate": 2.5531612860811842e-06, + "loss": 1.5441, + "step": 76800 + }, + { + "epoch": 2.234633621316578, + "grad_norm": 11.125, + "learning_rate": 2.551222004408569e-06, + "loss": 1.5077, + "step": 76820 + }, + { + "epoch": 2.235215405649126, + "grad_norm": 12.125, + "learning_rate": 2.549282722735954e-06, + "loss": 1.5183, + "step": 76840 + }, + { + "epoch": 2.2357971899816738, + "grad_norm": 14.3125, + "learning_rate": 2.547343441063339e-06, + "loss": 1.4919, + "step": 76860 + }, + { + "epoch": 2.2363789743142215, + "grad_norm": 16.0, + "learning_rate": 2.545404159390724e-06, + "loss": 1.544, + "step": 76880 + }, + { + "epoch": 2.2369607586467697, + "grad_norm": 13.4375, + "learning_rate": 2.5434648777181093e-06, + "loss": 1.4385, + "step": 76900 + }, + { + "epoch": 2.2375425429793174, + "grad_norm": 12.375, + "learning_rate": 2.5415255960454944e-06, + "loss": 1.5499, + "step": 76920 + }, + { + "epoch": 2.2381243273118656, + "grad_norm": 11.75, + "learning_rate": 2.5395863143728795e-06, + "loss": 1.5124, + "step": 76940 + }, + { + "epoch": 2.2387061116444134, + "grad_norm": 13.875, + "learning_rate": 2.5376470327002646e-06, + "loss": 1.5421, + "step": 76960 + }, + { + "epoch": 2.2392878959769615, + "grad_norm": 14.1875, + "learning_rate": 2.5357077510276497e-06, + "loss": 1.5072, + "step": 76980 + }, + { + "epoch": 2.2398696803095093, + "grad_norm": 13.0, + "learning_rate": 2.5337684693550348e-06, + "loss": 1.5009, + "step": 77000 + }, + { + "epoch": 2.240451464642057, + "grad_norm": 13.1875, + "learning_rate": 2.53182918768242e-06, + "loss": 1.4685, + "step": 77020 + }, + { + "epoch": 2.241033248974605, + "grad_norm": 13.0, + "learning_rate": 2.529889906009805e-06, + "loss": 1.5346, + "step": 77040 + }, + { + "epoch": 2.241615033307153, + "grad_norm": 11.875, + "learning_rate": 2.5279506243371896e-06, + "loss": 1.4772, + "step": 77060 + }, + { + "epoch": 2.242196817639701, + "grad_norm": 14.5625, + "learning_rate": 2.5260113426645747e-06, + "loss": 1.5247, + "step": 77080 + }, + { + "epoch": 2.242778601972249, + "grad_norm": 14.5, + "learning_rate": 2.52407206099196e-06, + "loss": 1.5067, + "step": 77100 + }, + { + "epoch": 2.2433603863047966, + "grad_norm": 13.25, + "learning_rate": 2.522132779319345e-06, + "loss": 1.5404, + "step": 77120 + }, + { + "epoch": 2.243942170637345, + "grad_norm": 11.0625, + "learning_rate": 2.52019349764673e-06, + "loss": 1.4202, + "step": 77140 + }, + { + "epoch": 2.2445239549698925, + "grad_norm": 12.5625, + "learning_rate": 2.518254215974115e-06, + "loss": 1.512, + "step": 77160 + }, + { + "epoch": 2.2451057393024407, + "grad_norm": 12.25, + "learning_rate": 2.5163149343015002e-06, + "loss": 1.4833, + "step": 77180 + }, + { + "epoch": 2.2456875236349885, + "grad_norm": 11.3125, + "learning_rate": 2.5143756526288853e-06, + "loss": 1.4444, + "step": 77200 + }, + { + "epoch": 2.2462693079675367, + "grad_norm": 14.9375, + "learning_rate": 2.5124363709562704e-06, + "loss": 1.4819, + "step": 77220 + }, + { + "epoch": 2.2468510923000844, + "grad_norm": 14.9375, + "learning_rate": 2.5104970892836555e-06, + "loss": 1.4552, + "step": 77240 + }, + { + "epoch": 2.247432876632632, + "grad_norm": 12.625, + "learning_rate": 2.5085578076110406e-06, + "loss": 1.6139, + "step": 77260 + }, + { + "epoch": 2.2480146609651803, + "grad_norm": 15.8125, + "learning_rate": 2.5066185259384257e-06, + "loss": 1.5939, + "step": 77280 + }, + { + "epoch": 2.248596445297728, + "grad_norm": 10.625, + "learning_rate": 2.504679244265811e-06, + "loss": 1.4841, + "step": 77300 + }, + { + "epoch": 2.2491782296302762, + "grad_norm": 13.125, + "learning_rate": 2.502739962593196e-06, + "loss": 1.5859, + "step": 77320 + }, + { + "epoch": 2.249760013962824, + "grad_norm": 13.5, + "learning_rate": 2.500800680920581e-06, + "loss": 1.4813, + "step": 77340 + }, + { + "epoch": 2.2503417982953717, + "grad_norm": 14.125, + "learning_rate": 2.498861399247966e-06, + "loss": 1.4882, + "step": 77360 + }, + { + "epoch": 2.25092358262792, + "grad_norm": 14.1875, + "learning_rate": 2.496922117575351e-06, + "loss": 1.445, + "step": 77380 + }, + { + "epoch": 2.2515053669604677, + "grad_norm": 12.625, + "learning_rate": 2.4949828359027363e-06, + "loss": 1.5974, + "step": 77400 + }, + { + "epoch": 2.252087151293016, + "grad_norm": 12.8125, + "learning_rate": 2.4930435542301214e-06, + "loss": 1.562, + "step": 77420 + }, + { + "epoch": 2.2526689356255636, + "grad_norm": 14.4375, + "learning_rate": 2.4911042725575065e-06, + "loss": 1.5872, + "step": 77440 + }, + { + "epoch": 2.2532507199581113, + "grad_norm": 14.4375, + "learning_rate": 2.4891649908848916e-06, + "loss": 1.5831, + "step": 77460 + }, + { + "epoch": 2.2538325042906595, + "grad_norm": 11.75, + "learning_rate": 2.4872257092122763e-06, + "loss": 1.5468, + "step": 77480 + }, + { + "epoch": 2.2544142886232073, + "grad_norm": 12.875, + "learning_rate": 2.4852864275396614e-06, + "loss": 1.4803, + "step": 77500 + }, + { + "epoch": 2.2549960729557554, + "grad_norm": 12.375, + "learning_rate": 2.4833471458670465e-06, + "loss": 1.4564, + "step": 77520 + }, + { + "epoch": 2.255577857288303, + "grad_norm": 13.0625, + "learning_rate": 2.4814078641944316e-06, + "loss": 1.5319, + "step": 77540 + }, + { + "epoch": 2.256159641620851, + "grad_norm": 16.375, + "learning_rate": 2.4794685825218167e-06, + "loss": 1.5106, + "step": 77560 + }, + { + "epoch": 2.256741425953399, + "grad_norm": 13.8125, + "learning_rate": 2.4775293008492018e-06, + "loss": 1.5285, + "step": 77580 + }, + { + "epoch": 2.257323210285947, + "grad_norm": 12.375, + "learning_rate": 2.475590019176587e-06, + "loss": 1.47, + "step": 77600 + }, + { + "epoch": 2.257904994618495, + "grad_norm": 13.6875, + "learning_rate": 2.473650737503972e-06, + "loss": 1.4906, + "step": 77620 + }, + { + "epoch": 2.2584867789510428, + "grad_norm": 15.375, + "learning_rate": 2.471711455831357e-06, + "loss": 1.5675, + "step": 77640 + }, + { + "epoch": 2.259068563283591, + "grad_norm": 13.5, + "learning_rate": 2.469772174158742e-06, + "loss": 1.3821, + "step": 77660 + }, + { + "epoch": 2.2596503476161387, + "grad_norm": 13.625, + "learning_rate": 2.4678328924861272e-06, + "loss": 1.5581, + "step": 77680 + }, + { + "epoch": 2.2602321319486864, + "grad_norm": 12.0, + "learning_rate": 2.4658936108135123e-06, + "loss": 1.4272, + "step": 77700 + }, + { + "epoch": 2.2608139162812346, + "grad_norm": 13.0, + "learning_rate": 2.4639543291408974e-06, + "loss": 1.4901, + "step": 77720 + }, + { + "epoch": 2.2613957006137824, + "grad_norm": 10.625, + "learning_rate": 2.4620150474682825e-06, + "loss": 1.4473, + "step": 77740 + }, + { + "epoch": 2.2619774849463306, + "grad_norm": 12.875, + "learning_rate": 2.4600757657956676e-06, + "loss": 1.5642, + "step": 77760 + }, + { + "epoch": 2.2625592692788783, + "grad_norm": 13.9375, + "learning_rate": 2.4581364841230527e-06, + "loss": 1.4057, + "step": 77780 + }, + { + "epoch": 2.2631410536114265, + "grad_norm": 11.6875, + "learning_rate": 2.456197202450438e-06, + "loss": 1.5791, + "step": 77800 + }, + { + "epoch": 2.263722837943974, + "grad_norm": 12.3125, + "learning_rate": 2.454257920777823e-06, + "loss": 1.4757, + "step": 77820 + }, + { + "epoch": 2.264304622276522, + "grad_norm": 13.5, + "learning_rate": 2.452318639105208e-06, + "loss": 1.4728, + "step": 77840 + }, + { + "epoch": 2.26488640660907, + "grad_norm": 12.625, + "learning_rate": 2.450379357432593e-06, + "loss": 1.5004, + "step": 77860 + }, + { + "epoch": 2.265468190941618, + "grad_norm": 13.0, + "learning_rate": 2.448440075759978e-06, + "loss": 1.4871, + "step": 77880 + }, + { + "epoch": 2.266049975274166, + "grad_norm": 15.1875, + "learning_rate": 2.446500794087363e-06, + "loss": 1.5657, + "step": 77900 + }, + { + "epoch": 2.266631759606714, + "grad_norm": 11.9375, + "learning_rate": 2.444561512414748e-06, + "loss": 1.553, + "step": 77920 + }, + { + "epoch": 2.2672135439392616, + "grad_norm": 12.4375, + "learning_rate": 2.442622230742133e-06, + "loss": 1.4405, + "step": 77940 + }, + { + "epoch": 2.2677953282718097, + "grad_norm": 14.3125, + "learning_rate": 2.440682949069518e-06, + "loss": 1.5006, + "step": 77960 + }, + { + "epoch": 2.2683771126043575, + "grad_norm": 12.5, + "learning_rate": 2.4387436673969033e-06, + "loss": 1.4933, + "step": 77980 + }, + { + "epoch": 2.2689588969369057, + "grad_norm": 13.75, + "learning_rate": 2.4368043857242884e-06, + "loss": 1.452, + "step": 78000 + }, + { + "epoch": 2.2695406812694534, + "grad_norm": 14.75, + "learning_rate": 2.4348651040516735e-06, + "loss": 1.6161, + "step": 78020 + }, + { + "epoch": 2.270122465602001, + "grad_norm": 11.8125, + "learning_rate": 2.4329258223790586e-06, + "loss": 1.5615, + "step": 78040 + }, + { + "epoch": 2.2707042499345493, + "grad_norm": 13.0, + "learning_rate": 2.4309865407064437e-06, + "loss": 1.468, + "step": 78060 + }, + { + "epoch": 2.271286034267097, + "grad_norm": 10.1875, + "learning_rate": 2.4290472590338288e-06, + "loss": 1.5075, + "step": 78080 + }, + { + "epoch": 2.2718678185996453, + "grad_norm": 17.5, + "learning_rate": 2.427107977361214e-06, + "loss": 1.5169, + "step": 78100 + }, + { + "epoch": 2.272449602932193, + "grad_norm": 15.125, + "learning_rate": 2.425168695688599e-06, + "loss": 1.5831, + "step": 78120 + }, + { + "epoch": 2.2730313872647407, + "grad_norm": 13.6875, + "learning_rate": 2.423229414015984e-06, + "loss": 1.4668, + "step": 78140 + }, + { + "epoch": 2.273613171597289, + "grad_norm": 13.0, + "learning_rate": 2.421290132343369e-06, + "loss": 1.5313, + "step": 78160 + }, + { + "epoch": 2.2741949559298367, + "grad_norm": 12.4375, + "learning_rate": 2.4193508506707543e-06, + "loss": 1.5282, + "step": 78180 + }, + { + "epoch": 2.274776740262385, + "grad_norm": 13.6875, + "learning_rate": 2.4174115689981394e-06, + "loss": 1.5503, + "step": 78200 + }, + { + "epoch": 2.2753585245949326, + "grad_norm": 16.0, + "learning_rate": 2.4154722873255245e-06, + "loss": 1.5384, + "step": 78220 + }, + { + "epoch": 2.2759403089274803, + "grad_norm": 15.75, + "learning_rate": 2.4135330056529095e-06, + "loss": 1.4713, + "step": 78240 + }, + { + "epoch": 2.2765220932600285, + "grad_norm": 13.3125, + "learning_rate": 2.4115937239802946e-06, + "loss": 1.4791, + "step": 78260 + }, + { + "epoch": 2.2771038775925763, + "grad_norm": 14.75, + "learning_rate": 2.4096544423076797e-06, + "loss": 1.5024, + "step": 78280 + }, + { + "epoch": 2.2776856619251244, + "grad_norm": 14.75, + "learning_rate": 2.4077151606350644e-06, + "loss": 1.4897, + "step": 78300 + }, + { + "epoch": 2.278267446257672, + "grad_norm": 14.9375, + "learning_rate": 2.4057758789624495e-06, + "loss": 1.4642, + "step": 78320 + }, + { + "epoch": 2.2788492305902204, + "grad_norm": 13.5, + "learning_rate": 2.4038365972898346e-06, + "loss": 1.4546, + "step": 78340 + }, + { + "epoch": 2.279431014922768, + "grad_norm": 15.75, + "learning_rate": 2.4018973156172197e-06, + "loss": 1.544, + "step": 78360 + }, + { + "epoch": 2.2800127992553163, + "grad_norm": 16.0, + "learning_rate": 2.399958033944605e-06, + "loss": 1.477, + "step": 78380 + }, + { + "epoch": 2.280594583587864, + "grad_norm": 14.6875, + "learning_rate": 2.39801875227199e-06, + "loss": 1.5448, + "step": 78400 + }, + { + "epoch": 2.281176367920412, + "grad_norm": 14.5, + "learning_rate": 2.396079470599375e-06, + "loss": 1.4392, + "step": 78420 + }, + { + "epoch": 2.28175815225296, + "grad_norm": 15.125, + "learning_rate": 2.3941401889267597e-06, + "loss": 1.5462, + "step": 78440 + }, + { + "epoch": 2.2823399365855077, + "grad_norm": 16.375, + "learning_rate": 2.3922009072541448e-06, + "loss": 1.5173, + "step": 78460 + }, + { + "epoch": 2.282921720918056, + "grad_norm": 13.5, + "learning_rate": 2.39026162558153e-06, + "loss": 1.4916, + "step": 78480 + }, + { + "epoch": 2.2835035052506036, + "grad_norm": 12.875, + "learning_rate": 2.388322343908915e-06, + "loss": 1.4347, + "step": 78500 + }, + { + "epoch": 2.2840852895831514, + "grad_norm": 11.3125, + "learning_rate": 2.3863830622363e-06, + "loss": 1.5101, + "step": 78520 + }, + { + "epoch": 2.2846670739156996, + "grad_norm": 15.0625, + "learning_rate": 2.384443780563685e-06, + "loss": 1.4632, + "step": 78540 + }, + { + "epoch": 2.2852488582482473, + "grad_norm": 14.875, + "learning_rate": 2.3825044988910703e-06, + "loss": 1.5455, + "step": 78560 + }, + { + "epoch": 2.2858306425807955, + "grad_norm": 14.125, + "learning_rate": 2.3805652172184554e-06, + "loss": 1.4849, + "step": 78580 + }, + { + "epoch": 2.2864124269133432, + "grad_norm": 12.5625, + "learning_rate": 2.3786259355458405e-06, + "loss": 1.5309, + "step": 78600 + }, + { + "epoch": 2.286994211245891, + "grad_norm": 11.75, + "learning_rate": 2.3766866538732256e-06, + "loss": 1.5735, + "step": 78620 + }, + { + "epoch": 2.287575995578439, + "grad_norm": 14.5, + "learning_rate": 2.3747473722006107e-06, + "loss": 1.5037, + "step": 78640 + }, + { + "epoch": 2.288157779910987, + "grad_norm": 13.0, + "learning_rate": 2.3728080905279957e-06, + "loss": 1.4424, + "step": 78660 + }, + { + "epoch": 2.288739564243535, + "grad_norm": 14.25, + "learning_rate": 2.370868808855381e-06, + "loss": 1.4635, + "step": 78680 + }, + { + "epoch": 2.289321348576083, + "grad_norm": 13.125, + "learning_rate": 2.368929527182766e-06, + "loss": 1.4897, + "step": 78700 + }, + { + "epoch": 2.2899031329086306, + "grad_norm": 14.3125, + "learning_rate": 2.366990245510151e-06, + "loss": 1.4852, + "step": 78720 + }, + { + "epoch": 2.2904849172411788, + "grad_norm": 13.875, + "learning_rate": 2.365050963837536e-06, + "loss": 1.5879, + "step": 78740 + }, + { + "epoch": 2.2910667015737265, + "grad_norm": 12.375, + "learning_rate": 2.3631116821649212e-06, + "loss": 1.5611, + "step": 78760 + }, + { + "epoch": 2.2916484859062747, + "grad_norm": 13.75, + "learning_rate": 2.3611724004923063e-06, + "loss": 1.5068, + "step": 78780 + }, + { + "epoch": 2.2922302702388224, + "grad_norm": 15.0, + "learning_rate": 2.3592331188196914e-06, + "loss": 1.4878, + "step": 78800 + }, + { + "epoch": 2.29281205457137, + "grad_norm": 17.5, + "learning_rate": 2.3572938371470765e-06, + "loss": 1.3869, + "step": 78820 + }, + { + "epoch": 2.2933938389039183, + "grad_norm": 15.25, + "learning_rate": 2.355354555474461e-06, + "loss": 1.5822, + "step": 78840 + }, + { + "epoch": 2.293975623236466, + "grad_norm": 12.25, + "learning_rate": 2.3534152738018463e-06, + "loss": 1.5676, + "step": 78860 + }, + { + "epoch": 2.2945574075690143, + "grad_norm": 10.375, + "learning_rate": 2.3514759921292314e-06, + "loss": 1.4993, + "step": 78880 + }, + { + "epoch": 2.295139191901562, + "grad_norm": 13.25, + "learning_rate": 2.3495367104566165e-06, + "loss": 1.4298, + "step": 78900 + }, + { + "epoch": 2.29572097623411, + "grad_norm": 12.4375, + "learning_rate": 2.3475974287840016e-06, + "loss": 1.5453, + "step": 78920 + }, + { + "epoch": 2.296302760566658, + "grad_norm": 15.3125, + "learning_rate": 2.3456581471113867e-06, + "loss": 1.4572, + "step": 78940 + }, + { + "epoch": 2.2968845448992057, + "grad_norm": 16.5, + "learning_rate": 2.3437188654387718e-06, + "loss": 1.4612, + "step": 78960 + }, + { + "epoch": 2.297466329231754, + "grad_norm": 13.375, + "learning_rate": 2.341779583766157e-06, + "loss": 1.584, + "step": 78980 + }, + { + "epoch": 2.2980481135643016, + "grad_norm": 14.125, + "learning_rate": 2.339840302093542e-06, + "loss": 1.4853, + "step": 79000 + }, + { + "epoch": 2.29862989789685, + "grad_norm": 9.25, + "learning_rate": 2.337901020420927e-06, + "loss": 1.4815, + "step": 79020 + }, + { + "epoch": 2.2992116822293975, + "grad_norm": 12.25, + "learning_rate": 2.335961738748312e-06, + "loss": 1.4859, + "step": 79040 + }, + { + "epoch": 2.2997934665619457, + "grad_norm": 14.0625, + "learning_rate": 2.3340224570756973e-06, + "loss": 1.5659, + "step": 79060 + }, + { + "epoch": 2.3003752508944935, + "grad_norm": 13.6875, + "learning_rate": 2.3320831754030824e-06, + "loss": 1.498, + "step": 79080 + }, + { + "epoch": 2.300957035227041, + "grad_norm": 12.8125, + "learning_rate": 2.3301438937304675e-06, + "loss": 1.4827, + "step": 79100 + }, + { + "epoch": 2.3015388195595894, + "grad_norm": 12.0625, + "learning_rate": 2.3282046120578526e-06, + "loss": 1.4264, + "step": 79120 + }, + { + "epoch": 2.302120603892137, + "grad_norm": 15.3125, + "learning_rate": 2.3262653303852377e-06, + "loss": 1.4727, + "step": 79140 + }, + { + "epoch": 2.3027023882246853, + "grad_norm": 14.25, + "learning_rate": 2.3243260487126228e-06, + "loss": 1.4976, + "step": 79160 + }, + { + "epoch": 2.303284172557233, + "grad_norm": 14.125, + "learning_rate": 2.322386767040008e-06, + "loss": 1.4303, + "step": 79180 + }, + { + "epoch": 2.303865956889781, + "grad_norm": 17.875, + "learning_rate": 2.320447485367393e-06, + "loss": 1.4618, + "step": 79200 + }, + { + "epoch": 2.304447741222329, + "grad_norm": 14.0625, + "learning_rate": 2.318508203694778e-06, + "loss": 1.463, + "step": 79220 + }, + { + "epoch": 2.3050295255548767, + "grad_norm": 12.6875, + "learning_rate": 2.3165689220221627e-06, + "loss": 1.4862, + "step": 79240 + }, + { + "epoch": 2.305611309887425, + "grad_norm": 14.5625, + "learning_rate": 2.314629640349548e-06, + "loss": 1.4419, + "step": 79260 + }, + { + "epoch": 2.3061930942199727, + "grad_norm": 11.75, + "learning_rate": 2.312690358676933e-06, + "loss": 1.4422, + "step": 79280 + }, + { + "epoch": 2.3067748785525204, + "grad_norm": 14.5, + "learning_rate": 2.310751077004318e-06, + "loss": 1.5241, + "step": 79300 + }, + { + "epoch": 2.3073566628850686, + "grad_norm": 13.25, + "learning_rate": 2.308811795331703e-06, + "loss": 1.4881, + "step": 79320 + }, + { + "epoch": 2.3079384472176163, + "grad_norm": 13.25, + "learning_rate": 2.3068725136590882e-06, + "loss": 1.5045, + "step": 79340 + }, + { + "epoch": 2.3085202315501645, + "grad_norm": 13.8125, + "learning_rate": 2.3049332319864733e-06, + "loss": 1.4625, + "step": 79360 + }, + { + "epoch": 2.3091020158827122, + "grad_norm": 13.75, + "learning_rate": 2.3029939503138584e-06, + "loss": 1.5605, + "step": 79380 + }, + { + "epoch": 2.30968380021526, + "grad_norm": 13.625, + "learning_rate": 2.3010546686412435e-06, + "loss": 1.4955, + "step": 79400 + }, + { + "epoch": 2.310265584547808, + "grad_norm": 13.4375, + "learning_rate": 2.2991153869686286e-06, + "loss": 1.5451, + "step": 79420 + }, + { + "epoch": 2.310847368880356, + "grad_norm": 15.5, + "learning_rate": 2.2971761052960137e-06, + "loss": 1.4791, + "step": 79440 + }, + { + "epoch": 2.311429153212904, + "grad_norm": 13.1875, + "learning_rate": 2.295236823623399e-06, + "loss": 1.4711, + "step": 79460 + }, + { + "epoch": 2.312010937545452, + "grad_norm": 14.375, + "learning_rate": 2.2932975419507835e-06, + "loss": 1.5716, + "step": 79480 + }, + { + "epoch": 2.3125927218779996, + "grad_norm": 13.5, + "learning_rate": 2.2913582602781686e-06, + "loss": 1.5021, + "step": 79500 + }, + { + "epoch": 2.3131745062105478, + "grad_norm": 15.0625, + "learning_rate": 2.2894189786055537e-06, + "loss": 1.4842, + "step": 79520 + }, + { + "epoch": 2.3137562905430955, + "grad_norm": 14.25, + "learning_rate": 2.2874796969329388e-06, + "loss": 1.4506, + "step": 79540 + }, + { + "epoch": 2.3143380748756437, + "grad_norm": 11.875, + "learning_rate": 2.285540415260324e-06, + "loss": 1.5117, + "step": 79560 + }, + { + "epoch": 2.3149198592081914, + "grad_norm": 11.8125, + "learning_rate": 2.283601133587709e-06, + "loss": 1.5294, + "step": 79580 + }, + { + "epoch": 2.3155016435407396, + "grad_norm": 12.9375, + "learning_rate": 2.281661851915094e-06, + "loss": 1.4868, + "step": 79600 + }, + { + "epoch": 2.3160834278732874, + "grad_norm": 12.0625, + "learning_rate": 2.279722570242479e-06, + "loss": 1.5197, + "step": 79620 + }, + { + "epoch": 2.3166652122058355, + "grad_norm": 15.0, + "learning_rate": 2.2777832885698643e-06, + "loss": 1.5152, + "step": 79640 + }, + { + "epoch": 2.3172469965383833, + "grad_norm": 15.8125, + "learning_rate": 2.2758440068972494e-06, + "loss": 1.5211, + "step": 79660 + }, + { + "epoch": 2.317828780870931, + "grad_norm": 12.625, + "learning_rate": 2.2739047252246344e-06, + "loss": 1.4938, + "step": 79680 + }, + { + "epoch": 2.318410565203479, + "grad_norm": 13.0, + "learning_rate": 2.2719654435520195e-06, + "loss": 1.5331, + "step": 79700 + }, + { + "epoch": 2.318992349536027, + "grad_norm": 12.8125, + "learning_rate": 2.2700261618794046e-06, + "loss": 1.4789, + "step": 79720 + }, + { + "epoch": 2.319574133868575, + "grad_norm": 16.875, + "learning_rate": 2.2680868802067897e-06, + "loss": 1.4278, + "step": 79740 + }, + { + "epoch": 2.320155918201123, + "grad_norm": 14.5625, + "learning_rate": 2.266147598534175e-06, + "loss": 1.4458, + "step": 79760 + }, + { + "epoch": 2.3207377025336706, + "grad_norm": 13.8125, + "learning_rate": 2.26420831686156e-06, + "loss": 1.4504, + "step": 79780 + }, + { + "epoch": 2.321319486866219, + "grad_norm": 12.875, + "learning_rate": 2.2622690351889446e-06, + "loss": 1.4409, + "step": 79800 + }, + { + "epoch": 2.3219012711987665, + "grad_norm": 11.75, + "learning_rate": 2.2603297535163297e-06, + "loss": 1.51, + "step": 79820 + }, + { + "epoch": 2.3224830555313147, + "grad_norm": 15.625, + "learning_rate": 2.258390471843715e-06, + "loss": 1.4906, + "step": 79840 + }, + { + "epoch": 2.3230648398638625, + "grad_norm": 12.5625, + "learning_rate": 2.2564511901711e-06, + "loss": 1.526, + "step": 79860 + }, + { + "epoch": 2.32364662419641, + "grad_norm": 13.875, + "learning_rate": 2.254511908498485e-06, + "loss": 1.4035, + "step": 79880 + }, + { + "epoch": 2.3242284085289584, + "grad_norm": 15.6875, + "learning_rate": 2.25257262682587e-06, + "loss": 1.4974, + "step": 79900 + }, + { + "epoch": 2.324810192861506, + "grad_norm": 13.125, + "learning_rate": 2.250633345153255e-06, + "loss": 1.4432, + "step": 79920 + }, + { + "epoch": 2.3253919771940543, + "grad_norm": 11.875, + "learning_rate": 2.2486940634806403e-06, + "loss": 1.4812, + "step": 79940 + }, + { + "epoch": 2.325973761526602, + "grad_norm": 12.4375, + "learning_rate": 2.2467547818080254e-06, + "loss": 1.5583, + "step": 79960 + }, + { + "epoch": 2.32655554585915, + "grad_norm": 12.25, + "learning_rate": 2.2448155001354105e-06, + "loss": 1.4932, + "step": 79980 + }, + { + "epoch": 2.327137330191698, + "grad_norm": 13.4375, + "learning_rate": 2.2428762184627956e-06, + "loss": 1.5036, + "step": 80000 + }, + { + "epoch": 2.3277191145242457, + "grad_norm": 13.9375, + "learning_rate": 2.2409369367901807e-06, + "loss": 1.5853, + "step": 80020 + }, + { + "epoch": 2.328300898856794, + "grad_norm": 15.0, + "learning_rate": 2.2389976551175658e-06, + "loss": 1.492, + "step": 80040 + }, + { + "epoch": 2.3288826831893417, + "grad_norm": 18.0, + "learning_rate": 2.237058373444951e-06, + "loss": 1.4796, + "step": 80060 + }, + { + "epoch": 2.3294644675218894, + "grad_norm": 12.25, + "learning_rate": 2.235119091772336e-06, + "loss": 1.5104, + "step": 80080 + }, + { + "epoch": 2.3300462518544376, + "grad_norm": 13.5, + "learning_rate": 2.233179810099721e-06, + "loss": 1.5557, + "step": 80100 + }, + { + "epoch": 2.3306280361869853, + "grad_norm": 16.375, + "learning_rate": 2.231240528427106e-06, + "loss": 1.5181, + "step": 80120 + }, + { + "epoch": 2.3312098205195335, + "grad_norm": 14.875, + "learning_rate": 2.2293012467544913e-06, + "loss": 1.5075, + "step": 80140 + }, + { + "epoch": 2.3317916048520813, + "grad_norm": 11.0, + "learning_rate": 2.2273619650818764e-06, + "loss": 1.5286, + "step": 80160 + }, + { + "epoch": 2.3323733891846294, + "grad_norm": 12.1875, + "learning_rate": 2.2254226834092615e-06, + "loss": 1.4977, + "step": 80180 + }, + { + "epoch": 2.332955173517177, + "grad_norm": 12.0, + "learning_rate": 2.223483401736646e-06, + "loss": 1.5509, + "step": 80200 + }, + { + "epoch": 2.333536957849725, + "grad_norm": 14.6875, + "learning_rate": 2.2215441200640312e-06, + "loss": 1.5433, + "step": 80220 + }, + { + "epoch": 2.334118742182273, + "grad_norm": 13.0625, + "learning_rate": 2.2196048383914163e-06, + "loss": 1.4002, + "step": 80240 + }, + { + "epoch": 2.334700526514821, + "grad_norm": 13.0, + "learning_rate": 2.2176655567188014e-06, + "loss": 1.4283, + "step": 80260 + }, + { + "epoch": 2.335282310847369, + "grad_norm": 12.5625, + "learning_rate": 2.2157262750461865e-06, + "loss": 1.619, + "step": 80280 + }, + { + "epoch": 2.3358640951799168, + "grad_norm": 14.3125, + "learning_rate": 2.2137869933735716e-06, + "loss": 1.5174, + "step": 80300 + }, + { + "epoch": 2.336445879512465, + "grad_norm": 14.625, + "learning_rate": 2.2118477117009567e-06, + "loss": 1.4691, + "step": 80320 + }, + { + "epoch": 2.3370276638450127, + "grad_norm": 14.3125, + "learning_rate": 2.209908430028342e-06, + "loss": 1.5607, + "step": 80340 + }, + { + "epoch": 2.3376094481775604, + "grad_norm": 13.5, + "learning_rate": 2.207969148355727e-06, + "loss": 1.4788, + "step": 80360 + }, + { + "epoch": 2.3381912325101086, + "grad_norm": 12.875, + "learning_rate": 2.206029866683112e-06, + "loss": 1.5194, + "step": 80380 + }, + { + "epoch": 2.3387730168426564, + "grad_norm": 9.4375, + "learning_rate": 2.204090585010497e-06, + "loss": 1.5032, + "step": 80400 + }, + { + "epoch": 2.3393548011752046, + "grad_norm": 14.9375, + "learning_rate": 2.202151303337882e-06, + "loss": 1.4447, + "step": 80420 + }, + { + "epoch": 2.3399365855077523, + "grad_norm": 17.5, + "learning_rate": 2.2002120216652673e-06, + "loss": 1.4967, + "step": 80440 + }, + { + "epoch": 2.3405183698403, + "grad_norm": 12.5, + "learning_rate": 2.1982727399926524e-06, + "loss": 1.4616, + "step": 80460 + }, + { + "epoch": 2.3411001541728482, + "grad_norm": 11.6875, + "learning_rate": 2.1963334583200375e-06, + "loss": 1.4305, + "step": 80480 + }, + { + "epoch": 2.341681938505396, + "grad_norm": 14.6875, + "learning_rate": 2.1943941766474226e-06, + "loss": 1.5134, + "step": 80500 + }, + { + "epoch": 2.342263722837944, + "grad_norm": 12.125, + "learning_rate": 2.1924548949748077e-06, + "loss": 1.4399, + "step": 80520 + }, + { + "epoch": 2.342845507170492, + "grad_norm": 8.5625, + "learning_rate": 2.1905156133021924e-06, + "loss": 1.4176, + "step": 80540 + }, + { + "epoch": 2.3434272915030396, + "grad_norm": 13.3125, + "learning_rate": 2.1885763316295775e-06, + "loss": 1.5658, + "step": 80560 + }, + { + "epoch": 2.344009075835588, + "grad_norm": 11.3125, + "learning_rate": 2.1866370499569626e-06, + "loss": 1.6313, + "step": 80580 + }, + { + "epoch": 2.3445908601681356, + "grad_norm": 15.1875, + "learning_rate": 2.1846977682843477e-06, + "loss": 1.5179, + "step": 80600 + }, + { + "epoch": 2.3451726445006837, + "grad_norm": 13.0625, + "learning_rate": 2.1827584866117328e-06, + "loss": 1.501, + "step": 80620 + }, + { + "epoch": 2.3457544288332315, + "grad_norm": 13.0, + "learning_rate": 2.180819204939118e-06, + "loss": 1.5677, + "step": 80640 + }, + { + "epoch": 2.3463362131657792, + "grad_norm": 14.8125, + "learning_rate": 2.178879923266503e-06, + "loss": 1.4799, + "step": 80660 + }, + { + "epoch": 2.3469179974983274, + "grad_norm": 13.6875, + "learning_rate": 2.176940641593888e-06, + "loss": 1.437, + "step": 80680 + }, + { + "epoch": 2.347499781830875, + "grad_norm": 13.5, + "learning_rate": 2.175001359921273e-06, + "loss": 1.5217, + "step": 80700 + }, + { + "epoch": 2.3480815661634233, + "grad_norm": 13.125, + "learning_rate": 2.1730620782486582e-06, + "loss": 1.5573, + "step": 80720 + }, + { + "epoch": 2.348663350495971, + "grad_norm": 14.6875, + "learning_rate": 2.1711227965760433e-06, + "loss": 1.4689, + "step": 80740 + }, + { + "epoch": 2.349245134828519, + "grad_norm": 14.8125, + "learning_rate": 2.169183514903428e-06, + "loss": 1.5098, + "step": 80760 + }, + { + "epoch": 2.349826919161067, + "grad_norm": 15.0, + "learning_rate": 2.167244233230813e-06, + "loss": 1.5224, + "step": 80780 + }, + { + "epoch": 2.3504087034936147, + "grad_norm": 13.25, + "learning_rate": 2.1653049515581982e-06, + "loss": 1.5475, + "step": 80800 + }, + { + "epoch": 2.350990487826163, + "grad_norm": 14.5, + "learning_rate": 2.1633656698855833e-06, + "loss": 1.5486, + "step": 80820 + }, + { + "epoch": 2.3515722721587107, + "grad_norm": 15.6875, + "learning_rate": 2.1614263882129684e-06, + "loss": 1.4639, + "step": 80840 + }, + { + "epoch": 2.352154056491259, + "grad_norm": 13.3125, + "learning_rate": 2.1594871065403535e-06, + "loss": 1.4772, + "step": 80860 + }, + { + "epoch": 2.3527358408238066, + "grad_norm": 11.875, + "learning_rate": 2.1575478248677386e-06, + "loss": 1.5233, + "step": 80880 + }, + { + "epoch": 2.353317625156355, + "grad_norm": 14.0625, + "learning_rate": 2.1556085431951237e-06, + "loss": 1.4569, + "step": 80900 + }, + { + "epoch": 2.3538994094889025, + "grad_norm": 15.125, + "learning_rate": 2.153669261522509e-06, + "loss": 1.5201, + "step": 80920 + }, + { + "epoch": 2.3544811938214503, + "grad_norm": 13.9375, + "learning_rate": 2.151729979849894e-06, + "loss": 1.5271, + "step": 80940 + }, + { + "epoch": 2.3550629781539985, + "grad_norm": 14.3125, + "learning_rate": 2.149790698177279e-06, + "loss": 1.525, + "step": 80960 + }, + { + "epoch": 2.355644762486546, + "grad_norm": 13.5625, + "learning_rate": 2.147851416504664e-06, + "loss": 1.5975, + "step": 80980 + }, + { + "epoch": 2.3562265468190944, + "grad_norm": 14.1875, + "learning_rate": 2.145912134832049e-06, + "loss": 1.4742, + "step": 81000 + }, + { + "epoch": 2.356808331151642, + "grad_norm": 12.8125, + "learning_rate": 2.1439728531594343e-06, + "loss": 1.4587, + "step": 81020 + }, + { + "epoch": 2.35739011548419, + "grad_norm": 11.5, + "learning_rate": 2.1420335714868194e-06, + "loss": 1.47, + "step": 81040 + }, + { + "epoch": 2.357971899816738, + "grad_norm": 13.75, + "learning_rate": 2.1400942898142045e-06, + "loss": 1.4474, + "step": 81060 + }, + { + "epoch": 2.358553684149286, + "grad_norm": 14.9375, + "learning_rate": 2.1381550081415896e-06, + "loss": 1.5069, + "step": 81080 + }, + { + "epoch": 2.359135468481834, + "grad_norm": 14.0625, + "learning_rate": 2.1362157264689747e-06, + "loss": 1.5437, + "step": 81100 + }, + { + "epoch": 2.3597172528143817, + "grad_norm": 15.875, + "learning_rate": 2.1342764447963598e-06, + "loss": 1.4676, + "step": 81120 + }, + { + "epoch": 2.3602990371469295, + "grad_norm": 13.4375, + "learning_rate": 2.132337163123745e-06, + "loss": 1.5713, + "step": 81140 + }, + { + "epoch": 2.3608808214794776, + "grad_norm": 13.3125, + "learning_rate": 2.1303978814511295e-06, + "loss": 1.513, + "step": 81160 + }, + { + "epoch": 2.3614626058120254, + "grad_norm": 12.5, + "learning_rate": 2.1284585997785146e-06, + "loss": 1.589, + "step": 81180 + }, + { + "epoch": 2.3620443901445736, + "grad_norm": 10.25, + "learning_rate": 2.1265193181058997e-06, + "loss": 1.5326, + "step": 81200 + }, + { + "epoch": 2.3626261744771213, + "grad_norm": 16.25, + "learning_rate": 2.124580036433285e-06, + "loss": 1.5033, + "step": 81220 + }, + { + "epoch": 2.363207958809669, + "grad_norm": 11.75, + "learning_rate": 2.12264075476067e-06, + "loss": 1.5411, + "step": 81240 + }, + { + "epoch": 2.3637897431422172, + "grad_norm": 13.1875, + "learning_rate": 2.120701473088055e-06, + "loss": 1.5313, + "step": 81260 + }, + { + "epoch": 2.364371527474765, + "grad_norm": 12.75, + "learning_rate": 2.11876219141544e-06, + "loss": 1.4944, + "step": 81280 + }, + { + "epoch": 2.364953311807313, + "grad_norm": 15.9375, + "learning_rate": 2.1168229097428252e-06, + "loss": 1.4613, + "step": 81300 + }, + { + "epoch": 2.365535096139861, + "grad_norm": 13.5, + "learning_rate": 2.1148836280702103e-06, + "loss": 1.5501, + "step": 81320 + }, + { + "epoch": 2.3661168804724086, + "grad_norm": 12.75, + "learning_rate": 2.1129443463975954e-06, + "loss": 1.546, + "step": 81340 + }, + { + "epoch": 2.366698664804957, + "grad_norm": 12.875, + "learning_rate": 2.1110050647249805e-06, + "loss": 1.4815, + "step": 81360 + }, + { + "epoch": 2.3672804491375046, + "grad_norm": 12.6875, + "learning_rate": 2.1090657830523656e-06, + "loss": 1.4589, + "step": 81380 + }, + { + "epoch": 2.3678622334700528, + "grad_norm": 11.4375, + "learning_rate": 2.1071265013797507e-06, + "loss": 1.5355, + "step": 81400 + }, + { + "epoch": 2.3684440178026005, + "grad_norm": 13.125, + "learning_rate": 2.105187219707136e-06, + "loss": 1.5244, + "step": 81420 + }, + { + "epoch": 2.3690258021351487, + "grad_norm": 12.1875, + "learning_rate": 2.103247938034521e-06, + "loss": 1.4817, + "step": 81440 + }, + { + "epoch": 2.3696075864676964, + "grad_norm": 13.375, + "learning_rate": 2.101308656361906e-06, + "loss": 1.4578, + "step": 81460 + }, + { + "epoch": 2.370189370800244, + "grad_norm": 12.8125, + "learning_rate": 2.099369374689291e-06, + "loss": 1.4558, + "step": 81480 + }, + { + "epoch": 2.3707711551327924, + "grad_norm": 13.0625, + "learning_rate": 2.097430093016676e-06, + "loss": 1.5164, + "step": 81500 + }, + { + "epoch": 2.37135293946534, + "grad_norm": 13.0625, + "learning_rate": 2.0954908113440613e-06, + "loss": 1.5401, + "step": 81520 + }, + { + "epoch": 2.3719347237978883, + "grad_norm": 14.25, + "learning_rate": 2.0935515296714464e-06, + "loss": 1.5447, + "step": 81540 + }, + { + "epoch": 2.372516508130436, + "grad_norm": 13.6875, + "learning_rate": 2.091612247998831e-06, + "loss": 1.524, + "step": 81560 + }, + { + "epoch": 2.373098292462984, + "grad_norm": 12.125, + "learning_rate": 2.089672966326216e-06, + "loss": 1.5211, + "step": 81580 + }, + { + "epoch": 2.373680076795532, + "grad_norm": 16.625, + "learning_rate": 2.0877336846536013e-06, + "loss": 1.5303, + "step": 81600 + }, + { + "epoch": 2.3742618611280797, + "grad_norm": 14.625, + "learning_rate": 2.0857944029809864e-06, + "loss": 1.5686, + "step": 81620 + }, + { + "epoch": 2.374843645460628, + "grad_norm": 13.4375, + "learning_rate": 2.0838551213083715e-06, + "loss": 1.5455, + "step": 81640 + }, + { + "epoch": 2.3754254297931756, + "grad_norm": 13.0625, + "learning_rate": 2.0819158396357566e-06, + "loss": 1.4859, + "step": 81660 + }, + { + "epoch": 2.376007214125724, + "grad_norm": 11.625, + "learning_rate": 2.0799765579631417e-06, + "loss": 1.4473, + "step": 81680 + }, + { + "epoch": 2.3765889984582715, + "grad_norm": 14.9375, + "learning_rate": 2.0780372762905263e-06, + "loss": 1.5468, + "step": 81700 + }, + { + "epoch": 2.3771707827908193, + "grad_norm": 13.25, + "learning_rate": 2.0760979946179114e-06, + "loss": 1.4947, + "step": 81720 + }, + { + "epoch": 2.3777525671233675, + "grad_norm": 14.25, + "learning_rate": 2.0741587129452965e-06, + "loss": 1.4489, + "step": 81740 + }, + { + "epoch": 2.378334351455915, + "grad_norm": 14.375, + "learning_rate": 2.0722194312726816e-06, + "loss": 1.4634, + "step": 81760 + }, + { + "epoch": 2.3789161357884634, + "grad_norm": 12.875, + "learning_rate": 2.0702801496000667e-06, + "loss": 1.5208, + "step": 81780 + }, + { + "epoch": 2.379497920121011, + "grad_norm": 14.5, + "learning_rate": 2.068340867927452e-06, + "loss": 1.4866, + "step": 81800 + }, + { + "epoch": 2.380079704453559, + "grad_norm": 14.5, + "learning_rate": 2.066401586254837e-06, + "loss": 1.4926, + "step": 81820 + }, + { + "epoch": 2.380661488786107, + "grad_norm": 12.6875, + "learning_rate": 2.064462304582222e-06, + "loss": 1.5062, + "step": 81840 + }, + { + "epoch": 2.381243273118655, + "grad_norm": 15.625, + "learning_rate": 2.062523022909607e-06, + "loss": 1.4599, + "step": 81860 + }, + { + "epoch": 2.381825057451203, + "grad_norm": 12.0, + "learning_rate": 2.060583741236992e-06, + "loss": 1.516, + "step": 81880 + }, + { + "epoch": 2.3824068417837507, + "grad_norm": 13.25, + "learning_rate": 2.0586444595643773e-06, + "loss": 1.4665, + "step": 81900 + }, + { + "epoch": 2.3829886261162985, + "grad_norm": 17.375, + "learning_rate": 2.0567051778917624e-06, + "loss": 1.5056, + "step": 81920 + }, + { + "epoch": 2.3835704104488467, + "grad_norm": 14.125, + "learning_rate": 2.0547658962191475e-06, + "loss": 1.4422, + "step": 81940 + }, + { + "epoch": 2.3841521947813944, + "grad_norm": 13.875, + "learning_rate": 2.0528266145465326e-06, + "loss": 1.5094, + "step": 81960 + }, + { + "epoch": 2.3847339791139426, + "grad_norm": 15.5, + "learning_rate": 2.0508873328739177e-06, + "loss": 1.6147, + "step": 81980 + }, + { + "epoch": 2.3853157634464903, + "grad_norm": 12.6875, + "learning_rate": 2.048948051201303e-06, + "loss": 1.4948, + "step": 82000 + }, + { + "epoch": 2.385897547779038, + "grad_norm": 13.625, + "learning_rate": 2.047008769528688e-06, + "loss": 1.4844, + "step": 82020 + }, + { + "epoch": 2.3864793321115862, + "grad_norm": 13.625, + "learning_rate": 2.045069487856073e-06, + "loss": 1.5263, + "step": 82040 + }, + { + "epoch": 2.387061116444134, + "grad_norm": 10.125, + "learning_rate": 2.043130206183458e-06, + "loss": 1.4915, + "step": 82060 + }, + { + "epoch": 2.387642900776682, + "grad_norm": 12.875, + "learning_rate": 2.041190924510843e-06, + "loss": 1.4541, + "step": 82080 + }, + { + "epoch": 2.38822468510923, + "grad_norm": 13.625, + "learning_rate": 2.0392516428382283e-06, + "loss": 1.4651, + "step": 82100 + }, + { + "epoch": 2.388806469441778, + "grad_norm": 14.0, + "learning_rate": 2.037312361165613e-06, + "loss": 1.4914, + "step": 82120 + }, + { + "epoch": 2.389388253774326, + "grad_norm": 13.5625, + "learning_rate": 2.035373079492998e-06, + "loss": 1.498, + "step": 82140 + }, + { + "epoch": 2.389970038106874, + "grad_norm": 13.5, + "learning_rate": 2.033433797820383e-06, + "loss": 1.4332, + "step": 82160 + }, + { + "epoch": 2.3905518224394218, + "grad_norm": 11.3125, + "learning_rate": 2.0314945161477682e-06, + "loss": 1.4921, + "step": 82180 + }, + { + "epoch": 2.3911336067719695, + "grad_norm": 10.875, + "learning_rate": 2.0295552344751533e-06, + "loss": 1.4908, + "step": 82200 + }, + { + "epoch": 2.3917153911045177, + "grad_norm": 15.625, + "learning_rate": 2.0276159528025384e-06, + "loss": 1.4405, + "step": 82220 + }, + { + "epoch": 2.3922971754370654, + "grad_norm": 12.875, + "learning_rate": 2.0256766711299235e-06, + "loss": 1.5628, + "step": 82240 + }, + { + "epoch": 2.3928789597696136, + "grad_norm": 13.6875, + "learning_rate": 2.0237373894573086e-06, + "loss": 1.4273, + "step": 82260 + }, + { + "epoch": 2.3934607441021614, + "grad_norm": 13.375, + "learning_rate": 2.0217981077846937e-06, + "loss": 1.5593, + "step": 82280 + }, + { + "epoch": 2.394042528434709, + "grad_norm": 14.375, + "learning_rate": 2.019858826112079e-06, + "loss": 1.5318, + "step": 82300 + }, + { + "epoch": 2.3946243127672573, + "grad_norm": 13.5, + "learning_rate": 2.017919544439464e-06, + "loss": 1.4808, + "step": 82320 + }, + { + "epoch": 2.395206097099805, + "grad_norm": 14.5625, + "learning_rate": 2.015980262766849e-06, + "loss": 1.5295, + "step": 82340 + }, + { + "epoch": 2.395787881432353, + "grad_norm": 12.0, + "learning_rate": 2.014040981094234e-06, + "loss": 1.6113, + "step": 82360 + }, + { + "epoch": 2.396369665764901, + "grad_norm": 14.0, + "learning_rate": 2.0121016994216192e-06, + "loss": 1.5515, + "step": 82380 + }, + { + "epoch": 2.3969514500974487, + "grad_norm": 13.25, + "learning_rate": 2.0101624177490043e-06, + "loss": 1.4164, + "step": 82400 + }, + { + "epoch": 2.397533234429997, + "grad_norm": 14.4375, + "learning_rate": 2.0082231360763894e-06, + "loss": 1.5213, + "step": 82420 + }, + { + "epoch": 2.3981150187625446, + "grad_norm": 11.9375, + "learning_rate": 2.0062838544037745e-06, + "loss": 1.4736, + "step": 82440 + }, + { + "epoch": 2.398696803095093, + "grad_norm": 15.0, + "learning_rate": 2.0043445727311596e-06, + "loss": 1.5123, + "step": 82460 + }, + { + "epoch": 2.3992785874276406, + "grad_norm": 14.1875, + "learning_rate": 2.0024052910585447e-06, + "loss": 1.399, + "step": 82480 + }, + { + "epoch": 2.3998603717601883, + "grad_norm": 17.625, + "learning_rate": 2.00046600938593e-06, + "loss": 1.4898, + "step": 82500 + }, + { + "epoch": 2.4004421560927365, + "grad_norm": 13.5, + "learning_rate": 1.9985267277133145e-06, + "loss": 1.4799, + "step": 82520 + }, + { + "epoch": 2.401023940425284, + "grad_norm": 13.5, + "learning_rate": 1.9965874460406996e-06, + "loss": 1.5474, + "step": 82540 + }, + { + "epoch": 2.4016057247578324, + "grad_norm": 14.625, + "learning_rate": 1.9946481643680847e-06, + "loss": 1.4957, + "step": 82560 + }, + { + "epoch": 2.40218750909038, + "grad_norm": 12.375, + "learning_rate": 1.9927088826954698e-06, + "loss": 1.4922, + "step": 82580 + }, + { + "epoch": 2.402769293422928, + "grad_norm": 16.875, + "learning_rate": 1.990769601022855e-06, + "loss": 1.4538, + "step": 82600 + }, + { + "epoch": 2.403351077755476, + "grad_norm": 14.1875, + "learning_rate": 1.98883031935024e-06, + "loss": 1.474, + "step": 82620 + }, + { + "epoch": 2.403932862088024, + "grad_norm": 13.875, + "learning_rate": 1.986891037677625e-06, + "loss": 1.531, + "step": 82640 + }, + { + "epoch": 2.404514646420572, + "grad_norm": 13.5, + "learning_rate": 1.9849517560050097e-06, + "loss": 1.5763, + "step": 82660 + }, + { + "epoch": 2.4050964307531197, + "grad_norm": 12.5625, + "learning_rate": 1.983012474332395e-06, + "loss": 1.5443, + "step": 82680 + }, + { + "epoch": 2.405678215085668, + "grad_norm": 11.125, + "learning_rate": 1.98107319265978e-06, + "loss": 1.5399, + "step": 82700 + }, + { + "epoch": 2.4062599994182157, + "grad_norm": 14.9375, + "learning_rate": 1.979133910987165e-06, + "loss": 1.5399, + "step": 82720 + }, + { + "epoch": 2.4068417837507634, + "grad_norm": 15.625, + "learning_rate": 1.97719462931455e-06, + "loss": 1.4963, + "step": 82740 + }, + { + "epoch": 2.4074235680833116, + "grad_norm": 13.9375, + "learning_rate": 1.9752553476419352e-06, + "loss": 1.6571, + "step": 82760 + }, + { + "epoch": 2.4080053524158593, + "grad_norm": 13.5, + "learning_rate": 1.9733160659693203e-06, + "loss": 1.4761, + "step": 82780 + }, + { + "epoch": 2.4085871367484075, + "grad_norm": 14.5625, + "learning_rate": 1.9713767842967054e-06, + "loss": 1.5335, + "step": 82800 + }, + { + "epoch": 2.4091689210809553, + "grad_norm": 17.625, + "learning_rate": 1.9694375026240905e-06, + "loss": 1.4916, + "step": 82820 + }, + { + "epoch": 2.4097507054135034, + "grad_norm": 16.75, + "learning_rate": 1.9674982209514756e-06, + "loss": 1.5589, + "step": 82840 + }, + { + "epoch": 2.410332489746051, + "grad_norm": 14.1875, + "learning_rate": 1.9655589392788607e-06, + "loss": 1.4952, + "step": 82860 + }, + { + "epoch": 2.410914274078599, + "grad_norm": 12.875, + "learning_rate": 1.963619657606246e-06, + "loss": 1.4848, + "step": 82880 + }, + { + "epoch": 2.411496058411147, + "grad_norm": 13.625, + "learning_rate": 1.961680375933631e-06, + "loss": 1.4803, + "step": 82900 + }, + { + "epoch": 2.412077842743695, + "grad_norm": 15.0, + "learning_rate": 1.959741094261016e-06, + "loss": 1.5074, + "step": 82920 + }, + { + "epoch": 2.412659627076243, + "grad_norm": 15.25, + "learning_rate": 1.957801812588401e-06, + "loss": 1.4036, + "step": 82940 + }, + { + "epoch": 2.413241411408791, + "grad_norm": 11.4375, + "learning_rate": 1.955862530915786e-06, + "loss": 1.4796, + "step": 82960 + }, + { + "epoch": 2.4138231957413385, + "grad_norm": 13.0625, + "learning_rate": 1.9539232492431713e-06, + "loss": 1.449, + "step": 82980 + }, + { + "epoch": 2.4144049800738867, + "grad_norm": 13.0625, + "learning_rate": 1.9519839675705564e-06, + "loss": 1.4359, + "step": 83000 + }, + { + "epoch": 2.4149867644064344, + "grad_norm": 15.0, + "learning_rate": 1.9500446858979415e-06, + "loss": 1.4583, + "step": 83020 + }, + { + "epoch": 2.4155685487389826, + "grad_norm": 16.0, + "learning_rate": 1.9481054042253266e-06, + "loss": 1.5212, + "step": 83040 + }, + { + "epoch": 2.4161503330715304, + "grad_norm": 18.125, + "learning_rate": 1.9461661225527117e-06, + "loss": 1.5268, + "step": 83060 + }, + { + "epoch": 2.416732117404078, + "grad_norm": 12.375, + "learning_rate": 1.9442268408800964e-06, + "loss": 1.5542, + "step": 83080 + }, + { + "epoch": 2.4173139017366263, + "grad_norm": 16.5, + "learning_rate": 1.9422875592074815e-06, + "loss": 1.3943, + "step": 83100 + }, + { + "epoch": 2.417895686069174, + "grad_norm": 12.125, + "learning_rate": 1.9403482775348666e-06, + "loss": 1.4659, + "step": 83120 + }, + { + "epoch": 2.4184774704017222, + "grad_norm": 15.0, + "learning_rate": 1.9384089958622517e-06, + "loss": 1.4778, + "step": 83140 + }, + { + "epoch": 2.41905925473427, + "grad_norm": 14.125, + "learning_rate": 1.9364697141896368e-06, + "loss": 1.4649, + "step": 83160 + }, + { + "epoch": 2.4196410390668177, + "grad_norm": 14.375, + "learning_rate": 1.934530432517022e-06, + "loss": 1.5323, + "step": 83180 + }, + { + "epoch": 2.420222823399366, + "grad_norm": 18.625, + "learning_rate": 1.932591150844407e-06, + "loss": 1.5303, + "step": 83200 + }, + { + "epoch": 2.4208046077319136, + "grad_norm": 11.4375, + "learning_rate": 1.930651869171792e-06, + "loss": 1.4254, + "step": 83220 + }, + { + "epoch": 2.421386392064462, + "grad_norm": 14.5, + "learning_rate": 1.928712587499177e-06, + "loss": 1.4953, + "step": 83240 + }, + { + "epoch": 2.4219681763970096, + "grad_norm": 14.375, + "learning_rate": 1.9267733058265622e-06, + "loss": 1.4438, + "step": 83260 + }, + { + "epoch": 2.4225499607295573, + "grad_norm": 13.1875, + "learning_rate": 1.9248340241539473e-06, + "loss": 1.4613, + "step": 83280 + }, + { + "epoch": 2.4231317450621055, + "grad_norm": 12.0, + "learning_rate": 1.9228947424813324e-06, + "loss": 1.5023, + "step": 83300 + }, + { + "epoch": 2.4237135293946532, + "grad_norm": 12.6875, + "learning_rate": 1.9209554608087175e-06, + "loss": 1.5418, + "step": 83320 + }, + { + "epoch": 2.4242953137272014, + "grad_norm": 12.9375, + "learning_rate": 1.9190161791361026e-06, + "loss": 1.497, + "step": 83340 + }, + { + "epoch": 2.424877098059749, + "grad_norm": 10.0, + "learning_rate": 1.9170768974634877e-06, + "loss": 1.559, + "step": 83360 + }, + { + "epoch": 2.4254588823922973, + "grad_norm": 13.125, + "learning_rate": 1.915137615790873e-06, + "loss": 1.5427, + "step": 83380 + }, + { + "epoch": 2.426040666724845, + "grad_norm": 14.9375, + "learning_rate": 1.913198334118258e-06, + "loss": 1.4278, + "step": 83400 + }, + { + "epoch": 2.4266224510573933, + "grad_norm": 12.5625, + "learning_rate": 1.911259052445643e-06, + "loss": 1.4889, + "step": 83420 + }, + { + "epoch": 2.427204235389941, + "grad_norm": 12.625, + "learning_rate": 1.909319770773028e-06, + "loss": 1.4222, + "step": 83440 + }, + { + "epoch": 2.4277860197224888, + "grad_norm": 15.0, + "learning_rate": 1.9073804891004132e-06, + "loss": 1.5064, + "step": 83460 + }, + { + "epoch": 2.428367804055037, + "grad_norm": 14.8125, + "learning_rate": 1.905441207427798e-06, + "loss": 1.5303, + "step": 83480 + }, + { + "epoch": 2.4289495883875847, + "grad_norm": 14.875, + "learning_rate": 1.9035019257551832e-06, + "loss": 1.4656, + "step": 83500 + }, + { + "epoch": 2.429531372720133, + "grad_norm": 15.125, + "learning_rate": 1.9015626440825683e-06, + "loss": 1.5907, + "step": 83520 + }, + { + "epoch": 2.4301131570526806, + "grad_norm": 12.8125, + "learning_rate": 1.8996233624099534e-06, + "loss": 1.4374, + "step": 83540 + }, + { + "epoch": 2.4306949413852283, + "grad_norm": 14.8125, + "learning_rate": 1.8976840807373385e-06, + "loss": 1.5289, + "step": 83560 + }, + { + "epoch": 2.4312767257177765, + "grad_norm": 14.4375, + "learning_rate": 1.8957447990647234e-06, + "loss": 1.5548, + "step": 83580 + }, + { + "epoch": 2.4318585100503243, + "grad_norm": 13.5, + "learning_rate": 1.8938055173921085e-06, + "loss": 1.4922, + "step": 83600 + }, + { + "epoch": 2.4324402943828725, + "grad_norm": 14.9375, + "learning_rate": 1.8918662357194936e-06, + "loss": 1.4578, + "step": 83620 + }, + { + "epoch": 2.43302207871542, + "grad_norm": 13.8125, + "learning_rate": 1.8899269540468787e-06, + "loss": 1.4666, + "step": 83640 + }, + { + "epoch": 2.433603863047968, + "grad_norm": 16.25, + "learning_rate": 1.8879876723742638e-06, + "loss": 1.5009, + "step": 83660 + }, + { + "epoch": 2.434185647380516, + "grad_norm": 13.4375, + "learning_rate": 1.8860483907016489e-06, + "loss": 1.4532, + "step": 83680 + }, + { + "epoch": 2.434767431713064, + "grad_norm": 18.5, + "learning_rate": 1.884109109029034e-06, + "loss": 1.4642, + "step": 83700 + }, + { + "epoch": 2.435349216045612, + "grad_norm": 15.6875, + "learning_rate": 1.8821698273564186e-06, + "loss": 1.569, + "step": 83720 + }, + { + "epoch": 2.43593100037816, + "grad_norm": 14.25, + "learning_rate": 1.8802305456838037e-06, + "loss": 1.5864, + "step": 83740 + }, + { + "epoch": 2.4365127847107075, + "grad_norm": 11.0625, + "learning_rate": 1.8782912640111888e-06, + "loss": 1.5732, + "step": 83760 + }, + { + "epoch": 2.4370945690432557, + "grad_norm": 14.375, + "learning_rate": 1.876351982338574e-06, + "loss": 1.4666, + "step": 83780 + }, + { + "epoch": 2.4376763533758035, + "grad_norm": 12.375, + "learning_rate": 1.874412700665959e-06, + "loss": 1.4964, + "step": 83800 + }, + { + "epoch": 2.4382581377083516, + "grad_norm": 12.375, + "learning_rate": 1.8724734189933441e-06, + "loss": 1.5458, + "step": 83820 + }, + { + "epoch": 2.4388399220408994, + "grad_norm": 13.125, + "learning_rate": 1.8705341373207292e-06, + "loss": 1.522, + "step": 83840 + }, + { + "epoch": 2.439421706373447, + "grad_norm": 14.875, + "learning_rate": 1.8685948556481143e-06, + "loss": 1.5432, + "step": 83860 + }, + { + "epoch": 2.4400034907059953, + "grad_norm": 14.0, + "learning_rate": 1.8666555739754994e-06, + "loss": 1.4725, + "step": 83880 + }, + { + "epoch": 2.440585275038543, + "grad_norm": 14.0, + "learning_rate": 1.8647162923028845e-06, + "loss": 1.5032, + "step": 83900 + }, + { + "epoch": 2.4411670593710912, + "grad_norm": 15.375, + "learning_rate": 1.8627770106302694e-06, + "loss": 1.4386, + "step": 83920 + }, + { + "epoch": 2.441748843703639, + "grad_norm": 10.5625, + "learning_rate": 1.8608377289576545e-06, + "loss": 1.5889, + "step": 83940 + }, + { + "epoch": 2.442330628036187, + "grad_norm": 13.75, + "learning_rate": 1.8588984472850396e-06, + "loss": 1.5089, + "step": 83960 + }, + { + "epoch": 2.442912412368735, + "grad_norm": 16.0, + "learning_rate": 1.8569591656124247e-06, + "loss": 1.5052, + "step": 83980 + }, + { + "epoch": 2.4434941967012827, + "grad_norm": 14.125, + "learning_rate": 1.8550198839398098e-06, + "loss": 1.4531, + "step": 84000 + }, + { + "epoch": 2.444075981033831, + "grad_norm": 14.5625, + "learning_rate": 1.8530806022671949e-06, + "loss": 1.5164, + "step": 84020 + }, + { + "epoch": 2.4446577653663786, + "grad_norm": 14.1875, + "learning_rate": 1.85114132059458e-06, + "loss": 1.5387, + "step": 84040 + }, + { + "epoch": 2.4452395496989268, + "grad_norm": 16.25, + "learning_rate": 1.849202038921965e-06, + "loss": 1.4059, + "step": 84060 + }, + { + "epoch": 2.4458213340314745, + "grad_norm": 12.4375, + "learning_rate": 1.8472627572493502e-06, + "loss": 1.4368, + "step": 84080 + }, + { + "epoch": 2.4464031183640227, + "grad_norm": 12.125, + "learning_rate": 1.8453234755767353e-06, + "loss": 1.461, + "step": 84100 + }, + { + "epoch": 2.4469849026965704, + "grad_norm": 13.25, + "learning_rate": 1.8433841939041204e-06, + "loss": 1.5223, + "step": 84120 + }, + { + "epoch": 2.447566687029118, + "grad_norm": 15.6875, + "learning_rate": 1.8414449122315053e-06, + "loss": 1.5211, + "step": 84140 + }, + { + "epoch": 2.4481484713616664, + "grad_norm": 13.9375, + "learning_rate": 1.8395056305588904e-06, + "loss": 1.4521, + "step": 84160 + }, + { + "epoch": 2.448730255694214, + "grad_norm": 14.25, + "learning_rate": 1.8375663488862755e-06, + "loss": 1.5413, + "step": 84180 + }, + { + "epoch": 2.4493120400267623, + "grad_norm": 11.0, + "learning_rate": 1.8356270672136605e-06, + "loss": 1.5836, + "step": 84200 + }, + { + "epoch": 2.44989382435931, + "grad_norm": 14.8125, + "learning_rate": 1.8336877855410456e-06, + "loss": 1.5283, + "step": 84220 + }, + { + "epoch": 2.4504756086918578, + "grad_norm": 12.4375, + "learning_rate": 1.8317485038684307e-06, + "loss": 1.3703, + "step": 84240 + }, + { + "epoch": 2.451057393024406, + "grad_norm": 13.5625, + "learning_rate": 1.8298092221958158e-06, + "loss": 1.5435, + "step": 84260 + }, + { + "epoch": 2.4516391773569537, + "grad_norm": 13.875, + "learning_rate": 1.827869940523201e-06, + "loss": 1.4617, + "step": 84280 + }, + { + "epoch": 2.452220961689502, + "grad_norm": 13.875, + "learning_rate": 1.825930658850586e-06, + "loss": 1.4998, + "step": 84300 + }, + { + "epoch": 2.4528027460220496, + "grad_norm": 14.625, + "learning_rate": 1.8239913771779711e-06, + "loss": 1.5317, + "step": 84320 + }, + { + "epoch": 2.4533845303545974, + "grad_norm": 13.5, + "learning_rate": 1.822052095505356e-06, + "loss": 1.4806, + "step": 84340 + }, + { + "epoch": 2.4539663146871455, + "grad_norm": 14.8125, + "learning_rate": 1.8201128138327411e-06, + "loss": 1.5489, + "step": 84360 + }, + { + "epoch": 2.4545480990196933, + "grad_norm": 11.6875, + "learning_rate": 1.8181735321601262e-06, + "loss": 1.466, + "step": 84380 + }, + { + "epoch": 2.4551298833522415, + "grad_norm": 16.75, + "learning_rate": 1.8162342504875113e-06, + "loss": 1.4696, + "step": 84400 + }, + { + "epoch": 2.455711667684789, + "grad_norm": 13.3125, + "learning_rate": 1.8142949688148964e-06, + "loss": 1.5239, + "step": 84420 + }, + { + "epoch": 2.456293452017337, + "grad_norm": 13.5625, + "learning_rate": 1.8123556871422815e-06, + "loss": 1.5328, + "step": 84440 + }, + { + "epoch": 2.456875236349885, + "grad_norm": 15.1875, + "learning_rate": 1.8104164054696666e-06, + "loss": 1.449, + "step": 84460 + }, + { + "epoch": 2.457457020682433, + "grad_norm": 12.25, + "learning_rate": 1.8084771237970517e-06, + "loss": 1.527, + "step": 84480 + }, + { + "epoch": 2.458038805014981, + "grad_norm": 11.3125, + "learning_rate": 1.8065378421244368e-06, + "loss": 1.5881, + "step": 84500 + }, + { + "epoch": 2.458620589347529, + "grad_norm": 16.5, + "learning_rate": 1.804598560451822e-06, + "loss": 1.4317, + "step": 84520 + }, + { + "epoch": 2.4592023736800765, + "grad_norm": 15.25, + "learning_rate": 1.8026592787792068e-06, + "loss": 1.4889, + "step": 84540 + }, + { + "epoch": 2.4597841580126247, + "grad_norm": 14.25, + "learning_rate": 1.8007199971065919e-06, + "loss": 1.5569, + "step": 84560 + }, + { + "epoch": 2.4603659423451725, + "grad_norm": 14.0625, + "learning_rate": 1.798780715433977e-06, + "loss": 1.4411, + "step": 84580 + }, + { + "epoch": 2.4609477266777207, + "grad_norm": 13.0625, + "learning_rate": 1.796841433761362e-06, + "loss": 1.4549, + "step": 84600 + }, + { + "epoch": 2.4615295110102684, + "grad_norm": 12.75, + "learning_rate": 1.7949021520887472e-06, + "loss": 1.5386, + "step": 84620 + }, + { + "epoch": 2.4621112953428166, + "grad_norm": 14.0, + "learning_rate": 1.7929628704161323e-06, + "loss": 1.6189, + "step": 84640 + }, + { + "epoch": 2.4626930796753643, + "grad_norm": 16.75, + "learning_rate": 1.7910235887435174e-06, + "loss": 1.5666, + "step": 84660 + }, + { + "epoch": 2.4632748640079125, + "grad_norm": 12.125, + "learning_rate": 1.7890843070709025e-06, + "loss": 1.4781, + "step": 84680 + }, + { + "epoch": 2.4638566483404603, + "grad_norm": 13.0, + "learning_rate": 1.7871450253982876e-06, + "loss": 1.4691, + "step": 84700 + }, + { + "epoch": 2.464438432673008, + "grad_norm": 7.90625, + "learning_rate": 1.7852057437256727e-06, + "loss": 1.5353, + "step": 84720 + }, + { + "epoch": 2.465020217005556, + "grad_norm": 12.5, + "learning_rate": 1.7832664620530575e-06, + "loss": 1.3974, + "step": 84740 + }, + { + "epoch": 2.465602001338104, + "grad_norm": 13.4375, + "learning_rate": 1.7813271803804426e-06, + "loss": 1.4819, + "step": 84760 + }, + { + "epoch": 2.466183785670652, + "grad_norm": 14.4375, + "learning_rate": 1.7793878987078275e-06, + "loss": 1.5848, + "step": 84780 + }, + { + "epoch": 2.4667655700032, + "grad_norm": 11.625, + "learning_rate": 1.7774486170352126e-06, + "loss": 1.4865, + "step": 84800 + }, + { + "epoch": 2.4673473543357476, + "grad_norm": 14.375, + "learning_rate": 1.7755093353625977e-06, + "loss": 1.4644, + "step": 84820 + }, + { + "epoch": 2.4679291386682958, + "grad_norm": 14.4375, + "learning_rate": 1.7735700536899828e-06, + "loss": 1.4834, + "step": 84840 + }, + { + "epoch": 2.4685109230008435, + "grad_norm": 14.6875, + "learning_rate": 1.771630772017368e-06, + "loss": 1.5563, + "step": 84860 + }, + { + "epoch": 2.4690927073333917, + "grad_norm": 12.375, + "learning_rate": 1.7696914903447528e-06, + "loss": 1.5087, + "step": 84880 + }, + { + "epoch": 2.4696744916659394, + "grad_norm": 14.0, + "learning_rate": 1.767752208672138e-06, + "loss": 1.4945, + "step": 84900 + }, + { + "epoch": 2.470256275998487, + "grad_norm": 14.8125, + "learning_rate": 1.765812926999523e-06, + "loss": 1.4434, + "step": 84920 + }, + { + "epoch": 2.4708380603310354, + "grad_norm": 15.75, + "learning_rate": 1.763873645326908e-06, + "loss": 1.5025, + "step": 84940 + }, + { + "epoch": 2.471419844663583, + "grad_norm": 15.8125, + "learning_rate": 1.7619343636542932e-06, + "loss": 1.5365, + "step": 84960 + }, + { + "epoch": 2.4720016289961313, + "grad_norm": 14.625, + "learning_rate": 1.7599950819816783e-06, + "loss": 1.4666, + "step": 84980 + }, + { + "epoch": 2.472583413328679, + "grad_norm": 14.1875, + "learning_rate": 1.7580558003090634e-06, + "loss": 1.5651, + "step": 85000 + }, + { + "epoch": 2.4731651976612268, + "grad_norm": 14.9375, + "learning_rate": 1.7561165186364485e-06, + "loss": 1.4765, + "step": 85020 + }, + { + "epoch": 2.473746981993775, + "grad_norm": 12.3125, + "learning_rate": 1.7541772369638336e-06, + "loss": 1.4852, + "step": 85040 + }, + { + "epoch": 2.4743287663263227, + "grad_norm": 12.1875, + "learning_rate": 1.7522379552912187e-06, + "loss": 1.5172, + "step": 85060 + }, + { + "epoch": 2.474910550658871, + "grad_norm": 16.125, + "learning_rate": 1.7502986736186036e-06, + "loss": 1.5443, + "step": 85080 + }, + { + "epoch": 2.4754923349914186, + "grad_norm": 14.3125, + "learning_rate": 1.7483593919459887e-06, + "loss": 1.4828, + "step": 85100 + }, + { + "epoch": 2.4760741193239664, + "grad_norm": 15.0, + "learning_rate": 1.7464201102733738e-06, + "loss": 1.5358, + "step": 85120 + }, + { + "epoch": 2.4766559036565146, + "grad_norm": 13.9375, + "learning_rate": 1.7444808286007589e-06, + "loss": 1.4983, + "step": 85140 + }, + { + "epoch": 2.4772376879890623, + "grad_norm": 16.375, + "learning_rate": 1.742541546928144e-06, + "loss": 1.4259, + "step": 85160 + }, + { + "epoch": 2.4778194723216105, + "grad_norm": 10.5, + "learning_rate": 1.740602265255529e-06, + "loss": 1.473, + "step": 85180 + }, + { + "epoch": 2.4784012566541582, + "grad_norm": 13.4375, + "learning_rate": 1.7386629835829142e-06, + "loss": 1.5687, + "step": 85200 + }, + { + "epoch": 2.4789830409867064, + "grad_norm": 13.1875, + "learning_rate": 1.7367237019102992e-06, + "loss": 1.3671, + "step": 85220 + }, + { + "epoch": 2.479564825319254, + "grad_norm": 14.625, + "learning_rate": 1.7347844202376843e-06, + "loss": 1.4198, + "step": 85240 + }, + { + "epoch": 2.480146609651802, + "grad_norm": 13.8125, + "learning_rate": 1.7328451385650694e-06, + "loss": 1.543, + "step": 85260 + }, + { + "epoch": 2.48072839398435, + "grad_norm": 13.3125, + "learning_rate": 1.7309058568924545e-06, + "loss": 1.5451, + "step": 85280 + }, + { + "epoch": 2.481310178316898, + "grad_norm": 16.25, + "learning_rate": 1.7289665752198394e-06, + "loss": 1.5445, + "step": 85300 + }, + { + "epoch": 2.481891962649446, + "grad_norm": 14.6875, + "learning_rate": 1.7270272935472245e-06, + "loss": 1.5054, + "step": 85320 + }, + { + "epoch": 2.4824737469819937, + "grad_norm": 12.125, + "learning_rate": 1.7250880118746096e-06, + "loss": 1.563, + "step": 85340 + }, + { + "epoch": 2.483055531314542, + "grad_norm": 14.8125, + "learning_rate": 1.7231487302019947e-06, + "loss": 1.4879, + "step": 85360 + }, + { + "epoch": 2.4836373156470897, + "grad_norm": 13.4375, + "learning_rate": 1.7212094485293798e-06, + "loss": 1.4319, + "step": 85380 + }, + { + "epoch": 2.4842190999796374, + "grad_norm": 13.125, + "learning_rate": 1.719270166856765e-06, + "loss": 1.4411, + "step": 85400 + }, + { + "epoch": 2.4848008843121856, + "grad_norm": 14.625, + "learning_rate": 1.71733088518415e-06, + "loss": 1.5097, + "step": 85420 + }, + { + "epoch": 2.4853826686447333, + "grad_norm": 16.75, + "learning_rate": 1.7153916035115351e-06, + "loss": 1.524, + "step": 85440 + }, + { + "epoch": 2.4859644529772815, + "grad_norm": 16.25, + "learning_rate": 1.7134523218389202e-06, + "loss": 1.5037, + "step": 85460 + }, + { + "epoch": 2.4865462373098293, + "grad_norm": 12.6875, + "learning_rate": 1.7115130401663053e-06, + "loss": 1.5351, + "step": 85480 + }, + { + "epoch": 2.487128021642377, + "grad_norm": 12.5625, + "learning_rate": 1.7095737584936902e-06, + "loss": 1.454, + "step": 85500 + }, + { + "epoch": 2.487709805974925, + "grad_norm": 14.3125, + "learning_rate": 1.7076344768210753e-06, + "loss": 1.484, + "step": 85520 + }, + { + "epoch": 2.488291590307473, + "grad_norm": 14.3125, + "learning_rate": 1.7056951951484604e-06, + "loss": 1.5233, + "step": 85540 + }, + { + "epoch": 2.488873374640021, + "grad_norm": 11.625, + "learning_rate": 1.7037559134758455e-06, + "loss": 1.4799, + "step": 85560 + }, + { + "epoch": 2.489455158972569, + "grad_norm": 14.4375, + "learning_rate": 1.7018166318032306e-06, + "loss": 1.4214, + "step": 85580 + }, + { + "epoch": 2.4900369433051166, + "grad_norm": 13.8125, + "learning_rate": 1.6998773501306157e-06, + "loss": 1.3949, + "step": 85600 + }, + { + "epoch": 2.490618727637665, + "grad_norm": 14.4375, + "learning_rate": 1.6979380684580008e-06, + "loss": 1.5463, + "step": 85620 + }, + { + "epoch": 2.4912005119702125, + "grad_norm": 13.3125, + "learning_rate": 1.6959987867853859e-06, + "loss": 1.455, + "step": 85640 + }, + { + "epoch": 2.4917822963027607, + "grad_norm": 16.25, + "learning_rate": 1.694059505112771e-06, + "loss": 1.4947, + "step": 85660 + }, + { + "epoch": 2.4923640806353085, + "grad_norm": 10.6875, + "learning_rate": 1.692120223440156e-06, + "loss": 1.5354, + "step": 85680 + }, + { + "epoch": 2.492945864967856, + "grad_norm": 13.5, + "learning_rate": 1.690180941767541e-06, + "loss": 1.5464, + "step": 85700 + }, + { + "epoch": 2.4935276493004044, + "grad_norm": 13.8125, + "learning_rate": 1.688241660094926e-06, + "loss": 1.5747, + "step": 85720 + }, + { + "epoch": 2.494109433632952, + "grad_norm": 13.375, + "learning_rate": 1.6863023784223111e-06, + "loss": 1.3954, + "step": 85740 + }, + { + "epoch": 2.4946912179655003, + "grad_norm": 12.9375, + "learning_rate": 1.6843630967496962e-06, + "loss": 1.5419, + "step": 85760 + }, + { + "epoch": 2.495273002298048, + "grad_norm": 13.9375, + "learning_rate": 1.6824238150770813e-06, + "loss": 1.4634, + "step": 85780 + }, + { + "epoch": 2.495854786630596, + "grad_norm": 13.375, + "learning_rate": 1.6804845334044664e-06, + "loss": 1.5401, + "step": 85800 + }, + { + "epoch": 2.496436570963144, + "grad_norm": 12.1875, + "learning_rate": 1.6785452517318515e-06, + "loss": 1.4788, + "step": 85820 + }, + { + "epoch": 2.4970183552956917, + "grad_norm": 12.0625, + "learning_rate": 1.6766059700592362e-06, + "loss": 1.4634, + "step": 85840 + }, + { + "epoch": 2.49760013962824, + "grad_norm": 15.875, + "learning_rate": 1.6746666883866213e-06, + "loss": 1.5184, + "step": 85860 + }, + { + "epoch": 2.4981819239607876, + "grad_norm": 13.5, + "learning_rate": 1.6727274067140064e-06, + "loss": 1.5197, + "step": 85880 + }, + { + "epoch": 2.498763708293336, + "grad_norm": 11.5625, + "learning_rate": 1.6707881250413915e-06, + "loss": 1.5003, + "step": 85900 + }, + { + "epoch": 2.4993454926258836, + "grad_norm": 12.875, + "learning_rate": 1.6688488433687766e-06, + "loss": 1.4576, + "step": 85920 + }, + { + "epoch": 2.4999272769584318, + "grad_norm": 12.875, + "learning_rate": 1.6669095616961617e-06, + "loss": 1.5049, + "step": 85940 + }, + { + "epoch": 2.5005090612909795, + "grad_norm": 13.75, + "learning_rate": 1.6649702800235468e-06, + "loss": 1.5107, + "step": 85960 + }, + { + "epoch": 2.5010908456235272, + "grad_norm": 12.125, + "learning_rate": 1.663030998350932e-06, + "loss": 1.4735, + "step": 85980 + }, + { + "epoch": 2.5016726299560754, + "grad_norm": 16.5, + "learning_rate": 1.661091716678317e-06, + "loss": 1.4646, + "step": 86000 + }, + { + "epoch": 2.502254414288623, + "grad_norm": 12.75, + "learning_rate": 1.659152435005702e-06, + "loss": 1.4567, + "step": 86020 + }, + { + "epoch": 2.5028361986211713, + "grad_norm": 14.1875, + "learning_rate": 1.657213153333087e-06, + "loss": 1.4665, + "step": 86040 + }, + { + "epoch": 2.503417982953719, + "grad_norm": 15.25, + "learning_rate": 1.655273871660472e-06, + "loss": 1.5428, + "step": 86060 + }, + { + "epoch": 2.503999767286267, + "grad_norm": 11.8125, + "learning_rate": 1.6533345899878572e-06, + "loss": 1.5118, + "step": 86080 + }, + { + "epoch": 2.504581551618815, + "grad_norm": 13.8125, + "learning_rate": 1.6513953083152423e-06, + "loss": 1.5431, + "step": 86100 + }, + { + "epoch": 2.5051633359513628, + "grad_norm": 14.375, + "learning_rate": 1.6494560266426274e-06, + "loss": 1.5274, + "step": 86120 + }, + { + "epoch": 2.505745120283911, + "grad_norm": 13.75, + "learning_rate": 1.6475167449700125e-06, + "loss": 1.5037, + "step": 86140 + }, + { + "epoch": 2.5063269046164587, + "grad_norm": 15.5, + "learning_rate": 1.6455774632973976e-06, + "loss": 1.5036, + "step": 86160 + }, + { + "epoch": 2.5069086889490064, + "grad_norm": 19.5, + "learning_rate": 1.6436381816247827e-06, + "loss": 1.4302, + "step": 86180 + }, + { + "epoch": 2.5074904732815546, + "grad_norm": 11.75, + "learning_rate": 1.6416988999521678e-06, + "loss": 1.5004, + "step": 86200 + }, + { + "epoch": 2.5080722576141024, + "grad_norm": 12.0, + "learning_rate": 1.6397596182795529e-06, + "loss": 1.4269, + "step": 86220 + }, + { + "epoch": 2.5086540419466505, + "grad_norm": 12.4375, + "learning_rate": 1.6378203366069377e-06, + "loss": 1.4466, + "step": 86240 + }, + { + "epoch": 2.5092358262791983, + "grad_norm": 13.875, + "learning_rate": 1.6358810549343228e-06, + "loss": 1.5068, + "step": 86260 + }, + { + "epoch": 2.509817610611746, + "grad_norm": 14.75, + "learning_rate": 1.633941773261708e-06, + "loss": 1.4814, + "step": 86280 + }, + { + "epoch": 2.510399394944294, + "grad_norm": 13.1875, + "learning_rate": 1.632002491589093e-06, + "loss": 1.4891, + "step": 86300 + }, + { + "epoch": 2.510981179276842, + "grad_norm": 11.375, + "learning_rate": 1.6300632099164781e-06, + "loss": 1.5594, + "step": 86320 + }, + { + "epoch": 2.51156296360939, + "grad_norm": 14.1875, + "learning_rate": 1.6281239282438632e-06, + "loss": 1.5167, + "step": 86340 + }, + { + "epoch": 2.512144747941938, + "grad_norm": 14.0625, + "learning_rate": 1.6261846465712483e-06, + "loss": 1.5381, + "step": 86360 + }, + { + "epoch": 2.5127265322744856, + "grad_norm": 13.1875, + "learning_rate": 1.6242453648986334e-06, + "loss": 1.445, + "step": 86380 + }, + { + "epoch": 2.513308316607034, + "grad_norm": 11.9375, + "learning_rate": 1.6223060832260185e-06, + "loss": 1.5091, + "step": 86400 + }, + { + "epoch": 2.5138901009395815, + "grad_norm": 13.5625, + "learning_rate": 1.6203668015534036e-06, + "loss": 1.5194, + "step": 86420 + }, + { + "epoch": 2.5144718852721297, + "grad_norm": 16.375, + "learning_rate": 1.6184275198807887e-06, + "loss": 1.4995, + "step": 86440 + }, + { + "epoch": 2.5150536696046775, + "grad_norm": 13.8125, + "learning_rate": 1.6164882382081736e-06, + "loss": 1.5071, + "step": 86460 + }, + { + "epoch": 2.515635453937225, + "grad_norm": 10.1875, + "learning_rate": 1.6145489565355587e-06, + "loss": 1.4894, + "step": 86480 + }, + { + "epoch": 2.5162172382697734, + "grad_norm": 11.9375, + "learning_rate": 1.6126096748629438e-06, + "loss": 1.5848, + "step": 86500 + }, + { + "epoch": 2.5167990226023216, + "grad_norm": 13.4375, + "learning_rate": 1.6106703931903289e-06, + "loss": 1.5928, + "step": 86520 + }, + { + "epoch": 2.5173808069348693, + "grad_norm": 12.4375, + "learning_rate": 1.608731111517714e-06, + "loss": 1.4975, + "step": 86540 + }, + { + "epoch": 2.517962591267417, + "grad_norm": 14.0, + "learning_rate": 1.606791829845099e-06, + "loss": 1.5314, + "step": 86560 + }, + { + "epoch": 2.5185443755999652, + "grad_norm": 14.4375, + "learning_rate": 1.6048525481724842e-06, + "loss": 1.4563, + "step": 86580 + }, + { + "epoch": 2.519126159932513, + "grad_norm": 13.9375, + "learning_rate": 1.6029132664998693e-06, + "loss": 1.4916, + "step": 86600 + }, + { + "epoch": 2.519707944265061, + "grad_norm": 16.375, + "learning_rate": 1.6009739848272544e-06, + "loss": 1.5425, + "step": 86620 + }, + { + "epoch": 2.520289728597609, + "grad_norm": 13.5, + "learning_rate": 1.5990347031546395e-06, + "loss": 1.4995, + "step": 86640 + }, + { + "epoch": 2.5208715129301567, + "grad_norm": 14.75, + "learning_rate": 1.5970954214820244e-06, + "loss": 1.4136, + "step": 86660 + }, + { + "epoch": 2.521453297262705, + "grad_norm": 11.25, + "learning_rate": 1.5951561398094095e-06, + "loss": 1.5536, + "step": 86680 + }, + { + "epoch": 2.5220350815952526, + "grad_norm": 11.0, + "learning_rate": 1.5932168581367946e-06, + "loss": 1.5162, + "step": 86700 + }, + { + "epoch": 2.5226168659278008, + "grad_norm": 16.125, + "learning_rate": 1.5912775764641797e-06, + "loss": 1.5166, + "step": 86720 + }, + { + "epoch": 2.5231986502603485, + "grad_norm": 10.4375, + "learning_rate": 1.5893382947915648e-06, + "loss": 1.4429, + "step": 86740 + }, + { + "epoch": 2.5237804345928962, + "grad_norm": 17.5, + "learning_rate": 1.5873990131189498e-06, + "loss": 1.4458, + "step": 86760 + }, + { + "epoch": 2.5243622189254444, + "grad_norm": 14.0625, + "learning_rate": 1.585459731446335e-06, + "loss": 1.4595, + "step": 86780 + }, + { + "epoch": 2.524944003257992, + "grad_norm": 13.4375, + "learning_rate": 1.58352044977372e-06, + "loss": 1.5339, + "step": 86800 + }, + { + "epoch": 2.5255257875905404, + "grad_norm": 14.5, + "learning_rate": 1.5815811681011051e-06, + "loss": 1.5444, + "step": 86820 + }, + { + "epoch": 2.526107571923088, + "grad_norm": 16.5, + "learning_rate": 1.5796418864284902e-06, + "loss": 1.4465, + "step": 86840 + }, + { + "epoch": 2.526689356255636, + "grad_norm": 14.6875, + "learning_rate": 1.5777026047558751e-06, + "loss": 1.5121, + "step": 86860 + }, + { + "epoch": 2.527271140588184, + "grad_norm": 11.5625, + "learning_rate": 1.5757633230832602e-06, + "loss": 1.4736, + "step": 86880 + }, + { + "epoch": 2.5278529249207318, + "grad_norm": 16.375, + "learning_rate": 1.5738240414106451e-06, + "loss": 1.546, + "step": 86900 + }, + { + "epoch": 2.52843470925328, + "grad_norm": 13.5, + "learning_rate": 1.5718847597380302e-06, + "loss": 1.4419, + "step": 86920 + }, + { + "epoch": 2.5290164935858277, + "grad_norm": 12.1875, + "learning_rate": 1.5699454780654153e-06, + "loss": 1.4853, + "step": 86940 + }, + { + "epoch": 2.5295982779183754, + "grad_norm": 13.8125, + "learning_rate": 1.5680061963928004e-06, + "loss": 1.492, + "step": 86960 + }, + { + "epoch": 2.5301800622509236, + "grad_norm": 14.0, + "learning_rate": 1.5660669147201855e-06, + "loss": 1.4927, + "step": 86980 + }, + { + "epoch": 2.5307618465834714, + "grad_norm": 13.5625, + "learning_rate": 1.5641276330475704e-06, + "loss": 1.504, + "step": 87000 + }, + { + "epoch": 2.5313436309160195, + "grad_norm": 14.0, + "learning_rate": 1.5621883513749555e-06, + "loss": 1.4816, + "step": 87020 + }, + { + "epoch": 2.5319254152485673, + "grad_norm": 15.5625, + "learning_rate": 1.5602490697023406e-06, + "loss": 1.5125, + "step": 87040 + }, + { + "epoch": 2.532507199581115, + "grad_norm": 11.4375, + "learning_rate": 1.5583097880297257e-06, + "loss": 1.5393, + "step": 87060 + }, + { + "epoch": 2.533088983913663, + "grad_norm": 13.5625, + "learning_rate": 1.5563705063571108e-06, + "loss": 1.4948, + "step": 87080 + }, + { + "epoch": 2.5336707682462114, + "grad_norm": 17.375, + "learning_rate": 1.5544312246844959e-06, + "loss": 1.5009, + "step": 87100 + }, + { + "epoch": 2.534252552578759, + "grad_norm": 13.25, + "learning_rate": 1.552491943011881e-06, + "loss": 1.491, + "step": 87120 + }, + { + "epoch": 2.534834336911307, + "grad_norm": 12.25, + "learning_rate": 1.550552661339266e-06, + "loss": 1.4954, + "step": 87140 + }, + { + "epoch": 2.5354161212438546, + "grad_norm": 14.625, + "learning_rate": 1.5486133796666512e-06, + "loss": 1.4881, + "step": 87160 + }, + { + "epoch": 2.535997905576403, + "grad_norm": 11.375, + "learning_rate": 1.5466740979940363e-06, + "loss": 1.4248, + "step": 87180 + }, + { + "epoch": 2.536579689908951, + "grad_norm": 14.8125, + "learning_rate": 1.5447348163214211e-06, + "loss": 1.4026, + "step": 87200 + }, + { + "epoch": 2.5371614742414987, + "grad_norm": 15.375, + "learning_rate": 1.5427955346488062e-06, + "loss": 1.5348, + "step": 87220 + }, + { + "epoch": 2.5377432585740465, + "grad_norm": 13.9375, + "learning_rate": 1.5408562529761913e-06, + "loss": 1.4726, + "step": 87240 + }, + { + "epoch": 2.5383250429065947, + "grad_norm": 11.9375, + "learning_rate": 1.5389169713035764e-06, + "loss": 1.4464, + "step": 87260 + }, + { + "epoch": 2.5389068272391424, + "grad_norm": 11.625, + "learning_rate": 1.5369776896309615e-06, + "loss": 1.5215, + "step": 87280 + }, + { + "epoch": 2.5394886115716906, + "grad_norm": 12.25, + "learning_rate": 1.5350384079583466e-06, + "loss": 1.5135, + "step": 87300 + }, + { + "epoch": 2.5400703959042383, + "grad_norm": 14.25, + "learning_rate": 1.5330991262857317e-06, + "loss": 1.5343, + "step": 87320 + }, + { + "epoch": 2.540652180236786, + "grad_norm": 13.8125, + "learning_rate": 1.5311598446131168e-06, + "loss": 1.4614, + "step": 87340 + }, + { + "epoch": 2.5412339645693343, + "grad_norm": 12.5, + "learning_rate": 1.529220562940502e-06, + "loss": 1.3836, + "step": 87360 + }, + { + "epoch": 2.541815748901882, + "grad_norm": 15.625, + "learning_rate": 1.527281281267887e-06, + "loss": 1.4345, + "step": 87380 + }, + { + "epoch": 2.54239753323443, + "grad_norm": 12.0625, + "learning_rate": 1.5253419995952721e-06, + "loss": 1.5226, + "step": 87400 + }, + { + "epoch": 2.542979317566978, + "grad_norm": 11.8125, + "learning_rate": 1.523402717922657e-06, + "loss": 1.4521, + "step": 87420 + }, + { + "epoch": 2.5435611018995257, + "grad_norm": 13.0625, + "learning_rate": 1.521463436250042e-06, + "loss": 1.5146, + "step": 87440 + }, + { + "epoch": 2.544142886232074, + "grad_norm": 11.6875, + "learning_rate": 1.5195241545774272e-06, + "loss": 1.6255, + "step": 87460 + }, + { + "epoch": 2.5447246705646216, + "grad_norm": 14.125, + "learning_rate": 1.5175848729048123e-06, + "loss": 1.43, + "step": 87480 + }, + { + "epoch": 2.54530645489717, + "grad_norm": 14.375, + "learning_rate": 1.5156455912321974e-06, + "loss": 1.4608, + "step": 87500 + }, + { + "epoch": 2.5458882392297175, + "grad_norm": 12.0, + "learning_rate": 1.5137063095595825e-06, + "loss": 1.5352, + "step": 87520 + }, + { + "epoch": 2.5464700235622653, + "grad_norm": 13.625, + "learning_rate": 1.5117670278869676e-06, + "loss": 1.575, + "step": 87540 + }, + { + "epoch": 2.5470518078948134, + "grad_norm": 12.8125, + "learning_rate": 1.5098277462143527e-06, + "loss": 1.4455, + "step": 87560 + }, + { + "epoch": 2.547633592227361, + "grad_norm": 12.5, + "learning_rate": 1.5078884645417378e-06, + "loss": 1.4836, + "step": 87580 + }, + { + "epoch": 2.5482153765599094, + "grad_norm": 13.5625, + "learning_rate": 1.5059491828691229e-06, + "loss": 1.3722, + "step": 87600 + }, + { + "epoch": 2.548797160892457, + "grad_norm": 13.125, + "learning_rate": 1.5040099011965078e-06, + "loss": 1.4748, + "step": 87620 + }, + { + "epoch": 2.549378945225005, + "grad_norm": 11.875, + "learning_rate": 1.5020706195238929e-06, + "loss": 1.5758, + "step": 87640 + }, + { + "epoch": 2.549960729557553, + "grad_norm": 16.875, + "learning_rate": 1.500131337851278e-06, + "loss": 1.5294, + "step": 87660 + }, + { + "epoch": 2.550542513890101, + "grad_norm": 13.625, + "learning_rate": 1.498192056178663e-06, + "loss": 1.5163, + "step": 87680 + }, + { + "epoch": 2.551124298222649, + "grad_norm": 15.125, + "learning_rate": 1.4962527745060482e-06, + "loss": 1.5919, + "step": 87700 + }, + { + "epoch": 2.5517060825551967, + "grad_norm": 12.25, + "learning_rate": 1.4943134928334333e-06, + "loss": 1.5041, + "step": 87720 + }, + { + "epoch": 2.5522878668877444, + "grad_norm": 15.8125, + "learning_rate": 1.4923742111608184e-06, + "loss": 1.4404, + "step": 87740 + }, + { + "epoch": 2.5528696512202926, + "grad_norm": 11.75, + "learning_rate": 1.4904349294882035e-06, + "loss": 1.5303, + "step": 87760 + }, + { + "epoch": 2.553451435552841, + "grad_norm": 12.8125, + "learning_rate": 1.4884956478155885e-06, + "loss": 1.5042, + "step": 87780 + }, + { + "epoch": 2.5540332198853886, + "grad_norm": 14.5625, + "learning_rate": 1.4865563661429736e-06, + "loss": 1.4936, + "step": 87800 + }, + { + "epoch": 2.5546150042179363, + "grad_norm": 16.25, + "learning_rate": 1.4846170844703585e-06, + "loss": 1.5642, + "step": 87820 + }, + { + "epoch": 2.5551967885504845, + "grad_norm": 9.0625, + "learning_rate": 1.4826778027977436e-06, + "loss": 1.4549, + "step": 87840 + }, + { + "epoch": 2.5557785728830322, + "grad_norm": 13.75, + "learning_rate": 1.4807385211251287e-06, + "loss": 1.514, + "step": 87860 + }, + { + "epoch": 2.5563603572155804, + "grad_norm": 13.5, + "learning_rate": 1.4787992394525138e-06, + "loss": 1.4641, + "step": 87880 + }, + { + "epoch": 2.556942141548128, + "grad_norm": 12.1875, + "learning_rate": 1.476859957779899e-06, + "loss": 1.5321, + "step": 87900 + }, + { + "epoch": 2.557523925880676, + "grad_norm": 17.25, + "learning_rate": 1.474920676107284e-06, + "loss": 1.462, + "step": 87920 + }, + { + "epoch": 2.558105710213224, + "grad_norm": 15.5625, + "learning_rate": 1.4729813944346691e-06, + "loss": 1.5143, + "step": 87940 + }, + { + "epoch": 2.558687494545772, + "grad_norm": 12.5625, + "learning_rate": 1.4710421127620538e-06, + "loss": 1.4539, + "step": 87960 + }, + { + "epoch": 2.55926927887832, + "grad_norm": 12.125, + "learning_rate": 1.4691028310894389e-06, + "loss": 1.451, + "step": 87980 + }, + { + "epoch": 2.5598510632108677, + "grad_norm": 13.5, + "learning_rate": 1.467163549416824e-06, + "loss": 1.5119, + "step": 88000 + }, + { + "epoch": 2.5604328475434155, + "grad_norm": 13.3125, + "learning_rate": 1.465224267744209e-06, + "loss": 1.4436, + "step": 88020 + }, + { + "epoch": 2.5610146318759637, + "grad_norm": 11.5, + "learning_rate": 1.4632849860715942e-06, + "loss": 1.436, + "step": 88040 + }, + { + "epoch": 2.5615964162085114, + "grad_norm": 12.1875, + "learning_rate": 1.4613457043989793e-06, + "loss": 1.4874, + "step": 88060 + }, + { + "epoch": 2.5621782005410596, + "grad_norm": 16.25, + "learning_rate": 1.4594064227263644e-06, + "loss": 1.4848, + "step": 88080 + }, + { + "epoch": 2.5627599848736073, + "grad_norm": 12.6875, + "learning_rate": 1.4574671410537495e-06, + "loss": 1.4135, + "step": 88100 + }, + { + "epoch": 2.563341769206155, + "grad_norm": 13.625, + "learning_rate": 1.4555278593811346e-06, + "loss": 1.4105, + "step": 88120 + }, + { + "epoch": 2.5639235535387033, + "grad_norm": 15.9375, + "learning_rate": 1.4535885777085197e-06, + "loss": 1.4694, + "step": 88140 + }, + { + "epoch": 2.564505337871251, + "grad_norm": 13.5625, + "learning_rate": 1.4516492960359046e-06, + "loss": 1.4228, + "step": 88160 + }, + { + "epoch": 2.565087122203799, + "grad_norm": 12.1875, + "learning_rate": 1.4497100143632897e-06, + "loss": 1.4681, + "step": 88180 + }, + { + "epoch": 2.565668906536347, + "grad_norm": 13.375, + "learning_rate": 1.4477707326906747e-06, + "loss": 1.552, + "step": 88200 + }, + { + "epoch": 2.5662506908688947, + "grad_norm": 10.75, + "learning_rate": 1.4458314510180598e-06, + "loss": 1.5275, + "step": 88220 + }, + { + "epoch": 2.566832475201443, + "grad_norm": 13.0625, + "learning_rate": 1.443892169345445e-06, + "loss": 1.4954, + "step": 88240 + }, + { + "epoch": 2.5674142595339906, + "grad_norm": 12.625, + "learning_rate": 1.44195288767283e-06, + "loss": 1.5454, + "step": 88260 + }, + { + "epoch": 2.567996043866539, + "grad_norm": 10.6875, + "learning_rate": 1.4400136060002151e-06, + "loss": 1.4329, + "step": 88280 + }, + { + "epoch": 2.5685778281990865, + "grad_norm": 13.1875, + "learning_rate": 1.4380743243276002e-06, + "loss": 1.4786, + "step": 88300 + }, + { + "epoch": 2.5691596125316343, + "grad_norm": 15.0, + "learning_rate": 1.4361350426549853e-06, + "loss": 1.4678, + "step": 88320 + }, + { + "epoch": 2.5697413968641825, + "grad_norm": 11.25, + "learning_rate": 1.4341957609823704e-06, + "loss": 1.5508, + "step": 88340 + }, + { + "epoch": 2.5703231811967306, + "grad_norm": 11.0625, + "learning_rate": 1.4322564793097553e-06, + "loss": 1.4506, + "step": 88360 + }, + { + "epoch": 2.5709049655292784, + "grad_norm": 12.0, + "learning_rate": 1.4303171976371404e-06, + "loss": 1.4891, + "step": 88380 + }, + { + "epoch": 2.571486749861826, + "grad_norm": 15.125, + "learning_rate": 1.4283779159645255e-06, + "loss": 1.5108, + "step": 88400 + }, + { + "epoch": 2.5720685341943743, + "grad_norm": 12.875, + "learning_rate": 1.4264386342919106e-06, + "loss": 1.4628, + "step": 88420 + }, + { + "epoch": 2.572650318526922, + "grad_norm": 7.8125, + "learning_rate": 1.4244993526192957e-06, + "loss": 1.4989, + "step": 88440 + }, + { + "epoch": 2.5732321028594702, + "grad_norm": 14.0, + "learning_rate": 1.4225600709466808e-06, + "loss": 1.5064, + "step": 88460 + }, + { + "epoch": 2.573813887192018, + "grad_norm": 14.4375, + "learning_rate": 1.420620789274066e-06, + "loss": 1.4556, + "step": 88480 + }, + { + "epoch": 2.5743956715245657, + "grad_norm": 16.25, + "learning_rate": 1.418681507601451e-06, + "loss": 1.3963, + "step": 88500 + }, + { + "epoch": 2.574977455857114, + "grad_norm": 13.75, + "learning_rate": 1.416742225928836e-06, + "loss": 1.5654, + "step": 88520 + }, + { + "epoch": 2.5755592401896616, + "grad_norm": 16.5, + "learning_rate": 1.4148029442562212e-06, + "loss": 1.5069, + "step": 88540 + }, + { + "epoch": 2.57614102452221, + "grad_norm": 13.875, + "learning_rate": 1.4128636625836063e-06, + "loss": 1.5128, + "step": 88560 + }, + { + "epoch": 2.5767228088547576, + "grad_norm": 13.5, + "learning_rate": 1.4109243809109912e-06, + "loss": 1.4906, + "step": 88580 + }, + { + "epoch": 2.5773045931873053, + "grad_norm": 15.375, + "learning_rate": 1.4089850992383763e-06, + "loss": 1.5292, + "step": 88600 + }, + { + "epoch": 2.5778863775198535, + "grad_norm": 15.75, + "learning_rate": 1.4070458175657614e-06, + "loss": 1.4152, + "step": 88620 + }, + { + "epoch": 2.5784681618524012, + "grad_norm": 13.375, + "learning_rate": 1.4051065358931465e-06, + "loss": 1.4743, + "step": 88640 + }, + { + "epoch": 2.5790499461849494, + "grad_norm": 13.8125, + "learning_rate": 1.4031672542205316e-06, + "loss": 1.5106, + "step": 88660 + }, + { + "epoch": 2.579631730517497, + "grad_norm": 14.125, + "learning_rate": 1.4012279725479167e-06, + "loss": 1.4018, + "step": 88680 + }, + { + "epoch": 2.580213514850045, + "grad_norm": 14.0, + "learning_rate": 1.3992886908753018e-06, + "loss": 1.5591, + "step": 88700 + }, + { + "epoch": 2.580795299182593, + "grad_norm": 15.5625, + "learning_rate": 1.3973494092026869e-06, + "loss": 1.4992, + "step": 88720 + }, + { + "epoch": 2.581377083515141, + "grad_norm": 12.125, + "learning_rate": 1.395410127530072e-06, + "loss": 1.478, + "step": 88740 + }, + { + "epoch": 2.581958867847689, + "grad_norm": 12.25, + "learning_rate": 1.393470845857457e-06, + "loss": 1.4129, + "step": 88760 + }, + { + "epoch": 2.5825406521802368, + "grad_norm": 17.75, + "learning_rate": 1.391531564184842e-06, + "loss": 1.4793, + "step": 88780 + }, + { + "epoch": 2.5831224365127845, + "grad_norm": 13.5625, + "learning_rate": 1.389592282512227e-06, + "loss": 1.5708, + "step": 88800 + }, + { + "epoch": 2.5837042208453327, + "grad_norm": 12.75, + "learning_rate": 1.3876530008396121e-06, + "loss": 1.5207, + "step": 88820 + }, + { + "epoch": 2.5842860051778804, + "grad_norm": 13.5625, + "learning_rate": 1.3857137191669972e-06, + "loss": 1.5063, + "step": 88840 + }, + { + "epoch": 2.5848677895104286, + "grad_norm": 12.0625, + "learning_rate": 1.3837744374943823e-06, + "loss": 1.5515, + "step": 88860 + }, + { + "epoch": 2.5854495738429764, + "grad_norm": 14.9375, + "learning_rate": 1.3818351558217674e-06, + "loss": 1.5853, + "step": 88880 + }, + { + "epoch": 2.586031358175524, + "grad_norm": 13.625, + "learning_rate": 1.3798958741491525e-06, + "loss": 1.569, + "step": 88900 + }, + { + "epoch": 2.5866131425080723, + "grad_norm": 13.4375, + "learning_rate": 1.3779565924765376e-06, + "loss": 1.5617, + "step": 88920 + }, + { + "epoch": 2.58719492684062, + "grad_norm": 13.0625, + "learning_rate": 1.3760173108039227e-06, + "loss": 1.4424, + "step": 88940 + }, + { + "epoch": 2.587776711173168, + "grad_norm": 12.75, + "learning_rate": 1.3740780291313078e-06, + "loss": 1.5559, + "step": 88960 + }, + { + "epoch": 2.588358495505716, + "grad_norm": 11.375, + "learning_rate": 1.3721387474586927e-06, + "loss": 1.5516, + "step": 88980 + }, + { + "epoch": 2.5889402798382637, + "grad_norm": 12.625, + "learning_rate": 1.3701994657860778e-06, + "loss": 1.5243, + "step": 89000 + }, + { + "epoch": 2.589522064170812, + "grad_norm": 14.0625, + "learning_rate": 1.3682601841134627e-06, + "loss": 1.4705, + "step": 89020 + }, + { + "epoch": 2.59010384850336, + "grad_norm": 14.375, + "learning_rate": 1.3663209024408478e-06, + "loss": 1.4796, + "step": 89040 + }, + { + "epoch": 2.590685632835908, + "grad_norm": 12.3125, + "learning_rate": 1.3643816207682329e-06, + "loss": 1.5771, + "step": 89060 + }, + { + "epoch": 2.5912674171684555, + "grad_norm": 16.25, + "learning_rate": 1.362442339095618e-06, + "loss": 1.5066, + "step": 89080 + }, + { + "epoch": 2.5918492015010037, + "grad_norm": 11.9375, + "learning_rate": 1.360503057423003e-06, + "loss": 1.5614, + "step": 89100 + }, + { + "epoch": 2.5924309858335515, + "grad_norm": 15.5, + "learning_rate": 1.358563775750388e-06, + "loss": 1.5666, + "step": 89120 + }, + { + "epoch": 2.5930127701660997, + "grad_norm": 14.9375, + "learning_rate": 1.356624494077773e-06, + "loss": 1.5544, + "step": 89140 + }, + { + "epoch": 2.5935945544986474, + "grad_norm": 14.5, + "learning_rate": 1.3546852124051582e-06, + "loss": 1.4609, + "step": 89160 + }, + { + "epoch": 2.594176338831195, + "grad_norm": 14.375, + "learning_rate": 1.3527459307325433e-06, + "loss": 1.4836, + "step": 89180 + }, + { + "epoch": 2.5947581231637433, + "grad_norm": 8.625, + "learning_rate": 1.3508066490599284e-06, + "loss": 1.4095, + "step": 89200 + }, + { + "epoch": 2.595339907496291, + "grad_norm": 17.875, + "learning_rate": 1.3488673673873135e-06, + "loss": 1.4704, + "step": 89220 + }, + { + "epoch": 2.5959216918288393, + "grad_norm": 13.3125, + "learning_rate": 1.3469280857146985e-06, + "loss": 1.5322, + "step": 89240 + }, + { + "epoch": 2.596503476161387, + "grad_norm": 13.125, + "learning_rate": 1.3449888040420836e-06, + "loss": 1.5451, + "step": 89260 + }, + { + "epoch": 2.5970852604939347, + "grad_norm": 13.125, + "learning_rate": 1.3430495223694687e-06, + "loss": 1.5784, + "step": 89280 + }, + { + "epoch": 2.597667044826483, + "grad_norm": 13.125, + "learning_rate": 1.3411102406968538e-06, + "loss": 1.5883, + "step": 89300 + }, + { + "epoch": 2.5982488291590307, + "grad_norm": 15.375, + "learning_rate": 1.3391709590242387e-06, + "loss": 1.5509, + "step": 89320 + }, + { + "epoch": 2.598830613491579, + "grad_norm": 10.75, + "learning_rate": 1.3372316773516238e-06, + "loss": 1.5002, + "step": 89340 + }, + { + "epoch": 2.5994123978241266, + "grad_norm": 16.25, + "learning_rate": 1.335292395679009e-06, + "loss": 1.5081, + "step": 89360 + }, + { + "epoch": 2.5999941821566743, + "grad_norm": 15.6875, + "learning_rate": 1.333353114006394e-06, + "loss": 1.4374, + "step": 89380 + }, + { + "epoch": 2.6005759664892225, + "grad_norm": 13.5, + "learning_rate": 1.3314138323337791e-06, + "loss": 1.5208, + "step": 89400 + }, + { + "epoch": 2.6011577508217703, + "grad_norm": 15.25, + "learning_rate": 1.3294745506611642e-06, + "loss": 1.5339, + "step": 89420 + }, + { + "epoch": 2.6017395351543184, + "grad_norm": 12.3125, + "learning_rate": 1.3275352689885493e-06, + "loss": 1.4808, + "step": 89440 + }, + { + "epoch": 2.602321319486866, + "grad_norm": 11.8125, + "learning_rate": 1.3255959873159344e-06, + "loss": 1.4434, + "step": 89460 + }, + { + "epoch": 2.602903103819414, + "grad_norm": 15.0625, + "learning_rate": 1.3236567056433195e-06, + "loss": 1.4781, + "step": 89480 + }, + { + "epoch": 2.603484888151962, + "grad_norm": 13.4375, + "learning_rate": 1.3217174239707046e-06, + "loss": 1.4987, + "step": 89500 + }, + { + "epoch": 2.60406667248451, + "grad_norm": 16.625, + "learning_rate": 1.3197781422980895e-06, + "loss": 1.4983, + "step": 89520 + }, + { + "epoch": 2.604648456817058, + "grad_norm": 10.9375, + "learning_rate": 1.3178388606254746e-06, + "loss": 1.5454, + "step": 89540 + }, + { + "epoch": 2.6052302411496058, + "grad_norm": 12.75, + "learning_rate": 1.3158995789528597e-06, + "loss": 1.569, + "step": 89560 + }, + { + "epoch": 2.6058120254821535, + "grad_norm": 14.0625, + "learning_rate": 1.3139602972802448e-06, + "loss": 1.4457, + "step": 89580 + }, + { + "epoch": 2.6063938098147017, + "grad_norm": 13.8125, + "learning_rate": 1.3120210156076299e-06, + "loss": 1.5113, + "step": 89600 + }, + { + "epoch": 2.60697559414725, + "grad_norm": 16.125, + "learning_rate": 1.310081733935015e-06, + "loss": 1.4794, + "step": 89620 + }, + { + "epoch": 2.6075573784797976, + "grad_norm": 15.125, + "learning_rate": 1.3081424522624e-06, + "loss": 1.4272, + "step": 89640 + }, + { + "epoch": 2.6081391628123454, + "grad_norm": 12.6875, + "learning_rate": 1.3062031705897852e-06, + "loss": 1.5364, + "step": 89660 + }, + { + "epoch": 2.6087209471448936, + "grad_norm": 11.6875, + "learning_rate": 1.3042638889171703e-06, + "loss": 1.4851, + "step": 89680 + }, + { + "epoch": 2.6093027314774413, + "grad_norm": 12.6875, + "learning_rate": 1.3023246072445554e-06, + "loss": 1.4141, + "step": 89700 + }, + { + "epoch": 2.6098845158099895, + "grad_norm": 12.0625, + "learning_rate": 1.3003853255719405e-06, + "loss": 1.5233, + "step": 89720 + }, + { + "epoch": 2.610466300142537, + "grad_norm": 12.4375, + "learning_rate": 1.2984460438993253e-06, + "loss": 1.4895, + "step": 89740 + }, + { + "epoch": 2.611048084475085, + "grad_norm": 16.0, + "learning_rate": 1.2965067622267104e-06, + "loss": 1.4961, + "step": 89760 + }, + { + "epoch": 2.611629868807633, + "grad_norm": 10.625, + "learning_rate": 1.2945674805540955e-06, + "loss": 1.5121, + "step": 89780 + }, + { + "epoch": 2.612211653140181, + "grad_norm": 14.3125, + "learning_rate": 1.2926281988814806e-06, + "loss": 1.5176, + "step": 89800 + }, + { + "epoch": 2.612793437472729, + "grad_norm": 14.625, + "learning_rate": 1.2906889172088657e-06, + "loss": 1.488, + "step": 89820 + }, + { + "epoch": 2.613375221805277, + "grad_norm": 15.0, + "learning_rate": 1.2887496355362508e-06, + "loss": 1.5184, + "step": 89840 + }, + { + "epoch": 2.6139570061378246, + "grad_norm": 14.1875, + "learning_rate": 1.286810353863636e-06, + "loss": 1.525, + "step": 89860 + }, + { + "epoch": 2.6145387904703727, + "grad_norm": 12.375, + "learning_rate": 1.284871072191021e-06, + "loss": 1.4853, + "step": 89880 + }, + { + "epoch": 2.6151205748029205, + "grad_norm": 12.1875, + "learning_rate": 1.2829317905184061e-06, + "loss": 1.5604, + "step": 89900 + }, + { + "epoch": 2.6157023591354687, + "grad_norm": 15.1875, + "learning_rate": 1.2809925088457912e-06, + "loss": 1.5214, + "step": 89920 + }, + { + "epoch": 2.6162841434680164, + "grad_norm": 15.25, + "learning_rate": 1.2790532271731761e-06, + "loss": 1.4771, + "step": 89940 + }, + { + "epoch": 2.616865927800564, + "grad_norm": 14.8125, + "learning_rate": 1.2771139455005612e-06, + "loss": 1.414, + "step": 89960 + }, + { + "epoch": 2.6174477121331123, + "grad_norm": 13.3125, + "learning_rate": 1.2751746638279463e-06, + "loss": 1.43, + "step": 89980 + }, + { + "epoch": 2.61802949646566, + "grad_norm": 13.75, + "learning_rate": 1.2732353821553314e-06, + "loss": 1.3977, + "step": 90000 + }, + { + "epoch": 2.6186112807982083, + "grad_norm": 12.9375, + "learning_rate": 1.2712961004827165e-06, + "loss": 1.491, + "step": 90020 + }, + { + "epoch": 2.619193065130756, + "grad_norm": 14.25, + "learning_rate": 1.2693568188101016e-06, + "loss": 1.541, + "step": 90040 + }, + { + "epoch": 2.6197748494633037, + "grad_norm": 13.875, + "learning_rate": 1.2674175371374867e-06, + "loss": 1.4521, + "step": 90060 + }, + { + "epoch": 2.620356633795852, + "grad_norm": 13.1875, + "learning_rate": 1.2654782554648714e-06, + "loss": 1.5054, + "step": 90080 + }, + { + "epoch": 2.6209384181283997, + "grad_norm": 11.5, + "learning_rate": 1.2635389737922565e-06, + "loss": 1.5068, + "step": 90100 + }, + { + "epoch": 2.621520202460948, + "grad_norm": 15.1875, + "learning_rate": 1.2615996921196416e-06, + "loss": 1.4902, + "step": 90120 + }, + { + "epoch": 2.6221019867934956, + "grad_norm": 12.5625, + "learning_rate": 1.2596604104470267e-06, + "loss": 1.3949, + "step": 90140 + }, + { + "epoch": 2.6226837711260433, + "grad_norm": 13.0625, + "learning_rate": 1.2577211287744118e-06, + "loss": 1.4714, + "step": 90160 + }, + { + "epoch": 2.6232655554585915, + "grad_norm": 14.5, + "learning_rate": 1.2557818471017969e-06, + "loss": 1.4378, + "step": 90180 + }, + { + "epoch": 2.6238473397911393, + "grad_norm": 14.0, + "learning_rate": 1.253842565429182e-06, + "loss": 1.5399, + "step": 90200 + }, + { + "epoch": 2.6244291241236875, + "grad_norm": 12.375, + "learning_rate": 1.251903283756567e-06, + "loss": 1.5688, + "step": 90220 + }, + { + "epoch": 2.625010908456235, + "grad_norm": 13.1875, + "learning_rate": 1.2499640020839522e-06, + "loss": 1.4745, + "step": 90240 + }, + { + "epoch": 2.625592692788783, + "grad_norm": 14.5625, + "learning_rate": 1.2480247204113372e-06, + "loss": 1.5646, + "step": 90260 + }, + { + "epoch": 2.626174477121331, + "grad_norm": 12.625, + "learning_rate": 1.2460854387387223e-06, + "loss": 1.435, + "step": 90280 + }, + { + "epoch": 2.6267562614538793, + "grad_norm": 15.8125, + "learning_rate": 1.2441461570661074e-06, + "loss": 1.558, + "step": 90300 + }, + { + "epoch": 2.627338045786427, + "grad_norm": 12.875, + "learning_rate": 1.2422068753934925e-06, + "loss": 1.4167, + "step": 90320 + }, + { + "epoch": 2.627919830118975, + "grad_norm": 15.25, + "learning_rate": 1.2402675937208776e-06, + "loss": 1.4858, + "step": 90340 + }, + { + "epoch": 2.628501614451523, + "grad_norm": 12.8125, + "learning_rate": 1.2383283120482625e-06, + "loss": 1.4881, + "step": 90360 + }, + { + "epoch": 2.6290833987840707, + "grad_norm": 15.75, + "learning_rate": 1.2363890303756476e-06, + "loss": 1.5406, + "step": 90380 + }, + { + "epoch": 2.629665183116619, + "grad_norm": 13.0, + "learning_rate": 1.2344497487030327e-06, + "loss": 1.5149, + "step": 90400 + }, + { + "epoch": 2.6302469674491666, + "grad_norm": 13.875, + "learning_rate": 1.2325104670304178e-06, + "loss": 1.5103, + "step": 90420 + }, + { + "epoch": 2.6308287517817144, + "grad_norm": 14.5625, + "learning_rate": 1.230571185357803e-06, + "loss": 1.459, + "step": 90440 + }, + { + "epoch": 2.6314105361142626, + "grad_norm": 16.0, + "learning_rate": 1.228631903685188e-06, + "loss": 1.4727, + "step": 90460 + }, + { + "epoch": 2.6319923204468103, + "grad_norm": 14.5625, + "learning_rate": 1.226692622012573e-06, + "loss": 1.4068, + "step": 90480 + }, + { + "epoch": 2.6325741047793585, + "grad_norm": 15.75, + "learning_rate": 1.224753340339958e-06, + "loss": 1.532, + "step": 90500 + }, + { + "epoch": 2.6331558891119062, + "grad_norm": 14.0, + "learning_rate": 1.222814058667343e-06, + "loss": 1.5277, + "step": 90520 + }, + { + "epoch": 2.633737673444454, + "grad_norm": 16.25, + "learning_rate": 1.2208747769947282e-06, + "loss": 1.5638, + "step": 90540 + }, + { + "epoch": 2.634319457777002, + "grad_norm": 14.625, + "learning_rate": 1.2189354953221133e-06, + "loss": 1.4908, + "step": 90560 + }, + { + "epoch": 2.63490124210955, + "grad_norm": 13.5625, + "learning_rate": 1.2169962136494984e-06, + "loss": 1.541, + "step": 90580 + }, + { + "epoch": 2.635483026442098, + "grad_norm": 14.125, + "learning_rate": 1.2150569319768835e-06, + "loss": 1.5557, + "step": 90600 + }, + { + "epoch": 2.636064810774646, + "grad_norm": 14.5625, + "learning_rate": 1.2131176503042686e-06, + "loss": 1.4827, + "step": 90620 + }, + { + "epoch": 2.6366465951071936, + "grad_norm": 13.5625, + "learning_rate": 1.2111783686316537e-06, + "loss": 1.4529, + "step": 90640 + }, + { + "epoch": 2.6372283794397418, + "grad_norm": 12.875, + "learning_rate": 1.2092390869590388e-06, + "loss": 1.5072, + "step": 90660 + }, + { + "epoch": 2.6378101637722895, + "grad_norm": 12.4375, + "learning_rate": 1.2072998052864237e-06, + "loss": 1.4667, + "step": 90680 + }, + { + "epoch": 2.6383919481048377, + "grad_norm": 15.625, + "learning_rate": 1.2053605236138088e-06, + "loss": 1.4551, + "step": 90700 + }, + { + "epoch": 2.6389737324373854, + "grad_norm": 12.0, + "learning_rate": 1.2034212419411939e-06, + "loss": 1.4715, + "step": 90720 + }, + { + "epoch": 2.639555516769933, + "grad_norm": 12.25, + "learning_rate": 1.201481960268579e-06, + "loss": 1.4934, + "step": 90740 + }, + { + "epoch": 2.6401373011024813, + "grad_norm": 16.125, + "learning_rate": 1.199542678595964e-06, + "loss": 1.4717, + "step": 90760 + }, + { + "epoch": 2.640719085435029, + "grad_norm": 12.5625, + "learning_rate": 1.1976033969233491e-06, + "loss": 1.4353, + "step": 90780 + }, + { + "epoch": 2.6413008697675773, + "grad_norm": 10.0, + "learning_rate": 1.1956641152507342e-06, + "loss": 1.4902, + "step": 90800 + }, + { + "epoch": 2.641882654100125, + "grad_norm": 13.0625, + "learning_rate": 1.1937248335781193e-06, + "loss": 1.4954, + "step": 90820 + }, + { + "epoch": 2.6424644384326728, + "grad_norm": 12.8125, + "learning_rate": 1.1917855519055044e-06, + "loss": 1.4846, + "step": 90840 + }, + { + "epoch": 2.643046222765221, + "grad_norm": 17.0, + "learning_rate": 1.1898462702328895e-06, + "loss": 1.5161, + "step": 90860 + }, + { + "epoch": 2.643628007097769, + "grad_norm": 15.1875, + "learning_rate": 1.1879069885602744e-06, + "loss": 1.4884, + "step": 90880 + }, + { + "epoch": 2.644209791430317, + "grad_norm": 17.75, + "learning_rate": 1.1859677068876595e-06, + "loss": 1.4544, + "step": 90900 + }, + { + "epoch": 2.6447915757628646, + "grad_norm": 15.5625, + "learning_rate": 1.1840284252150446e-06, + "loss": 1.5529, + "step": 90920 + }, + { + "epoch": 2.645373360095413, + "grad_norm": 13.625, + "learning_rate": 1.1820891435424297e-06, + "loss": 1.4614, + "step": 90940 + }, + { + "epoch": 2.6459551444279605, + "grad_norm": 13.9375, + "learning_rate": 1.1801498618698146e-06, + "loss": 1.4789, + "step": 90960 + }, + { + "epoch": 2.6465369287605087, + "grad_norm": 15.0, + "learning_rate": 1.1782105801971997e-06, + "loss": 1.5151, + "step": 90980 + }, + { + "epoch": 2.6471187130930565, + "grad_norm": 13.5625, + "learning_rate": 1.1762712985245848e-06, + "loss": 1.4421, + "step": 91000 + }, + { + "epoch": 2.647700497425604, + "grad_norm": 12.4375, + "learning_rate": 1.1743320168519699e-06, + "loss": 1.5315, + "step": 91020 + }, + { + "epoch": 2.6482822817581524, + "grad_norm": 14.75, + "learning_rate": 1.172392735179355e-06, + "loss": 1.4751, + "step": 91040 + }, + { + "epoch": 2.6488640660907, + "grad_norm": 15.8125, + "learning_rate": 1.17045345350674e-06, + "loss": 1.5342, + "step": 91060 + }, + { + "epoch": 2.6494458504232483, + "grad_norm": 13.0625, + "learning_rate": 1.1685141718341252e-06, + "loss": 1.5897, + "step": 91080 + }, + { + "epoch": 2.650027634755796, + "grad_norm": 15.0625, + "learning_rate": 1.1665748901615103e-06, + "loss": 1.4432, + "step": 91100 + }, + { + "epoch": 2.650609419088344, + "grad_norm": 12.3125, + "learning_rate": 1.1646356084888954e-06, + "loss": 1.471, + "step": 91120 + }, + { + "epoch": 2.651191203420892, + "grad_norm": 13.1875, + "learning_rate": 1.1626963268162805e-06, + "loss": 1.5025, + "step": 91140 + }, + { + "epoch": 2.6517729877534397, + "grad_norm": 13.25, + "learning_rate": 1.1607570451436654e-06, + "loss": 1.5553, + "step": 91160 + }, + { + "epoch": 2.652354772085988, + "grad_norm": 13.125, + "learning_rate": 1.1588177634710505e-06, + "loss": 1.4099, + "step": 91180 + }, + { + "epoch": 2.6529365564185357, + "grad_norm": 13.5625, + "learning_rate": 1.1568784817984356e-06, + "loss": 1.5497, + "step": 91200 + }, + { + "epoch": 2.6535183407510834, + "grad_norm": 15.5, + "learning_rate": 1.1549392001258207e-06, + "loss": 1.5543, + "step": 91220 + }, + { + "epoch": 2.6541001250836316, + "grad_norm": 8.625, + "learning_rate": 1.1529999184532058e-06, + "loss": 1.4677, + "step": 91240 + }, + { + "epoch": 2.6546819094161793, + "grad_norm": 15.125, + "learning_rate": 1.1510606367805909e-06, + "loss": 1.5021, + "step": 91260 + }, + { + "epoch": 2.6552636937487275, + "grad_norm": 14.8125, + "learning_rate": 1.149121355107976e-06, + "loss": 1.5207, + "step": 91280 + }, + { + "epoch": 2.6558454780812752, + "grad_norm": 14.625, + "learning_rate": 1.147182073435361e-06, + "loss": 1.5079, + "step": 91300 + }, + { + "epoch": 2.656427262413823, + "grad_norm": 15.5625, + "learning_rate": 1.1452427917627461e-06, + "loss": 1.3869, + "step": 91320 + }, + { + "epoch": 2.657009046746371, + "grad_norm": 10.875, + "learning_rate": 1.1433035100901312e-06, + "loss": 1.4704, + "step": 91340 + }, + { + "epoch": 2.657590831078919, + "grad_norm": 14.9375, + "learning_rate": 1.1413642284175163e-06, + "loss": 1.5158, + "step": 91360 + }, + { + "epoch": 2.658172615411467, + "grad_norm": 13.625, + "learning_rate": 1.1394249467449012e-06, + "loss": 1.4783, + "step": 91380 + }, + { + "epoch": 2.658754399744015, + "grad_norm": 12.5, + "learning_rate": 1.1374856650722863e-06, + "loss": 1.4656, + "step": 91400 + }, + { + "epoch": 2.6593361840765626, + "grad_norm": 16.625, + "learning_rate": 1.1355463833996714e-06, + "loss": 1.5551, + "step": 91420 + }, + { + "epoch": 2.6599179684091108, + "grad_norm": 11.4375, + "learning_rate": 1.1336071017270563e-06, + "loss": 1.5305, + "step": 91440 + }, + { + "epoch": 2.6604997527416585, + "grad_norm": 19.0, + "learning_rate": 1.1316678200544414e-06, + "loss": 1.5697, + "step": 91460 + }, + { + "epoch": 2.6610815370742067, + "grad_norm": 10.625, + "learning_rate": 1.1297285383818265e-06, + "loss": 1.4515, + "step": 91480 + }, + { + "epoch": 2.6616633214067544, + "grad_norm": 13.25, + "learning_rate": 1.1277892567092116e-06, + "loss": 1.5167, + "step": 91500 + }, + { + "epoch": 2.662245105739302, + "grad_norm": 12.0625, + "learning_rate": 1.1258499750365967e-06, + "loss": 1.4129, + "step": 91520 + }, + { + "epoch": 2.6628268900718504, + "grad_norm": 14.0625, + "learning_rate": 1.1239106933639818e-06, + "loss": 1.5106, + "step": 91540 + }, + { + "epoch": 2.6634086744043985, + "grad_norm": 13.1875, + "learning_rate": 1.1219714116913669e-06, + "loss": 1.5203, + "step": 91560 + }, + { + "epoch": 2.6639904587369463, + "grad_norm": 14.5, + "learning_rate": 1.120032130018752e-06, + "loss": 1.5161, + "step": 91580 + }, + { + "epoch": 2.664572243069494, + "grad_norm": 14.125, + "learning_rate": 1.118092848346137e-06, + "loss": 1.5112, + "step": 91600 + }, + { + "epoch": 2.665154027402042, + "grad_norm": 13.6875, + "learning_rate": 1.1161535666735222e-06, + "loss": 1.417, + "step": 91620 + }, + { + "epoch": 2.66573581173459, + "grad_norm": 13.375, + "learning_rate": 1.114214285000907e-06, + "loss": 1.4804, + "step": 91640 + }, + { + "epoch": 2.666317596067138, + "grad_norm": 11.5625, + "learning_rate": 1.1122750033282922e-06, + "loss": 1.4938, + "step": 91660 + }, + { + "epoch": 2.666899380399686, + "grad_norm": 11.3125, + "learning_rate": 1.1103357216556773e-06, + "loss": 1.5055, + "step": 91680 + }, + { + "epoch": 2.6674811647322336, + "grad_norm": 9.25, + "learning_rate": 1.1083964399830624e-06, + "loss": 1.4847, + "step": 91700 + }, + { + "epoch": 2.668062949064782, + "grad_norm": 15.0, + "learning_rate": 1.1064571583104475e-06, + "loss": 1.438, + "step": 91720 + }, + { + "epoch": 2.6686447333973295, + "grad_norm": 11.5625, + "learning_rate": 1.1045178766378326e-06, + "loss": 1.5066, + "step": 91740 + }, + { + "epoch": 2.6692265177298777, + "grad_norm": 14.1875, + "learning_rate": 1.1025785949652177e-06, + "loss": 1.4833, + "step": 91760 + }, + { + "epoch": 2.6698083020624255, + "grad_norm": 10.625, + "learning_rate": 1.1006393132926028e-06, + "loss": 1.509, + "step": 91780 + }, + { + "epoch": 2.670390086394973, + "grad_norm": 16.625, + "learning_rate": 1.0987000316199878e-06, + "loss": 1.584, + "step": 91800 + }, + { + "epoch": 2.6709718707275214, + "grad_norm": 15.1875, + "learning_rate": 1.096760749947373e-06, + "loss": 1.4688, + "step": 91820 + }, + { + "epoch": 2.671553655060069, + "grad_norm": 15.625, + "learning_rate": 1.0948214682747578e-06, + "loss": 1.4821, + "step": 91840 + }, + { + "epoch": 2.6721354393926173, + "grad_norm": 12.0625, + "learning_rate": 1.092882186602143e-06, + "loss": 1.4099, + "step": 91860 + }, + { + "epoch": 2.672717223725165, + "grad_norm": 16.375, + "learning_rate": 1.090942904929528e-06, + "loss": 1.4567, + "step": 91880 + }, + { + "epoch": 2.673299008057713, + "grad_norm": 14.125, + "learning_rate": 1.0890036232569131e-06, + "loss": 1.5113, + "step": 91900 + }, + { + "epoch": 2.673880792390261, + "grad_norm": 13.6875, + "learning_rate": 1.0870643415842982e-06, + "loss": 1.4174, + "step": 91920 + }, + { + "epoch": 2.6744625767228087, + "grad_norm": 15.875, + "learning_rate": 1.0851250599116831e-06, + "loss": 1.5172, + "step": 91940 + }, + { + "epoch": 2.675044361055357, + "grad_norm": 13.1875, + "learning_rate": 1.0831857782390682e-06, + "loss": 1.5045, + "step": 91960 + }, + { + "epoch": 2.6756261453879047, + "grad_norm": 14.8125, + "learning_rate": 1.0812464965664533e-06, + "loss": 1.4731, + "step": 91980 + }, + { + "epoch": 2.6762079297204524, + "grad_norm": 13.9375, + "learning_rate": 1.0793072148938384e-06, + "loss": 1.4835, + "step": 92000 + }, + { + "epoch": 2.6767897140530006, + "grad_norm": 12.3125, + "learning_rate": 1.0773679332212235e-06, + "loss": 1.4878, + "step": 92020 + }, + { + "epoch": 2.6773714983855483, + "grad_norm": 14.375, + "learning_rate": 1.0754286515486086e-06, + "loss": 1.4873, + "step": 92040 + }, + { + "epoch": 2.6779532827180965, + "grad_norm": 12.625, + "learning_rate": 1.0734893698759937e-06, + "loss": 1.5304, + "step": 92060 + }, + { + "epoch": 2.6785350670506443, + "grad_norm": 11.8125, + "learning_rate": 1.0715500882033788e-06, + "loss": 1.4758, + "step": 92080 + }, + { + "epoch": 2.679116851383192, + "grad_norm": 14.5625, + "learning_rate": 1.0696108065307639e-06, + "loss": 1.4777, + "step": 92100 + }, + { + "epoch": 2.67969863571574, + "grad_norm": 11.75, + "learning_rate": 1.0676715248581488e-06, + "loss": 1.4517, + "step": 92120 + }, + { + "epoch": 2.6802804200482884, + "grad_norm": 13.6875, + "learning_rate": 1.0657322431855339e-06, + "loss": 1.5198, + "step": 92140 + }, + { + "epoch": 2.680862204380836, + "grad_norm": 14.0, + "learning_rate": 1.063792961512919e-06, + "loss": 1.4252, + "step": 92160 + }, + { + "epoch": 2.681443988713384, + "grad_norm": 12.75, + "learning_rate": 1.061853679840304e-06, + "loss": 1.5404, + "step": 92180 + }, + { + "epoch": 2.682025773045932, + "grad_norm": 12.3125, + "learning_rate": 1.0599143981676892e-06, + "loss": 1.4618, + "step": 92200 + }, + { + "epoch": 2.68260755737848, + "grad_norm": 13.875, + "learning_rate": 1.0579751164950743e-06, + "loss": 1.6344, + "step": 92220 + }, + { + "epoch": 2.683189341711028, + "grad_norm": 14.125, + "learning_rate": 1.0560358348224594e-06, + "loss": 1.3988, + "step": 92240 + }, + { + "epoch": 2.6837711260435757, + "grad_norm": 12.375, + "learning_rate": 1.0540965531498445e-06, + "loss": 1.4478, + "step": 92260 + }, + { + "epoch": 2.6843529103761234, + "grad_norm": 11.875, + "learning_rate": 1.0521572714772296e-06, + "loss": 1.4711, + "step": 92280 + }, + { + "epoch": 2.6849346947086716, + "grad_norm": 16.75, + "learning_rate": 1.0502179898046146e-06, + "loss": 1.48, + "step": 92300 + }, + { + "epoch": 2.6855164790412194, + "grad_norm": 11.25, + "learning_rate": 1.0482787081319995e-06, + "loss": 1.4759, + "step": 92320 + }, + { + "epoch": 2.6860982633737676, + "grad_norm": 13.75, + "learning_rate": 1.0463394264593846e-06, + "loss": 1.4739, + "step": 92340 + }, + { + "epoch": 2.6866800477063153, + "grad_norm": 13.375, + "learning_rate": 1.0444001447867697e-06, + "loss": 1.4706, + "step": 92360 + }, + { + "epoch": 2.687261832038863, + "grad_norm": 12.5, + "learning_rate": 1.0424608631141548e-06, + "loss": 1.5112, + "step": 92380 + }, + { + "epoch": 2.6878436163714112, + "grad_norm": 15.0625, + "learning_rate": 1.04052158144154e-06, + "loss": 1.5621, + "step": 92400 + }, + { + "epoch": 2.688425400703959, + "grad_norm": 12.4375, + "learning_rate": 1.038582299768925e-06, + "loss": 1.5636, + "step": 92420 + }, + { + "epoch": 2.689007185036507, + "grad_norm": 13.0625, + "learning_rate": 1.0366430180963101e-06, + "loss": 1.4564, + "step": 92440 + }, + { + "epoch": 2.689588969369055, + "grad_norm": 12.125, + "learning_rate": 1.0347037364236952e-06, + "loss": 1.4961, + "step": 92460 + }, + { + "epoch": 2.6901707537016026, + "grad_norm": 14.125, + "learning_rate": 1.03276445475108e-06, + "loss": 1.4921, + "step": 92480 + }, + { + "epoch": 2.690752538034151, + "grad_norm": 12.6875, + "learning_rate": 1.0308251730784652e-06, + "loss": 1.5171, + "step": 92500 + }, + { + "epoch": 2.6913343223666986, + "grad_norm": 12.25, + "learning_rate": 1.0288858914058503e-06, + "loss": 1.4865, + "step": 92520 + }, + { + "epoch": 2.6919161066992467, + "grad_norm": 10.375, + "learning_rate": 1.0269466097332354e-06, + "loss": 1.4231, + "step": 92540 + }, + { + "epoch": 2.6924978910317945, + "grad_norm": 17.0, + "learning_rate": 1.0250073280606205e-06, + "loss": 1.468, + "step": 92560 + }, + { + "epoch": 2.6930796753643422, + "grad_norm": 10.8125, + "learning_rate": 1.0230680463880056e-06, + "loss": 1.5362, + "step": 92580 + }, + { + "epoch": 2.6936614596968904, + "grad_norm": 11.625, + "learning_rate": 1.0211287647153905e-06, + "loss": 1.4689, + "step": 92600 + }, + { + "epoch": 2.694243244029438, + "grad_norm": 14.0, + "learning_rate": 1.0191894830427756e-06, + "loss": 1.4739, + "step": 92620 + }, + { + "epoch": 2.6948250283619863, + "grad_norm": 12.125, + "learning_rate": 1.0172502013701607e-06, + "loss": 1.4328, + "step": 92640 + }, + { + "epoch": 2.695406812694534, + "grad_norm": 13.6875, + "learning_rate": 1.0153109196975458e-06, + "loss": 1.5491, + "step": 92660 + }, + { + "epoch": 2.695988597027082, + "grad_norm": 12.125, + "learning_rate": 1.0133716380249309e-06, + "loss": 1.5193, + "step": 92680 + }, + { + "epoch": 2.69657038135963, + "grad_norm": 12.6875, + "learning_rate": 1.011432356352316e-06, + "loss": 1.539, + "step": 92700 + }, + { + "epoch": 2.6971521656921777, + "grad_norm": 13.125, + "learning_rate": 1.009493074679701e-06, + "loss": 1.4706, + "step": 92720 + }, + { + "epoch": 2.697733950024726, + "grad_norm": 15.0, + "learning_rate": 1.0075537930070862e-06, + "loss": 1.6127, + "step": 92740 + }, + { + "epoch": 2.6983157343572737, + "grad_norm": 12.1875, + "learning_rate": 1.0056145113344713e-06, + "loss": 1.6467, + "step": 92760 + }, + { + "epoch": 2.6988975186898214, + "grad_norm": 13.9375, + "learning_rate": 1.0036752296618564e-06, + "loss": 1.5672, + "step": 92780 + }, + { + "epoch": 2.6994793030223696, + "grad_norm": 12.0625, + "learning_rate": 1.0017359479892412e-06, + "loss": 1.4779, + "step": 92800 + }, + { + "epoch": 2.700061087354918, + "grad_norm": 12.375, + "learning_rate": 9.997966663166263e-07, + "loss": 1.4741, + "step": 92820 + } + ], + "logging_steps": 20, + "max_steps": 103131, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 10314, + "total_flos": 6.503393522895938e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}