diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,42869 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 30584, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 3.269683494637719e-05, + "grad_norm": 11.53246177543574, + "learning_rate": 6.538084341288003e-09, + "loss": 1.6976, + "step": 1 + }, + { + "epoch": 0.00016348417473188595, + "grad_norm": 7.866519483173926, + "learning_rate": 3.269042170644002e-08, + "loss": 1.5545, + "step": 5 + }, + { + "epoch": 0.0003269683494637719, + "grad_norm": 8.385429816593474, + "learning_rate": 6.538084341288004e-08, + "loss": 1.6656, + "step": 10 + }, + { + "epoch": 0.0004904525241956579, + "grad_norm": 8.74230968183078, + "learning_rate": 9.807126511932004e-08, + "loss": 1.6516, + "step": 15 + }, + { + "epoch": 0.0006539366989275438, + "grad_norm": 6.458642393702712, + "learning_rate": 1.3076168682576007e-07, + "loss": 1.7937, + "step": 20 + }, + { + "epoch": 0.0008174208736594298, + "grad_norm": 8.633717765366027, + "learning_rate": 1.6345210853220009e-07, + "loss": 1.5648, + "step": 25 + }, + { + "epoch": 0.0009809050483913157, + "grad_norm": 6.883431555454866, + "learning_rate": 1.9614253023864007e-07, + "loss": 1.5733, + "step": 30 + }, + { + "epoch": 0.0011443892231232018, + "grad_norm": 6.681670031728655, + "learning_rate": 2.288329519450801e-07, + "loss": 1.6525, + "step": 35 + }, + { + "epoch": 0.0013078733978550876, + "grad_norm": 6.973999627725365, + "learning_rate": 2.6152337365152015e-07, + "loss": 1.8073, + "step": 40 + }, + { + "epoch": 0.0014713575725869737, + "grad_norm": 6.084370877022341, + "learning_rate": 2.9421379535796013e-07, + "loss": 1.6404, + "step": 45 + }, + { + "epoch": 0.0016348417473188595, + "grad_norm": 6.527828587199688, + "learning_rate": 3.2690421706440017e-07, + "loss": 1.6401, + "step": 50 + }, + { + "epoch": 0.0017983259220507456, + "grad_norm": 5.792917176310678, + "learning_rate": 3.5959463877084016e-07, + "loss": 1.6067, + "step": 55 + }, + { + "epoch": 0.0019618100967826314, + "grad_norm": 6.614504103315009, + "learning_rate": 3.9228506047728014e-07, + "loss": 1.5855, + "step": 60 + }, + { + "epoch": 0.0021252942715145173, + "grad_norm": 5.43531290355998, + "learning_rate": 4.2497548218372024e-07, + "loss": 1.6904, + "step": 65 + }, + { + "epoch": 0.0022887784462464035, + "grad_norm": 5.936713675914012, + "learning_rate": 4.576659038901602e-07, + "loss": 1.7444, + "step": 70 + }, + { + "epoch": 0.0024522626209782894, + "grad_norm": 5.57985349993025, + "learning_rate": 4.903563255966003e-07, + "loss": 1.6449, + "step": 75 + }, + { + "epoch": 0.0026157467957101752, + "grad_norm": 5.421400218496103, + "learning_rate": 5.230467473030403e-07, + "loss": 1.4972, + "step": 80 + }, + { + "epoch": 0.002779230970442061, + "grad_norm": 5.1088204732498435, + "learning_rate": 5.557371690094803e-07, + "loss": 1.4126, + "step": 85 + }, + { + "epoch": 0.0029427151451739473, + "grad_norm": 4.615887912188339, + "learning_rate": 5.884275907159203e-07, + "loss": 1.5413, + "step": 90 + }, + { + "epoch": 0.003106199319905833, + "grad_norm": 5.151943672483561, + "learning_rate": 6.211180124223603e-07, + "loss": 1.5935, + "step": 95 + }, + { + "epoch": 0.003269683494637719, + "grad_norm": 4.778629505444277, + "learning_rate": 6.538084341288003e-07, + "loss": 1.675, + "step": 100 + }, + { + "epoch": 0.003433167669369605, + "grad_norm": 4.853864722759421, + "learning_rate": 6.864988558352403e-07, + "loss": 1.5063, + "step": 105 + }, + { + "epoch": 0.003596651844101491, + "grad_norm": 4.8145179159673, + "learning_rate": 7.191892775416803e-07, + "loss": 1.4785, + "step": 110 + }, + { + "epoch": 0.003760136018833377, + "grad_norm": 4.6338668900005615, + "learning_rate": 7.518796992481203e-07, + "loss": 1.5398, + "step": 115 + }, + { + "epoch": 0.003923620193565263, + "grad_norm": 4.758388759866877, + "learning_rate": 7.845701209545603e-07, + "loss": 1.5166, + "step": 120 + }, + { + "epoch": 0.004087104368297149, + "grad_norm": 4.625389052930953, + "learning_rate": 8.172605426610005e-07, + "loss": 1.5195, + "step": 125 + }, + { + "epoch": 0.0042505885430290345, + "grad_norm": 4.565979457242895, + "learning_rate": 8.499509643674405e-07, + "loss": 1.5034, + "step": 130 + }, + { + "epoch": 0.004414072717760921, + "grad_norm": 4.821950478432614, + "learning_rate": 8.826413860738805e-07, + "loss": 1.5561, + "step": 135 + }, + { + "epoch": 0.004577556892492807, + "grad_norm": 4.37502114477393, + "learning_rate": 9.153318077803204e-07, + "loss": 1.5053, + "step": 140 + }, + { + "epoch": 0.0047410410672246925, + "grad_norm": 4.483298273085954, + "learning_rate": 9.480222294867604e-07, + "loss": 1.53, + "step": 145 + }, + { + "epoch": 0.004904525241956579, + "grad_norm": 4.297160365423383, + "learning_rate": 9.807126511932006e-07, + "loss": 1.3466, + "step": 150 + }, + { + "epoch": 0.005068009416688464, + "grad_norm": 4.212920159871134, + "learning_rate": 1.0134030728996405e-06, + "loss": 1.5488, + "step": 155 + }, + { + "epoch": 0.0052314935914203504, + "grad_norm": 4.6933298172723115, + "learning_rate": 1.0460934946060806e-06, + "loss": 1.4913, + "step": 160 + }, + { + "epoch": 0.005394977766152237, + "grad_norm": 4.454728458402107, + "learning_rate": 1.0787839163125205e-06, + "loss": 1.6233, + "step": 165 + }, + { + "epoch": 0.005558461940884122, + "grad_norm": 4.224377003305751, + "learning_rate": 1.1114743380189606e-06, + "loss": 1.4809, + "step": 170 + }, + { + "epoch": 0.005721946115616008, + "grad_norm": 4.203291901882852, + "learning_rate": 1.1441647597254007e-06, + "loss": 1.5015, + "step": 175 + }, + { + "epoch": 0.005885430290347895, + "grad_norm": 4.354525777494214, + "learning_rate": 1.1768551814318405e-06, + "loss": 1.6, + "step": 180 + }, + { + "epoch": 0.00604891446507978, + "grad_norm": 4.239131330203625, + "learning_rate": 1.2095456031382806e-06, + "loss": 1.5341, + "step": 185 + }, + { + "epoch": 0.006212398639811666, + "grad_norm": 4.84799690808271, + "learning_rate": 1.2422360248447205e-06, + "loss": 1.5432, + "step": 190 + }, + { + "epoch": 0.006375882814543552, + "grad_norm": 4.455414519074772, + "learning_rate": 1.2749264465511608e-06, + "loss": 1.5008, + "step": 195 + }, + { + "epoch": 0.006539366989275438, + "grad_norm": 4.679352478583967, + "learning_rate": 1.3076168682576007e-06, + "loss": 1.5581, + "step": 200 + }, + { + "epoch": 0.006702851164007324, + "grad_norm": 4.368025608177606, + "learning_rate": 1.3403072899640408e-06, + "loss": 1.5992, + "step": 205 + }, + { + "epoch": 0.00686633533873921, + "grad_norm": 4.744983241737685, + "learning_rate": 1.3729977116704807e-06, + "loss": 1.5676, + "step": 210 + }, + { + "epoch": 0.007029819513471096, + "grad_norm": 4.503709991408367, + "learning_rate": 1.4056881333769208e-06, + "loss": 1.46, + "step": 215 + }, + { + "epoch": 0.007193303688202982, + "grad_norm": 4.566724131929041, + "learning_rate": 1.4383785550833606e-06, + "loss": 1.599, + "step": 220 + }, + { + "epoch": 0.007356787862934868, + "grad_norm": 4.003933820619696, + "learning_rate": 1.4710689767898007e-06, + "loss": 1.4507, + "step": 225 + }, + { + "epoch": 0.007520272037666754, + "grad_norm": 4.41414051125347, + "learning_rate": 1.5037593984962406e-06, + "loss": 1.5999, + "step": 230 + }, + { + "epoch": 0.007683756212398639, + "grad_norm": 3.875171489052989, + "learning_rate": 1.5364498202026807e-06, + "loss": 1.4652, + "step": 235 + }, + { + "epoch": 0.007847240387130526, + "grad_norm": 4.297010254693229, + "learning_rate": 1.5691402419091206e-06, + "loss": 1.5696, + "step": 240 + }, + { + "epoch": 0.008010724561862411, + "grad_norm": 4.620755292226773, + "learning_rate": 1.6018306636155609e-06, + "loss": 1.6173, + "step": 245 + }, + { + "epoch": 0.008174208736594298, + "grad_norm": 4.2913348888644975, + "learning_rate": 1.634521085322001e-06, + "loss": 1.5476, + "step": 250 + }, + { + "epoch": 0.008337692911326184, + "grad_norm": 4.403548593141661, + "learning_rate": 1.6672115070284409e-06, + "loss": 1.4895, + "step": 255 + }, + { + "epoch": 0.008501177086058069, + "grad_norm": 4.4403369555937235, + "learning_rate": 1.699901928734881e-06, + "loss": 1.4954, + "step": 260 + }, + { + "epoch": 0.008664661260789956, + "grad_norm": 4.3842123659313, + "learning_rate": 1.7325923504413208e-06, + "loss": 1.3513, + "step": 265 + }, + { + "epoch": 0.008828145435521842, + "grad_norm": 4.538133628306449, + "learning_rate": 1.765282772147761e-06, + "loss": 1.5185, + "step": 270 + }, + { + "epoch": 0.008991629610253727, + "grad_norm": 4.498096798433422, + "learning_rate": 1.7979731938542008e-06, + "loss": 1.5617, + "step": 275 + }, + { + "epoch": 0.009155113784985614, + "grad_norm": 4.155320677589489, + "learning_rate": 1.8306636155606409e-06, + "loss": 1.4197, + "step": 280 + }, + { + "epoch": 0.0093185979597175, + "grad_norm": 4.5248615721685335, + "learning_rate": 1.8633540372670808e-06, + "loss": 1.5553, + "step": 285 + }, + { + "epoch": 0.009482082134449385, + "grad_norm": 4.072787603691544, + "learning_rate": 1.8960444589735209e-06, + "loss": 1.3947, + "step": 290 + }, + { + "epoch": 0.009645566309181272, + "grad_norm": 4.154149679974504, + "learning_rate": 1.928734880679961e-06, + "loss": 1.4309, + "step": 295 + }, + { + "epoch": 0.009809050483913158, + "grad_norm": 4.0067197783937845, + "learning_rate": 1.9614253023864012e-06, + "loss": 1.4146, + "step": 300 + }, + { + "epoch": 0.009972534658645043, + "grad_norm": 4.792171409551452, + "learning_rate": 1.994115724092841e-06, + "loss": 1.4588, + "step": 305 + }, + { + "epoch": 0.010136018833376928, + "grad_norm": 4.841541208623851, + "learning_rate": 2.026806145799281e-06, + "loss": 1.5245, + "step": 310 + }, + { + "epoch": 0.010299503008108815, + "grad_norm": 4.729733459241869, + "learning_rate": 2.059496567505721e-06, + "loss": 1.5689, + "step": 315 + }, + { + "epoch": 0.010462987182840701, + "grad_norm": 4.111362759808273, + "learning_rate": 2.092186989212161e-06, + "loss": 1.3592, + "step": 320 + }, + { + "epoch": 0.010626471357572586, + "grad_norm": 4.384010947736498, + "learning_rate": 2.124877410918601e-06, + "loss": 1.4826, + "step": 325 + }, + { + "epoch": 0.010789955532304473, + "grad_norm": 4.494560147011721, + "learning_rate": 2.157567832625041e-06, + "loss": 1.5034, + "step": 330 + }, + { + "epoch": 0.010953439707036359, + "grad_norm": 3.9505329696428597, + "learning_rate": 2.190258254331481e-06, + "loss": 1.4102, + "step": 335 + }, + { + "epoch": 0.011116923881768244, + "grad_norm": 4.4626824917483106, + "learning_rate": 2.222948676037921e-06, + "loss": 1.4165, + "step": 340 + }, + { + "epoch": 0.011280408056500131, + "grad_norm": 4.449609696214007, + "learning_rate": 2.255639097744361e-06, + "loss": 1.4835, + "step": 345 + }, + { + "epoch": 0.011443892231232017, + "grad_norm": 4.238272552874597, + "learning_rate": 2.2883295194508013e-06, + "loss": 1.5034, + "step": 350 + }, + { + "epoch": 0.011607376405963902, + "grad_norm": 4.661745033490215, + "learning_rate": 2.321019941157241e-06, + "loss": 1.4658, + "step": 355 + }, + { + "epoch": 0.01177086058069579, + "grad_norm": 4.40194771403147, + "learning_rate": 2.353710362863681e-06, + "loss": 1.4926, + "step": 360 + }, + { + "epoch": 0.011934344755427675, + "grad_norm": 4.458750953502561, + "learning_rate": 2.3864007845701214e-06, + "loss": 1.5758, + "step": 365 + }, + { + "epoch": 0.01209782893015956, + "grad_norm": 4.364136133629769, + "learning_rate": 2.4190912062765613e-06, + "loss": 1.7018, + "step": 370 + }, + { + "epoch": 0.012261313104891447, + "grad_norm": 5.745622884779079, + "learning_rate": 2.451781627983001e-06, + "loss": 1.463, + "step": 375 + }, + { + "epoch": 0.012424797279623333, + "grad_norm": 4.412600690017748, + "learning_rate": 2.484472049689441e-06, + "loss": 1.4214, + "step": 380 + }, + { + "epoch": 0.012588281454355218, + "grad_norm": 4.075313457900715, + "learning_rate": 2.5171624713958813e-06, + "loss": 1.4885, + "step": 385 + }, + { + "epoch": 0.012751765629087104, + "grad_norm": 4.496652078407448, + "learning_rate": 2.5498528931023216e-06, + "loss": 1.5128, + "step": 390 + }, + { + "epoch": 0.01291524980381899, + "grad_norm": 4.593465050422002, + "learning_rate": 2.5825433148087615e-06, + "loss": 1.4868, + "step": 395 + }, + { + "epoch": 0.013078733978550876, + "grad_norm": 4.271462328969999, + "learning_rate": 2.6152337365152014e-06, + "loss": 1.5123, + "step": 400 + }, + { + "epoch": 0.013242218153282762, + "grad_norm": 4.232399375931288, + "learning_rate": 2.6479241582216413e-06, + "loss": 1.409, + "step": 405 + }, + { + "epoch": 0.013405702328014649, + "grad_norm": 4.599995802245311, + "learning_rate": 2.6806145799280816e-06, + "loss": 1.5326, + "step": 410 + }, + { + "epoch": 0.013569186502746534, + "grad_norm": 4.149118975678685, + "learning_rate": 2.7133050016345214e-06, + "loss": 1.485, + "step": 415 + }, + { + "epoch": 0.01373267067747842, + "grad_norm": 4.526834507678867, + "learning_rate": 2.7459954233409613e-06, + "loss": 1.5772, + "step": 420 + }, + { + "epoch": 0.013896154852210307, + "grad_norm": 4.01272047804602, + "learning_rate": 2.778685845047401e-06, + "loss": 1.4248, + "step": 425 + }, + { + "epoch": 0.014059639026942192, + "grad_norm": 4.3062761742318845, + "learning_rate": 2.8113762667538415e-06, + "loss": 1.554, + "step": 430 + }, + { + "epoch": 0.014223123201674077, + "grad_norm": 4.590605781459307, + "learning_rate": 2.8440666884602814e-06, + "loss": 1.413, + "step": 435 + }, + { + "epoch": 0.014386607376405965, + "grad_norm": 4.144051114690088, + "learning_rate": 2.8767571101667213e-06, + "loss": 1.4797, + "step": 440 + }, + { + "epoch": 0.01455009155113785, + "grad_norm": 4.356681694832766, + "learning_rate": 2.909447531873161e-06, + "loss": 1.4533, + "step": 445 + }, + { + "epoch": 0.014713575725869735, + "grad_norm": 4.518320186278322, + "learning_rate": 2.9421379535796015e-06, + "loss": 1.5215, + "step": 450 + }, + { + "epoch": 0.014877059900601623, + "grad_norm": 4.469201628209385, + "learning_rate": 2.9748283752860413e-06, + "loss": 1.6134, + "step": 455 + }, + { + "epoch": 0.015040544075333508, + "grad_norm": 4.56364710765259, + "learning_rate": 3.007518796992481e-06, + "loss": 1.6731, + "step": 460 + }, + { + "epoch": 0.015204028250065393, + "grad_norm": 4.559955167348464, + "learning_rate": 3.040209218698921e-06, + "loss": 1.5151, + "step": 465 + }, + { + "epoch": 0.015367512424797279, + "grad_norm": 4.251554589378753, + "learning_rate": 3.0728996404053614e-06, + "loss": 1.3725, + "step": 470 + }, + { + "epoch": 0.015530996599529166, + "grad_norm": 4.048682024838568, + "learning_rate": 3.1055900621118013e-06, + "loss": 1.5524, + "step": 475 + }, + { + "epoch": 0.01569448077426105, + "grad_norm": 4.223058466309042, + "learning_rate": 3.138280483818241e-06, + "loss": 1.4821, + "step": 480 + }, + { + "epoch": 0.01585796494899294, + "grad_norm": 4.260523448673345, + "learning_rate": 3.170970905524682e-06, + "loss": 1.5725, + "step": 485 + }, + { + "epoch": 0.016021449123724822, + "grad_norm": 4.258531388497235, + "learning_rate": 3.2036613272311218e-06, + "loss": 1.5346, + "step": 490 + }, + { + "epoch": 0.01618493329845671, + "grad_norm": 4.437548149775433, + "learning_rate": 3.2363517489375616e-06, + "loss": 1.5439, + "step": 495 + }, + { + "epoch": 0.016348417473188596, + "grad_norm": 4.112215435711217, + "learning_rate": 3.269042170644002e-06, + "loss": 1.5229, + "step": 500 + }, + { + "epoch": 0.01651190164792048, + "grad_norm": 4.245731966248816, + "learning_rate": 3.301732592350442e-06, + "loss": 1.673, + "step": 505 + }, + { + "epoch": 0.016675385822652367, + "grad_norm": 4.361022433505159, + "learning_rate": 3.3344230140568817e-06, + "loss": 1.4359, + "step": 510 + }, + { + "epoch": 0.016838869997384254, + "grad_norm": 4.205536275087419, + "learning_rate": 3.3671134357633216e-06, + "loss": 1.4127, + "step": 515 + }, + { + "epoch": 0.017002354172116138, + "grad_norm": 5.251054859452115, + "learning_rate": 3.399803857469762e-06, + "loss": 1.5817, + "step": 520 + }, + { + "epoch": 0.017165838346848025, + "grad_norm": 4.285472976258201, + "learning_rate": 3.4324942791762018e-06, + "loss": 1.5916, + "step": 525 + }, + { + "epoch": 0.017329322521579912, + "grad_norm": 4.322779432815829, + "learning_rate": 3.4651847008826416e-06, + "loss": 1.5469, + "step": 530 + }, + { + "epoch": 0.017492806696311796, + "grad_norm": 4.088166723614205, + "learning_rate": 3.4978751225890815e-06, + "loss": 1.5497, + "step": 535 + }, + { + "epoch": 0.017656290871043683, + "grad_norm": 4.434063998400928, + "learning_rate": 3.530565544295522e-06, + "loss": 1.453, + "step": 540 + }, + { + "epoch": 0.01781977504577557, + "grad_norm": 4.31915778318682, + "learning_rate": 3.5632559660019617e-06, + "loss": 1.4363, + "step": 545 + }, + { + "epoch": 0.017983259220507454, + "grad_norm": 4.3756123750021665, + "learning_rate": 3.5959463877084016e-06, + "loss": 1.5622, + "step": 550 + }, + { + "epoch": 0.01814674339523934, + "grad_norm": 4.361607152720975, + "learning_rate": 3.6286368094148415e-06, + "loss": 1.4164, + "step": 555 + }, + { + "epoch": 0.01831022756997123, + "grad_norm": 4.22948728550788, + "learning_rate": 3.6613272311212818e-06, + "loss": 1.4778, + "step": 560 + }, + { + "epoch": 0.018473711744703112, + "grad_norm": 4.216416173706302, + "learning_rate": 3.6940176528277216e-06, + "loss": 1.5113, + "step": 565 + }, + { + "epoch": 0.018637195919435, + "grad_norm": 4.295985852952525, + "learning_rate": 3.7267080745341615e-06, + "loss": 1.4923, + "step": 570 + }, + { + "epoch": 0.018800680094166886, + "grad_norm": 4.361354717544308, + "learning_rate": 3.7593984962406014e-06, + "loss": 1.5188, + "step": 575 + }, + { + "epoch": 0.01896416426889877, + "grad_norm": 3.981535072141482, + "learning_rate": 3.7920889179470417e-06, + "loss": 1.5557, + "step": 580 + }, + { + "epoch": 0.019127648443630657, + "grad_norm": 4.226913952786837, + "learning_rate": 3.824779339653482e-06, + "loss": 1.5889, + "step": 585 + }, + { + "epoch": 0.019291132618362544, + "grad_norm": 3.9594245185575656, + "learning_rate": 3.857469761359922e-06, + "loss": 1.4345, + "step": 590 + }, + { + "epoch": 0.019454616793094428, + "grad_norm": 4.694788180392296, + "learning_rate": 3.890160183066362e-06, + "loss": 1.5502, + "step": 595 + }, + { + "epoch": 0.019618100967826315, + "grad_norm": 4.316511307108296, + "learning_rate": 3.9228506047728025e-06, + "loss": 1.4398, + "step": 600 + }, + { + "epoch": 0.0197815851425582, + "grad_norm": 3.9898905558208604, + "learning_rate": 3.955541026479242e-06, + "loss": 1.5031, + "step": 605 + }, + { + "epoch": 0.019945069317290086, + "grad_norm": 4.141083175826108, + "learning_rate": 3.988231448185682e-06, + "loss": 1.47, + "step": 610 + }, + { + "epoch": 0.020108553492021973, + "grad_norm": 4.789181356184849, + "learning_rate": 4.020921869892122e-06, + "loss": 1.4277, + "step": 615 + }, + { + "epoch": 0.020272037666753857, + "grad_norm": 4.10294150104687, + "learning_rate": 4.053612291598562e-06, + "loss": 1.5631, + "step": 620 + }, + { + "epoch": 0.020435521841485744, + "grad_norm": 4.457129510032734, + "learning_rate": 4.086302713305002e-06, + "loss": 1.5305, + "step": 625 + }, + { + "epoch": 0.02059900601621763, + "grad_norm": 4.4558077729314665, + "learning_rate": 4.118993135011442e-06, + "loss": 1.3461, + "step": 630 + }, + { + "epoch": 0.020762490190949515, + "grad_norm": 3.9716450290667584, + "learning_rate": 4.151683556717882e-06, + "loss": 1.3819, + "step": 635 + }, + { + "epoch": 0.020925974365681402, + "grad_norm": 4.930344023358277, + "learning_rate": 4.184373978424322e-06, + "loss": 1.5418, + "step": 640 + }, + { + "epoch": 0.02108945854041329, + "grad_norm": 4.474975420883922, + "learning_rate": 4.217064400130762e-06, + "loss": 1.526, + "step": 645 + }, + { + "epoch": 0.021252942715145173, + "grad_norm": 4.769747451146745, + "learning_rate": 4.249754821837202e-06, + "loss": 1.5262, + "step": 650 + }, + { + "epoch": 0.02141642688987706, + "grad_norm": 4.333299809272065, + "learning_rate": 4.282445243543642e-06, + "loss": 1.5167, + "step": 655 + }, + { + "epoch": 0.021579911064608947, + "grad_norm": 4.1541586955508825, + "learning_rate": 4.315135665250082e-06, + "loss": 1.4854, + "step": 660 + }, + { + "epoch": 0.02174339523934083, + "grad_norm": 4.604248602113252, + "learning_rate": 4.347826086956522e-06, + "loss": 1.5877, + "step": 665 + }, + { + "epoch": 0.021906879414072718, + "grad_norm": 4.050455935047983, + "learning_rate": 4.380516508662962e-06, + "loss": 1.438, + "step": 670 + }, + { + "epoch": 0.022070363588804605, + "grad_norm": 4.378324060420521, + "learning_rate": 4.413206930369402e-06, + "loss": 1.4768, + "step": 675 + }, + { + "epoch": 0.02223384776353649, + "grad_norm": 4.007755515093857, + "learning_rate": 4.445897352075842e-06, + "loss": 1.4387, + "step": 680 + }, + { + "epoch": 0.022397331938268376, + "grad_norm": 4.39016079531668, + "learning_rate": 4.478587773782282e-06, + "loss": 1.4821, + "step": 685 + }, + { + "epoch": 0.022560816113000263, + "grad_norm": 4.362439860334016, + "learning_rate": 4.511278195488722e-06, + "loss": 1.5313, + "step": 690 + }, + { + "epoch": 0.022724300287732146, + "grad_norm": 4.009495767185107, + "learning_rate": 4.543968617195162e-06, + "loss": 1.4905, + "step": 695 + }, + { + "epoch": 0.022887784462464034, + "grad_norm": 4.235339702229562, + "learning_rate": 4.576659038901603e-06, + "loss": 1.5454, + "step": 700 + }, + { + "epoch": 0.02305126863719592, + "grad_norm": 4.341710155485561, + "learning_rate": 4.609349460608042e-06, + "loss": 1.4486, + "step": 705 + }, + { + "epoch": 0.023214752811927804, + "grad_norm": 4.367908314804537, + "learning_rate": 4.642039882314482e-06, + "loss": 1.4358, + "step": 710 + }, + { + "epoch": 0.02337823698665969, + "grad_norm": 4.416397263729252, + "learning_rate": 4.674730304020923e-06, + "loss": 1.5786, + "step": 715 + }, + { + "epoch": 0.02354172116139158, + "grad_norm": 4.158823406022106, + "learning_rate": 4.707420725727362e-06, + "loss": 1.6092, + "step": 720 + }, + { + "epoch": 0.023705205336123462, + "grad_norm": 4.455249933951134, + "learning_rate": 4.7401111474338025e-06, + "loss": 1.4738, + "step": 725 + }, + { + "epoch": 0.02386868951085535, + "grad_norm": 4.014834413653053, + "learning_rate": 4.772801569140243e-06, + "loss": 1.5509, + "step": 730 + }, + { + "epoch": 0.024032173685587237, + "grad_norm": 4.390063119249711, + "learning_rate": 4.805491990846682e-06, + "loss": 1.5214, + "step": 735 + }, + { + "epoch": 0.02419565786031912, + "grad_norm": 4.395222447719511, + "learning_rate": 4.8381824125531225e-06, + "loss": 1.4705, + "step": 740 + }, + { + "epoch": 0.024359142035051008, + "grad_norm": 4.307000287177215, + "learning_rate": 4.870872834259562e-06, + "loss": 1.682, + "step": 745 + }, + { + "epoch": 0.024522626209782895, + "grad_norm": 4.309814694237488, + "learning_rate": 4.903563255966002e-06, + "loss": 1.5483, + "step": 750 + }, + { + "epoch": 0.02468611038451478, + "grad_norm": 4.279635520154343, + "learning_rate": 4.936253677672443e-06, + "loss": 1.6198, + "step": 755 + }, + { + "epoch": 0.024849594559246665, + "grad_norm": 4.057543930443958, + "learning_rate": 4.968944099378882e-06, + "loss": 1.516, + "step": 760 + }, + { + "epoch": 0.02501307873397855, + "grad_norm": 4.240572571135748, + "learning_rate": 5.001634521085322e-06, + "loss": 1.3095, + "step": 765 + }, + { + "epoch": 0.025176562908710436, + "grad_norm": 4.216830813656435, + "learning_rate": 5.034324942791763e-06, + "loss": 1.5588, + "step": 770 + }, + { + "epoch": 0.025340047083442323, + "grad_norm": 4.33585673032408, + "learning_rate": 5.067015364498202e-06, + "loss": 1.6088, + "step": 775 + }, + { + "epoch": 0.025503531258174207, + "grad_norm": 4.274001437653932, + "learning_rate": 5.099705786204643e-06, + "loss": 1.5788, + "step": 780 + }, + { + "epoch": 0.025667015432906094, + "grad_norm": 4.237876914966019, + "learning_rate": 5.132396207911083e-06, + "loss": 1.6693, + "step": 785 + }, + { + "epoch": 0.02583049960763798, + "grad_norm": 4.3308831620944375, + "learning_rate": 5.165086629617523e-06, + "loss": 1.4572, + "step": 790 + }, + { + "epoch": 0.025993983782369865, + "grad_norm": 4.309533241685453, + "learning_rate": 5.1977770513239625e-06, + "loss": 1.4684, + "step": 795 + }, + { + "epoch": 0.026157467957101752, + "grad_norm": 4.262586226118156, + "learning_rate": 5.230467473030403e-06, + "loss": 1.3548, + "step": 800 + }, + { + "epoch": 0.02632095213183364, + "grad_norm": 4.652396167472839, + "learning_rate": 5.263157894736842e-06, + "loss": 1.4974, + "step": 805 + }, + { + "epoch": 0.026484436306565523, + "grad_norm": 4.171223164478124, + "learning_rate": 5.2958483164432825e-06, + "loss": 1.5104, + "step": 810 + }, + { + "epoch": 0.02664792048129741, + "grad_norm": 4.282185044299118, + "learning_rate": 5.328538738149722e-06, + "loss": 1.4006, + "step": 815 + }, + { + "epoch": 0.026811404656029297, + "grad_norm": 4.157672428367128, + "learning_rate": 5.361229159856163e-06, + "loss": 1.3238, + "step": 820 + }, + { + "epoch": 0.02697488883076118, + "grad_norm": 4.2675036558870705, + "learning_rate": 5.393919581562603e-06, + "loss": 1.4734, + "step": 825 + }, + { + "epoch": 0.027138373005493068, + "grad_norm": 4.053764392308675, + "learning_rate": 5.426610003269043e-06, + "loss": 1.4671, + "step": 830 + }, + { + "epoch": 0.027301857180224955, + "grad_norm": 4.195959891118389, + "learning_rate": 5.459300424975482e-06, + "loss": 1.4097, + "step": 835 + }, + { + "epoch": 0.02746534135495684, + "grad_norm": 4.144658176868027, + "learning_rate": 5.491990846681923e-06, + "loss": 1.5506, + "step": 840 + }, + { + "epoch": 0.027628825529688726, + "grad_norm": 4.5627323434974905, + "learning_rate": 5.524681268388362e-06, + "loss": 1.6143, + "step": 845 + }, + { + "epoch": 0.027792309704420613, + "grad_norm": 4.2327570270296775, + "learning_rate": 5.557371690094802e-06, + "loss": 1.541, + "step": 850 + }, + { + "epoch": 0.027955793879152497, + "grad_norm": 4.163878819206847, + "learning_rate": 5.590062111801242e-06, + "loss": 1.448, + "step": 855 + }, + { + "epoch": 0.028119278053884384, + "grad_norm": 3.7949053264844124, + "learning_rate": 5.622752533507683e-06, + "loss": 1.4098, + "step": 860 + }, + { + "epoch": 0.02828276222861627, + "grad_norm": 4.295143133469959, + "learning_rate": 5.655442955214123e-06, + "loss": 1.4333, + "step": 865 + }, + { + "epoch": 0.028446246403348155, + "grad_norm": 3.963608194306386, + "learning_rate": 5.688133376920563e-06, + "loss": 1.5726, + "step": 870 + }, + { + "epoch": 0.028609730578080042, + "grad_norm": 4.153874173216181, + "learning_rate": 5.720823798627003e-06, + "loss": 1.3877, + "step": 875 + }, + { + "epoch": 0.02877321475281193, + "grad_norm": 4.161794183672799, + "learning_rate": 5.7535142203334425e-06, + "loss": 1.5396, + "step": 880 + }, + { + "epoch": 0.028936698927543813, + "grad_norm": 4.089427218687475, + "learning_rate": 5.786204642039883e-06, + "loss": 1.498, + "step": 885 + }, + { + "epoch": 0.0291001831022757, + "grad_norm": 4.1913272129258266, + "learning_rate": 5.818895063746322e-06, + "loss": 1.5084, + "step": 890 + }, + { + "epoch": 0.029263667277007587, + "grad_norm": 4.114611626262438, + "learning_rate": 5.8515854854527634e-06, + "loss": 1.4475, + "step": 895 + }, + { + "epoch": 0.02942715145173947, + "grad_norm": 3.9860626007837827, + "learning_rate": 5.884275907159203e-06, + "loss": 1.363, + "step": 900 + }, + { + "epoch": 0.029590635626471358, + "grad_norm": 4.213419654658515, + "learning_rate": 5.916966328865643e-06, + "loss": 1.4449, + "step": 905 + }, + { + "epoch": 0.029754119801203245, + "grad_norm": 4.15183776577538, + "learning_rate": 5.949656750572083e-06, + "loss": 1.537, + "step": 910 + }, + { + "epoch": 0.02991760397593513, + "grad_norm": 4.407607653200135, + "learning_rate": 5.982347172278523e-06, + "loss": 1.6604, + "step": 915 + }, + { + "epoch": 0.030081088150667016, + "grad_norm": 4.240581262757686, + "learning_rate": 6.015037593984962e-06, + "loss": 1.4144, + "step": 920 + }, + { + "epoch": 0.030244572325398903, + "grad_norm": 4.11747090241219, + "learning_rate": 6.047728015691403e-06, + "loss": 1.4969, + "step": 925 + }, + { + "epoch": 0.030408056500130787, + "grad_norm": 3.986941437264067, + "learning_rate": 6.080418437397842e-06, + "loss": 1.5739, + "step": 930 + }, + { + "epoch": 0.030571540674862674, + "grad_norm": 4.431387598779245, + "learning_rate": 6.113108859104283e-06, + "loss": 1.5508, + "step": 935 + }, + { + "epoch": 0.030735024849594558, + "grad_norm": 4.027168221606797, + "learning_rate": 6.145799280810723e-06, + "loss": 1.6235, + "step": 940 + }, + { + "epoch": 0.030898509024326445, + "grad_norm": 4.053341226867391, + "learning_rate": 6.178489702517163e-06, + "loss": 1.464, + "step": 945 + }, + { + "epoch": 0.031061993199058332, + "grad_norm": 4.253916129932654, + "learning_rate": 6.2111801242236025e-06, + "loss": 1.5351, + "step": 950 + }, + { + "epoch": 0.031225477373790216, + "grad_norm": 4.3886346195582675, + "learning_rate": 6.243870545930043e-06, + "loss": 1.4889, + "step": 955 + }, + { + "epoch": 0.0313889615485221, + "grad_norm": 4.103538808422949, + "learning_rate": 6.276560967636482e-06, + "loss": 1.4132, + "step": 960 + }, + { + "epoch": 0.03155244572325399, + "grad_norm": 4.719176064437973, + "learning_rate": 6.309251389342923e-06, + "loss": 1.6032, + "step": 965 + }, + { + "epoch": 0.03171592989798588, + "grad_norm": 3.8969190272193877, + "learning_rate": 6.341941811049364e-06, + "loss": 1.3237, + "step": 970 + }, + { + "epoch": 0.031879414072717764, + "grad_norm": 4.034595224450556, + "learning_rate": 6.374632232755803e-06, + "loss": 1.3565, + "step": 975 + }, + { + "epoch": 0.032042898247449644, + "grad_norm": 4.392747203767942, + "learning_rate": 6.4073226544622435e-06, + "loss": 1.6049, + "step": 980 + }, + { + "epoch": 0.03220638242218153, + "grad_norm": 4.4215469174725115, + "learning_rate": 6.440013076168683e-06, + "loss": 1.5112, + "step": 985 + }, + { + "epoch": 0.03236986659691342, + "grad_norm": 3.8373413293094916, + "learning_rate": 6.472703497875123e-06, + "loss": 1.459, + "step": 990 + }, + { + "epoch": 0.032533350771645306, + "grad_norm": 4.2534130113083375, + "learning_rate": 6.505393919581563e-06, + "loss": 1.4608, + "step": 995 + }, + { + "epoch": 0.03269683494637719, + "grad_norm": 4.0965993524142155, + "learning_rate": 6.538084341288004e-06, + "loss": 1.4073, + "step": 1000 + }, + { + "epoch": 0.03286031912110908, + "grad_norm": 4.415711949576092, + "learning_rate": 6.570774762994443e-06, + "loss": 1.4919, + "step": 1005 + }, + { + "epoch": 0.03302380329584096, + "grad_norm": 4.0702996982756, + "learning_rate": 6.603465184700884e-06, + "loss": 1.5331, + "step": 1010 + }, + { + "epoch": 0.03318728747057285, + "grad_norm": 3.9810603540474574, + "learning_rate": 6.636155606407323e-06, + "loss": 1.5156, + "step": 1015 + }, + { + "epoch": 0.033350771645304735, + "grad_norm": 4.025314093888485, + "learning_rate": 6.668846028113763e-06, + "loss": 1.3719, + "step": 1020 + }, + { + "epoch": 0.03351425582003662, + "grad_norm": 4.409136482486669, + "learning_rate": 6.701536449820203e-06, + "loss": 1.4584, + "step": 1025 + }, + { + "epoch": 0.03367773999476851, + "grad_norm": 4.2887702322086945, + "learning_rate": 6.734226871526643e-06, + "loss": 1.5128, + "step": 1030 + }, + { + "epoch": 0.03384122416950039, + "grad_norm": 4.311271683180026, + "learning_rate": 6.766917293233083e-06, + "loss": 1.5426, + "step": 1035 + }, + { + "epoch": 0.034004708344232276, + "grad_norm": 4.049902913340995, + "learning_rate": 6.799607714939524e-06, + "loss": 1.4529, + "step": 1040 + }, + { + "epoch": 0.03416819251896416, + "grad_norm": 3.8200839111718463, + "learning_rate": 6.832298136645963e-06, + "loss": 1.4612, + "step": 1045 + }, + { + "epoch": 0.03433167669369605, + "grad_norm": 4.241727181348703, + "learning_rate": 6.8649885583524035e-06, + "loss": 1.5739, + "step": 1050 + }, + { + "epoch": 0.03449516086842794, + "grad_norm": 4.219783806464152, + "learning_rate": 6.897678980058843e-06, + "loss": 1.449, + "step": 1055 + }, + { + "epoch": 0.034658645043159825, + "grad_norm": 4.347510414126138, + "learning_rate": 6.930369401765283e-06, + "loss": 1.5269, + "step": 1060 + }, + { + "epoch": 0.034822129217891705, + "grad_norm": 4.414073016442943, + "learning_rate": 6.963059823471723e-06, + "loss": 1.7183, + "step": 1065 + }, + { + "epoch": 0.03498561339262359, + "grad_norm": 4.2870142426600415, + "learning_rate": 6.995750245178163e-06, + "loss": 1.5326, + "step": 1070 + }, + { + "epoch": 0.03514909756735548, + "grad_norm": 4.12923767861467, + "learning_rate": 7.028440666884604e-06, + "loss": 1.4009, + "step": 1075 + }, + { + "epoch": 0.035312581742087366, + "grad_norm": 3.8719200527445334, + "learning_rate": 7.061131088591044e-06, + "loss": 1.5157, + "step": 1080 + }, + { + "epoch": 0.035476065916819254, + "grad_norm": 3.9139787225006946, + "learning_rate": 7.093821510297484e-06, + "loss": 1.4113, + "step": 1085 + }, + { + "epoch": 0.03563955009155114, + "grad_norm": 4.2785251475952535, + "learning_rate": 7.126511932003923e-06, + "loss": 1.3903, + "step": 1090 + }, + { + "epoch": 0.03580303426628302, + "grad_norm": 4.040999435751379, + "learning_rate": 7.159202353710364e-06, + "loss": 1.4963, + "step": 1095 + }, + { + "epoch": 0.03596651844101491, + "grad_norm": 4.037567297542597, + "learning_rate": 7.191892775416803e-06, + "loss": 1.4622, + "step": 1100 + }, + { + "epoch": 0.036130002615746795, + "grad_norm": 4.040972768823374, + "learning_rate": 7.2245831971232435e-06, + "loss": 1.4607, + "step": 1105 + }, + { + "epoch": 0.03629348679047868, + "grad_norm": 4.161302883897017, + "learning_rate": 7.257273618829683e-06, + "loss": 1.4292, + "step": 1110 + }, + { + "epoch": 0.03645697096521057, + "grad_norm": 4.471415080629235, + "learning_rate": 7.289964040536124e-06, + "loss": 1.5022, + "step": 1115 + }, + { + "epoch": 0.03662045513994246, + "grad_norm": 4.431004881912511, + "learning_rate": 7.3226544622425635e-06, + "loss": 1.4872, + "step": 1120 + }, + { + "epoch": 0.03678393931467434, + "grad_norm": 4.599333514243306, + "learning_rate": 7.355344883949004e-06, + "loss": 1.569, + "step": 1125 + }, + { + "epoch": 0.036947423489406224, + "grad_norm": 4.112868230048562, + "learning_rate": 7.388035305655443e-06, + "loss": 1.383, + "step": 1130 + }, + { + "epoch": 0.03711090766413811, + "grad_norm": 4.500052474602167, + "learning_rate": 7.420725727361884e-06, + "loss": 1.538, + "step": 1135 + }, + { + "epoch": 0.03727439183887, + "grad_norm": 4.288208834441987, + "learning_rate": 7.453416149068323e-06, + "loss": 1.5504, + "step": 1140 + }, + { + "epoch": 0.037437876013601885, + "grad_norm": 4.441647328812847, + "learning_rate": 7.486106570774763e-06, + "loss": 1.4884, + "step": 1145 + }, + { + "epoch": 0.03760136018833377, + "grad_norm": 4.108147319408199, + "learning_rate": 7.518796992481203e-06, + "loss": 1.4708, + "step": 1150 + }, + { + "epoch": 0.03776484436306565, + "grad_norm": 3.9993995031724836, + "learning_rate": 7.551487414187644e-06, + "loss": 1.3931, + "step": 1155 + }, + { + "epoch": 0.03792832853779754, + "grad_norm": 4.53863337248771, + "learning_rate": 7.584177835894083e-06, + "loss": 1.5571, + "step": 1160 + }, + { + "epoch": 0.03809181271252943, + "grad_norm": 4.484403890926167, + "learning_rate": 7.616868257600524e-06, + "loss": 1.5483, + "step": 1165 + }, + { + "epoch": 0.038255296887261314, + "grad_norm": 3.938277253681699, + "learning_rate": 7.649558679306963e-06, + "loss": 1.4636, + "step": 1170 + }, + { + "epoch": 0.0384187810619932, + "grad_norm": 4.199798286574563, + "learning_rate": 7.682249101013403e-06, + "loss": 1.6237, + "step": 1175 + }, + { + "epoch": 0.03858226523672509, + "grad_norm": 4.175257247538653, + "learning_rate": 7.714939522719844e-06, + "loss": 1.5643, + "step": 1180 + }, + { + "epoch": 0.03874574941145697, + "grad_norm": 4.1718795926442525, + "learning_rate": 7.747629944426284e-06, + "loss": 1.5943, + "step": 1185 + }, + { + "epoch": 0.038909233586188856, + "grad_norm": 4.136319431471888, + "learning_rate": 7.780320366132724e-06, + "loss": 1.4473, + "step": 1190 + }, + { + "epoch": 0.03907271776092074, + "grad_norm": 4.080596850970854, + "learning_rate": 7.813010787839163e-06, + "loss": 1.4892, + "step": 1195 + }, + { + "epoch": 0.03923620193565263, + "grad_norm": 4.013350110177807, + "learning_rate": 7.845701209545605e-06, + "loss": 1.5475, + "step": 1200 + }, + { + "epoch": 0.03939968611038452, + "grad_norm": 4.309141224138997, + "learning_rate": 7.878391631252044e-06, + "loss": 1.492, + "step": 1205 + }, + { + "epoch": 0.0395631702851164, + "grad_norm": 4.131244466175692, + "learning_rate": 7.911082052958484e-06, + "loss": 1.5653, + "step": 1210 + }, + { + "epoch": 0.039726654459848285, + "grad_norm": 4.17247654667396, + "learning_rate": 7.943772474664924e-06, + "loss": 1.4571, + "step": 1215 + }, + { + "epoch": 0.03989013863458017, + "grad_norm": 4.109666570648127, + "learning_rate": 7.976462896371365e-06, + "loss": 1.4091, + "step": 1220 + }, + { + "epoch": 0.04005362280931206, + "grad_norm": 4.161439133542191, + "learning_rate": 8.009153318077803e-06, + "loss": 1.5848, + "step": 1225 + }, + { + "epoch": 0.040217106984043946, + "grad_norm": 4.305603136388289, + "learning_rate": 8.041843739784243e-06, + "loss": 1.4808, + "step": 1230 + }, + { + "epoch": 0.04038059115877583, + "grad_norm": 4.0758982878931, + "learning_rate": 8.074534161490684e-06, + "loss": 1.3898, + "step": 1235 + }, + { + "epoch": 0.04054407533350771, + "grad_norm": 4.019177248919927, + "learning_rate": 8.107224583197124e-06, + "loss": 1.5222, + "step": 1240 + }, + { + "epoch": 0.0407075595082396, + "grad_norm": 4.339523390992735, + "learning_rate": 8.139915004903564e-06, + "loss": 1.5569, + "step": 1245 + }, + { + "epoch": 0.04087104368297149, + "grad_norm": 4.061072452335712, + "learning_rate": 8.172605426610005e-06, + "loss": 1.4921, + "step": 1250 + }, + { + "epoch": 0.041034527857703375, + "grad_norm": 3.8373417015702844, + "learning_rate": 8.205295848316443e-06, + "loss": 1.3738, + "step": 1255 + }, + { + "epoch": 0.04119801203243526, + "grad_norm": 4.091372049106534, + "learning_rate": 8.237986270022884e-06, + "loss": 1.4122, + "step": 1260 + }, + { + "epoch": 0.04136149620716715, + "grad_norm": 3.902030955489958, + "learning_rate": 8.270676691729324e-06, + "loss": 1.5234, + "step": 1265 + }, + { + "epoch": 0.04152498038189903, + "grad_norm": 4.00783374549792, + "learning_rate": 8.303367113435764e-06, + "loss": 1.4618, + "step": 1270 + }, + { + "epoch": 0.041688464556630916, + "grad_norm": 4.456675835357237, + "learning_rate": 8.336057535142203e-06, + "loss": 1.6116, + "step": 1275 + }, + { + "epoch": 0.041851948731362804, + "grad_norm": 4.144700820107038, + "learning_rate": 8.368747956848645e-06, + "loss": 1.595, + "step": 1280 + }, + { + "epoch": 0.04201543290609469, + "grad_norm": 4.081844068164076, + "learning_rate": 8.401438378555085e-06, + "loss": 1.4477, + "step": 1285 + }, + { + "epoch": 0.04217891708082658, + "grad_norm": 4.115894900566234, + "learning_rate": 8.434128800261524e-06, + "loss": 1.5398, + "step": 1290 + }, + { + "epoch": 0.042342401255558465, + "grad_norm": 4.0376229396717545, + "learning_rate": 8.466819221967964e-06, + "loss": 1.3445, + "step": 1295 + }, + { + "epoch": 0.042505885430290345, + "grad_norm": 3.874112569954244, + "learning_rate": 8.499509643674404e-06, + "loss": 1.4486, + "step": 1300 + }, + { + "epoch": 0.04266936960502223, + "grad_norm": 4.241897419754403, + "learning_rate": 8.532200065380845e-06, + "loss": 1.5143, + "step": 1305 + }, + { + "epoch": 0.04283285377975412, + "grad_norm": 4.531705036166636, + "learning_rate": 8.564890487087283e-06, + "loss": 1.5048, + "step": 1310 + }, + { + "epoch": 0.04299633795448601, + "grad_norm": 4.078798872462162, + "learning_rate": 8.597580908793725e-06, + "loss": 1.5069, + "step": 1315 + }, + { + "epoch": 0.043159822129217894, + "grad_norm": 4.228403915578121, + "learning_rate": 8.630271330500164e-06, + "loss": 1.4375, + "step": 1320 + }, + { + "epoch": 0.04332330630394978, + "grad_norm": 4.513692545864303, + "learning_rate": 8.662961752206604e-06, + "loss": 1.5652, + "step": 1325 + }, + { + "epoch": 0.04348679047868166, + "grad_norm": 4.293394236332433, + "learning_rate": 8.695652173913044e-06, + "loss": 1.4411, + "step": 1330 + }, + { + "epoch": 0.04365027465341355, + "grad_norm": 4.516383929257263, + "learning_rate": 8.728342595619485e-06, + "loss": 1.3831, + "step": 1335 + }, + { + "epoch": 0.043813758828145435, + "grad_norm": 4.285227857848155, + "learning_rate": 8.761033017325923e-06, + "loss": 1.5729, + "step": 1340 + }, + { + "epoch": 0.04397724300287732, + "grad_norm": 3.9918747131543713, + "learning_rate": 8.793723439032364e-06, + "loss": 1.4253, + "step": 1345 + }, + { + "epoch": 0.04414072717760921, + "grad_norm": 4.231279431898971, + "learning_rate": 8.826413860738804e-06, + "loss": 1.5911, + "step": 1350 + }, + { + "epoch": 0.04430421135234109, + "grad_norm": 4.08119259428481, + "learning_rate": 8.859104282445244e-06, + "loss": 1.4306, + "step": 1355 + }, + { + "epoch": 0.04446769552707298, + "grad_norm": 4.193516888547258, + "learning_rate": 8.891794704151685e-06, + "loss": 1.4129, + "step": 1360 + }, + { + "epoch": 0.044631179701804864, + "grad_norm": 3.967618076648784, + "learning_rate": 8.924485125858125e-06, + "loss": 1.5236, + "step": 1365 + }, + { + "epoch": 0.04479466387653675, + "grad_norm": 4.327674421481089, + "learning_rate": 8.957175547564563e-06, + "loss": 1.5264, + "step": 1370 + }, + { + "epoch": 0.04495814805126864, + "grad_norm": 4.888272631463772, + "learning_rate": 8.989865969271004e-06, + "loss": 1.5202, + "step": 1375 + }, + { + "epoch": 0.045121632226000526, + "grad_norm": 3.7498614532993098, + "learning_rate": 9.022556390977444e-06, + "loss": 1.5174, + "step": 1380 + }, + { + "epoch": 0.045285116400732406, + "grad_norm": 4.094725385794139, + "learning_rate": 9.055246812683884e-06, + "loss": 1.3827, + "step": 1385 + }, + { + "epoch": 0.04544860057546429, + "grad_norm": 4.481102693531489, + "learning_rate": 9.087937234390325e-06, + "loss": 1.6328, + "step": 1390 + }, + { + "epoch": 0.04561208475019618, + "grad_norm": 4.352996407775465, + "learning_rate": 9.120627656096765e-06, + "loss": 1.558, + "step": 1395 + }, + { + "epoch": 0.04577556892492807, + "grad_norm": 4.262737680749788, + "learning_rate": 9.153318077803205e-06, + "loss": 1.3695, + "step": 1400 + }, + { + "epoch": 0.045939053099659954, + "grad_norm": 3.985143163801148, + "learning_rate": 9.186008499509644e-06, + "loss": 1.4564, + "step": 1405 + }, + { + "epoch": 0.04610253727439184, + "grad_norm": 4.155441791568684, + "learning_rate": 9.218698921216084e-06, + "loss": 1.602, + "step": 1410 + }, + { + "epoch": 0.04626602144912372, + "grad_norm": 4.086447902431365, + "learning_rate": 9.251389342922524e-06, + "loss": 1.4723, + "step": 1415 + }, + { + "epoch": 0.04642950562385561, + "grad_norm": 4.234828865676168, + "learning_rate": 9.284079764628965e-06, + "loss": 1.5305, + "step": 1420 + }, + { + "epoch": 0.046592989798587496, + "grad_norm": 4.160849597640286, + "learning_rate": 9.316770186335405e-06, + "loss": 1.4528, + "step": 1425 + }, + { + "epoch": 0.04675647397331938, + "grad_norm": 4.073471072787389, + "learning_rate": 9.349460608041845e-06, + "loss": 1.4348, + "step": 1430 + }, + { + "epoch": 0.04691995814805127, + "grad_norm": 4.176381863101213, + "learning_rate": 9.382151029748284e-06, + "loss": 1.6219, + "step": 1435 + }, + { + "epoch": 0.04708344232278316, + "grad_norm": 4.481254620342426, + "learning_rate": 9.414841451454724e-06, + "loss": 1.3312, + "step": 1440 + }, + { + "epoch": 0.04724692649751504, + "grad_norm": 4.596512147261685, + "learning_rate": 9.447531873161165e-06, + "loss": 1.6405, + "step": 1445 + }, + { + "epoch": 0.047410410672246925, + "grad_norm": 4.29360817630898, + "learning_rate": 9.480222294867605e-06, + "loss": 1.6298, + "step": 1450 + }, + { + "epoch": 0.04757389484697881, + "grad_norm": 4.128692964039102, + "learning_rate": 9.512912716574044e-06, + "loss": 1.4851, + "step": 1455 + }, + { + "epoch": 0.0477373790217107, + "grad_norm": 3.9346558468391093, + "learning_rate": 9.545603138280486e-06, + "loss": 1.4425, + "step": 1460 + }, + { + "epoch": 0.047900863196442586, + "grad_norm": 4.051221493400848, + "learning_rate": 9.578293559986924e-06, + "loss": 1.5193, + "step": 1465 + }, + { + "epoch": 0.04806434737117447, + "grad_norm": 3.9856449650621406, + "learning_rate": 9.610983981693364e-06, + "loss": 1.3905, + "step": 1470 + }, + { + "epoch": 0.048227831545906354, + "grad_norm": 4.244448194292507, + "learning_rate": 9.643674403399805e-06, + "loss": 1.4932, + "step": 1475 + }, + { + "epoch": 0.04839131572063824, + "grad_norm": 3.978412668487893, + "learning_rate": 9.676364825106245e-06, + "loss": 1.484, + "step": 1480 + }, + { + "epoch": 0.04855479989537013, + "grad_norm": 3.9220664409132837, + "learning_rate": 9.709055246812684e-06, + "loss": 1.4325, + "step": 1485 + }, + { + "epoch": 0.048718284070102015, + "grad_norm": 4.4223832557125675, + "learning_rate": 9.741745668519124e-06, + "loss": 1.6967, + "step": 1490 + }, + { + "epoch": 0.0488817682448339, + "grad_norm": 4.224928042845509, + "learning_rate": 9.774436090225564e-06, + "loss": 1.4713, + "step": 1495 + }, + { + "epoch": 0.04904525241956579, + "grad_norm": 4.195155305084779, + "learning_rate": 9.807126511932005e-06, + "loss": 1.5423, + "step": 1500 + }, + { + "epoch": 0.04920873659429767, + "grad_norm": 3.8951066627167217, + "learning_rate": 9.839816933638445e-06, + "loss": 1.6546, + "step": 1505 + }, + { + "epoch": 0.04937222076902956, + "grad_norm": 4.23961061116842, + "learning_rate": 9.872507355344885e-06, + "loss": 1.5861, + "step": 1510 + }, + { + "epoch": 0.049535704943761444, + "grad_norm": 4.1155212512536625, + "learning_rate": 9.905197777051325e-06, + "loss": 1.51, + "step": 1515 + }, + { + "epoch": 0.04969918911849333, + "grad_norm": 4.571373507152174, + "learning_rate": 9.937888198757764e-06, + "loss": 1.6237, + "step": 1520 + }, + { + "epoch": 0.04986267329322522, + "grad_norm": 4.216815336309937, + "learning_rate": 9.970578620464204e-06, + "loss": 1.4556, + "step": 1525 + }, + { + "epoch": 0.0500261574679571, + "grad_norm": 4.462118709096205, + "learning_rate": 1.0003269042170645e-05, + "loss": 1.5264, + "step": 1530 + }, + { + "epoch": 0.050189641642688985, + "grad_norm": 3.835218467211254, + "learning_rate": 1.0035959463877085e-05, + "loss": 1.4957, + "step": 1535 + }, + { + "epoch": 0.05035312581742087, + "grad_norm": 4.319105021375595, + "learning_rate": 1.0068649885583525e-05, + "loss": 1.5764, + "step": 1540 + }, + { + "epoch": 0.05051660999215276, + "grad_norm": 3.8624871572596318, + "learning_rate": 1.0101340307289964e-05, + "loss": 1.4652, + "step": 1545 + }, + { + "epoch": 0.05068009416688465, + "grad_norm": 4.130120902300648, + "learning_rate": 1.0134030728996404e-05, + "loss": 1.5222, + "step": 1550 + }, + { + "epoch": 0.050843578341616534, + "grad_norm": 4.028600997460362, + "learning_rate": 1.0166721150702845e-05, + "loss": 1.5248, + "step": 1555 + }, + { + "epoch": 0.051007062516348414, + "grad_norm": 3.9249492879118404, + "learning_rate": 1.0199411572409286e-05, + "loss": 1.5438, + "step": 1560 + }, + { + "epoch": 0.0511705466910803, + "grad_norm": 4.06998451647525, + "learning_rate": 1.0232101994115723e-05, + "loss": 1.4437, + "step": 1565 + }, + { + "epoch": 0.05133403086581219, + "grad_norm": 3.925026987416034, + "learning_rate": 1.0264792415822165e-05, + "loss": 1.593, + "step": 1570 + }, + { + "epoch": 0.051497515040544076, + "grad_norm": 4.0159325006466595, + "learning_rate": 1.0297482837528606e-05, + "loss": 1.4983, + "step": 1575 + }, + { + "epoch": 0.05166099921527596, + "grad_norm": 4.107501013828358, + "learning_rate": 1.0330173259235046e-05, + "loss": 1.5503, + "step": 1580 + }, + { + "epoch": 0.05182448339000785, + "grad_norm": 4.12662773715568, + "learning_rate": 1.0362863680941485e-05, + "loss": 1.6032, + "step": 1585 + }, + { + "epoch": 0.05198796756473973, + "grad_norm": 4.055066122454956, + "learning_rate": 1.0395554102647925e-05, + "loss": 1.4442, + "step": 1590 + }, + { + "epoch": 0.05215145173947162, + "grad_norm": 3.857415132485782, + "learning_rate": 1.0428244524354365e-05, + "loss": 1.6027, + "step": 1595 + }, + { + "epoch": 0.052314935914203504, + "grad_norm": 3.988853155353498, + "learning_rate": 1.0460934946060806e-05, + "loss": 1.4695, + "step": 1600 + }, + { + "epoch": 0.05247842008893539, + "grad_norm": 3.797407260090423, + "learning_rate": 1.0493625367767246e-05, + "loss": 1.5085, + "step": 1605 + }, + { + "epoch": 0.05264190426366728, + "grad_norm": 4.312675137107097, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5457, + "step": 1610 + }, + { + "epoch": 0.052805388438399166, + "grad_norm": 3.9633887516634907, + "learning_rate": 1.0559006211180125e-05, + "loss": 1.5534, + "step": 1615 + }, + { + "epoch": 0.052968872613131046, + "grad_norm": 4.327993869119312, + "learning_rate": 1.0591696632886565e-05, + "loss": 1.4359, + "step": 1620 + }, + { + "epoch": 0.05313235678786293, + "grad_norm": 3.904803354810955, + "learning_rate": 1.0624387054593005e-05, + "loss": 1.4945, + "step": 1625 + }, + { + "epoch": 0.05329584096259482, + "grad_norm": 3.963514684045784, + "learning_rate": 1.0657077476299444e-05, + "loss": 1.4362, + "step": 1630 + }, + { + "epoch": 0.05345932513732671, + "grad_norm": 4.220282217671509, + "learning_rate": 1.0689767898005884e-05, + "loss": 1.3935, + "step": 1635 + }, + { + "epoch": 0.053622809312058595, + "grad_norm": 3.9960364350036928, + "learning_rate": 1.0722458319712326e-05, + "loss": 1.5534, + "step": 1640 + }, + { + "epoch": 0.05378629348679048, + "grad_norm": 4.199972755189737, + "learning_rate": 1.0755148741418767e-05, + "loss": 1.4625, + "step": 1645 + }, + { + "epoch": 0.05394977766152236, + "grad_norm": 4.113240691960026, + "learning_rate": 1.0787839163125205e-05, + "loss": 1.5982, + "step": 1650 + }, + { + "epoch": 0.05411326183625425, + "grad_norm": 4.015834842020327, + "learning_rate": 1.0820529584831645e-05, + "loss": 1.4819, + "step": 1655 + }, + { + "epoch": 0.054276746010986136, + "grad_norm": 4.585936689457373, + "learning_rate": 1.0853220006538086e-05, + "loss": 1.7039, + "step": 1660 + }, + { + "epoch": 0.054440230185718023, + "grad_norm": 4.158732075815134, + "learning_rate": 1.0885910428244526e-05, + "loss": 1.5618, + "step": 1665 + }, + { + "epoch": 0.05460371436044991, + "grad_norm": 3.950239928457831, + "learning_rate": 1.0918600849950965e-05, + "loss": 1.3102, + "step": 1670 + }, + { + "epoch": 0.0547671985351818, + "grad_norm": 4.021368052643606, + "learning_rate": 1.0951291271657405e-05, + "loss": 1.5846, + "step": 1675 + }, + { + "epoch": 0.05493068270991368, + "grad_norm": 3.8157381315173744, + "learning_rate": 1.0983981693363845e-05, + "loss": 1.5172, + "step": 1680 + }, + { + "epoch": 0.055094166884645565, + "grad_norm": 4.043042762586303, + "learning_rate": 1.1016672115070286e-05, + "loss": 1.5573, + "step": 1685 + }, + { + "epoch": 0.05525765105937745, + "grad_norm": 3.841673893066891, + "learning_rate": 1.1049362536776724e-05, + "loss": 1.4255, + "step": 1690 + }, + { + "epoch": 0.05542113523410934, + "grad_norm": 4.229200695020527, + "learning_rate": 1.1082052958483165e-05, + "loss": 1.6378, + "step": 1695 + }, + { + "epoch": 0.05558461940884123, + "grad_norm": 4.15120784224622, + "learning_rate": 1.1114743380189605e-05, + "loss": 1.5715, + "step": 1700 + }, + { + "epoch": 0.05574810358357311, + "grad_norm": 4.169657965990517, + "learning_rate": 1.1147433801896045e-05, + "loss": 1.5852, + "step": 1705 + }, + { + "epoch": 0.055911587758304994, + "grad_norm": 4.679976936793839, + "learning_rate": 1.1180124223602484e-05, + "loss": 1.4875, + "step": 1710 + }, + { + "epoch": 0.05607507193303688, + "grad_norm": 4.077212109009481, + "learning_rate": 1.1212814645308924e-05, + "loss": 1.4629, + "step": 1715 + }, + { + "epoch": 0.05623855610776877, + "grad_norm": 4.1103488573890985, + "learning_rate": 1.1245505067015366e-05, + "loss": 1.4842, + "step": 1720 + }, + { + "epoch": 0.056402040282500655, + "grad_norm": 4.368501390237179, + "learning_rate": 1.1278195488721806e-05, + "loss": 1.5857, + "step": 1725 + }, + { + "epoch": 0.05656552445723254, + "grad_norm": 4.106874647352762, + "learning_rate": 1.1310885910428247e-05, + "loss": 1.4506, + "step": 1730 + }, + { + "epoch": 0.05672900863196442, + "grad_norm": 4.08748920381656, + "learning_rate": 1.1343576332134685e-05, + "loss": 1.4815, + "step": 1735 + }, + { + "epoch": 0.05689249280669631, + "grad_norm": 3.752450805408211, + "learning_rate": 1.1376266753841126e-05, + "loss": 1.3983, + "step": 1740 + }, + { + "epoch": 0.0570559769814282, + "grad_norm": 3.9603214126364894, + "learning_rate": 1.1408957175547566e-05, + "loss": 1.4232, + "step": 1745 + }, + { + "epoch": 0.057219461156160084, + "grad_norm": 3.9561211783510344, + "learning_rate": 1.1441647597254006e-05, + "loss": 1.4309, + "step": 1750 + }, + { + "epoch": 0.05738294533089197, + "grad_norm": 3.7916472646999115, + "learning_rate": 1.1474338018960445e-05, + "loss": 1.5559, + "step": 1755 + }, + { + "epoch": 0.05754642950562386, + "grad_norm": 4.115283353929558, + "learning_rate": 1.1507028440666885e-05, + "loss": 1.5734, + "step": 1760 + }, + { + "epoch": 0.05770991368035574, + "grad_norm": 3.9137402103541556, + "learning_rate": 1.1539718862373325e-05, + "loss": 1.3875, + "step": 1765 + }, + { + "epoch": 0.057873397855087626, + "grad_norm": 4.2975845953780505, + "learning_rate": 1.1572409284079766e-05, + "loss": 1.4972, + "step": 1770 + }, + { + "epoch": 0.05803688202981951, + "grad_norm": 4.268258805052147, + "learning_rate": 1.1605099705786204e-05, + "loss": 1.5632, + "step": 1775 + }, + { + "epoch": 0.0582003662045514, + "grad_norm": 3.9156280707591082, + "learning_rate": 1.1637790127492645e-05, + "loss": 1.5679, + "step": 1780 + }, + { + "epoch": 0.05836385037928329, + "grad_norm": 3.818267208139547, + "learning_rate": 1.1670480549199087e-05, + "loss": 1.3699, + "step": 1785 + }, + { + "epoch": 0.058527334554015174, + "grad_norm": 3.8420099678820403, + "learning_rate": 1.1703170970905527e-05, + "loss": 1.3929, + "step": 1790 + }, + { + "epoch": 0.058690818728747055, + "grad_norm": 4.144488369158861, + "learning_rate": 1.1735861392611966e-05, + "loss": 1.5398, + "step": 1795 + }, + { + "epoch": 0.05885430290347894, + "grad_norm": 3.9355493610856653, + "learning_rate": 1.1768551814318406e-05, + "loss": 1.553, + "step": 1800 + }, + { + "epoch": 0.05901778707821083, + "grad_norm": 4.354922594645776, + "learning_rate": 1.1801242236024846e-05, + "loss": 1.6322, + "step": 1805 + }, + { + "epoch": 0.059181271252942716, + "grad_norm": 4.16582479075181, + "learning_rate": 1.1833932657731286e-05, + "loss": 1.6959, + "step": 1810 + }, + { + "epoch": 0.0593447554276746, + "grad_norm": 4.054599356614909, + "learning_rate": 1.1866623079437725e-05, + "loss": 1.4311, + "step": 1815 + }, + { + "epoch": 0.05950823960240649, + "grad_norm": 3.9106309135625645, + "learning_rate": 1.1899313501144165e-05, + "loss": 1.4594, + "step": 1820 + }, + { + "epoch": 0.05967172377713837, + "grad_norm": 3.9564656848183426, + "learning_rate": 1.1932003922850606e-05, + "loss": 1.5453, + "step": 1825 + }, + { + "epoch": 0.05983520795187026, + "grad_norm": 4.12417372154128, + "learning_rate": 1.1964694344557046e-05, + "loss": 1.4726, + "step": 1830 + }, + { + "epoch": 0.059998692126602145, + "grad_norm": 4.544897588162846, + "learning_rate": 1.1997384766263486e-05, + "loss": 1.6094, + "step": 1835 + }, + { + "epoch": 0.06016217630133403, + "grad_norm": 4.07040176671889, + "learning_rate": 1.2030075187969925e-05, + "loss": 1.5095, + "step": 1840 + }, + { + "epoch": 0.06032566047606592, + "grad_norm": 4.2076887713023305, + "learning_rate": 1.2062765609676365e-05, + "loss": 1.5316, + "step": 1845 + }, + { + "epoch": 0.060489144650797806, + "grad_norm": 4.335423758077807, + "learning_rate": 1.2095456031382805e-05, + "loss": 1.4998, + "step": 1850 + }, + { + "epoch": 0.060652628825529686, + "grad_norm": 3.8232048963451435, + "learning_rate": 1.2128146453089247e-05, + "loss": 1.5271, + "step": 1855 + }, + { + "epoch": 0.060816113000261574, + "grad_norm": 4.26142028854896, + "learning_rate": 1.2160836874795684e-05, + "loss": 1.6326, + "step": 1860 + }, + { + "epoch": 0.06097959717499346, + "grad_norm": 4.130739363711538, + "learning_rate": 1.2193527296502126e-05, + "loss": 1.3612, + "step": 1865 + }, + { + "epoch": 0.06114308134972535, + "grad_norm": 4.215300233997929, + "learning_rate": 1.2226217718208567e-05, + "loss": 1.5608, + "step": 1870 + }, + { + "epoch": 0.061306565524457235, + "grad_norm": 4.123061397076126, + "learning_rate": 1.2258908139915007e-05, + "loss": 1.4625, + "step": 1875 + }, + { + "epoch": 0.061470049699189115, + "grad_norm": 4.059675474857096, + "learning_rate": 1.2291598561621446e-05, + "loss": 1.4854, + "step": 1880 + }, + { + "epoch": 0.061633533873921, + "grad_norm": 3.8290004020952733, + "learning_rate": 1.2324288983327886e-05, + "loss": 1.5042, + "step": 1885 + }, + { + "epoch": 0.06179701804865289, + "grad_norm": 3.8995227389209934, + "learning_rate": 1.2356979405034326e-05, + "loss": 1.42, + "step": 1890 + }, + { + "epoch": 0.06196050222338478, + "grad_norm": 4.168005181766937, + "learning_rate": 1.2389669826740766e-05, + "loss": 1.5127, + "step": 1895 + }, + { + "epoch": 0.062123986398116664, + "grad_norm": 4.087351137662911, + "learning_rate": 1.2422360248447205e-05, + "loss": 1.5243, + "step": 1900 + }, + { + "epoch": 0.06228747057284855, + "grad_norm": 3.9147858437864094, + "learning_rate": 1.2455050670153645e-05, + "loss": 1.5669, + "step": 1905 + }, + { + "epoch": 0.06245095474758043, + "grad_norm": 4.238819137435517, + "learning_rate": 1.2487741091860086e-05, + "loss": 1.6025, + "step": 1910 + }, + { + "epoch": 0.06261443892231232, + "grad_norm": 3.8948540690697486, + "learning_rate": 1.2520431513566526e-05, + "loss": 1.4617, + "step": 1915 + }, + { + "epoch": 0.0627779230970442, + "grad_norm": 3.889254627769063, + "learning_rate": 1.2553121935272965e-05, + "loss": 1.4251, + "step": 1920 + }, + { + "epoch": 0.06294140727177609, + "grad_norm": 4.278276146959723, + "learning_rate": 1.2585812356979405e-05, + "loss": 1.4575, + "step": 1925 + }, + { + "epoch": 0.06310489144650798, + "grad_norm": 4.273464174649508, + "learning_rate": 1.2618502778685845e-05, + "loss": 1.413, + "step": 1930 + }, + { + "epoch": 0.06326837562123987, + "grad_norm": 3.9585102636303793, + "learning_rate": 1.2651193200392287e-05, + "loss": 1.3763, + "step": 1935 + }, + { + "epoch": 0.06343185979597175, + "grad_norm": 4.1538138496668235, + "learning_rate": 1.2683883622098728e-05, + "loss": 1.6249, + "step": 1940 + }, + { + "epoch": 0.06359534397070364, + "grad_norm": 4.33785676884528, + "learning_rate": 1.2716574043805166e-05, + "loss": 1.6324, + "step": 1945 + }, + { + "epoch": 0.06375882814543553, + "grad_norm": 4.2635007050452085, + "learning_rate": 1.2749264465511606e-05, + "loss": 1.4996, + "step": 1950 + }, + { + "epoch": 0.0639223123201674, + "grad_norm": 3.7359450229077504, + "learning_rate": 1.2781954887218047e-05, + "loss": 1.4067, + "step": 1955 + }, + { + "epoch": 0.06408579649489929, + "grad_norm": 3.9138508638949703, + "learning_rate": 1.2814645308924487e-05, + "loss": 1.5291, + "step": 1960 + }, + { + "epoch": 0.06424928066963118, + "grad_norm": 3.7404314038165, + "learning_rate": 1.2847335730630926e-05, + "loss": 1.5587, + "step": 1965 + }, + { + "epoch": 0.06441276484436306, + "grad_norm": 3.801529915632806, + "learning_rate": 1.2880026152337366e-05, + "loss": 1.4827, + "step": 1970 + }, + { + "epoch": 0.06457624901909495, + "grad_norm": 3.9435184286670797, + "learning_rate": 1.2912716574043806e-05, + "loss": 1.5291, + "step": 1975 + }, + { + "epoch": 0.06473973319382684, + "grad_norm": 3.816409003560507, + "learning_rate": 1.2945406995750247e-05, + "loss": 1.4664, + "step": 1980 + }, + { + "epoch": 0.06490321736855872, + "grad_norm": 4.131600118035873, + "learning_rate": 1.2978097417456685e-05, + "loss": 1.4992, + "step": 1985 + }, + { + "epoch": 0.06506670154329061, + "grad_norm": 4.393967985778447, + "learning_rate": 1.3010787839163125e-05, + "loss": 1.552, + "step": 1990 + }, + { + "epoch": 0.0652301857180225, + "grad_norm": 3.94029949322812, + "learning_rate": 1.3043478260869566e-05, + "loss": 1.4129, + "step": 1995 + }, + { + "epoch": 0.06539366989275439, + "grad_norm": 4.156025550715022, + "learning_rate": 1.3076168682576008e-05, + "loss": 1.5501, + "step": 2000 + }, + { + "epoch": 0.06555715406748627, + "grad_norm": 4.090345702978151, + "learning_rate": 1.3108859104282445e-05, + "loss": 1.533, + "step": 2005 + }, + { + "epoch": 0.06572063824221816, + "grad_norm": 3.794516862417502, + "learning_rate": 1.3141549525988887e-05, + "loss": 1.5105, + "step": 2010 + }, + { + "epoch": 0.06588412241695003, + "grad_norm": 4.065120183824479, + "learning_rate": 1.3174239947695327e-05, + "loss": 1.4248, + "step": 2015 + }, + { + "epoch": 0.06604760659168192, + "grad_norm": 4.216023070171067, + "learning_rate": 1.3206930369401767e-05, + "loss": 1.4307, + "step": 2020 + }, + { + "epoch": 0.06621109076641381, + "grad_norm": 4.234509361299377, + "learning_rate": 1.3239620791108206e-05, + "loss": 1.53, + "step": 2025 + }, + { + "epoch": 0.0663745749411457, + "grad_norm": 3.969281838114021, + "learning_rate": 1.3272311212814646e-05, + "loss": 1.6056, + "step": 2030 + }, + { + "epoch": 0.06653805911587758, + "grad_norm": 4.018252733034183, + "learning_rate": 1.3305001634521087e-05, + "loss": 1.4713, + "step": 2035 + }, + { + "epoch": 0.06670154329060947, + "grad_norm": 3.8495064641152594, + "learning_rate": 1.3337692056227527e-05, + "loss": 1.6085, + "step": 2040 + }, + { + "epoch": 0.06686502746534136, + "grad_norm": 3.8813656562542014, + "learning_rate": 1.3370382477933967e-05, + "loss": 1.4024, + "step": 2045 + }, + { + "epoch": 0.06702851164007324, + "grad_norm": 4.000639052981349, + "learning_rate": 1.3403072899640406e-05, + "loss": 1.4387, + "step": 2050 + }, + { + "epoch": 0.06719199581480513, + "grad_norm": 4.303319362535804, + "learning_rate": 1.3435763321346846e-05, + "loss": 1.5674, + "step": 2055 + }, + { + "epoch": 0.06735547998953702, + "grad_norm": 4.427546289103875, + "learning_rate": 1.3468453743053286e-05, + "loss": 1.5721, + "step": 2060 + }, + { + "epoch": 0.0675189641642689, + "grad_norm": 4.2898419205015, + "learning_rate": 1.3501144164759727e-05, + "loss": 1.6766, + "step": 2065 + }, + { + "epoch": 0.06768244833900078, + "grad_norm": 3.694050742305806, + "learning_rate": 1.3533834586466165e-05, + "loss": 1.4337, + "step": 2070 + }, + { + "epoch": 0.06784593251373267, + "grad_norm": 4.411838874778856, + "learning_rate": 1.3566525008172606e-05, + "loss": 1.5808, + "step": 2075 + }, + { + "epoch": 0.06800941668846455, + "grad_norm": 4.324813256151371, + "learning_rate": 1.3599215429879048e-05, + "loss": 1.5478, + "step": 2080 + }, + { + "epoch": 0.06817290086319644, + "grad_norm": 3.9995478670832316, + "learning_rate": 1.3631905851585488e-05, + "loss": 1.5048, + "step": 2085 + }, + { + "epoch": 0.06833638503792833, + "grad_norm": 4.1296621050539954, + "learning_rate": 1.3664596273291926e-05, + "loss": 1.5843, + "step": 2090 + }, + { + "epoch": 0.06849986921266021, + "grad_norm": 3.8419224705884916, + "learning_rate": 1.3697286694998367e-05, + "loss": 1.4337, + "step": 2095 + }, + { + "epoch": 0.0686633533873921, + "grad_norm": 4.0925699704626215, + "learning_rate": 1.3729977116704807e-05, + "loss": 1.6119, + "step": 2100 + }, + { + "epoch": 0.06882683756212399, + "grad_norm": 3.9400058087361347, + "learning_rate": 1.3762667538411247e-05, + "loss": 1.4636, + "step": 2105 + }, + { + "epoch": 0.06899032173685588, + "grad_norm": 4.042690489323879, + "learning_rate": 1.3795357960117686e-05, + "loss": 1.6007, + "step": 2110 + }, + { + "epoch": 0.06915380591158776, + "grad_norm": 4.004649724632341, + "learning_rate": 1.3828048381824126e-05, + "loss": 1.5291, + "step": 2115 + }, + { + "epoch": 0.06931729008631965, + "grad_norm": 4.177526676449015, + "learning_rate": 1.3860738803530567e-05, + "loss": 1.6113, + "step": 2120 + }, + { + "epoch": 0.06948077426105154, + "grad_norm": 4.115857115248765, + "learning_rate": 1.3893429225237007e-05, + "loss": 1.4602, + "step": 2125 + }, + { + "epoch": 0.06964425843578341, + "grad_norm": 4.3186122308623505, + "learning_rate": 1.3926119646943445e-05, + "loss": 1.6162, + "step": 2130 + }, + { + "epoch": 0.0698077426105153, + "grad_norm": 3.774113990206829, + "learning_rate": 1.3958810068649886e-05, + "loss": 1.4959, + "step": 2135 + }, + { + "epoch": 0.06997122678524718, + "grad_norm": 4.014843502154494, + "learning_rate": 1.3991500490356326e-05, + "loss": 1.6289, + "step": 2140 + }, + { + "epoch": 0.07013471095997907, + "grad_norm": 4.035003250612408, + "learning_rate": 1.4024190912062768e-05, + "loss": 1.6044, + "step": 2145 + }, + { + "epoch": 0.07029819513471096, + "grad_norm": 4.282796275332359, + "learning_rate": 1.4056881333769208e-05, + "loss": 1.5935, + "step": 2150 + }, + { + "epoch": 0.07046167930944285, + "grad_norm": 3.626010813009457, + "learning_rate": 1.4089571755475645e-05, + "loss": 1.4758, + "step": 2155 + }, + { + "epoch": 0.07062516348417473, + "grad_norm": 4.0106536370883425, + "learning_rate": 1.4122262177182087e-05, + "loss": 1.4603, + "step": 2160 + }, + { + "epoch": 0.07078864765890662, + "grad_norm": 3.8168615595927866, + "learning_rate": 1.4154952598888528e-05, + "loss": 1.4913, + "step": 2165 + }, + { + "epoch": 0.07095213183363851, + "grad_norm": 4.112214546938449, + "learning_rate": 1.4187643020594968e-05, + "loss": 1.6285, + "step": 2170 + }, + { + "epoch": 0.0711156160083704, + "grad_norm": 3.815638628492836, + "learning_rate": 1.4220333442301407e-05, + "loss": 1.6895, + "step": 2175 + }, + { + "epoch": 0.07127910018310228, + "grad_norm": 4.349643526993218, + "learning_rate": 1.4253023864007847e-05, + "loss": 1.5087, + "step": 2180 + }, + { + "epoch": 0.07144258435783417, + "grad_norm": 3.842677770789675, + "learning_rate": 1.4285714285714287e-05, + "loss": 1.5371, + "step": 2185 + }, + { + "epoch": 0.07160606853256604, + "grad_norm": 3.765592197974487, + "learning_rate": 1.4318404707420727e-05, + "loss": 1.4325, + "step": 2190 + }, + { + "epoch": 0.07176955270729793, + "grad_norm": 4.022136580598047, + "learning_rate": 1.4351095129127166e-05, + "loss": 1.4076, + "step": 2195 + }, + { + "epoch": 0.07193303688202982, + "grad_norm": 4.131653021732223, + "learning_rate": 1.4383785550833606e-05, + "loss": 1.5542, + "step": 2200 + }, + { + "epoch": 0.0720965210567617, + "grad_norm": 3.9047007913062806, + "learning_rate": 1.4416475972540047e-05, + "loss": 1.5005, + "step": 2205 + }, + { + "epoch": 0.07226000523149359, + "grad_norm": 3.7292931472522723, + "learning_rate": 1.4449166394246487e-05, + "loss": 1.5932, + "step": 2210 + }, + { + "epoch": 0.07242348940622548, + "grad_norm": 3.9753168434191024, + "learning_rate": 1.4481856815952926e-05, + "loss": 1.395, + "step": 2215 + }, + { + "epoch": 0.07258697358095736, + "grad_norm": 3.9690803700008437, + "learning_rate": 1.4514547237659366e-05, + "loss": 1.3997, + "step": 2220 + }, + { + "epoch": 0.07275045775568925, + "grad_norm": 3.938961015436906, + "learning_rate": 1.4547237659365808e-05, + "loss": 1.6846, + "step": 2225 + }, + { + "epoch": 0.07291394193042114, + "grad_norm": 4.14779457340421, + "learning_rate": 1.4579928081072248e-05, + "loss": 1.503, + "step": 2230 + }, + { + "epoch": 0.07307742610515303, + "grad_norm": 4.004104220798025, + "learning_rate": 1.4612618502778687e-05, + "loss": 1.5116, + "step": 2235 + }, + { + "epoch": 0.07324091027988491, + "grad_norm": 4.031801815252137, + "learning_rate": 1.4645308924485127e-05, + "loss": 1.5206, + "step": 2240 + }, + { + "epoch": 0.07340439445461679, + "grad_norm": 4.065437365875164, + "learning_rate": 1.4677999346191567e-05, + "loss": 1.665, + "step": 2245 + }, + { + "epoch": 0.07356787862934867, + "grad_norm": 4.040396840939771, + "learning_rate": 1.4710689767898008e-05, + "loss": 1.4403, + "step": 2250 + }, + { + "epoch": 0.07373136280408056, + "grad_norm": 4.096332332868726, + "learning_rate": 1.4743380189604448e-05, + "loss": 1.6049, + "step": 2255 + }, + { + "epoch": 0.07389484697881245, + "grad_norm": 3.7183159139250823, + "learning_rate": 1.4776070611310887e-05, + "loss": 1.3959, + "step": 2260 + }, + { + "epoch": 0.07405833115354434, + "grad_norm": 3.8988983357452462, + "learning_rate": 1.4808761033017327e-05, + "loss": 1.4235, + "step": 2265 + }, + { + "epoch": 0.07422181532827622, + "grad_norm": 3.86920824068245, + "learning_rate": 1.4841451454723767e-05, + "loss": 1.52, + "step": 2270 + }, + { + "epoch": 0.07438529950300811, + "grad_norm": 3.8474783512125406, + "learning_rate": 1.4874141876430207e-05, + "loss": 1.5325, + "step": 2275 + }, + { + "epoch": 0.07454878367774, + "grad_norm": 3.746177057775758, + "learning_rate": 1.4906832298136646e-05, + "loss": 1.4217, + "step": 2280 + }, + { + "epoch": 0.07471226785247188, + "grad_norm": 4.157186635575076, + "learning_rate": 1.4939522719843086e-05, + "loss": 1.5863, + "step": 2285 + }, + { + "epoch": 0.07487575202720377, + "grad_norm": 3.898041775680472, + "learning_rate": 1.4972213141549527e-05, + "loss": 1.5921, + "step": 2290 + }, + { + "epoch": 0.07503923620193566, + "grad_norm": 3.971054548454294, + "learning_rate": 1.5004903563255969e-05, + "loss": 1.6808, + "step": 2295 + }, + { + "epoch": 0.07520272037666755, + "grad_norm": 3.857081892892923, + "learning_rate": 1.5037593984962406e-05, + "loss": 1.4624, + "step": 2300 + }, + { + "epoch": 0.07536620455139942, + "grad_norm": 3.734209910019424, + "learning_rate": 1.5070284406668848e-05, + "loss": 1.4352, + "step": 2305 + }, + { + "epoch": 0.0755296887261313, + "grad_norm": 4.150032980409472, + "learning_rate": 1.5102974828375288e-05, + "loss": 1.5966, + "step": 2310 + }, + { + "epoch": 0.07569317290086319, + "grad_norm": 3.8052982965985773, + "learning_rate": 1.5135665250081728e-05, + "loss": 1.5537, + "step": 2315 + }, + { + "epoch": 0.07585665707559508, + "grad_norm": 4.0144876876198, + "learning_rate": 1.5168355671788167e-05, + "loss": 1.4055, + "step": 2320 + }, + { + "epoch": 0.07602014125032697, + "grad_norm": 3.848135296907062, + "learning_rate": 1.5201046093494607e-05, + "loss": 1.5454, + "step": 2325 + }, + { + "epoch": 0.07618362542505885, + "grad_norm": 3.9597051881303638, + "learning_rate": 1.5233736515201047e-05, + "loss": 1.6007, + "step": 2330 + }, + { + "epoch": 0.07634710959979074, + "grad_norm": 3.8945544525518203, + "learning_rate": 1.5266426936907488e-05, + "loss": 1.5501, + "step": 2335 + }, + { + "epoch": 0.07651059377452263, + "grad_norm": 3.672349174601466, + "learning_rate": 1.5299117358613926e-05, + "loss": 1.4306, + "step": 2340 + }, + { + "epoch": 0.07667407794925452, + "grad_norm": 4.736492666105702, + "learning_rate": 1.533180778032037e-05, + "loss": 1.5597, + "step": 2345 + }, + { + "epoch": 0.0768375621239864, + "grad_norm": 4.104074898076693, + "learning_rate": 1.5364498202026807e-05, + "loss": 1.4336, + "step": 2350 + }, + { + "epoch": 0.07700104629871829, + "grad_norm": 3.9741720930913345, + "learning_rate": 1.539718862373325e-05, + "loss": 1.4456, + "step": 2355 + }, + { + "epoch": 0.07716453047345018, + "grad_norm": 3.87222961784108, + "learning_rate": 1.5429879045439688e-05, + "loss": 1.4819, + "step": 2360 + }, + { + "epoch": 0.07732801464818205, + "grad_norm": 3.9168737926536132, + "learning_rate": 1.5462569467146126e-05, + "loss": 1.5661, + "step": 2365 + }, + { + "epoch": 0.07749149882291394, + "grad_norm": 3.908155217277627, + "learning_rate": 1.5495259888852568e-05, + "loss": 1.6314, + "step": 2370 + }, + { + "epoch": 0.07765498299764582, + "grad_norm": 3.9444989723783808, + "learning_rate": 1.5527950310559007e-05, + "loss": 1.4315, + "step": 2375 + }, + { + "epoch": 0.07781846717237771, + "grad_norm": 3.9527202772667063, + "learning_rate": 1.556064073226545e-05, + "loss": 1.6617, + "step": 2380 + }, + { + "epoch": 0.0779819513471096, + "grad_norm": 4.527260107609461, + "learning_rate": 1.5593331153971887e-05, + "loss": 1.5609, + "step": 2385 + }, + { + "epoch": 0.07814543552184149, + "grad_norm": 3.9474416940351724, + "learning_rate": 1.5626021575678326e-05, + "loss": 1.5099, + "step": 2390 + }, + { + "epoch": 0.07830891969657337, + "grad_norm": 4.173110499498873, + "learning_rate": 1.5658711997384768e-05, + "loss": 1.6002, + "step": 2395 + }, + { + "epoch": 0.07847240387130526, + "grad_norm": 4.112026886011532, + "learning_rate": 1.569140241909121e-05, + "loss": 1.5334, + "step": 2400 + }, + { + "epoch": 0.07863588804603715, + "grad_norm": 4.250692059394191, + "learning_rate": 1.5724092840797645e-05, + "loss": 1.5796, + "step": 2405 + }, + { + "epoch": 0.07879937222076903, + "grad_norm": 3.6427113403821547, + "learning_rate": 1.5756783262504087e-05, + "loss": 1.4042, + "step": 2410 + }, + { + "epoch": 0.07896285639550092, + "grad_norm": 3.992647825386925, + "learning_rate": 1.578947368421053e-05, + "loss": 1.5515, + "step": 2415 + }, + { + "epoch": 0.0791263405702328, + "grad_norm": 3.835914794338429, + "learning_rate": 1.5822164105916968e-05, + "loss": 1.4712, + "step": 2420 + }, + { + "epoch": 0.07928982474496468, + "grad_norm": 3.866873295424797, + "learning_rate": 1.5854854527623406e-05, + "loss": 1.5041, + "step": 2425 + }, + { + "epoch": 0.07945330891969657, + "grad_norm": 3.5393104002542564, + "learning_rate": 1.588754494932985e-05, + "loss": 1.4265, + "step": 2430 + }, + { + "epoch": 0.07961679309442846, + "grad_norm": 3.981739927538918, + "learning_rate": 1.5920235371036287e-05, + "loss": 1.5245, + "step": 2435 + }, + { + "epoch": 0.07978027726916034, + "grad_norm": 4.108380460675413, + "learning_rate": 1.595292579274273e-05, + "loss": 1.5007, + "step": 2440 + }, + { + "epoch": 0.07994376144389223, + "grad_norm": 4.130942384467043, + "learning_rate": 1.5985616214449168e-05, + "loss": 1.5085, + "step": 2445 + }, + { + "epoch": 0.08010724561862412, + "grad_norm": 3.8710156030961738, + "learning_rate": 1.6018306636155606e-05, + "loss": 1.5102, + "step": 2450 + }, + { + "epoch": 0.080270729793356, + "grad_norm": 3.9906538705653016, + "learning_rate": 1.6050997057862048e-05, + "loss": 1.5803, + "step": 2455 + }, + { + "epoch": 0.08043421396808789, + "grad_norm": 4.286833538370939, + "learning_rate": 1.6083687479568487e-05, + "loss": 1.6479, + "step": 2460 + }, + { + "epoch": 0.08059769814281978, + "grad_norm": 3.9534515496984257, + "learning_rate": 1.611637790127493e-05, + "loss": 1.5541, + "step": 2465 + }, + { + "epoch": 0.08076118231755167, + "grad_norm": 3.892221079027397, + "learning_rate": 1.6149068322981367e-05, + "loss": 1.5193, + "step": 2470 + }, + { + "epoch": 0.08092466649228355, + "grad_norm": 3.794863284156802, + "learning_rate": 1.6181758744687806e-05, + "loss": 1.4609, + "step": 2475 + }, + { + "epoch": 0.08108815066701543, + "grad_norm": 3.8150842387621315, + "learning_rate": 1.6214449166394248e-05, + "loss": 1.5875, + "step": 2480 + }, + { + "epoch": 0.08125163484174731, + "grad_norm": 3.73107742326742, + "learning_rate": 1.624713958810069e-05, + "loss": 1.5817, + "step": 2485 + }, + { + "epoch": 0.0814151190164792, + "grad_norm": 3.8892831774741445, + "learning_rate": 1.627983000980713e-05, + "loss": 1.3765, + "step": 2490 + }, + { + "epoch": 0.08157860319121109, + "grad_norm": 3.7518739683742623, + "learning_rate": 1.6312520431513567e-05, + "loss": 1.4246, + "step": 2495 + }, + { + "epoch": 0.08174208736594298, + "grad_norm": 3.862564096654626, + "learning_rate": 1.634521085322001e-05, + "loss": 1.5465, + "step": 2500 + }, + { + "epoch": 0.08190557154067486, + "grad_norm": 3.7996879561978862, + "learning_rate": 1.6377901274926448e-05, + "loss": 1.3935, + "step": 2505 + }, + { + "epoch": 0.08206905571540675, + "grad_norm": 3.835499662526411, + "learning_rate": 1.6410591696632887e-05, + "loss": 1.5075, + "step": 2510 + }, + { + "epoch": 0.08223253989013864, + "grad_norm": 3.849750383319034, + "learning_rate": 1.644328211833933e-05, + "loss": 1.5212, + "step": 2515 + }, + { + "epoch": 0.08239602406487052, + "grad_norm": 4.140957494072488, + "learning_rate": 1.6475972540045767e-05, + "loss": 1.5491, + "step": 2520 + }, + { + "epoch": 0.08255950823960241, + "grad_norm": 3.987980561605536, + "learning_rate": 1.650866296175221e-05, + "loss": 1.5932, + "step": 2525 + }, + { + "epoch": 0.0827229924143343, + "grad_norm": 4.022642796485821, + "learning_rate": 1.6541353383458648e-05, + "loss": 1.5657, + "step": 2530 + }, + { + "epoch": 0.08288647658906619, + "grad_norm": 3.7027939626128683, + "learning_rate": 1.6574043805165086e-05, + "loss": 1.5468, + "step": 2535 + }, + { + "epoch": 0.08304996076379806, + "grad_norm": 3.8317710665127636, + "learning_rate": 1.660673422687153e-05, + "loss": 1.5746, + "step": 2540 + }, + { + "epoch": 0.08321344493852995, + "grad_norm": 3.6170297988266977, + "learning_rate": 1.663942464857797e-05, + "loss": 1.3969, + "step": 2545 + }, + { + "epoch": 0.08337692911326183, + "grad_norm": 3.889180221489327, + "learning_rate": 1.6672115070284406e-05, + "loss": 1.5605, + "step": 2550 + }, + { + "epoch": 0.08354041328799372, + "grad_norm": 3.745899161735655, + "learning_rate": 1.6704805491990848e-05, + "loss": 1.4149, + "step": 2555 + }, + { + "epoch": 0.08370389746272561, + "grad_norm": 3.8597116900148767, + "learning_rate": 1.673749591369729e-05, + "loss": 1.522, + "step": 2560 + }, + { + "epoch": 0.0838673816374575, + "grad_norm": 3.816255101639947, + "learning_rate": 1.6770186335403728e-05, + "loss": 1.4709, + "step": 2565 + }, + { + "epoch": 0.08403086581218938, + "grad_norm": 3.609003687045627, + "learning_rate": 1.680287675711017e-05, + "loss": 1.5177, + "step": 2570 + }, + { + "epoch": 0.08419434998692127, + "grad_norm": 3.8307710055015574, + "learning_rate": 1.683556717881661e-05, + "loss": 1.6121, + "step": 2575 + }, + { + "epoch": 0.08435783416165316, + "grad_norm": 3.963406420134072, + "learning_rate": 1.6868257600523047e-05, + "loss": 1.5418, + "step": 2580 + }, + { + "epoch": 0.08452131833638504, + "grad_norm": 4.027145804283867, + "learning_rate": 1.690094802222949e-05, + "loss": 1.7389, + "step": 2585 + }, + { + "epoch": 0.08468480251111693, + "grad_norm": 3.990507741537985, + "learning_rate": 1.6933638443935928e-05, + "loss": 1.5924, + "step": 2590 + }, + { + "epoch": 0.0848482866858488, + "grad_norm": 3.9078887747526676, + "learning_rate": 1.6966328865642367e-05, + "loss": 1.5349, + "step": 2595 + }, + { + "epoch": 0.08501177086058069, + "grad_norm": 4.037663982543767, + "learning_rate": 1.699901928734881e-05, + "loss": 1.4855, + "step": 2600 + }, + { + "epoch": 0.08517525503531258, + "grad_norm": 3.937440266281854, + "learning_rate": 1.7031709709055247e-05, + "loss": 1.518, + "step": 2605 + }, + { + "epoch": 0.08533873921004446, + "grad_norm": 3.8564310227926857, + "learning_rate": 1.706440013076169e-05, + "loss": 1.4345, + "step": 2610 + }, + { + "epoch": 0.08550222338477635, + "grad_norm": 4.029858766675916, + "learning_rate": 1.7097090552468128e-05, + "loss": 1.5784, + "step": 2615 + }, + { + "epoch": 0.08566570755950824, + "grad_norm": 3.9289081081686312, + "learning_rate": 1.7129780974174566e-05, + "loss": 1.4707, + "step": 2620 + }, + { + "epoch": 0.08582919173424013, + "grad_norm": 3.841016708307184, + "learning_rate": 1.716247139588101e-05, + "loss": 1.6027, + "step": 2625 + }, + { + "epoch": 0.08599267590897201, + "grad_norm": 4.04513133910495, + "learning_rate": 1.719516181758745e-05, + "loss": 1.6259, + "step": 2630 + }, + { + "epoch": 0.0861561600837039, + "grad_norm": 3.8680074139188254, + "learning_rate": 1.722785223929389e-05, + "loss": 1.7084, + "step": 2635 + }, + { + "epoch": 0.08631964425843579, + "grad_norm": 4.117012557540263, + "learning_rate": 1.7260542661000328e-05, + "loss": 1.4894, + "step": 2640 + }, + { + "epoch": 0.08648312843316767, + "grad_norm": 3.8913161949783346, + "learning_rate": 1.729323308270677e-05, + "loss": 1.6518, + "step": 2645 + }, + { + "epoch": 0.08664661260789956, + "grad_norm": 4.153310022167821, + "learning_rate": 1.7325923504413208e-05, + "loss": 1.5268, + "step": 2650 + }, + { + "epoch": 0.08681009678263144, + "grad_norm": 3.5624625810584383, + "learning_rate": 1.7358613926119647e-05, + "loss": 1.4703, + "step": 2655 + }, + { + "epoch": 0.08697358095736332, + "grad_norm": 3.7669293067203364, + "learning_rate": 1.739130434782609e-05, + "loss": 1.5119, + "step": 2660 + }, + { + "epoch": 0.08713706513209521, + "grad_norm": 3.7435942465738234, + "learning_rate": 1.7423994769532527e-05, + "loss": 1.5119, + "step": 2665 + }, + { + "epoch": 0.0873005493068271, + "grad_norm": 4.23173969182354, + "learning_rate": 1.745668519123897e-05, + "loss": 1.4451, + "step": 2670 + }, + { + "epoch": 0.08746403348155898, + "grad_norm": 3.7949279914460563, + "learning_rate": 1.7489375612945408e-05, + "loss": 1.4657, + "step": 2675 + }, + { + "epoch": 0.08762751765629087, + "grad_norm": 3.7075748364517227, + "learning_rate": 1.7522066034651847e-05, + "loss": 1.6149, + "step": 2680 + }, + { + "epoch": 0.08779100183102276, + "grad_norm": 3.9287358284818135, + "learning_rate": 1.755475645635829e-05, + "loss": 1.5347, + "step": 2685 + }, + { + "epoch": 0.08795448600575465, + "grad_norm": 3.9116262092760237, + "learning_rate": 1.7587446878064727e-05, + "loss": 1.5436, + "step": 2690 + }, + { + "epoch": 0.08811797018048653, + "grad_norm": 3.9547801337347965, + "learning_rate": 1.762013729977117e-05, + "loss": 1.5789, + "step": 2695 + }, + { + "epoch": 0.08828145435521842, + "grad_norm": 3.9158789727158703, + "learning_rate": 1.7652827721477608e-05, + "loss": 1.4628, + "step": 2700 + }, + { + "epoch": 0.0884449385299503, + "grad_norm": 4.010716818161736, + "learning_rate": 1.768551814318405e-05, + "loss": 1.4776, + "step": 2705 + }, + { + "epoch": 0.08860842270468218, + "grad_norm": 3.9686086149787614, + "learning_rate": 1.771820856489049e-05, + "loss": 1.6343, + "step": 2710 + }, + { + "epoch": 0.08877190687941407, + "grad_norm": 4.021456285647782, + "learning_rate": 1.775089898659693e-05, + "loss": 1.5333, + "step": 2715 + }, + { + "epoch": 0.08893539105414595, + "grad_norm": 3.787302968173945, + "learning_rate": 1.778358940830337e-05, + "loss": 1.5141, + "step": 2720 + }, + { + "epoch": 0.08909887522887784, + "grad_norm": 4.041590751980877, + "learning_rate": 1.7816279830009808e-05, + "loss": 1.5711, + "step": 2725 + }, + { + "epoch": 0.08926235940360973, + "grad_norm": 3.692361267782154, + "learning_rate": 1.784897025171625e-05, + "loss": 1.3976, + "step": 2730 + }, + { + "epoch": 0.08942584357834162, + "grad_norm": 3.6893325511384893, + "learning_rate": 1.7881660673422688e-05, + "loss": 1.384, + "step": 2735 + }, + { + "epoch": 0.0895893277530735, + "grad_norm": 3.6921251770046806, + "learning_rate": 1.7914351095129127e-05, + "loss": 1.5854, + "step": 2740 + }, + { + "epoch": 0.08975281192780539, + "grad_norm": 3.7958329745972277, + "learning_rate": 1.794704151683557e-05, + "loss": 1.5929, + "step": 2745 + }, + { + "epoch": 0.08991629610253728, + "grad_norm": 3.7515198717682336, + "learning_rate": 1.7979731938542008e-05, + "loss": 1.6241, + "step": 2750 + }, + { + "epoch": 0.09007978027726916, + "grad_norm": 3.890403237032007, + "learning_rate": 1.801242236024845e-05, + "loss": 1.3846, + "step": 2755 + }, + { + "epoch": 0.09024326445200105, + "grad_norm": 4.077738213707022, + "learning_rate": 1.8045112781954888e-05, + "loss": 1.5402, + "step": 2760 + }, + { + "epoch": 0.09040674862673294, + "grad_norm": 3.9193664062317475, + "learning_rate": 1.8077803203661327e-05, + "loss": 1.4775, + "step": 2765 + }, + { + "epoch": 0.09057023280146481, + "grad_norm": 3.4454815746349348, + "learning_rate": 1.811049362536777e-05, + "loss": 1.4326, + "step": 2770 + }, + { + "epoch": 0.0907337169761967, + "grad_norm": 4.149763937517356, + "learning_rate": 1.814318404707421e-05, + "loss": 1.6351, + "step": 2775 + }, + { + "epoch": 0.09089720115092859, + "grad_norm": 3.747855457335754, + "learning_rate": 1.817587446878065e-05, + "loss": 1.4522, + "step": 2780 + }, + { + "epoch": 0.09106068532566047, + "grad_norm": 3.8545105848909342, + "learning_rate": 1.8208564890487088e-05, + "loss": 1.6052, + "step": 2785 + }, + { + "epoch": 0.09122416950039236, + "grad_norm": 3.670957013069443, + "learning_rate": 1.824125531219353e-05, + "loss": 1.4244, + "step": 2790 + }, + { + "epoch": 0.09138765367512425, + "grad_norm": 3.6254681870976944, + "learning_rate": 1.827394573389997e-05, + "loss": 1.524, + "step": 2795 + }, + { + "epoch": 0.09155113784985613, + "grad_norm": 3.9252580262647694, + "learning_rate": 1.830663615560641e-05, + "loss": 1.5488, + "step": 2800 + }, + { + "epoch": 0.09171462202458802, + "grad_norm": 4.12748000407981, + "learning_rate": 1.833932657731285e-05, + "loss": 1.6047, + "step": 2805 + }, + { + "epoch": 0.09187810619931991, + "grad_norm": 3.419858541952374, + "learning_rate": 1.8372016999019288e-05, + "loss": 1.4122, + "step": 2810 + }, + { + "epoch": 0.0920415903740518, + "grad_norm": 4.041241126536955, + "learning_rate": 1.840470742072573e-05, + "loss": 1.6582, + "step": 2815 + }, + { + "epoch": 0.09220507454878368, + "grad_norm": 4.1751623173585, + "learning_rate": 1.843739784243217e-05, + "loss": 1.509, + "step": 2820 + }, + { + "epoch": 0.09236855872351557, + "grad_norm": 3.687538186411203, + "learning_rate": 1.8470088264138607e-05, + "loss": 1.4689, + "step": 2825 + }, + { + "epoch": 0.09253204289824744, + "grad_norm": 3.8056239862456773, + "learning_rate": 1.850277868584505e-05, + "loss": 1.5248, + "step": 2830 + }, + { + "epoch": 0.09269552707297933, + "grad_norm": 4.238027612395347, + "learning_rate": 1.8535469107551488e-05, + "loss": 1.5399, + "step": 2835 + }, + { + "epoch": 0.09285901124771122, + "grad_norm": 3.681233830076371, + "learning_rate": 1.856815952925793e-05, + "loss": 1.3878, + "step": 2840 + }, + { + "epoch": 0.0930224954224431, + "grad_norm": 3.8774213049024935, + "learning_rate": 1.8600849950964368e-05, + "loss": 1.6883, + "step": 2845 + }, + { + "epoch": 0.09318597959717499, + "grad_norm": 3.8422524131154163, + "learning_rate": 1.863354037267081e-05, + "loss": 1.5367, + "step": 2850 + }, + { + "epoch": 0.09334946377190688, + "grad_norm": 3.8058504263153012, + "learning_rate": 1.866623079437725e-05, + "loss": 1.5357, + "step": 2855 + }, + { + "epoch": 0.09351294794663877, + "grad_norm": 3.9384109814478787, + "learning_rate": 1.869892121608369e-05, + "loss": 1.623, + "step": 2860 + }, + { + "epoch": 0.09367643212137065, + "grad_norm": 3.869436331925625, + "learning_rate": 1.873161163779013e-05, + "loss": 1.5852, + "step": 2865 + }, + { + "epoch": 0.09383991629610254, + "grad_norm": 3.870943549433104, + "learning_rate": 1.8764302059496568e-05, + "loss": 1.6063, + "step": 2870 + }, + { + "epoch": 0.09400340047083443, + "grad_norm": 3.929581768984165, + "learning_rate": 1.879699248120301e-05, + "loss": 1.7223, + "step": 2875 + }, + { + "epoch": 0.09416688464556631, + "grad_norm": 3.7519095395831403, + "learning_rate": 1.882968290290945e-05, + "loss": 1.4919, + "step": 2880 + }, + { + "epoch": 0.09433036882029819, + "grad_norm": 4.1560313505664235, + "learning_rate": 1.8862373324615887e-05, + "loss": 1.5889, + "step": 2885 + }, + { + "epoch": 0.09449385299503008, + "grad_norm": 3.9747444369639164, + "learning_rate": 1.889506374632233e-05, + "loss": 1.4905, + "step": 2890 + }, + { + "epoch": 0.09465733716976196, + "grad_norm": 4.25236227235218, + "learning_rate": 1.8927754168028768e-05, + "loss": 1.5032, + "step": 2895 + }, + { + "epoch": 0.09482082134449385, + "grad_norm": 3.607901781385602, + "learning_rate": 1.896044458973521e-05, + "loss": 1.5905, + "step": 2900 + }, + { + "epoch": 0.09498430551922574, + "grad_norm": 3.8494304289727217, + "learning_rate": 1.8993135011441652e-05, + "loss": 1.4849, + "step": 2905 + }, + { + "epoch": 0.09514778969395762, + "grad_norm": 3.7682908711742473, + "learning_rate": 1.9025825433148087e-05, + "loss": 1.4702, + "step": 2910 + }, + { + "epoch": 0.09531127386868951, + "grad_norm": 3.660231650121196, + "learning_rate": 1.905851585485453e-05, + "loss": 1.5111, + "step": 2915 + }, + { + "epoch": 0.0954747580434214, + "grad_norm": 3.934830372724098, + "learning_rate": 1.909120627656097e-05, + "loss": 1.4773, + "step": 2920 + }, + { + "epoch": 0.09563824221815329, + "grad_norm": 3.862517893533473, + "learning_rate": 1.912389669826741e-05, + "loss": 1.4836, + "step": 2925 + }, + { + "epoch": 0.09580172639288517, + "grad_norm": 3.9996852045875717, + "learning_rate": 1.9156587119973848e-05, + "loss": 1.5376, + "step": 2930 + }, + { + "epoch": 0.09596521056761706, + "grad_norm": 3.521677945809555, + "learning_rate": 1.918927754168029e-05, + "loss": 1.5171, + "step": 2935 + }, + { + "epoch": 0.09612869474234895, + "grad_norm": 4.0531527333477335, + "learning_rate": 1.922196796338673e-05, + "loss": 1.555, + "step": 2940 + }, + { + "epoch": 0.09629217891708082, + "grad_norm": 3.9584927478188177, + "learning_rate": 1.925465838509317e-05, + "loss": 1.5198, + "step": 2945 + }, + { + "epoch": 0.09645566309181271, + "grad_norm": 3.631576667520586, + "learning_rate": 1.928734880679961e-05, + "loss": 1.4312, + "step": 2950 + }, + { + "epoch": 0.0966191472665446, + "grad_norm": 3.815747263317347, + "learning_rate": 1.9320039228506048e-05, + "loss": 1.4841, + "step": 2955 + }, + { + "epoch": 0.09678263144127648, + "grad_norm": 4.065507604175729, + "learning_rate": 1.935272965021249e-05, + "loss": 1.6891, + "step": 2960 + }, + { + "epoch": 0.09694611561600837, + "grad_norm": 3.958444399606792, + "learning_rate": 1.938542007191893e-05, + "loss": 1.6078, + "step": 2965 + }, + { + "epoch": 0.09710959979074026, + "grad_norm": 3.6032165456416982, + "learning_rate": 1.9418110493625367e-05, + "loss": 1.4665, + "step": 2970 + }, + { + "epoch": 0.09727308396547214, + "grad_norm": 3.6436053104698125, + "learning_rate": 1.945080091533181e-05, + "loss": 1.5463, + "step": 2975 + }, + { + "epoch": 0.09743656814020403, + "grad_norm": 3.7186539517826214, + "learning_rate": 1.9483491337038248e-05, + "loss": 1.5698, + "step": 2980 + }, + { + "epoch": 0.09760005231493592, + "grad_norm": 3.7624546452192735, + "learning_rate": 1.951618175874469e-05, + "loss": 1.7065, + "step": 2985 + }, + { + "epoch": 0.0977635364896678, + "grad_norm": 3.726770381138607, + "learning_rate": 1.954887218045113e-05, + "loss": 1.6651, + "step": 2990 + }, + { + "epoch": 0.09792702066439969, + "grad_norm": 3.538352386836169, + "learning_rate": 1.958156260215757e-05, + "loss": 1.4937, + "step": 2995 + }, + { + "epoch": 0.09809050483913158, + "grad_norm": 3.5930088598250753, + "learning_rate": 1.961425302386401e-05, + "loss": 1.4719, + "step": 3000 + }, + { + "epoch": 0.09825398901386345, + "grad_norm": 3.3465593464078465, + "learning_rate": 1.964694344557045e-05, + "loss": 1.4875, + "step": 3005 + }, + { + "epoch": 0.09841747318859534, + "grad_norm": 3.8099757523884685, + "learning_rate": 1.967963386727689e-05, + "loss": 1.4717, + "step": 3010 + }, + { + "epoch": 0.09858095736332723, + "grad_norm": 3.8348028865789043, + "learning_rate": 1.971232428898333e-05, + "loss": 1.5976, + "step": 3015 + }, + { + "epoch": 0.09874444153805911, + "grad_norm": 3.839441860891903, + "learning_rate": 1.974501471068977e-05, + "loss": 1.5034, + "step": 3020 + }, + { + "epoch": 0.098907925712791, + "grad_norm": 3.678184744792054, + "learning_rate": 1.977770513239621e-05, + "loss": 1.472, + "step": 3025 + }, + { + "epoch": 0.09907140988752289, + "grad_norm": 3.5140526758743214, + "learning_rate": 1.981039555410265e-05, + "loss": 1.4639, + "step": 3030 + }, + { + "epoch": 0.09923489406225477, + "grad_norm": 3.807100959599707, + "learning_rate": 1.984308597580909e-05, + "loss": 1.5573, + "step": 3035 + }, + { + "epoch": 0.09939837823698666, + "grad_norm": 3.967443756817193, + "learning_rate": 1.9875776397515528e-05, + "loss": 1.4321, + "step": 3040 + }, + { + "epoch": 0.09956186241171855, + "grad_norm": 3.935147730023855, + "learning_rate": 1.990846681922197e-05, + "loss": 1.5231, + "step": 3045 + }, + { + "epoch": 0.09972534658645044, + "grad_norm": 3.49570142188034, + "learning_rate": 1.994115724092841e-05, + "loss": 1.4277, + "step": 3050 + }, + { + "epoch": 0.09988883076118232, + "grad_norm": 3.542511064783324, + "learning_rate": 1.9973847662634847e-05, + "loss": 1.5502, + "step": 3055 + }, + { + "epoch": 0.1000523149359142, + "grad_norm": 3.8553905851964236, + "learning_rate": 1.99999999348649e-05, + "loss": 1.512, + "step": 3060 + }, + { + "epoch": 0.10021579911064608, + "grad_norm": 3.8999681740245045, + "learning_rate": 1.9999997655136437e-05, + "loss": 1.4802, + "step": 3065 + }, + { + "epoch": 0.10037928328537797, + "grad_norm": 3.7578328719782865, + "learning_rate": 1.999999211865375e-05, + "loss": 1.4641, + "step": 3070 + }, + { + "epoch": 0.10054276746010986, + "grad_norm": 3.9271675029646262, + "learning_rate": 1.9999983325418642e-05, + "loss": 1.5353, + "step": 3075 + }, + { + "epoch": 0.10070625163484175, + "grad_norm": 3.695877968317878, + "learning_rate": 1.9999971275433978e-05, + "loss": 1.594, + "step": 3080 + }, + { + "epoch": 0.10086973580957363, + "grad_norm": 3.6766384998292865, + "learning_rate": 1.9999955968703682e-05, + "loss": 1.4733, + "step": 3085 + }, + { + "epoch": 0.10103321998430552, + "grad_norm": 4.0363941370924135, + "learning_rate": 1.9999937405232735e-05, + "loss": 1.5708, + "step": 3090 + }, + { + "epoch": 0.1011967041590374, + "grad_norm": 3.765561372105954, + "learning_rate": 1.9999915585027184e-05, + "loss": 1.4645, + "step": 3095 + }, + { + "epoch": 0.1013601883337693, + "grad_norm": 3.743716859379929, + "learning_rate": 1.999989050809414e-05, + "loss": 1.5617, + "step": 3100 + }, + { + "epoch": 0.10152367250850118, + "grad_norm": 3.6817245699139947, + "learning_rate": 1.9999862174441764e-05, + "loss": 1.5375, + "step": 3105 + }, + { + "epoch": 0.10168715668323307, + "grad_norm": 3.840304966187688, + "learning_rate": 1.999983058407929e-05, + "loss": 1.5619, + "step": 3110 + }, + { + "epoch": 0.10185064085796496, + "grad_norm": 3.992993129943934, + "learning_rate": 1.9999795737017e-05, + "loss": 1.6253, + "step": 3115 + }, + { + "epoch": 0.10201412503269683, + "grad_norm": 3.5387889922716145, + "learning_rate": 1.9999757633266246e-05, + "loss": 1.6773, + "step": 3120 + }, + { + "epoch": 0.10217760920742872, + "grad_norm": 3.6981491486051548, + "learning_rate": 1.9999716272839434e-05, + "loss": 1.6063, + "step": 3125 + }, + { + "epoch": 0.1023410933821606, + "grad_norm": 3.5517442638138377, + "learning_rate": 1.9999671655750043e-05, + "loss": 1.3945, + "step": 3130 + }, + { + "epoch": 0.10250457755689249, + "grad_norm": 3.8194654265086143, + "learning_rate": 1.9999623782012595e-05, + "loss": 1.3984, + "step": 3135 + }, + { + "epoch": 0.10266806173162438, + "grad_norm": 3.8839016711835646, + "learning_rate": 1.999957265164268e-05, + "loss": 1.638, + "step": 3140 + }, + { + "epoch": 0.10283154590635626, + "grad_norm": 3.3323230164028503, + "learning_rate": 1.999951826465696e-05, + "loss": 1.5077, + "step": 3145 + }, + { + "epoch": 0.10299503008108815, + "grad_norm": 3.58507181989749, + "learning_rate": 1.9999460621073137e-05, + "loss": 1.5405, + "step": 3150 + }, + { + "epoch": 0.10315851425582004, + "grad_norm": 3.8159147832958586, + "learning_rate": 1.999939972090999e-05, + "loss": 1.4669, + "step": 3155 + }, + { + "epoch": 0.10332199843055193, + "grad_norm": 3.76395159076416, + "learning_rate": 1.9999335564187348e-05, + "loss": 1.4623, + "step": 3160 + }, + { + "epoch": 0.10348548260528381, + "grad_norm": 3.7399597161338733, + "learning_rate": 1.9999268150926112e-05, + "loss": 1.5514, + "step": 3165 + }, + { + "epoch": 0.1036489667800157, + "grad_norm": 3.5904972412530896, + "learning_rate": 1.9999197481148235e-05, + "loss": 1.5866, + "step": 3170 + }, + { + "epoch": 0.10381245095474759, + "grad_norm": 3.898280702169868, + "learning_rate": 1.9999123554876724e-05, + "loss": 1.4608, + "step": 3175 + }, + { + "epoch": 0.10397593512947946, + "grad_norm": 4.078366482041691, + "learning_rate": 1.9999046372135667e-05, + "loss": 1.4894, + "step": 3180 + }, + { + "epoch": 0.10413941930421135, + "grad_norm": 3.5975633888630516, + "learning_rate": 1.9998965932950193e-05, + "loss": 1.533, + "step": 3185 + }, + { + "epoch": 0.10430290347894323, + "grad_norm": 3.8801356569830254, + "learning_rate": 1.9998882237346502e-05, + "loss": 1.5029, + "step": 3190 + }, + { + "epoch": 0.10446638765367512, + "grad_norm": 3.8386367986586314, + "learning_rate": 1.999879528535185e-05, + "loss": 1.5034, + "step": 3195 + }, + { + "epoch": 0.10462987182840701, + "grad_norm": 3.601883224238057, + "learning_rate": 1.999870507699456e-05, + "loss": 1.4924, + "step": 3200 + }, + { + "epoch": 0.1047933560031389, + "grad_norm": 3.7854290947013496, + "learning_rate": 1.9998611612304006e-05, + "loss": 1.6765, + "step": 3205 + }, + { + "epoch": 0.10495684017787078, + "grad_norm": 3.800914195965795, + "learning_rate": 1.9998514891310622e-05, + "loss": 1.4351, + "step": 3210 + }, + { + "epoch": 0.10512032435260267, + "grad_norm": 3.695152006877561, + "learning_rate": 1.9998414914045918e-05, + "loss": 1.5751, + "step": 3215 + }, + { + "epoch": 0.10528380852733456, + "grad_norm": 3.6511525061869596, + "learning_rate": 1.999831168054245e-05, + "loss": 1.7068, + "step": 3220 + }, + { + "epoch": 0.10544729270206644, + "grad_norm": 3.679427518129998, + "learning_rate": 1.9998205190833834e-05, + "loss": 1.5366, + "step": 3225 + }, + { + "epoch": 0.10561077687679833, + "grad_norm": 3.801542791822094, + "learning_rate": 1.9998095444954756e-05, + "loss": 1.4531, + "step": 3230 + }, + { + "epoch": 0.1057742610515302, + "grad_norm": 3.818023922172962, + "learning_rate": 1.999798244294096e-05, + "loss": 1.526, + "step": 3235 + }, + { + "epoch": 0.10593774522626209, + "grad_norm": 3.6658063306995357, + "learning_rate": 1.9997866184829244e-05, + "loss": 1.7927, + "step": 3240 + }, + { + "epoch": 0.10610122940099398, + "grad_norm": 3.686890951658766, + "learning_rate": 1.999774667065747e-05, + "loss": 1.4713, + "step": 3245 + }, + { + "epoch": 0.10626471357572587, + "grad_norm": 3.8727329823908865, + "learning_rate": 1.999762390046456e-05, + "loss": 1.6882, + "step": 3250 + }, + { + "epoch": 0.10642819775045775, + "grad_norm": 4.074030200975753, + "learning_rate": 1.9997497874290506e-05, + "loss": 1.5027, + "step": 3255 + }, + { + "epoch": 0.10659168192518964, + "grad_norm": 3.382516195375834, + "learning_rate": 1.999736859217634e-05, + "loss": 1.4593, + "step": 3260 + }, + { + "epoch": 0.10675516609992153, + "grad_norm": 3.3961492748188116, + "learning_rate": 1.9997236054164173e-05, + "loss": 1.4625, + "step": 3265 + }, + { + "epoch": 0.10691865027465342, + "grad_norm": 3.5348512103965537, + "learning_rate": 1.9997100260297167e-05, + "loss": 1.4667, + "step": 3270 + }, + { + "epoch": 0.1070821344493853, + "grad_norm": 3.576380768088592, + "learning_rate": 1.9996961210619545e-05, + "loss": 1.5065, + "step": 3275 + }, + { + "epoch": 0.10724561862411719, + "grad_norm": 3.5755209889284014, + "learning_rate": 1.9996818905176596e-05, + "loss": 1.5335, + "step": 3280 + }, + { + "epoch": 0.10740910279884908, + "grad_norm": 3.641350041983998, + "learning_rate": 1.9996673344014663e-05, + "loss": 1.4711, + "step": 3285 + }, + { + "epoch": 0.10757258697358096, + "grad_norm": 3.6477819467481165, + "learning_rate": 1.9996524527181153e-05, + "loss": 1.5099, + "step": 3290 + }, + { + "epoch": 0.10773607114831284, + "grad_norm": 3.9460734804270463, + "learning_rate": 1.9996372454724532e-05, + "loss": 1.4979, + "step": 3295 + }, + { + "epoch": 0.10789955532304472, + "grad_norm": 3.7924186651463065, + "learning_rate": 1.9996217126694323e-05, + "loss": 1.5132, + "step": 3300 + }, + { + "epoch": 0.10806303949777661, + "grad_norm": 3.805508967955034, + "learning_rate": 1.999605854314112e-05, + "loss": 1.5926, + "step": 3305 + }, + { + "epoch": 0.1082265236725085, + "grad_norm": 3.6856250968400106, + "learning_rate": 1.999589670411656e-05, + "loss": 1.6045, + "step": 3310 + }, + { + "epoch": 0.10839000784724039, + "grad_norm": 3.8791332769230586, + "learning_rate": 1.9995731609673354e-05, + "loss": 1.4909, + "step": 3315 + }, + { + "epoch": 0.10855349202197227, + "grad_norm": 3.5430455652682964, + "learning_rate": 1.9995563259865274e-05, + "loss": 1.5054, + "step": 3320 + }, + { + "epoch": 0.10871697619670416, + "grad_norm": 3.9041783455947767, + "learning_rate": 1.999539165474714e-05, + "loss": 1.5023, + "step": 3325 + }, + { + "epoch": 0.10888046037143605, + "grad_norm": 3.8171439698207963, + "learning_rate": 1.999521679437485e-05, + "loss": 1.5829, + "step": 3330 + }, + { + "epoch": 0.10904394454616793, + "grad_norm": 4.003251581080997, + "learning_rate": 1.9995038678805338e-05, + "loss": 1.7439, + "step": 3335 + }, + { + "epoch": 0.10920742872089982, + "grad_norm": 3.74332447089556, + "learning_rate": 1.9994857308096616e-05, + "loss": 1.5647, + "step": 3340 + }, + { + "epoch": 0.10937091289563171, + "grad_norm": 3.875611384289001, + "learning_rate": 1.999467268230776e-05, + "loss": 1.5408, + "step": 3345 + }, + { + "epoch": 0.1095343970703636, + "grad_norm": 3.7392767507411886, + "learning_rate": 1.9994484801498895e-05, + "loss": 1.6729, + "step": 3350 + }, + { + "epoch": 0.10969788124509547, + "grad_norm": 3.3944487197301965, + "learning_rate": 1.99942936657312e-05, + "loss": 1.5278, + "step": 3355 + }, + { + "epoch": 0.10986136541982736, + "grad_norm": 3.8649467622943745, + "learning_rate": 1.999409927506694e-05, + "loss": 1.5261, + "step": 3360 + }, + { + "epoch": 0.11002484959455924, + "grad_norm": 3.8699882707989897, + "learning_rate": 1.9993901629569406e-05, + "loss": 1.5821, + "step": 3365 + }, + { + "epoch": 0.11018833376929113, + "grad_norm": 3.5985421281391887, + "learning_rate": 1.9993700729302975e-05, + "loss": 1.4459, + "step": 3370 + }, + { + "epoch": 0.11035181794402302, + "grad_norm": 3.54335428307766, + "learning_rate": 1.9993496574333072e-05, + "loss": 1.5272, + "step": 3375 + }, + { + "epoch": 0.1105153021187549, + "grad_norm": 3.3102949927465297, + "learning_rate": 1.999328916472619e-05, + "loss": 1.6255, + "step": 3380 + }, + { + "epoch": 0.11067878629348679, + "grad_norm": 3.5238948231741327, + "learning_rate": 1.9993078500549875e-05, + "loss": 1.5137, + "step": 3385 + }, + { + "epoch": 0.11084227046821868, + "grad_norm": 3.715785146018331, + "learning_rate": 1.9992864581872733e-05, + "loss": 1.525, + "step": 3390 + }, + { + "epoch": 0.11100575464295057, + "grad_norm": 3.801822447682004, + "learning_rate": 1.9992647408764437e-05, + "loss": 1.5214, + "step": 3395 + }, + { + "epoch": 0.11116923881768245, + "grad_norm": 3.8088939580518524, + "learning_rate": 1.999242698129571e-05, + "loss": 1.6205, + "step": 3400 + }, + { + "epoch": 0.11133272299241434, + "grad_norm": 3.949076648572656, + "learning_rate": 1.999220329953834e-05, + "loss": 1.5004, + "step": 3405 + }, + { + "epoch": 0.11149620716714621, + "grad_norm": 3.7273749977183597, + "learning_rate": 1.9991976363565174e-05, + "loss": 1.5691, + "step": 3410 + }, + { + "epoch": 0.1116596913418781, + "grad_norm": 3.743302923324857, + "learning_rate": 1.9991746173450128e-05, + "loss": 1.5446, + "step": 3415 + }, + { + "epoch": 0.11182317551660999, + "grad_norm": 3.5086438997407177, + "learning_rate": 1.999151272926816e-05, + "loss": 1.4601, + "step": 3420 + }, + { + "epoch": 0.11198665969134187, + "grad_norm": 3.6299659803130413, + "learning_rate": 1.99912760310953e-05, + "loss": 1.5006, + "step": 3425 + }, + { + "epoch": 0.11215014386607376, + "grad_norm": 3.6825780663332393, + "learning_rate": 1.9991036079008635e-05, + "loss": 1.534, + "step": 3430 + }, + { + "epoch": 0.11231362804080565, + "grad_norm": 3.666457055180824, + "learning_rate": 1.999079287308631e-05, + "loss": 1.5111, + "step": 3435 + }, + { + "epoch": 0.11247711221553754, + "grad_norm": 3.6537218180173787, + "learning_rate": 1.9990546413407535e-05, + "loss": 1.4909, + "step": 3440 + }, + { + "epoch": 0.11264059639026942, + "grad_norm": 3.590155963327672, + "learning_rate": 1.999029670005257e-05, + "loss": 1.51, + "step": 3445 + }, + { + "epoch": 0.11280408056500131, + "grad_norm": 3.526928533052813, + "learning_rate": 1.9990043733102748e-05, + "loss": 1.4292, + "step": 3450 + }, + { + "epoch": 0.1129675647397332, + "grad_norm": 3.6515267301893926, + "learning_rate": 1.9989787512640448e-05, + "loss": 1.5894, + "step": 3455 + }, + { + "epoch": 0.11313104891446508, + "grad_norm": 3.4327192674289977, + "learning_rate": 1.9989528038749117e-05, + "loss": 1.5025, + "step": 3460 + }, + { + "epoch": 0.11329453308919697, + "grad_norm": 3.4918804512133836, + "learning_rate": 1.998926531151326e-05, + "loss": 1.573, + "step": 3465 + }, + { + "epoch": 0.11345801726392885, + "grad_norm": 3.547568039863795, + "learning_rate": 1.9988999331018438e-05, + "loss": 1.6653, + "step": 3470 + }, + { + "epoch": 0.11362150143866073, + "grad_norm": 3.8452240319722986, + "learning_rate": 1.9988730097351278e-05, + "loss": 1.6364, + "step": 3475 + }, + { + "epoch": 0.11378498561339262, + "grad_norm": 3.786459153875742, + "learning_rate": 1.998845761059946e-05, + "loss": 1.5687, + "step": 3480 + }, + { + "epoch": 0.1139484697881245, + "grad_norm": 3.7709636913798588, + "learning_rate": 1.9988181870851728e-05, + "loss": 1.6467, + "step": 3485 + }, + { + "epoch": 0.1141119539628564, + "grad_norm": 3.4254721615596817, + "learning_rate": 1.9987902878197886e-05, + "loss": 1.4564, + "step": 3490 + }, + { + "epoch": 0.11427543813758828, + "grad_norm": 3.970091629017017, + "learning_rate": 1.9987620632728786e-05, + "loss": 1.5541, + "step": 3495 + }, + { + "epoch": 0.11443892231232017, + "grad_norm": 3.50036029505066, + "learning_rate": 1.9987335134536357e-05, + "loss": 1.4822, + "step": 3500 + }, + { + "epoch": 0.11460240648705206, + "grad_norm": 3.863763536449612, + "learning_rate": 1.9987046383713578e-05, + "loss": 1.6086, + "step": 3505 + }, + { + "epoch": 0.11476589066178394, + "grad_norm": 3.3811413626916718, + "learning_rate": 1.9986754380354487e-05, + "loss": 1.4817, + "step": 3510 + }, + { + "epoch": 0.11492937483651583, + "grad_norm": 3.7085467882612395, + "learning_rate": 1.998645912455418e-05, + "loss": 1.5729, + "step": 3515 + }, + { + "epoch": 0.11509285901124772, + "grad_norm": 3.3403928048314926, + "learning_rate": 1.9986160616408816e-05, + "loss": 1.6092, + "step": 3520 + }, + { + "epoch": 0.1152563431859796, + "grad_norm": 3.580435668114936, + "learning_rate": 1.9985858856015613e-05, + "loss": 1.6299, + "step": 3525 + }, + { + "epoch": 0.11541982736071148, + "grad_norm": 3.8283222777264334, + "learning_rate": 1.9985553843472846e-05, + "loss": 1.5633, + "step": 3530 + }, + { + "epoch": 0.11558331153544336, + "grad_norm": 4.2086142465215275, + "learning_rate": 1.998524557887985e-05, + "loss": 1.6714, + "step": 3535 + }, + { + "epoch": 0.11574679571017525, + "grad_norm": 3.689871987943439, + "learning_rate": 1.998493406233702e-05, + "loss": 1.4833, + "step": 3540 + }, + { + "epoch": 0.11591027988490714, + "grad_norm": 3.716315512848329, + "learning_rate": 1.9984619293945807e-05, + "loss": 1.4304, + "step": 3545 + }, + { + "epoch": 0.11607376405963903, + "grad_norm": 3.637624071755964, + "learning_rate": 1.9984301273808727e-05, + "loss": 1.5198, + "step": 3550 + }, + { + "epoch": 0.11623724823437091, + "grad_norm": 3.829415924899315, + "learning_rate": 1.9983980002029348e-05, + "loss": 1.4936, + "step": 3555 + }, + { + "epoch": 0.1164007324091028, + "grad_norm": 3.847152906839731, + "learning_rate": 1.99836554787123e-05, + "loss": 1.5459, + "step": 3560 + }, + { + "epoch": 0.11656421658383469, + "grad_norm": 3.852419619176706, + "learning_rate": 1.9983327703963278e-05, + "loss": 1.472, + "step": 3565 + }, + { + "epoch": 0.11672770075856657, + "grad_norm": 3.8201992864807823, + "learning_rate": 1.9982996677889023e-05, + "loss": 1.6085, + "step": 3570 + }, + { + "epoch": 0.11689118493329846, + "grad_norm": 4.057890874032681, + "learning_rate": 1.9982662400597348e-05, + "loss": 1.5207, + "step": 3575 + }, + { + "epoch": 0.11705466910803035, + "grad_norm": 3.664589165102271, + "learning_rate": 1.9982324872197116e-05, + "loss": 1.6636, + "step": 3580 + }, + { + "epoch": 0.11721815328276222, + "grad_norm": 3.491076980924965, + "learning_rate": 1.998198409279825e-05, + "loss": 1.5606, + "step": 3585 + }, + { + "epoch": 0.11738163745749411, + "grad_norm": 3.6056765711525305, + "learning_rate": 1.9981640062511734e-05, + "loss": 1.5466, + "step": 3590 + }, + { + "epoch": 0.117545121632226, + "grad_norm": 4.172342821852726, + "learning_rate": 1.9981292781449618e-05, + "loss": 1.6452, + "step": 3595 + }, + { + "epoch": 0.11770860580695788, + "grad_norm": 3.777971112458677, + "learning_rate": 1.998094224972499e-05, + "loss": 1.5022, + "step": 3600 + }, + { + "epoch": 0.11787208998168977, + "grad_norm": 3.6880838790750166, + "learning_rate": 1.998058846745202e-05, + "loss": 1.4266, + "step": 3605 + }, + { + "epoch": 0.11803557415642166, + "grad_norm": 3.8737934575576274, + "learning_rate": 1.9980231434745922e-05, + "loss": 1.4667, + "step": 3610 + }, + { + "epoch": 0.11819905833115354, + "grad_norm": 3.7243293876393193, + "learning_rate": 1.9979871151722973e-05, + "loss": 1.582, + "step": 3615 + }, + { + "epoch": 0.11836254250588543, + "grad_norm": 3.5626341445358625, + "learning_rate": 1.997950761850051e-05, + "loss": 1.5466, + "step": 3620 + }, + { + "epoch": 0.11852602668061732, + "grad_norm": 3.750527347321608, + "learning_rate": 1.9979140835196925e-05, + "loss": 1.5962, + "step": 3625 + }, + { + "epoch": 0.1186895108553492, + "grad_norm": 3.535247481468008, + "learning_rate": 1.997877080193167e-05, + "loss": 1.537, + "step": 3630 + }, + { + "epoch": 0.1188529950300811, + "grad_norm": 3.418992994110544, + "learning_rate": 1.9978397518825255e-05, + "loss": 1.4455, + "step": 3635 + }, + { + "epoch": 0.11901647920481298, + "grad_norm": 3.642204950890085, + "learning_rate": 1.9978020985999252e-05, + "loss": 1.514, + "step": 3640 + }, + { + "epoch": 0.11917996337954485, + "grad_norm": 3.8278462858084783, + "learning_rate": 1.9977641203576287e-05, + "loss": 1.5949, + "step": 3645 + }, + { + "epoch": 0.11934344755427674, + "grad_norm": 3.787309559463081, + "learning_rate": 1.9977258171680044e-05, + "loss": 1.4868, + "step": 3650 + }, + { + "epoch": 0.11950693172900863, + "grad_norm": 3.7356140365016803, + "learning_rate": 1.9976871890435274e-05, + "loss": 1.5519, + "step": 3655 + }, + { + "epoch": 0.11967041590374052, + "grad_norm": 3.643764105631874, + "learning_rate": 1.9976482359967774e-05, + "loss": 1.5894, + "step": 3660 + }, + { + "epoch": 0.1198339000784724, + "grad_norm": 3.6503426644149326, + "learning_rate": 1.99760895804044e-05, + "loss": 1.6912, + "step": 3665 + }, + { + "epoch": 0.11999738425320429, + "grad_norm": 4.927038064537145, + "learning_rate": 1.9975693551873082e-05, + "loss": 1.5157, + "step": 3670 + }, + { + "epoch": 0.12016086842793618, + "grad_norm": 3.577232524911499, + "learning_rate": 1.9975294274502787e-05, + "loss": 1.5332, + "step": 3675 + }, + { + "epoch": 0.12032435260266806, + "grad_norm": 3.5888851683696585, + "learning_rate": 1.9974891748423553e-05, + "loss": 1.5322, + "step": 3680 + }, + { + "epoch": 0.12048783677739995, + "grad_norm": 3.849772291338502, + "learning_rate": 1.9974485973766476e-05, + "loss": 1.6308, + "step": 3685 + }, + { + "epoch": 0.12065132095213184, + "grad_norm": 3.2289563204283276, + "learning_rate": 1.9974076950663705e-05, + "loss": 1.473, + "step": 3690 + }, + { + "epoch": 0.12081480512686373, + "grad_norm": 3.7516252515047843, + "learning_rate": 1.9973664679248443e-05, + "loss": 1.7211, + "step": 3695 + }, + { + "epoch": 0.12097828930159561, + "grad_norm": 3.446195338047843, + "learning_rate": 1.9973249159654965e-05, + "loss": 1.4745, + "step": 3700 + }, + { + "epoch": 0.12114177347632749, + "grad_norm": 3.572256456722119, + "learning_rate": 1.9972830392018593e-05, + "loss": 1.5312, + "step": 3705 + }, + { + "epoch": 0.12130525765105937, + "grad_norm": 3.4672695198883745, + "learning_rate": 1.9972408376475703e-05, + "loss": 1.4194, + "step": 3710 + }, + { + "epoch": 0.12146874182579126, + "grad_norm": 3.6409792065131246, + "learning_rate": 1.9971983113163745e-05, + "loss": 1.6299, + "step": 3715 + }, + { + "epoch": 0.12163222600052315, + "grad_norm": 3.501630223364216, + "learning_rate": 1.9971554602221213e-05, + "loss": 1.5632, + "step": 3720 + }, + { + "epoch": 0.12179571017525503, + "grad_norm": 3.820129509439725, + "learning_rate": 1.9971122843787662e-05, + "loss": 1.4919, + "step": 3725 + }, + { + "epoch": 0.12195919434998692, + "grad_norm": 3.7046437263611516, + "learning_rate": 1.9970687838003704e-05, + "loss": 1.5417, + "step": 3730 + }, + { + "epoch": 0.12212267852471881, + "grad_norm": 3.9345416059428553, + "learning_rate": 1.997024958501101e-05, + "loss": 1.705, + "step": 3735 + }, + { + "epoch": 0.1222861626994507, + "grad_norm": 3.5618676976185433, + "learning_rate": 1.996980808495231e-05, + "loss": 1.5375, + "step": 3740 + }, + { + "epoch": 0.12244964687418258, + "grad_norm": 3.4780029374975348, + "learning_rate": 1.9969363337971392e-05, + "loss": 1.4841, + "step": 3745 + }, + { + "epoch": 0.12261313104891447, + "grad_norm": 3.4536171490721497, + "learning_rate": 1.9968915344213094e-05, + "loss": 1.6252, + "step": 3750 + }, + { + "epoch": 0.12277661522364636, + "grad_norm": 3.583711224204737, + "learning_rate": 1.9968464103823317e-05, + "loss": 1.4235, + "step": 3755 + }, + { + "epoch": 0.12294009939837823, + "grad_norm": 3.435417989038651, + "learning_rate": 1.996800961694902e-05, + "loss": 1.481, + "step": 3760 + }, + { + "epoch": 0.12310358357311012, + "grad_norm": 3.726998214712221, + "learning_rate": 1.9967551883738223e-05, + "loss": 1.5332, + "step": 3765 + }, + { + "epoch": 0.123267067747842, + "grad_norm": 3.5134106779084906, + "learning_rate": 1.996709090433999e-05, + "loss": 1.6441, + "step": 3770 + }, + { + "epoch": 0.12343055192257389, + "grad_norm": 3.8555247467247353, + "learning_rate": 1.996662667890446e-05, + "loss": 1.574, + "step": 3775 + }, + { + "epoch": 0.12359403609730578, + "grad_norm": 3.514314457981985, + "learning_rate": 1.996615920758281e-05, + "loss": 1.4467, + "step": 3780 + }, + { + "epoch": 0.12375752027203767, + "grad_norm": 3.4946524840012945, + "learning_rate": 1.996568849052729e-05, + "loss": 1.4381, + "step": 3785 + }, + { + "epoch": 0.12392100444676955, + "grad_norm": 3.6171410017811634, + "learning_rate": 1.9965214527891202e-05, + "loss": 1.5404, + "step": 3790 + }, + { + "epoch": 0.12408448862150144, + "grad_norm": 3.745814100615064, + "learning_rate": 1.99647373198289e-05, + "loss": 1.5863, + "step": 3795 + }, + { + "epoch": 0.12424797279623333, + "grad_norm": 3.6373379950213267, + "learning_rate": 1.9964256866495804e-05, + "loss": 1.6133, + "step": 3800 + }, + { + "epoch": 0.12441145697096521, + "grad_norm": 3.6081305241435304, + "learning_rate": 1.996377316804838e-05, + "loss": 1.5394, + "step": 3805 + }, + { + "epoch": 0.1245749411456971, + "grad_norm": 3.838134720152952, + "learning_rate": 1.996328622464416e-05, + "loss": 1.433, + "step": 3810 + }, + { + "epoch": 0.12473842532042899, + "grad_norm": 3.6709744251180885, + "learning_rate": 1.996279603644173e-05, + "loss": 1.4871, + "step": 3815 + }, + { + "epoch": 0.12490190949516086, + "grad_norm": 3.549806938901301, + "learning_rate": 1.996230260360073e-05, + "loss": 1.5208, + "step": 3820 + }, + { + "epoch": 0.12506539366989275, + "grad_norm": 3.54351373411021, + "learning_rate": 1.996180592628186e-05, + "loss": 1.7538, + "step": 3825 + }, + { + "epoch": 0.12522887784462464, + "grad_norm": 3.799216765314157, + "learning_rate": 1.9961306004646878e-05, + "loss": 1.6339, + "step": 3830 + }, + { + "epoch": 0.12539236201935652, + "grad_norm": 3.6488158665634014, + "learning_rate": 1.996080283885859e-05, + "loss": 1.4869, + "step": 3835 + }, + { + "epoch": 0.1255558461940884, + "grad_norm": 3.7165376819263494, + "learning_rate": 1.9960296429080875e-05, + "loss": 1.5398, + "step": 3840 + }, + { + "epoch": 0.1257193303688203, + "grad_norm": 3.3338683014786112, + "learning_rate": 1.9959786775478646e-05, + "loss": 1.5302, + "step": 3845 + }, + { + "epoch": 0.12588281454355219, + "grad_norm": 3.699406113493008, + "learning_rate": 1.9959273878217895e-05, + "loss": 1.5692, + "step": 3850 + }, + { + "epoch": 0.12604629871828407, + "grad_norm": 3.391594308450122, + "learning_rate": 1.9958757737465656e-05, + "loss": 1.599, + "step": 3855 + }, + { + "epoch": 0.12620978289301596, + "grad_norm": 3.8387517358562397, + "learning_rate": 1.9958238353390022e-05, + "loss": 1.6003, + "step": 3860 + }, + { + "epoch": 0.12637326706774785, + "grad_norm": 3.456231257697692, + "learning_rate": 1.9957715726160144e-05, + "loss": 1.4746, + "step": 3865 + }, + { + "epoch": 0.12653675124247973, + "grad_norm": 3.677537945717539, + "learning_rate": 1.995718985594623e-05, + "loss": 1.5793, + "step": 3870 + }, + { + "epoch": 0.12670023541721162, + "grad_norm": 3.4809352245497536, + "learning_rate": 1.9956660742919546e-05, + "loss": 1.4969, + "step": 3875 + }, + { + "epoch": 0.1268637195919435, + "grad_norm": 3.3902597266928907, + "learning_rate": 1.9956128387252405e-05, + "loss": 1.4872, + "step": 3880 + }, + { + "epoch": 0.1270272037666754, + "grad_norm": 3.5345587011741095, + "learning_rate": 1.9955592789118188e-05, + "loss": 1.5256, + "step": 3885 + }, + { + "epoch": 0.12719068794140728, + "grad_norm": 3.6280255802277925, + "learning_rate": 1.9955053948691317e-05, + "loss": 1.6512, + "step": 3890 + }, + { + "epoch": 0.12735417211613917, + "grad_norm": 3.3040212573884995, + "learning_rate": 1.9954511866147293e-05, + "loss": 1.4992, + "step": 3895 + }, + { + "epoch": 0.12751765629087106, + "grad_norm": 3.3695456063528058, + "learning_rate": 1.9953966541662648e-05, + "loss": 1.5273, + "step": 3900 + }, + { + "epoch": 0.12768114046560292, + "grad_norm": 3.7784376666579345, + "learning_rate": 1.9953417975414988e-05, + "loss": 1.6121, + "step": 3905 + }, + { + "epoch": 0.1278446246403348, + "grad_norm": 3.718021428293509, + "learning_rate": 1.9952866167582962e-05, + "loss": 1.5095, + "step": 3910 + }, + { + "epoch": 0.1280081088150667, + "grad_norm": 3.7238448805039437, + "learning_rate": 1.995231111834628e-05, + "loss": 1.5463, + "step": 3915 + }, + { + "epoch": 0.12817159298979858, + "grad_norm": 3.1902398367017017, + "learning_rate": 1.995175282788571e-05, + "loss": 1.444, + "step": 3920 + }, + { + "epoch": 0.12833507716453046, + "grad_norm": 3.5468859391025616, + "learning_rate": 1.9951191296383078e-05, + "loss": 1.5517, + "step": 3925 + }, + { + "epoch": 0.12849856133926235, + "grad_norm": 3.8649593185052438, + "learning_rate": 1.9950626524021256e-05, + "loss": 1.528, + "step": 3930 + }, + { + "epoch": 0.12866204551399424, + "grad_norm": 3.667315469267436, + "learning_rate": 1.9950058510984175e-05, + "loss": 1.602, + "step": 3935 + }, + { + "epoch": 0.12882552968872613, + "grad_norm": 3.642493448620478, + "learning_rate": 1.9949487257456827e-05, + "loss": 1.6155, + "step": 3940 + }, + { + "epoch": 0.128989013863458, + "grad_norm": 4.101106025290424, + "learning_rate": 1.994891276362525e-05, + "loss": 1.4951, + "step": 3945 + }, + { + "epoch": 0.1291524980381899, + "grad_norm": 3.5609592028013584, + "learning_rate": 1.994833502967655e-05, + "loss": 1.4508, + "step": 3950 + }, + { + "epoch": 0.1293159822129218, + "grad_norm": 3.3928387118094903, + "learning_rate": 1.9947754055798878e-05, + "loss": 1.4617, + "step": 3955 + }, + { + "epoch": 0.12947946638765367, + "grad_norm": 3.575966220434321, + "learning_rate": 1.994716984218144e-05, + "loss": 1.5311, + "step": 3960 + }, + { + "epoch": 0.12964295056238556, + "grad_norm": 3.532166656672411, + "learning_rate": 1.99465823890145e-05, + "loss": 1.5751, + "step": 3965 + }, + { + "epoch": 0.12980643473711745, + "grad_norm": 3.181783023678665, + "learning_rate": 1.9945991696489382e-05, + "loss": 1.4849, + "step": 3970 + }, + { + "epoch": 0.12996991891184934, + "grad_norm": 3.5978252130251613, + "learning_rate": 1.9945397764798454e-05, + "loss": 1.6728, + "step": 3975 + }, + { + "epoch": 0.13013340308658122, + "grad_norm": 3.6880655830281937, + "learning_rate": 1.994480059413515e-05, + "loss": 1.5036, + "step": 3980 + }, + { + "epoch": 0.1302968872613131, + "grad_norm": 3.541018254387831, + "learning_rate": 1.9944200184693953e-05, + "loss": 1.5936, + "step": 3985 + }, + { + "epoch": 0.130460371436045, + "grad_norm": 3.621727171786409, + "learning_rate": 1.99435965366704e-05, + "loss": 1.5549, + "step": 3990 + }, + { + "epoch": 0.13062385561077688, + "grad_norm": 3.4840297325662672, + "learning_rate": 1.9942989650261083e-05, + "loss": 1.5714, + "step": 3995 + }, + { + "epoch": 0.13078733978550877, + "grad_norm": 3.559329829015446, + "learning_rate": 1.9942379525663656e-05, + "loss": 1.5119, + "step": 4000 + }, + { + "epoch": 0.13095082396024066, + "grad_norm": 3.693059807567086, + "learning_rate": 1.9941766163076814e-05, + "loss": 1.601, + "step": 4005 + }, + { + "epoch": 0.13111430813497255, + "grad_norm": 3.6749821800427225, + "learning_rate": 1.994114956270032e-05, + "loss": 1.5711, + "step": 4010 + }, + { + "epoch": 0.13127779230970443, + "grad_norm": 3.6776942546520837, + "learning_rate": 1.9940529724734982e-05, + "loss": 1.5937, + "step": 4015 + }, + { + "epoch": 0.13144127648443632, + "grad_norm": 3.689893791695127, + "learning_rate": 1.9939906649382666e-05, + "loss": 1.5409, + "step": 4020 + }, + { + "epoch": 0.13160476065916818, + "grad_norm": 3.809688060580903, + "learning_rate": 1.99392803368463e-05, + "loss": 1.5449, + "step": 4025 + }, + { + "epoch": 0.13176824483390007, + "grad_norm": 3.555279448671788, + "learning_rate": 1.9938650787329846e-05, + "loss": 1.4702, + "step": 4030 + }, + { + "epoch": 0.13193172900863195, + "grad_norm": 3.392793456017738, + "learning_rate": 1.993801800103834e-05, + "loss": 1.5828, + "step": 4035 + }, + { + "epoch": 0.13209521318336384, + "grad_norm": 3.7935078857020446, + "learning_rate": 1.9937381978177867e-05, + "loss": 1.6875, + "step": 4040 + }, + { + "epoch": 0.13225869735809573, + "grad_norm": 3.5604286802678744, + "learning_rate": 1.993674271895556e-05, + "loss": 1.646, + "step": 4045 + }, + { + "epoch": 0.13242218153282762, + "grad_norm": 3.584047812083405, + "learning_rate": 1.9936100223579612e-05, + "loss": 1.5561, + "step": 4050 + }, + { + "epoch": 0.1325856657075595, + "grad_norm": 3.506526319784307, + "learning_rate": 1.993545449225927e-05, + "loss": 1.4188, + "step": 4055 + }, + { + "epoch": 0.1327491498822914, + "grad_norm": 3.5880430542480743, + "learning_rate": 1.9934805525204827e-05, + "loss": 1.5597, + "step": 4060 + }, + { + "epoch": 0.13291263405702328, + "grad_norm": 3.6269916868079943, + "learning_rate": 1.9934153322627637e-05, + "loss": 1.5344, + "step": 4065 + }, + { + "epoch": 0.13307611823175516, + "grad_norm": 3.417702217016454, + "learning_rate": 1.9933497884740113e-05, + "loss": 1.4003, + "step": 4070 + }, + { + "epoch": 0.13323960240648705, + "grad_norm": 3.213197193265627, + "learning_rate": 1.993283921175571e-05, + "loss": 1.305, + "step": 4075 + }, + { + "epoch": 0.13340308658121894, + "grad_norm": 3.559598496125269, + "learning_rate": 1.993217730388894e-05, + "loss": 1.504, + "step": 4080 + }, + { + "epoch": 0.13356657075595083, + "grad_norm": 3.527132123390398, + "learning_rate": 1.9931512161355372e-05, + "loss": 1.4704, + "step": 4085 + }, + { + "epoch": 0.1337300549306827, + "grad_norm": 3.27120012095793, + "learning_rate": 1.993084378437163e-05, + "loss": 1.5874, + "step": 4090 + }, + { + "epoch": 0.1338935391054146, + "grad_norm": 3.7405621813711236, + "learning_rate": 1.9930172173155382e-05, + "loss": 1.4789, + "step": 4095 + }, + { + "epoch": 0.1340570232801465, + "grad_norm": 3.3226913294359215, + "learning_rate": 1.992949732792536e-05, + "loss": 1.3976, + "step": 4100 + }, + { + "epoch": 0.13422050745487837, + "grad_norm": 3.7050808130368016, + "learning_rate": 1.992881924890134e-05, + "loss": 1.5697, + "step": 4105 + }, + { + "epoch": 0.13438399162961026, + "grad_norm": 3.7583800792360895, + "learning_rate": 1.9928137936304163e-05, + "loss": 1.5949, + "step": 4110 + }, + { + "epoch": 0.13454747580434215, + "grad_norm": 3.489629413369423, + "learning_rate": 1.992745339035571e-05, + "loss": 1.6381, + "step": 4115 + }, + { + "epoch": 0.13471095997907404, + "grad_norm": 3.5129938086920496, + "learning_rate": 1.992676561127892e-05, + "loss": 1.5415, + "step": 4120 + }, + { + "epoch": 0.13487444415380592, + "grad_norm": 3.4354657804600355, + "learning_rate": 1.992607459929779e-05, + "loss": 1.5983, + "step": 4125 + }, + { + "epoch": 0.1350379283285378, + "grad_norm": 3.419227557619476, + "learning_rate": 1.9925380354637363e-05, + "loss": 1.5724, + "step": 4130 + }, + { + "epoch": 0.1352014125032697, + "grad_norm": 3.5803373254985393, + "learning_rate": 1.9924682877523738e-05, + "loss": 1.5403, + "step": 4135 + }, + { + "epoch": 0.13536489667800156, + "grad_norm": 3.5224357275737908, + "learning_rate": 1.9923982168184065e-05, + "loss": 1.5751, + "step": 4140 + }, + { + "epoch": 0.13552838085273344, + "grad_norm": 3.4791615957421698, + "learning_rate": 1.9923278226846553e-05, + "loss": 1.504, + "step": 4145 + }, + { + "epoch": 0.13569186502746533, + "grad_norm": 3.4655419908818414, + "learning_rate": 1.992257105374045e-05, + "loss": 1.6308, + "step": 4150 + }, + { + "epoch": 0.13585534920219722, + "grad_norm": 3.556476985865519, + "learning_rate": 1.9921860649096073e-05, + "loss": 1.5042, + "step": 4155 + }, + { + "epoch": 0.1360188333769291, + "grad_norm": 3.6112891083273913, + "learning_rate": 1.9921147013144782e-05, + "loss": 1.4986, + "step": 4160 + }, + { + "epoch": 0.136182317551661, + "grad_norm": 3.3791773764254405, + "learning_rate": 1.9920430146118982e-05, + "loss": 1.6133, + "step": 4165 + }, + { + "epoch": 0.13634580172639288, + "grad_norm": 3.5780674793003007, + "learning_rate": 1.991971004825215e-05, + "loss": 1.533, + "step": 4170 + }, + { + "epoch": 0.13650928590112477, + "grad_norm": 3.4972015478048726, + "learning_rate": 1.9918986719778802e-05, + "loss": 1.5674, + "step": 4175 + }, + { + "epoch": 0.13667277007585665, + "grad_norm": 3.3854774287486493, + "learning_rate": 1.9918260160934504e-05, + "loss": 1.5171, + "step": 4180 + }, + { + "epoch": 0.13683625425058854, + "grad_norm": 3.4623232121701077, + "learning_rate": 1.991753037195588e-05, + "loss": 1.5054, + "step": 4185 + }, + { + "epoch": 0.13699973842532043, + "grad_norm": 3.6474255799182997, + "learning_rate": 1.9916797353080606e-05, + "loss": 1.5888, + "step": 4190 + }, + { + "epoch": 0.13716322260005231, + "grad_norm": 3.545278321504593, + "learning_rate": 1.9916061104547407e-05, + "loss": 1.4676, + "step": 4195 + }, + { + "epoch": 0.1373267067747842, + "grad_norm": 3.7985921241573446, + "learning_rate": 1.991532162659606e-05, + "loss": 1.7309, + "step": 4200 + }, + { + "epoch": 0.1374901909495161, + "grad_norm": 3.739282441899159, + "learning_rate": 1.99145789194674e-05, + "loss": 1.5594, + "step": 4205 + }, + { + "epoch": 0.13765367512424798, + "grad_norm": 3.9777269539623306, + "learning_rate": 1.9913832983403303e-05, + "loss": 1.6431, + "step": 4210 + }, + { + "epoch": 0.13781715929897986, + "grad_norm": 3.42151798705863, + "learning_rate": 1.9913083818646706e-05, + "loss": 1.5473, + "step": 4215 + }, + { + "epoch": 0.13798064347371175, + "grad_norm": 3.1538620002537123, + "learning_rate": 1.991233142544159e-05, + "loss": 1.4084, + "step": 4220 + }, + { + "epoch": 0.13814412764844364, + "grad_norm": 3.5546269903196612, + "learning_rate": 1.9911575804032994e-05, + "loss": 1.5219, + "step": 4225 + }, + { + "epoch": 0.13830761182317552, + "grad_norm": 3.490874052563651, + "learning_rate": 1.9910816954667002e-05, + "loss": 1.6169, + "step": 4230 + }, + { + "epoch": 0.1384710959979074, + "grad_norm": 3.405513214502549, + "learning_rate": 1.9910054877590754e-05, + "loss": 1.6122, + "step": 4235 + }, + { + "epoch": 0.1386345801726393, + "grad_norm": 3.429158531556809, + "learning_rate": 1.9909289573052445e-05, + "loss": 1.5405, + "step": 4240 + }, + { + "epoch": 0.1387980643473712, + "grad_norm": 3.1854365416120016, + "learning_rate": 1.9908521041301308e-05, + "loss": 1.4368, + "step": 4245 + }, + { + "epoch": 0.13896154852210307, + "grad_norm": 3.536524447680052, + "learning_rate": 1.990774928258764e-05, + "loss": 1.5295, + "step": 4250 + }, + { + "epoch": 0.13912503269683493, + "grad_norm": 3.5620976065292105, + "learning_rate": 1.9906974297162776e-05, + "loss": 1.6544, + "step": 4255 + }, + { + "epoch": 0.13928851687156682, + "grad_norm": 3.6307468615076233, + "learning_rate": 1.9906196085279123e-05, + "loss": 1.5339, + "step": 4260 + }, + { + "epoch": 0.1394520010462987, + "grad_norm": 3.4217346168827802, + "learning_rate": 1.990541464719012e-05, + "loss": 1.4643, + "step": 4265 + }, + { + "epoch": 0.1396154852210306, + "grad_norm": 3.6756409916094723, + "learning_rate": 1.9904629983150256e-05, + "loss": 1.5241, + "step": 4270 + }, + { + "epoch": 0.13977896939576248, + "grad_norm": 3.454806890830729, + "learning_rate": 1.9903842093415085e-05, + "loss": 1.5416, + "step": 4275 + }, + { + "epoch": 0.13994245357049437, + "grad_norm": 4.418657263130195, + "learning_rate": 1.99030509782412e-05, + "loss": 1.6284, + "step": 4280 + }, + { + "epoch": 0.14010593774522626, + "grad_norm": 3.5548305971201493, + "learning_rate": 1.9902256637886242e-05, + "loss": 1.4278, + "step": 4285 + }, + { + "epoch": 0.14026942191995814, + "grad_norm": 3.407484467312623, + "learning_rate": 1.990145907260892e-05, + "loss": 1.6626, + "step": 4290 + }, + { + "epoch": 0.14043290609469003, + "grad_norm": 3.325570085500193, + "learning_rate": 1.9900658282668978e-05, + "loss": 1.5683, + "step": 4295 + }, + { + "epoch": 0.14059639026942192, + "grad_norm": 3.535479943843812, + "learning_rate": 1.9899854268327207e-05, + "loss": 1.6055, + "step": 4300 + }, + { + "epoch": 0.1407598744441538, + "grad_norm": 3.5314104812826557, + "learning_rate": 1.989904702984546e-05, + "loss": 1.5754, + "step": 4305 + }, + { + "epoch": 0.1409233586188857, + "grad_norm": 3.2919556174467925, + "learning_rate": 1.9898236567486634e-05, + "loss": 1.4702, + "step": 4310 + }, + { + "epoch": 0.14108684279361758, + "grad_norm": 3.6204995289204214, + "learning_rate": 1.9897422881514673e-05, + "loss": 1.3716, + "step": 4315 + }, + { + "epoch": 0.14125032696834947, + "grad_norm": 3.767707441216891, + "learning_rate": 1.9896605972194583e-05, + "loss": 1.5286, + "step": 4320 + }, + { + "epoch": 0.14141381114308135, + "grad_norm": 3.2146928176197145, + "learning_rate": 1.9895785839792404e-05, + "loss": 1.5122, + "step": 4325 + }, + { + "epoch": 0.14157729531781324, + "grad_norm": 3.5119314474611176, + "learning_rate": 1.9894962484575238e-05, + "loss": 1.5085, + "step": 4330 + }, + { + "epoch": 0.14174077949254513, + "grad_norm": 3.3156329421840582, + "learning_rate": 1.9894135906811226e-05, + "loss": 1.5662, + "step": 4335 + }, + { + "epoch": 0.14190426366727701, + "grad_norm": 3.5138027164709853, + "learning_rate": 1.989330610676957e-05, + "loss": 1.5112, + "step": 4340 + }, + { + "epoch": 0.1420677478420089, + "grad_norm": 3.5745766493362625, + "learning_rate": 1.9892473084720512e-05, + "loss": 1.5118, + "step": 4345 + }, + { + "epoch": 0.1422312320167408, + "grad_norm": 3.442991205119345, + "learning_rate": 1.989163684093535e-05, + "loss": 1.5367, + "step": 4350 + }, + { + "epoch": 0.14239471619147268, + "grad_norm": 3.627024885141257, + "learning_rate": 1.989079737568642e-05, + "loss": 1.7136, + "step": 4355 + }, + { + "epoch": 0.14255820036620456, + "grad_norm": 3.5370980437177977, + "learning_rate": 1.9889954689247127e-05, + "loss": 1.6173, + "step": 4360 + }, + { + "epoch": 0.14272168454093645, + "grad_norm": 3.5733878313811878, + "learning_rate": 1.9889108781891903e-05, + "loss": 1.4457, + "step": 4365 + }, + { + "epoch": 0.14288516871566834, + "grad_norm": 3.3135538510066946, + "learning_rate": 1.9888259653896244e-05, + "loss": 1.3996, + "step": 4370 + }, + { + "epoch": 0.1430486528904002, + "grad_norm": 3.5990799042638977, + "learning_rate": 1.988740730553669e-05, + "loss": 1.5203, + "step": 4375 + }, + { + "epoch": 0.14321213706513208, + "grad_norm": 3.4865447755732126, + "learning_rate": 1.9886551737090835e-05, + "loss": 1.4583, + "step": 4380 + }, + { + "epoch": 0.14337562123986397, + "grad_norm": 3.5020553444704174, + "learning_rate": 1.9885692948837305e-05, + "loss": 1.436, + "step": 4385 + }, + { + "epoch": 0.14353910541459586, + "grad_norm": 3.3406836565485705, + "learning_rate": 1.98848309410558e-05, + "loss": 1.4812, + "step": 4390 + }, + { + "epoch": 0.14370258958932774, + "grad_norm": 3.5393063335855053, + "learning_rate": 1.9883965714027042e-05, + "loss": 1.5369, + "step": 4395 + }, + { + "epoch": 0.14386607376405963, + "grad_norm": 3.5767841961943057, + "learning_rate": 1.9883097268032817e-05, + "loss": 1.4698, + "step": 4400 + }, + { + "epoch": 0.14402955793879152, + "grad_norm": 3.5704574439527654, + "learning_rate": 1.9882225603355967e-05, + "loss": 1.5477, + "step": 4405 + }, + { + "epoch": 0.1441930421135234, + "grad_norm": 3.347081852578425, + "learning_rate": 1.9881350720280358e-05, + "loss": 1.4836, + "step": 4410 + }, + { + "epoch": 0.1443565262882553, + "grad_norm": 4.0919649624897065, + "learning_rate": 1.988047261909093e-05, + "loss": 1.6344, + "step": 4415 + }, + { + "epoch": 0.14452001046298718, + "grad_norm": 3.5488466207083462, + "learning_rate": 1.987959130007365e-05, + "loss": 1.7606, + "step": 4420 + }, + { + "epoch": 0.14468349463771907, + "grad_norm": 3.502496671840229, + "learning_rate": 1.9878706763515546e-05, + "loss": 1.5122, + "step": 4425 + }, + { + "epoch": 0.14484697881245095, + "grad_norm": 3.7164307489395227, + "learning_rate": 1.987781900970469e-05, + "loss": 1.4698, + "step": 4430 + }, + { + "epoch": 0.14501046298718284, + "grad_norm": 3.6507358323355663, + "learning_rate": 1.98769280389302e-05, + "loss": 1.7453, + "step": 4435 + }, + { + "epoch": 0.14517394716191473, + "grad_norm": 3.46003414274465, + "learning_rate": 1.9876033851482243e-05, + "loss": 1.6098, + "step": 4440 + }, + { + "epoch": 0.14533743133664662, + "grad_norm": 3.2616415139624184, + "learning_rate": 1.9875136447652034e-05, + "loss": 1.4367, + "step": 4445 + }, + { + "epoch": 0.1455009155113785, + "grad_norm": 3.4670808005374893, + "learning_rate": 1.9874235827731844e-05, + "loss": 1.5218, + "step": 4450 + }, + { + "epoch": 0.1456643996861104, + "grad_norm": 3.3345662879299294, + "learning_rate": 1.987333199201497e-05, + "loss": 1.369, + "step": 4455 + }, + { + "epoch": 0.14582788386084228, + "grad_norm": 3.581403465812215, + "learning_rate": 1.9872424940795773e-05, + "loss": 1.6746, + "step": 4460 + }, + { + "epoch": 0.14599136803557416, + "grad_norm": 3.256000196786297, + "learning_rate": 1.987151467436966e-05, + "loss": 1.4621, + "step": 4465 + }, + { + "epoch": 0.14615485221030605, + "grad_norm": 3.668652969289664, + "learning_rate": 1.9870601193033085e-05, + "loss": 1.5201, + "step": 4470 + }, + { + "epoch": 0.14631833638503794, + "grad_norm": 3.4101621554709105, + "learning_rate": 1.9869684497083538e-05, + "loss": 1.4742, + "step": 4475 + }, + { + "epoch": 0.14648182055976983, + "grad_norm": 3.386798041245045, + "learning_rate": 1.986876458681957e-05, + "loss": 1.591, + "step": 4480 + }, + { + "epoch": 0.1466453047345017, + "grad_norm": 3.3342859691209994, + "learning_rate": 1.9867841462540774e-05, + "loss": 1.5054, + "step": 4485 + }, + { + "epoch": 0.14680878890923357, + "grad_norm": 3.5795815722428643, + "learning_rate": 1.986691512454779e-05, + "loss": 1.6714, + "step": 4490 + }, + { + "epoch": 0.14697227308396546, + "grad_norm": 3.442284384951059, + "learning_rate": 1.9865985573142297e-05, + "loss": 1.4586, + "step": 4495 + }, + { + "epoch": 0.14713575725869735, + "grad_norm": 3.371927397932696, + "learning_rate": 1.9865052808627036e-05, + "loss": 1.4898, + "step": 4500 + }, + { + "epoch": 0.14729924143342923, + "grad_norm": 3.462591174452971, + "learning_rate": 1.9864116831305777e-05, + "loss": 1.541, + "step": 4505 + }, + { + "epoch": 0.14746272560816112, + "grad_norm": 3.720781176289002, + "learning_rate": 1.986317764148335e-05, + "loss": 1.4904, + "step": 4510 + }, + { + "epoch": 0.147626209782893, + "grad_norm": 3.6490829688705317, + "learning_rate": 1.9862235239465627e-05, + "loss": 1.5673, + "step": 4515 + }, + { + "epoch": 0.1477896939576249, + "grad_norm": 3.598186951530533, + "learning_rate": 1.986128962555952e-05, + "loss": 1.4826, + "step": 4520 + }, + { + "epoch": 0.14795317813235678, + "grad_norm": 3.427119542958949, + "learning_rate": 1.9860340800072993e-05, + "loss": 1.4079, + "step": 4525 + }, + { + "epoch": 0.14811666230708867, + "grad_norm": 4.039224516324664, + "learning_rate": 1.9859388763315062e-05, + "loss": 1.524, + "step": 4530 + }, + { + "epoch": 0.14828014648182056, + "grad_norm": 3.3658647553525656, + "learning_rate": 1.9858433515595775e-05, + "loss": 1.5441, + "step": 4535 + }, + { + "epoch": 0.14844363065655244, + "grad_norm": 3.5830212469251776, + "learning_rate": 1.985747505722624e-05, + "loss": 1.6811, + "step": 4540 + }, + { + "epoch": 0.14860711483128433, + "grad_norm": 3.6756218393508866, + "learning_rate": 1.985651338851859e-05, + "loss": 1.5152, + "step": 4545 + }, + { + "epoch": 0.14877059900601622, + "grad_norm": 3.351653893423466, + "learning_rate": 1.9855548509786032e-05, + "loss": 1.5938, + "step": 4550 + }, + { + "epoch": 0.1489340831807481, + "grad_norm": 3.383703082139735, + "learning_rate": 1.9854580421342797e-05, + "loss": 1.5426, + "step": 4555 + }, + { + "epoch": 0.14909756735548, + "grad_norm": 3.575600522970575, + "learning_rate": 1.9853609123504167e-05, + "loss": 1.6378, + "step": 4560 + }, + { + "epoch": 0.14926105153021188, + "grad_norm": 3.4363601403397017, + "learning_rate": 1.985263461658647e-05, + "loss": 1.5485, + "step": 4565 + }, + { + "epoch": 0.14942453570494377, + "grad_norm": 3.49340460415348, + "learning_rate": 1.985165690090708e-05, + "loss": 1.5195, + "step": 4570 + }, + { + "epoch": 0.14958801987967565, + "grad_norm": 3.4758118162112823, + "learning_rate": 1.9850675976784417e-05, + "loss": 1.5847, + "step": 4575 + }, + { + "epoch": 0.14975150405440754, + "grad_norm": 3.3307507087146204, + "learning_rate": 1.9849691844537937e-05, + "loss": 1.274, + "step": 4580 + }, + { + "epoch": 0.14991498822913943, + "grad_norm": 3.5233140310953264, + "learning_rate": 1.9848704504488154e-05, + "loss": 1.5111, + "step": 4585 + }, + { + "epoch": 0.15007847240387132, + "grad_norm": 3.3126719889944325, + "learning_rate": 1.984771395695662e-05, + "loss": 1.5646, + "step": 4590 + }, + { + "epoch": 0.1502419565786032, + "grad_norm": 3.383313138209485, + "learning_rate": 1.984672020226593e-05, + "loss": 1.5558, + "step": 4595 + }, + { + "epoch": 0.1504054407533351, + "grad_norm": 3.618246765261055, + "learning_rate": 1.9845723240739728e-05, + "loss": 1.6071, + "step": 4600 + }, + { + "epoch": 0.15056892492806695, + "grad_norm": 3.5070152240147916, + "learning_rate": 1.9844723072702697e-05, + "loss": 1.5222, + "step": 4605 + }, + { + "epoch": 0.15073240910279884, + "grad_norm": 3.6841281131789434, + "learning_rate": 1.984371969848057e-05, + "loss": 1.5728, + "step": 4610 + }, + { + "epoch": 0.15089589327753072, + "grad_norm": 3.5123082109576558, + "learning_rate": 1.984271311840012e-05, + "loss": 1.5631, + "step": 4615 + }, + { + "epoch": 0.1510593774522626, + "grad_norm": 3.4059240337046894, + "learning_rate": 1.9841703332789164e-05, + "loss": 1.4505, + "step": 4620 + }, + { + "epoch": 0.1512228616269945, + "grad_norm": 3.366189649927874, + "learning_rate": 1.9840690341976566e-05, + "loss": 1.6017, + "step": 4625 + }, + { + "epoch": 0.15138634580172639, + "grad_norm": 3.19233633245688, + "learning_rate": 1.9839674146292234e-05, + "loss": 1.455, + "step": 4630 + }, + { + "epoch": 0.15154982997645827, + "grad_norm": 3.5910230974334323, + "learning_rate": 1.9838654746067116e-05, + "loss": 1.4895, + "step": 4635 + }, + { + "epoch": 0.15171331415119016, + "grad_norm": 3.464953224323487, + "learning_rate": 1.9837632141633205e-05, + "loss": 1.5519, + "step": 4640 + }, + { + "epoch": 0.15187679832592205, + "grad_norm": 3.1420029992599234, + "learning_rate": 1.983660633332354e-05, + "loss": 1.4906, + "step": 4645 + }, + { + "epoch": 0.15204028250065393, + "grad_norm": 3.3756043314030366, + "learning_rate": 1.9835577321472203e-05, + "loss": 1.5922, + "step": 4650 + }, + { + "epoch": 0.15220376667538582, + "grad_norm": 3.5047795047804997, + "learning_rate": 1.9834545106414312e-05, + "loss": 1.4966, + "step": 4655 + }, + { + "epoch": 0.1523672508501177, + "grad_norm": 3.739628321940597, + "learning_rate": 1.983350968848604e-05, + "loss": 1.5754, + "step": 4660 + }, + { + "epoch": 0.1525307350248496, + "grad_norm": 3.46979228293591, + "learning_rate": 1.9832471068024594e-05, + "loss": 1.5908, + "step": 4665 + }, + { + "epoch": 0.15269421919958148, + "grad_norm": 3.389277288018743, + "learning_rate": 1.9831429245368227e-05, + "loss": 1.6126, + "step": 4670 + }, + { + "epoch": 0.15285770337431337, + "grad_norm": 3.768794961136317, + "learning_rate": 1.9830384220856237e-05, + "loss": 1.4742, + "step": 4675 + }, + { + "epoch": 0.15302118754904526, + "grad_norm": 3.3929066727886976, + "learning_rate": 1.9829335994828964e-05, + "loss": 1.4801, + "step": 4680 + }, + { + "epoch": 0.15318467172377714, + "grad_norm": 3.3880101605482986, + "learning_rate": 1.982828456762779e-05, + "loss": 1.5188, + "step": 4685 + }, + { + "epoch": 0.15334815589850903, + "grad_norm": 3.486617575912131, + "learning_rate": 1.9827229939595134e-05, + "loss": 1.4968, + "step": 4690 + }, + { + "epoch": 0.15351164007324092, + "grad_norm": 3.8445589130676314, + "learning_rate": 1.9826172111074466e-05, + "loss": 1.5834, + "step": 4695 + }, + { + "epoch": 0.1536751242479728, + "grad_norm": 3.5454471007128214, + "learning_rate": 1.9825111082410293e-05, + "loss": 1.5412, + "step": 4700 + }, + { + "epoch": 0.1538386084227047, + "grad_norm": 3.3177437420062996, + "learning_rate": 1.982404685394817e-05, + "loss": 1.5027, + "step": 4705 + }, + { + "epoch": 0.15400209259743658, + "grad_norm": 3.5467632345939544, + "learning_rate": 1.9822979426034688e-05, + "loss": 1.6243, + "step": 4710 + }, + { + "epoch": 0.15416557677216847, + "grad_norm": 3.354002534223789, + "learning_rate": 1.9821908799017478e-05, + "loss": 1.5777, + "step": 4715 + }, + { + "epoch": 0.15432906094690035, + "grad_norm": 3.57429962571145, + "learning_rate": 1.9820834973245225e-05, + "loss": 1.5408, + "step": 4720 + }, + { + "epoch": 0.1544925451216322, + "grad_norm": 3.452456142964739, + "learning_rate": 1.981975794906764e-05, + "loss": 1.6122, + "step": 4725 + }, + { + "epoch": 0.1546560292963641, + "grad_norm": 3.422832618859955, + "learning_rate": 1.9818677726835487e-05, + "loss": 1.5086, + "step": 4730 + }, + { + "epoch": 0.154819513471096, + "grad_norm": 3.512510027830078, + "learning_rate": 1.981759430690057e-05, + "loss": 1.3938, + "step": 4735 + }, + { + "epoch": 0.15498299764582787, + "grad_norm": 3.4868421272304864, + "learning_rate": 1.9816507689615728e-05, + "loss": 1.6021, + "step": 4740 + }, + { + "epoch": 0.15514648182055976, + "grad_norm": 3.6931350931971734, + "learning_rate": 1.981541787533485e-05, + "loss": 1.4796, + "step": 4745 + }, + { + "epoch": 0.15530996599529165, + "grad_norm": 3.1688634746065047, + "learning_rate": 1.9814324864412854e-05, + "loss": 1.4974, + "step": 4750 + }, + { + "epoch": 0.15547345017002354, + "grad_norm": 3.5044906926506814, + "learning_rate": 1.981322865720572e-05, + "loss": 1.5649, + "step": 4755 + }, + { + "epoch": 0.15563693434475542, + "grad_norm": 3.43947351202443, + "learning_rate": 1.9812129254070442e-05, + "loss": 1.5984, + "step": 4760 + }, + { + "epoch": 0.1558004185194873, + "grad_norm": 3.627686903840642, + "learning_rate": 1.9811026655365077e-05, + "loss": 1.6551, + "step": 4765 + }, + { + "epoch": 0.1559639026942192, + "grad_norm": 3.540144580955991, + "learning_rate": 1.9809920861448712e-05, + "loss": 1.582, + "step": 4770 + }, + { + "epoch": 0.15612738686895108, + "grad_norm": 3.574470142381698, + "learning_rate": 1.980881187268148e-05, + "loss": 1.4941, + "step": 4775 + }, + { + "epoch": 0.15629087104368297, + "grad_norm": 3.3866211188399062, + "learning_rate": 1.9807699689424545e-05, + "loss": 1.4772, + "step": 4780 + }, + { + "epoch": 0.15645435521841486, + "grad_norm": 3.5388338271521835, + "learning_rate": 1.980658431204012e-05, + "loss": 1.4848, + "step": 4785 + }, + { + "epoch": 0.15661783939314675, + "grad_norm": 3.386502181383284, + "learning_rate": 1.9805465740891462e-05, + "loss": 1.6023, + "step": 4790 + }, + { + "epoch": 0.15678132356787863, + "grad_norm": 3.465044285859929, + "learning_rate": 1.9804343976342857e-05, + "loss": 1.5831, + "step": 4795 + }, + { + "epoch": 0.15694480774261052, + "grad_norm": 3.5749241928868356, + "learning_rate": 1.9803219018759634e-05, + "loss": 1.5658, + "step": 4800 + }, + { + "epoch": 0.1571082919173424, + "grad_norm": 3.3172398632376465, + "learning_rate": 1.980209086850817e-05, + "loss": 1.5746, + "step": 4805 + }, + { + "epoch": 0.1572717760920743, + "grad_norm": 3.4006197026879685, + "learning_rate": 1.9800959525955873e-05, + "loss": 1.5364, + "step": 4810 + }, + { + "epoch": 0.15743526026680618, + "grad_norm": 3.321663211059867, + "learning_rate": 1.9799824991471193e-05, + "loss": 1.5763, + "step": 4815 + }, + { + "epoch": 0.15759874444153807, + "grad_norm": 3.1592314140218045, + "learning_rate": 1.979868726542362e-05, + "loss": 1.3795, + "step": 4820 + }, + { + "epoch": 0.15776222861626996, + "grad_norm": 3.5709970887325064, + "learning_rate": 1.9797546348183685e-05, + "loss": 1.5455, + "step": 4825 + }, + { + "epoch": 0.15792571279100184, + "grad_norm": 3.549217644209813, + "learning_rate": 1.9796402240122956e-05, + "loss": 1.5114, + "step": 4830 + }, + { + "epoch": 0.15808919696573373, + "grad_norm": 3.7305882801510077, + "learning_rate": 1.9795254941614043e-05, + "loss": 1.4364, + "step": 4835 + }, + { + "epoch": 0.1582526811404656, + "grad_norm": 3.5760223557106117, + "learning_rate": 1.9794104453030586e-05, + "loss": 1.5762, + "step": 4840 + }, + { + "epoch": 0.15841616531519748, + "grad_norm": 3.5214388374346965, + "learning_rate": 1.9792950774747276e-05, + "loss": 1.564, + "step": 4845 + }, + { + "epoch": 0.15857964948992936, + "grad_norm": 3.6513510158246447, + "learning_rate": 1.9791793907139842e-05, + "loss": 1.6305, + "step": 4850 + }, + { + "epoch": 0.15874313366466125, + "grad_norm": 3.37019981942735, + "learning_rate": 1.979063385058504e-05, + "loss": 1.5877, + "step": 4855 + }, + { + "epoch": 0.15890661783939314, + "grad_norm": 3.2995628723086607, + "learning_rate": 1.9789470605460678e-05, + "loss": 1.5696, + "step": 4860 + }, + { + "epoch": 0.15907010201412503, + "grad_norm": 3.690436548531314, + "learning_rate": 1.978830417214559e-05, + "loss": 1.6143, + "step": 4865 + }, + { + "epoch": 0.1592335861888569, + "grad_norm": 3.4943289212615745, + "learning_rate": 1.978713455101966e-05, + "loss": 1.5347, + "step": 4870 + }, + { + "epoch": 0.1593970703635888, + "grad_norm": 3.405450172415782, + "learning_rate": 1.9785961742463806e-05, + "loss": 1.5503, + "step": 4875 + }, + { + "epoch": 0.1595605545383207, + "grad_norm": 3.357602039817574, + "learning_rate": 1.9784785746859974e-05, + "loss": 1.4912, + "step": 4880 + }, + { + "epoch": 0.15972403871305257, + "grad_norm": 3.4364484865905416, + "learning_rate": 1.9783606564591167e-05, + "loss": 1.6037, + "step": 4885 + }, + { + "epoch": 0.15988752288778446, + "grad_norm": 3.485298164708196, + "learning_rate": 1.9782424196041413e-05, + "loss": 1.3795, + "step": 4890 + }, + { + "epoch": 0.16005100706251635, + "grad_norm": 3.360058398625516, + "learning_rate": 1.978123864159578e-05, + "loss": 1.5048, + "step": 4895 + }, + { + "epoch": 0.16021449123724824, + "grad_norm": 3.50528816055715, + "learning_rate": 1.978004990164037e-05, + "loss": 1.6346, + "step": 4900 + }, + { + "epoch": 0.16037797541198012, + "grad_norm": 3.409521370284433, + "learning_rate": 1.977885797656233e-05, + "loss": 1.4201, + "step": 4905 + }, + { + "epoch": 0.160541459586712, + "grad_norm": 3.3219947147304176, + "learning_rate": 1.9777662866749843e-05, + "loss": 1.6009, + "step": 4910 + }, + { + "epoch": 0.1607049437614439, + "grad_norm": 3.452030906798088, + "learning_rate": 1.9776464572592125e-05, + "loss": 1.5625, + "step": 4915 + }, + { + "epoch": 0.16086842793617578, + "grad_norm": 3.398360320108218, + "learning_rate": 1.977526309447943e-05, + "loss": 1.5456, + "step": 4920 + }, + { + "epoch": 0.16103191211090767, + "grad_norm": 3.802384260622591, + "learning_rate": 1.977405843280305e-05, + "loss": 1.6608, + "step": 4925 + }, + { + "epoch": 0.16119539628563956, + "grad_norm": 3.293478548193033, + "learning_rate": 1.9772850587955315e-05, + "loss": 1.46, + "step": 4930 + }, + { + "epoch": 0.16135888046037145, + "grad_norm": 3.565421550706904, + "learning_rate": 1.9771639560329587e-05, + "loss": 1.5756, + "step": 4935 + }, + { + "epoch": 0.16152236463510333, + "grad_norm": 3.5379842946323694, + "learning_rate": 1.9770425350320275e-05, + "loss": 1.5165, + "step": 4940 + }, + { + "epoch": 0.16168584880983522, + "grad_norm": 3.623888750579602, + "learning_rate": 1.9769207958322813e-05, + "loss": 1.5842, + "step": 4945 + }, + { + "epoch": 0.1618493329845671, + "grad_norm": 3.2156857409524924, + "learning_rate": 1.9767987384733676e-05, + "loss": 1.4652, + "step": 4950 + }, + { + "epoch": 0.16201281715929897, + "grad_norm": 3.7533480989976296, + "learning_rate": 1.9766763629950374e-05, + "loss": 1.63, + "step": 4955 + }, + { + "epoch": 0.16217630133403085, + "grad_norm": 3.093073117299729, + "learning_rate": 1.9765536694371455e-05, + "loss": 1.3778, + "step": 4960 + }, + { + "epoch": 0.16233978550876274, + "grad_norm": 3.2367943995283897, + "learning_rate": 1.9764306578396505e-05, + "loss": 1.259, + "step": 4965 + }, + { + "epoch": 0.16250326968349463, + "grad_norm": 3.2968839838087036, + "learning_rate": 1.9763073282426135e-05, + "loss": 1.4391, + "step": 4970 + }, + { + "epoch": 0.16266675385822651, + "grad_norm": 3.5578825449437446, + "learning_rate": 1.9761836806862006e-05, + "loss": 1.5765, + "step": 4975 + }, + { + "epoch": 0.1628302380329584, + "grad_norm": 3.2296081988989433, + "learning_rate": 1.976059715210681e-05, + "loss": 1.5047, + "step": 4980 + }, + { + "epoch": 0.1629937222076903, + "grad_norm": 3.4391984679941747, + "learning_rate": 1.975935431856426e-05, + "loss": 1.5219, + "step": 4985 + }, + { + "epoch": 0.16315720638242218, + "grad_norm": 3.3191299532316587, + "learning_rate": 1.975810830663913e-05, + "loss": 1.497, + "step": 4990 + }, + { + "epoch": 0.16332069055715406, + "grad_norm": 3.4559016450286233, + "learning_rate": 1.9756859116737205e-05, + "loss": 1.4641, + "step": 4995 + }, + { + "epoch": 0.16348417473188595, + "grad_norm": 3.3656434746223742, + "learning_rate": 1.9755606749265322e-05, + "loss": 1.5088, + "step": 5000 + }, + { + "epoch": 0.16364765890661784, + "grad_norm": 3.2813118547626825, + "learning_rate": 1.9754351204631347e-05, + "loss": 1.5877, + "step": 5005 + }, + { + "epoch": 0.16381114308134972, + "grad_norm": 3.325327940593229, + "learning_rate": 1.975309248324418e-05, + "loss": 1.4074, + "step": 5010 + }, + { + "epoch": 0.1639746272560816, + "grad_norm": 3.4277470864061605, + "learning_rate": 1.975183058551375e-05, + "loss": 1.4937, + "step": 5015 + }, + { + "epoch": 0.1641381114308135, + "grad_norm": 3.4575918060776574, + "learning_rate": 1.975056551185103e-05, + "loss": 1.4778, + "step": 5020 + }, + { + "epoch": 0.1643015956055454, + "grad_norm": 3.367785106718818, + "learning_rate": 1.974929726266802e-05, + "loss": 1.4157, + "step": 5025 + }, + { + "epoch": 0.16446507978027727, + "grad_norm": 3.267406910273383, + "learning_rate": 1.9748025838377764e-05, + "loss": 1.5254, + "step": 5030 + }, + { + "epoch": 0.16462856395500916, + "grad_norm": 3.3483755710985386, + "learning_rate": 1.974675123939433e-05, + "loss": 1.4138, + "step": 5035 + }, + { + "epoch": 0.16479204812974105, + "grad_norm": 3.397772294371877, + "learning_rate": 1.974547346613283e-05, + "loss": 1.5252, + "step": 5040 + }, + { + "epoch": 0.16495553230447293, + "grad_norm": 3.5902516259498367, + "learning_rate": 1.974419251900939e-05, + "loss": 1.5632, + "step": 5045 + }, + { + "epoch": 0.16511901647920482, + "grad_norm": 3.363950059257026, + "learning_rate": 1.9742908398441197e-05, + "loss": 1.499, + "step": 5050 + }, + { + "epoch": 0.1652825006539367, + "grad_norm": 3.17781862205315, + "learning_rate": 1.9741621104846448e-05, + "loss": 1.3778, + "step": 5055 + }, + { + "epoch": 0.1654459848286686, + "grad_norm": 3.4514582883298384, + "learning_rate": 1.9740330638644387e-05, + "loss": 1.4444, + "step": 5060 + }, + { + "epoch": 0.16560946900340048, + "grad_norm": 3.455383547674156, + "learning_rate": 1.973903700025529e-05, + "loss": 1.4628, + "step": 5065 + }, + { + "epoch": 0.16577295317813237, + "grad_norm": 3.305626508376043, + "learning_rate": 1.9737740190100453e-05, + "loss": 1.4545, + "step": 5070 + }, + { + "epoch": 0.16593643735286423, + "grad_norm": 3.5334426503701617, + "learning_rate": 1.973644020860223e-05, + "loss": 1.5215, + "step": 5075 + }, + { + "epoch": 0.16609992152759612, + "grad_norm": 3.3409711412534024, + "learning_rate": 1.9735137056183978e-05, + "loss": 1.6156, + "step": 5080 + }, + { + "epoch": 0.166263405702328, + "grad_norm": 3.4106478172242576, + "learning_rate": 1.9733830733270113e-05, + "loss": 1.5494, + "step": 5085 + }, + { + "epoch": 0.1664268898770599, + "grad_norm": 3.3608571962526534, + "learning_rate": 1.9732521240286068e-05, + "loss": 1.5563, + "step": 5090 + }, + { + "epoch": 0.16659037405179178, + "grad_norm": 3.2856978094248444, + "learning_rate": 1.9731208577658317e-05, + "loss": 1.533, + "step": 5095 + }, + { + "epoch": 0.16675385822652367, + "grad_norm": 3.2261017322200383, + "learning_rate": 1.9729892745814354e-05, + "loss": 1.4968, + "step": 5100 + }, + { + "epoch": 0.16691734240125555, + "grad_norm": 3.080737815392367, + "learning_rate": 1.972857374518272e-05, + "loss": 1.431, + "step": 5105 + }, + { + "epoch": 0.16708082657598744, + "grad_norm": 3.3442993696390566, + "learning_rate": 1.972725157619298e-05, + "loss": 1.4524, + "step": 5110 + }, + { + "epoch": 0.16724431075071933, + "grad_norm": 3.560933360300554, + "learning_rate": 1.9725926239275726e-05, + "loss": 1.6534, + "step": 5115 + }, + { + "epoch": 0.16740779492545121, + "grad_norm": 3.365653792476806, + "learning_rate": 1.9724597734862593e-05, + "loss": 1.3789, + "step": 5120 + }, + { + "epoch": 0.1675712791001831, + "grad_norm": 3.216472031535469, + "learning_rate": 1.9723266063386245e-05, + "loss": 1.4965, + "step": 5125 + }, + { + "epoch": 0.167734763274915, + "grad_norm": 3.5504849385680126, + "learning_rate": 1.9721931225280372e-05, + "loss": 1.4789, + "step": 5130 + }, + { + "epoch": 0.16789824744964688, + "grad_norm": 3.1169059135286274, + "learning_rate": 1.97205932209797e-05, + "loss": 1.4928, + "step": 5135 + }, + { + "epoch": 0.16806173162437876, + "grad_norm": 3.4243947734993574, + "learning_rate": 1.971925205091998e-05, + "loss": 1.4879, + "step": 5140 + }, + { + "epoch": 0.16822521579911065, + "grad_norm": 3.5041492207424323, + "learning_rate": 1.9717907715537996e-05, + "loss": 1.5322, + "step": 5145 + }, + { + "epoch": 0.16838869997384254, + "grad_norm": 3.460582325470557, + "learning_rate": 1.9716560215271574e-05, + "loss": 1.6177, + "step": 5150 + }, + { + "epoch": 0.16855218414857442, + "grad_norm": 3.4870606458814604, + "learning_rate": 1.971520955055956e-05, + "loss": 1.5955, + "step": 5155 + }, + { + "epoch": 0.1687156683233063, + "grad_norm": 3.6001017934176613, + "learning_rate": 1.971385572184182e-05, + "loss": 1.4866, + "step": 5160 + }, + { + "epoch": 0.1688791524980382, + "grad_norm": 3.2956955334103077, + "learning_rate": 1.971249872955928e-05, + "loss": 1.5155, + "step": 5165 + }, + { + "epoch": 0.16904263667277009, + "grad_norm": 3.7025894519392915, + "learning_rate": 1.9711138574153872e-05, + "loss": 1.5366, + "step": 5170 + }, + { + "epoch": 0.16920612084750197, + "grad_norm": 3.4224672323389154, + "learning_rate": 1.970977525606856e-05, + "loss": 1.5441, + "step": 5175 + }, + { + "epoch": 0.16936960502223386, + "grad_norm": 3.536203415303866, + "learning_rate": 1.9708408775747353e-05, + "loss": 1.5066, + "step": 5180 + }, + { + "epoch": 0.16953308919696575, + "grad_norm": 3.6975472073872107, + "learning_rate": 1.970703913363527e-05, + "loss": 1.5759, + "step": 5185 + }, + { + "epoch": 0.1696965733716976, + "grad_norm": 3.1631226786055917, + "learning_rate": 1.970566633017838e-05, + "loss": 1.475, + "step": 5190 + }, + { + "epoch": 0.1698600575464295, + "grad_norm": 3.212594429250939, + "learning_rate": 1.9704290365823765e-05, + "loss": 1.6193, + "step": 5195 + }, + { + "epoch": 0.17002354172116138, + "grad_norm": 3.253929643439132, + "learning_rate": 1.9702911241019546e-05, + "loss": 1.5322, + "step": 5200 + }, + { + "epoch": 0.17018702589589327, + "grad_norm": 3.3048767121413856, + "learning_rate": 1.9701528956214865e-05, + "loss": 1.4884, + "step": 5205 + }, + { + "epoch": 0.17035051007062516, + "grad_norm": 3.134293864855004, + "learning_rate": 1.9700143511859905e-05, + "loss": 1.6133, + "step": 5210 + }, + { + "epoch": 0.17051399424535704, + "grad_norm": 3.3299652949845213, + "learning_rate": 1.9698754908405867e-05, + "loss": 1.6415, + "step": 5215 + }, + { + "epoch": 0.17067747842008893, + "grad_norm": 3.465958591292971, + "learning_rate": 1.969736314630499e-05, + "loss": 1.5696, + "step": 5220 + }, + { + "epoch": 0.17084096259482082, + "grad_norm": 3.4230362241312706, + "learning_rate": 1.969596822601053e-05, + "loss": 1.5955, + "step": 5225 + }, + { + "epoch": 0.1710044467695527, + "grad_norm": 3.444822917869584, + "learning_rate": 1.9694570147976782e-05, + "loss": 1.5721, + "step": 5230 + }, + { + "epoch": 0.1711679309442846, + "grad_norm": 3.2170943226772657, + "learning_rate": 1.9693168912659063e-05, + "loss": 1.3639, + "step": 5235 + }, + { + "epoch": 0.17133141511901648, + "grad_norm": 3.493119974539674, + "learning_rate": 1.9691764520513725e-05, + "loss": 1.5701, + "step": 5240 + }, + { + "epoch": 0.17149489929374837, + "grad_norm": 3.4736259727979735, + "learning_rate": 1.9690356971998144e-05, + "loss": 1.6456, + "step": 5245 + }, + { + "epoch": 0.17165838346848025, + "grad_norm": 3.113488058125434, + "learning_rate": 1.968894626757072e-05, + "loss": 1.3852, + "step": 5250 + }, + { + "epoch": 0.17182186764321214, + "grad_norm": 3.4209535818213013, + "learning_rate": 1.968753240769089e-05, + "loss": 1.6704, + "step": 5255 + }, + { + "epoch": 0.17198535181794403, + "grad_norm": 3.4780354799842708, + "learning_rate": 1.968611539281911e-05, + "loss": 1.446, + "step": 5260 + }, + { + "epoch": 0.1721488359926759, + "grad_norm": 3.2572153856436277, + "learning_rate": 1.9684695223416867e-05, + "loss": 1.3831, + "step": 5265 + }, + { + "epoch": 0.1723123201674078, + "grad_norm": 3.21581696846367, + "learning_rate": 1.9683271899946678e-05, + "loss": 1.498, + "step": 5270 + }, + { + "epoch": 0.1724758043421397, + "grad_norm": 3.266696231143246, + "learning_rate": 1.968184542287208e-05, + "loss": 1.4667, + "step": 5275 + }, + { + "epoch": 0.17263928851687158, + "grad_norm": 3.488263020029933, + "learning_rate": 1.9680415792657647e-05, + "loss": 1.4608, + "step": 5280 + }, + { + "epoch": 0.17280277269160346, + "grad_norm": 3.4613154707250167, + "learning_rate": 1.9678983009768973e-05, + "loss": 1.6231, + "step": 5285 + }, + { + "epoch": 0.17296625686633535, + "grad_norm": 3.2845155342906374, + "learning_rate": 1.9677547074672678e-05, + "loss": 1.521, + "step": 5290 + }, + { + "epoch": 0.17312974104106724, + "grad_norm": 3.5284372042011642, + "learning_rate": 1.9676107987836412e-05, + "loss": 1.6204, + "step": 5295 + }, + { + "epoch": 0.17329322521579912, + "grad_norm": 3.289936272823523, + "learning_rate": 1.967466574972885e-05, + "loss": 1.5628, + "step": 5300 + }, + { + "epoch": 0.17345670939053098, + "grad_norm": 3.598421690999863, + "learning_rate": 1.9673220360819693e-05, + "loss": 1.4785, + "step": 5305 + }, + { + "epoch": 0.17362019356526287, + "grad_norm": 3.348609433305009, + "learning_rate": 1.9671771821579676e-05, + "loss": 1.4489, + "step": 5310 + }, + { + "epoch": 0.17378367773999476, + "grad_norm": 3.611178531809768, + "learning_rate": 1.967032013248054e-05, + "loss": 1.5513, + "step": 5315 + }, + { + "epoch": 0.17394716191472664, + "grad_norm": 3.2844871186623257, + "learning_rate": 1.9668865293995075e-05, + "loss": 1.5572, + "step": 5320 + }, + { + "epoch": 0.17411064608945853, + "grad_norm": 3.6033306849017697, + "learning_rate": 1.966740730659708e-05, + "loss": 1.7204, + "step": 5325 + }, + { + "epoch": 0.17427413026419042, + "grad_norm": 3.3664915069146764, + "learning_rate": 1.9665946170761388e-05, + "loss": 1.524, + "step": 5330 + }, + { + "epoch": 0.1744376144389223, + "grad_norm": 3.652180817174219, + "learning_rate": 1.966448188696386e-05, + "loss": 1.6264, + "step": 5335 + }, + { + "epoch": 0.1746010986136542, + "grad_norm": 3.41945897888384, + "learning_rate": 1.9663014455681367e-05, + "loss": 1.5489, + "step": 5340 + }, + { + "epoch": 0.17476458278838608, + "grad_norm": 3.2924677338152226, + "learning_rate": 1.9661543877391823e-05, + "loss": 1.478, + "step": 5345 + }, + { + "epoch": 0.17492806696311797, + "grad_norm": 3.663810378752231, + "learning_rate": 1.966007015257416e-05, + "loss": 1.6956, + "step": 5350 + }, + { + "epoch": 0.17509155113784985, + "grad_norm": 3.554841077901108, + "learning_rate": 1.9658593281708328e-05, + "loss": 1.5368, + "step": 5355 + }, + { + "epoch": 0.17525503531258174, + "grad_norm": 3.308364102984334, + "learning_rate": 1.9657113265275314e-05, + "loss": 1.3711, + "step": 5360 + }, + { + "epoch": 0.17541851948731363, + "grad_norm": 3.2182703352271003, + "learning_rate": 1.965563010375712e-05, + "loss": 1.3358, + "step": 5365 + }, + { + "epoch": 0.17558200366204552, + "grad_norm": 3.3934839204115423, + "learning_rate": 1.9654143797636774e-05, + "loss": 1.5167, + "step": 5370 + }, + { + "epoch": 0.1757454878367774, + "grad_norm": 3.3304090832744566, + "learning_rate": 1.9652654347398332e-05, + "loss": 1.436, + "step": 5375 + }, + { + "epoch": 0.1759089720115093, + "grad_norm": 3.3906747540180473, + "learning_rate": 1.9651161753526872e-05, + "loss": 1.5339, + "step": 5380 + }, + { + "epoch": 0.17607245618624118, + "grad_norm": 3.3444418539763623, + "learning_rate": 1.9649666016508492e-05, + "loss": 1.3459, + "step": 5385 + }, + { + "epoch": 0.17623594036097306, + "grad_norm": 3.194394884669045, + "learning_rate": 1.9648167136830318e-05, + "loss": 1.4669, + "step": 5390 + }, + { + "epoch": 0.17639942453570495, + "grad_norm": 3.4244851147020072, + "learning_rate": 1.96466651149805e-05, + "loss": 1.3617, + "step": 5395 + }, + { + "epoch": 0.17656290871043684, + "grad_norm": 3.5629505602673106, + "learning_rate": 1.964515995144821e-05, + "loss": 1.5474, + "step": 5400 + }, + { + "epoch": 0.17672639288516873, + "grad_norm": 2.9765513652703635, + "learning_rate": 1.9643651646723644e-05, + "loss": 1.5406, + "step": 5405 + }, + { + "epoch": 0.1768898770599006, + "grad_norm": 3.505361043360345, + "learning_rate": 1.9642140201298012e-05, + "loss": 1.5948, + "step": 5410 + }, + { + "epoch": 0.1770533612346325, + "grad_norm": 3.7464880612968234, + "learning_rate": 1.9640625615663565e-05, + "loss": 1.5342, + "step": 5415 + }, + { + "epoch": 0.17721684540936436, + "grad_norm": 3.164561693963579, + "learning_rate": 1.9639107890313558e-05, + "loss": 1.5435, + "step": 5420 + }, + { + "epoch": 0.17738032958409625, + "grad_norm": 3.5559663636605294, + "learning_rate": 1.963758702574228e-05, + "loss": 1.4927, + "step": 5425 + }, + { + "epoch": 0.17754381375882813, + "grad_norm": 3.6564093937297115, + "learning_rate": 1.9636063022445047e-05, + "loss": 1.4792, + "step": 5430 + }, + { + "epoch": 0.17770729793356002, + "grad_norm": 3.801749823299907, + "learning_rate": 1.9634535880918174e-05, + "loss": 1.5514, + "step": 5435 + }, + { + "epoch": 0.1778707821082919, + "grad_norm": 3.553265930527905, + "learning_rate": 1.963300560165903e-05, + "loss": 1.6206, + "step": 5440 + }, + { + "epoch": 0.1780342662830238, + "grad_norm": 3.210181421748329, + "learning_rate": 1.9631472185165976e-05, + "loss": 1.5696, + "step": 5445 + }, + { + "epoch": 0.17819775045775568, + "grad_norm": 3.4283862816296033, + "learning_rate": 1.9629935631938416e-05, + "loss": 1.6048, + "step": 5450 + }, + { + "epoch": 0.17836123463248757, + "grad_norm": 3.4735485576115015, + "learning_rate": 1.962839594247677e-05, + "loss": 1.4856, + "step": 5455 + }, + { + "epoch": 0.17852471880721946, + "grad_norm": 3.666980377675915, + "learning_rate": 1.9626853117282464e-05, + "loss": 1.5613, + "step": 5460 + }, + { + "epoch": 0.17868820298195134, + "grad_norm": 3.3130985121230863, + "learning_rate": 1.962530715685797e-05, + "loss": 1.5447, + "step": 5465 + }, + { + "epoch": 0.17885168715668323, + "grad_norm": 3.2825505548679845, + "learning_rate": 1.962375806170677e-05, + "loss": 1.5245, + "step": 5470 + }, + { + "epoch": 0.17901517133141512, + "grad_norm": 3.4225187386390297, + "learning_rate": 1.962220583233336e-05, + "loss": 1.6253, + "step": 5475 + }, + { + "epoch": 0.179178655506147, + "grad_norm": 3.7547126219927343, + "learning_rate": 1.9620650469243266e-05, + "loss": 1.4993, + "step": 5480 + }, + { + "epoch": 0.1793421396808789, + "grad_norm": 3.363955168293951, + "learning_rate": 1.9619091972943035e-05, + "loss": 1.4184, + "step": 5485 + }, + { + "epoch": 0.17950562385561078, + "grad_norm": 3.359915114321863, + "learning_rate": 1.9617530343940224e-05, + "loss": 1.5925, + "step": 5490 + }, + { + "epoch": 0.17966910803034267, + "grad_norm": 3.17935273899016, + "learning_rate": 1.9615965582743422e-05, + "loss": 1.6289, + "step": 5495 + }, + { + "epoch": 0.17983259220507455, + "grad_norm": 3.7110816522998, + "learning_rate": 1.961439768986223e-05, + "loss": 1.3985, + "step": 5500 + }, + { + "epoch": 0.17999607637980644, + "grad_norm": 3.6036702785329795, + "learning_rate": 1.9612826665807278e-05, + "loss": 1.5427, + "step": 5505 + }, + { + "epoch": 0.18015956055453833, + "grad_norm": 3.338977401327388, + "learning_rate": 1.9611252511090206e-05, + "loss": 1.4488, + "step": 5510 + }, + { + "epoch": 0.18032304472927022, + "grad_norm": 3.7494466002736395, + "learning_rate": 1.960967522622368e-05, + "loss": 1.4987, + "step": 5515 + }, + { + "epoch": 0.1804865289040021, + "grad_norm": 3.5178846361886404, + "learning_rate": 1.9608094811721377e-05, + "loss": 1.5827, + "step": 5520 + }, + { + "epoch": 0.180650013078734, + "grad_norm": 3.402726933148792, + "learning_rate": 1.9606511268098006e-05, + "loss": 1.4958, + "step": 5525 + }, + { + "epoch": 0.18081349725346588, + "grad_norm": 3.422569259947157, + "learning_rate": 1.9604924595869286e-05, + "loss": 1.5025, + "step": 5530 + }, + { + "epoch": 0.18097698142819776, + "grad_norm": 3.033656320607296, + "learning_rate": 1.9603334795551957e-05, + "loss": 1.5031, + "step": 5535 + }, + { + "epoch": 0.18114046560292962, + "grad_norm": 3.585113253313421, + "learning_rate": 1.960174186766378e-05, + "loss": 1.5239, + "step": 5540 + }, + { + "epoch": 0.1813039497776615, + "grad_norm": 3.3431197281342735, + "learning_rate": 1.9600145812723527e-05, + "loss": 1.5055, + "step": 5545 + }, + { + "epoch": 0.1814674339523934, + "grad_norm": 3.4829599078549984, + "learning_rate": 1.9598546631251e-05, + "loss": 1.4878, + "step": 5550 + }, + { + "epoch": 0.18163091812712528, + "grad_norm": 3.2646753156963784, + "learning_rate": 1.959694432376701e-05, + "loss": 1.6752, + "step": 5555 + }, + { + "epoch": 0.18179440230185717, + "grad_norm": 3.545343153843873, + "learning_rate": 1.9595338890793393e-05, + "loss": 1.4088, + "step": 5560 + }, + { + "epoch": 0.18195788647658906, + "grad_norm": 3.47652896144843, + "learning_rate": 1.9593730332852995e-05, + "loss": 1.3815, + "step": 5565 + }, + { + "epoch": 0.18212137065132095, + "grad_norm": 3.2768519860856395, + "learning_rate": 1.9592118650469686e-05, + "loss": 1.4764, + "step": 5570 + }, + { + "epoch": 0.18228485482605283, + "grad_norm": 3.2950031667575717, + "learning_rate": 1.959050384416835e-05, + "loss": 1.4664, + "step": 5575 + }, + { + "epoch": 0.18244833900078472, + "grad_norm": 3.4474237248015567, + "learning_rate": 1.958888591447489e-05, + "loss": 1.6221, + "step": 5580 + }, + { + "epoch": 0.1826118231755166, + "grad_norm": 3.3938216867134203, + "learning_rate": 1.9587264861916227e-05, + "loss": 1.555, + "step": 5585 + }, + { + "epoch": 0.1827753073502485, + "grad_norm": 3.4849682133294646, + "learning_rate": 1.95856406870203e-05, + "loss": 1.5706, + "step": 5590 + }, + { + "epoch": 0.18293879152498038, + "grad_norm": 3.42408377849579, + "learning_rate": 1.9584013390316058e-05, + "loss": 1.4708, + "step": 5595 + }, + { + "epoch": 0.18310227569971227, + "grad_norm": 3.4127730191108694, + "learning_rate": 1.9582382972333476e-05, + "loss": 1.4664, + "step": 5600 + }, + { + "epoch": 0.18326575987444416, + "grad_norm": 3.4326262334057698, + "learning_rate": 1.958074943360354e-05, + "loss": 1.5977, + "step": 5605 + }, + { + "epoch": 0.18342924404917604, + "grad_norm": 3.538952372158441, + "learning_rate": 1.9579112774658254e-05, + "loss": 1.5251, + "step": 5610 + }, + { + "epoch": 0.18359272822390793, + "grad_norm": 3.278706649661433, + "learning_rate": 1.9577472996030634e-05, + "loss": 1.5333, + "step": 5615 + }, + { + "epoch": 0.18375621239863982, + "grad_norm": 3.280826917992031, + "learning_rate": 1.9575830098254723e-05, + "loss": 1.5052, + "step": 5620 + }, + { + "epoch": 0.1839196965733717, + "grad_norm": 3.261585979774743, + "learning_rate": 1.9574184081865564e-05, + "loss": 1.4469, + "step": 5625 + }, + { + "epoch": 0.1840831807481036, + "grad_norm": 3.5285935138166193, + "learning_rate": 1.9572534947399232e-05, + "loss": 1.6163, + "step": 5630 + }, + { + "epoch": 0.18424666492283548, + "grad_norm": 3.5924817165670784, + "learning_rate": 1.9570882695392803e-05, + "loss": 1.5866, + "step": 5635 + }, + { + "epoch": 0.18441014909756737, + "grad_norm": 3.305781053626987, + "learning_rate": 1.956922732638438e-05, + "loss": 1.5407, + "step": 5640 + }, + { + "epoch": 0.18457363327229925, + "grad_norm": 3.516416692032187, + "learning_rate": 1.9567568840913075e-05, + "loss": 1.4091, + "step": 5645 + }, + { + "epoch": 0.18473711744703114, + "grad_norm": 3.5563843259085854, + "learning_rate": 1.9565907239519014e-05, + "loss": 1.6349, + "step": 5650 + }, + { + "epoch": 0.184900601621763, + "grad_norm": 3.2213008765553917, + "learning_rate": 1.956424252274334e-05, + "loss": 1.4725, + "step": 5655 + }, + { + "epoch": 0.1850640857964949, + "grad_norm": 3.3861788274652924, + "learning_rate": 1.9562574691128212e-05, + "loss": 1.5494, + "step": 5660 + }, + { + "epoch": 0.18522756997122677, + "grad_norm": 3.3386940605568824, + "learning_rate": 1.9560903745216805e-05, + "loss": 1.4807, + "step": 5665 + }, + { + "epoch": 0.18539105414595866, + "grad_norm": 3.4244477396991773, + "learning_rate": 1.95592296855533e-05, + "loss": 1.5101, + "step": 5670 + }, + { + "epoch": 0.18555453832069055, + "grad_norm": 3.5004220440033174, + "learning_rate": 1.95575525126829e-05, + "loss": 1.423, + "step": 5675 + }, + { + "epoch": 0.18571802249542244, + "grad_norm": 3.4169919061638225, + "learning_rate": 1.9555872227151814e-05, + "loss": 1.5952, + "step": 5680 + }, + { + "epoch": 0.18588150667015432, + "grad_norm": 3.36382376943894, + "learning_rate": 1.9554188829507277e-05, + "loss": 1.5395, + "step": 5685 + }, + { + "epoch": 0.1860449908448862, + "grad_norm": 3.5067660814756962, + "learning_rate": 1.9552502320297525e-05, + "loss": 1.4439, + "step": 5690 + }, + { + "epoch": 0.1862084750196181, + "grad_norm": 3.417216769585762, + "learning_rate": 1.9550812700071818e-05, + "loss": 1.6332, + "step": 5695 + }, + { + "epoch": 0.18637195919434998, + "grad_norm": 3.6970458210667494, + "learning_rate": 1.954911996938042e-05, + "loss": 1.68, + "step": 5700 + }, + { + "epoch": 0.18653544336908187, + "grad_norm": 3.5496994854815442, + "learning_rate": 1.9547424128774613e-05, + "loss": 1.5805, + "step": 5705 + }, + { + "epoch": 0.18669892754381376, + "grad_norm": 3.4496954248807237, + "learning_rate": 1.954572517880669e-05, + "loss": 1.5895, + "step": 5710 + }, + { + "epoch": 0.18686241171854565, + "grad_norm": 3.782952622752031, + "learning_rate": 1.9544023120029957e-05, + "loss": 1.6951, + "step": 5715 + }, + { + "epoch": 0.18702589589327753, + "grad_norm": 3.2332171950998148, + "learning_rate": 1.954231795299873e-05, + "loss": 1.4728, + "step": 5720 + }, + { + "epoch": 0.18718938006800942, + "grad_norm": 3.5023678538267924, + "learning_rate": 1.9540609678268353e-05, + "loss": 1.4951, + "step": 5725 + }, + { + "epoch": 0.1873528642427413, + "grad_norm": 3.5562104318037964, + "learning_rate": 1.9538898296395156e-05, + "loss": 1.6506, + "step": 5730 + }, + { + "epoch": 0.1875163484174732, + "grad_norm": 3.4621528330745326, + "learning_rate": 1.95371838079365e-05, + "loss": 1.5032, + "step": 5735 + }, + { + "epoch": 0.18767983259220508, + "grad_norm": 3.7600644302760053, + "learning_rate": 1.9535466213450744e-05, + "loss": 1.5894, + "step": 5740 + }, + { + "epoch": 0.18784331676693697, + "grad_norm": 3.106378676214599, + "learning_rate": 1.9533745513497277e-05, + "loss": 1.5629, + "step": 5745 + }, + { + "epoch": 0.18800680094166886, + "grad_norm": 3.6039913501909, + "learning_rate": 1.9532021708636484e-05, + "loss": 1.6197, + "step": 5750 + }, + { + "epoch": 0.18817028511640074, + "grad_norm": 3.6137466494407864, + "learning_rate": 1.9530294799429765e-05, + "loss": 1.5887, + "step": 5755 + }, + { + "epoch": 0.18833376929113263, + "grad_norm": 3.4327754401705093, + "learning_rate": 1.9528564786439537e-05, + "loss": 1.5977, + "step": 5760 + }, + { + "epoch": 0.18849725346586452, + "grad_norm": 3.423801292354114, + "learning_rate": 1.9526831670229218e-05, + "loss": 1.5818, + "step": 5765 + }, + { + "epoch": 0.18866073764059638, + "grad_norm": 3.1791020453885768, + "learning_rate": 1.952509545136324e-05, + "loss": 1.5642, + "step": 5770 + }, + { + "epoch": 0.18882422181532826, + "grad_norm": 3.61096197356056, + "learning_rate": 1.952335613040705e-05, + "loss": 1.515, + "step": 5775 + }, + { + "epoch": 0.18898770599006015, + "grad_norm": 3.1336001974497174, + "learning_rate": 1.9521613707927107e-05, + "loss": 1.5647, + "step": 5780 + }, + { + "epoch": 0.18915119016479204, + "grad_norm": 3.452098930084643, + "learning_rate": 1.9519868184490866e-05, + "loss": 1.509, + "step": 5785 + }, + { + "epoch": 0.18931467433952393, + "grad_norm": 3.5642873107523956, + "learning_rate": 1.9518119560666802e-05, + "loss": 1.5852, + "step": 5790 + }, + { + "epoch": 0.1894781585142558, + "grad_norm": 3.1352098499262966, + "learning_rate": 1.951636783702441e-05, + "loss": 1.4668, + "step": 5795 + }, + { + "epoch": 0.1896416426889877, + "grad_norm": 3.170973612210691, + "learning_rate": 1.951461301413417e-05, + "loss": 1.4749, + "step": 5800 + }, + { + "epoch": 0.1898051268637196, + "grad_norm": 3.7127107740922387, + "learning_rate": 1.9512855092567597e-05, + "loss": 1.5674, + "step": 5805 + }, + { + "epoch": 0.18996861103845147, + "grad_norm": 3.3562333082115803, + "learning_rate": 1.951109407289719e-05, + "loss": 1.5396, + "step": 5810 + }, + { + "epoch": 0.19013209521318336, + "grad_norm": 3.4434732196083986, + "learning_rate": 1.950932995569648e-05, + "loss": 1.4896, + "step": 5815 + }, + { + "epoch": 0.19029557938791525, + "grad_norm": 3.5744252890290267, + "learning_rate": 1.950756274153999e-05, + "loss": 1.5862, + "step": 5820 + }, + { + "epoch": 0.19045906356264714, + "grad_norm": 3.435280512754311, + "learning_rate": 1.9505792431003266e-05, + "loss": 1.6591, + "step": 5825 + }, + { + "epoch": 0.19062254773737902, + "grad_norm": 3.2812565385401946, + "learning_rate": 1.950401902466285e-05, + "loss": 1.4891, + "step": 5830 + }, + { + "epoch": 0.1907860319121109, + "grad_norm": 3.3499356145845676, + "learning_rate": 1.9502242523096295e-05, + "loss": 1.5207, + "step": 5835 + }, + { + "epoch": 0.1909495160868428, + "grad_norm": 3.3208141151757884, + "learning_rate": 1.9500462926882167e-05, + "loss": 1.5908, + "step": 5840 + }, + { + "epoch": 0.19111300026157468, + "grad_norm": 3.517428431433562, + "learning_rate": 1.9498680236600038e-05, + "loss": 1.4586, + "step": 5845 + }, + { + "epoch": 0.19127648443630657, + "grad_norm": 3.4094308656834778, + "learning_rate": 1.9496894452830485e-05, + "loss": 1.6155, + "step": 5850 + }, + { + "epoch": 0.19143996861103846, + "grad_norm": 3.2540851383310034, + "learning_rate": 1.949510557615509e-05, + "loss": 1.4726, + "step": 5855 + }, + { + "epoch": 0.19160345278577035, + "grad_norm": 3.4033600224627496, + "learning_rate": 1.9493313607156453e-05, + "loss": 1.3511, + "step": 5860 + }, + { + "epoch": 0.19176693696050223, + "grad_norm": 3.0129311325523322, + "learning_rate": 1.9491518546418177e-05, + "loss": 1.551, + "step": 5865 + }, + { + "epoch": 0.19193042113523412, + "grad_norm": 3.5939387122824775, + "learning_rate": 1.9489720394524856e-05, + "loss": 1.5444, + "step": 5870 + }, + { + "epoch": 0.192093905309966, + "grad_norm": 3.2492479637456375, + "learning_rate": 1.9487919152062115e-05, + "loss": 1.6583, + "step": 5875 + }, + { + "epoch": 0.1922573894846979, + "grad_norm": 3.9156430050533833, + "learning_rate": 1.948611481961657e-05, + "loss": 1.6364, + "step": 5880 + }, + { + "epoch": 0.19242087365942978, + "grad_norm": 3.6800824065203077, + "learning_rate": 1.9484307397775852e-05, + "loss": 1.4511, + "step": 5885 + }, + { + "epoch": 0.19258435783416164, + "grad_norm": 3.491570962086551, + "learning_rate": 1.948249688712859e-05, + "loss": 1.5165, + "step": 5890 + }, + { + "epoch": 0.19274784200889353, + "grad_norm": 3.5841085610919468, + "learning_rate": 1.9480683288264423e-05, + "loss": 1.5563, + "step": 5895 + }, + { + "epoch": 0.19291132618362541, + "grad_norm": 3.3718715626961875, + "learning_rate": 1.9478866601774e-05, + "loss": 1.5589, + "step": 5900 + }, + { + "epoch": 0.1930748103583573, + "grad_norm": 3.2729155078293717, + "learning_rate": 1.9477046828248968e-05, + "loss": 1.5688, + "step": 5905 + }, + { + "epoch": 0.1932382945330892, + "grad_norm": 3.5222844241188356, + "learning_rate": 1.9475223968281982e-05, + "loss": 1.4721, + "step": 5910 + }, + { + "epoch": 0.19340177870782108, + "grad_norm": 3.202146924691056, + "learning_rate": 1.9473398022466702e-05, + "loss": 1.4107, + "step": 5915 + }, + { + "epoch": 0.19356526288255296, + "grad_norm": 3.1689085405647113, + "learning_rate": 1.9471568991397798e-05, + "loss": 1.4833, + "step": 5920 + }, + { + "epoch": 0.19372874705728485, + "grad_norm": 3.31335969037221, + "learning_rate": 1.946973687567094e-05, + "loss": 1.5983, + "step": 5925 + }, + { + "epoch": 0.19389223123201674, + "grad_norm": 3.4633327243505434, + "learning_rate": 1.9467901675882798e-05, + "loss": 1.6045, + "step": 5930 + }, + { + "epoch": 0.19405571540674862, + "grad_norm": 3.4943581634565426, + "learning_rate": 1.9466063392631052e-05, + "loss": 1.4783, + "step": 5935 + }, + { + "epoch": 0.1942191995814805, + "grad_norm": 3.0488374935100535, + "learning_rate": 1.9464222026514394e-05, + "loss": 1.6115, + "step": 5940 + }, + { + "epoch": 0.1943826837562124, + "grad_norm": 3.7750353771932246, + "learning_rate": 1.9462377578132508e-05, + "loss": 1.6086, + "step": 5945 + }, + { + "epoch": 0.19454616793094429, + "grad_norm": 3.48431670697644, + "learning_rate": 1.946053004808608e-05, + "loss": 1.509, + "step": 5950 + }, + { + "epoch": 0.19470965210567617, + "grad_norm": 3.852037481114185, + "learning_rate": 1.9458679436976813e-05, + "loss": 1.6311, + "step": 5955 + }, + { + "epoch": 0.19487313628040806, + "grad_norm": 3.259908317087822, + "learning_rate": 1.9456825745407403e-05, + "loss": 1.5093, + "step": 5960 + }, + { + "epoch": 0.19503662045513995, + "grad_norm": 3.494918627577606, + "learning_rate": 1.9454968973981548e-05, + "loss": 1.5219, + "step": 5965 + }, + { + "epoch": 0.19520010462987183, + "grad_norm": 3.4505143021906, + "learning_rate": 1.9453109123303958e-05, + "loss": 1.5253, + "step": 5970 + }, + { + "epoch": 0.19536358880460372, + "grad_norm": 3.5745121334025503, + "learning_rate": 1.9451246193980337e-05, + "loss": 1.6825, + "step": 5975 + }, + { + "epoch": 0.1955270729793356, + "grad_norm": 3.0139731428163956, + "learning_rate": 1.94493801866174e-05, + "loss": 1.5748, + "step": 5980 + }, + { + "epoch": 0.1956905571540675, + "grad_norm": 3.489517141383533, + "learning_rate": 1.9447511101822854e-05, + "loss": 1.4684, + "step": 5985 + }, + { + "epoch": 0.19585404132879938, + "grad_norm": 3.5545375480389287, + "learning_rate": 1.944563894020542e-05, + "loss": 1.5487, + "step": 5990 + }, + { + "epoch": 0.19601752550353127, + "grad_norm": 3.2021141998476668, + "learning_rate": 1.944376370237481e-05, + "loss": 1.5436, + "step": 5995 + }, + { + "epoch": 0.19618100967826316, + "grad_norm": 3.2676926961319515, + "learning_rate": 1.944188538894175e-05, + "loss": 1.5368, + "step": 6000 + }, + { + "epoch": 0.19634449385299502, + "grad_norm": 3.165413314873927, + "learning_rate": 1.9440004000517955e-05, + "loss": 1.632, + "step": 6005 + }, + { + "epoch": 0.1965079780277269, + "grad_norm": 3.3118217856054284, + "learning_rate": 1.9438119537716144e-05, + "loss": 1.4822, + "step": 6010 + }, + { + "epoch": 0.1966714622024588, + "grad_norm": 3.0801266551708575, + "learning_rate": 1.943623200115005e-05, + "loss": 1.4518, + "step": 6015 + }, + { + "epoch": 0.19683494637719068, + "grad_norm": 3.3386487763709853, + "learning_rate": 1.943434139143439e-05, + "loss": 1.4363, + "step": 6020 + }, + { + "epoch": 0.19699843055192257, + "grad_norm": 3.441231823747326, + "learning_rate": 1.943244770918489e-05, + "loss": 1.5134, + "step": 6025 + }, + { + "epoch": 0.19716191472665445, + "grad_norm": 3.6899354662403057, + "learning_rate": 1.943055095501828e-05, + "loss": 1.4819, + "step": 6030 + }, + { + "epoch": 0.19732539890138634, + "grad_norm": 3.2476445550823794, + "learning_rate": 1.942865112955228e-05, + "loss": 1.6048, + "step": 6035 + }, + { + "epoch": 0.19748888307611823, + "grad_norm": 3.107638206390069, + "learning_rate": 1.9426748233405627e-05, + "loss": 1.4627, + "step": 6040 + }, + { + "epoch": 0.19765236725085011, + "grad_norm": 3.1881228673202537, + "learning_rate": 1.942484226719804e-05, + "loss": 1.5467, + "step": 6045 + }, + { + "epoch": 0.197815851425582, + "grad_norm": 3.567304339843443, + "learning_rate": 1.942293323155024e-05, + "loss": 1.4738, + "step": 6050 + }, + { + "epoch": 0.1979793356003139, + "grad_norm": 3.334558660706754, + "learning_rate": 1.9421021127083965e-05, + "loss": 1.5256, + "step": 6055 + }, + { + "epoch": 0.19814281977504578, + "grad_norm": 3.472676669023638, + "learning_rate": 1.941910595442193e-05, + "loss": 1.5492, + "step": 6060 + }, + { + "epoch": 0.19830630394977766, + "grad_norm": 3.2126236614209085, + "learning_rate": 1.941718771418787e-05, + "loss": 1.4351, + "step": 6065 + }, + { + "epoch": 0.19846978812450955, + "grad_norm": 3.5161161955899676, + "learning_rate": 1.94152664070065e-05, + "loss": 1.4707, + "step": 6070 + }, + { + "epoch": 0.19863327229924144, + "grad_norm": 3.5031155506277236, + "learning_rate": 1.941334203350355e-05, + "loss": 1.4585, + "step": 6075 + }, + { + "epoch": 0.19879675647397332, + "grad_norm": 3.1196516911764376, + "learning_rate": 1.9411414594305736e-05, + "loss": 1.4239, + "step": 6080 + }, + { + "epoch": 0.1989602406487052, + "grad_norm": 3.269921059900091, + "learning_rate": 1.940948409004078e-05, + "loss": 1.7009, + "step": 6085 + }, + { + "epoch": 0.1991237248234371, + "grad_norm": 3.434669796058615, + "learning_rate": 1.94075505213374e-05, + "loss": 1.4733, + "step": 6090 + }, + { + "epoch": 0.19928720899816899, + "grad_norm": 3.205450522767467, + "learning_rate": 1.940561388882531e-05, + "loss": 1.5494, + "step": 6095 + }, + { + "epoch": 0.19945069317290087, + "grad_norm": 3.1200252019884247, + "learning_rate": 1.940367419313523e-05, + "loss": 1.5071, + "step": 6100 + }, + { + "epoch": 0.19961417734763276, + "grad_norm": 3.0925204750839344, + "learning_rate": 1.940173143489886e-05, + "loss": 1.4792, + "step": 6105 + }, + { + "epoch": 0.19977766152236465, + "grad_norm": 3.1236153117705108, + "learning_rate": 1.9399785614748916e-05, + "loss": 1.4718, + "step": 6110 + }, + { + "epoch": 0.19994114569709653, + "grad_norm": 3.263486619756361, + "learning_rate": 1.9397836733319107e-05, + "loss": 1.5473, + "step": 6115 + }, + { + "epoch": 0.2001046298718284, + "grad_norm": 3.588313204164031, + "learning_rate": 1.939588479124413e-05, + "loss": 1.6789, + "step": 6120 + }, + { + "epoch": 0.20026811404656028, + "grad_norm": 3.6033160038520293, + "learning_rate": 1.9393929789159686e-05, + "loss": 1.659, + "step": 6125 + }, + { + "epoch": 0.20043159822129217, + "grad_norm": 3.3504377390842843, + "learning_rate": 1.9391971727702475e-05, + "loss": 1.4802, + "step": 6130 + }, + { + "epoch": 0.20059508239602405, + "grad_norm": 3.4979067109309483, + "learning_rate": 1.9390010607510183e-05, + "loss": 1.5598, + "step": 6135 + }, + { + "epoch": 0.20075856657075594, + "grad_norm": 3.69675773823839, + "learning_rate": 1.9388046429221505e-05, + "loss": 1.6693, + "step": 6140 + }, + { + "epoch": 0.20092205074548783, + "grad_norm": 3.4610268685406678, + "learning_rate": 1.938607919347612e-05, + "loss": 1.6358, + "step": 6145 + }, + { + "epoch": 0.20108553492021972, + "grad_norm": 3.341575328197815, + "learning_rate": 1.9384108900914717e-05, + "loss": 1.5639, + "step": 6150 + }, + { + "epoch": 0.2012490190949516, + "grad_norm": 3.524862454702847, + "learning_rate": 1.9382135552178963e-05, + "loss": 1.4689, + "step": 6155 + }, + { + "epoch": 0.2014125032696835, + "grad_norm": 3.210759968411077, + "learning_rate": 1.938015914791154e-05, + "loss": 1.49, + "step": 6160 + }, + { + "epoch": 0.20157598744441538, + "grad_norm": 3.4439633380053634, + "learning_rate": 1.93781796887561e-05, + "loss": 1.5072, + "step": 6165 + }, + { + "epoch": 0.20173947161914726, + "grad_norm": 3.2096161133292593, + "learning_rate": 1.9376197175357315e-05, + "loss": 1.5157, + "step": 6170 + }, + { + "epoch": 0.20190295579387915, + "grad_norm": 3.7282208113077497, + "learning_rate": 1.9374211608360837e-05, + "loss": 1.5234, + "step": 6175 + }, + { + "epoch": 0.20206643996861104, + "grad_norm": 3.3328426046278246, + "learning_rate": 1.9372222988413315e-05, + "loss": 1.6498, + "step": 6180 + }, + { + "epoch": 0.20222992414334293, + "grad_norm": 3.3983733337284816, + "learning_rate": 1.93702313161624e-05, + "loss": 1.3778, + "step": 6185 + }, + { + "epoch": 0.2023934083180748, + "grad_norm": 3.3510858867559663, + "learning_rate": 1.936823659225673e-05, + "loss": 1.5743, + "step": 6190 + }, + { + "epoch": 0.2025568924928067, + "grad_norm": 3.2168159607290074, + "learning_rate": 1.936623881734593e-05, + "loss": 1.5609, + "step": 6195 + }, + { + "epoch": 0.2027203766675386, + "grad_norm": 3.2527753618463655, + "learning_rate": 1.936423799208063e-05, + "loss": 1.4005, + "step": 6200 + }, + { + "epoch": 0.20288386084227047, + "grad_norm": 3.422461217113889, + "learning_rate": 1.9362234117112455e-05, + "loss": 1.6315, + "step": 6205 + }, + { + "epoch": 0.20304734501700236, + "grad_norm": 3.333524469811684, + "learning_rate": 1.9360227193094013e-05, + "loss": 1.5116, + "step": 6210 + }, + { + "epoch": 0.20321082919173425, + "grad_norm": 3.6012998011340214, + "learning_rate": 1.935821722067891e-05, + "loss": 1.414, + "step": 6215 + }, + { + "epoch": 0.20337431336646614, + "grad_norm": 3.603673358018272, + "learning_rate": 1.9356204200521745e-05, + "loss": 1.4933, + "step": 6220 + }, + { + "epoch": 0.20353779754119802, + "grad_norm": 3.414143342315997, + "learning_rate": 1.9354188133278112e-05, + "loss": 1.5998, + "step": 6225 + }, + { + "epoch": 0.2037012817159299, + "grad_norm": 3.248077608521593, + "learning_rate": 1.935216901960459e-05, + "loss": 1.4901, + "step": 6230 + }, + { + "epoch": 0.2038647658906618, + "grad_norm": 3.2223805019178364, + "learning_rate": 1.935014686015876e-05, + "loss": 1.5152, + "step": 6235 + }, + { + "epoch": 0.20402825006539366, + "grad_norm": 3.0826635366841297, + "learning_rate": 1.9348121655599187e-05, + "loss": 1.5135, + "step": 6240 + }, + { + "epoch": 0.20419173424012554, + "grad_norm": 3.2834118274660415, + "learning_rate": 1.9346093406585432e-05, + "loss": 1.5228, + "step": 6245 + }, + { + "epoch": 0.20435521841485743, + "grad_norm": 3.542985314989262, + "learning_rate": 1.9344062113778042e-05, + "loss": 1.4437, + "step": 6250 + }, + { + "epoch": 0.20451870258958932, + "grad_norm": 3.7127595276814906, + "learning_rate": 1.9342027777838564e-05, + "loss": 1.7456, + "step": 6255 + }, + { + "epoch": 0.2046821867643212, + "grad_norm": 3.594748034176828, + "learning_rate": 1.933999039942953e-05, + "loss": 1.6289, + "step": 6260 + }, + { + "epoch": 0.2048456709390531, + "grad_norm": 3.440886414703099, + "learning_rate": 1.9337949979214462e-05, + "loss": 1.497, + "step": 6265 + }, + { + "epoch": 0.20500915511378498, + "grad_norm": 3.504981150914722, + "learning_rate": 1.9335906517857877e-05, + "loss": 1.4796, + "step": 6270 + }, + { + "epoch": 0.20517263928851687, + "grad_norm": 3.0384776692701316, + "learning_rate": 1.9333860016025286e-05, + "loss": 1.3797, + "step": 6275 + }, + { + "epoch": 0.20533612346324875, + "grad_norm": 3.3145373653134294, + "learning_rate": 1.933181047438317e-05, + "loss": 1.5038, + "step": 6280 + }, + { + "epoch": 0.20549960763798064, + "grad_norm": 3.4243552737700114, + "learning_rate": 1.932975789359903e-05, + "loss": 1.5437, + "step": 6285 + }, + { + "epoch": 0.20566309181271253, + "grad_norm": 3.37133373368105, + "learning_rate": 1.9327702274341327e-05, + "loss": 1.6323, + "step": 6290 + }, + { + "epoch": 0.20582657598744442, + "grad_norm": 3.5103678844885273, + "learning_rate": 1.9325643617279537e-05, + "loss": 1.6114, + "step": 6295 + }, + { + "epoch": 0.2059900601621763, + "grad_norm": 3.712513276183793, + "learning_rate": 1.9323581923084113e-05, + "loss": 1.5774, + "step": 6300 + }, + { + "epoch": 0.2061535443369082, + "grad_norm": 3.4269595838516373, + "learning_rate": 1.932151719242649e-05, + "loss": 1.6439, + "step": 6305 + }, + { + "epoch": 0.20631702851164008, + "grad_norm": 3.52953145190224, + "learning_rate": 1.9319449425979107e-05, + "loss": 1.5779, + "step": 6310 + }, + { + "epoch": 0.20648051268637196, + "grad_norm": 3.4170177795173826, + "learning_rate": 1.9317378624415388e-05, + "loss": 1.4834, + "step": 6315 + }, + { + "epoch": 0.20664399686110385, + "grad_norm": 3.2091906410666113, + "learning_rate": 1.931530478840973e-05, + "loss": 1.5133, + "step": 6320 + }, + { + "epoch": 0.20680748103583574, + "grad_norm": 3.4562197837566595, + "learning_rate": 1.9313227918637545e-05, + "loss": 1.4899, + "step": 6325 + }, + { + "epoch": 0.20697096521056763, + "grad_norm": 3.2674407702698742, + "learning_rate": 1.9311148015775213e-05, + "loss": 1.4806, + "step": 6330 + }, + { + "epoch": 0.2071344493852995, + "grad_norm": 3.5049833898717595, + "learning_rate": 1.9309065080500106e-05, + "loss": 1.5535, + "step": 6335 + }, + { + "epoch": 0.2072979335600314, + "grad_norm": 3.1237366584665933, + "learning_rate": 1.930697911349058e-05, + "loss": 1.5143, + "step": 6340 + }, + { + "epoch": 0.2074614177347633, + "grad_norm": 3.2594575783330617, + "learning_rate": 1.930489011542599e-05, + "loss": 1.5955, + "step": 6345 + }, + { + "epoch": 0.20762490190949517, + "grad_norm": 3.4018028883428983, + "learning_rate": 1.9302798086986674e-05, + "loss": 1.5315, + "step": 6350 + }, + { + "epoch": 0.20778838608422703, + "grad_norm": 3.3937895152163913, + "learning_rate": 1.9300703028853948e-05, + "loss": 1.5731, + "step": 6355 + }, + { + "epoch": 0.20795187025895892, + "grad_norm": 3.310056942443796, + "learning_rate": 1.929860494171013e-05, + "loss": 1.4171, + "step": 6360 + }, + { + "epoch": 0.2081153544336908, + "grad_norm": 3.1856302839549224, + "learning_rate": 1.92965038262385e-05, + "loss": 1.3695, + "step": 6365 + }, + { + "epoch": 0.2082788386084227, + "grad_norm": 4.161036675992416, + "learning_rate": 1.9294399683123354e-05, + "loss": 1.5379, + "step": 6370 + }, + { + "epoch": 0.20844232278315458, + "grad_norm": 3.354618824408438, + "learning_rate": 1.9292292513049956e-05, + "loss": 1.6332, + "step": 6375 + }, + { + "epoch": 0.20860580695788647, + "grad_norm": 3.2221609998757943, + "learning_rate": 1.9290182316704556e-05, + "loss": 1.5414, + "step": 6380 + }, + { + "epoch": 0.20876929113261836, + "grad_norm": 3.4195319504283863, + "learning_rate": 1.92880690947744e-05, + "loss": 1.6048, + "step": 6385 + }, + { + "epoch": 0.20893277530735024, + "grad_norm": 3.186934852810876, + "learning_rate": 1.9285952847947706e-05, + "loss": 1.5133, + "step": 6390 + }, + { + "epoch": 0.20909625948208213, + "grad_norm": 3.195624740452526, + "learning_rate": 1.9283833576913683e-05, + "loss": 1.4415, + "step": 6395 + }, + { + "epoch": 0.20925974365681402, + "grad_norm": 3.5908726362930454, + "learning_rate": 1.9281711282362535e-05, + "loss": 1.5242, + "step": 6400 + }, + { + "epoch": 0.2094232278315459, + "grad_norm": 3.4012242586523094, + "learning_rate": 1.9279585964985433e-05, + "loss": 1.5218, + "step": 6405 + }, + { + "epoch": 0.2095867120062778, + "grad_norm": 3.3115291575836627, + "learning_rate": 1.927745762547454e-05, + "loss": 1.479, + "step": 6410 + }, + { + "epoch": 0.20975019618100968, + "grad_norm": 3.396467270841424, + "learning_rate": 1.9275326264523012e-05, + "loss": 1.679, + "step": 6415 + }, + { + "epoch": 0.20991368035574157, + "grad_norm": 3.6292423934197076, + "learning_rate": 1.9273191882824974e-05, + "loss": 1.6739, + "step": 6420 + }, + { + "epoch": 0.21007716453047345, + "grad_norm": 3.176766330717052, + "learning_rate": 1.9271054481075544e-05, + "loss": 1.3313, + "step": 6425 + }, + { + "epoch": 0.21024064870520534, + "grad_norm": 3.1834309711553757, + "learning_rate": 1.9268914059970823e-05, + "loss": 1.4956, + "step": 6430 + }, + { + "epoch": 0.21040413287993723, + "grad_norm": 3.3903797751419864, + "learning_rate": 1.926677062020789e-05, + "loss": 1.5448, + "step": 6435 + }, + { + "epoch": 0.21056761705466912, + "grad_norm": 3.391613033784873, + "learning_rate": 1.9264624162484817e-05, + "loss": 1.5413, + "step": 6440 + }, + { + "epoch": 0.210731101229401, + "grad_norm": 3.4652241589258264, + "learning_rate": 1.9262474687500644e-05, + "loss": 1.5273, + "step": 6445 + }, + { + "epoch": 0.2108945854041329, + "grad_norm": 3.325795775937725, + "learning_rate": 1.9260322195955407e-05, + "loss": 1.6454, + "step": 6450 + }, + { + "epoch": 0.21105806957886478, + "grad_norm": 3.239280769060579, + "learning_rate": 1.9258166688550123e-05, + "loss": 1.5896, + "step": 6455 + }, + { + "epoch": 0.21122155375359666, + "grad_norm": 3.6665215823138673, + "learning_rate": 1.925600816598678e-05, + "loss": 1.5756, + "step": 6460 + }, + { + "epoch": 0.21138503792832855, + "grad_norm": 3.53912253383522, + "learning_rate": 1.9253846628968365e-05, + "loss": 1.6292, + "step": 6465 + }, + { + "epoch": 0.2115485221030604, + "grad_norm": 3.2203226993606187, + "learning_rate": 1.925168207819883e-05, + "loss": 1.46, + "step": 6470 + }, + { + "epoch": 0.2117120062777923, + "grad_norm": 3.058162959867076, + "learning_rate": 1.9249514514383123e-05, + "loss": 1.5302, + "step": 6475 + }, + { + "epoch": 0.21187549045252418, + "grad_norm": 2.944918752932749, + "learning_rate": 1.924734393822716e-05, + "loss": 1.4695, + "step": 6480 + }, + { + "epoch": 0.21203897462725607, + "grad_norm": 3.4227017288217056, + "learning_rate": 1.9245170350437847e-05, + "loss": 1.4635, + "step": 6485 + }, + { + "epoch": 0.21220245880198796, + "grad_norm": 3.1960179819125067, + "learning_rate": 1.924299375172307e-05, + "loss": 1.4612, + "step": 6490 + }, + { + "epoch": 0.21236594297671985, + "grad_norm": 3.5475525613722407, + "learning_rate": 1.924081414279169e-05, + "loss": 1.5297, + "step": 6495 + }, + { + "epoch": 0.21252942715145173, + "grad_norm": 3.355495242668258, + "learning_rate": 1.9238631524353558e-05, + "loss": 1.5997, + "step": 6500 + }, + { + "epoch": 0.21269291132618362, + "grad_norm": 3.7004518809174685, + "learning_rate": 1.9236445897119497e-05, + "loss": 1.4786, + "step": 6505 + }, + { + "epoch": 0.2128563955009155, + "grad_norm": 3.2504753927616106, + "learning_rate": 1.9234257261801308e-05, + "loss": 1.4142, + "step": 6510 + }, + { + "epoch": 0.2130198796756474, + "grad_norm": 3.4985173514492294, + "learning_rate": 1.9232065619111783e-05, + "loss": 1.5557, + "step": 6515 + }, + { + "epoch": 0.21318336385037928, + "grad_norm": 3.428713910633595, + "learning_rate": 1.9229870969764675e-05, + "loss": 1.5208, + "step": 6520 + }, + { + "epoch": 0.21334684802511117, + "grad_norm": 3.1632416111456005, + "learning_rate": 1.922767331447474e-05, + "loss": 1.427, + "step": 6525 + }, + { + "epoch": 0.21351033219984306, + "grad_norm": 3.1821234873677255, + "learning_rate": 1.9225472653957697e-05, + "loss": 1.4409, + "step": 6530 + }, + { + "epoch": 0.21367381637457494, + "grad_norm": 3.174563200521325, + "learning_rate": 1.9223268988930243e-05, + "loss": 1.5265, + "step": 6535 + }, + { + "epoch": 0.21383730054930683, + "grad_norm": 3.3445930840346954, + "learning_rate": 1.922106232011006e-05, + "loss": 1.4627, + "step": 6540 + }, + { + "epoch": 0.21400078472403872, + "grad_norm": 3.1111224851842723, + "learning_rate": 1.921885264821581e-05, + "loss": 1.4197, + "step": 6545 + }, + { + "epoch": 0.2141642688987706, + "grad_norm": 3.4037374151208324, + "learning_rate": 1.921663997396712e-05, + "loss": 1.5438, + "step": 6550 + }, + { + "epoch": 0.2143277530735025, + "grad_norm": 3.1193157731458654, + "learning_rate": 1.9214424298084612e-05, + "loss": 1.4703, + "step": 6555 + }, + { + "epoch": 0.21449123724823438, + "grad_norm": 3.3142179360681765, + "learning_rate": 1.9212205621289877e-05, + "loss": 1.5202, + "step": 6560 + }, + { + "epoch": 0.21465472142296627, + "grad_norm": 2.9937361364981916, + "learning_rate": 1.920998394430548e-05, + "loss": 1.4894, + "step": 6565 + }, + { + "epoch": 0.21481820559769815, + "grad_norm": 3.717612728952206, + "learning_rate": 1.9207759267854963e-05, + "loss": 1.5443, + "step": 6570 + }, + { + "epoch": 0.21498168977243004, + "grad_norm": 3.469628956736984, + "learning_rate": 1.9205531592662857e-05, + "loss": 1.6247, + "step": 6575 + }, + { + "epoch": 0.21514517394716193, + "grad_norm": 3.3686622000518653, + "learning_rate": 1.920330091945466e-05, + "loss": 1.4986, + "step": 6580 + }, + { + "epoch": 0.21530865812189381, + "grad_norm": 3.1169502888808873, + "learning_rate": 1.9201067248956842e-05, + "loss": 1.5539, + "step": 6585 + }, + { + "epoch": 0.21547214229662567, + "grad_norm": 3.499916640882549, + "learning_rate": 1.919883058189686e-05, + "loss": 1.4797, + "step": 6590 + }, + { + "epoch": 0.21563562647135756, + "grad_norm": 3.201116254250543, + "learning_rate": 1.919659091900314e-05, + "loss": 1.6095, + "step": 6595 + }, + { + "epoch": 0.21579911064608945, + "grad_norm": 3.205023376209159, + "learning_rate": 1.9194348261005086e-05, + "loss": 1.5143, + "step": 6600 + }, + { + "epoch": 0.21596259482082134, + "grad_norm": 3.5495799320501873, + "learning_rate": 1.9192102608633072e-05, + "loss": 1.5464, + "step": 6605 + }, + { + "epoch": 0.21612607899555322, + "grad_norm": 3.3762819622890956, + "learning_rate": 1.918985396261846e-05, + "loss": 1.461, + "step": 6610 + }, + { + "epoch": 0.2162895631702851, + "grad_norm": 3.3074560327623708, + "learning_rate": 1.918760232369357e-05, + "loss": 1.4593, + "step": 6615 + }, + { + "epoch": 0.216453047345017, + "grad_norm": 2.9892378434697853, + "learning_rate": 1.9185347692591715e-05, + "loss": 1.444, + "step": 6620 + }, + { + "epoch": 0.21661653151974888, + "grad_norm": 3.161856311551367, + "learning_rate": 1.9183090070047167e-05, + "loss": 1.5461, + "step": 6625 + }, + { + "epoch": 0.21678001569448077, + "grad_norm": 3.2310998907865676, + "learning_rate": 1.918082945679518e-05, + "loss": 1.5348, + "step": 6630 + }, + { + "epoch": 0.21694349986921266, + "grad_norm": 3.383341856803346, + "learning_rate": 1.917856585357198e-05, + "loss": 1.5562, + "step": 6635 + }, + { + "epoch": 0.21710698404394455, + "grad_norm": 3.3450925803796743, + "learning_rate": 1.9176299261114767e-05, + "loss": 1.4317, + "step": 6640 + }, + { + "epoch": 0.21727046821867643, + "grad_norm": 3.4715695421813306, + "learning_rate": 1.9174029680161713e-05, + "loss": 1.4729, + "step": 6645 + }, + { + "epoch": 0.21743395239340832, + "grad_norm": 3.2198665287760995, + "learning_rate": 1.917175711145197e-05, + "loss": 1.4028, + "step": 6650 + }, + { + "epoch": 0.2175974365681402, + "grad_norm": 3.397510049628366, + "learning_rate": 1.9169481555725653e-05, + "loss": 1.5317, + "step": 6655 + }, + { + "epoch": 0.2177609207428721, + "grad_norm": 3.462514396070128, + "learning_rate": 1.9167203013723858e-05, + "loss": 1.5032, + "step": 6660 + }, + { + "epoch": 0.21792440491760398, + "grad_norm": 3.4016572507348957, + "learning_rate": 1.916492148618865e-05, + "loss": 1.5119, + "step": 6665 + }, + { + "epoch": 0.21808788909233587, + "grad_norm": 3.3654467264906764, + "learning_rate": 1.9162636973863063e-05, + "loss": 1.5795, + "step": 6670 + }, + { + "epoch": 0.21825137326706776, + "grad_norm": 3.0971423824640887, + "learning_rate": 1.9160349477491108e-05, + "loss": 1.5257, + "step": 6675 + }, + { + "epoch": 0.21841485744179964, + "grad_norm": 3.308515527610559, + "learning_rate": 1.915805899781777e-05, + "loss": 1.4927, + "step": 6680 + }, + { + "epoch": 0.21857834161653153, + "grad_norm": 3.341535795187847, + "learning_rate": 1.9155765535589e-05, + "loss": 1.6284, + "step": 6685 + }, + { + "epoch": 0.21874182579126342, + "grad_norm": 3.262624785680428, + "learning_rate": 1.9153469091551723e-05, + "loss": 1.5438, + "step": 6690 + }, + { + "epoch": 0.2189053099659953, + "grad_norm": 3.2981536565261784, + "learning_rate": 1.915116966645383e-05, + "loss": 1.5256, + "step": 6695 + }, + { + "epoch": 0.2190687941407272, + "grad_norm": 3.358094829962823, + "learning_rate": 1.9148867261044193e-05, + "loss": 1.6068, + "step": 6700 + }, + { + "epoch": 0.21923227831545905, + "grad_norm": 3.4268742038067743, + "learning_rate": 1.914656187607265e-05, + "loss": 1.4345, + "step": 6705 + }, + { + "epoch": 0.21939576249019094, + "grad_norm": 3.3717743902056467, + "learning_rate": 1.9144253512290003e-05, + "loss": 1.4988, + "step": 6710 + }, + { + "epoch": 0.21955924666492282, + "grad_norm": 3.2257587962246066, + "learning_rate": 1.9141942170448032e-05, + "loss": 1.5992, + "step": 6715 + }, + { + "epoch": 0.2197227308396547, + "grad_norm": 3.38936801148283, + "learning_rate": 1.9139627851299485e-05, + "loss": 1.4626, + "step": 6720 + }, + { + "epoch": 0.2198862150143866, + "grad_norm": 3.287681647779801, + "learning_rate": 1.913731055559808e-05, + "loss": 1.6199, + "step": 6725 + }, + { + "epoch": 0.2200496991891185, + "grad_norm": 3.483243719628569, + "learning_rate": 1.9134990284098498e-05, + "loss": 1.5706, + "step": 6730 + }, + { + "epoch": 0.22021318336385037, + "grad_norm": 3.445306352597307, + "learning_rate": 1.91326670375564e-05, + "loss": 1.5081, + "step": 6735 + }, + { + "epoch": 0.22037666753858226, + "grad_norm": 3.2290356216971388, + "learning_rate": 1.913034081672841e-05, + "loss": 1.5473, + "step": 6740 + }, + { + "epoch": 0.22054015171331415, + "grad_norm": 3.1169437238116116, + "learning_rate": 1.9128011622372122e-05, + "loss": 1.5008, + "step": 6745 + }, + { + "epoch": 0.22070363588804603, + "grad_norm": 3.2998923223691214, + "learning_rate": 1.9125679455246095e-05, + "loss": 1.4635, + "step": 6750 + }, + { + "epoch": 0.22086712006277792, + "grad_norm": 3.1908822943798163, + "learning_rate": 1.9123344316109857e-05, + "loss": 1.4883, + "step": 6755 + }, + { + "epoch": 0.2210306042375098, + "grad_norm": 3.2179800228154054, + "learning_rate": 1.912100620572391e-05, + "loss": 1.4759, + "step": 6760 + }, + { + "epoch": 0.2211940884122417, + "grad_norm": 3.4049009747256305, + "learning_rate": 1.911866512484972e-05, + "loss": 1.5336, + "step": 6765 + }, + { + "epoch": 0.22135757258697358, + "grad_norm": 3.0842509798150783, + "learning_rate": 1.911632107424971e-05, + "loss": 1.4096, + "step": 6770 + }, + { + "epoch": 0.22152105676170547, + "grad_norm": 3.2720919208844856, + "learning_rate": 1.9113974054687296e-05, + "loss": 1.4559, + "step": 6775 + }, + { + "epoch": 0.22168454093643736, + "grad_norm": 3.441331594331788, + "learning_rate": 1.9111624066926832e-05, + "loss": 1.4542, + "step": 6780 + }, + { + "epoch": 0.22184802511116924, + "grad_norm": 3.4153231979694887, + "learning_rate": 1.9109271111733652e-05, + "loss": 1.5598, + "step": 6785 + }, + { + "epoch": 0.22201150928590113, + "grad_norm": 3.323513614818892, + "learning_rate": 1.910691518987406e-05, + "loss": 1.5263, + "step": 6790 + }, + { + "epoch": 0.22217499346063302, + "grad_norm": 3.383245304654896, + "learning_rate": 1.9104556302115324e-05, + "loss": 1.6561, + "step": 6795 + }, + { + "epoch": 0.2223384776353649, + "grad_norm": 3.2537485343386927, + "learning_rate": 1.9102194449225667e-05, + "loss": 1.4916, + "step": 6800 + }, + { + "epoch": 0.2225019618100968, + "grad_norm": 3.2554258444630726, + "learning_rate": 1.90998296319743e-05, + "loss": 1.4677, + "step": 6805 + }, + { + "epoch": 0.22266544598482868, + "grad_norm": 3.1316235475825027, + "learning_rate": 1.9097461851131372e-05, + "loss": 1.422, + "step": 6810 + }, + { + "epoch": 0.22282893015956057, + "grad_norm": 3.3607682474970137, + "learning_rate": 1.909509110746802e-05, + "loss": 1.608, + "step": 6815 + }, + { + "epoch": 0.22299241433429243, + "grad_norm": 3.132324848215335, + "learning_rate": 1.9092717401756337e-05, + "loss": 1.4614, + "step": 6820 + }, + { + "epoch": 0.22315589850902431, + "grad_norm": 3.608286050549707, + "learning_rate": 1.9090340734769378e-05, + "loss": 1.507, + "step": 6825 + }, + { + "epoch": 0.2233193826837562, + "grad_norm": 2.848161215670747, + "learning_rate": 1.9087961107281162e-05, + "loss": 1.5376, + "step": 6830 + }, + { + "epoch": 0.2234828668584881, + "grad_norm": 3.324851399933153, + "learning_rate": 1.908557852006668e-05, + "loss": 1.5879, + "step": 6835 + }, + { + "epoch": 0.22364635103321998, + "grad_norm": 3.1782576278558197, + "learning_rate": 1.9083192973901886e-05, + "loss": 1.478, + "step": 6840 + }, + { + "epoch": 0.22380983520795186, + "grad_norm": 3.2166673692949566, + "learning_rate": 1.9080804469563688e-05, + "loss": 1.54, + "step": 6845 + }, + { + "epoch": 0.22397331938268375, + "grad_norm": 3.4458389746403766, + "learning_rate": 1.9078413007829965e-05, + "loss": 1.5116, + "step": 6850 + }, + { + "epoch": 0.22413680355741564, + "grad_norm": 3.107708288542822, + "learning_rate": 1.9076018589479557e-05, + "loss": 1.4915, + "step": 6855 + }, + { + "epoch": 0.22430028773214752, + "grad_norm": 3.5552975108480838, + "learning_rate": 1.9073621215292266e-05, + "loss": 1.5353, + "step": 6860 + }, + { + "epoch": 0.2244637719068794, + "grad_norm": 3.1919926321145833, + "learning_rate": 1.907122088604886e-05, + "loss": 1.4546, + "step": 6865 + }, + { + "epoch": 0.2246272560816113, + "grad_norm": 3.3046481389953164, + "learning_rate": 1.9068817602531065e-05, + "loss": 1.605, + "step": 6870 + }, + { + "epoch": 0.22479074025634319, + "grad_norm": 3.3177726159271064, + "learning_rate": 1.906641136552158e-05, + "loss": 1.5687, + "step": 6875 + }, + { + "epoch": 0.22495422443107507, + "grad_norm": 3.2956487439481554, + "learning_rate": 1.9064002175804048e-05, + "loss": 1.7068, + "step": 6880 + }, + { + "epoch": 0.22511770860580696, + "grad_norm": 3.403583133682157, + "learning_rate": 1.9061590034163085e-05, + "loss": 1.6602, + "step": 6885 + }, + { + "epoch": 0.22528119278053885, + "grad_norm": 2.9381795288881296, + "learning_rate": 1.905917494138427e-05, + "loss": 1.5692, + "step": 6890 + }, + { + "epoch": 0.22544467695527073, + "grad_norm": 3.2654204227823334, + "learning_rate": 1.9056756898254134e-05, + "loss": 1.626, + "step": 6895 + }, + { + "epoch": 0.22560816113000262, + "grad_norm": 3.112311203017064, + "learning_rate": 1.905433590556018e-05, + "loss": 1.5135, + "step": 6900 + }, + { + "epoch": 0.2257716453047345, + "grad_norm": 3.425114723159719, + "learning_rate": 1.9051911964090864e-05, + "loss": 1.5167, + "step": 6905 + }, + { + "epoch": 0.2259351294794664, + "grad_norm": 3.4023821990379113, + "learning_rate": 1.90494850746356e-05, + "loss": 1.6656, + "step": 6910 + }, + { + "epoch": 0.22609861365419828, + "grad_norm": 3.2540634403404125, + "learning_rate": 1.9047055237984775e-05, + "loss": 1.7102, + "step": 6915 + }, + { + "epoch": 0.22626209782893017, + "grad_norm": 3.110451476946895, + "learning_rate": 1.904462245492972e-05, + "loss": 1.4215, + "step": 6920 + }, + { + "epoch": 0.22642558200366206, + "grad_norm": 3.1538329038040254, + "learning_rate": 1.9042186726262736e-05, + "loss": 1.414, + "step": 6925 + }, + { + "epoch": 0.22658906617839394, + "grad_norm": 3.302414813100721, + "learning_rate": 1.903974805277708e-05, + "loss": 1.4066, + "step": 6930 + }, + { + "epoch": 0.22675255035312583, + "grad_norm": 3.323546626236406, + "learning_rate": 1.903730643526697e-05, + "loss": 1.5149, + "step": 6935 + }, + { + "epoch": 0.2269160345278577, + "grad_norm": 3.4587499136777202, + "learning_rate": 1.9034861874527575e-05, + "loss": 1.7175, + "step": 6940 + }, + { + "epoch": 0.22707951870258958, + "grad_norm": 3.494770624238207, + "learning_rate": 1.9032414371355032e-05, + "loss": 1.6401, + "step": 6945 + }, + { + "epoch": 0.22724300287732146, + "grad_norm": 3.2847540136495494, + "learning_rate": 1.9029963926546435e-05, + "loss": 1.4485, + "step": 6950 + }, + { + "epoch": 0.22740648705205335, + "grad_norm": 3.4651585878400843, + "learning_rate": 1.9027510540899832e-05, + "loss": 1.4907, + "step": 6955 + }, + { + "epoch": 0.22756997122678524, + "grad_norm": 3.3374194496528653, + "learning_rate": 1.9025054215214232e-05, + "loss": 1.5368, + "step": 6960 + }, + { + "epoch": 0.22773345540151713, + "grad_norm": 3.361396963157238, + "learning_rate": 1.9022594950289597e-05, + "loss": 1.5267, + "step": 6965 + }, + { + "epoch": 0.227896939576249, + "grad_norm": 3.3563561621806457, + "learning_rate": 1.902013274692685e-05, + "loss": 1.4792, + "step": 6970 + }, + { + "epoch": 0.2280604237509809, + "grad_norm": 3.0834451587192624, + "learning_rate": 1.9017667605927874e-05, + "loss": 1.5794, + "step": 6975 + }, + { + "epoch": 0.2282239079257128, + "grad_norm": 3.218209113295128, + "learning_rate": 1.90151995280955e-05, + "loss": 1.5054, + "step": 6980 + }, + { + "epoch": 0.22838739210044467, + "grad_norm": 2.9703496729200163, + "learning_rate": 1.9012728514233524e-05, + "loss": 1.6291, + "step": 6985 + }, + { + "epoch": 0.22855087627517656, + "grad_norm": 3.3954568045525932, + "learning_rate": 1.9010254565146695e-05, + "loss": 1.4133, + "step": 6990 + }, + { + "epoch": 0.22871436044990845, + "grad_norm": 3.4547766559407305, + "learning_rate": 1.9007777681640713e-05, + "loss": 1.496, + "step": 6995 + }, + { + "epoch": 0.22887784462464034, + "grad_norm": 3.440731240317831, + "learning_rate": 1.9005297864522244e-05, + "loss": 1.4907, + "step": 7000 + }, + { + "epoch": 0.22904132879937222, + "grad_norm": 3.426034550843234, + "learning_rate": 1.90028151145989e-05, + "loss": 1.496, + "step": 7005 + }, + { + "epoch": 0.2292048129741041, + "grad_norm": 3.2809183572433747, + "learning_rate": 1.9000329432679254e-05, + "loss": 1.4351, + "step": 7010 + }, + { + "epoch": 0.229368297148836, + "grad_norm": 3.072264053519079, + "learning_rate": 1.899784081957283e-05, + "loss": 1.3904, + "step": 7015 + }, + { + "epoch": 0.22953178132356788, + "grad_norm": 3.3623645512455322, + "learning_rate": 1.8995349276090106e-05, + "loss": 1.5864, + "step": 7020 + }, + { + "epoch": 0.22969526549829977, + "grad_norm": 3.388163835583451, + "learning_rate": 1.8992854803042525e-05, + "loss": 1.5417, + "step": 7025 + }, + { + "epoch": 0.22985874967303166, + "grad_norm": 3.414557760603845, + "learning_rate": 1.8990357401242464e-05, + "loss": 1.5573, + "step": 7030 + }, + { + "epoch": 0.23002223384776355, + "grad_norm": 3.2684993135417266, + "learning_rate": 1.8987857071503276e-05, + "loss": 1.5165, + "step": 7035 + }, + { + "epoch": 0.23018571802249543, + "grad_norm": 3.321152382995651, + "learning_rate": 1.8985353814639254e-05, + "loss": 1.4567, + "step": 7040 + }, + { + "epoch": 0.23034920219722732, + "grad_norm": 3.278491417989177, + "learning_rate": 1.8982847631465643e-05, + "loss": 1.533, + "step": 7045 + }, + { + "epoch": 0.2305126863719592, + "grad_norm": 3.1283166403619544, + "learning_rate": 1.898033852279865e-05, + "loss": 1.4137, + "step": 7050 + }, + { + "epoch": 0.23067617054669107, + "grad_norm": 3.0369002016830873, + "learning_rate": 1.897782648945543e-05, + "loss": 1.4926, + "step": 7055 + }, + { + "epoch": 0.23083965472142295, + "grad_norm": 3.4449341060764147, + "learning_rate": 1.897531153225409e-05, + "loss": 1.5449, + "step": 7060 + }, + { + "epoch": 0.23100313889615484, + "grad_norm": 3.335622158660047, + "learning_rate": 1.897279365201369e-05, + "loss": 1.4282, + "step": 7065 + }, + { + "epoch": 0.23116662307088673, + "grad_norm": 3.651459507394094, + "learning_rate": 1.8970272849554243e-05, + "loss": 1.5524, + "step": 7070 + }, + { + "epoch": 0.23133010724561862, + "grad_norm": 3.2678431702906705, + "learning_rate": 1.896774912569671e-05, + "loss": 1.5959, + "step": 7075 + }, + { + "epoch": 0.2314935914203505, + "grad_norm": 3.1568852617989402, + "learning_rate": 1.8965222481263005e-05, + "loss": 1.4913, + "step": 7080 + }, + { + "epoch": 0.2316570755950824, + "grad_norm": 3.42521917091421, + "learning_rate": 1.8962692917075998e-05, + "loss": 1.5061, + "step": 7085 + }, + { + "epoch": 0.23182055976981428, + "grad_norm": 3.376697900080686, + "learning_rate": 1.8960160433959505e-05, + "loss": 1.5759, + "step": 7090 + }, + { + "epoch": 0.23198404394454616, + "grad_norm": 3.2000640569236256, + "learning_rate": 1.8957625032738292e-05, + "loss": 1.5334, + "step": 7095 + }, + { + "epoch": 0.23214752811927805, + "grad_norm": 3.472535087243618, + "learning_rate": 1.895508671423808e-05, + "loss": 1.5924, + "step": 7100 + }, + { + "epoch": 0.23231101229400994, + "grad_norm": 3.4090409043541454, + "learning_rate": 1.8952545479285535e-05, + "loss": 1.4456, + "step": 7105 + }, + { + "epoch": 0.23247449646874183, + "grad_norm": 3.2618857074372762, + "learning_rate": 1.8950001328708275e-05, + "loss": 1.5583, + "step": 7110 + }, + { + "epoch": 0.2326379806434737, + "grad_norm": 3.2457520301964045, + "learning_rate": 1.8947454263334868e-05, + "loss": 1.4651, + "step": 7115 + }, + { + "epoch": 0.2328014648182056, + "grad_norm": 3.2012506802110052, + "learning_rate": 1.894490428399483e-05, + "loss": 1.5273, + "step": 7120 + }, + { + "epoch": 0.2329649489929375, + "grad_norm": 3.3157185101550573, + "learning_rate": 1.894235139151863e-05, + "loss": 1.5514, + "step": 7125 + }, + { + "epoch": 0.23312843316766937, + "grad_norm": 3.2196348075736787, + "learning_rate": 1.8939795586737677e-05, + "loss": 1.5731, + "step": 7130 + }, + { + "epoch": 0.23329191734240126, + "grad_norm": 3.514890845703433, + "learning_rate": 1.893723687048434e-05, + "loss": 1.516, + "step": 7135 + }, + { + "epoch": 0.23345540151713315, + "grad_norm": 3.2809912325081463, + "learning_rate": 1.8934675243591926e-05, + "loss": 1.5946, + "step": 7140 + }, + { + "epoch": 0.23361888569186504, + "grad_norm": 3.2714434242125634, + "learning_rate": 1.8932110706894696e-05, + "loss": 1.6143, + "step": 7145 + }, + { + "epoch": 0.23378236986659692, + "grad_norm": 3.3386433089950653, + "learning_rate": 1.8929543261227854e-05, + "loss": 1.5434, + "step": 7150 + }, + { + "epoch": 0.2339458540413288, + "grad_norm": 3.092035747959998, + "learning_rate": 1.892697290742756e-05, + "loss": 1.5206, + "step": 7155 + }, + { + "epoch": 0.2341093382160607, + "grad_norm": 3.4226202623052715, + "learning_rate": 1.8924399646330908e-05, + "loss": 1.488, + "step": 7160 + }, + { + "epoch": 0.23427282239079258, + "grad_norm": 3.27271614912903, + "learning_rate": 1.892182347877595e-05, + "loss": 1.4784, + "step": 7165 + }, + { + "epoch": 0.23443630656552444, + "grad_norm": 3.1840451172987523, + "learning_rate": 1.891924440560168e-05, + "loss": 1.3896, + "step": 7170 + }, + { + "epoch": 0.23459979074025633, + "grad_norm": 3.3900527985513986, + "learning_rate": 1.8916662427648044e-05, + "loss": 1.5788, + "step": 7175 + }, + { + "epoch": 0.23476327491498822, + "grad_norm": 3.360264351156352, + "learning_rate": 1.8914077545755918e-05, + "loss": 1.4783, + "step": 7180 + }, + { + "epoch": 0.2349267590897201, + "grad_norm": 3.3891427612539347, + "learning_rate": 1.8911489760767144e-05, + "loss": 1.5278, + "step": 7185 + }, + { + "epoch": 0.235090243264452, + "grad_norm": 3.5060049208873134, + "learning_rate": 1.8908899073524494e-05, + "loss": 1.5502, + "step": 7190 + }, + { + "epoch": 0.23525372743918388, + "grad_norm": 3.2474389253863256, + "learning_rate": 1.8906305484871697e-05, + "loss": 1.5036, + "step": 7195 + }, + { + "epoch": 0.23541721161391577, + "grad_norm": 3.283058825945273, + "learning_rate": 1.890370899565342e-05, + "loss": 1.4513, + "step": 7200 + }, + { + "epoch": 0.23558069578864765, + "grad_norm": 3.278540022103678, + "learning_rate": 1.8901109606715272e-05, + "loss": 1.5701, + "step": 7205 + }, + { + "epoch": 0.23574417996337954, + "grad_norm": 3.2690454834078393, + "learning_rate": 1.8898507318903813e-05, + "loss": 1.598, + "step": 7210 + }, + { + "epoch": 0.23590766413811143, + "grad_norm": 3.536730455202892, + "learning_rate": 1.8895902133066547e-05, + "loss": 1.5448, + "step": 7215 + }, + { + "epoch": 0.23607114831284332, + "grad_norm": 3.262867041792634, + "learning_rate": 1.8893294050051917e-05, + "loss": 1.6944, + "step": 7220 + }, + { + "epoch": 0.2362346324875752, + "grad_norm": 3.279372061009938, + "learning_rate": 1.889068307070931e-05, + "loss": 1.5046, + "step": 7225 + }, + { + "epoch": 0.2363981166623071, + "grad_norm": 2.9332973121190684, + "learning_rate": 1.8888069195889056e-05, + "loss": 1.3519, + "step": 7230 + }, + { + "epoch": 0.23656160083703898, + "grad_norm": 3.4520052548714877, + "learning_rate": 1.8885452426442437e-05, + "loss": 1.4969, + "step": 7235 + }, + { + "epoch": 0.23672508501177086, + "grad_norm": 3.2786211029648284, + "learning_rate": 1.8882832763221662e-05, + "loss": 1.5678, + "step": 7240 + }, + { + "epoch": 0.23688856918650275, + "grad_norm": 3.2794794057653522, + "learning_rate": 1.88802102070799e-05, + "loss": 1.5097, + "step": 7245 + }, + { + "epoch": 0.23705205336123464, + "grad_norm": 3.3685569168121487, + "learning_rate": 1.8877584758871247e-05, + "loss": 1.5549, + "step": 7250 + }, + { + "epoch": 0.23721553753596653, + "grad_norm": 3.278104133551474, + "learning_rate": 1.8874956419450754e-05, + "loss": 1.4207, + "step": 7255 + }, + { + "epoch": 0.2373790217106984, + "grad_norm": 3.4785614901157222, + "learning_rate": 1.8872325189674398e-05, + "loss": 1.5369, + "step": 7260 + }, + { + "epoch": 0.2375425058854303, + "grad_norm": 3.4461066547173047, + "learning_rate": 1.8869691070399113e-05, + "loss": 1.5493, + "step": 7265 + }, + { + "epoch": 0.2377059900601622, + "grad_norm": 3.168911514822364, + "learning_rate": 1.8867054062482763e-05, + "loss": 1.5351, + "step": 7270 + }, + { + "epoch": 0.23786947423489407, + "grad_norm": 3.664017669338016, + "learning_rate": 1.886441416678416e-05, + "loss": 1.5281, + "step": 7275 + }, + { + "epoch": 0.23803295840962596, + "grad_norm": 3.3763499047167564, + "learning_rate": 1.886177138416305e-05, + "loss": 1.54, + "step": 7280 + }, + { + "epoch": 0.23819644258435782, + "grad_norm": 3.2904532069583126, + "learning_rate": 1.885912571548012e-05, + "loss": 1.5133, + "step": 7285 + }, + { + "epoch": 0.2383599267590897, + "grad_norm": 3.0209079187904937, + "learning_rate": 1.885647716159701e-05, + "loss": 1.5324, + "step": 7290 + }, + { + "epoch": 0.2385234109338216, + "grad_norm": 3.5237273372028555, + "learning_rate": 1.885382572337628e-05, + "loss": 1.5685, + "step": 7295 + }, + { + "epoch": 0.23868689510855348, + "grad_norm": 3.5002503846691493, + "learning_rate": 1.8851171401681442e-05, + "loss": 1.4822, + "step": 7300 + }, + { + "epoch": 0.23885037928328537, + "grad_norm": 3.486050324885001, + "learning_rate": 1.8848514197376938e-05, + "loss": 1.5494, + "step": 7305 + }, + { + "epoch": 0.23901386345801726, + "grad_norm": 3.3204983131668984, + "learning_rate": 1.8845854111328165e-05, + "loss": 1.5567, + "step": 7310 + }, + { + "epoch": 0.23917734763274914, + "grad_norm": 3.4931891139936093, + "learning_rate": 1.8843191144401443e-05, + "loss": 1.3961, + "step": 7315 + }, + { + "epoch": 0.23934083180748103, + "grad_norm": 3.1330304984264075, + "learning_rate": 1.884052529746403e-05, + "loss": 1.517, + "step": 7320 + }, + { + "epoch": 0.23950431598221292, + "grad_norm": 3.1226410706163668, + "learning_rate": 1.883785657138413e-05, + "loss": 1.5194, + "step": 7325 + }, + { + "epoch": 0.2396678001569448, + "grad_norm": 2.886899899702914, + "learning_rate": 1.8835184967030887e-05, + "loss": 1.3112, + "step": 7330 + }, + { + "epoch": 0.2398312843316767, + "grad_norm": 3.377609537731174, + "learning_rate": 1.8832510485274368e-05, + "loss": 1.5883, + "step": 7335 + }, + { + "epoch": 0.23999476850640858, + "grad_norm": 3.465021375640711, + "learning_rate": 1.8829833126985595e-05, + "loss": 1.4097, + "step": 7340 + }, + { + "epoch": 0.24015825268114047, + "grad_norm": 3.489306112172397, + "learning_rate": 1.8827152893036516e-05, + "loss": 1.5306, + "step": 7345 + }, + { + "epoch": 0.24032173685587235, + "grad_norm": 3.119746826099398, + "learning_rate": 1.882446978430001e-05, + "loss": 1.3909, + "step": 7350 + }, + { + "epoch": 0.24048522103060424, + "grad_norm": 3.125967355140031, + "learning_rate": 1.8821783801649908e-05, + "loss": 1.5307, + "step": 7355 + }, + { + "epoch": 0.24064870520533613, + "grad_norm": 3.290813359954124, + "learning_rate": 1.8819094945960965e-05, + "loss": 1.5335, + "step": 7360 + }, + { + "epoch": 0.24081218938006801, + "grad_norm": 3.1354041953309095, + "learning_rate": 1.881640321810888e-05, + "loss": 1.492, + "step": 7365 + }, + { + "epoch": 0.2409756735547999, + "grad_norm": 3.42947787235252, + "learning_rate": 1.8813708618970273e-05, + "loss": 1.4725, + "step": 7370 + }, + { + "epoch": 0.2411391577295318, + "grad_norm": 3.233924961039935, + "learning_rate": 1.881101114942272e-05, + "loss": 1.6011, + "step": 7375 + }, + { + "epoch": 0.24130264190426368, + "grad_norm": 3.240062110532102, + "learning_rate": 1.8808310810344713e-05, + "loss": 1.4779, + "step": 7380 + }, + { + "epoch": 0.24146612607899556, + "grad_norm": 3.1544088931423317, + "learning_rate": 1.8805607602615692e-05, + "loss": 1.4909, + "step": 7385 + }, + { + "epoch": 0.24162961025372745, + "grad_norm": 3.2406269073986014, + "learning_rate": 1.880290152711602e-05, + "loss": 1.5252, + "step": 7390 + }, + { + "epoch": 0.24179309442845934, + "grad_norm": 3.3165720989537353, + "learning_rate": 1.8800192584727004e-05, + "loss": 1.5672, + "step": 7395 + }, + { + "epoch": 0.24195657860319122, + "grad_norm": 3.1773339152668583, + "learning_rate": 1.879748077633088e-05, + "loss": 1.5278, + "step": 7400 + }, + { + "epoch": 0.24212006277792308, + "grad_norm": 3.1645301724249726, + "learning_rate": 1.879476610281081e-05, + "loss": 1.5723, + "step": 7405 + }, + { + "epoch": 0.24228354695265497, + "grad_norm": 3.346001825165418, + "learning_rate": 1.8792048565050906e-05, + "loss": 1.4651, + "step": 7410 + }, + { + "epoch": 0.24244703112738686, + "grad_norm": 2.9649316472826577, + "learning_rate": 1.8789328163936203e-05, + "loss": 1.4799, + "step": 7415 + }, + { + "epoch": 0.24261051530211875, + "grad_norm": 3.347890342458206, + "learning_rate": 1.8786604900352663e-05, + "loss": 1.5652, + "step": 7420 + }, + { + "epoch": 0.24277399947685063, + "grad_norm": 3.409372610931923, + "learning_rate": 1.878387877518719e-05, + "loss": 1.4863, + "step": 7425 + }, + { + "epoch": 0.24293748365158252, + "grad_norm": 3.0461820321802073, + "learning_rate": 1.8781149789327614e-05, + "loss": 1.5304, + "step": 7430 + }, + { + "epoch": 0.2431009678263144, + "grad_norm": 3.519596983292534, + "learning_rate": 1.8778417943662703e-05, + "loss": 1.6778, + "step": 7435 + }, + { + "epoch": 0.2432644520010463, + "grad_norm": 3.6055745379727453, + "learning_rate": 1.8775683239082148e-05, + "loss": 1.5917, + "step": 7440 + }, + { + "epoch": 0.24342793617577818, + "grad_norm": 3.544461426323643, + "learning_rate": 1.8772945676476576e-05, + "loss": 1.5793, + "step": 7445 + }, + { + "epoch": 0.24359142035051007, + "grad_norm": 3.2839610993136, + "learning_rate": 1.8770205256737545e-05, + "loss": 1.3923, + "step": 7450 + }, + { + "epoch": 0.24375490452524196, + "grad_norm": 3.403152610878814, + "learning_rate": 1.8767461980757545e-05, + "loss": 1.504, + "step": 7455 + }, + { + "epoch": 0.24391838869997384, + "grad_norm": 3.1211796934315164, + "learning_rate": 1.8764715849429987e-05, + "loss": 1.3996, + "step": 7460 + }, + { + "epoch": 0.24408187287470573, + "grad_norm": 3.332659270055476, + "learning_rate": 1.8761966863649224e-05, + "loss": 1.4772, + "step": 7465 + }, + { + "epoch": 0.24424535704943762, + "grad_norm": 3.347227314668488, + "learning_rate": 1.8759215024310533e-05, + "loss": 1.5367, + "step": 7470 + }, + { + "epoch": 0.2444088412241695, + "grad_norm": 3.247042316119492, + "learning_rate": 1.875646033231012e-05, + "loss": 1.5081, + "step": 7475 + }, + { + "epoch": 0.2445723253989014, + "grad_norm": 3.162252692608854, + "learning_rate": 1.875370278854512e-05, + "loss": 1.5307, + "step": 7480 + }, + { + "epoch": 0.24473580957363328, + "grad_norm": 3.2090754929018948, + "learning_rate": 1.87509423939136e-05, + "loss": 1.515, + "step": 7485 + }, + { + "epoch": 0.24489929374836517, + "grad_norm": 3.0238790000102336, + "learning_rate": 1.8748179149314548e-05, + "loss": 1.5685, + "step": 7490 + }, + { + "epoch": 0.24506277792309705, + "grad_norm": 3.239012988005439, + "learning_rate": 1.874541305564789e-05, + "loss": 1.5553, + "step": 7495 + }, + { + "epoch": 0.24522626209782894, + "grad_norm": 3.501132199040857, + "learning_rate": 1.874264411381447e-05, + "loss": 1.6324, + "step": 7500 + }, + { + "epoch": 0.24538974627256083, + "grad_norm": 3.1785087592269976, + "learning_rate": 1.873987232471607e-05, + "loss": 1.4903, + "step": 7505 + }, + { + "epoch": 0.24555323044729271, + "grad_norm": 3.2135037952686085, + "learning_rate": 1.873709768925539e-05, + "loss": 1.3862, + "step": 7510 + }, + { + "epoch": 0.2457167146220246, + "grad_norm": 3.3313493654166084, + "learning_rate": 1.8734320208336064e-05, + "loss": 1.5443, + "step": 7515 + }, + { + "epoch": 0.24588019879675646, + "grad_norm": 3.4428403197686617, + "learning_rate": 1.8731539882862643e-05, + "loss": 1.5806, + "step": 7520 + }, + { + "epoch": 0.24604368297148835, + "grad_norm": 3.343516677707824, + "learning_rate": 1.872875671374062e-05, + "loss": 1.5555, + "step": 7525 + }, + { + "epoch": 0.24620716714622023, + "grad_norm": 3.2749729360029436, + "learning_rate": 1.8725970701876397e-05, + "loss": 1.4711, + "step": 7530 + }, + { + "epoch": 0.24637065132095212, + "grad_norm": 3.257413663152614, + "learning_rate": 1.8723181848177314e-05, + "loss": 1.471, + "step": 7535 + }, + { + "epoch": 0.246534135495684, + "grad_norm": 3.2397849801175163, + "learning_rate": 1.872039015355163e-05, + "loss": 1.4326, + "step": 7540 + }, + { + "epoch": 0.2466976196704159, + "grad_norm": 3.6130963069140996, + "learning_rate": 1.8717595618908534e-05, + "loss": 1.6082, + "step": 7545 + }, + { + "epoch": 0.24686110384514778, + "grad_norm": 3.3131426011433507, + "learning_rate": 1.871479824515814e-05, + "loss": 1.5472, + "step": 7550 + }, + { + "epoch": 0.24702458801987967, + "grad_norm": 3.3099601848433813, + "learning_rate": 1.8711998033211475e-05, + "loss": 1.5471, + "step": 7555 + }, + { + "epoch": 0.24718807219461156, + "grad_norm": 3.185299907708701, + "learning_rate": 1.8709194983980506e-05, + "loss": 1.4695, + "step": 7560 + }, + { + "epoch": 0.24735155636934344, + "grad_norm": 3.3423040460152293, + "learning_rate": 1.870638909837812e-05, + "loss": 1.4313, + "step": 7565 + }, + { + "epoch": 0.24751504054407533, + "grad_norm": 3.2622877510319297, + "learning_rate": 1.870358037731812e-05, + "loss": 1.4923, + "step": 7570 + }, + { + "epoch": 0.24767852471880722, + "grad_norm": 3.1877908403432307, + "learning_rate": 1.870076882171524e-05, + "loss": 1.4731, + "step": 7575 + }, + { + "epoch": 0.2478420088935391, + "grad_norm": 3.030311096885018, + "learning_rate": 1.869795443248513e-05, + "loss": 1.4564, + "step": 7580 + }, + { + "epoch": 0.248005493068271, + "grad_norm": 2.9830227652289296, + "learning_rate": 1.8695137210544375e-05, + "loss": 1.3852, + "step": 7585 + }, + { + "epoch": 0.24816897724300288, + "grad_norm": 3.476051046656083, + "learning_rate": 1.869231715681047e-05, + "loss": 1.4821, + "step": 7590 + }, + { + "epoch": 0.24833246141773477, + "grad_norm": 3.9132867270764797, + "learning_rate": 1.868949427220184e-05, + "loss": 1.5423, + "step": 7595 + }, + { + "epoch": 0.24849594559246665, + "grad_norm": 3.1827184257644836, + "learning_rate": 1.868666855763783e-05, + "loss": 1.5807, + "step": 7600 + }, + { + "epoch": 0.24865942976719854, + "grad_norm": 3.3640073606601173, + "learning_rate": 1.8683840014038702e-05, + "loss": 1.4863, + "step": 7605 + }, + { + "epoch": 0.24882291394193043, + "grad_norm": 3.2941536632272794, + "learning_rate": 1.8681008642325648e-05, + "loss": 1.5596, + "step": 7610 + }, + { + "epoch": 0.24898639811666232, + "grad_norm": 3.148275548347783, + "learning_rate": 1.8678174443420775e-05, + "loss": 1.4334, + "step": 7615 + }, + { + "epoch": 0.2491498822913942, + "grad_norm": 3.046879363667961, + "learning_rate": 1.8675337418247107e-05, + "loss": 1.4172, + "step": 7620 + }, + { + "epoch": 0.2493133664661261, + "grad_norm": 3.383171866654074, + "learning_rate": 1.8672497567728603e-05, + "loss": 1.4607, + "step": 7625 + }, + { + "epoch": 0.24947685064085798, + "grad_norm": 3.842963834354121, + "learning_rate": 1.8669654892790124e-05, + "loss": 1.7678, + "step": 7630 + }, + { + "epoch": 0.24964033481558984, + "grad_norm": 3.5412448633042692, + "learning_rate": 1.8666809394357463e-05, + "loss": 1.5229, + "step": 7635 + }, + { + "epoch": 0.24980381899032172, + "grad_norm": 3.67904132511772, + "learning_rate": 1.8663961073357332e-05, + "loss": 1.708, + "step": 7640 + }, + { + "epoch": 0.2499673031650536, + "grad_norm": 3.251780402513257, + "learning_rate": 1.8661109930717357e-05, + "loss": 1.4708, + "step": 7645 + }, + { + "epoch": 0.2501307873397855, + "grad_norm": 3.216316029481785, + "learning_rate": 1.8658255967366084e-05, + "loss": 1.3863, + "step": 7650 + }, + { + "epoch": 0.2502942715145174, + "grad_norm": 2.9410320960590774, + "learning_rate": 1.865539918423298e-05, + "loss": 1.4955, + "step": 7655 + }, + { + "epoch": 0.2504577556892493, + "grad_norm": 3.9804801503567626, + "learning_rate": 1.8652539582248427e-05, + "loss": 1.4624, + "step": 7660 + }, + { + "epoch": 0.25062123986398116, + "grad_norm": 3.2995995886940848, + "learning_rate": 1.8649677162343733e-05, + "loss": 1.5664, + "step": 7665 + }, + { + "epoch": 0.25078472403871305, + "grad_norm": 3.2470731253828253, + "learning_rate": 1.8646811925451114e-05, + "loss": 1.4839, + "step": 7670 + }, + { + "epoch": 0.25094820821344493, + "grad_norm": 3.1742184124220256, + "learning_rate": 1.864394387250371e-05, + "loss": 1.44, + "step": 7675 + }, + { + "epoch": 0.2511116923881768, + "grad_norm": 3.5176562632796005, + "learning_rate": 1.864107300443557e-05, + "loss": 1.4391, + "step": 7680 + }, + { + "epoch": 0.2512751765629087, + "grad_norm": 3.188036277496087, + "learning_rate": 1.863819932218167e-05, + "loss": 1.5147, + "step": 7685 + }, + { + "epoch": 0.2514386607376406, + "grad_norm": 3.0029349812468515, + "learning_rate": 1.86353228266779e-05, + "loss": 1.3955, + "step": 7690 + }, + { + "epoch": 0.2516021449123725, + "grad_norm": 2.8653514333438337, + "learning_rate": 1.8632443518861056e-05, + "loss": 1.5094, + "step": 7695 + }, + { + "epoch": 0.25176562908710437, + "grad_norm": 3.3464770255586274, + "learning_rate": 1.8629561399668866e-05, + "loss": 1.6697, + "step": 7700 + }, + { + "epoch": 0.25192911326183626, + "grad_norm": 3.3621667349362307, + "learning_rate": 1.862667647003996e-05, + "loss": 1.5949, + "step": 7705 + }, + { + "epoch": 0.25209259743656814, + "grad_norm": 3.431641699107384, + "learning_rate": 1.8623788730913895e-05, + "loss": 1.5509, + "step": 7710 + }, + { + "epoch": 0.25225608161130003, + "grad_norm": 3.238564130382999, + "learning_rate": 1.8620898183231134e-05, + "loss": 1.4909, + "step": 7715 + }, + { + "epoch": 0.2524195657860319, + "grad_norm": 3.140955024240825, + "learning_rate": 1.8618004827933054e-05, + "loss": 1.4508, + "step": 7720 + }, + { + "epoch": 0.2525830499607638, + "grad_norm": 3.1815738025875975, + "learning_rate": 1.8615108665961954e-05, + "loss": 1.4884, + "step": 7725 + }, + { + "epoch": 0.2527465341354957, + "grad_norm": 3.2175533374722844, + "learning_rate": 1.8612209698261045e-05, + "loss": 1.4878, + "step": 7730 + }, + { + "epoch": 0.2529100183102276, + "grad_norm": 3.2258414321993287, + "learning_rate": 1.8609307925774442e-05, + "loss": 1.4804, + "step": 7735 + }, + { + "epoch": 0.25307350248495947, + "grad_norm": 3.0947607894195004, + "learning_rate": 1.8606403349447186e-05, + "loss": 1.4966, + "step": 7740 + }, + { + "epoch": 0.25323698665969135, + "grad_norm": 3.307336796816736, + "learning_rate": 1.8603495970225228e-05, + "loss": 1.4455, + "step": 7745 + }, + { + "epoch": 0.25340047083442324, + "grad_norm": 3.176123862527082, + "learning_rate": 1.8600585789055428e-05, + "loss": 1.5339, + "step": 7750 + }, + { + "epoch": 0.25356395500915513, + "grad_norm": 3.4351207948787073, + "learning_rate": 1.8597672806885564e-05, + "loss": 1.5836, + "step": 7755 + }, + { + "epoch": 0.253727439183887, + "grad_norm": 3.311012846215951, + "learning_rate": 1.8594757024664315e-05, + "loss": 1.596, + "step": 7760 + }, + { + "epoch": 0.2538909233586189, + "grad_norm": 3.377210636873998, + "learning_rate": 1.8591838443341288e-05, + "loss": 1.5245, + "step": 7765 + }, + { + "epoch": 0.2540544075333508, + "grad_norm": 3.4500413029584966, + "learning_rate": 1.858891706386699e-05, + "loss": 1.5292, + "step": 7770 + }, + { + "epoch": 0.2542178917080827, + "grad_norm": 3.342840062055148, + "learning_rate": 1.8585992887192842e-05, + "loss": 1.5722, + "step": 7775 + }, + { + "epoch": 0.25438137588281456, + "grad_norm": 3.363956151845759, + "learning_rate": 1.8583065914271177e-05, + "loss": 1.5834, + "step": 7780 + }, + { + "epoch": 0.25454486005754645, + "grad_norm": 3.2501163070756234, + "learning_rate": 1.858013614605524e-05, + "loss": 1.5642, + "step": 7785 + }, + { + "epoch": 0.25470834423227834, + "grad_norm": 3.148804677964444, + "learning_rate": 1.8577203583499185e-05, + "loss": 1.4705, + "step": 7790 + }, + { + "epoch": 0.2548718284070102, + "grad_norm": 3.408594752993675, + "learning_rate": 1.8574268227558073e-05, + "loss": 1.6009, + "step": 7795 + }, + { + "epoch": 0.2550353125817421, + "grad_norm": 2.9065623633046083, + "learning_rate": 1.8571330079187882e-05, + "loss": 1.5913, + "step": 7800 + }, + { + "epoch": 0.255198796756474, + "grad_norm": 3.086377455516761, + "learning_rate": 1.8568389139345488e-05, + "loss": 1.4858, + "step": 7805 + }, + { + "epoch": 0.25536228093120583, + "grad_norm": 3.518004560527996, + "learning_rate": 1.856544540898869e-05, + "loss": 1.5847, + "step": 7810 + }, + { + "epoch": 0.2555257651059377, + "grad_norm": 3.1403822546265325, + "learning_rate": 1.8562498889076185e-05, + "loss": 1.555, + "step": 7815 + }, + { + "epoch": 0.2556892492806696, + "grad_norm": 3.1902461871837464, + "learning_rate": 1.8559549580567585e-05, + "loss": 1.6118, + "step": 7820 + }, + { + "epoch": 0.2558527334554015, + "grad_norm": 3.2107203117939487, + "learning_rate": 1.8556597484423404e-05, + "loss": 1.396, + "step": 7825 + }, + { + "epoch": 0.2560162176301334, + "grad_norm": 3.0606723177873127, + "learning_rate": 1.855364260160507e-05, + "loss": 1.4215, + "step": 7830 + }, + { + "epoch": 0.25617970180486527, + "grad_norm": 3.1817198006528873, + "learning_rate": 1.855068493307491e-05, + "loss": 1.4734, + "step": 7835 + }, + { + "epoch": 0.25634318597959715, + "grad_norm": 3.1313007344622017, + "learning_rate": 1.8547724479796176e-05, + "loss": 1.6125, + "step": 7840 + }, + { + "epoch": 0.25650667015432904, + "grad_norm": 3.322811343536264, + "learning_rate": 1.8544761242733008e-05, + "loss": 1.5223, + "step": 7845 + }, + { + "epoch": 0.25667015432906093, + "grad_norm": 3.1752957944144176, + "learning_rate": 1.8541795222850457e-05, + "loss": 1.5118, + "step": 7850 + }, + { + "epoch": 0.2568336385037928, + "grad_norm": 3.3028409030572425, + "learning_rate": 1.8538826421114485e-05, + "loss": 1.6331, + "step": 7855 + }, + { + "epoch": 0.2569971226785247, + "grad_norm": 3.516639166753063, + "learning_rate": 1.8535854838491962e-05, + "loss": 1.4656, + "step": 7860 + }, + { + "epoch": 0.2571606068532566, + "grad_norm": 3.2730513968355393, + "learning_rate": 1.8532880475950654e-05, + "loss": 1.5074, + "step": 7865 + }, + { + "epoch": 0.2573240910279885, + "grad_norm": 3.1856853809427257, + "learning_rate": 1.8529903334459245e-05, + "loss": 1.559, + "step": 7870 + }, + { + "epoch": 0.25748757520272036, + "grad_norm": 3.442340353440597, + "learning_rate": 1.852692341498731e-05, + "loss": 1.5219, + "step": 7875 + }, + { + "epoch": 0.25765105937745225, + "grad_norm": 3.3022835601707654, + "learning_rate": 1.852394071850534e-05, + "loss": 1.5278, + "step": 7880 + }, + { + "epoch": 0.25781454355218414, + "grad_norm": 3.173482798224103, + "learning_rate": 1.8520955245984722e-05, + "loss": 1.5714, + "step": 7885 + }, + { + "epoch": 0.257978027726916, + "grad_norm": 3.2880899774611754, + "learning_rate": 1.8517966998397753e-05, + "loss": 1.5618, + "step": 7890 + }, + { + "epoch": 0.2581415119016479, + "grad_norm": 3.178856227092996, + "learning_rate": 1.851497597671764e-05, + "loss": 1.4478, + "step": 7895 + }, + { + "epoch": 0.2583049960763798, + "grad_norm": 3.366502039133181, + "learning_rate": 1.851198218191847e-05, + "loss": 1.4639, + "step": 7900 + }, + { + "epoch": 0.2584684802511117, + "grad_norm": 3.3630976737708176, + "learning_rate": 1.8508985614975262e-05, + "loss": 1.622, + "step": 7905 + }, + { + "epoch": 0.2586319644258436, + "grad_norm": 3.0032519175328978, + "learning_rate": 1.850598627686392e-05, + "loss": 1.539, + "step": 7910 + }, + { + "epoch": 0.25879544860057546, + "grad_norm": 3.327139141431935, + "learning_rate": 1.8502984168561252e-05, + "loss": 1.566, + "step": 7915 + }, + { + "epoch": 0.25895893277530735, + "grad_norm": 3.3959299054733556, + "learning_rate": 1.8499979291044978e-05, + "loss": 1.5244, + "step": 7920 + }, + { + "epoch": 0.25912241695003924, + "grad_norm": 3.219811828072925, + "learning_rate": 1.84969716452937e-05, + "loss": 1.4728, + "step": 7925 + }, + { + "epoch": 0.2592859011247711, + "grad_norm": 3.2215928473745894, + "learning_rate": 1.849396123228695e-05, + "loss": 1.5498, + "step": 7930 + }, + { + "epoch": 0.259449385299503, + "grad_norm": 3.358463662670854, + "learning_rate": 1.8490948053005137e-05, + "loss": 1.4858, + "step": 7935 + }, + { + "epoch": 0.2596128694742349, + "grad_norm": 3.142778372177977, + "learning_rate": 1.8487932108429583e-05, + "loss": 1.5169, + "step": 7940 + }, + { + "epoch": 0.2597763536489668, + "grad_norm": 3.063897202991331, + "learning_rate": 1.8484913399542502e-05, + "loss": 1.3967, + "step": 7945 + }, + { + "epoch": 0.25993983782369867, + "grad_norm": 3.213595407451296, + "learning_rate": 1.848189192732702e-05, + "loss": 1.4465, + "step": 7950 + }, + { + "epoch": 0.26010332199843056, + "grad_norm": 3.018006576357238, + "learning_rate": 1.8478867692767156e-05, + "loss": 1.458, + "step": 7955 + }, + { + "epoch": 0.26026680617316245, + "grad_norm": 3.158012932068465, + "learning_rate": 1.8475840696847825e-05, + "loss": 1.5292, + "step": 7960 + }, + { + "epoch": 0.26043029034789433, + "grad_norm": 3.1989455326376763, + "learning_rate": 1.8472810940554845e-05, + "loss": 1.6095, + "step": 7965 + }, + { + "epoch": 0.2605937745226262, + "grad_norm": 3.337526869361347, + "learning_rate": 1.8469778424874935e-05, + "loss": 1.5207, + "step": 7970 + }, + { + "epoch": 0.2607572586973581, + "grad_norm": 3.411081617749677, + "learning_rate": 1.8466743150795715e-05, + "loss": 1.615, + "step": 7975 + }, + { + "epoch": 0.26092074287209, + "grad_norm": 3.299195115920749, + "learning_rate": 1.8463705119305696e-05, + "loss": 1.4681, + "step": 7980 + }, + { + "epoch": 0.2610842270468219, + "grad_norm": 3.378481178940591, + "learning_rate": 1.8460664331394288e-05, + "loss": 1.4993, + "step": 7985 + }, + { + "epoch": 0.26124771122155377, + "grad_norm": 3.4084144698610856, + "learning_rate": 1.8457620788051806e-05, + "loss": 1.4307, + "step": 7990 + }, + { + "epoch": 0.26141119539628566, + "grad_norm": 3.387921822825846, + "learning_rate": 1.8454574490269453e-05, + "loss": 1.4445, + "step": 7995 + }, + { + "epoch": 0.26157467957101754, + "grad_norm": 3.350038627814336, + "learning_rate": 1.8451525439039338e-05, + "loss": 1.4996, + "step": 8000 + }, + { + "epoch": 0.26173816374574943, + "grad_norm": 3.47539775907223, + "learning_rate": 1.8448473635354454e-05, + "loss": 1.4622, + "step": 8005 + }, + { + "epoch": 0.2619016479204813, + "grad_norm": 3.3717802022999726, + "learning_rate": 1.8445419080208714e-05, + "loss": 1.5148, + "step": 8010 + }, + { + "epoch": 0.2620651320952132, + "grad_norm": 3.275754781295255, + "learning_rate": 1.8442361774596897e-05, + "loss": 1.4786, + "step": 8015 + }, + { + "epoch": 0.2622286162699451, + "grad_norm": 3.048840355020505, + "learning_rate": 1.84393017195147e-05, + "loss": 1.4949, + "step": 8020 + }, + { + "epoch": 0.262392100444677, + "grad_norm": 3.116608083881245, + "learning_rate": 1.84362389159587e-05, + "loss": 1.4166, + "step": 8025 + }, + { + "epoch": 0.26255558461940887, + "grad_norm": 3.314709406197081, + "learning_rate": 1.8433173364926393e-05, + "loss": 1.5581, + "step": 8030 + }, + { + "epoch": 0.26271906879414075, + "grad_norm": 3.3742932106518992, + "learning_rate": 1.8430105067416137e-05, + "loss": 1.5804, + "step": 8035 + }, + { + "epoch": 0.26288255296887264, + "grad_norm": 3.278545507874535, + "learning_rate": 1.842703402442721e-05, + "loss": 1.4897, + "step": 8040 + }, + { + "epoch": 0.26304603714360447, + "grad_norm": 3.0340595472450107, + "learning_rate": 1.8423960236959773e-05, + "loss": 1.6398, + "step": 8045 + }, + { + "epoch": 0.26320952131833636, + "grad_norm": 3.1793698577863516, + "learning_rate": 1.8420883706014882e-05, + "loss": 1.4277, + "step": 8050 + }, + { + "epoch": 0.26337300549306825, + "grad_norm": 3.3328353782949254, + "learning_rate": 1.8417804432594494e-05, + "loss": 1.4818, + "step": 8055 + }, + { + "epoch": 0.26353648966780013, + "grad_norm": 3.2010421807189746, + "learning_rate": 1.8414722417701445e-05, + "loss": 1.5767, + "step": 8060 + }, + { + "epoch": 0.263699973842532, + "grad_norm": 3.3571117269446193, + "learning_rate": 1.8411637662339476e-05, + "loss": 1.4661, + "step": 8065 + }, + { + "epoch": 0.2638634580172639, + "grad_norm": 3.277919557248963, + "learning_rate": 1.840855016751322e-05, + "loss": 1.6189, + "step": 8070 + }, + { + "epoch": 0.2640269421919958, + "grad_norm": 3.1011258931458237, + "learning_rate": 1.8405459934228186e-05, + "loss": 1.4322, + "step": 8075 + }, + { + "epoch": 0.2641904263667277, + "grad_norm": 3.3888431275347255, + "learning_rate": 1.8402366963490798e-05, + "loss": 1.5572, + "step": 8080 + }, + { + "epoch": 0.26435391054145957, + "grad_norm": 3.2208791086295965, + "learning_rate": 1.839927125630836e-05, + "loss": 1.5057, + "step": 8085 + }, + { + "epoch": 0.26451739471619146, + "grad_norm": 3.5462867891079273, + "learning_rate": 1.8396172813689067e-05, + "loss": 1.5229, + "step": 8090 + }, + { + "epoch": 0.26468087889092334, + "grad_norm": 3.179939884478597, + "learning_rate": 1.8393071636642004e-05, + "loss": 1.5127, + "step": 8095 + }, + { + "epoch": 0.26484436306565523, + "grad_norm": 3.302761374628936, + "learning_rate": 1.8389967726177146e-05, + "loss": 1.5296, + "step": 8100 + }, + { + "epoch": 0.2650078472403871, + "grad_norm": 3.3309434592874463, + "learning_rate": 1.8386861083305367e-05, + "loss": 1.5266, + "step": 8105 + }, + { + "epoch": 0.265171331415119, + "grad_norm": 3.124128087632766, + "learning_rate": 1.8383751709038423e-05, + "loss": 1.4201, + "step": 8110 + }, + { + "epoch": 0.2653348155898509, + "grad_norm": 3.1744294845987446, + "learning_rate": 1.8380639604388957e-05, + "loss": 1.5625, + "step": 8115 + }, + { + "epoch": 0.2654982997645828, + "grad_norm": 3.271044637215449, + "learning_rate": 1.8377524770370506e-05, + "loss": 1.466, + "step": 8120 + }, + { + "epoch": 0.26566178393931467, + "grad_norm": 3.1514512283644756, + "learning_rate": 1.83744072079975e-05, + "loss": 1.5617, + "step": 8125 + }, + { + "epoch": 0.26582526811404655, + "grad_norm": 3.151432250631805, + "learning_rate": 1.8371286918285243e-05, + "loss": 1.5339, + "step": 8130 + }, + { + "epoch": 0.26598875228877844, + "grad_norm": 2.937636931898448, + "learning_rate": 1.8368163902249948e-05, + "loss": 1.5794, + "step": 8135 + }, + { + "epoch": 0.2661522364635103, + "grad_norm": 3.4059538592148386, + "learning_rate": 1.8365038160908703e-05, + "loss": 1.5549, + "step": 8140 + }, + { + "epoch": 0.2663157206382422, + "grad_norm": 3.110609069118133, + "learning_rate": 1.8361909695279476e-05, + "loss": 1.4436, + "step": 8145 + }, + { + "epoch": 0.2664792048129741, + "grad_norm": 3.2629189080875087, + "learning_rate": 1.8358778506381142e-05, + "loss": 1.5026, + "step": 8150 + }, + { + "epoch": 0.266642688987706, + "grad_norm": 3.170715088340253, + "learning_rate": 1.8355644595233445e-05, + "loss": 1.4691, + "step": 8155 + }, + { + "epoch": 0.2668061731624379, + "grad_norm": 3.2888542616775993, + "learning_rate": 1.8352507962857032e-05, + "loss": 1.4361, + "step": 8160 + }, + { + "epoch": 0.26696965733716976, + "grad_norm": 3.0961048672694487, + "learning_rate": 1.8349368610273418e-05, + "loss": 1.3887, + "step": 8165 + }, + { + "epoch": 0.26713314151190165, + "grad_norm": 3.3084305356619907, + "learning_rate": 1.834622653850502e-05, + "loss": 1.4416, + "step": 8170 + }, + { + "epoch": 0.26729662568663354, + "grad_norm": 3.24462786567694, + "learning_rate": 1.8343081748575127e-05, + "loss": 1.5411, + "step": 8175 + }, + { + "epoch": 0.2674601098613654, + "grad_norm": 3.418447688300205, + "learning_rate": 1.8339934241507925e-05, + "loss": 1.5816, + "step": 8180 + }, + { + "epoch": 0.2676235940360973, + "grad_norm": 3.2735484321113004, + "learning_rate": 1.8336784018328478e-05, + "loss": 1.4354, + "step": 8185 + }, + { + "epoch": 0.2677870782108292, + "grad_norm": 3.0326048842344013, + "learning_rate": 1.833363108006274e-05, + "loss": 1.3488, + "step": 8190 + }, + { + "epoch": 0.2679505623855611, + "grad_norm": 3.2531333940640024, + "learning_rate": 1.8330475427737545e-05, + "loss": 1.4716, + "step": 8195 + }, + { + "epoch": 0.268114046560293, + "grad_norm": 2.9825784243028104, + "learning_rate": 1.8327317062380605e-05, + "loss": 1.3833, + "step": 8200 + }, + { + "epoch": 0.26827753073502486, + "grad_norm": 3.2981109436110962, + "learning_rate": 1.832415598502053e-05, + "loss": 1.5605, + "step": 8205 + }, + { + "epoch": 0.26844101490975675, + "grad_norm": 3.512108411707964, + "learning_rate": 1.83209921966868e-05, + "loss": 1.4548, + "step": 8210 + }, + { + "epoch": 0.26860449908448863, + "grad_norm": 3.426508764787582, + "learning_rate": 1.831782569840979e-05, + "loss": 1.4111, + "step": 8215 + }, + { + "epoch": 0.2687679832592205, + "grad_norm": 3.0998401262625737, + "learning_rate": 1.8314656491220744e-05, + "loss": 1.4929, + "step": 8220 + }, + { + "epoch": 0.2689314674339524, + "grad_norm": 3.5097148849497826, + "learning_rate": 1.83114845761518e-05, + "loss": 1.483, + "step": 8225 + }, + { + "epoch": 0.2690949516086843, + "grad_norm": 3.551950519516323, + "learning_rate": 1.8308309954235968e-05, + "loss": 1.6204, + "step": 8230 + }, + { + "epoch": 0.2692584357834162, + "grad_norm": 2.946489046617984, + "learning_rate": 1.8305132626507146e-05, + "loss": 1.4932, + "step": 8235 + }, + { + "epoch": 0.26942191995814807, + "grad_norm": 3.343359186652486, + "learning_rate": 1.8301952594000117e-05, + "loss": 1.5447, + "step": 8240 + }, + { + "epoch": 0.26958540413287996, + "grad_norm": 3.2167684604145976, + "learning_rate": 1.8298769857750533e-05, + "loss": 1.5286, + "step": 8245 + }, + { + "epoch": 0.26974888830761184, + "grad_norm": 2.991075525438226, + "learning_rate": 1.8295584418794937e-05, + "loss": 1.4975, + "step": 8250 + }, + { + "epoch": 0.26991237248234373, + "grad_norm": 3.2690311991669123, + "learning_rate": 1.8292396278170746e-05, + "loss": 1.5116, + "step": 8255 + }, + { + "epoch": 0.2700758566570756, + "grad_norm": 3.0980480515228423, + "learning_rate": 1.8289205436916265e-05, + "loss": 1.4743, + "step": 8260 + }, + { + "epoch": 0.2702393408318075, + "grad_norm": 3.1512848074293953, + "learning_rate": 1.8286011896070667e-05, + "loss": 1.4196, + "step": 8265 + }, + { + "epoch": 0.2704028250065394, + "grad_norm": 3.1807110416579305, + "learning_rate": 1.828281565667401e-05, + "loss": 1.4708, + "step": 8270 + }, + { + "epoch": 0.2705663091812713, + "grad_norm": 3.3484335825391405, + "learning_rate": 1.8279616719767234e-05, + "loss": 1.4762, + "step": 8275 + }, + { + "epoch": 0.2707297933560031, + "grad_norm": 3.250435622125475, + "learning_rate": 1.827641508639215e-05, + "loss": 1.49, + "step": 8280 + }, + { + "epoch": 0.270893277530735, + "grad_norm": 3.1523416842528262, + "learning_rate": 1.827321075759146e-05, + "loss": 1.4911, + "step": 8285 + }, + { + "epoch": 0.2710567617054669, + "grad_norm": 3.4762473353621384, + "learning_rate": 1.8270003734408724e-05, + "loss": 1.6109, + "step": 8290 + }, + { + "epoch": 0.2712202458801988, + "grad_norm": 3.1365072999575103, + "learning_rate": 1.8266794017888397e-05, + "loss": 1.5615, + "step": 8295 + }, + { + "epoch": 0.27138373005493066, + "grad_norm": 3.286867855084263, + "learning_rate": 1.8263581609075806e-05, + "loss": 1.6467, + "step": 8300 + }, + { + "epoch": 0.27154721422966255, + "grad_norm": 3.1099671143512944, + "learning_rate": 1.8260366509017154e-05, + "loss": 1.5379, + "step": 8305 + }, + { + "epoch": 0.27171069840439444, + "grad_norm": 3.054369199278861, + "learning_rate": 1.8257148718759517e-05, + "loss": 1.4394, + "step": 8310 + }, + { + "epoch": 0.2718741825791263, + "grad_norm": 3.039027827747847, + "learning_rate": 1.8253928239350855e-05, + "loss": 1.4501, + "step": 8315 + }, + { + "epoch": 0.2720376667538582, + "grad_norm": 3.1886994277384475, + "learning_rate": 1.8250705071839997e-05, + "loss": 1.3812, + "step": 8320 + }, + { + "epoch": 0.2722011509285901, + "grad_norm": 3.173123970812699, + "learning_rate": 1.8247479217276643e-05, + "loss": 1.4397, + "step": 8325 + }, + { + "epoch": 0.272364635103322, + "grad_norm": 3.3921428475424427, + "learning_rate": 1.8244250676711387e-05, + "loss": 1.5111, + "step": 8330 + }, + { + "epoch": 0.27252811927805387, + "grad_norm": 3.366423663800683, + "learning_rate": 1.824101945119568e-05, + "loss": 1.5132, + "step": 8335 + }, + { + "epoch": 0.27269160345278576, + "grad_norm": 3.0829420159129937, + "learning_rate": 1.823778554178185e-05, + "loss": 1.623, + "step": 8340 + }, + { + "epoch": 0.27285508762751765, + "grad_norm": 3.372331720067025, + "learning_rate": 1.8234548949523105e-05, + "loss": 1.3548, + "step": 8345 + }, + { + "epoch": 0.27301857180224953, + "grad_norm": 2.9787523463282684, + "learning_rate": 1.823130967547352e-05, + "loss": 1.4261, + "step": 8350 + }, + { + "epoch": 0.2731820559769814, + "grad_norm": 3.2611837744127987, + "learning_rate": 1.8228067720688055e-05, + "loss": 1.4487, + "step": 8355 + }, + { + "epoch": 0.2733455401517133, + "grad_norm": 3.3240626248762455, + "learning_rate": 1.822482308622253e-05, + "loss": 1.4642, + "step": 8360 + }, + { + "epoch": 0.2735090243264452, + "grad_norm": 3.244186375124245, + "learning_rate": 1.8221575773133643e-05, + "loss": 1.4318, + "step": 8365 + }, + { + "epoch": 0.2736725085011771, + "grad_norm": 3.2428173266203375, + "learning_rate": 1.8218325782478967e-05, + "loss": 1.4046, + "step": 8370 + }, + { + "epoch": 0.27383599267590897, + "grad_norm": 3.243686118202001, + "learning_rate": 1.821507311531694e-05, + "loss": 1.5744, + "step": 8375 + }, + { + "epoch": 0.27399947685064086, + "grad_norm": 8.206113610195485, + "learning_rate": 1.821181777270688e-05, + "loss": 1.5138, + "step": 8380 + }, + { + "epoch": 0.27416296102537274, + "grad_norm": 3.655566511855933, + "learning_rate": 1.820855975570897e-05, + "loss": 1.6044, + "step": 8385 + }, + { + "epoch": 0.27432644520010463, + "grad_norm": 2.97118430130287, + "learning_rate": 1.8205299065384266e-05, + "loss": 1.4897, + "step": 8390 + }, + { + "epoch": 0.2744899293748365, + "grad_norm": 3.1611305634142184, + "learning_rate": 1.8202035702794697e-05, + "loss": 1.5613, + "step": 8395 + }, + { + "epoch": 0.2746534135495684, + "grad_norm": 3.208622847718701, + "learning_rate": 1.819876966900306e-05, + "loss": 1.5026, + "step": 8400 + }, + { + "epoch": 0.2748168977243003, + "grad_norm": 3.3903291834230385, + "learning_rate": 1.819550096507302e-05, + "loss": 1.5299, + "step": 8405 + }, + { + "epoch": 0.2749803818990322, + "grad_norm": 3.0585506405477756, + "learning_rate": 1.819222959206912e-05, + "loss": 1.4014, + "step": 8410 + }, + { + "epoch": 0.27514386607376407, + "grad_norm": 3.1643214109268856, + "learning_rate": 1.8188955551056757e-05, + "loss": 1.5086, + "step": 8415 + }, + { + "epoch": 0.27530735024849595, + "grad_norm": 3.424270643890449, + "learning_rate": 1.818567884310221e-05, + "loss": 1.572, + "step": 8420 + }, + { + "epoch": 0.27547083442322784, + "grad_norm": 3.2802800325566994, + "learning_rate": 1.8182399469272622e-05, + "loss": 1.5393, + "step": 8425 + }, + { + "epoch": 0.2756343185979597, + "grad_norm": 3.230363163847788, + "learning_rate": 1.817911743063601e-05, + "loss": 1.4682, + "step": 8430 + }, + { + "epoch": 0.2757978027726916, + "grad_norm": 3.182246582082511, + "learning_rate": 1.8175832728261246e-05, + "loss": 1.6165, + "step": 8435 + }, + { + "epoch": 0.2759612869474235, + "grad_norm": 3.3522062454376553, + "learning_rate": 1.8172545363218078e-05, + "loss": 1.655, + "step": 8440 + }, + { + "epoch": 0.2761247711221554, + "grad_norm": 3.143620093104717, + "learning_rate": 1.8169255336577126e-05, + "loss": 1.4686, + "step": 8445 + }, + { + "epoch": 0.2762882552968873, + "grad_norm": 3.1546142043474843, + "learning_rate": 1.8165962649409865e-05, + "loss": 1.5182, + "step": 8450 + }, + { + "epoch": 0.27645173947161916, + "grad_norm": 3.41668083994076, + "learning_rate": 1.8162667302788645e-05, + "loss": 1.5686, + "step": 8455 + }, + { + "epoch": 0.27661522364635105, + "grad_norm": 3.139517432586291, + "learning_rate": 1.815936929778668e-05, + "loss": 1.5177, + "step": 8460 + }, + { + "epoch": 0.27677870782108294, + "grad_norm": 3.4091757527912168, + "learning_rate": 1.815606863547805e-05, + "loss": 1.5412, + "step": 8465 + }, + { + "epoch": 0.2769421919958148, + "grad_norm": 3.1998065375005584, + "learning_rate": 1.8152765316937697e-05, + "loss": 1.5428, + "step": 8470 + }, + { + "epoch": 0.2771056761705467, + "grad_norm": 3.2450137555767604, + "learning_rate": 1.8149459343241434e-05, + "loss": 1.541, + "step": 8475 + }, + { + "epoch": 0.2772691603452786, + "grad_norm": 3.216936625144817, + "learning_rate": 1.8146150715465934e-05, + "loss": 1.3831, + "step": 8480 + }, + { + "epoch": 0.2774326445200105, + "grad_norm": 3.1418540480977377, + "learning_rate": 1.8142839434688735e-05, + "loss": 1.4666, + "step": 8485 + }, + { + "epoch": 0.2775961286947424, + "grad_norm": 3.433305147126298, + "learning_rate": 1.8139525501988245e-05, + "loss": 1.5701, + "step": 8490 + }, + { + "epoch": 0.27775961286947426, + "grad_norm": 3.093221295569767, + "learning_rate": 1.8136208918443726e-05, + "loss": 1.6521, + "step": 8495 + }, + { + "epoch": 0.27792309704420615, + "grad_norm": 3.2739713716176007, + "learning_rate": 1.8132889685135305e-05, + "loss": 1.5186, + "step": 8500 + }, + { + "epoch": 0.27808658121893803, + "grad_norm": 3.026427459625798, + "learning_rate": 1.8129567803143982e-05, + "loss": 1.4627, + "step": 8505 + }, + { + "epoch": 0.27825006539366987, + "grad_norm": 3.178707969205626, + "learning_rate": 1.8126243273551608e-05, + "loss": 1.5805, + "step": 8510 + }, + { + "epoch": 0.27841354956840175, + "grad_norm": 2.901826757131809, + "learning_rate": 1.8122916097440904e-05, + "loss": 1.3912, + "step": 8515 + }, + { + "epoch": 0.27857703374313364, + "grad_norm": 3.357273792179429, + "learning_rate": 1.811958627589545e-05, + "loss": 1.5982, + "step": 8520 + }, + { + "epoch": 0.2787405179178655, + "grad_norm": 3.0706447930996346, + "learning_rate": 1.8116253809999684e-05, + "loss": 1.4397, + "step": 8525 + }, + { + "epoch": 0.2789040020925974, + "grad_norm": 3.5323399738478196, + "learning_rate": 1.811291870083891e-05, + "loss": 1.5869, + "step": 8530 + }, + { + "epoch": 0.2790674862673293, + "grad_norm": 3.397661036055258, + "learning_rate": 1.810958094949929e-05, + "loss": 1.566, + "step": 8535 + }, + { + "epoch": 0.2792309704420612, + "grad_norm": 3.2263503949163517, + "learning_rate": 1.8106240557067852e-05, + "loss": 1.526, + "step": 8540 + }, + { + "epoch": 0.2793944546167931, + "grad_norm": 3.3046488275373465, + "learning_rate": 1.8102897524632476e-05, + "loss": 1.6009, + "step": 8545 + }, + { + "epoch": 0.27955793879152496, + "grad_norm": 3.3468675096391416, + "learning_rate": 1.8099551853281907e-05, + "loss": 1.5845, + "step": 8550 + }, + { + "epoch": 0.27972142296625685, + "grad_norm": 3.7208770332383265, + "learning_rate": 1.8096203544105745e-05, + "loss": 1.6266, + "step": 8555 + }, + { + "epoch": 0.27988490714098874, + "grad_norm": 3.480767242246043, + "learning_rate": 1.809285259819446e-05, + "loss": 1.3924, + "step": 8560 + }, + { + "epoch": 0.2800483913157206, + "grad_norm": 3.434787931448628, + "learning_rate": 1.8089499016639363e-05, + "loss": 1.4679, + "step": 8565 + }, + { + "epoch": 0.2802118754904525, + "grad_norm": 3.3374297835464772, + "learning_rate": 1.8086142800532642e-05, + "loss": 1.4229, + "step": 8570 + }, + { + "epoch": 0.2803753596651844, + "grad_norm": 3.4199444018600587, + "learning_rate": 1.808278395096733e-05, + "loss": 1.5005, + "step": 8575 + }, + { + "epoch": 0.2805388438399163, + "grad_norm": 3.1446503620368924, + "learning_rate": 1.8079422469037324e-05, + "loss": 1.5851, + "step": 8580 + }, + { + "epoch": 0.2807023280146482, + "grad_norm": 3.3670021216606085, + "learning_rate": 1.8076058355837375e-05, + "loss": 1.5058, + "step": 8585 + }, + { + "epoch": 0.28086581218938006, + "grad_norm": 3.326881625226136, + "learning_rate": 1.8072691612463096e-05, + "loss": 1.4284, + "step": 8590 + }, + { + "epoch": 0.28102929636411195, + "grad_norm": 3.131350404890109, + "learning_rate": 1.8069322240010946e-05, + "loss": 1.5865, + "step": 8595 + }, + { + "epoch": 0.28119278053884383, + "grad_norm": 3.291901802018732, + "learning_rate": 1.806595023957825e-05, + "loss": 1.4867, + "step": 8600 + }, + { + "epoch": 0.2813562647135757, + "grad_norm": 3.446436565686949, + "learning_rate": 1.8062575612263184e-05, + "loss": 1.4959, + "step": 8605 + }, + { + "epoch": 0.2815197488883076, + "grad_norm": 3.3153886735173463, + "learning_rate": 1.8059198359164788e-05, + "loss": 1.4904, + "step": 8610 + }, + { + "epoch": 0.2816832330630395, + "grad_norm": 3.321377369992352, + "learning_rate": 1.8055818481382946e-05, + "loss": 1.4243, + "step": 8615 + }, + { + "epoch": 0.2818467172377714, + "grad_norm": 3.367268232073204, + "learning_rate": 1.80524359800184e-05, + "loss": 1.5485, + "step": 8620 + }, + { + "epoch": 0.28201020141250327, + "grad_norm": 3.082437163765988, + "learning_rate": 1.804905085617275e-05, + "loss": 1.5115, + "step": 8625 + }, + { + "epoch": 0.28217368558723516, + "grad_norm": 2.973257695985591, + "learning_rate": 1.804566311094845e-05, + "loss": 1.3943, + "step": 8630 + }, + { + "epoch": 0.28233716976196704, + "grad_norm": 3.286524096219048, + "learning_rate": 1.80422727454488e-05, + "loss": 1.4769, + "step": 8635 + }, + { + "epoch": 0.28250065393669893, + "grad_norm": 3.3088959371443036, + "learning_rate": 1.8038879760777963e-05, + "loss": 1.4464, + "step": 8640 + }, + { + "epoch": 0.2826641381114308, + "grad_norm": 3.3579307235592046, + "learning_rate": 1.803548415804095e-05, + "loss": 1.6808, + "step": 8645 + }, + { + "epoch": 0.2828276222861627, + "grad_norm": 3.3635206457481988, + "learning_rate": 1.8032085938343623e-05, + "loss": 1.578, + "step": 8650 + }, + { + "epoch": 0.2829911064608946, + "grad_norm": 3.150168078654479, + "learning_rate": 1.8028685102792708e-05, + "loss": 1.4227, + "step": 8655 + }, + { + "epoch": 0.2831545906356265, + "grad_norm": 3.2668978820681622, + "learning_rate": 1.802528165249576e-05, + "loss": 1.4707, + "step": 8660 + }, + { + "epoch": 0.28331807481035837, + "grad_norm": 3.110124554994039, + "learning_rate": 1.8021875588561212e-05, + "loss": 1.4916, + "step": 8665 + }, + { + "epoch": 0.28348155898509025, + "grad_norm": 3.2119435020780314, + "learning_rate": 1.801846691209833e-05, + "loss": 1.512, + "step": 8670 + }, + { + "epoch": 0.28364504315982214, + "grad_norm": 3.2178629395581, + "learning_rate": 1.8015055624217237e-05, + "loss": 1.6142, + "step": 8675 + }, + { + "epoch": 0.28380852733455403, + "grad_norm": 3.3422315753058096, + "learning_rate": 1.8011641726028905e-05, + "loss": 1.4639, + "step": 8680 + }, + { + "epoch": 0.2839720115092859, + "grad_norm": 3.134542764253146, + "learning_rate": 1.8008225218645153e-05, + "loss": 1.6301, + "step": 8685 + }, + { + "epoch": 0.2841354956840178, + "grad_norm": 3.371658868871349, + "learning_rate": 1.8004806103178666e-05, + "loss": 1.4539, + "step": 8690 + }, + { + "epoch": 0.2842989798587497, + "grad_norm": 3.205664023707449, + "learning_rate": 1.8001384380742953e-05, + "loss": 1.5615, + "step": 8695 + }, + { + "epoch": 0.2844624640334816, + "grad_norm": 3.0523037727975364, + "learning_rate": 1.7997960052452393e-05, + "loss": 1.5568, + "step": 8700 + }, + { + "epoch": 0.28462594820821346, + "grad_norm": 3.041854096795445, + "learning_rate": 1.79945331194222e-05, + "loss": 1.601, + "step": 8705 + }, + { + "epoch": 0.28478943238294535, + "grad_norm": 3.3448628710425843, + "learning_rate": 1.799110358276845e-05, + "loss": 1.5464, + "step": 8710 + }, + { + "epoch": 0.28495291655767724, + "grad_norm": 3.3643438855033287, + "learning_rate": 1.7987671443608056e-05, + "loss": 1.451, + "step": 8715 + }, + { + "epoch": 0.2851164007324091, + "grad_norm": 3.206543462446347, + "learning_rate": 1.7984236703058774e-05, + "loss": 1.5168, + "step": 8720 + }, + { + "epoch": 0.285279884907141, + "grad_norm": 3.176688144519156, + "learning_rate": 1.7980799362239227e-05, + "loss": 1.4362, + "step": 8725 + }, + { + "epoch": 0.2854433690818729, + "grad_norm": 3.115139154820721, + "learning_rate": 1.797735942226886e-05, + "loss": 1.4863, + "step": 8730 + }, + { + "epoch": 0.2856068532566048, + "grad_norm": 2.960359142822331, + "learning_rate": 1.7973916884267992e-05, + "loss": 1.3469, + "step": 8735 + }, + { + "epoch": 0.2857703374313367, + "grad_norm": 3.236638025692472, + "learning_rate": 1.797047174935776e-05, + "loss": 1.4287, + "step": 8740 + }, + { + "epoch": 0.2859338216060685, + "grad_norm": 3.1598321979802915, + "learning_rate": 1.7967024018660168e-05, + "loss": 1.3771, + "step": 8745 + }, + { + "epoch": 0.2860973057808004, + "grad_norm": 2.9738841993130034, + "learning_rate": 1.7963573693298054e-05, + "loss": 1.432, + "step": 8750 + }, + { + "epoch": 0.2862607899555323, + "grad_norm": 3.2761986853571634, + "learning_rate": 1.79601207743951e-05, + "loss": 1.5429, + "step": 8755 + }, + { + "epoch": 0.28642427413026417, + "grad_norm": 3.3241987206219425, + "learning_rate": 1.795666526307585e-05, + "loss": 1.4736, + "step": 8760 + }, + { + "epoch": 0.28658775830499605, + "grad_norm": 3.0405744515742463, + "learning_rate": 1.7953207160465667e-05, + "loss": 1.3803, + "step": 8765 + }, + { + "epoch": 0.28675124247972794, + "grad_norm": 3.1274916518280533, + "learning_rate": 1.7949746467690778e-05, + "loss": 1.5504, + "step": 8770 + }, + { + "epoch": 0.28691472665445983, + "grad_norm": 3.33960024895575, + "learning_rate": 1.794628318587824e-05, + "loss": 1.4323, + "step": 8775 + }, + { + "epoch": 0.2870782108291917, + "grad_norm": 3.1194544939051765, + "learning_rate": 1.7942817316155966e-05, + "loss": 1.4412, + "step": 8780 + }, + { + "epoch": 0.2872416950039236, + "grad_norm": 3.2680708539130574, + "learning_rate": 1.7939348859652695e-05, + "loss": 1.5879, + "step": 8785 + }, + { + "epoch": 0.2874051791786555, + "grad_norm": 3.0685063344039842, + "learning_rate": 1.793587781749803e-05, + "loss": 1.4533, + "step": 8790 + }, + { + "epoch": 0.2875686633533874, + "grad_norm": 3.2298570181122113, + "learning_rate": 1.7932404190822393e-05, + "loss": 1.5057, + "step": 8795 + }, + { + "epoch": 0.28773214752811926, + "grad_norm": 3.0495059773974864, + "learning_rate": 1.792892798075707e-05, + "loss": 1.5592, + "step": 8800 + }, + { + "epoch": 0.28789563170285115, + "grad_norm": 3.5121278053323786, + "learning_rate": 1.7925449188434165e-05, + "loss": 1.6001, + "step": 8805 + }, + { + "epoch": 0.28805911587758304, + "grad_norm": 3.2051822298425536, + "learning_rate": 1.7921967814986643e-05, + "loss": 1.5671, + "step": 8810 + }, + { + "epoch": 0.2882226000523149, + "grad_norm": 3.048845705313969, + "learning_rate": 1.7918483861548305e-05, + "loss": 1.4715, + "step": 8815 + }, + { + "epoch": 0.2883860842270468, + "grad_norm": 3.0616202564375237, + "learning_rate": 1.7914997329253784e-05, + "loss": 1.4419, + "step": 8820 + }, + { + "epoch": 0.2885495684017787, + "grad_norm": 3.2621789109480654, + "learning_rate": 1.791150821923856e-05, + "loss": 1.7086, + "step": 8825 + }, + { + "epoch": 0.2887130525765106, + "grad_norm": 3.1197561592168648, + "learning_rate": 1.790801653263895e-05, + "loss": 1.4919, + "step": 8830 + }, + { + "epoch": 0.2888765367512425, + "grad_norm": 3.15475538814038, + "learning_rate": 1.7904522270592113e-05, + "loss": 1.5911, + "step": 8835 + }, + { + "epoch": 0.28904002092597436, + "grad_norm": 3.078184162034494, + "learning_rate": 1.790102543423604e-05, + "loss": 1.4509, + "step": 8840 + }, + { + "epoch": 0.28920350510070625, + "grad_norm": 3.3436603398725353, + "learning_rate": 1.789752602470957e-05, + "loss": 1.5174, + "step": 8845 + }, + { + "epoch": 0.28936698927543814, + "grad_norm": 3.458045450881308, + "learning_rate": 1.7894024043152372e-05, + "loss": 1.498, + "step": 8850 + }, + { + "epoch": 0.28953047345017, + "grad_norm": 3.162625870622513, + "learning_rate": 1.7890519490704956e-05, + "loss": 1.4325, + "step": 8855 + }, + { + "epoch": 0.2896939576249019, + "grad_norm": 3.326987416864393, + "learning_rate": 1.788701236850867e-05, + "loss": 1.5401, + "step": 8860 + }, + { + "epoch": 0.2898574417996338, + "grad_norm": 3.0759307112503795, + "learning_rate": 1.7883502677705692e-05, + "loss": 1.4789, + "step": 8865 + }, + { + "epoch": 0.2900209259743657, + "grad_norm": 3.1992887451205014, + "learning_rate": 1.7879990419439054e-05, + "loss": 1.4052, + "step": 8870 + }, + { + "epoch": 0.29018441014909757, + "grad_norm": 3.1468056150000216, + "learning_rate": 1.78764755948526e-05, + "loss": 1.5394, + "step": 8875 + }, + { + "epoch": 0.29034789432382946, + "grad_norm": 3.1831802357582646, + "learning_rate": 1.7872958205091032e-05, + "loss": 1.5167, + "step": 8880 + }, + { + "epoch": 0.29051137849856135, + "grad_norm": 3.139994257781054, + "learning_rate": 1.7869438251299872e-05, + "loss": 1.521, + "step": 8885 + }, + { + "epoch": 0.29067486267329323, + "grad_norm": 3.32998765067861, + "learning_rate": 1.7865915734625484e-05, + "loss": 1.4113, + "step": 8890 + }, + { + "epoch": 0.2908383468480251, + "grad_norm": 3.1115687084860033, + "learning_rate": 1.7862390656215062e-05, + "loss": 1.5142, + "step": 8895 + }, + { + "epoch": 0.291001831022757, + "grad_norm": 3.1568752733908227, + "learning_rate": 1.7858863017216644e-05, + "loss": 1.5215, + "step": 8900 + }, + { + "epoch": 0.2911653151974889, + "grad_norm": 3.3419997568255444, + "learning_rate": 1.7855332818779095e-05, + "loss": 1.5387, + "step": 8905 + }, + { + "epoch": 0.2913287993722208, + "grad_norm": 3.057053906801254, + "learning_rate": 1.785180006205211e-05, + "loss": 1.4534, + "step": 8910 + }, + { + "epoch": 0.29149228354695267, + "grad_norm": 3.3573569804758825, + "learning_rate": 1.7848264748186223e-05, + "loss": 1.589, + "step": 8915 + }, + { + "epoch": 0.29165576772168456, + "grad_norm": 2.9612236459546954, + "learning_rate": 1.78447268783328e-05, + "loss": 1.4591, + "step": 8920 + }, + { + "epoch": 0.29181925189641644, + "grad_norm": 3.3609248900970727, + "learning_rate": 1.7841186453644036e-05, + "loss": 1.5446, + "step": 8925 + }, + { + "epoch": 0.29198273607114833, + "grad_norm": 3.0561419802164678, + "learning_rate": 1.7837643475272966e-05, + "loss": 1.488, + "step": 8930 + }, + { + "epoch": 0.2921462202458802, + "grad_norm": 3.398667224394713, + "learning_rate": 1.7834097944373446e-05, + "loss": 1.521, + "step": 8935 + }, + { + "epoch": 0.2923097044206121, + "grad_norm": 3.089888726046844, + "learning_rate": 1.7830549862100168e-05, + "loss": 1.4415, + "step": 8940 + }, + { + "epoch": 0.292473188595344, + "grad_norm": 3.2958414263839764, + "learning_rate": 1.782699922960866e-05, + "loss": 1.5059, + "step": 8945 + }, + { + "epoch": 0.2926366727700759, + "grad_norm": 3.1529600700353093, + "learning_rate": 1.7823446048055274e-05, + "loss": 1.4273, + "step": 8950 + }, + { + "epoch": 0.29280015694480777, + "grad_norm": 3.3350415942947587, + "learning_rate": 1.7819890318597194e-05, + "loss": 1.5686, + "step": 8955 + }, + { + "epoch": 0.29296364111953965, + "grad_norm": 3.054082065640839, + "learning_rate": 1.7816332042392435e-05, + "loss": 1.4815, + "step": 8960 + }, + { + "epoch": 0.29312712529427154, + "grad_norm": 3.3290839666335623, + "learning_rate": 1.7812771220599835e-05, + "loss": 1.5407, + "step": 8965 + }, + { + "epoch": 0.2932906094690034, + "grad_norm": 3.095710471330175, + "learning_rate": 1.7809207854379072e-05, + "loss": 1.4244, + "step": 8970 + }, + { + "epoch": 0.29345409364373526, + "grad_norm": 3.1159532084409336, + "learning_rate": 1.780564194489065e-05, + "loss": 1.4938, + "step": 8975 + }, + { + "epoch": 0.29361757781846715, + "grad_norm": 3.1101386469463903, + "learning_rate": 1.780207349329589e-05, + "loss": 1.4245, + "step": 8980 + }, + { + "epoch": 0.29378106199319903, + "grad_norm": 3.4895618815951894, + "learning_rate": 1.7798502500756955e-05, + "loss": 1.5769, + "step": 8985 + }, + { + "epoch": 0.2939445461679309, + "grad_norm": 3.182873154618179, + "learning_rate": 1.779492896843683e-05, + "loss": 1.4173, + "step": 8990 + }, + { + "epoch": 0.2941080303426628, + "grad_norm": 3.257388041779137, + "learning_rate": 1.7791352897499322e-05, + "loss": 1.4845, + "step": 8995 + }, + { + "epoch": 0.2942715145173947, + "grad_norm": 3.233422954247746, + "learning_rate": 1.7787774289109074e-05, + "loss": 1.5223, + "step": 9000 + }, + { + "epoch": 0.2944349986921266, + "grad_norm": 3.3943593824569187, + "learning_rate": 1.7784193144431548e-05, + "loss": 1.6072, + "step": 9005 + }, + { + "epoch": 0.29459848286685847, + "grad_norm": 3.3303688579332067, + "learning_rate": 1.7780609464633037e-05, + "loss": 1.5351, + "step": 9010 + }, + { + "epoch": 0.29476196704159036, + "grad_norm": 3.398838337401543, + "learning_rate": 1.777702325088066e-05, + "loss": 1.4838, + "step": 9015 + }, + { + "epoch": 0.29492545121632224, + "grad_norm": 3.101764673992348, + "learning_rate": 1.7773434504342354e-05, + "loss": 1.4375, + "step": 9020 + }, + { + "epoch": 0.29508893539105413, + "grad_norm": 2.9515066345625174, + "learning_rate": 1.776984322618689e-05, + "loss": 1.3659, + "step": 9025 + }, + { + "epoch": 0.295252419565786, + "grad_norm": 3.443362087811088, + "learning_rate": 1.7766249417583855e-05, + "loss": 1.5651, + "step": 9030 + }, + { + "epoch": 0.2954159037405179, + "grad_norm": 3.147486740135598, + "learning_rate": 1.7762653079703673e-05, + "loss": 1.5582, + "step": 9035 + }, + { + "epoch": 0.2955793879152498, + "grad_norm": 2.985534943502821, + "learning_rate": 1.775905421371757e-05, + "loss": 1.4388, + "step": 9040 + }, + { + "epoch": 0.2957428720899817, + "grad_norm": 3.361521930072311, + "learning_rate": 1.775545282079762e-05, + "loss": 1.3864, + "step": 9045 + }, + { + "epoch": 0.29590635626471357, + "grad_norm": 3.0680239497584183, + "learning_rate": 1.7751848902116706e-05, + "loss": 1.3969, + "step": 9050 + }, + { + "epoch": 0.29606984043944545, + "grad_norm": 3.1293881105084864, + "learning_rate": 1.7748242458848527e-05, + "loss": 1.5246, + "step": 9055 + }, + { + "epoch": 0.29623332461417734, + "grad_norm": 3.1474516482097945, + "learning_rate": 1.7744633492167626e-05, + "loss": 1.597, + "step": 9060 + }, + { + "epoch": 0.2963968087889092, + "grad_norm": 3.219412254233844, + "learning_rate": 1.774102200324935e-05, + "loss": 1.5241, + "step": 9065 + }, + { + "epoch": 0.2965602929636411, + "grad_norm": 3.083793236820213, + "learning_rate": 1.773740799326987e-05, + "loss": 1.4532, + "step": 9070 + }, + { + "epoch": 0.296723777138373, + "grad_norm": 3.075524329838125, + "learning_rate": 1.773379146340618e-05, + "loss": 1.5361, + "step": 9075 + }, + { + "epoch": 0.2968872613131049, + "grad_norm": 3.5296988767224624, + "learning_rate": 1.77301724148361e-05, + "loss": 1.5094, + "step": 9080 + }, + { + "epoch": 0.2970507454878368, + "grad_norm": 3.3135319508501633, + "learning_rate": 1.7726550848738262e-05, + "loss": 1.5264, + "step": 9085 + }, + { + "epoch": 0.29721422966256866, + "grad_norm": 3.373143161832673, + "learning_rate": 1.7722926766292124e-05, + "loss": 1.5868, + "step": 9090 + }, + { + "epoch": 0.29737771383730055, + "grad_norm": 3.27899713041071, + "learning_rate": 1.7719300168677956e-05, + "loss": 1.5333, + "step": 9095 + }, + { + "epoch": 0.29754119801203244, + "grad_norm": 3.383279540564282, + "learning_rate": 1.771567105707686e-05, + "loss": 1.5734, + "step": 9100 + }, + { + "epoch": 0.2977046821867643, + "grad_norm": 3.12980815165738, + "learning_rate": 1.771203943267074e-05, + "loss": 1.5187, + "step": 9105 + }, + { + "epoch": 0.2978681663614962, + "grad_norm": 3.3207356814513127, + "learning_rate": 1.7708405296642334e-05, + "loss": 1.5094, + "step": 9110 + }, + { + "epoch": 0.2980316505362281, + "grad_norm": 3.1672858131315107, + "learning_rate": 1.7704768650175185e-05, + "loss": 1.5567, + "step": 9115 + }, + { + "epoch": 0.29819513471096, + "grad_norm": 3.313430718175338, + "learning_rate": 1.7701129494453662e-05, + "loss": 1.5242, + "step": 9120 + }, + { + "epoch": 0.2983586188856919, + "grad_norm": 3.420789467530675, + "learning_rate": 1.769748783066295e-05, + "loss": 1.5168, + "step": 9125 + }, + { + "epoch": 0.29852210306042376, + "grad_norm": 3.3910967391222298, + "learning_rate": 1.7693843659989052e-05, + "loss": 1.487, + "step": 9130 + }, + { + "epoch": 0.29868558723515565, + "grad_norm": 3.06237053961057, + "learning_rate": 1.769019698361878e-05, + "loss": 1.523, + "step": 9135 + }, + { + "epoch": 0.29884907140988753, + "grad_norm": 3.2765556218915286, + "learning_rate": 1.768654780273977e-05, + "loss": 1.4387, + "step": 9140 + }, + { + "epoch": 0.2990125555846194, + "grad_norm": 3.1996894791503485, + "learning_rate": 1.768289611854047e-05, + "loss": 1.5896, + "step": 9145 + }, + { + "epoch": 0.2991760397593513, + "grad_norm": 3.257388882197165, + "learning_rate": 1.7679241932210147e-05, + "loss": 1.4455, + "step": 9150 + }, + { + "epoch": 0.2993395239340832, + "grad_norm": 3.061643832140933, + "learning_rate": 1.7675585244938872e-05, + "loss": 1.6452, + "step": 9155 + }, + { + "epoch": 0.2995030081088151, + "grad_norm": 3.272575828492733, + "learning_rate": 1.767192605791755e-05, + "loss": 1.371, + "step": 9160 + }, + { + "epoch": 0.29966649228354697, + "grad_norm": 2.99270586634148, + "learning_rate": 1.7668264372337875e-05, + "loss": 1.4514, + "step": 9165 + }, + { + "epoch": 0.29982997645827886, + "grad_norm": 3.4835072719044295, + "learning_rate": 1.7664600189392383e-05, + "loss": 1.5838, + "step": 9170 + }, + { + "epoch": 0.29999346063301074, + "grad_norm": 3.088172830208018, + "learning_rate": 1.7660933510274395e-05, + "loss": 1.3955, + "step": 9175 + }, + { + "epoch": 0.30015694480774263, + "grad_norm": 2.8465432876036485, + "learning_rate": 1.765726433617807e-05, + "loss": 1.4448, + "step": 9180 + }, + { + "epoch": 0.3003204289824745, + "grad_norm": 3.127226149723255, + "learning_rate": 1.7653592668298358e-05, + "loss": 1.5167, + "step": 9185 + }, + { + "epoch": 0.3004839131572064, + "grad_norm": 3.4991951552514475, + "learning_rate": 1.764991850783104e-05, + "loss": 1.5331, + "step": 9190 + }, + { + "epoch": 0.3006473973319383, + "grad_norm": 3.5276895337471523, + "learning_rate": 1.764624185597269e-05, + "loss": 1.4764, + "step": 9195 + }, + { + "epoch": 0.3008108815066702, + "grad_norm": 2.7199168682146317, + "learning_rate": 1.7642562713920716e-05, + "loss": 1.4146, + "step": 9200 + }, + { + "epoch": 0.30097436568140207, + "grad_norm": 3.5635326842558332, + "learning_rate": 1.7638881082873317e-05, + "loss": 1.5445, + "step": 9205 + }, + { + "epoch": 0.3011378498561339, + "grad_norm": 3.221311808388296, + "learning_rate": 1.763519696402951e-05, + "loss": 1.4357, + "step": 9210 + }, + { + "epoch": 0.3013013340308658, + "grad_norm": 3.3335603509267093, + "learning_rate": 1.763151035858912e-05, + "loss": 1.4085, + "step": 9215 + }, + { + "epoch": 0.3014648182055977, + "grad_norm": 3.4962733784588376, + "learning_rate": 1.7627821267752795e-05, + "loss": 1.5131, + "step": 9220 + }, + { + "epoch": 0.30162830238032956, + "grad_norm": 3.261733796385961, + "learning_rate": 1.7624129692721968e-05, + "loss": 1.5548, + "step": 9225 + }, + { + "epoch": 0.30179178655506145, + "grad_norm": 3.2907775106047232, + "learning_rate": 1.76204356346989e-05, + "loss": 1.5278, + "step": 9230 + }, + { + "epoch": 0.30195527072979333, + "grad_norm": 2.9188256022200396, + "learning_rate": 1.761673909488666e-05, + "loss": 1.4975, + "step": 9235 + }, + { + "epoch": 0.3021187549045252, + "grad_norm": 3.090425141255695, + "learning_rate": 1.761304007448911e-05, + "loss": 1.4246, + "step": 9240 + }, + { + "epoch": 0.3022822390792571, + "grad_norm": 3.4299745752279533, + "learning_rate": 1.760933857471094e-05, + "loss": 1.5566, + "step": 9245 + }, + { + "epoch": 0.302445723253989, + "grad_norm": 2.9680894419974613, + "learning_rate": 1.760563459675763e-05, + "loss": 1.4588, + "step": 9250 + }, + { + "epoch": 0.3026092074287209, + "grad_norm": 3.464184810100851, + "learning_rate": 1.7601928141835486e-05, + "loss": 1.6071, + "step": 9255 + }, + { + "epoch": 0.30277269160345277, + "grad_norm": 2.826049657269779, + "learning_rate": 1.75982192111516e-05, + "loss": 1.316, + "step": 9260 + }, + { + "epoch": 0.30293617577818466, + "grad_norm": 2.98403382045161, + "learning_rate": 1.759450780591388e-05, + "loss": 1.5959, + "step": 9265 + }, + { + "epoch": 0.30309965995291654, + "grad_norm": 2.7721428762790308, + "learning_rate": 1.7590793927331046e-05, + "loss": 1.5458, + "step": 9270 + }, + { + "epoch": 0.30326314412764843, + "grad_norm": 3.1521686955703747, + "learning_rate": 1.7587077576612607e-05, + "loss": 1.5283, + "step": 9275 + }, + { + "epoch": 0.3034266283023803, + "grad_norm": 3.1545147341911757, + "learning_rate": 1.75833587549689e-05, + "loss": 1.5266, + "step": 9280 + }, + { + "epoch": 0.3035901124771122, + "grad_norm": 3.2206282410451865, + "learning_rate": 1.7579637463611047e-05, + "loss": 1.6011, + "step": 9285 + }, + { + "epoch": 0.3037535966518441, + "grad_norm": 3.11502310984267, + "learning_rate": 1.757591370375098e-05, + "loss": 1.4106, + "step": 9290 + }, + { + "epoch": 0.303917080826576, + "grad_norm": 3.3784063042872785, + "learning_rate": 1.757218747660144e-05, + "loss": 1.5143, + "step": 9295 + }, + { + "epoch": 0.30408056500130787, + "grad_norm": 3.515021227311433, + "learning_rate": 1.7568458783375963e-05, + "loss": 1.4661, + "step": 9300 + }, + { + "epoch": 0.30424404917603975, + "grad_norm": 3.2674896081797384, + "learning_rate": 1.75647276252889e-05, + "loss": 1.4586, + "step": 9305 + }, + { + "epoch": 0.30440753335077164, + "grad_norm": 3.561417600598195, + "learning_rate": 1.7560994003555394e-05, + "loss": 1.4773, + "step": 9310 + }, + { + "epoch": 0.30457101752550353, + "grad_norm": 3.088930355481389, + "learning_rate": 1.7557257919391392e-05, + "loss": 1.4371, + "step": 9315 + }, + { + "epoch": 0.3047345017002354, + "grad_norm": 3.126327986384523, + "learning_rate": 1.755351937401365e-05, + "loss": 1.5051, + "step": 9320 + }, + { + "epoch": 0.3048979858749673, + "grad_norm": 3.315387527946429, + "learning_rate": 1.754977836863972e-05, + "loss": 1.3644, + "step": 9325 + }, + { + "epoch": 0.3050614700496992, + "grad_norm": 3.0759854116124745, + "learning_rate": 1.754603490448795e-05, + "loss": 1.4382, + "step": 9330 + }, + { + "epoch": 0.3052249542244311, + "grad_norm": 3.2190801465685595, + "learning_rate": 1.75422889827775e-05, + "loss": 1.4999, + "step": 9335 + }, + { + "epoch": 0.30538843839916296, + "grad_norm": 3.1738928664330195, + "learning_rate": 1.7538540604728325e-05, + "loss": 1.5296, + "step": 9340 + }, + { + "epoch": 0.30555192257389485, + "grad_norm": 3.8572732836582744, + "learning_rate": 1.7534789771561177e-05, + "loss": 1.5555, + "step": 9345 + }, + { + "epoch": 0.30571540674862674, + "grad_norm": 3.2400670768687547, + "learning_rate": 1.7531036484497608e-05, + "loss": 1.5881, + "step": 9350 + }, + { + "epoch": 0.3058788909233586, + "grad_norm": 3.1795850476282412, + "learning_rate": 1.7527280744759983e-05, + "loss": 1.5555, + "step": 9355 + }, + { + "epoch": 0.3060423750980905, + "grad_norm": 3.247502128102033, + "learning_rate": 1.7523522553571443e-05, + "loss": 1.6942, + "step": 9360 + }, + { + "epoch": 0.3062058592728224, + "grad_norm": 3.1011492740441144, + "learning_rate": 1.751976191215594e-05, + "loss": 1.4895, + "step": 9365 + }, + { + "epoch": 0.3063693434475543, + "grad_norm": 3.2403301052715183, + "learning_rate": 1.7515998821738227e-05, + "loss": 1.448, + "step": 9370 + }, + { + "epoch": 0.3065328276222862, + "grad_norm": 3.292173179975352, + "learning_rate": 1.751223328354385e-05, + "loss": 1.5736, + "step": 9375 + }, + { + "epoch": 0.30669631179701806, + "grad_norm": 3.1209018191390974, + "learning_rate": 1.750846529879915e-05, + "loss": 1.5272, + "step": 9380 + }, + { + "epoch": 0.30685979597174995, + "grad_norm": 3.1149161448571956, + "learning_rate": 1.750469486873127e-05, + "loss": 1.4324, + "step": 9385 + }, + { + "epoch": 0.30702328014648184, + "grad_norm": 3.5214469268378163, + "learning_rate": 1.7500921994568144e-05, + "loss": 1.5156, + "step": 9390 + }, + { + "epoch": 0.3071867643212137, + "grad_norm": 3.0480961785510536, + "learning_rate": 1.7497146677538505e-05, + "loss": 1.5695, + "step": 9395 + }, + { + "epoch": 0.3073502484959456, + "grad_norm": 3.183326381693171, + "learning_rate": 1.7493368918871885e-05, + "loss": 1.454, + "step": 9400 + }, + { + "epoch": 0.3075137326706775, + "grad_norm": 3.43995543958027, + "learning_rate": 1.7489588719798603e-05, + "loss": 1.5046, + "step": 9405 + }, + { + "epoch": 0.3076772168454094, + "grad_norm": 3.245083560353171, + "learning_rate": 1.748580608154978e-05, + "loss": 1.451, + "step": 9410 + }, + { + "epoch": 0.30784070102014127, + "grad_norm": 3.3349956860055707, + "learning_rate": 1.7482021005357325e-05, + "loss": 1.5876, + "step": 9415 + }, + { + "epoch": 0.30800418519487316, + "grad_norm": 3.183997677975433, + "learning_rate": 1.747823349245395e-05, + "loss": 1.4272, + "step": 9420 + }, + { + "epoch": 0.30816766936960505, + "grad_norm": 2.93817155022159, + "learning_rate": 1.747444354407315e-05, + "loss": 1.503, + "step": 9425 + }, + { + "epoch": 0.30833115354433693, + "grad_norm": 3.2203770734851496, + "learning_rate": 1.7470651161449218e-05, + "loss": 1.4233, + "step": 9430 + }, + { + "epoch": 0.3084946377190688, + "grad_norm": 3.1572735428932477, + "learning_rate": 1.7466856345817244e-05, + "loss": 1.4861, + "step": 9435 + }, + { + "epoch": 0.3086581218938007, + "grad_norm": 3.0701985177142883, + "learning_rate": 1.746305909841311e-05, + "loss": 1.4215, + "step": 9440 + }, + { + "epoch": 0.30882160606853254, + "grad_norm": 3.0741484135768418, + "learning_rate": 1.7459259420473476e-05, + "loss": 1.4666, + "step": 9445 + }, + { + "epoch": 0.3089850902432644, + "grad_norm": 3.2018826352177783, + "learning_rate": 1.7455457313235814e-05, + "loss": 1.5255, + "step": 9450 + }, + { + "epoch": 0.3091485744179963, + "grad_norm": 3.1772983719279124, + "learning_rate": 1.745165277793837e-05, + "loss": 1.5367, + "step": 9455 + }, + { + "epoch": 0.3093120585927282, + "grad_norm": 3.7369632125878254, + "learning_rate": 1.744784581582019e-05, + "loss": 1.6068, + "step": 9460 + }, + { + "epoch": 0.3094755427674601, + "grad_norm": 3.11792502025225, + "learning_rate": 1.744403642812111e-05, + "loss": 1.5873, + "step": 9465 + }, + { + "epoch": 0.309639026942192, + "grad_norm": 3.11114753389884, + "learning_rate": 1.7440224616081752e-05, + "loss": 1.3782, + "step": 9470 + }, + { + "epoch": 0.30980251111692386, + "grad_norm": 3.1617116658309827, + "learning_rate": 1.7436410380943532e-05, + "loss": 1.5423, + "step": 9475 + }, + { + "epoch": 0.30996599529165575, + "grad_norm": 3.367023935042449, + "learning_rate": 1.743259372394865e-05, + "loss": 1.6165, + "step": 9480 + }, + { + "epoch": 0.31012947946638764, + "grad_norm": 3.2775121949176405, + "learning_rate": 1.7428774646340102e-05, + "loss": 1.5446, + "step": 9485 + }, + { + "epoch": 0.3102929636411195, + "grad_norm": 3.3778997330964846, + "learning_rate": 1.7424953149361665e-05, + "loss": 1.4895, + "step": 9490 + }, + { + "epoch": 0.3104564478158514, + "grad_norm": 3.4814562838282477, + "learning_rate": 1.7421129234257906e-05, + "loss": 1.6424, + "step": 9495 + }, + { + "epoch": 0.3106199319905833, + "grad_norm": 3.3444746067783546, + "learning_rate": 1.7417302902274182e-05, + "loss": 1.5832, + "step": 9500 + }, + { + "epoch": 0.3107834161653152, + "grad_norm": 3.3176676434397314, + "learning_rate": 1.7413474154656636e-05, + "loss": 1.5468, + "step": 9505 + }, + { + "epoch": 0.31094690034004707, + "grad_norm": 3.067551528283974, + "learning_rate": 1.7409642992652197e-05, + "loss": 1.4021, + "step": 9510 + }, + { + "epoch": 0.31111038451477896, + "grad_norm": 3.194115505624019, + "learning_rate": 1.7405809417508584e-05, + "loss": 1.4544, + "step": 9515 + }, + { + "epoch": 0.31127386868951085, + "grad_norm": 3.130813375427652, + "learning_rate": 1.740197343047429e-05, + "loss": 1.5949, + "step": 9520 + }, + { + "epoch": 0.31143735286424273, + "grad_norm": 3.091598486214153, + "learning_rate": 1.7398135032798608e-05, + "loss": 1.4737, + "step": 9525 + }, + { + "epoch": 0.3116008370389746, + "grad_norm": 3.2294565315167016, + "learning_rate": 1.7394294225731608e-05, + "loss": 1.5358, + "step": 9530 + }, + { + "epoch": 0.3117643212137065, + "grad_norm": 3.1933195095259204, + "learning_rate": 1.739045101052415e-05, + "loss": 1.516, + "step": 9535 + }, + { + "epoch": 0.3119278053884384, + "grad_norm": 3.1493798377828006, + "learning_rate": 1.7386605388427874e-05, + "loss": 1.49, + "step": 9540 + }, + { + "epoch": 0.3120912895631703, + "grad_norm": 3.090990448586715, + "learning_rate": 1.73827573606952e-05, + "loss": 1.4605, + "step": 9545 + }, + { + "epoch": 0.31225477373790217, + "grad_norm": 3.0236556458088573, + "learning_rate": 1.737890692857934e-05, + "loss": 1.4551, + "step": 9550 + }, + { + "epoch": 0.31241825791263406, + "grad_norm": 3.3274551348234467, + "learning_rate": 1.7375054093334286e-05, + "loss": 1.6037, + "step": 9555 + }, + { + "epoch": 0.31258174208736594, + "grad_norm": 3.3276908409969126, + "learning_rate": 1.7371198856214813e-05, + "loss": 1.4721, + "step": 9560 + }, + { + "epoch": 0.31274522626209783, + "grad_norm": 2.8515767798768983, + "learning_rate": 1.7367341218476476e-05, + "loss": 1.3737, + "step": 9565 + }, + { + "epoch": 0.3129087104368297, + "grad_norm": 3.2022816486513666, + "learning_rate": 1.736348118137561e-05, + "loss": 1.3955, + "step": 9570 + }, + { + "epoch": 0.3130721946115616, + "grad_norm": 4.33308928233594, + "learning_rate": 1.7359618746169343e-05, + "loss": 1.6214, + "step": 9575 + }, + { + "epoch": 0.3132356787862935, + "grad_norm": 3.1855520035897764, + "learning_rate": 1.7355753914115563e-05, + "loss": 1.5403, + "step": 9580 + }, + { + "epoch": 0.3133991629610254, + "grad_norm": 3.2320260343573124, + "learning_rate": 1.7351886686472964e-05, + "loss": 1.5621, + "step": 9585 + }, + { + "epoch": 0.31356264713575727, + "grad_norm": 3.1593768581874224, + "learning_rate": 1.7348017064500994e-05, + "loss": 1.4522, + "step": 9590 + }, + { + "epoch": 0.31372613131048915, + "grad_norm": 3.267170053521526, + "learning_rate": 1.7344145049459906e-05, + "loss": 1.4309, + "step": 9595 + }, + { + "epoch": 0.31388961548522104, + "grad_norm": 3.334143571862588, + "learning_rate": 1.7340270642610716e-05, + "loss": 1.4301, + "step": 9600 + }, + { + "epoch": 0.31405309965995293, + "grad_norm": 3.0361400091097774, + "learning_rate": 1.733639384521522e-05, + "loss": 1.423, + "step": 9605 + }, + { + "epoch": 0.3142165838346848, + "grad_norm": 3.148476659186662, + "learning_rate": 1.7332514658536003e-05, + "loss": 1.4635, + "step": 9610 + }, + { + "epoch": 0.3143800680094167, + "grad_norm": 3.285788867247565, + "learning_rate": 1.7328633083836413e-05, + "loss": 1.501, + "step": 9615 + }, + { + "epoch": 0.3145435521841486, + "grad_norm": 3.2392215532942603, + "learning_rate": 1.7324749122380592e-05, + "loss": 1.4405, + "step": 9620 + }, + { + "epoch": 0.3147070363588805, + "grad_norm": 3.182915741143621, + "learning_rate": 1.7320862775433443e-05, + "loss": 1.3714, + "step": 9625 + }, + { + "epoch": 0.31487052053361236, + "grad_norm": 2.914694435227451, + "learning_rate": 1.7316974044260662e-05, + "loss": 1.5102, + "step": 9630 + }, + { + "epoch": 0.31503400470834425, + "grad_norm": 3.2829781426968054, + "learning_rate": 1.7313082930128705e-05, + "loss": 1.4529, + "step": 9635 + }, + { + "epoch": 0.31519748888307614, + "grad_norm": 3.250590959854691, + "learning_rate": 1.730918943430482e-05, + "loss": 1.3818, + "step": 9640 + }, + { + "epoch": 0.315360973057808, + "grad_norm": 3.046556686922147, + "learning_rate": 1.730529355805702e-05, + "loss": 1.517, + "step": 9645 + }, + { + "epoch": 0.3155244572325399, + "grad_norm": 3.1116738587053945, + "learning_rate": 1.7301395302654094e-05, + "loss": 1.4898, + "step": 9650 + }, + { + "epoch": 0.3156879414072718, + "grad_norm": 3.0315187115445426, + "learning_rate": 1.7297494669365617e-05, + "loss": 1.5678, + "step": 9655 + }, + { + "epoch": 0.3158514255820037, + "grad_norm": 3.1923057432434216, + "learning_rate": 1.7293591659461918e-05, + "loss": 1.4323, + "step": 9660 + }, + { + "epoch": 0.3160149097567356, + "grad_norm": 3.075919904586659, + "learning_rate": 1.7289686274214116e-05, + "loss": 1.5273, + "step": 9665 + }, + { + "epoch": 0.31617839393146746, + "grad_norm": 3.279251973036014, + "learning_rate": 1.7285778514894103e-05, + "loss": 1.5981, + "step": 9670 + }, + { + "epoch": 0.3163418781061993, + "grad_norm": 3.32190130988125, + "learning_rate": 1.728186838277454e-05, + "loss": 1.4494, + "step": 9675 + }, + { + "epoch": 0.3165053622809312, + "grad_norm": 3.614444445053527, + "learning_rate": 1.7277955879128855e-05, + "loss": 1.4628, + "step": 9680 + }, + { + "epoch": 0.31666884645566307, + "grad_norm": 3.5636886657132143, + "learning_rate": 1.7274041005231262e-05, + "loss": 1.5782, + "step": 9685 + }, + { + "epoch": 0.31683233063039495, + "grad_norm": 3.1919112310024715, + "learning_rate": 1.7270123762356733e-05, + "loss": 1.5243, + "step": 9690 + }, + { + "epoch": 0.31699581480512684, + "grad_norm": 3.282897006278805, + "learning_rate": 1.726620415178102e-05, + "loss": 1.3904, + "step": 9695 + }, + { + "epoch": 0.31715929897985873, + "grad_norm": 3.3477028757392304, + "learning_rate": 1.7262282174780654e-05, + "loss": 1.5404, + "step": 9700 + }, + { + "epoch": 0.3173227831545906, + "grad_norm": 3.2151149237623815, + "learning_rate": 1.7258357832632912e-05, + "loss": 1.5409, + "step": 9705 + }, + { + "epoch": 0.3174862673293225, + "grad_norm": 3.4188821434525085, + "learning_rate": 1.7254431126615864e-05, + "loss": 1.4855, + "step": 9710 + }, + { + "epoch": 0.3176497515040544, + "grad_norm": 3.8506555448671023, + "learning_rate": 1.725050205800834e-05, + "loss": 1.6415, + "step": 9715 + }, + { + "epoch": 0.3178132356787863, + "grad_norm": 2.9921457163928964, + "learning_rate": 1.7246570628089943e-05, + "loss": 1.4537, + "step": 9720 + }, + { + "epoch": 0.31797671985351816, + "grad_norm": 3.108124528041241, + "learning_rate": 1.7242636838141038e-05, + "loss": 1.4185, + "step": 9725 + }, + { + "epoch": 0.31814020402825005, + "grad_norm": 3.2583338263127164, + "learning_rate": 1.723870068944277e-05, + "loss": 1.6226, + "step": 9730 + }, + { + "epoch": 0.31830368820298194, + "grad_norm": 3.408495975610544, + "learning_rate": 1.7234762183277044e-05, + "loss": 1.4037, + "step": 9735 + }, + { + "epoch": 0.3184671723777138, + "grad_norm": 3.4044766635018884, + "learning_rate": 1.7230821320926535e-05, + "loss": 1.4944, + "step": 9740 + }, + { + "epoch": 0.3186306565524457, + "grad_norm": 3.2487791268004305, + "learning_rate": 1.722687810367469e-05, + "loss": 1.5479, + "step": 9745 + }, + { + "epoch": 0.3187941407271776, + "grad_norm": 3.246976486482661, + "learning_rate": 1.7222932532805708e-05, + "loss": 1.4898, + "step": 9750 + }, + { + "epoch": 0.3189576249019095, + "grad_norm": 3.259583107307138, + "learning_rate": 1.7218984609604576e-05, + "loss": 1.5477, + "step": 9755 + }, + { + "epoch": 0.3191211090766414, + "grad_norm": 3.389254739220679, + "learning_rate": 1.7215034335357024e-05, + "loss": 1.6829, + "step": 9760 + }, + { + "epoch": 0.31928459325137326, + "grad_norm": 3.0879565836138925, + "learning_rate": 1.721108171134957e-05, + "loss": 1.5233, + "step": 9765 + }, + { + "epoch": 0.31944807742610515, + "grad_norm": 3.3985515415107304, + "learning_rate": 1.720712673886948e-05, + "loss": 1.5903, + "step": 9770 + }, + { + "epoch": 0.31961156160083704, + "grad_norm": 3.1981071035872786, + "learning_rate": 1.7203169419204798e-05, + "loss": 1.3987, + "step": 9775 + }, + { + "epoch": 0.3197750457755689, + "grad_norm": 3.332690581317636, + "learning_rate": 1.719920975364432e-05, + "loss": 1.5665, + "step": 9780 + }, + { + "epoch": 0.3199385299503008, + "grad_norm": 3.2143082673389793, + "learning_rate": 1.7195247743477616e-05, + "loss": 1.4402, + "step": 9785 + }, + { + "epoch": 0.3201020141250327, + "grad_norm": 3.051384918087978, + "learning_rate": 1.7191283389995012e-05, + "loss": 1.679, + "step": 9790 + }, + { + "epoch": 0.3202654982997646, + "grad_norm": 3.251681501790158, + "learning_rate": 1.7187316694487603e-05, + "loss": 1.4023, + "step": 9795 + }, + { + "epoch": 0.32042898247449647, + "grad_norm": 3.1519356490341672, + "learning_rate": 1.7183347658247244e-05, + "loss": 1.5267, + "step": 9800 + }, + { + "epoch": 0.32059246664922836, + "grad_norm": 3.048487919097891, + "learning_rate": 1.7179376282566557e-05, + "loss": 1.4333, + "step": 9805 + }, + { + "epoch": 0.32075595082396025, + "grad_norm": 3.1285393756183506, + "learning_rate": 1.7175402568738914e-05, + "loss": 1.4866, + "step": 9810 + }, + { + "epoch": 0.32091943499869213, + "grad_norm": 3.160834725177182, + "learning_rate": 1.7171426518058463e-05, + "loss": 1.4073, + "step": 9815 + }, + { + "epoch": 0.321082919173424, + "grad_norm": 2.9871049436247885, + "learning_rate": 1.7167448131820104e-05, + "loss": 1.5188, + "step": 9820 + }, + { + "epoch": 0.3212464033481559, + "grad_norm": 3.0819165563859183, + "learning_rate": 1.7163467411319496e-05, + "loss": 1.5706, + "step": 9825 + }, + { + "epoch": 0.3214098875228878, + "grad_norm": 3.1840559920444433, + "learning_rate": 1.715948435785307e-05, + "loss": 1.4948, + "step": 9830 + }, + { + "epoch": 0.3215733716976197, + "grad_norm": 3.5916444880835967, + "learning_rate": 1.7155498972718e-05, + "loss": 1.5044, + "step": 9835 + }, + { + "epoch": 0.32173685587235157, + "grad_norm": 3.2300651412481516, + "learning_rate": 1.7151511257212234e-05, + "loss": 1.5001, + "step": 9840 + }, + { + "epoch": 0.32190034004708346, + "grad_norm": 3.291857920615709, + "learning_rate": 1.7147521212634474e-05, + "loss": 1.5692, + "step": 9845 + }, + { + "epoch": 0.32206382422181534, + "grad_norm": 3.1698939014872765, + "learning_rate": 1.7143528840284178e-05, + "loss": 1.4301, + "step": 9850 + }, + { + "epoch": 0.32222730839654723, + "grad_norm": 2.9738791494976806, + "learning_rate": 1.7139534141461564e-05, + "loss": 1.4009, + "step": 9855 + }, + { + "epoch": 0.3223907925712791, + "grad_norm": 3.235377376587802, + "learning_rate": 1.7135537117467603e-05, + "loss": 1.5778, + "step": 9860 + }, + { + "epoch": 0.322554276746011, + "grad_norm": 3.113444410513833, + "learning_rate": 1.7131537769604037e-05, + "loss": 1.2855, + "step": 9865 + }, + { + "epoch": 0.3227177609207429, + "grad_norm": 3.247576115877292, + "learning_rate": 1.712753609917335e-05, + "loss": 1.5492, + "step": 9870 + }, + { + "epoch": 0.3228812450954748, + "grad_norm": 3.179516704235053, + "learning_rate": 1.7123532107478785e-05, + "loss": 1.4876, + "step": 9875 + }, + { + "epoch": 0.32304472927020667, + "grad_norm": 3.377235054099559, + "learning_rate": 1.7119525795824353e-05, + "loss": 1.4777, + "step": 9880 + }, + { + "epoch": 0.32320821344493855, + "grad_norm": 3.0554307796621742, + "learning_rate": 1.71155171655148e-05, + "loss": 1.3731, + "step": 9885 + }, + { + "epoch": 0.32337169761967044, + "grad_norm": 3.0531924554184915, + "learning_rate": 1.7111506217855648e-05, + "loss": 1.4653, + "step": 9890 + }, + { + "epoch": 0.3235351817944023, + "grad_norm": 3.1888484004617696, + "learning_rate": 1.7107492954153162e-05, + "loss": 1.5063, + "step": 9895 + }, + { + "epoch": 0.3236986659691342, + "grad_norm": 3.1544471125075315, + "learning_rate": 1.7103477375714363e-05, + "loss": 1.5097, + "step": 9900 + }, + { + "epoch": 0.3238621501438661, + "grad_norm": 3.2091185829701057, + "learning_rate": 1.7099459483847024e-05, + "loss": 1.5595, + "step": 9905 + }, + { + "epoch": 0.32402563431859793, + "grad_norm": 3.1820477961153064, + "learning_rate": 1.7095439279859678e-05, + "loss": 1.5056, + "step": 9910 + }, + { + "epoch": 0.3241891184933298, + "grad_norm": 3.267984190839669, + "learning_rate": 1.7091416765061602e-05, + "loss": 1.393, + "step": 9915 + }, + { + "epoch": 0.3243526026680617, + "grad_norm": 2.9008922943443425, + "learning_rate": 1.7087391940762842e-05, + "loss": 1.3738, + "step": 9920 + }, + { + "epoch": 0.3245160868427936, + "grad_norm": 3.031815144800599, + "learning_rate": 1.708336480827417e-05, + "loss": 1.3624, + "step": 9925 + }, + { + "epoch": 0.3246795710175255, + "grad_norm": 3.3077436746495703, + "learning_rate": 1.707933536890713e-05, + "loss": 1.6016, + "step": 9930 + }, + { + "epoch": 0.32484305519225737, + "grad_norm": 3.026092959135563, + "learning_rate": 1.7075303623974018e-05, + "loss": 1.4546, + "step": 9935 + }, + { + "epoch": 0.32500653936698926, + "grad_norm": 3.0891570870587275, + "learning_rate": 1.7071269574787863e-05, + "loss": 1.6568, + "step": 9940 + }, + { + "epoch": 0.32517002354172114, + "grad_norm": 3.0228689107043873, + "learning_rate": 1.7067233222662466e-05, + "loss": 1.3255, + "step": 9945 + }, + { + "epoch": 0.32533350771645303, + "grad_norm": 3.2824882939238385, + "learning_rate": 1.7063194568912362e-05, + "loss": 1.557, + "step": 9950 + }, + { + "epoch": 0.3254969918911849, + "grad_norm": 3.1605155368582096, + "learning_rate": 1.7059153614852847e-05, + "loss": 1.5026, + "step": 9955 + }, + { + "epoch": 0.3256604760659168, + "grad_norm": 3.046758186255955, + "learning_rate": 1.705511036179995e-05, + "loss": 1.4498, + "step": 9960 + }, + { + "epoch": 0.3258239602406487, + "grad_norm": 3.0317810301964174, + "learning_rate": 1.7051064811070474e-05, + "loss": 1.5327, + "step": 9965 + }, + { + "epoch": 0.3259874444153806, + "grad_norm": 3.2150616745574254, + "learning_rate": 1.7047016963981948e-05, + "loss": 1.5375, + "step": 9970 + }, + { + "epoch": 0.32615092859011247, + "grad_norm": 3.25542296991951, + "learning_rate": 1.7042966821852653e-05, + "loss": 1.4745, + "step": 9975 + }, + { + "epoch": 0.32631441276484435, + "grad_norm": 3.1734582424553226, + "learning_rate": 1.7038914386001627e-05, + "loss": 1.3606, + "step": 9980 + }, + { + "epoch": 0.32647789693957624, + "grad_norm": 3.2136459272394386, + "learning_rate": 1.703485965774865e-05, + "loss": 1.4887, + "step": 9985 + }, + { + "epoch": 0.3266413811143081, + "grad_norm": 3.1930011861377805, + "learning_rate": 1.703080263841424e-05, + "loss": 1.4678, + "step": 9990 + }, + { + "epoch": 0.32680486528904, + "grad_norm": 3.431041475445151, + "learning_rate": 1.7026743329319676e-05, + "loss": 1.3887, + "step": 9995 + }, + { + "epoch": 0.3269683494637719, + "grad_norm": 3.223919232804425, + "learning_rate": 1.7022681731786973e-05, + "loss": 1.5288, + "step": 10000 + }, + { + "epoch": 0.3271318336385038, + "grad_norm": 3.277806341984485, + "learning_rate": 1.7018617847138897e-05, + "loss": 1.457, + "step": 10005 + }, + { + "epoch": 0.3272953178132357, + "grad_norm": 3.221283276354996, + "learning_rate": 1.7014551676698945e-05, + "loss": 1.4979, + "step": 10010 + }, + { + "epoch": 0.32745880198796756, + "grad_norm": 2.93644669041101, + "learning_rate": 1.701048322179139e-05, + "loss": 1.4855, + "step": 10015 + }, + { + "epoch": 0.32762228616269945, + "grad_norm": 3.2509698509654252, + "learning_rate": 1.70064124837412e-05, + "loss": 1.6069, + "step": 10020 + }, + { + "epoch": 0.32778577033743134, + "grad_norm": 2.927797059542785, + "learning_rate": 1.700233946387414e-05, + "loss": 1.4484, + "step": 10025 + }, + { + "epoch": 0.3279492545121632, + "grad_norm": 3.22368860152724, + "learning_rate": 1.699826416351668e-05, + "loss": 1.5032, + "step": 10030 + }, + { + "epoch": 0.3281127386868951, + "grad_norm": 3.3486005540174197, + "learning_rate": 1.6994186583996043e-05, + "loss": 1.5193, + "step": 10035 + }, + { + "epoch": 0.328276222861627, + "grad_norm": 3.005222964405026, + "learning_rate": 1.6990106726640206e-05, + "loss": 1.4387, + "step": 10040 + }, + { + "epoch": 0.3284397070363589, + "grad_norm": 3.275554268082243, + "learning_rate": 1.6986024592777873e-05, + "loss": 1.5324, + "step": 10045 + }, + { + "epoch": 0.3286031912110908, + "grad_norm": 3.091806579832144, + "learning_rate": 1.6981940183738496e-05, + "loss": 1.4223, + "step": 10050 + }, + { + "epoch": 0.32876667538582266, + "grad_norm": 3.4120123946114664, + "learning_rate": 1.697785350085227e-05, + "loss": 1.4911, + "step": 10055 + }, + { + "epoch": 0.32893015956055455, + "grad_norm": 2.8287623570965867, + "learning_rate": 1.697376454545012e-05, + "loss": 1.267, + "step": 10060 + }, + { + "epoch": 0.32909364373528643, + "grad_norm": 2.8496839000492655, + "learning_rate": 1.6969673318863727e-05, + "loss": 1.4673, + "step": 10065 + }, + { + "epoch": 0.3292571279100183, + "grad_norm": 3.3597053874168554, + "learning_rate": 1.6965579822425497e-05, + "loss": 1.5276, + "step": 10070 + }, + { + "epoch": 0.3294206120847502, + "grad_norm": 3.362543770386295, + "learning_rate": 1.696148405746858e-05, + "loss": 1.5592, + "step": 10075 + }, + { + "epoch": 0.3295840962594821, + "grad_norm": 3.0214733377833, + "learning_rate": 1.6957386025326878e-05, + "loss": 1.5518, + "step": 10080 + }, + { + "epoch": 0.329747580434214, + "grad_norm": 3.301688217136061, + "learning_rate": 1.6953285727335006e-05, + "loss": 1.5136, + "step": 10085 + }, + { + "epoch": 0.32991106460894587, + "grad_norm": 3.240911161914824, + "learning_rate": 1.6949183164828336e-05, + "loss": 1.5411, + "step": 10090 + }, + { + "epoch": 0.33007454878367776, + "grad_norm": 3.211959791682164, + "learning_rate": 1.6945078339142974e-05, + "loss": 1.4437, + "step": 10095 + }, + { + "epoch": 0.33023803295840964, + "grad_norm": 3.3196407116070943, + "learning_rate": 1.6940971251615762e-05, + "loss": 1.492, + "step": 10100 + }, + { + "epoch": 0.33040151713314153, + "grad_norm": 3.183647447084887, + "learning_rate": 1.6936861903584276e-05, + "loss": 1.5864, + "step": 10105 + }, + { + "epoch": 0.3305650013078734, + "grad_norm": 3.0511607795969478, + "learning_rate": 1.6932750296386825e-05, + "loss": 1.6188, + "step": 10110 + }, + { + "epoch": 0.3307284854826053, + "grad_norm": 3.0500002807775157, + "learning_rate": 1.6928636431362466e-05, + "loss": 1.5405, + "step": 10115 + }, + { + "epoch": 0.3308919696573372, + "grad_norm": 3.433172855279102, + "learning_rate": 1.692452030985098e-05, + "loss": 1.5015, + "step": 10120 + }, + { + "epoch": 0.3310554538320691, + "grad_norm": 3.1392736812641813, + "learning_rate": 1.692040193319289e-05, + "loss": 1.5096, + "step": 10125 + }, + { + "epoch": 0.33121893800680097, + "grad_norm": 2.9171577427768502, + "learning_rate": 1.6916281302729447e-05, + "loss": 1.4192, + "step": 10130 + }, + { + "epoch": 0.33138242218153285, + "grad_norm": 3.035166940466288, + "learning_rate": 1.691215841980264e-05, + "loss": 1.5632, + "step": 10135 + }, + { + "epoch": 0.33154590635626474, + "grad_norm": 3.164686033501917, + "learning_rate": 1.6908033285755193e-05, + "loss": 1.4575, + "step": 10140 + }, + { + "epoch": 0.3317093905309966, + "grad_norm": 3.1680585636223544, + "learning_rate": 1.6903905901930558e-05, + "loss": 1.5473, + "step": 10145 + }, + { + "epoch": 0.33187287470572846, + "grad_norm": 3.39311405131514, + "learning_rate": 1.689977626967293e-05, + "loss": 1.5126, + "step": 10150 + }, + { + "epoch": 0.33203635888046035, + "grad_norm": 3.1371893565135136, + "learning_rate": 1.689564439032722e-05, + "loss": 1.3867, + "step": 10155 + }, + { + "epoch": 0.33219984305519223, + "grad_norm": 3.30567819550198, + "learning_rate": 1.6891510265239084e-05, + "loss": 1.3845, + "step": 10160 + }, + { + "epoch": 0.3323633272299241, + "grad_norm": 3.171951609655606, + "learning_rate": 1.6887373895754902e-05, + "loss": 1.5019, + "step": 10165 + }, + { + "epoch": 0.332526811404656, + "grad_norm": 3.2204216913297237, + "learning_rate": 1.6883235283221794e-05, + "loss": 1.4503, + "step": 10170 + }, + { + "epoch": 0.3326902955793879, + "grad_norm": 3.2618656600119884, + "learning_rate": 1.68790944289876e-05, + "loss": 1.6857, + "step": 10175 + }, + { + "epoch": 0.3328537797541198, + "grad_norm": 3.201828848613805, + "learning_rate": 1.68749513344009e-05, + "loss": 1.4478, + "step": 10180 + }, + { + "epoch": 0.33301726392885167, + "grad_norm": 3.3194126127332715, + "learning_rate": 1.687080600081099e-05, + "loss": 1.643, + "step": 10185 + }, + { + "epoch": 0.33318074810358356, + "grad_norm": 3.1574998240304675, + "learning_rate": 1.686665842956791e-05, + "loss": 1.4394, + "step": 10190 + }, + { + "epoch": 0.33334423227831544, + "grad_norm": 3.307852555986016, + "learning_rate": 1.6862508622022424e-05, + "loss": 1.5477, + "step": 10195 + }, + { + "epoch": 0.33350771645304733, + "grad_norm": 3.1566848353216748, + "learning_rate": 1.6858356579526018e-05, + "loss": 1.5681, + "step": 10200 + }, + { + "epoch": 0.3336712006277792, + "grad_norm": 3.0940953022534616, + "learning_rate": 1.6854202303430913e-05, + "loss": 1.5021, + "step": 10205 + }, + { + "epoch": 0.3338346848025111, + "grad_norm": 3.1854086248119615, + "learning_rate": 1.685004579509005e-05, + "loss": 1.5271, + "step": 10210 + }, + { + "epoch": 0.333998168977243, + "grad_norm": 3.1369430563503333, + "learning_rate": 1.684588705585711e-05, + "loss": 1.5045, + "step": 10215 + }, + { + "epoch": 0.3341616531519749, + "grad_norm": 3.4039758548681287, + "learning_rate": 1.6841726087086486e-05, + "loss": 1.6109, + "step": 10220 + }, + { + "epoch": 0.33432513732670677, + "grad_norm": 3.08965659249306, + "learning_rate": 1.6837562890133306e-05, + "loss": 1.3868, + "step": 10225 + }, + { + "epoch": 0.33448862150143865, + "grad_norm": 3.4139656582842592, + "learning_rate": 1.683339746635342e-05, + "loss": 1.5388, + "step": 10230 + }, + { + "epoch": 0.33465210567617054, + "grad_norm": 3.0145473201828774, + "learning_rate": 1.6829229817103408e-05, + "loss": 1.4541, + "step": 10235 + }, + { + "epoch": 0.33481558985090243, + "grad_norm": 3.2728736938188066, + "learning_rate": 1.6825059943740566e-05, + "loss": 1.5704, + "step": 10240 + }, + { + "epoch": 0.3349790740256343, + "grad_norm": 3.4736268882244383, + "learning_rate": 1.6820887847622924e-05, + "loss": 1.5581, + "step": 10245 + }, + { + "epoch": 0.3351425582003662, + "grad_norm": 3.2032359341276004, + "learning_rate": 1.681671353010923e-05, + "loss": 1.5585, + "step": 10250 + }, + { + "epoch": 0.3353060423750981, + "grad_norm": 3.097916694915863, + "learning_rate": 1.6812536992558958e-05, + "loss": 1.5417, + "step": 10255 + }, + { + "epoch": 0.33546952654983, + "grad_norm": 3.4096148101881543, + "learning_rate": 1.6808358236332304e-05, + "loss": 1.5426, + "step": 10260 + }, + { + "epoch": 0.33563301072456186, + "grad_norm": 3.3337815319757222, + "learning_rate": 1.680417726279018e-05, + "loss": 1.5016, + "step": 10265 + }, + { + "epoch": 0.33579649489929375, + "grad_norm": 3.190447149056922, + "learning_rate": 1.6799994073294237e-05, + "loss": 1.5394, + "step": 10270 + }, + { + "epoch": 0.33595997907402564, + "grad_norm": 3.0070809846543725, + "learning_rate": 1.679580866920683e-05, + "loss": 1.4321, + "step": 10275 + }, + { + "epoch": 0.3361234632487575, + "grad_norm": 3.1387208523115775, + "learning_rate": 1.679162105189105e-05, + "loss": 1.4477, + "step": 10280 + }, + { + "epoch": 0.3362869474234894, + "grad_norm": 3.319118674947225, + "learning_rate": 1.6787431222710687e-05, + "loss": 1.4204, + "step": 10285 + }, + { + "epoch": 0.3364504315982213, + "grad_norm": 3.3708349777385367, + "learning_rate": 1.678323918303028e-05, + "loss": 1.5992, + "step": 10290 + }, + { + "epoch": 0.3366139157729532, + "grad_norm": 3.1009905372450426, + "learning_rate": 1.6779044934215067e-05, + "loss": 1.4403, + "step": 10295 + }, + { + "epoch": 0.3367773999476851, + "grad_norm": 3.249362650772671, + "learning_rate": 1.6774848477631015e-05, + "loss": 1.4443, + "step": 10300 + }, + { + "epoch": 0.33694088412241696, + "grad_norm": 3.0757939598337987, + "learning_rate": 1.6770649814644805e-05, + "loss": 1.4351, + "step": 10305 + }, + { + "epoch": 0.33710436829714885, + "grad_norm": 3.318874232376666, + "learning_rate": 1.6766448946623843e-05, + "loss": 1.4589, + "step": 10310 + }, + { + "epoch": 0.33726785247188074, + "grad_norm": 3.2697194340417446, + "learning_rate": 1.6762245874936242e-05, + "loss": 1.5543, + "step": 10315 + }, + { + "epoch": 0.3374313366466126, + "grad_norm": 2.839412603663339, + "learning_rate": 1.675804060095084e-05, + "loss": 1.4846, + "step": 10320 + }, + { + "epoch": 0.3375948208213445, + "grad_norm": 3.4509210002945983, + "learning_rate": 1.6753833126037197e-05, + "loss": 1.6241, + "step": 10325 + }, + { + "epoch": 0.3377583049960764, + "grad_norm": 3.2351352925759227, + "learning_rate": 1.674962345156558e-05, + "loss": 1.4076, + "step": 10330 + }, + { + "epoch": 0.3379217891708083, + "grad_norm": 3.199999641193821, + "learning_rate": 1.674541157890698e-05, + "loss": 1.6444, + "step": 10335 + }, + { + "epoch": 0.33808527334554017, + "grad_norm": 3.277672250771637, + "learning_rate": 1.67411975094331e-05, + "loss": 1.4919, + "step": 10340 + }, + { + "epoch": 0.33824875752027206, + "grad_norm": 3.3525038989918996, + "learning_rate": 1.673698124451636e-05, + "loss": 1.5182, + "step": 10345 + }, + { + "epoch": 0.33841224169500395, + "grad_norm": 3.226399564493069, + "learning_rate": 1.673276278552989e-05, + "loss": 1.5187, + "step": 10350 + }, + { + "epoch": 0.33857572586973583, + "grad_norm": 2.9511662745983944, + "learning_rate": 1.6728542133847546e-05, + "loss": 1.597, + "step": 10355 + }, + { + "epoch": 0.3387392100444677, + "grad_norm": 3.4203128391356694, + "learning_rate": 1.672431929084388e-05, + "loss": 1.3283, + "step": 10360 + }, + { + "epoch": 0.3389026942191996, + "grad_norm": 3.3058893708456067, + "learning_rate": 1.6720094257894176e-05, + "loss": 1.4288, + "step": 10365 + }, + { + "epoch": 0.3390661783939315, + "grad_norm": 3.155000715108142, + "learning_rate": 1.6715867036374427e-05, + "loss": 1.4778, + "step": 10370 + }, + { + "epoch": 0.3392296625686633, + "grad_norm": 2.9968034128596797, + "learning_rate": 1.6711637627661327e-05, + "loss": 1.42, + "step": 10375 + }, + { + "epoch": 0.3393931467433952, + "grad_norm": 3.146269186052012, + "learning_rate": 1.6707406033132295e-05, + "loss": 1.5116, + "step": 10380 + }, + { + "epoch": 0.3395566309181271, + "grad_norm": 3.1474062952764985, + "learning_rate": 1.670317225416546e-05, + "loss": 1.5696, + "step": 10385 + }, + { + "epoch": 0.339720115092859, + "grad_norm": 3.124194139252438, + "learning_rate": 1.6698936292139657e-05, + "loss": 1.505, + "step": 10390 + }, + { + "epoch": 0.3398835992675909, + "grad_norm": 3.2893304502609375, + "learning_rate": 1.6694698148434432e-05, + "loss": 1.5724, + "step": 10395 + }, + { + "epoch": 0.34004708344232276, + "grad_norm": 3.164558718589039, + "learning_rate": 1.6690457824430054e-05, + "loss": 1.5036, + "step": 10400 + }, + { + "epoch": 0.34021056761705465, + "grad_norm": 3.2091465630344493, + "learning_rate": 1.668621532150748e-05, + "loss": 1.5037, + "step": 10405 + }, + { + "epoch": 0.34037405179178654, + "grad_norm": 3.494818543957328, + "learning_rate": 1.66819706410484e-05, + "loss": 1.4808, + "step": 10410 + }, + { + "epoch": 0.3405375359665184, + "grad_norm": 3.335942073607169, + "learning_rate": 1.6677723784435197e-05, + "loss": 1.4626, + "step": 10415 + }, + { + "epoch": 0.3407010201412503, + "grad_norm": 3.0049630961010814, + "learning_rate": 1.667347475305097e-05, + "loss": 1.583, + "step": 10420 + }, + { + "epoch": 0.3408645043159822, + "grad_norm": 3.130429933849279, + "learning_rate": 1.6669223548279527e-05, + "loss": 1.3855, + "step": 10425 + }, + { + "epoch": 0.3410279884907141, + "grad_norm": 3.3117283315073913, + "learning_rate": 1.6664970171505373e-05, + "loss": 1.3919, + "step": 10430 + }, + { + "epoch": 0.34119147266544597, + "grad_norm": 2.9739516781613307, + "learning_rate": 1.666071462411374e-05, + "loss": 1.4539, + "step": 10435 + }, + { + "epoch": 0.34135495684017786, + "grad_norm": 3.4041775670765353, + "learning_rate": 1.6656456907490546e-05, + "loss": 1.5487, + "step": 10440 + }, + { + "epoch": 0.34151844101490975, + "grad_norm": 3.1731064963080815, + "learning_rate": 1.6652197023022428e-05, + "loss": 1.4401, + "step": 10445 + }, + { + "epoch": 0.34168192518964163, + "grad_norm": 3.1077140939527355, + "learning_rate": 1.6647934972096725e-05, + "loss": 1.5338, + "step": 10450 + }, + { + "epoch": 0.3418454093643735, + "grad_norm": 3.133349540706528, + "learning_rate": 1.6643670756101483e-05, + "loss": 1.5693, + "step": 10455 + }, + { + "epoch": 0.3420088935391054, + "grad_norm": 3.056296463000735, + "learning_rate": 1.663940437642546e-05, + "loss": 1.492, + "step": 10460 + }, + { + "epoch": 0.3421723777138373, + "grad_norm": 3.1259836379702532, + "learning_rate": 1.66351358344581e-05, + "loss": 1.4321, + "step": 10465 + }, + { + "epoch": 0.3423358618885692, + "grad_norm": 2.889202305999886, + "learning_rate": 1.663086513158957e-05, + "loss": 1.5057, + "step": 10470 + }, + { + "epoch": 0.34249934606330107, + "grad_norm": 3.272914684643696, + "learning_rate": 1.662659226921073e-05, + "loss": 1.6272, + "step": 10475 + }, + { + "epoch": 0.34266283023803296, + "grad_norm": 3.4381147874434954, + "learning_rate": 1.6622317248713144e-05, + "loss": 1.6381, + "step": 10480 + }, + { + "epoch": 0.34282631441276484, + "grad_norm": 3.037340894414251, + "learning_rate": 1.661804007148909e-05, + "loss": 1.5454, + "step": 10485 + }, + { + "epoch": 0.34298979858749673, + "grad_norm": 3.072036812222429, + "learning_rate": 1.6613760738931534e-05, + "loss": 1.504, + "step": 10490 + }, + { + "epoch": 0.3431532827622286, + "grad_norm": 2.9632179770391485, + "learning_rate": 1.6609479252434145e-05, + "loss": 1.5011, + "step": 10495 + }, + { + "epoch": 0.3433167669369605, + "grad_norm": 3.3477353905814002, + "learning_rate": 1.6605195613391307e-05, + "loss": 1.5569, + "step": 10500 + }, + { + "epoch": 0.3434802511116924, + "grad_norm": 3.045523997852102, + "learning_rate": 1.6600909823198094e-05, + "loss": 1.4623, + "step": 10505 + }, + { + "epoch": 0.3436437352864243, + "grad_norm": 3.473655497882172, + "learning_rate": 1.659662188325028e-05, + "loss": 1.5396, + "step": 10510 + }, + { + "epoch": 0.34380721946115617, + "grad_norm": 3.2221212682291203, + "learning_rate": 1.6592331794944346e-05, + "loss": 1.4209, + "step": 10515 + }, + { + "epoch": 0.34397070363588805, + "grad_norm": 3.1981819289600986, + "learning_rate": 1.658803955967746e-05, + "loss": 1.4208, + "step": 10520 + }, + { + "epoch": 0.34413418781061994, + "grad_norm": 3.376997132225382, + "learning_rate": 1.6583745178847512e-05, + "loss": 1.3421, + "step": 10525 + }, + { + "epoch": 0.3442976719853518, + "grad_norm": 3.1208947539635266, + "learning_rate": 1.6579448653853067e-05, + "loss": 1.5098, + "step": 10530 + }, + { + "epoch": 0.3444611561600837, + "grad_norm": 3.0793434966044386, + "learning_rate": 1.6575149986093396e-05, + "loss": 1.457, + "step": 10535 + }, + { + "epoch": 0.3446246403348156, + "grad_norm": 3.1710873658594085, + "learning_rate": 1.6570849176968477e-05, + "loss": 1.4494, + "step": 10540 + }, + { + "epoch": 0.3447881245095475, + "grad_norm": 3.31175263846486, + "learning_rate": 1.6566546227878975e-05, + "loss": 1.494, + "step": 10545 + }, + { + "epoch": 0.3449516086842794, + "grad_norm": 3.405016707521435, + "learning_rate": 1.6562241140226255e-05, + "loss": 1.5715, + "step": 10550 + }, + { + "epoch": 0.34511509285901126, + "grad_norm": 3.3743735261800687, + "learning_rate": 1.6557933915412375e-05, + "loss": 1.547, + "step": 10555 + }, + { + "epoch": 0.34527857703374315, + "grad_norm": 3.088680789863492, + "learning_rate": 1.65536245548401e-05, + "loss": 1.6572, + "step": 10560 + }, + { + "epoch": 0.34544206120847504, + "grad_norm": 3.222708036786929, + "learning_rate": 1.654931305991288e-05, + "loss": 1.4387, + "step": 10565 + }, + { + "epoch": 0.3456055453832069, + "grad_norm": 3.26575274720762, + "learning_rate": 1.6544999432034856e-05, + "loss": 1.5007, + "step": 10570 + }, + { + "epoch": 0.3457690295579388, + "grad_norm": 3.0575856193024764, + "learning_rate": 1.6540683672610882e-05, + "loss": 1.506, + "step": 10575 + }, + { + "epoch": 0.3459325137326707, + "grad_norm": 3.2165697511887035, + "learning_rate": 1.653636578304649e-05, + "loss": 1.5668, + "step": 10580 + }, + { + "epoch": 0.3460959979074026, + "grad_norm": 2.939997419214355, + "learning_rate": 1.6532045764747908e-05, + "loss": 1.5277, + "step": 10585 + }, + { + "epoch": 0.3462594820821345, + "grad_norm": 3.150543970047848, + "learning_rate": 1.6527723619122067e-05, + "loss": 1.6787, + "step": 10590 + }, + { + "epoch": 0.34642296625686636, + "grad_norm": 3.2618288680814915, + "learning_rate": 1.6523399347576577e-05, + "loss": 1.547, + "step": 10595 + }, + { + "epoch": 0.34658645043159825, + "grad_norm": 3.222079100417098, + "learning_rate": 1.6519072951519756e-05, + "loss": 1.3873, + "step": 10600 + }, + { + "epoch": 0.34674993460633013, + "grad_norm": 3.0443731663005447, + "learning_rate": 1.6514744432360595e-05, + "loss": 1.4332, + "step": 10605 + }, + { + "epoch": 0.34691341878106197, + "grad_norm": 4.336308426998174, + "learning_rate": 1.651041379150879e-05, + "loss": 1.6188, + "step": 10610 + }, + { + "epoch": 0.34707690295579385, + "grad_norm": 3.018427537946744, + "learning_rate": 1.6506081030374733e-05, + "loss": 1.4018, + "step": 10615 + }, + { + "epoch": 0.34724038713052574, + "grad_norm": 3.2927387429864585, + "learning_rate": 1.6501746150369487e-05, + "loss": 1.4269, + "step": 10620 + }, + { + "epoch": 0.3474038713052576, + "grad_norm": 3.1294437121106604, + "learning_rate": 1.6497409152904816e-05, + "loss": 1.4811, + "step": 10625 + }, + { + "epoch": 0.3475673554799895, + "grad_norm": 2.8760473638507977, + "learning_rate": 1.6493070039393186e-05, + "loss": 1.4635, + "step": 10630 + }, + { + "epoch": 0.3477308396547214, + "grad_norm": 3.274863008617262, + "learning_rate": 1.6488728811247726e-05, + "loss": 1.436, + "step": 10635 + }, + { + "epoch": 0.3478943238294533, + "grad_norm": 3.3793764493914695, + "learning_rate": 1.6484385469882278e-05, + "loss": 1.5045, + "step": 10640 + }, + { + "epoch": 0.3480578080041852, + "grad_norm": 3.2730128632418634, + "learning_rate": 1.6480040016711354e-05, + "loss": 1.5446, + "step": 10645 + }, + { + "epoch": 0.34822129217891706, + "grad_norm": 3.2435453652027153, + "learning_rate": 1.647569245315017e-05, + "loss": 1.5374, + "step": 10650 + }, + { + "epoch": 0.34838477635364895, + "grad_norm": 3.09021378827123, + "learning_rate": 1.6471342780614615e-05, + "loss": 1.3932, + "step": 10655 + }, + { + "epoch": 0.34854826052838084, + "grad_norm": 3.4972478351588476, + "learning_rate": 1.646699100052127e-05, + "loss": 1.5184, + "step": 10660 + }, + { + "epoch": 0.3487117447031127, + "grad_norm": 3.2045120897594574, + "learning_rate": 1.6462637114287406e-05, + "loss": 1.5653, + "step": 10665 + }, + { + "epoch": 0.3488752288778446, + "grad_norm": 3.0650453117737695, + "learning_rate": 1.6458281123330975e-05, + "loss": 1.6124, + "step": 10670 + }, + { + "epoch": 0.3490387130525765, + "grad_norm": 3.305472631452195, + "learning_rate": 1.645392302907062e-05, + "loss": 1.4508, + "step": 10675 + }, + { + "epoch": 0.3492021972273084, + "grad_norm": 3.441305022696201, + "learning_rate": 1.644956283292566e-05, + "loss": 1.4511, + "step": 10680 + }, + { + "epoch": 0.3493656814020403, + "grad_norm": 3.172702664023342, + "learning_rate": 1.644520053631611e-05, + "loss": 1.5284, + "step": 10685 + }, + { + "epoch": 0.34952916557677216, + "grad_norm": 3.107980305357534, + "learning_rate": 1.6440836140662657e-05, + "loss": 1.6132, + "step": 10690 + }, + { + "epoch": 0.34969264975150405, + "grad_norm": 2.954192426207443, + "learning_rate": 1.6436469647386685e-05, + "loss": 1.5039, + "step": 10695 + }, + { + "epoch": 0.34985613392623593, + "grad_norm": 3.1031981575404757, + "learning_rate": 1.6432101057910248e-05, + "loss": 1.4161, + "step": 10700 + }, + { + "epoch": 0.3500196181009678, + "grad_norm": 3.0318007535425093, + "learning_rate": 1.6427730373656093e-05, + "loss": 1.447, + "step": 10705 + }, + { + "epoch": 0.3501831022756997, + "grad_norm": 3.111952453880933, + "learning_rate": 1.642335759604764e-05, + "loss": 1.47, + "step": 10710 + }, + { + "epoch": 0.3503465864504316, + "grad_norm": 3.0849775247906495, + "learning_rate": 1.6418982726508996e-05, + "loss": 1.4113, + "step": 10715 + }, + { + "epoch": 0.3505100706251635, + "grad_norm": 3.4620491339649613, + "learning_rate": 1.6414605766464956e-05, + "loss": 1.6562, + "step": 10720 + }, + { + "epoch": 0.35067355479989537, + "grad_norm": 3.3197789453766076, + "learning_rate": 1.6410226717340977e-05, + "loss": 1.6025, + "step": 10725 + }, + { + "epoch": 0.35083703897462726, + "grad_norm": 3.2264148980405274, + "learning_rate": 1.640584558056322e-05, + "loss": 1.5545, + "step": 10730 + }, + { + "epoch": 0.35100052314935914, + "grad_norm": 2.8107207968183623, + "learning_rate": 1.6401462357558507e-05, + "loss": 1.497, + "step": 10735 + }, + { + "epoch": 0.35116400732409103, + "grad_norm": 3.0495616118912663, + "learning_rate": 1.6397077049754346e-05, + "loss": 1.3851, + "step": 10740 + }, + { + "epoch": 0.3513274914988229, + "grad_norm": 3.1815781541917487, + "learning_rate": 1.6392689658578928e-05, + "loss": 1.5809, + "step": 10745 + }, + { + "epoch": 0.3514909756735548, + "grad_norm": 3.162433814962451, + "learning_rate": 1.6388300185461113e-05, + "loss": 1.3835, + "step": 10750 + }, + { + "epoch": 0.3516544598482867, + "grad_norm": 3.2613306858649835, + "learning_rate": 1.638390863183045e-05, + "loss": 1.4864, + "step": 10755 + }, + { + "epoch": 0.3518179440230186, + "grad_norm": 3.3411135358659707, + "learning_rate": 1.6379514999117164e-05, + "loss": 1.4829, + "step": 10760 + }, + { + "epoch": 0.35198142819775047, + "grad_norm": 3.132462096149566, + "learning_rate": 1.6375119288752143e-05, + "loss": 1.596, + "step": 10765 + }, + { + "epoch": 0.35214491237248235, + "grad_norm": 3.206160518951867, + "learning_rate": 1.6370721502166972e-05, + "loss": 1.4674, + "step": 10770 + }, + { + "epoch": 0.35230839654721424, + "grad_norm": 3.1001152594150367, + "learning_rate": 1.6366321640793893e-05, + "loss": 1.4827, + "step": 10775 + }, + { + "epoch": 0.35247188072194613, + "grad_norm": 3.113258007378552, + "learning_rate": 1.636191970606584e-05, + "loss": 1.4121, + "step": 10780 + }, + { + "epoch": 0.352635364896678, + "grad_norm": 3.0678094261440383, + "learning_rate": 1.6357515699416414e-05, + "loss": 1.6551, + "step": 10785 + }, + { + "epoch": 0.3527988490714099, + "grad_norm": 2.9087594843967235, + "learning_rate": 1.635310962227989e-05, + "loss": 1.7029, + "step": 10790 + }, + { + "epoch": 0.3529623332461418, + "grad_norm": 3.404908411337703, + "learning_rate": 1.6348701476091223e-05, + "loss": 1.5354, + "step": 10795 + }, + { + "epoch": 0.3531258174208737, + "grad_norm": 3.3105171080619393, + "learning_rate": 1.6344291262286036e-05, + "loss": 1.4626, + "step": 10800 + }, + { + "epoch": 0.35328930159560556, + "grad_norm": 3.341305600732228, + "learning_rate": 1.6339878982300625e-05, + "loss": 1.3819, + "step": 10805 + }, + { + "epoch": 0.35345278577033745, + "grad_norm": 3.2692560547228657, + "learning_rate": 1.6335464637571967e-05, + "loss": 1.3312, + "step": 10810 + }, + { + "epoch": 0.35361626994506934, + "grad_norm": 3.263110950048188, + "learning_rate": 1.63310482295377e-05, + "loss": 1.4942, + "step": 10815 + }, + { + "epoch": 0.3537797541198012, + "grad_norm": 3.420470440903312, + "learning_rate": 1.6326629759636142e-05, + "loss": 1.4418, + "step": 10820 + }, + { + "epoch": 0.3539432382945331, + "grad_norm": 3.3174098480056533, + "learning_rate": 1.632220922930629e-05, + "loss": 1.5309, + "step": 10825 + }, + { + "epoch": 0.354106722469265, + "grad_norm": 2.9947434414549012, + "learning_rate": 1.631778663998778e-05, + "loss": 1.4986, + "step": 10830 + }, + { + "epoch": 0.3542702066439969, + "grad_norm": 3.4501033645545847, + "learning_rate": 1.6313361993120966e-05, + "loss": 1.627, + "step": 10835 + }, + { + "epoch": 0.3544336908187287, + "grad_norm": 2.84553554546001, + "learning_rate": 1.630893529014683e-05, + "loss": 1.4337, + "step": 10840 + }, + { + "epoch": 0.3545971749934606, + "grad_norm": 3.503006730990615, + "learning_rate": 1.6304506532507048e-05, + "loss": 1.4951, + "step": 10845 + }, + { + "epoch": 0.3547606591681925, + "grad_norm": 3.131266426692093, + "learning_rate": 1.6300075721643958e-05, + "loss": 1.5327, + "step": 10850 + }, + { + "epoch": 0.3549241433429244, + "grad_norm": 3.206427242858377, + "learning_rate": 1.6295642859000562e-05, + "loss": 1.453, + "step": 10855 + }, + { + "epoch": 0.35508762751765627, + "grad_norm": 2.8297056700429244, + "learning_rate": 1.629120794602054e-05, + "loss": 1.3822, + "step": 10860 + }, + { + "epoch": 0.35525111169238816, + "grad_norm": 3.2644036500727798, + "learning_rate": 1.628677098414823e-05, + "loss": 1.4277, + "step": 10865 + }, + { + "epoch": 0.35541459586712004, + "grad_norm": 3.2385716939417635, + "learning_rate": 1.6282331974828643e-05, + "loss": 1.5539, + "step": 10870 + }, + { + "epoch": 0.35557808004185193, + "grad_norm": 3.0411925045864256, + "learning_rate": 1.6277890919507463e-05, + "loss": 1.4784, + "step": 10875 + }, + { + "epoch": 0.3557415642165838, + "grad_norm": 3.14256587334861, + "learning_rate": 1.627344781963102e-05, + "loss": 1.4904, + "step": 10880 + }, + { + "epoch": 0.3559050483913157, + "grad_norm": 3.039824691432066, + "learning_rate": 1.6269002676646332e-05, + "loss": 1.4521, + "step": 10885 + }, + { + "epoch": 0.3560685325660476, + "grad_norm": 3.275488751206842, + "learning_rate": 1.626455549200107e-05, + "loss": 1.4494, + "step": 10890 + }, + { + "epoch": 0.3562320167407795, + "grad_norm": 2.7210290241484882, + "learning_rate": 1.626010626714357e-05, + "loss": 1.2925, + "step": 10895 + }, + { + "epoch": 0.35639550091551137, + "grad_norm": 3.086306694956964, + "learning_rate": 1.625565500352284e-05, + "loss": 1.3798, + "step": 10900 + }, + { + "epoch": 0.35655898509024325, + "grad_norm": 3.187992663973129, + "learning_rate": 1.6251201702588548e-05, + "loss": 1.539, + "step": 10905 + }, + { + "epoch": 0.35672246926497514, + "grad_norm": 3.0244167040869128, + "learning_rate": 1.6246746365791023e-05, + "loss": 1.4664, + "step": 10910 + }, + { + "epoch": 0.356885953439707, + "grad_norm": 3.520227172631441, + "learning_rate": 1.6242288994581258e-05, + "loss": 1.4811, + "step": 10915 + }, + { + "epoch": 0.3570494376144389, + "grad_norm": 3.064006743591685, + "learning_rate": 1.6237829590410914e-05, + "loss": 1.47, + "step": 10920 + }, + { + "epoch": 0.3572129217891708, + "grad_norm": 3.2827051969319787, + "learning_rate": 1.6233368154732305e-05, + "loss": 1.4921, + "step": 10925 + }, + { + "epoch": 0.3573764059639027, + "grad_norm": 2.9567556303405045, + "learning_rate": 1.6228904688998413e-05, + "loss": 1.6079, + "step": 10930 + }, + { + "epoch": 0.3575398901386346, + "grad_norm": 3.2370753207445575, + "learning_rate": 1.622443919466288e-05, + "loss": 1.3863, + "step": 10935 + }, + { + "epoch": 0.35770337431336646, + "grad_norm": 3.0868280890815747, + "learning_rate": 1.6219971673180005e-05, + "loss": 1.4429, + "step": 10940 + }, + { + "epoch": 0.35786685848809835, + "grad_norm": 3.321651456368836, + "learning_rate": 1.6215502126004753e-05, + "loss": 1.4845, + "step": 10945 + }, + { + "epoch": 0.35803034266283024, + "grad_norm": 3.091637505111077, + "learning_rate": 1.621103055459275e-05, + "loss": 1.4951, + "step": 10950 + }, + { + "epoch": 0.3581938268375621, + "grad_norm": 3.3246790913124933, + "learning_rate": 1.620655696040027e-05, + "loss": 1.4873, + "step": 10955 + }, + { + "epoch": 0.358357311012294, + "grad_norm": 3.229854484100196, + "learning_rate": 1.6202081344884254e-05, + "loss": 1.4247, + "step": 10960 + }, + { + "epoch": 0.3585207951870259, + "grad_norm": 3.169517460044448, + "learning_rate": 1.6197603709502305e-05, + "loss": 1.4379, + "step": 10965 + }, + { + "epoch": 0.3586842793617578, + "grad_norm": 3.3748891508162284, + "learning_rate": 1.6193124055712675e-05, + "loss": 1.5378, + "step": 10970 + }, + { + "epoch": 0.3588477635364897, + "grad_norm": 3.2197703568331835, + "learning_rate": 1.6188642384974283e-05, + "loss": 1.3939, + "step": 10975 + }, + { + "epoch": 0.35901124771122156, + "grad_norm": 3.240117182110877, + "learning_rate": 1.6184158698746696e-05, + "loss": 1.565, + "step": 10980 + }, + { + "epoch": 0.35917473188595345, + "grad_norm": 3.0553812572615766, + "learning_rate": 1.6179672998490133e-05, + "loss": 1.6306, + "step": 10985 + }, + { + "epoch": 0.35933821606068533, + "grad_norm": 3.2990380677393225, + "learning_rate": 1.617518528566549e-05, + "loss": 1.4805, + "step": 10990 + }, + { + "epoch": 0.3595017002354172, + "grad_norm": 3.282115211507663, + "learning_rate": 1.6170695561734294e-05, + "loss": 1.5118, + "step": 10995 + }, + { + "epoch": 0.3596651844101491, + "grad_norm": 3.2333136547279686, + "learning_rate": 1.6166203828158745e-05, + "loss": 1.4367, + "step": 11000 + }, + { + "epoch": 0.359828668584881, + "grad_norm": 3.357122079948706, + "learning_rate": 1.6161710086401693e-05, + "loss": 1.6676, + "step": 11005 + }, + { + "epoch": 0.3599921527596129, + "grad_norm": 3.2689750717190984, + "learning_rate": 1.6157214337926627e-05, + "loss": 1.5697, + "step": 11010 + }, + { + "epoch": 0.36015563693434477, + "grad_norm": 3.115596733501509, + "learning_rate": 1.6152716584197715e-05, + "loss": 1.5309, + "step": 11015 + }, + { + "epoch": 0.36031912110907666, + "grad_norm": 3.22773987044328, + "learning_rate": 1.6148216826679758e-05, + "loss": 1.4055, + "step": 11020 + }, + { + "epoch": 0.36048260528380854, + "grad_norm": 3.0314558016646984, + "learning_rate": 1.614371506683822e-05, + "loss": 1.6048, + "step": 11025 + }, + { + "epoch": 0.36064608945854043, + "grad_norm": 3.071473796236567, + "learning_rate": 1.6139211306139215e-05, + "loss": 1.3697, + "step": 11030 + }, + { + "epoch": 0.3608095736332723, + "grad_norm": 2.9690721982739334, + "learning_rate": 1.6134705546049503e-05, + "loss": 1.4261, + "step": 11035 + }, + { + "epoch": 0.3609730578080042, + "grad_norm": 3.0237663506391006, + "learning_rate": 1.6130197788036505e-05, + "loss": 1.4409, + "step": 11040 + }, + { + "epoch": 0.3611365419827361, + "grad_norm": 3.3500104710038245, + "learning_rate": 1.6125688033568283e-05, + "loss": 1.5614, + "step": 11045 + }, + { + "epoch": 0.361300026157468, + "grad_norm": 3.101079114799973, + "learning_rate": 1.6121176284113555e-05, + "loss": 1.3694, + "step": 11050 + }, + { + "epoch": 0.36146351033219987, + "grad_norm": 3.069866752343101, + "learning_rate": 1.611666254114169e-05, + "loss": 1.4021, + "step": 11055 + }, + { + "epoch": 0.36162699450693175, + "grad_norm": 3.118276370709517, + "learning_rate": 1.6112146806122696e-05, + "loss": 1.461, + "step": 11060 + }, + { + "epoch": 0.36179047868166364, + "grad_norm": 3.2406278158535917, + "learning_rate": 1.6107629080527243e-05, + "loss": 1.4434, + "step": 11065 + }, + { + "epoch": 0.36195396285639553, + "grad_norm": 3.1832097771560166, + "learning_rate": 1.6103109365826645e-05, + "loss": 1.5478, + "step": 11070 + }, + { + "epoch": 0.36211744703112736, + "grad_norm": 3.0193111903074343, + "learning_rate": 1.609858766349286e-05, + "loss": 1.4328, + "step": 11075 + }, + { + "epoch": 0.36228093120585925, + "grad_norm": 3.2853895874317667, + "learning_rate": 1.6094063974998498e-05, + "loss": 1.5419, + "step": 11080 + }, + { + "epoch": 0.36244441538059113, + "grad_norm": 3.439068522855385, + "learning_rate": 1.608953830181681e-05, + "loss": 1.5683, + "step": 11085 + }, + { + "epoch": 0.362607899555323, + "grad_norm": 3.2563295334237887, + "learning_rate": 1.6085010645421694e-05, + "loss": 1.5766, + "step": 11090 + }, + { + "epoch": 0.3627713837300549, + "grad_norm": 3.759046205409616, + "learning_rate": 1.6080481007287703e-05, + "loss": 1.4899, + "step": 11095 + }, + { + "epoch": 0.3629348679047868, + "grad_norm": 3.0909983654749005, + "learning_rate": 1.6075949388890028e-05, + "loss": 1.5361, + "step": 11100 + }, + { + "epoch": 0.3630983520795187, + "grad_norm": 3.098485123281429, + "learning_rate": 1.6071415791704502e-05, + "loss": 1.501, + "step": 11105 + }, + { + "epoch": 0.36326183625425057, + "grad_norm": 3.323192332955827, + "learning_rate": 1.6066880217207615e-05, + "loss": 1.463, + "step": 11110 + }, + { + "epoch": 0.36342532042898246, + "grad_norm": 3.541364143268912, + "learning_rate": 1.606234266687648e-05, + "loss": 1.4652, + "step": 11115 + }, + { + "epoch": 0.36358880460371434, + "grad_norm": 3.4214142295903995, + "learning_rate": 1.605780314218888e-05, + "loss": 1.5446, + "step": 11120 + }, + { + "epoch": 0.36375228877844623, + "grad_norm": 3.0915616571227837, + "learning_rate": 1.6053261644623214e-05, + "loss": 1.4439, + "step": 11125 + }, + { + "epoch": 0.3639157729531781, + "grad_norm": 3.180541656273328, + "learning_rate": 1.604871817565855e-05, + "loss": 1.4191, + "step": 11130 + }, + { + "epoch": 0.36407925712791, + "grad_norm": 3.007141761326633, + "learning_rate": 1.604417273677457e-05, + "loss": 1.4839, + "step": 11135 + }, + { + "epoch": 0.3642427413026419, + "grad_norm": 3.224552448835493, + "learning_rate": 1.603962532945162e-05, + "loss": 1.5406, + "step": 11140 + }, + { + "epoch": 0.3644062254773738, + "grad_norm": 3.253118835273823, + "learning_rate": 1.603507595517068e-05, + "loss": 1.4652, + "step": 11145 + }, + { + "epoch": 0.36456970965210567, + "grad_norm": 3.2091178835110066, + "learning_rate": 1.6030524615413367e-05, + "loss": 1.5232, + "step": 11150 + }, + { + "epoch": 0.36473319382683755, + "grad_norm": 3.2706333283059617, + "learning_rate": 1.6025971311661944e-05, + "loss": 1.5957, + "step": 11155 + }, + { + "epoch": 0.36489667800156944, + "grad_norm": 3.08612749267251, + "learning_rate": 1.602141604539931e-05, + "loss": 1.4543, + "step": 11160 + }, + { + "epoch": 0.36506016217630133, + "grad_norm": 3.170206550763301, + "learning_rate": 1.6016858818108992e-05, + "loss": 1.6563, + "step": 11165 + }, + { + "epoch": 0.3652236463510332, + "grad_norm": 2.9957239450121227, + "learning_rate": 1.6012299631275187e-05, + "loss": 1.3312, + "step": 11170 + }, + { + "epoch": 0.3653871305257651, + "grad_norm": 3.244742486644731, + "learning_rate": 1.6007738486382696e-05, + "loss": 1.4522, + "step": 11175 + }, + { + "epoch": 0.365550614700497, + "grad_norm": 3.285487715661552, + "learning_rate": 1.6003175384916977e-05, + "loss": 1.434, + "step": 11180 + }, + { + "epoch": 0.3657140988752289, + "grad_norm": 3.0305677448819925, + "learning_rate": 1.599861032836412e-05, + "loss": 1.4904, + "step": 11185 + }, + { + "epoch": 0.36587758304996076, + "grad_norm": 3.4110183521042, + "learning_rate": 1.5994043318210858e-05, + "loss": 1.4336, + "step": 11190 + }, + { + "epoch": 0.36604106722469265, + "grad_norm": 3.2739960317392565, + "learning_rate": 1.5989474355944544e-05, + "loss": 1.5017, + "step": 11195 + }, + { + "epoch": 0.36620455139942454, + "grad_norm": 3.0682509148264647, + "learning_rate": 1.598490344305318e-05, + "loss": 1.4288, + "step": 11200 + }, + { + "epoch": 0.3663680355741564, + "grad_norm": 3.1107601108887994, + "learning_rate": 1.5980330581025403e-05, + "loss": 1.4744, + "step": 11205 + }, + { + "epoch": 0.3665315197488883, + "grad_norm": 3.048564576732614, + "learning_rate": 1.597575577135048e-05, + "loss": 1.4365, + "step": 11210 + }, + { + "epoch": 0.3666950039236202, + "grad_norm": 3.253978973138461, + "learning_rate": 1.5971179015518318e-05, + "loss": 1.533, + "step": 11215 + }, + { + "epoch": 0.3668584880983521, + "grad_norm": 3.3507003495398324, + "learning_rate": 1.5966600315019448e-05, + "loss": 1.5186, + "step": 11220 + }, + { + "epoch": 0.367021972273084, + "grad_norm": 3.262395081660001, + "learning_rate": 1.596201967134505e-05, + "loss": 1.5103, + "step": 11225 + }, + { + "epoch": 0.36718545644781586, + "grad_norm": 3.1515059552577878, + "learning_rate": 1.5957437085986914e-05, + "loss": 1.5111, + "step": 11230 + }, + { + "epoch": 0.36734894062254775, + "grad_norm": 3.1449025989194026, + "learning_rate": 1.595285256043749e-05, + "loss": 1.5537, + "step": 11235 + }, + { + "epoch": 0.36751242479727964, + "grad_norm": 3.0342183764853, + "learning_rate": 1.594826609618984e-05, + "loss": 1.5854, + "step": 11240 + }, + { + "epoch": 0.3676759089720115, + "grad_norm": 3.1520324665873476, + "learning_rate": 1.5943677694737655e-05, + "loss": 1.539, + "step": 11245 + }, + { + "epoch": 0.3678393931467434, + "grad_norm": 2.999364572421225, + "learning_rate": 1.5939087357575276e-05, + "loss": 1.3804, + "step": 11250 + }, + { + "epoch": 0.3680028773214753, + "grad_norm": 3.3149349343503225, + "learning_rate": 1.5934495086197655e-05, + "loss": 1.4487, + "step": 11255 + }, + { + "epoch": 0.3681663614962072, + "grad_norm": 3.1830692502575517, + "learning_rate": 1.5929900882100394e-05, + "loss": 1.5293, + "step": 11260 + }, + { + "epoch": 0.36832984567093907, + "grad_norm": 3.2048565642560507, + "learning_rate": 1.5925304746779702e-05, + "loss": 1.4392, + "step": 11265 + }, + { + "epoch": 0.36849332984567096, + "grad_norm": 2.901451014017518, + "learning_rate": 1.5920706681732433e-05, + "loss": 1.4967, + "step": 11270 + }, + { + "epoch": 0.36865681402040285, + "grad_norm": 3.389862220976877, + "learning_rate": 1.5916106688456058e-05, + "loss": 1.6433, + "step": 11275 + }, + { + "epoch": 0.36882029819513473, + "grad_norm": 3.2344108076812503, + "learning_rate": 1.591150476844869e-05, + "loss": 1.4955, + "step": 11280 + }, + { + "epoch": 0.3689837823698666, + "grad_norm": 3.1580912277144613, + "learning_rate": 1.5906900923209055e-05, + "loss": 1.424, + "step": 11285 + }, + { + "epoch": 0.3691472665445985, + "grad_norm": 3.1767793578193992, + "learning_rate": 1.590229515423652e-05, + "loss": 1.5457, + "step": 11290 + }, + { + "epoch": 0.3693107507193304, + "grad_norm": 3.1327892651905036, + "learning_rate": 1.589768746303106e-05, + "loss": 1.4887, + "step": 11295 + }, + { + "epoch": 0.3694742348940623, + "grad_norm": 3.2935607373859765, + "learning_rate": 1.58930778510933e-05, + "loss": 1.5822, + "step": 11300 + }, + { + "epoch": 0.36963771906879417, + "grad_norm": 3.141696305574291, + "learning_rate": 1.588846631992447e-05, + "loss": 1.3306, + "step": 11305 + }, + { + "epoch": 0.369801203243526, + "grad_norm": 3.2021644936274574, + "learning_rate": 1.5883852871026427e-05, + "loss": 1.4458, + "step": 11310 + }, + { + "epoch": 0.3699646874182579, + "grad_norm": 2.8825182093374524, + "learning_rate": 1.587923750590167e-05, + "loss": 1.3195, + "step": 11315 + }, + { + "epoch": 0.3701281715929898, + "grad_norm": 3.0776422405159085, + "learning_rate": 1.5874620226053307e-05, + "loss": 1.4007, + "step": 11320 + }, + { + "epoch": 0.37029165576772166, + "grad_norm": 3.456114056086124, + "learning_rate": 1.587000103298507e-05, + "loss": 1.4127, + "step": 11325 + }, + { + "epoch": 0.37045513994245355, + "grad_norm": 3.325592686126991, + "learning_rate": 1.586537992820132e-05, + "loss": 1.5764, + "step": 11330 + }, + { + "epoch": 0.37061862411718544, + "grad_norm": 3.2079444220981026, + "learning_rate": 1.5860756913207036e-05, + "loss": 1.4188, + "step": 11335 + }, + { + "epoch": 0.3707821082919173, + "grad_norm": 3.461076405121598, + "learning_rate": 1.5856131989507823e-05, + "loss": 1.4948, + "step": 11340 + }, + { + "epoch": 0.3709455924666492, + "grad_norm": 3.058650417648073, + "learning_rate": 1.5851505158609897e-05, + "loss": 1.3611, + "step": 11345 + }, + { + "epoch": 0.3711090766413811, + "grad_norm": 3.1471005524927924, + "learning_rate": 1.5846876422020115e-05, + "loss": 1.64, + "step": 11350 + }, + { + "epoch": 0.371272560816113, + "grad_norm": 3.361586336687054, + "learning_rate": 1.5842245781245937e-05, + "loss": 1.5119, + "step": 11355 + }, + { + "epoch": 0.37143604499084487, + "grad_norm": 3.0349288626627247, + "learning_rate": 1.5837613237795448e-05, + "loss": 1.5806, + "step": 11360 + }, + { + "epoch": 0.37159952916557676, + "grad_norm": 3.374953814324488, + "learning_rate": 1.5832978793177358e-05, + "loss": 1.4856, + "step": 11365 + }, + { + "epoch": 0.37176301334030865, + "grad_norm": 3.012375832833221, + "learning_rate": 1.5828342448900988e-05, + "loss": 1.5101, + "step": 11370 + }, + { + "epoch": 0.37192649751504053, + "grad_norm": 3.21461584334998, + "learning_rate": 1.5823704206476285e-05, + "loss": 1.4645, + "step": 11375 + }, + { + "epoch": 0.3720899816897724, + "grad_norm": 3.105818967645064, + "learning_rate": 1.581906406741381e-05, + "loss": 1.6103, + "step": 11380 + }, + { + "epoch": 0.3722534658645043, + "grad_norm": 3.385630545535293, + "learning_rate": 1.581442203322474e-05, + "loss": 1.5536, + "step": 11385 + }, + { + "epoch": 0.3724169500392362, + "grad_norm": 3.22047347995395, + "learning_rate": 1.580977810542088e-05, + "loss": 1.3841, + "step": 11390 + }, + { + "epoch": 0.3725804342139681, + "grad_norm": 3.2763194295696794, + "learning_rate": 1.5805132285514633e-05, + "loss": 1.5905, + "step": 11395 + }, + { + "epoch": 0.37274391838869997, + "grad_norm": 3.482551477005467, + "learning_rate": 1.580048457501903e-05, + "loss": 1.4818, + "step": 11400 + }, + { + "epoch": 0.37290740256343186, + "grad_norm": 3.0853614853939226, + "learning_rate": 1.5795834975447725e-05, + "loss": 1.4219, + "step": 11405 + }, + { + "epoch": 0.37307088673816374, + "grad_norm": 2.9856533159491323, + "learning_rate": 1.5791183488314974e-05, + "loss": 1.4076, + "step": 11410 + }, + { + "epoch": 0.37323437091289563, + "grad_norm": 2.947691786665424, + "learning_rate": 1.578653011513565e-05, + "loss": 1.4705, + "step": 11415 + }, + { + "epoch": 0.3733978550876275, + "grad_norm": 3.1607458705987526, + "learning_rate": 1.5781874857425243e-05, + "loss": 1.4882, + "step": 11420 + }, + { + "epoch": 0.3735613392623594, + "grad_norm": 3.074556959978691, + "learning_rate": 1.5777217716699857e-05, + "loss": 1.4856, + "step": 11425 + }, + { + "epoch": 0.3737248234370913, + "grad_norm": 3.277263445154224, + "learning_rate": 1.5772558694476212e-05, + "loss": 1.4886, + "step": 11430 + }, + { + "epoch": 0.3738883076118232, + "grad_norm": 3.146966649569656, + "learning_rate": 1.5767897792271637e-05, + "loss": 1.4572, + "step": 11435 + }, + { + "epoch": 0.37405179178655507, + "grad_norm": 3.1068454712230316, + "learning_rate": 1.5763235011604064e-05, + "loss": 1.5299, + "step": 11440 + }, + { + "epoch": 0.37421527596128695, + "grad_norm": 3.04870389709621, + "learning_rate": 1.575857035399206e-05, + "loss": 1.5404, + "step": 11445 + }, + { + "epoch": 0.37437876013601884, + "grad_norm": 3.2124594925881906, + "learning_rate": 1.575390382095478e-05, + "loss": 1.4808, + "step": 11450 + }, + { + "epoch": 0.3745422443107507, + "grad_norm": 3.2337398595026903, + "learning_rate": 1.574923541401201e-05, + "loss": 1.5576, + "step": 11455 + }, + { + "epoch": 0.3747057284854826, + "grad_norm": 3.100809971146118, + "learning_rate": 1.574456513468412e-05, + "loss": 1.5232, + "step": 11460 + }, + { + "epoch": 0.3748692126602145, + "grad_norm": 3.2398629843834112, + "learning_rate": 1.5739892984492117e-05, + "loss": 1.4728, + "step": 11465 + }, + { + "epoch": 0.3750326968349464, + "grad_norm": 3.1941342892897095, + "learning_rate": 1.5735218964957607e-05, + "loss": 1.5909, + "step": 11470 + }, + { + "epoch": 0.3751961810096783, + "grad_norm": 3.157520167049987, + "learning_rate": 1.5730543077602796e-05, + "loss": 1.417, + "step": 11475 + }, + { + "epoch": 0.37535966518441016, + "grad_norm": 2.860623712750329, + "learning_rate": 1.572586532395051e-05, + "loss": 1.4505, + "step": 11480 + }, + { + "epoch": 0.37552314935914205, + "grad_norm": 3.057668827486356, + "learning_rate": 1.5721185705524178e-05, + "loss": 1.4642, + "step": 11485 + }, + { + "epoch": 0.37568663353387394, + "grad_norm": 3.061556530653181, + "learning_rate": 1.571650422384784e-05, + "loss": 1.4725, + "step": 11490 + }, + { + "epoch": 0.3758501177086058, + "grad_norm": 3.0657605573931894, + "learning_rate": 1.571182088044614e-05, + "loss": 1.5227, + "step": 11495 + }, + { + "epoch": 0.3760136018833377, + "grad_norm": 3.125304115395961, + "learning_rate": 1.570713567684432e-05, + "loss": 1.5753, + "step": 11500 + }, + { + "epoch": 0.3761770860580696, + "grad_norm": 3.219650861743178, + "learning_rate": 1.5702448614568243e-05, + "loss": 1.5276, + "step": 11505 + }, + { + "epoch": 0.3763405702328015, + "grad_norm": 3.004450288361641, + "learning_rate": 1.5697759695144366e-05, + "loss": 1.5746, + "step": 11510 + }, + { + "epoch": 0.3765040544075334, + "grad_norm": 3.047669663473297, + "learning_rate": 1.5693068920099764e-05, + "loss": 1.4494, + "step": 11515 + }, + { + "epoch": 0.37666753858226526, + "grad_norm": 3.133055751755215, + "learning_rate": 1.56883762909621e-05, + "loss": 1.5361, + "step": 11520 + }, + { + "epoch": 0.37683102275699715, + "grad_norm": 2.7865632939745297, + "learning_rate": 1.568368180925965e-05, + "loss": 1.3642, + "step": 11525 + }, + { + "epoch": 0.37699450693172903, + "grad_norm": 3.237786510878633, + "learning_rate": 1.567898547652129e-05, + "loss": 1.5182, + "step": 11530 + }, + { + "epoch": 0.3771579911064609, + "grad_norm": 3.4547984914483987, + "learning_rate": 1.5674287294276506e-05, + "loss": 1.4664, + "step": 11535 + }, + { + "epoch": 0.37732147528119275, + "grad_norm": 3.0442920069079853, + "learning_rate": 1.5669587264055373e-05, + "loss": 1.4747, + "step": 11540 + }, + { + "epoch": 0.37748495945592464, + "grad_norm": 3.231866985547192, + "learning_rate": 1.5664885387388582e-05, + "loss": 1.5417, + "step": 11545 + }, + { + "epoch": 0.3776484436306565, + "grad_norm": 3.242692652040253, + "learning_rate": 1.5660181665807413e-05, + "loss": 1.4783, + "step": 11550 + }, + { + "epoch": 0.3778119278053884, + "grad_norm": 3.2662583220857373, + "learning_rate": 1.5655476100843762e-05, + "loss": 1.5372, + "step": 11555 + }, + { + "epoch": 0.3779754119801203, + "grad_norm": 3.209086436733002, + "learning_rate": 1.5650768694030108e-05, + "loss": 1.5234, + "step": 11560 + }, + { + "epoch": 0.3781388961548522, + "grad_norm": 3.055815315210023, + "learning_rate": 1.5646059446899544e-05, + "loss": 1.4385, + "step": 11565 + }, + { + "epoch": 0.3783023803295841, + "grad_norm": 3.128086229995038, + "learning_rate": 1.564134836098575e-05, + "loss": 1.5812, + "step": 11570 + }, + { + "epoch": 0.37846586450431596, + "grad_norm": 3.1312271615405343, + "learning_rate": 1.5636635437823017e-05, + "loss": 1.3874, + "step": 11575 + }, + { + "epoch": 0.37862934867904785, + "grad_norm": 3.002715595126176, + "learning_rate": 1.563192067894622e-05, + "loss": 1.4003, + "step": 11580 + }, + { + "epoch": 0.37879283285377974, + "grad_norm": 2.894402344992888, + "learning_rate": 1.5627204085890855e-05, + "loss": 1.5053, + "step": 11585 + }, + { + "epoch": 0.3789563170285116, + "grad_norm": 3.051666758937181, + "learning_rate": 1.5622485660192984e-05, + "loss": 1.5376, + "step": 11590 + }, + { + "epoch": 0.3791198012032435, + "grad_norm": 3.208093005839207, + "learning_rate": 1.56177654033893e-05, + "loss": 1.5407, + "step": 11595 + }, + { + "epoch": 0.3792832853779754, + "grad_norm": 3.28480664059321, + "learning_rate": 1.561304331701706e-05, + "loss": 1.4151, + "step": 11600 + }, + { + "epoch": 0.3794467695527073, + "grad_norm": 3.2994798782210983, + "learning_rate": 1.560831940261414e-05, + "loss": 1.4846, + "step": 11605 + }, + { + "epoch": 0.3796102537274392, + "grad_norm": 3.4648949847316013, + "learning_rate": 1.5603593661719e-05, + "loss": 1.4981, + "step": 11610 + }, + { + "epoch": 0.37977373790217106, + "grad_norm": 3.029899641091375, + "learning_rate": 1.5598866095870703e-05, + "loss": 1.4813, + "step": 11615 + }, + { + "epoch": 0.37993722207690295, + "grad_norm": 2.828529559701764, + "learning_rate": 1.5594136706608893e-05, + "loss": 1.3635, + "step": 11620 + }, + { + "epoch": 0.38010070625163483, + "grad_norm": 3.193708457768346, + "learning_rate": 1.558940549547382e-05, + "loss": 1.5678, + "step": 11625 + }, + { + "epoch": 0.3802641904263667, + "grad_norm": 3.075886038967623, + "learning_rate": 1.558467246400633e-05, + "loss": 1.5713, + "step": 11630 + }, + { + "epoch": 0.3804276746010986, + "grad_norm": 3.1216185473648568, + "learning_rate": 1.5579937613747847e-05, + "loss": 1.4361, + "step": 11635 + }, + { + "epoch": 0.3805911587758305, + "grad_norm": 3.312208793896105, + "learning_rate": 1.5575200946240397e-05, + "loss": 1.495, + "step": 11640 + }, + { + "epoch": 0.3807546429505624, + "grad_norm": 3.0385461877923854, + "learning_rate": 1.5570462463026595e-05, + "loss": 1.4393, + "step": 11645 + }, + { + "epoch": 0.38091812712529427, + "grad_norm": 3.069330248562518, + "learning_rate": 1.556572216564966e-05, + "loss": 1.5124, + "step": 11650 + }, + { + "epoch": 0.38108161130002616, + "grad_norm": 3.3252788570392977, + "learning_rate": 1.5560980055653376e-05, + "loss": 1.4033, + "step": 11655 + }, + { + "epoch": 0.38124509547475804, + "grad_norm": 3.1041770980842056, + "learning_rate": 1.5556236134582138e-05, + "loss": 1.4151, + "step": 11660 + }, + { + "epoch": 0.38140857964948993, + "grad_norm": 3.677175035143127, + "learning_rate": 1.5551490403980927e-05, + "loss": 1.5889, + "step": 11665 + }, + { + "epoch": 0.3815720638242218, + "grad_norm": 3.0529544417977963, + "learning_rate": 1.554674286539531e-05, + "loss": 1.4534, + "step": 11670 + }, + { + "epoch": 0.3817355479989537, + "grad_norm": 3.096875252066191, + "learning_rate": 1.5541993520371444e-05, + "loss": 1.3826, + "step": 11675 + }, + { + "epoch": 0.3818990321736856, + "grad_norm": 3.3057322785502152, + "learning_rate": 1.5537242370456072e-05, + "loss": 1.3954, + "step": 11680 + }, + { + "epoch": 0.3820625163484175, + "grad_norm": 3.2796766011913103, + "learning_rate": 1.553248941719653e-05, + "loss": 1.3932, + "step": 11685 + }, + { + "epoch": 0.38222600052314937, + "grad_norm": 3.0747123489789954, + "learning_rate": 1.552773466214074e-05, + "loss": 1.4022, + "step": 11690 + }, + { + "epoch": 0.38238948469788125, + "grad_norm": 3.2304368261659397, + "learning_rate": 1.5522978106837204e-05, + "loss": 1.55, + "step": 11695 + }, + { + "epoch": 0.38255296887261314, + "grad_norm": 3.21486824732981, + "learning_rate": 1.5518219752835018e-05, + "loss": 1.4889, + "step": 11700 + }, + { + "epoch": 0.38271645304734503, + "grad_norm": 3.3843151505363758, + "learning_rate": 1.551345960168386e-05, + "loss": 1.7822, + "step": 11705 + }, + { + "epoch": 0.3828799372220769, + "grad_norm": 3.169261238918564, + "learning_rate": 1.5508697654934e-05, + "loss": 1.4661, + "step": 11710 + }, + { + "epoch": 0.3830434213968088, + "grad_norm": 3.1180835114267627, + "learning_rate": 1.5503933914136282e-05, + "loss": 1.5808, + "step": 11715 + }, + { + "epoch": 0.3832069055715407, + "grad_norm": 2.9788833492083615, + "learning_rate": 1.5499168380842142e-05, + "loss": 1.4574, + "step": 11720 + }, + { + "epoch": 0.3833703897462726, + "grad_norm": 2.950017976997185, + "learning_rate": 1.5494401056603595e-05, + "loss": 1.3762, + "step": 11725 + }, + { + "epoch": 0.38353387392100446, + "grad_norm": 3.0932486394632366, + "learning_rate": 1.548963194297324e-05, + "loss": 1.436, + "step": 11730 + }, + { + "epoch": 0.38369735809573635, + "grad_norm": 3.11800048938595, + "learning_rate": 1.548486104150427e-05, + "loss": 1.518, + "step": 11735 + }, + { + "epoch": 0.38386084227046824, + "grad_norm": 3.4469779602315658, + "learning_rate": 1.548008835375044e-05, + "loss": 1.523, + "step": 11740 + }, + { + "epoch": 0.3840243264452001, + "grad_norm": 3.4235874477971033, + "learning_rate": 1.5475313881266105e-05, + "loss": 1.4973, + "step": 11745 + }, + { + "epoch": 0.384187810619932, + "grad_norm": 3.299151083827066, + "learning_rate": 1.5470537625606187e-05, + "loss": 1.5546, + "step": 11750 + }, + { + "epoch": 0.3843512947946639, + "grad_norm": 3.2596985040996445, + "learning_rate": 1.5465759588326203e-05, + "loss": 1.5178, + "step": 11755 + }, + { + "epoch": 0.3845147789693958, + "grad_norm": 3.0961710655758896, + "learning_rate": 1.5460979770982235e-05, + "loss": 1.4563, + "step": 11760 + }, + { + "epoch": 0.3846782631441277, + "grad_norm": 3.283776641446498, + "learning_rate": 1.5456198175130957e-05, + "loss": 1.5244, + "step": 11765 + }, + { + "epoch": 0.38484174731885956, + "grad_norm": 3.15243474572595, + "learning_rate": 1.5451414802329622e-05, + "loss": 1.4612, + "step": 11770 + }, + { + "epoch": 0.3850052314935914, + "grad_norm": 3.2756982801229095, + "learning_rate": 1.5446629654136045e-05, + "loss": 1.5247, + "step": 11775 + }, + { + "epoch": 0.3851687156683233, + "grad_norm": 3.407890121542529, + "learning_rate": 1.5441842732108642e-05, + "loss": 1.4203, + "step": 11780 + }, + { + "epoch": 0.38533219984305517, + "grad_norm": 2.7864968996680344, + "learning_rate": 1.5437054037806393e-05, + "loss": 1.4295, + "step": 11785 + }, + { + "epoch": 0.38549568401778705, + "grad_norm": 3.1258907346622826, + "learning_rate": 1.5432263572788856e-05, + "loss": 1.4547, + "step": 11790 + }, + { + "epoch": 0.38565916819251894, + "grad_norm": 3.166841716480746, + "learning_rate": 1.542747133861617e-05, + "loss": 1.4983, + "step": 11795 + }, + { + "epoch": 0.38582265236725083, + "grad_norm": 3.106772548142214, + "learning_rate": 1.5422677336849053e-05, + "loss": 1.3747, + "step": 11800 + }, + { + "epoch": 0.3859861365419827, + "grad_norm": 3.185396878259495, + "learning_rate": 1.5417881569048784e-05, + "loss": 1.5534, + "step": 11805 + }, + { + "epoch": 0.3861496207167146, + "grad_norm": 3.0921382643084128, + "learning_rate": 1.5413084036777238e-05, + "loss": 1.544, + "step": 11810 + }, + { + "epoch": 0.3863131048914465, + "grad_norm": 2.965803365679799, + "learning_rate": 1.5408284741596843e-05, + "loss": 1.4485, + "step": 11815 + }, + { + "epoch": 0.3864765890661784, + "grad_norm": 3.2508336893156065, + "learning_rate": 1.540348368507062e-05, + "loss": 1.4963, + "step": 11820 + }, + { + "epoch": 0.38664007324091026, + "grad_norm": 3.129121485645896, + "learning_rate": 1.5398680868762152e-05, + "loss": 1.4701, + "step": 11825 + }, + { + "epoch": 0.38680355741564215, + "grad_norm": 3.1798238487136277, + "learning_rate": 1.5393876294235603e-05, + "loss": 1.466, + "step": 11830 + }, + { + "epoch": 0.38696704159037404, + "grad_norm": 3.215327882656136, + "learning_rate": 1.5389069963055692e-05, + "loss": 1.5893, + "step": 11835 + }, + { + "epoch": 0.3871305257651059, + "grad_norm": 2.7289551950582824, + "learning_rate": 1.538426187678774e-05, + "loss": 1.4156, + "step": 11840 + }, + { + "epoch": 0.3872940099398378, + "grad_norm": 3.168352835942204, + "learning_rate": 1.5379452036997612e-05, + "loss": 1.4529, + "step": 11845 + }, + { + "epoch": 0.3874574941145697, + "grad_norm": 3.084751525154363, + "learning_rate": 1.5374640445251762e-05, + "loss": 1.4094, + "step": 11850 + }, + { + "epoch": 0.3876209782893016, + "grad_norm": 3.211303163925531, + "learning_rate": 1.5369827103117202e-05, + "loss": 1.5724, + "step": 11855 + }, + { + "epoch": 0.3877844624640335, + "grad_norm": 3.180544695774094, + "learning_rate": 1.5365012012161522e-05, + "loss": 1.3293, + "step": 11860 + }, + { + "epoch": 0.38794794663876536, + "grad_norm": 3.0364945229227516, + "learning_rate": 1.5360195173952878e-05, + "loss": 1.3122, + "step": 11865 + }, + { + "epoch": 0.38811143081349725, + "grad_norm": 2.983346671600305, + "learning_rate": 1.5355376590059997e-05, + "loss": 1.4115, + "step": 11870 + }, + { + "epoch": 0.38827491498822914, + "grad_norm": 3.0014892297529037, + "learning_rate": 1.5350556262052178e-05, + "loss": 1.3615, + "step": 11875 + }, + { + "epoch": 0.388438399162961, + "grad_norm": 3.2919307941009452, + "learning_rate": 1.5345734191499276e-05, + "loss": 1.475, + "step": 11880 + }, + { + "epoch": 0.3886018833376929, + "grad_norm": 2.807105202187302, + "learning_rate": 1.5340910379971724e-05, + "loss": 1.3501, + "step": 11885 + }, + { + "epoch": 0.3887653675124248, + "grad_norm": 2.8156301456172397, + "learning_rate": 1.5336084829040517e-05, + "loss": 1.3662, + "step": 11890 + }, + { + "epoch": 0.3889288516871567, + "grad_norm": 3.4148923405808644, + "learning_rate": 1.5331257540277227e-05, + "loss": 1.5711, + "step": 11895 + }, + { + "epoch": 0.38909233586188857, + "grad_norm": 2.9833623751239724, + "learning_rate": 1.5326428515253977e-05, + "loss": 1.5106, + "step": 11900 + }, + { + "epoch": 0.38925582003662046, + "grad_norm": 3.2514639481098193, + "learning_rate": 1.5321597755543463e-05, + "loss": 1.504, + "step": 11905 + }, + { + "epoch": 0.38941930421135235, + "grad_norm": 3.0609709115301738, + "learning_rate": 1.531676526271895e-05, + "loss": 1.4719, + "step": 11910 + }, + { + "epoch": 0.38958278838608423, + "grad_norm": 3.2360259857904725, + "learning_rate": 1.531193103835425e-05, + "loss": 1.5365, + "step": 11915 + }, + { + "epoch": 0.3897462725608161, + "grad_norm": 3.2445537819515082, + "learning_rate": 1.5307095084023765e-05, + "loss": 1.5407, + "step": 11920 + }, + { + "epoch": 0.389909756735548, + "grad_norm": 3.2313267613117103, + "learning_rate": 1.5302257401302438e-05, + "loss": 1.4263, + "step": 11925 + }, + { + "epoch": 0.3900732409102799, + "grad_norm": 3.317820566135025, + "learning_rate": 1.529741799176579e-05, + "loss": 1.3698, + "step": 11930 + }, + { + "epoch": 0.3902367250850118, + "grad_norm": 3.1514551460605245, + "learning_rate": 1.529257685698989e-05, + "loss": 1.538, + "step": 11935 + }, + { + "epoch": 0.39040020925974367, + "grad_norm": 3.2516393376766066, + "learning_rate": 1.5287733998551386e-05, + "loss": 1.5271, + "step": 11940 + }, + { + "epoch": 0.39056369343447556, + "grad_norm": 3.1710046025703638, + "learning_rate": 1.5282889418027475e-05, + "loss": 1.5801, + "step": 11945 + }, + { + "epoch": 0.39072717760920744, + "grad_norm": 3.097163141600787, + "learning_rate": 1.5278043116995917e-05, + "loss": 1.4674, + "step": 11950 + }, + { + "epoch": 0.39089066178393933, + "grad_norm": 2.8911313156499063, + "learning_rate": 1.5273195097035035e-05, + "loss": 1.4076, + "step": 11955 + }, + { + "epoch": 0.3910541459586712, + "grad_norm": 3.1651423117734425, + "learning_rate": 1.526834535972371e-05, + "loss": 1.5368, + "step": 11960 + }, + { + "epoch": 0.3912176301334031, + "grad_norm": 3.272744712845591, + "learning_rate": 1.5263493906641378e-05, + "loss": 1.5451, + "step": 11965 + }, + { + "epoch": 0.391381114308135, + "grad_norm": 3.1349891674496955, + "learning_rate": 1.5258640739368044e-05, + "loss": 1.4564, + "step": 11970 + }, + { + "epoch": 0.3915445984828669, + "grad_norm": 3.1809088903684626, + "learning_rate": 1.5253785859484267e-05, + "loss": 1.5525, + "step": 11975 + }, + { + "epoch": 0.39170808265759877, + "grad_norm": 3.153148216164303, + "learning_rate": 1.5248929268571156e-05, + "loss": 1.424, + "step": 11980 + }, + { + "epoch": 0.39187156683233065, + "grad_norm": 3.2201607456068873, + "learning_rate": 1.5244070968210389e-05, + "loss": 1.5584, + "step": 11985 + }, + { + "epoch": 0.39203505100706254, + "grad_norm": 3.5868264125948164, + "learning_rate": 1.5239210959984195e-05, + "loss": 1.6103, + "step": 11990 + }, + { + "epoch": 0.3921985351817944, + "grad_norm": 3.1264622796446115, + "learning_rate": 1.5234349245475356e-05, + "loss": 1.5258, + "step": 11995 + }, + { + "epoch": 0.3923620193565263, + "grad_norm": 3.1756178627125795, + "learning_rate": 1.5229485826267216e-05, + "loss": 1.5131, + "step": 12000 + }, + { + "epoch": 0.3925255035312582, + "grad_norm": 3.243387136294429, + "learning_rate": 1.5224620703943671e-05, + "loss": 1.607, + "step": 12005 + }, + { + "epoch": 0.39268898770599003, + "grad_norm": 3.27284068643561, + "learning_rate": 1.5219753880089175e-05, + "loss": 1.4601, + "step": 12010 + }, + { + "epoch": 0.3928524718807219, + "grad_norm": 2.964328632512975, + "learning_rate": 1.5214885356288727e-05, + "loss": 1.3691, + "step": 12015 + }, + { + "epoch": 0.3930159560554538, + "grad_norm": 3.269261949353604, + "learning_rate": 1.5210015134127889e-05, + "loss": 1.5155, + "step": 12020 + }, + { + "epoch": 0.3931794402301857, + "grad_norm": 3.0413493563331104, + "learning_rate": 1.5205143215192775e-05, + "loss": 1.5613, + "step": 12025 + }, + { + "epoch": 0.3933429244049176, + "grad_norm": 3.280329882153739, + "learning_rate": 1.5200269601070047e-05, + "loss": 1.5338, + "step": 12030 + }, + { + "epoch": 0.39350640857964947, + "grad_norm": 2.9781141000318754, + "learning_rate": 1.5195394293346926e-05, + "loss": 1.4217, + "step": 12035 + }, + { + "epoch": 0.39366989275438136, + "grad_norm": 2.855942495760017, + "learning_rate": 1.5190517293611175e-05, + "loss": 1.381, + "step": 12040 + }, + { + "epoch": 0.39383337692911324, + "grad_norm": 3.2138331473848907, + "learning_rate": 1.5185638603451113e-05, + "loss": 1.6186, + "step": 12045 + }, + { + "epoch": 0.39399686110384513, + "grad_norm": 2.9823999782537944, + "learning_rate": 1.5180758224455617e-05, + "loss": 1.4504, + "step": 12050 + }, + { + "epoch": 0.394160345278577, + "grad_norm": 3.3483196692327946, + "learning_rate": 1.5175876158214099e-05, + "loss": 1.4532, + "step": 12055 + }, + { + "epoch": 0.3943238294533089, + "grad_norm": 2.909365407430407, + "learning_rate": 1.5170992406316528e-05, + "loss": 1.2858, + "step": 12060 + }, + { + "epoch": 0.3944873136280408, + "grad_norm": 3.0942992082725445, + "learning_rate": 1.5166106970353431e-05, + "loss": 1.525, + "step": 12065 + }, + { + "epoch": 0.3946507978027727, + "grad_norm": 3.190909373820485, + "learning_rate": 1.5161219851915867e-05, + "loss": 1.5365, + "step": 12070 + }, + { + "epoch": 0.39481428197750457, + "grad_norm": 3.097312275892072, + "learning_rate": 1.5156331052595454e-05, + "loss": 1.4319, + "step": 12075 + }, + { + "epoch": 0.39497776615223645, + "grad_norm": 3.067486218573727, + "learning_rate": 1.5151440573984353e-05, + "loss": 1.536, + "step": 12080 + }, + { + "epoch": 0.39514125032696834, + "grad_norm": 3.1985049765341507, + "learning_rate": 1.5146548417675275e-05, + "loss": 1.4308, + "step": 12085 + }, + { + "epoch": 0.39530473450170023, + "grad_norm": 3.1434712165902177, + "learning_rate": 1.5141654585261474e-05, + "loss": 1.5731, + "step": 12090 + }, + { + "epoch": 0.3954682186764321, + "grad_norm": 3.87072066810193, + "learning_rate": 1.5136759078336746e-05, + "loss": 1.4489, + "step": 12095 + }, + { + "epoch": 0.395631702851164, + "grad_norm": 3.210851706535205, + "learning_rate": 1.513186189849545e-05, + "loss": 1.483, + "step": 12100 + }, + { + "epoch": 0.3957951870258959, + "grad_norm": 3.1916630108101445, + "learning_rate": 1.5126963047332469e-05, + "loss": 1.3438, + "step": 12105 + }, + { + "epoch": 0.3959586712006278, + "grad_norm": 3.1342595815486685, + "learning_rate": 1.5122062526443238e-05, + "loss": 1.546, + "step": 12110 + }, + { + "epoch": 0.39612215537535966, + "grad_norm": 3.447294095799586, + "learning_rate": 1.5117160337423742e-05, + "loss": 1.4993, + "step": 12115 + }, + { + "epoch": 0.39628563955009155, + "grad_norm": 3.4580695168732256, + "learning_rate": 1.5112256481870495e-05, + "loss": 1.6776, + "step": 12120 + }, + { + "epoch": 0.39644912372482344, + "grad_norm": 3.0525459424789676, + "learning_rate": 1.5107350961380571e-05, + "loss": 1.5434, + "step": 12125 + }, + { + "epoch": 0.3966126078995553, + "grad_norm": 3.1699533227824697, + "learning_rate": 1.5102443777551577e-05, + "loss": 1.5195, + "step": 12130 + }, + { + "epoch": 0.3967760920742872, + "grad_norm": 2.8913918573201753, + "learning_rate": 1.5097534931981658e-05, + "loss": 1.535, + "step": 12135 + }, + { + "epoch": 0.3969395762490191, + "grad_norm": 2.985872755268671, + "learning_rate": 1.5092624426269511e-05, + "loss": 1.4058, + "step": 12140 + }, + { + "epoch": 0.397103060423751, + "grad_norm": 3.23461215816092, + "learning_rate": 1.5087712262014357e-05, + "loss": 1.4647, + "step": 12145 + }, + { + "epoch": 0.3972665445984829, + "grad_norm": 2.9698488644131773, + "learning_rate": 1.508279844081598e-05, + "loss": 1.4741, + "step": 12150 + }, + { + "epoch": 0.39743002877321476, + "grad_norm": 3.4030655063278044, + "learning_rate": 1.5077882964274687e-05, + "loss": 1.5442, + "step": 12155 + }, + { + "epoch": 0.39759351294794665, + "grad_norm": 3.231549887034292, + "learning_rate": 1.5072965833991322e-05, + "loss": 1.431, + "step": 12160 + }, + { + "epoch": 0.39775699712267854, + "grad_norm": 3.064899372087165, + "learning_rate": 1.506804705156728e-05, + "loss": 1.4345, + "step": 12165 + }, + { + "epoch": 0.3979204812974104, + "grad_norm": 3.0491879975683625, + "learning_rate": 1.5063126618604486e-05, + "loss": 1.6074, + "step": 12170 + }, + { + "epoch": 0.3980839654721423, + "grad_norm": 3.190716078722282, + "learning_rate": 1.5058204536705405e-05, + "loss": 1.4038, + "step": 12175 + }, + { + "epoch": 0.3982474496468742, + "grad_norm": 3.1801705142646655, + "learning_rate": 1.5053280807473042e-05, + "loss": 1.5755, + "step": 12180 + }, + { + "epoch": 0.3984109338216061, + "grad_norm": 3.0131727478851427, + "learning_rate": 1.5048355432510927e-05, + "loss": 1.569, + "step": 12185 + }, + { + "epoch": 0.39857441799633797, + "grad_norm": 3.244769474754786, + "learning_rate": 1.504342841342314e-05, + "loss": 1.4588, + "step": 12190 + }, + { + "epoch": 0.39873790217106986, + "grad_norm": 3.0457931403047707, + "learning_rate": 1.5038499751814288e-05, + "loss": 1.4416, + "step": 12195 + }, + { + "epoch": 0.39890138634580175, + "grad_norm": 3.464675222697592, + "learning_rate": 1.5033569449289516e-05, + "loss": 1.4256, + "step": 12200 + }, + { + "epoch": 0.39906487052053363, + "grad_norm": 3.5178730050099767, + "learning_rate": 1.5028637507454505e-05, + "loss": 1.5541, + "step": 12205 + }, + { + "epoch": 0.3992283546952655, + "grad_norm": 3.1028223784519553, + "learning_rate": 1.5023703927915462e-05, + "loss": 1.5169, + "step": 12210 + }, + { + "epoch": 0.3993918388699974, + "grad_norm": 3.078597886215295, + "learning_rate": 1.5018768712279142e-05, + "loss": 1.3748, + "step": 12215 + }, + { + "epoch": 0.3995553230447293, + "grad_norm": 3.067740027409064, + "learning_rate": 1.5013831862152812e-05, + "loss": 1.5083, + "step": 12220 + }, + { + "epoch": 0.3997188072194612, + "grad_norm": 3.0923323259495934, + "learning_rate": 1.5008893379144294e-05, + "loss": 1.5948, + "step": 12225 + }, + { + "epoch": 0.39988229139419307, + "grad_norm": 3.211003605361355, + "learning_rate": 1.5003953264861924e-05, + "loss": 1.5106, + "step": 12230 + }, + { + "epoch": 0.40004577556892496, + "grad_norm": 3.1876156468846406, + "learning_rate": 1.499901152091458e-05, + "loss": 1.5024, + "step": 12235 + }, + { + "epoch": 0.4002092597436568, + "grad_norm": 3.2700830088531423, + "learning_rate": 1.4994068148911662e-05, + "loss": 1.5317, + "step": 12240 + }, + { + "epoch": 0.4003727439183887, + "grad_norm": 3.262975434837536, + "learning_rate": 1.4989123150463112e-05, + "loss": 1.5292, + "step": 12245 + }, + { + "epoch": 0.40053622809312056, + "grad_norm": 3.0540794334977353, + "learning_rate": 1.4984176527179389e-05, + "loss": 1.5149, + "step": 12250 + }, + { + "epoch": 0.40069971226785245, + "grad_norm": 3.0562253265724677, + "learning_rate": 1.4979228280671491e-05, + "loss": 1.4477, + "step": 12255 + }, + { + "epoch": 0.40086319644258434, + "grad_norm": 3.1849525999015773, + "learning_rate": 1.4974278412550937e-05, + "loss": 1.5544, + "step": 12260 + }, + { + "epoch": 0.4010266806173162, + "grad_norm": 3.1771501992138345, + "learning_rate": 1.4969326924429783e-05, + "loss": 1.5944, + "step": 12265 + }, + { + "epoch": 0.4011901647920481, + "grad_norm": 3.2076293034349406, + "learning_rate": 1.49643738179206e-05, + "loss": 1.4484, + "step": 12270 + }, + { + "epoch": 0.40135364896678, + "grad_norm": 3.2538372232465433, + "learning_rate": 1.49594190946365e-05, + "loss": 1.7038, + "step": 12275 + }, + { + "epoch": 0.4015171331415119, + "grad_norm": 3.2423253954054934, + "learning_rate": 1.4954462756191113e-05, + "loss": 1.5041, + "step": 12280 + }, + { + "epoch": 0.40168061731624377, + "grad_norm": 3.2122103137720455, + "learning_rate": 1.4949504804198599e-05, + "loss": 1.4894, + "step": 12285 + }, + { + "epoch": 0.40184410149097566, + "grad_norm": 3.0370468221735214, + "learning_rate": 1.4944545240273634e-05, + "loss": 1.5259, + "step": 12290 + }, + { + "epoch": 0.40200758566570755, + "grad_norm": 3.336719047136376, + "learning_rate": 1.4939584066031434e-05, + "loss": 1.5113, + "step": 12295 + }, + { + "epoch": 0.40217106984043943, + "grad_norm": 3.051835749368279, + "learning_rate": 1.4934621283087727e-05, + "loss": 1.4326, + "step": 12300 + }, + { + "epoch": 0.4023345540151713, + "grad_norm": 3.087763130940389, + "learning_rate": 1.4929656893058775e-05, + "loss": 1.3675, + "step": 12305 + }, + { + "epoch": 0.4024980381899032, + "grad_norm": 3.2413502378677634, + "learning_rate": 1.4924690897561353e-05, + "loss": 1.5543, + "step": 12310 + }, + { + "epoch": 0.4026615223646351, + "grad_norm": 2.9984937347761806, + "learning_rate": 1.4919723298212772e-05, + "loss": 1.4173, + "step": 12315 + }, + { + "epoch": 0.402825006539367, + "grad_norm": 3.0127864078320803, + "learning_rate": 1.4914754096630848e-05, + "loss": 1.5209, + "step": 12320 + }, + { + "epoch": 0.40298849071409887, + "grad_norm": 3.237300133205722, + "learning_rate": 1.490978329443393e-05, + "loss": 1.4896, + "step": 12325 + }, + { + "epoch": 0.40315197488883076, + "grad_norm": 3.251297306248211, + "learning_rate": 1.490481089324089e-05, + "loss": 1.7196, + "step": 12330 + }, + { + "epoch": 0.40331545906356264, + "grad_norm": 3.2513232335212723, + "learning_rate": 1.4899836894671118e-05, + "loss": 1.4118, + "step": 12335 + }, + { + "epoch": 0.40347894323829453, + "grad_norm": 3.068351920367775, + "learning_rate": 1.489486130034452e-05, + "loss": 1.4793, + "step": 12340 + }, + { + "epoch": 0.4036424274130264, + "grad_norm": 3.299093596201307, + "learning_rate": 1.4889884111881527e-05, + "loss": 1.5561, + "step": 12345 + }, + { + "epoch": 0.4038059115877583, + "grad_norm": 3.569830548235071, + "learning_rate": 1.4884905330903083e-05, + "loss": 1.601, + "step": 12350 + }, + { + "epoch": 0.4039693957624902, + "grad_norm": 3.2650637000773512, + "learning_rate": 1.4879924959030663e-05, + "loss": 1.5383, + "step": 12355 + }, + { + "epoch": 0.4041328799372221, + "grad_norm": 3.1165786961205044, + "learning_rate": 1.4874942997886246e-05, + "loss": 1.5068, + "step": 12360 + }, + { + "epoch": 0.40429636411195397, + "grad_norm": 3.056114981756997, + "learning_rate": 1.4869959449092336e-05, + "loss": 1.4802, + "step": 12365 + }, + { + "epoch": 0.40445984828668585, + "grad_norm": 3.1801846820363755, + "learning_rate": 1.4864974314271956e-05, + "loss": 1.5165, + "step": 12370 + }, + { + "epoch": 0.40462333246141774, + "grad_norm": 3.1560688143966904, + "learning_rate": 1.4859987595048638e-05, + "loss": 1.5055, + "step": 12375 + }, + { + "epoch": 0.4047868166361496, + "grad_norm": 3.3231979895018986, + "learning_rate": 1.4854999293046433e-05, + "loss": 1.5474, + "step": 12380 + }, + { + "epoch": 0.4049503008108815, + "grad_norm": 3.432230187389495, + "learning_rate": 1.4850009409889914e-05, + "loss": 1.5794, + "step": 12385 + }, + { + "epoch": 0.4051137849856134, + "grad_norm": 3.0283829993989073, + "learning_rate": 1.484501794720416e-05, + "loss": 1.5168, + "step": 12390 + }, + { + "epoch": 0.4052772691603453, + "grad_norm": 3.141298971753372, + "learning_rate": 1.4840024906614772e-05, + "loss": 1.363, + "step": 12395 + }, + { + "epoch": 0.4054407533350772, + "grad_norm": 3.224732308603274, + "learning_rate": 1.4835030289747858e-05, + "loss": 1.4276, + "step": 12400 + }, + { + "epoch": 0.40560423750980906, + "grad_norm": 3.146420465060425, + "learning_rate": 1.483003409823004e-05, + "loss": 1.4163, + "step": 12405 + }, + { + "epoch": 0.40576772168454095, + "grad_norm": 3.3746761950352733, + "learning_rate": 1.4825036333688458e-05, + "loss": 1.4331, + "step": 12410 + }, + { + "epoch": 0.40593120585927284, + "grad_norm": 3.1883726220204256, + "learning_rate": 1.4820036997750765e-05, + "loss": 1.5112, + "step": 12415 + }, + { + "epoch": 0.4060946900340047, + "grad_norm": 2.9674864565755295, + "learning_rate": 1.4815036092045113e-05, + "loss": 1.4188, + "step": 12420 + }, + { + "epoch": 0.4062581742087366, + "grad_norm": 3.99762047627081, + "learning_rate": 1.4810033618200185e-05, + "loss": 1.5769, + "step": 12425 + }, + { + "epoch": 0.4064216583834685, + "grad_norm": 3.2758898547795368, + "learning_rate": 1.4805029577845157e-05, + "loss": 1.5301, + "step": 12430 + }, + { + "epoch": 0.4065851425582004, + "grad_norm": 2.9527330171152415, + "learning_rate": 1.4800023972609726e-05, + "loss": 1.4485, + "step": 12435 + }, + { + "epoch": 0.4067486267329323, + "grad_norm": 3.1310747649531296, + "learning_rate": 1.4795016804124091e-05, + "loss": 1.4883, + "step": 12440 + }, + { + "epoch": 0.40691211090766416, + "grad_norm": 3.196785368449703, + "learning_rate": 1.479000807401897e-05, + "loss": 1.5359, + "step": 12445 + }, + { + "epoch": 0.40707559508239605, + "grad_norm": 3.351609590701131, + "learning_rate": 1.4784997783925576e-05, + "loss": 1.4704, + "step": 12450 + }, + { + "epoch": 0.40723907925712793, + "grad_norm": 3.387951492233911, + "learning_rate": 1.4779985935475643e-05, + "loss": 1.5124, + "step": 12455 + }, + { + "epoch": 0.4074025634318598, + "grad_norm": 2.92832084682855, + "learning_rate": 1.4774972530301406e-05, + "loss": 1.3494, + "step": 12460 + }, + { + "epoch": 0.4075660476065917, + "grad_norm": 3.231456032429591, + "learning_rate": 1.476995757003561e-05, + "loss": 1.5052, + "step": 12465 + }, + { + "epoch": 0.4077295317813236, + "grad_norm": 3.0940370485144215, + "learning_rate": 1.4764941056311503e-05, + "loss": 1.485, + "step": 12470 + }, + { + "epoch": 0.4078930159560554, + "grad_norm": 3.241878039614136, + "learning_rate": 1.475992299076284e-05, + "loss": 1.4597, + "step": 12475 + }, + { + "epoch": 0.4080565001307873, + "grad_norm": 3.1811669126737714, + "learning_rate": 1.4754903375023881e-05, + "loss": 1.5086, + "step": 12480 + }, + { + "epoch": 0.4082199843055192, + "grad_norm": 2.988566353894158, + "learning_rate": 1.4749882210729397e-05, + "loss": 1.3954, + "step": 12485 + }, + { + "epoch": 0.4083834684802511, + "grad_norm": 3.2830685614049067, + "learning_rate": 1.4744859499514653e-05, + "loss": 1.5121, + "step": 12490 + }, + { + "epoch": 0.408546952654983, + "grad_norm": 3.4433374617293584, + "learning_rate": 1.4739835243015423e-05, + "loss": 1.5304, + "step": 12495 + }, + { + "epoch": 0.40871043682971486, + "grad_norm": 3.062794911982985, + "learning_rate": 1.4734809442867988e-05, + "loss": 1.4266, + "step": 12500 + }, + { + "epoch": 0.40887392100444675, + "grad_norm": 3.160991050979007, + "learning_rate": 1.4729782100709127e-05, + "loss": 1.5507, + "step": 12505 + }, + { + "epoch": 0.40903740517917864, + "grad_norm": 3.282520933030134, + "learning_rate": 1.4724753218176117e-05, + "loss": 1.4629, + "step": 12510 + }, + { + "epoch": 0.4092008893539105, + "grad_norm": 3.3199786580213195, + "learning_rate": 1.4719722796906748e-05, + "loss": 1.4438, + "step": 12515 + }, + { + "epoch": 0.4093643735286424, + "grad_norm": 3.2463671948154396, + "learning_rate": 1.4714690838539305e-05, + "loss": 1.401, + "step": 12520 + }, + { + "epoch": 0.4095278577033743, + "grad_norm": 3.2723478093581075, + "learning_rate": 1.4709657344712568e-05, + "loss": 1.4702, + "step": 12525 + }, + { + "epoch": 0.4096913418781062, + "grad_norm": 3.1746843707781727, + "learning_rate": 1.4704622317065832e-05, + "loss": 1.384, + "step": 12530 + }, + { + "epoch": 0.4098548260528381, + "grad_norm": 3.0527774014712485, + "learning_rate": 1.469958575723887e-05, + "loss": 1.4417, + "step": 12535 + }, + { + "epoch": 0.41001831022756996, + "grad_norm": 2.9132359500792275, + "learning_rate": 1.4694547666871977e-05, + "loss": 1.4026, + "step": 12540 + }, + { + "epoch": 0.41018179440230185, + "grad_norm": 2.9105915449221653, + "learning_rate": 1.4689508047605927e-05, + "loss": 1.4143, + "step": 12545 + }, + { + "epoch": 0.41034527857703373, + "grad_norm": 3.3010413399154004, + "learning_rate": 1.4684466901082006e-05, + "loss": 1.5963, + "step": 12550 + }, + { + "epoch": 0.4105087627517656, + "grad_norm": 3.283729401006113, + "learning_rate": 1.467942422894199e-05, + "loss": 1.5051, + "step": 12555 + }, + { + "epoch": 0.4106722469264975, + "grad_norm": 3.1151660646524757, + "learning_rate": 1.4674380032828154e-05, + "loss": 1.3983, + "step": 12560 + }, + { + "epoch": 0.4108357311012294, + "grad_norm": 3.492128588279019, + "learning_rate": 1.466933431438327e-05, + "loss": 1.461, + "step": 12565 + }, + { + "epoch": 0.4109992152759613, + "grad_norm": 2.9878118462962084, + "learning_rate": 1.4664287075250604e-05, + "loss": 1.5882, + "step": 12570 + }, + { + "epoch": 0.41116269945069317, + "grad_norm": 2.8114890398012196, + "learning_rate": 1.4659238317073918e-05, + "loss": 1.5181, + "step": 12575 + }, + { + "epoch": 0.41132618362542506, + "grad_norm": 3.09342131560474, + "learning_rate": 1.465418804149747e-05, + "loss": 1.5204, + "step": 12580 + }, + { + "epoch": 0.41148966780015694, + "grad_norm": 3.124007008987131, + "learning_rate": 1.4649136250166006e-05, + "loss": 1.5836, + "step": 12585 + }, + { + "epoch": 0.41165315197488883, + "grad_norm": 3.052481245270543, + "learning_rate": 1.4644082944724777e-05, + "loss": 1.5408, + "step": 12590 + }, + { + "epoch": 0.4118166361496207, + "grad_norm": 3.1536724081390055, + "learning_rate": 1.4639028126819521e-05, + "loss": 1.5571, + "step": 12595 + }, + { + "epoch": 0.4119801203243526, + "grad_norm": 3.1611741057992355, + "learning_rate": 1.4633971798096464e-05, + "loss": 1.5512, + "step": 12600 + }, + { + "epoch": 0.4121436044990845, + "grad_norm": 2.9329608281874515, + "learning_rate": 1.462891396020233e-05, + "loss": 1.3363, + "step": 12605 + }, + { + "epoch": 0.4123070886738164, + "grad_norm": 3.1930880551937664, + "learning_rate": 1.4623854614784331e-05, + "loss": 1.4492, + "step": 12610 + }, + { + "epoch": 0.41247057284854827, + "grad_norm": 3.034136268844583, + "learning_rate": 1.4618793763490176e-05, + "loss": 1.4622, + "step": 12615 + }, + { + "epoch": 0.41263405702328015, + "grad_norm": 3.2580943301155605, + "learning_rate": 1.461373140796806e-05, + "loss": 1.3334, + "step": 12620 + }, + { + "epoch": 0.41279754119801204, + "grad_norm": 3.247676452988337, + "learning_rate": 1.4608667549866665e-05, + "loss": 1.5593, + "step": 12625 + }, + { + "epoch": 0.41296102537274393, + "grad_norm": 3.1693001014496263, + "learning_rate": 1.4603602190835165e-05, + "loss": 1.4787, + "step": 12630 + }, + { + "epoch": 0.4131245095474758, + "grad_norm": 3.0778945162141724, + "learning_rate": 1.4598535332523227e-05, + "loss": 1.4704, + "step": 12635 + }, + { + "epoch": 0.4132879937222077, + "grad_norm": 3.1562387121678794, + "learning_rate": 1.4593466976581e-05, + "loss": 1.5321, + "step": 12640 + }, + { + "epoch": 0.4134514778969396, + "grad_norm": 3.109911583365218, + "learning_rate": 1.4588397124659126e-05, + "loss": 1.5374, + "step": 12645 + }, + { + "epoch": 0.4136149620716715, + "grad_norm": 2.9698446945291206, + "learning_rate": 1.458332577840873e-05, + "loss": 1.3954, + "step": 12650 + }, + { + "epoch": 0.41377844624640336, + "grad_norm": 3.0142300884238584, + "learning_rate": 1.457825293948142e-05, + "loss": 1.6394, + "step": 12655 + }, + { + "epoch": 0.41394193042113525, + "grad_norm": 3.144107796187508, + "learning_rate": 1.4573178609529304e-05, + "loss": 1.5653, + "step": 12660 + }, + { + "epoch": 0.41410541459586714, + "grad_norm": 3.3317080092188243, + "learning_rate": 1.4568102790204964e-05, + "loss": 1.4552, + "step": 12665 + }, + { + "epoch": 0.414268898770599, + "grad_norm": 3.286547315675162, + "learning_rate": 1.4563025483161469e-05, + "loss": 1.603, + "step": 12670 + }, + { + "epoch": 0.4144323829453309, + "grad_norm": 3.256454761550821, + "learning_rate": 1.4557946690052371e-05, + "loss": 1.5696, + "step": 12675 + }, + { + "epoch": 0.4145958671200628, + "grad_norm": 3.0949331902626045, + "learning_rate": 1.4552866412531713e-05, + "loss": 1.3422, + "step": 12680 + }, + { + "epoch": 0.4147593512947947, + "grad_norm": 3.2743557794079785, + "learning_rate": 1.4547784652254014e-05, + "loss": 1.4513, + "step": 12685 + }, + { + "epoch": 0.4149228354695266, + "grad_norm": 3.0771105574694477, + "learning_rate": 1.454270141087428e-05, + "loss": 1.5105, + "step": 12690 + }, + { + "epoch": 0.41508631964425846, + "grad_norm": 3.094893738344109, + "learning_rate": 1.4537616690048e-05, + "loss": 1.5444, + "step": 12695 + }, + { + "epoch": 0.41524980381899035, + "grad_norm": 3.211124195431417, + "learning_rate": 1.453253049143114e-05, + "loss": 1.4626, + "step": 12700 + }, + { + "epoch": 0.4154132879937222, + "grad_norm": 3.244801046927036, + "learning_rate": 1.4527442816680151e-05, + "loss": 1.3949, + "step": 12705 + }, + { + "epoch": 0.41557677216845407, + "grad_norm": 3.203365548810137, + "learning_rate": 1.4522353667451966e-05, + "loss": 1.5099, + "step": 12710 + }, + { + "epoch": 0.41574025634318595, + "grad_norm": 3.0897207252144816, + "learning_rate": 1.4517263045403991e-05, + "loss": 1.559, + "step": 12715 + }, + { + "epoch": 0.41590374051791784, + "grad_norm": 3.1659104804123848, + "learning_rate": 1.4512170952194122e-05, + "loss": 1.5307, + "step": 12720 + }, + { + "epoch": 0.41606722469264973, + "grad_norm": 2.8788686979455265, + "learning_rate": 1.450707738948073e-05, + "loss": 1.4141, + "step": 12725 + }, + { + "epoch": 0.4162307088673816, + "grad_norm": 3.356865538585046, + "learning_rate": 1.450198235892266e-05, + "loss": 1.6634, + "step": 12730 + }, + { + "epoch": 0.4163941930421135, + "grad_norm": 3.0666264724336956, + "learning_rate": 1.4496885862179237e-05, + "loss": 1.4764, + "step": 12735 + }, + { + "epoch": 0.4165576772168454, + "grad_norm": 2.9540652670403107, + "learning_rate": 1.449178790091027e-05, + "loss": 1.3812, + "step": 12740 + }, + { + "epoch": 0.4167211613915773, + "grad_norm": 3.031935748040481, + "learning_rate": 1.4486688476776039e-05, + "loss": 1.4632, + "step": 12745 + }, + { + "epoch": 0.41688464556630916, + "grad_norm": 3.132134182353702, + "learning_rate": 1.4481587591437298e-05, + "loss": 1.4801, + "step": 12750 + }, + { + "epoch": 0.41704812974104105, + "grad_norm": 2.996008698792865, + "learning_rate": 1.4476485246555285e-05, + "loss": 1.5299, + "step": 12755 + }, + { + "epoch": 0.41721161391577294, + "grad_norm": 3.113628293850963, + "learning_rate": 1.4471381443791703e-05, + "loss": 1.5477, + "step": 12760 + }, + { + "epoch": 0.4173750980905048, + "grad_norm": 3.2669894874656196, + "learning_rate": 1.446627618480874e-05, + "loss": 1.5084, + "step": 12765 + }, + { + "epoch": 0.4175385822652367, + "grad_norm": 3.267520870880615, + "learning_rate": 1.4461169471269054e-05, + "loss": 1.6005, + "step": 12770 + }, + { + "epoch": 0.4177020664399686, + "grad_norm": 3.0600605413772404, + "learning_rate": 1.4456061304835776e-05, + "loss": 1.4946, + "step": 12775 + }, + { + "epoch": 0.4178655506147005, + "grad_norm": 3.060730706251525, + "learning_rate": 1.4450951687172508e-05, + "loss": 1.4993, + "step": 12780 + }, + { + "epoch": 0.4180290347894324, + "grad_norm": 3.110237492780103, + "learning_rate": 1.444584061994333e-05, + "loss": 1.4746, + "step": 12785 + }, + { + "epoch": 0.41819251896416426, + "grad_norm": 2.8270914182214146, + "learning_rate": 1.4440728104812789e-05, + "loss": 1.3297, + "step": 12790 + }, + { + "epoch": 0.41835600313889615, + "grad_norm": 3.1559597141868716, + "learning_rate": 1.4435614143445907e-05, + "loss": 1.4315, + "step": 12795 + }, + { + "epoch": 0.41851948731362804, + "grad_norm": 3.0057708553069022, + "learning_rate": 1.4430498737508178e-05, + "loss": 1.3749, + "step": 12800 + }, + { + "epoch": 0.4186829714883599, + "grad_norm": 3.164628982431149, + "learning_rate": 1.4425381888665564e-05, + "loss": 1.4221, + "step": 12805 + }, + { + "epoch": 0.4188464556630918, + "grad_norm": 3.223288447485316, + "learning_rate": 1.4420263598584494e-05, + "loss": 1.4252, + "step": 12810 + }, + { + "epoch": 0.4190099398378237, + "grad_norm": 3.0541025764315832, + "learning_rate": 1.441514386893187e-05, + "loss": 1.5624, + "step": 12815 + }, + { + "epoch": 0.4191734240125556, + "grad_norm": 3.0644440856785993, + "learning_rate": 1.4410022701375069e-05, + "loss": 1.4285, + "step": 12820 + }, + { + "epoch": 0.41933690818728747, + "grad_norm": 3.0520210485771555, + "learning_rate": 1.4404900097581922e-05, + "loss": 1.7138, + "step": 12825 + }, + { + "epoch": 0.41950039236201936, + "grad_norm": 3.0903952582372267, + "learning_rate": 1.4399776059220739e-05, + "loss": 1.4554, + "step": 12830 + }, + { + "epoch": 0.41966387653675125, + "grad_norm": 3.434555412360749, + "learning_rate": 1.4394650587960293e-05, + "loss": 1.4904, + "step": 12835 + }, + { + "epoch": 0.41982736071148313, + "grad_norm": 3.0179048627947402, + "learning_rate": 1.4389523685469823e-05, + "loss": 1.3523, + "step": 12840 + }, + { + "epoch": 0.419990844886215, + "grad_norm": 3.0552905798832857, + "learning_rate": 1.4384395353419041e-05, + "loss": 1.5304, + "step": 12845 + }, + { + "epoch": 0.4201543290609469, + "grad_norm": 3.1419139213948397, + "learning_rate": 1.4379265593478113e-05, + "loss": 1.5452, + "step": 12850 + }, + { + "epoch": 0.4203178132356788, + "grad_norm": 3.259660153554324, + "learning_rate": 1.437413440731768e-05, + "loss": 1.5023, + "step": 12855 + }, + { + "epoch": 0.4204812974104107, + "grad_norm": 3.0631249584016613, + "learning_rate": 1.436900179660884e-05, + "loss": 1.4673, + "step": 12860 + }, + { + "epoch": 0.42064478158514257, + "grad_norm": 3.2453247438784936, + "learning_rate": 1.436386776302316e-05, + "loss": 1.6323, + "step": 12865 + }, + { + "epoch": 0.42080826575987446, + "grad_norm": 3.297231377202367, + "learning_rate": 1.4358732308232672e-05, + "loss": 1.4853, + "step": 12870 + }, + { + "epoch": 0.42097174993460634, + "grad_norm": 3.0708178609671273, + "learning_rate": 1.4353595433909863e-05, + "loss": 1.4473, + "step": 12875 + }, + { + "epoch": 0.42113523410933823, + "grad_norm": 3.38551329357128, + "learning_rate": 1.4348457141727691e-05, + "loss": 1.4985, + "step": 12880 + }, + { + "epoch": 0.4212987182840701, + "grad_norm": 3.3515309648682003, + "learning_rate": 1.4343317433359571e-05, + "loss": 1.4584, + "step": 12885 + }, + { + "epoch": 0.421462202458802, + "grad_norm": 3.090203581794414, + "learning_rate": 1.4338176310479377e-05, + "loss": 1.4546, + "step": 12890 + }, + { + "epoch": 0.4216256866335339, + "grad_norm": 3.246795300632673, + "learning_rate": 1.4333033774761452e-05, + "loss": 1.4877, + "step": 12895 + }, + { + "epoch": 0.4217891708082658, + "grad_norm": 3.0711463026587458, + "learning_rate": 1.432788982788059e-05, + "loss": 1.3374, + "step": 12900 + }, + { + "epoch": 0.42195265498299767, + "grad_norm": 3.1021598725677313, + "learning_rate": 1.4322744471512049e-05, + "loss": 1.5036, + "step": 12905 + }, + { + "epoch": 0.42211613915772955, + "grad_norm": 3.095223741704941, + "learning_rate": 1.4317597707331548e-05, + "loss": 1.4617, + "step": 12910 + }, + { + "epoch": 0.42227962333246144, + "grad_norm": 3.303308233198606, + "learning_rate": 1.4312449537015258e-05, + "loss": 1.4254, + "step": 12915 + }, + { + "epoch": 0.4224431075071933, + "grad_norm": 3.341730033722612, + "learning_rate": 1.4307299962239811e-05, + "loss": 1.398, + "step": 12920 + }, + { + "epoch": 0.4226065916819252, + "grad_norm": 3.0225633826754845, + "learning_rate": 1.4302148984682304e-05, + "loss": 1.5089, + "step": 12925 + }, + { + "epoch": 0.4227700758566571, + "grad_norm": 3.0373326874055118, + "learning_rate": 1.429699660602028e-05, + "loss": 1.4188, + "step": 12930 + }, + { + "epoch": 0.422933560031389, + "grad_norm": 3.144366815114578, + "learning_rate": 1.4291842827931745e-05, + "loss": 1.5175, + "step": 12935 + }, + { + "epoch": 0.4230970442061208, + "grad_norm": 2.878550807133269, + "learning_rate": 1.4286687652095154e-05, + "loss": 1.4637, + "step": 12940 + }, + { + "epoch": 0.4232605283808527, + "grad_norm": 3.1516693770490622, + "learning_rate": 1.4281531080189424e-05, + "loss": 1.4846, + "step": 12945 + }, + { + "epoch": 0.4234240125555846, + "grad_norm": 3.1341323263199246, + "learning_rate": 1.4276373113893924e-05, + "loss": 1.4742, + "step": 12950 + }, + { + "epoch": 0.4235874967303165, + "grad_norm": 3.1855529535116798, + "learning_rate": 1.4271213754888477e-05, + "loss": 1.4513, + "step": 12955 + }, + { + "epoch": 0.42375098090504837, + "grad_norm": 2.9695822990454896, + "learning_rate": 1.426605300485336e-05, + "loss": 1.4014, + "step": 12960 + }, + { + "epoch": 0.42391446507978026, + "grad_norm": 3.4512947388798447, + "learning_rate": 1.4260890865469299e-05, + "loss": 1.5271, + "step": 12965 + }, + { + "epoch": 0.42407794925451214, + "grad_norm": 2.93103015690082, + "learning_rate": 1.4255727338417484e-05, + "loss": 1.5915, + "step": 12970 + }, + { + "epoch": 0.42424143342924403, + "grad_norm": 3.5108355466448926, + "learning_rate": 1.4250562425379546e-05, + "loss": 1.499, + "step": 12975 + }, + { + "epoch": 0.4244049176039759, + "grad_norm": 3.2575645627953573, + "learning_rate": 1.424539612803757e-05, + "loss": 1.4468, + "step": 12980 + }, + { + "epoch": 0.4245684017787078, + "grad_norm": 3.4230989493399826, + "learning_rate": 1.424022844807409e-05, + "loss": 1.625, + "step": 12985 + }, + { + "epoch": 0.4247318859534397, + "grad_norm": 3.127268401520692, + "learning_rate": 1.4235059387172097e-05, + "loss": 1.4527, + "step": 12990 + }, + { + "epoch": 0.4248953701281716, + "grad_norm": 3.160475654436933, + "learning_rate": 1.4229888947015022e-05, + "loss": 1.3586, + "step": 12995 + }, + { + "epoch": 0.42505885430290347, + "grad_norm": 3.1686637469431633, + "learning_rate": 1.4224717129286756e-05, + "loss": 1.4933, + "step": 13000 + }, + { + "epoch": 0.42522233847763535, + "grad_norm": 3.190010569672379, + "learning_rate": 1.4219543935671634e-05, + "loss": 1.5422, + "step": 13005 + }, + { + "epoch": 0.42538582265236724, + "grad_norm": 3.127578675995277, + "learning_rate": 1.4214369367854434e-05, + "loss": 1.4493, + "step": 13010 + }, + { + "epoch": 0.4255493068270991, + "grad_norm": 3.2367151897268767, + "learning_rate": 1.4209193427520388e-05, + "loss": 1.4883, + "step": 13015 + }, + { + "epoch": 0.425712791001831, + "grad_norm": 3.180179153339423, + "learning_rate": 1.4204016116355173e-05, + "loss": 1.6131, + "step": 13020 + }, + { + "epoch": 0.4258762751765629, + "grad_norm": 3.028293259135286, + "learning_rate": 1.4198837436044914e-05, + "loss": 1.4257, + "step": 13025 + }, + { + "epoch": 0.4260397593512948, + "grad_norm": 2.8859807752108293, + "learning_rate": 1.4193657388276176e-05, + "loss": 1.5111, + "step": 13030 + }, + { + "epoch": 0.4262032435260267, + "grad_norm": 2.9942912436149496, + "learning_rate": 1.4188475974735978e-05, + "loss": 1.5665, + "step": 13035 + }, + { + "epoch": 0.42636672770075856, + "grad_norm": 3.144063317767582, + "learning_rate": 1.4183293197111778e-05, + "loss": 1.4263, + "step": 13040 + }, + { + "epoch": 0.42653021187549045, + "grad_norm": 3.00425183352023, + "learning_rate": 1.4178109057091478e-05, + "loss": 1.3771, + "step": 13045 + }, + { + "epoch": 0.42669369605022234, + "grad_norm": 3.192273233993129, + "learning_rate": 1.417292355636343e-05, + "loss": 1.4454, + "step": 13050 + }, + { + "epoch": 0.4268571802249542, + "grad_norm": 2.99057935125776, + "learning_rate": 1.4167736696616418e-05, + "loss": 1.3385, + "step": 13055 + }, + { + "epoch": 0.4270206643996861, + "grad_norm": 3.0817610971044913, + "learning_rate": 1.416254847953968e-05, + "loss": 1.411, + "step": 13060 + }, + { + "epoch": 0.427184148574418, + "grad_norm": 2.9626652156085944, + "learning_rate": 1.4157358906822887e-05, + "loss": 1.5486, + "step": 13065 + }, + { + "epoch": 0.4273476327491499, + "grad_norm": 3.51207360068449, + "learning_rate": 1.415216798015616e-05, + "loss": 1.4163, + "step": 13070 + }, + { + "epoch": 0.4275111169238818, + "grad_norm": 3.1135197548409814, + "learning_rate": 1.4146975701230054e-05, + "loss": 1.4753, + "step": 13075 + }, + { + "epoch": 0.42767460109861366, + "grad_norm": 3.256726028698158, + "learning_rate": 1.414178207173557e-05, + "loss": 1.5569, + "step": 13080 + }, + { + "epoch": 0.42783808527334555, + "grad_norm": 3.0017424926501044, + "learning_rate": 1.4136587093364143e-05, + "loss": 1.3769, + "step": 13085 + }, + { + "epoch": 0.42800156944807743, + "grad_norm": 3.1767110880194367, + "learning_rate": 1.4131390767807651e-05, + "loss": 1.5887, + "step": 13090 + }, + { + "epoch": 0.4281650536228093, + "grad_norm": 3.191748891596535, + "learning_rate": 1.4126193096758408e-05, + "loss": 1.4853, + "step": 13095 + }, + { + "epoch": 0.4283285377975412, + "grad_norm": 3.2382329359499438, + "learning_rate": 1.4120994081909171e-05, + "loss": 1.4648, + "step": 13100 + }, + { + "epoch": 0.4284920219722731, + "grad_norm": 3.3210828257011302, + "learning_rate": 1.4115793724953133e-05, + "loss": 1.4643, + "step": 13105 + }, + { + "epoch": 0.428655506147005, + "grad_norm": 3.267559703693459, + "learning_rate": 1.4110592027583917e-05, + "loss": 1.3844, + "step": 13110 + }, + { + "epoch": 0.42881899032173687, + "grad_norm": 2.9931825832315653, + "learning_rate": 1.4105388991495597e-05, + "loss": 1.4353, + "step": 13115 + }, + { + "epoch": 0.42898247449646876, + "grad_norm": 3.689654167958851, + "learning_rate": 1.4100184618382667e-05, + "loss": 1.437, + "step": 13120 + }, + { + "epoch": 0.42914595867120064, + "grad_norm": 3.1272274696234765, + "learning_rate": 1.4094978909940063e-05, + "loss": 1.3849, + "step": 13125 + }, + { + "epoch": 0.42930944284593253, + "grad_norm": 3.219069157634153, + "learning_rate": 1.4089771867863164e-05, + "loss": 1.5579, + "step": 13130 + }, + { + "epoch": 0.4294729270206644, + "grad_norm": 3.522575774960417, + "learning_rate": 1.408456349384777e-05, + "loss": 1.3104, + "step": 13135 + }, + { + "epoch": 0.4296364111953963, + "grad_norm": 2.7545997278711587, + "learning_rate": 1.4079353789590125e-05, + "loss": 1.3245, + "step": 13140 + }, + { + "epoch": 0.4297998953701282, + "grad_norm": 3.099404411850723, + "learning_rate": 1.4074142756786897e-05, + "loss": 1.37, + "step": 13145 + }, + { + "epoch": 0.4299633795448601, + "grad_norm": 3.1450244402368517, + "learning_rate": 1.4068930397135196e-05, + "loss": 1.4513, + "step": 13150 + }, + { + "epoch": 0.43012686371959197, + "grad_norm": 3.2652322001906975, + "learning_rate": 1.4063716712332558e-05, + "loss": 1.5726, + "step": 13155 + }, + { + "epoch": 0.43029034789432385, + "grad_norm": 3.374726521688662, + "learning_rate": 1.4058501704076953e-05, + "loss": 1.5331, + "step": 13160 + }, + { + "epoch": 0.43045383206905574, + "grad_norm": 3.2838449637124443, + "learning_rate": 1.405328537406678e-05, + "loss": 1.4529, + "step": 13165 + }, + { + "epoch": 0.43061731624378763, + "grad_norm": 3.0183376406910805, + "learning_rate": 1.4048067724000873e-05, + "loss": 1.4764, + "step": 13170 + }, + { + "epoch": 0.43078080041851946, + "grad_norm": 3.1623743845318124, + "learning_rate": 1.4042848755578488e-05, + "loss": 1.5329, + "step": 13175 + }, + { + "epoch": 0.43094428459325135, + "grad_norm": 3.2780699455756834, + "learning_rate": 1.403762847049932e-05, + "loss": 1.5052, + "step": 13180 + }, + { + "epoch": 0.43110776876798323, + "grad_norm": 2.977064622410221, + "learning_rate": 1.4032406870463486e-05, + "loss": 1.4571, + "step": 13185 + }, + { + "epoch": 0.4312712529427151, + "grad_norm": 3.1577741426981083, + "learning_rate": 1.402718395717153e-05, + "loss": 1.4915, + "step": 13190 + }, + { + "epoch": 0.431434737117447, + "grad_norm": 3.0496792024979387, + "learning_rate": 1.4021959732324435e-05, + "loss": 1.5019, + "step": 13195 + }, + { + "epoch": 0.4315982212921789, + "grad_norm": 3.1143242056332276, + "learning_rate": 1.4016734197623594e-05, + "loss": 1.4516, + "step": 13200 + }, + { + "epoch": 0.4317617054669108, + "grad_norm": 3.1735574547680887, + "learning_rate": 1.4011507354770841e-05, + "loss": 1.5978, + "step": 13205 + }, + { + "epoch": 0.43192518964164267, + "grad_norm": 3.0448474205643103, + "learning_rate": 1.4006279205468429e-05, + "loss": 1.4494, + "step": 13210 + }, + { + "epoch": 0.43208867381637456, + "grad_norm": 3.521205919315724, + "learning_rate": 1.4001049751419037e-05, + "loss": 1.5146, + "step": 13215 + }, + { + "epoch": 0.43225215799110644, + "grad_norm": 3.182151172849828, + "learning_rate": 1.3995818994325773e-05, + "loss": 1.5474, + "step": 13220 + }, + { + "epoch": 0.43241564216583833, + "grad_norm": 2.8144376291181055, + "learning_rate": 1.3990586935892165e-05, + "loss": 1.3803, + "step": 13225 + }, + { + "epoch": 0.4325791263405702, + "grad_norm": 2.9823632227530696, + "learning_rate": 1.3985353577822168e-05, + "loss": 1.4187, + "step": 13230 + }, + { + "epoch": 0.4327426105153021, + "grad_norm": 3.0618564668148824, + "learning_rate": 1.3980118921820154e-05, + "loss": 1.395, + "step": 13235 + }, + { + "epoch": 0.432906094690034, + "grad_norm": 3.1681426523309577, + "learning_rate": 1.3974882969590927e-05, + "loss": 1.4929, + "step": 13240 + }, + { + "epoch": 0.4330695788647659, + "grad_norm": 3.3876696319792123, + "learning_rate": 1.3969645722839702e-05, + "loss": 1.4361, + "step": 13245 + }, + { + "epoch": 0.43323306303949777, + "grad_norm": 2.973074863270686, + "learning_rate": 1.3964407183272131e-05, + "loss": 1.4656, + "step": 13250 + }, + { + "epoch": 0.43339654721422965, + "grad_norm": 3.2661158650971784, + "learning_rate": 1.395916735259427e-05, + "loss": 1.4701, + "step": 13255 + }, + { + "epoch": 0.43356003138896154, + "grad_norm": 3.3190404807590665, + "learning_rate": 1.3953926232512609e-05, + "loss": 1.499, + "step": 13260 + }, + { + "epoch": 0.43372351556369343, + "grad_norm": 3.1687956510435864, + "learning_rate": 1.3948683824734049e-05, + "loss": 1.4194, + "step": 13265 + }, + { + "epoch": 0.4338869997384253, + "grad_norm": 2.9700472768848436, + "learning_rate": 1.3943440130965912e-05, + "loss": 1.4346, + "step": 13270 + }, + { + "epoch": 0.4340504839131572, + "grad_norm": 3.11364313434945, + "learning_rate": 1.3938195152915945e-05, + "loss": 1.4681, + "step": 13275 + }, + { + "epoch": 0.4342139680878891, + "grad_norm": 3.287513484932764, + "learning_rate": 1.3932948892292308e-05, + "loss": 1.3288, + "step": 13280 + }, + { + "epoch": 0.434377452262621, + "grad_norm": 3.41945597137863, + "learning_rate": 1.3927701350803579e-05, + "loss": 1.4924, + "step": 13285 + }, + { + "epoch": 0.43454093643735286, + "grad_norm": 3.033253189998458, + "learning_rate": 1.3922452530158755e-05, + "loss": 1.5189, + "step": 13290 + }, + { + "epoch": 0.43470442061208475, + "grad_norm": 3.196264996327462, + "learning_rate": 1.3917202432067242e-05, + "loss": 1.4177, + "step": 13295 + }, + { + "epoch": 0.43486790478681664, + "grad_norm": 3.2463080367655177, + "learning_rate": 1.3911951058238878e-05, + "loss": 1.4951, + "step": 13300 + }, + { + "epoch": 0.4350313889615485, + "grad_norm": 2.9666148518576625, + "learning_rate": 1.3906698410383897e-05, + "loss": 1.5424, + "step": 13305 + }, + { + "epoch": 0.4351948731362804, + "grad_norm": 2.975953681354225, + "learning_rate": 1.3901444490212965e-05, + "loss": 1.4476, + "step": 13310 + }, + { + "epoch": 0.4353583573110123, + "grad_norm": 3.2507615362164124, + "learning_rate": 1.3896189299437152e-05, + "loss": 1.5237, + "step": 13315 + }, + { + "epoch": 0.4355218414857442, + "grad_norm": 3.1640309606182986, + "learning_rate": 1.3890932839767946e-05, + "loss": 1.4823, + "step": 13320 + }, + { + "epoch": 0.4356853256604761, + "grad_norm": 3.315296922625906, + "learning_rate": 1.3885675112917247e-05, + "loss": 1.5435, + "step": 13325 + }, + { + "epoch": 0.43584880983520796, + "grad_norm": 3.348676126095346, + "learning_rate": 1.3880416120597367e-05, + "loss": 1.6243, + "step": 13330 + }, + { + "epoch": 0.43601229400993985, + "grad_norm": 3.047171431066392, + "learning_rate": 1.3875155864521031e-05, + "loss": 1.45, + "step": 13335 + }, + { + "epoch": 0.43617577818467174, + "grad_norm": 3.1304891110292914, + "learning_rate": 1.3869894346401375e-05, + "loss": 1.4123, + "step": 13340 + }, + { + "epoch": 0.4363392623594036, + "grad_norm": 3.1099303592653453, + "learning_rate": 1.386463156795195e-05, + "loss": 1.389, + "step": 13345 + }, + { + "epoch": 0.4365027465341355, + "grad_norm": 3.4178658765346404, + "learning_rate": 1.385936753088671e-05, + "loss": 1.4234, + "step": 13350 + }, + { + "epoch": 0.4366662307088674, + "grad_norm": 3.3522265092563845, + "learning_rate": 1.3854102236920022e-05, + "loss": 1.6109, + "step": 13355 + }, + { + "epoch": 0.4368297148835993, + "grad_norm": 3.00402808742166, + "learning_rate": 1.3848835687766671e-05, + "loss": 1.6535, + "step": 13360 + }, + { + "epoch": 0.43699319905833117, + "grad_norm": 2.879676357912354, + "learning_rate": 1.3843567885141832e-05, + "loss": 1.4102, + "step": 13365 + }, + { + "epoch": 0.43715668323306306, + "grad_norm": 3.2852600179481093, + "learning_rate": 1.383829883076111e-05, + "loss": 1.5806, + "step": 13370 + }, + { + "epoch": 0.43732016740779495, + "grad_norm": 2.994852972371605, + "learning_rate": 1.3833028526340498e-05, + "loss": 1.4629, + "step": 13375 + }, + { + "epoch": 0.43748365158252683, + "grad_norm": 3.2996622573732797, + "learning_rate": 1.3827756973596408e-05, + "loss": 1.437, + "step": 13380 + }, + { + "epoch": 0.4376471357572587, + "grad_norm": 3.099967234408161, + "learning_rate": 1.3822484174245658e-05, + "loss": 1.3578, + "step": 13385 + }, + { + "epoch": 0.4378106199319906, + "grad_norm": 3.657944877356037, + "learning_rate": 1.3817210130005467e-05, + "loss": 1.448, + "step": 13390 + }, + { + "epoch": 0.4379741041067225, + "grad_norm": 3.093923931232771, + "learning_rate": 1.3811934842593467e-05, + "loss": 1.4753, + "step": 13395 + }, + { + "epoch": 0.4381375882814544, + "grad_norm": 3.2558245162942816, + "learning_rate": 1.3806658313727681e-05, + "loss": 1.4619, + "step": 13400 + }, + { + "epoch": 0.4383010724561862, + "grad_norm": 2.964928427021209, + "learning_rate": 1.380138054512655e-05, + "loss": 1.4704, + "step": 13405 + }, + { + "epoch": 0.4384645566309181, + "grad_norm": 3.076261602118684, + "learning_rate": 1.3796101538508915e-05, + "loss": 1.4707, + "step": 13410 + }, + { + "epoch": 0.43862804080565, + "grad_norm": 3.2422001536381666, + "learning_rate": 1.3790821295594018e-05, + "loss": 1.5081, + "step": 13415 + }, + { + "epoch": 0.4387915249803819, + "grad_norm": 3.1216022437520183, + "learning_rate": 1.3785539818101506e-05, + "loss": 1.4741, + "step": 13420 + }, + { + "epoch": 0.43895500915511376, + "grad_norm": 3.1500759804429133, + "learning_rate": 1.3780257107751425e-05, + "loss": 1.4539, + "step": 13425 + }, + { + "epoch": 0.43911849332984565, + "grad_norm": 3.499171376332811, + "learning_rate": 1.3774973166264223e-05, + "loss": 1.4045, + "step": 13430 + }, + { + "epoch": 0.43928197750457754, + "grad_norm": 3.180711951496187, + "learning_rate": 1.376968799536075e-05, + "loss": 1.4452, + "step": 13435 + }, + { + "epoch": 0.4394454616793094, + "grad_norm": 3.0958548977212095, + "learning_rate": 1.3764401596762263e-05, + "loss": 1.4818, + "step": 13440 + }, + { + "epoch": 0.4396089458540413, + "grad_norm": 3.1517481016044724, + "learning_rate": 1.3759113972190407e-05, + "loss": 1.4945, + "step": 13445 + }, + { + "epoch": 0.4397724300287732, + "grad_norm": 2.9239357693612265, + "learning_rate": 1.3753825123367235e-05, + "loss": 1.2532, + "step": 13450 + }, + { + "epoch": 0.4399359142035051, + "grad_norm": 2.9745156831105044, + "learning_rate": 1.374853505201519e-05, + "loss": 1.5088, + "step": 13455 + }, + { + "epoch": 0.440099398378237, + "grad_norm": 3.2340860977221326, + "learning_rate": 1.3743243759857126e-05, + "loss": 1.6625, + "step": 13460 + }, + { + "epoch": 0.44026288255296886, + "grad_norm": 2.9948670191528968, + "learning_rate": 1.3737951248616281e-05, + "loss": 1.2814, + "step": 13465 + }, + { + "epoch": 0.44042636672770075, + "grad_norm": 3.1389418764058137, + "learning_rate": 1.3732657520016298e-05, + "loss": 1.5384, + "step": 13470 + }, + { + "epoch": 0.44058985090243263, + "grad_norm": 3.2954792343527335, + "learning_rate": 1.3727362575781218e-05, + "loss": 1.4694, + "step": 13475 + }, + { + "epoch": 0.4407533350771645, + "grad_norm": 2.9351042314455906, + "learning_rate": 1.3722066417635467e-05, + "loss": 1.5421, + "step": 13480 + }, + { + "epoch": 0.4409168192518964, + "grad_norm": 3.36424994614246, + "learning_rate": 1.3716769047303882e-05, + "loss": 1.4644, + "step": 13485 + }, + { + "epoch": 0.4410803034266283, + "grad_norm": 3.2883506106194917, + "learning_rate": 1.3711470466511685e-05, + "loss": 1.5374, + "step": 13490 + }, + { + "epoch": 0.4412437876013602, + "grad_norm": 3.432471166278424, + "learning_rate": 1.3706170676984489e-05, + "loss": 1.5056, + "step": 13495 + }, + { + "epoch": 0.44140727177609207, + "grad_norm": 3.4041667297087637, + "learning_rate": 1.3700869680448312e-05, + "loss": 1.5311, + "step": 13500 + }, + { + "epoch": 0.44157075595082396, + "grad_norm": 3.2332551650015895, + "learning_rate": 1.3695567478629554e-05, + "loss": 1.5502, + "step": 13505 + }, + { + "epoch": 0.44173424012555584, + "grad_norm": 3.2117357092267453, + "learning_rate": 1.3690264073255012e-05, + "loss": 1.43, + "step": 13510 + }, + { + "epoch": 0.44189772430028773, + "grad_norm": 3.1341889201735826, + "learning_rate": 1.3684959466051881e-05, + "loss": 1.4424, + "step": 13515 + }, + { + "epoch": 0.4420612084750196, + "grad_norm": 3.28052379217036, + "learning_rate": 1.3679653658747739e-05, + "loss": 1.4864, + "step": 13520 + }, + { + "epoch": 0.4422246926497515, + "grad_norm": 3.137764651289005, + "learning_rate": 1.3674346653070554e-05, + "loss": 1.4504, + "step": 13525 + }, + { + "epoch": 0.4423881768244834, + "grad_norm": 2.9484128153233478, + "learning_rate": 1.3669038450748691e-05, + "loss": 1.4733, + "step": 13530 + }, + { + "epoch": 0.4425516609992153, + "grad_norm": 3.5530871291531056, + "learning_rate": 1.3663729053510897e-05, + "loss": 1.5441, + "step": 13535 + }, + { + "epoch": 0.44271514517394717, + "grad_norm": 3.1784121063792554, + "learning_rate": 1.3658418463086318e-05, + "loss": 1.4461, + "step": 13540 + }, + { + "epoch": 0.44287862934867905, + "grad_norm": 3.232333179672832, + "learning_rate": 1.3653106681204482e-05, + "loss": 1.4114, + "step": 13545 + }, + { + "epoch": 0.44304211352341094, + "grad_norm": 3.3014693537496447, + "learning_rate": 1.3647793709595305e-05, + "loss": 1.4334, + "step": 13550 + }, + { + "epoch": 0.44320559769814283, + "grad_norm": 3.424353226857531, + "learning_rate": 1.3642479549989092e-05, + "loss": 1.48, + "step": 13555 + }, + { + "epoch": 0.4433690818728747, + "grad_norm": 3.1525768323043426, + "learning_rate": 1.3637164204116535e-05, + "loss": 1.4146, + "step": 13560 + }, + { + "epoch": 0.4435325660476066, + "grad_norm": 3.282077014267658, + "learning_rate": 1.3631847673708714e-05, + "loss": 1.5111, + "step": 13565 + }, + { + "epoch": 0.4436960502223385, + "grad_norm": 3.021798965510224, + "learning_rate": 1.3626529960497087e-05, + "loss": 1.5483, + "step": 13570 + }, + { + "epoch": 0.4438595343970704, + "grad_norm": 3.079579019613633, + "learning_rate": 1.3621211066213507e-05, + "loss": 1.4686, + "step": 13575 + }, + { + "epoch": 0.44402301857180226, + "grad_norm": 3.1225256406778072, + "learning_rate": 1.3615890992590207e-05, + "loss": 1.4956, + "step": 13580 + }, + { + "epoch": 0.44418650274653415, + "grad_norm": 3.06781291682159, + "learning_rate": 1.36105697413598e-05, + "loss": 1.4978, + "step": 13585 + }, + { + "epoch": 0.44434998692126604, + "grad_norm": 3.2152305296353005, + "learning_rate": 1.3605247314255297e-05, + "loss": 1.3671, + "step": 13590 + }, + { + "epoch": 0.4445134710959979, + "grad_norm": 3.1404325568228186, + "learning_rate": 1.3599923713010075e-05, + "loss": 1.4197, + "step": 13595 + }, + { + "epoch": 0.4446769552707298, + "grad_norm": 3.1425615414258568, + "learning_rate": 1.3594598939357902e-05, + "loss": 1.529, + "step": 13600 + }, + { + "epoch": 0.4448404394454617, + "grad_norm": 3.4536931868362037, + "learning_rate": 1.3589272995032928e-05, + "loss": 1.5345, + "step": 13605 + }, + { + "epoch": 0.4450039236201936, + "grad_norm": 3.172068911575126, + "learning_rate": 1.3583945881769677e-05, + "loss": 1.5082, + "step": 13610 + }, + { + "epoch": 0.4451674077949255, + "grad_norm": 3.2626416440535912, + "learning_rate": 1.3578617601303066e-05, + "loss": 1.5016, + "step": 13615 + }, + { + "epoch": 0.44533089196965736, + "grad_norm": 2.9904858438899713, + "learning_rate": 1.3573288155368382e-05, + "loss": 1.3659, + "step": 13620 + }, + { + "epoch": 0.44549437614438925, + "grad_norm": 2.9755376406378877, + "learning_rate": 1.3567957545701298e-05, + "loss": 1.4465, + "step": 13625 + }, + { + "epoch": 0.44565786031912114, + "grad_norm": 3.135780492353918, + "learning_rate": 1.3562625774037858e-05, + "loss": 1.6101, + "step": 13630 + }, + { + "epoch": 0.445821344493853, + "grad_norm": 2.8959781836461898, + "learning_rate": 1.3557292842114494e-05, + "loss": 1.3906, + "step": 13635 + }, + { + "epoch": 0.44598482866858485, + "grad_norm": 3.122950322000183, + "learning_rate": 1.3551958751668007e-05, + "loss": 1.5687, + "step": 13640 + }, + { + "epoch": 0.44614831284331674, + "grad_norm": 2.9927368146600215, + "learning_rate": 1.354662350443558e-05, + "loss": 1.3718, + "step": 13645 + }, + { + "epoch": 0.44631179701804863, + "grad_norm": 3.2449672130397578, + "learning_rate": 1.3541287102154779e-05, + "loss": 1.3162, + "step": 13650 + }, + { + "epoch": 0.4464752811927805, + "grad_norm": 3.3692334861763293, + "learning_rate": 1.3535949546563531e-05, + "loss": 1.499, + "step": 13655 + }, + { + "epoch": 0.4466387653675124, + "grad_norm": 3.1626480228305898, + "learning_rate": 1.3530610839400153e-05, + "loss": 1.4048, + "step": 13660 + }, + { + "epoch": 0.4468022495422443, + "grad_norm": 3.355106878834281, + "learning_rate": 1.3525270982403327e-05, + "loss": 1.5224, + "step": 13665 + }, + { + "epoch": 0.4469657337169762, + "grad_norm": 3.134221390148123, + "learning_rate": 1.3519929977312117e-05, + "loss": 1.4108, + "step": 13670 + }, + { + "epoch": 0.44712921789170806, + "grad_norm": 3.132688181505283, + "learning_rate": 1.3514587825865957e-05, + "loss": 1.4776, + "step": 13675 + }, + { + "epoch": 0.44729270206643995, + "grad_norm": 27.34696198074906, + "learning_rate": 1.3509244529804651e-05, + "loss": 1.3589, + "step": 13680 + }, + { + "epoch": 0.44745618624117184, + "grad_norm": 3.0425845098357334, + "learning_rate": 1.350390009086838e-05, + "loss": 1.4862, + "step": 13685 + }, + { + "epoch": 0.4476196704159037, + "grad_norm": 3.1619933271456384, + "learning_rate": 1.3498554510797704e-05, + "loss": 1.4844, + "step": 13690 + }, + { + "epoch": 0.4477831545906356, + "grad_norm": 3.404949630040887, + "learning_rate": 1.349320779133354e-05, + "loss": 1.4557, + "step": 13695 + }, + { + "epoch": 0.4479466387653675, + "grad_norm": 3.1605501851817923, + "learning_rate": 1.3487859934217188e-05, + "loss": 1.5055, + "step": 13700 + }, + { + "epoch": 0.4481101229400994, + "grad_norm": 3.2605953508781895, + "learning_rate": 1.3482510941190313e-05, + "loss": 1.5624, + "step": 13705 + }, + { + "epoch": 0.4482736071148313, + "grad_norm": 3.1834490791603094, + "learning_rate": 1.3477160813994946e-05, + "loss": 1.5858, + "step": 13710 + }, + { + "epoch": 0.44843709128956316, + "grad_norm": 3.054232240171047, + "learning_rate": 1.3471809554373498e-05, + "loss": 1.4264, + "step": 13715 + }, + { + "epoch": 0.44860057546429505, + "grad_norm": 3.156792873554481, + "learning_rate": 1.3466457164068743e-05, + "loss": 1.3827, + "step": 13720 + }, + { + "epoch": 0.44876405963902694, + "grad_norm": 3.0920423207901773, + "learning_rate": 1.3461103644823822e-05, + "loss": 1.612, + "step": 13725 + }, + { + "epoch": 0.4489275438137588, + "grad_norm": 3.1043616733159727, + "learning_rate": 1.3455748998382243e-05, + "loss": 1.3795, + "step": 13730 + }, + { + "epoch": 0.4490910279884907, + "grad_norm": 3.257800527540785, + "learning_rate": 1.3450393226487887e-05, + "loss": 1.4126, + "step": 13735 + }, + { + "epoch": 0.4492545121632226, + "grad_norm": 3.2687904887947794, + "learning_rate": 1.3445036330884992e-05, + "loss": 1.4316, + "step": 13740 + }, + { + "epoch": 0.4494179963379545, + "grad_norm": 3.338286305028256, + "learning_rate": 1.3439678313318176e-05, + "loss": 1.3471, + "step": 13745 + }, + { + "epoch": 0.44958148051268637, + "grad_norm": 2.9736154384328835, + "learning_rate": 1.343431917553241e-05, + "loss": 1.5086, + "step": 13750 + }, + { + "epoch": 0.44974496468741826, + "grad_norm": 3.0671649358826403, + "learning_rate": 1.3428958919273031e-05, + "loss": 1.4371, + "step": 13755 + }, + { + "epoch": 0.44990844886215015, + "grad_norm": 2.9735803096404974, + "learning_rate": 1.3423597546285747e-05, + "loss": 1.4197, + "step": 13760 + }, + { + "epoch": 0.45007193303688203, + "grad_norm": 3.0284634019581387, + "learning_rate": 1.3418235058316625e-05, + "loss": 1.3826, + "step": 13765 + }, + { + "epoch": 0.4502354172116139, + "grad_norm": 3.1145743830192303, + "learning_rate": 1.3412871457112095e-05, + "loss": 1.2723, + "step": 13770 + }, + { + "epoch": 0.4503989013863458, + "grad_norm": 3.169920812849271, + "learning_rate": 1.3407506744418949e-05, + "loss": 1.4581, + "step": 13775 + }, + { + "epoch": 0.4505623855610777, + "grad_norm": 3.249947432323536, + "learning_rate": 1.3402140921984348e-05, + "loss": 1.5079, + "step": 13780 + }, + { + "epoch": 0.4507258697358096, + "grad_norm": 2.942087307934963, + "learning_rate": 1.3396773991555802e-05, + "loss": 1.3559, + "step": 13785 + }, + { + "epoch": 0.45088935391054147, + "grad_norm": 3.273405553245092, + "learning_rate": 1.339140595488119e-05, + "loss": 1.5091, + "step": 13790 + }, + { + "epoch": 0.45105283808527336, + "grad_norm": 3.092010108612854, + "learning_rate": 1.3386036813708756e-05, + "loss": 1.3432, + "step": 13795 + }, + { + "epoch": 0.45121632226000524, + "grad_norm": 3.0993145991657522, + "learning_rate": 1.3380666569787092e-05, + "loss": 1.5546, + "step": 13800 + }, + { + "epoch": 0.45137980643473713, + "grad_norm": 3.1127481534620247, + "learning_rate": 1.3375295224865157e-05, + "loss": 1.4553, + "step": 13805 + }, + { + "epoch": 0.451543290609469, + "grad_norm": 3.0170385476277857, + "learning_rate": 1.3369922780692265e-05, + "loss": 1.5136, + "step": 13810 + }, + { + "epoch": 0.4517067747842009, + "grad_norm": 3.154299018556198, + "learning_rate": 1.3364549239018093e-05, + "loss": 1.5397, + "step": 13815 + }, + { + "epoch": 0.4518702589589328, + "grad_norm": 3.285645503738037, + "learning_rate": 1.3359174601592665e-05, + "loss": 1.4183, + "step": 13820 + }, + { + "epoch": 0.4520337431336647, + "grad_norm": 2.8912634878794847, + "learning_rate": 1.3353798870166376e-05, + "loss": 1.3812, + "step": 13825 + }, + { + "epoch": 0.45219722730839657, + "grad_norm": 3.2629940068078738, + "learning_rate": 1.3348422046489968e-05, + "loss": 1.4732, + "step": 13830 + }, + { + "epoch": 0.45236071148312845, + "grad_norm": 3.2700422331040944, + "learning_rate": 1.334304413231454e-05, + "loss": 1.4426, + "step": 13835 + }, + { + "epoch": 0.45252419565786034, + "grad_norm": 3.250838206835798, + "learning_rate": 1.3337665129391545e-05, + "loss": 1.5655, + "step": 13840 + }, + { + "epoch": 0.4526876798325922, + "grad_norm": 3.1358153161770965, + "learning_rate": 1.3332285039472792e-05, + "loss": 1.4616, + "step": 13845 + }, + { + "epoch": 0.4528511640073241, + "grad_norm": 3.207385680445628, + "learning_rate": 1.332690386431045e-05, + "loss": 1.3996, + "step": 13850 + }, + { + "epoch": 0.453014648182056, + "grad_norm": 3.290633270473165, + "learning_rate": 1.3321521605657033e-05, + "loss": 1.4568, + "step": 13855 + }, + { + "epoch": 0.4531781323567879, + "grad_norm": 3.1550330574217362, + "learning_rate": 1.3316138265265408e-05, + "loss": 1.3905, + "step": 13860 + }, + { + "epoch": 0.4533416165315198, + "grad_norm": 3.086875398417331, + "learning_rate": 1.33107538448888e-05, + "loss": 1.4097, + "step": 13865 + }, + { + "epoch": 0.45350510070625166, + "grad_norm": 3.343607208948401, + "learning_rate": 1.3305368346280781e-05, + "loss": 1.4925, + "step": 13870 + }, + { + "epoch": 0.4536685848809835, + "grad_norm": 3.0439080529748272, + "learning_rate": 1.329998177119528e-05, + "loss": 1.3638, + "step": 13875 + }, + { + "epoch": 0.4538320690557154, + "grad_norm": 3.2519535348679702, + "learning_rate": 1.329459412138657e-05, + "loss": 1.5703, + "step": 13880 + }, + { + "epoch": 0.45399555323044727, + "grad_norm": 3.158663469544958, + "learning_rate": 1.3289205398609273e-05, + "loss": 1.4836, + "step": 13885 + }, + { + "epoch": 0.45415903740517916, + "grad_norm": 3.2316563733971697, + "learning_rate": 1.3283815604618366e-05, + "loss": 1.4649, + "step": 13890 + }, + { + "epoch": 0.45432252157991104, + "grad_norm": 3.450050020318453, + "learning_rate": 1.3278424741169178e-05, + "loss": 1.5013, + "step": 13895 + }, + { + "epoch": 0.45448600575464293, + "grad_norm": 3.208747421755566, + "learning_rate": 1.3273032810017374e-05, + "loss": 1.3852, + "step": 13900 + }, + { + "epoch": 0.4546494899293748, + "grad_norm": 2.9546452524985796, + "learning_rate": 1.326763981291898e-05, + "loss": 1.3477, + "step": 13905 + }, + { + "epoch": 0.4548129741041067, + "grad_norm": 3.088806626804542, + "learning_rate": 1.3262245751630359e-05, + "loss": 1.3287, + "step": 13910 + }, + { + "epoch": 0.4549764582788386, + "grad_norm": 3.0842477773259405, + "learning_rate": 1.3256850627908224e-05, + "loss": 1.4108, + "step": 13915 + }, + { + "epoch": 0.4551399424535705, + "grad_norm": 2.814415031747181, + "learning_rate": 1.3251454443509637e-05, + "loss": 1.4186, + "step": 13920 + }, + { + "epoch": 0.45530342662830237, + "grad_norm": 2.9363578677014566, + "learning_rate": 1.3246057200192001e-05, + "loss": 1.4495, + "step": 13925 + }, + { + "epoch": 0.45546691080303425, + "grad_norm": 3.0985163821235875, + "learning_rate": 1.324065889971307e-05, + "loss": 1.5476, + "step": 13930 + }, + { + "epoch": 0.45563039497776614, + "grad_norm": 3.144390633435146, + "learning_rate": 1.3235259543830934e-05, + "loss": 1.4921, + "step": 13935 + }, + { + "epoch": 0.455793879152498, + "grad_norm": 3.0110419622292435, + "learning_rate": 1.3229859134304033e-05, + "loss": 1.3982, + "step": 13940 + }, + { + "epoch": 0.4559573633272299, + "grad_norm": 3.0164698446462253, + "learning_rate": 1.3224457672891145e-05, + "loss": 1.4035, + "step": 13945 + }, + { + "epoch": 0.4561208475019618, + "grad_norm": 3.1921316878450847, + "learning_rate": 1.3219055161351398e-05, + "loss": 1.3464, + "step": 13950 + }, + { + "epoch": 0.4562843316766937, + "grad_norm": 3.1312056352050286, + "learning_rate": 1.3213651601444255e-05, + "loss": 1.5315, + "step": 13955 + }, + { + "epoch": 0.4564478158514256, + "grad_norm": 3.216631688167698, + "learning_rate": 1.3208246994929526e-05, + "loss": 1.4253, + "step": 13960 + }, + { + "epoch": 0.45661130002615746, + "grad_norm": 3.0580121615882554, + "learning_rate": 1.3202841343567353e-05, + "loss": 1.4409, + "step": 13965 + }, + { + "epoch": 0.45677478420088935, + "grad_norm": 3.273697866599447, + "learning_rate": 1.319743464911823e-05, + "loss": 1.5755, + "step": 13970 + }, + { + "epoch": 0.45693826837562124, + "grad_norm": 3.204141886121589, + "learning_rate": 1.3192026913342982e-05, + "loss": 1.4367, + "step": 13975 + }, + { + "epoch": 0.4571017525503531, + "grad_norm": 3.3237263689251773, + "learning_rate": 1.3186618138002778e-05, + "loss": 1.4283, + "step": 13980 + }, + { + "epoch": 0.457265236725085, + "grad_norm": 3.333289188883014, + "learning_rate": 1.3181208324859123e-05, + "loss": 1.5653, + "step": 13985 + }, + { + "epoch": 0.4574287208998169, + "grad_norm": 3.158983334854349, + "learning_rate": 1.3175797475673857e-05, + "loss": 1.379, + "step": 13990 + }, + { + "epoch": 0.4575922050745488, + "grad_norm": 3.2329326429986875, + "learning_rate": 1.3170385592209164e-05, + "loss": 1.5075, + "step": 13995 + }, + { + "epoch": 0.4577556892492807, + "grad_norm": 3.0222158376774155, + "learning_rate": 1.3164972676227563e-05, + "loss": 1.2777, + "step": 14000 + }, + { + "epoch": 0.45791917342401256, + "grad_norm": 3.0354950593433476, + "learning_rate": 1.3159558729491908e-05, + "loss": 1.5614, + "step": 14005 + }, + { + "epoch": 0.45808265759874445, + "grad_norm": 3.0857429907631397, + "learning_rate": 1.3154143753765386e-05, + "loss": 1.4365, + "step": 14010 + }, + { + "epoch": 0.45824614177347633, + "grad_norm": 3.4422492627868264, + "learning_rate": 1.3148727750811525e-05, + "loss": 1.5545, + "step": 14015 + }, + { + "epoch": 0.4584096259482082, + "grad_norm": 2.9802511014306745, + "learning_rate": 1.3143310722394183e-05, + "loss": 1.4417, + "step": 14020 + }, + { + "epoch": 0.4585731101229401, + "grad_norm": 3.0019978631312854, + "learning_rate": 1.3137892670277552e-05, + "loss": 1.3035, + "step": 14025 + }, + { + "epoch": 0.458736594297672, + "grad_norm": 3.5095516248856815, + "learning_rate": 1.3132473596226162e-05, + "loss": 1.6906, + "step": 14030 + }, + { + "epoch": 0.4589000784724039, + "grad_norm": 3.191156912446317, + "learning_rate": 1.3127053502004872e-05, + "loss": 1.5193, + "step": 14035 + }, + { + "epoch": 0.45906356264713577, + "grad_norm": 2.937806658853562, + "learning_rate": 1.3121632389378873e-05, + "loss": 1.3861, + "step": 14040 + }, + { + "epoch": 0.45922704682186766, + "grad_norm": 3.1493759308176257, + "learning_rate": 1.3116210260113686e-05, + "loss": 1.512, + "step": 14045 + }, + { + "epoch": 0.45939053099659954, + "grad_norm": 3.04205013865678, + "learning_rate": 1.3110787115975168e-05, + "loss": 1.4203, + "step": 14050 + }, + { + "epoch": 0.45955401517133143, + "grad_norm": 3.213251029682186, + "learning_rate": 1.3105362958729506e-05, + "loss": 1.4678, + "step": 14055 + }, + { + "epoch": 0.4597174993460633, + "grad_norm": 3.1714545833306924, + "learning_rate": 1.3099937790143214e-05, + "loss": 1.4599, + "step": 14060 + }, + { + "epoch": 0.4598809835207952, + "grad_norm": 3.229785165110759, + "learning_rate": 1.3094511611983136e-05, + "loss": 1.5496, + "step": 14065 + }, + { + "epoch": 0.4600444676955271, + "grad_norm": 3.2000684090417986, + "learning_rate": 1.3089084426016445e-05, + "loss": 1.5538, + "step": 14070 + }, + { + "epoch": 0.460207951870259, + "grad_norm": 3.1716230247399775, + "learning_rate": 1.308365623401064e-05, + "loss": 1.4663, + "step": 14075 + }, + { + "epoch": 0.46037143604499087, + "grad_norm": 2.90898166552822, + "learning_rate": 1.3078227037733554e-05, + "loss": 1.5439, + "step": 14080 + }, + { + "epoch": 0.46053492021972275, + "grad_norm": 3.2580706158939425, + "learning_rate": 1.3072796838953346e-05, + "loss": 1.5079, + "step": 14085 + }, + { + "epoch": 0.46069840439445464, + "grad_norm": 3.2265931789631455, + "learning_rate": 1.3067365639438491e-05, + "loss": 1.6185, + "step": 14090 + }, + { + "epoch": 0.46086188856918653, + "grad_norm": 3.207396404490138, + "learning_rate": 1.3061933440957803e-05, + "loss": 1.4652, + "step": 14095 + }, + { + "epoch": 0.4610253727439184, + "grad_norm": 3.4102622927420114, + "learning_rate": 1.305650024528041e-05, + "loss": 1.4988, + "step": 14100 + }, + { + "epoch": 0.46118885691865025, + "grad_norm": 2.812304308286704, + "learning_rate": 1.305106605417578e-05, + "loss": 1.3036, + "step": 14105 + }, + { + "epoch": 0.46135234109338213, + "grad_norm": 3.1061244618297406, + "learning_rate": 1.3045630869413688e-05, + "loss": 1.5636, + "step": 14110 + }, + { + "epoch": 0.461515825268114, + "grad_norm": 3.2727048660789926, + "learning_rate": 1.3040194692764247e-05, + "loss": 1.4344, + "step": 14115 + }, + { + "epoch": 0.4616793094428459, + "grad_norm": 3.0139720821688307, + "learning_rate": 1.303475752599788e-05, + "loss": 1.3259, + "step": 14120 + }, + { + "epoch": 0.4618427936175778, + "grad_norm": 3.226413734804549, + "learning_rate": 1.302931937088534e-05, + "loss": 1.3836, + "step": 14125 + }, + { + "epoch": 0.4620062777923097, + "grad_norm": 3.0802831269414552, + "learning_rate": 1.3023880229197708e-05, + "loss": 1.3117, + "step": 14130 + }, + { + "epoch": 0.46216976196704157, + "grad_norm": 3.099184752414252, + "learning_rate": 1.3018440102706368e-05, + "loss": 1.3786, + "step": 14135 + }, + { + "epoch": 0.46233324614177346, + "grad_norm": 3.0142661546754215, + "learning_rate": 1.3012998993183048e-05, + "loss": 1.4379, + "step": 14140 + }, + { + "epoch": 0.46249673031650534, + "grad_norm": 3.045919553062025, + "learning_rate": 1.3007556902399776e-05, + "loss": 1.4948, + "step": 14145 + }, + { + "epoch": 0.46266021449123723, + "grad_norm": 3.078656083780217, + "learning_rate": 1.3002113832128907e-05, + "loss": 1.5777, + "step": 14150 + }, + { + "epoch": 0.4628236986659691, + "grad_norm": 3.164154504157611, + "learning_rate": 1.2996669784143122e-05, + "loss": 1.4682, + "step": 14155 + }, + { + "epoch": 0.462987182840701, + "grad_norm": 3.2750125295780936, + "learning_rate": 1.2991224760215407e-05, + "loss": 1.4913, + "step": 14160 + }, + { + "epoch": 0.4631506670154329, + "grad_norm": 3.2942975679159, + "learning_rate": 1.2985778762119079e-05, + "loss": 1.4643, + "step": 14165 + }, + { + "epoch": 0.4633141511901648, + "grad_norm": 3.1967198275215987, + "learning_rate": 1.298033179162776e-05, + "loss": 1.4513, + "step": 14170 + }, + { + "epoch": 0.46347763536489667, + "grad_norm": 3.0079614546070146, + "learning_rate": 1.2974883850515401e-05, + "loss": 1.5009, + "step": 14175 + }, + { + "epoch": 0.46364111953962855, + "grad_norm": 3.2592527998037952, + "learning_rate": 1.2969434940556258e-05, + "loss": 1.3702, + "step": 14180 + }, + { + "epoch": 0.46380460371436044, + "grad_norm": 3.0032825717687226, + "learning_rate": 1.2963985063524914e-05, + "loss": 1.5181, + "step": 14185 + }, + { + "epoch": 0.46396808788909233, + "grad_norm": 3.2544404085986485, + "learning_rate": 1.2958534221196253e-05, + "loss": 1.4252, + "step": 14190 + }, + { + "epoch": 0.4641315720638242, + "grad_norm": 3.312590185707388, + "learning_rate": 1.2953082415345484e-05, + "loss": 1.5378, + "step": 14195 + }, + { + "epoch": 0.4642950562385561, + "grad_norm": 3.2866409979359403, + "learning_rate": 1.2947629647748123e-05, + "loss": 1.6349, + "step": 14200 + }, + { + "epoch": 0.464458540413288, + "grad_norm": 3.1223934657683934, + "learning_rate": 1.2942175920180012e-05, + "loss": 1.5784, + "step": 14205 + }, + { + "epoch": 0.4646220245880199, + "grad_norm": 3.0731803955379107, + "learning_rate": 1.2936721234417286e-05, + "loss": 1.4441, + "step": 14210 + }, + { + "epoch": 0.46478550876275176, + "grad_norm": 3.3454228742366365, + "learning_rate": 1.2931265592236408e-05, + "loss": 1.5194, + "step": 14215 + }, + { + "epoch": 0.46494899293748365, + "grad_norm": 3.262931676720206, + "learning_rate": 1.2925808995414149e-05, + "loss": 1.5695, + "step": 14220 + }, + { + "epoch": 0.46511247711221554, + "grad_norm": 3.1494256547641353, + "learning_rate": 1.2920351445727583e-05, + "loss": 1.4211, + "step": 14225 + }, + { + "epoch": 0.4652759612869474, + "grad_norm": 3.234616671370427, + "learning_rate": 1.2914892944954103e-05, + "loss": 1.3927, + "step": 14230 + }, + { + "epoch": 0.4654394454616793, + "grad_norm": 3.221130740020714, + "learning_rate": 1.290943349487141e-05, + "loss": 1.4087, + "step": 14235 + }, + { + "epoch": 0.4656029296364112, + "grad_norm": 3.1411048062567954, + "learning_rate": 1.2903973097257514e-05, + "loss": 1.5535, + "step": 14240 + }, + { + "epoch": 0.4657664138111431, + "grad_norm": 3.041259927539561, + "learning_rate": 1.289851175389073e-05, + "loss": 1.5843, + "step": 14245 + }, + { + "epoch": 0.465929897985875, + "grad_norm": 3.33583963391561, + "learning_rate": 1.2893049466549683e-05, + "loss": 1.527, + "step": 14250 + }, + { + "epoch": 0.46609338216060686, + "grad_norm": 3.0984415885368746, + "learning_rate": 1.2887586237013307e-05, + "loss": 1.5313, + "step": 14255 + }, + { + "epoch": 0.46625686633533875, + "grad_norm": 3.3148683832235033, + "learning_rate": 1.2882122067060847e-05, + "loss": 1.4478, + "step": 14260 + }, + { + "epoch": 0.46642035051007064, + "grad_norm": 3.2839436565377444, + "learning_rate": 1.2876656958471845e-05, + "loss": 1.5364, + "step": 14265 + }, + { + "epoch": 0.4665838346848025, + "grad_norm": 3.3436284237432017, + "learning_rate": 1.287119091302615e-05, + "loss": 1.4987, + "step": 14270 + }, + { + "epoch": 0.4667473188595344, + "grad_norm": 3.2624541144375465, + "learning_rate": 1.2865723932503924e-05, + "loss": 1.4793, + "step": 14275 + }, + { + "epoch": 0.4669108030342663, + "grad_norm": 3.367734095984866, + "learning_rate": 1.2860256018685626e-05, + "loss": 1.5368, + "step": 14280 + }, + { + "epoch": 0.4670742872089982, + "grad_norm": 2.947963116204321, + "learning_rate": 1.2854787173352024e-05, + "loss": 1.418, + "step": 14285 + }, + { + "epoch": 0.46723777138373007, + "grad_norm": 3.269883083871039, + "learning_rate": 1.2849317398284186e-05, + "loss": 1.5033, + "step": 14290 + }, + { + "epoch": 0.46740125555846196, + "grad_norm": 3.2938073206538, + "learning_rate": 1.2843846695263483e-05, + "loss": 1.4887, + "step": 14295 + }, + { + "epoch": 0.46756473973319385, + "grad_norm": 3.3619395435217316, + "learning_rate": 1.2838375066071586e-05, + "loss": 1.4178, + "step": 14300 + }, + { + "epoch": 0.46772822390792573, + "grad_norm": 3.077105518814583, + "learning_rate": 1.2832902512490475e-05, + "loss": 1.3923, + "step": 14305 + }, + { + "epoch": 0.4678917080826576, + "grad_norm": 3.1343983664244837, + "learning_rate": 1.2827429036302426e-05, + "loss": 1.4791, + "step": 14310 + }, + { + "epoch": 0.4680551922573895, + "grad_norm": 3.1240837522053964, + "learning_rate": 1.2821954639290014e-05, + "loss": 1.489, + "step": 14315 + }, + { + "epoch": 0.4682186764321214, + "grad_norm": 3.078104742873956, + "learning_rate": 1.2816479323236117e-05, + "loss": 1.4168, + "step": 14320 + }, + { + "epoch": 0.4683821606068533, + "grad_norm": 3.274204228505356, + "learning_rate": 1.2811003089923913e-05, + "loss": 1.6219, + "step": 14325 + }, + { + "epoch": 0.46854564478158517, + "grad_norm": 3.076290308854897, + "learning_rate": 1.2805525941136874e-05, + "loss": 1.3451, + "step": 14330 + }, + { + "epoch": 0.46870912895631706, + "grad_norm": 3.318753242314131, + "learning_rate": 1.2800047878658775e-05, + "loss": 1.518, + "step": 14335 + }, + { + "epoch": 0.4688726131310489, + "grad_norm": 3.1111972336220517, + "learning_rate": 1.2794568904273686e-05, + "loss": 1.3751, + "step": 14340 + }, + { + "epoch": 0.4690360973057808, + "grad_norm": 3.245139423617386, + "learning_rate": 1.2789089019765977e-05, + "loss": 1.4404, + "step": 14345 + }, + { + "epoch": 0.46919958148051266, + "grad_norm": 3.22176572818969, + "learning_rate": 1.278360822692031e-05, + "loss": 1.5319, + "step": 14350 + }, + { + "epoch": 0.46936306565524455, + "grad_norm": 2.9263106458815873, + "learning_rate": 1.2778126527521643e-05, + "loss": 1.5166, + "step": 14355 + }, + { + "epoch": 0.46952654982997644, + "grad_norm": 3.238723768398887, + "learning_rate": 1.2772643923355235e-05, + "loss": 1.4944, + "step": 14360 + }, + { + "epoch": 0.4696900340047083, + "grad_norm": 2.826103624282034, + "learning_rate": 1.2767160416206634e-05, + "loss": 1.3973, + "step": 14365 + }, + { + "epoch": 0.4698535181794402, + "grad_norm": 3.0515860574477514, + "learning_rate": 1.2761676007861685e-05, + "loss": 1.5572, + "step": 14370 + }, + { + "epoch": 0.4700170023541721, + "grad_norm": 3.0773075918171795, + "learning_rate": 1.2756190700106523e-05, + "loss": 1.4905, + "step": 14375 + }, + { + "epoch": 0.470180486528904, + "grad_norm": 3.1494547554766545, + "learning_rate": 1.2750704494727581e-05, + "loss": 1.3765, + "step": 14380 + }, + { + "epoch": 0.47034397070363587, + "grad_norm": 3.027367629975798, + "learning_rate": 1.274521739351158e-05, + "loss": 1.3111, + "step": 14385 + }, + { + "epoch": 0.47050745487836776, + "grad_norm": 2.9963382794760616, + "learning_rate": 1.2739729398245537e-05, + "loss": 1.4021, + "step": 14390 + }, + { + "epoch": 0.47067093905309965, + "grad_norm": 3.2322448011235796, + "learning_rate": 1.2734240510716756e-05, + "loss": 1.4545, + "step": 14395 + }, + { + "epoch": 0.47083442322783153, + "grad_norm": 3.0289680489186663, + "learning_rate": 1.272875073271283e-05, + "loss": 1.4355, + "step": 14400 + }, + { + "epoch": 0.4709979074025634, + "grad_norm": 3.240406926898811, + "learning_rate": 1.2723260066021647e-05, + "loss": 1.4212, + "step": 14405 + }, + { + "epoch": 0.4711613915772953, + "grad_norm": 3.4171864358302115, + "learning_rate": 1.2717768512431385e-05, + "loss": 1.5024, + "step": 14410 + }, + { + "epoch": 0.4713248757520272, + "grad_norm": 3.063341752722578, + "learning_rate": 1.271227607373051e-05, + "loss": 1.474, + "step": 14415 + }, + { + "epoch": 0.4714883599267591, + "grad_norm": 3.1988458180365877, + "learning_rate": 1.2706782751707769e-05, + "loss": 1.4127, + "step": 14420 + }, + { + "epoch": 0.47165184410149097, + "grad_norm": 3.238424284809533, + "learning_rate": 1.2701288548152207e-05, + "loss": 1.3617, + "step": 14425 + }, + { + "epoch": 0.47181532827622286, + "grad_norm": 2.9587236298815562, + "learning_rate": 1.2695793464853147e-05, + "loss": 1.4079, + "step": 14430 + }, + { + "epoch": 0.47197881245095474, + "grad_norm": 3.121253817659289, + "learning_rate": 1.2690297503600206e-05, + "loss": 1.4108, + "step": 14435 + }, + { + "epoch": 0.47214229662568663, + "grad_norm": 3.3856401156283744, + "learning_rate": 1.2684800666183286e-05, + "loss": 1.5272, + "step": 14440 + }, + { + "epoch": 0.4723057808004185, + "grad_norm": 3.318921453269878, + "learning_rate": 1.2679302954392567e-05, + "loss": 1.4788, + "step": 14445 + }, + { + "epoch": 0.4724692649751504, + "grad_norm": 3.01554316754963, + "learning_rate": 1.2673804370018523e-05, + "loss": 1.4242, + "step": 14450 + }, + { + "epoch": 0.4726327491498823, + "grad_norm": 3.5808922494009154, + "learning_rate": 1.2668304914851907e-05, + "loss": 1.49, + "step": 14455 + }, + { + "epoch": 0.4727962333246142, + "grad_norm": 2.9471553289193744, + "learning_rate": 1.2662804590683756e-05, + "loss": 1.421, + "step": 14460 + }, + { + "epoch": 0.47295971749934607, + "grad_norm": 3.016166455882111, + "learning_rate": 1.2657303399305394e-05, + "loss": 1.466, + "step": 14465 + }, + { + "epoch": 0.47312320167407795, + "grad_norm": 2.9149010072922565, + "learning_rate": 1.265180134250842e-05, + "loss": 1.4871, + "step": 14470 + }, + { + "epoch": 0.47328668584880984, + "grad_norm": 3.1351145756253547, + "learning_rate": 1.2646298422084721e-05, + "loss": 1.3995, + "step": 14475 + }, + { + "epoch": 0.4734501700235417, + "grad_norm": 2.9775229843337208, + "learning_rate": 1.2640794639826465e-05, + "loss": 1.3828, + "step": 14480 + }, + { + "epoch": 0.4736136541982736, + "grad_norm": 3.0995354528590022, + "learning_rate": 1.26352899975261e-05, + "loss": 1.4275, + "step": 14485 + }, + { + "epoch": 0.4737771383730055, + "grad_norm": 3.3296275244686298, + "learning_rate": 1.2629784496976343e-05, + "loss": 1.4851, + "step": 14490 + }, + { + "epoch": 0.4739406225477374, + "grad_norm": 2.9843261165587482, + "learning_rate": 1.2624278139970216e-05, + "loss": 1.5222, + "step": 14495 + }, + { + "epoch": 0.4741041067224693, + "grad_norm": 2.9179650627751625, + "learning_rate": 1.2618770928300995e-05, + "loss": 1.4672, + "step": 14500 + }, + { + "epoch": 0.47426759089720116, + "grad_norm": 2.9809576403361095, + "learning_rate": 1.2613262863762244e-05, + "loss": 1.3999, + "step": 14505 + }, + { + "epoch": 0.47443107507193305, + "grad_norm": 3.253937692302124, + "learning_rate": 1.2607753948147803e-05, + "loss": 1.476, + "step": 14510 + }, + { + "epoch": 0.47459455924666494, + "grad_norm": 2.941644753241746, + "learning_rate": 1.2602244183251797e-05, + "loss": 1.4279, + "step": 14515 + }, + { + "epoch": 0.4747580434213968, + "grad_norm": 2.9108564669827155, + "learning_rate": 1.259673357086862e-05, + "loss": 1.4523, + "step": 14520 + }, + { + "epoch": 0.4749215275961287, + "grad_norm": 3.2235108431992274, + "learning_rate": 1.2591222112792943e-05, + "loss": 1.4894, + "step": 14525 + }, + { + "epoch": 0.4750850117708606, + "grad_norm": 3.0349274115457896, + "learning_rate": 1.2585709810819709e-05, + "loss": 1.4078, + "step": 14530 + }, + { + "epoch": 0.4752484959455925, + "grad_norm": 3.4490882634809887, + "learning_rate": 1.258019666674414e-05, + "loss": 1.5159, + "step": 14535 + }, + { + "epoch": 0.4754119801203244, + "grad_norm": 3.199831909683398, + "learning_rate": 1.257468268236174e-05, + "loss": 1.537, + "step": 14540 + }, + { + "epoch": 0.47557546429505626, + "grad_norm": 3.1757694171950637, + "learning_rate": 1.2569167859468271e-05, + "loss": 1.5235, + "step": 14545 + }, + { + "epoch": 0.47573894846978815, + "grad_norm": 3.2559259490168757, + "learning_rate": 1.2563652199859777e-05, + "loss": 1.4803, + "step": 14550 + }, + { + "epoch": 0.47590243264452003, + "grad_norm": 3.059969999400934, + "learning_rate": 1.2558135705332577e-05, + "loss": 1.4438, + "step": 14555 + }, + { + "epoch": 0.4760659168192519, + "grad_norm": 3.1115020431556872, + "learning_rate": 1.2552618377683255e-05, + "loss": 1.3964, + "step": 14560 + }, + { + "epoch": 0.4762294009939838, + "grad_norm": 3.0561224076805646, + "learning_rate": 1.2547100218708663e-05, + "loss": 1.3502, + "step": 14565 + }, + { + "epoch": 0.47639288516871564, + "grad_norm": 3.1462337927596047, + "learning_rate": 1.2541581230205944e-05, + "loss": 1.5853, + "step": 14570 + }, + { + "epoch": 0.47655636934344753, + "grad_norm": 3.167584838740197, + "learning_rate": 1.2536061413972486e-05, + "loss": 1.4874, + "step": 14575 + }, + { + "epoch": 0.4767198535181794, + "grad_norm": 3.1251799438006382, + "learning_rate": 1.2530540771805965e-05, + "loss": 1.5378, + "step": 14580 + }, + { + "epoch": 0.4768833376929113, + "grad_norm": 2.9703336392255673, + "learning_rate": 1.2525019305504316e-05, + "loss": 1.4887, + "step": 14585 + }, + { + "epoch": 0.4770468218676432, + "grad_norm": 3.1025595083383677, + "learning_rate": 1.251949701686574e-05, + "loss": 1.4695, + "step": 14590 + }, + { + "epoch": 0.4772103060423751, + "grad_norm": 3.3940340893668663, + "learning_rate": 1.2513973907688721e-05, + "loss": 1.5632, + "step": 14595 + }, + { + "epoch": 0.47737379021710696, + "grad_norm": 3.1745889920963974, + "learning_rate": 1.2508449979771997e-05, + "loss": 1.443, + "step": 14600 + }, + { + "epoch": 0.47753727439183885, + "grad_norm": 3.2027569079863967, + "learning_rate": 1.2502925234914573e-05, + "loss": 1.5137, + "step": 14605 + }, + { + "epoch": 0.47770075856657074, + "grad_norm": 3.201536771251264, + "learning_rate": 1.2497399674915723e-05, + "loss": 1.3901, + "step": 14610 + }, + { + "epoch": 0.4778642427413026, + "grad_norm": 3.0811639752457727, + "learning_rate": 1.2491873301574989e-05, + "loss": 1.4822, + "step": 14615 + }, + { + "epoch": 0.4780277269160345, + "grad_norm": 3.132991449198055, + "learning_rate": 1.2486346116692174e-05, + "loss": 1.4489, + "step": 14620 + }, + { + "epoch": 0.4781912110907664, + "grad_norm": 3.1886774229045476, + "learning_rate": 1.2480818122067348e-05, + "loss": 1.4352, + "step": 14625 + }, + { + "epoch": 0.4783546952654983, + "grad_norm": 3.178610207358636, + "learning_rate": 1.2475289319500844e-05, + "loss": 1.3508, + "step": 14630 + }, + { + "epoch": 0.4785181794402302, + "grad_norm": 3.1664739088333644, + "learning_rate": 1.2469759710793254e-05, + "loss": 1.5866, + "step": 14635 + }, + { + "epoch": 0.47868166361496206, + "grad_norm": 3.197703324710839, + "learning_rate": 1.2464229297745438e-05, + "loss": 1.3876, + "step": 14640 + }, + { + "epoch": 0.47884514778969395, + "grad_norm": 3.0659051174914307, + "learning_rate": 1.2458698082158517e-05, + "loss": 1.5834, + "step": 14645 + }, + { + "epoch": 0.47900863196442584, + "grad_norm": 3.3226885766699934, + "learning_rate": 1.2453166065833872e-05, + "loss": 1.4964, + "step": 14650 + }, + { + "epoch": 0.4791721161391577, + "grad_norm": 3.3025410517253793, + "learning_rate": 1.2447633250573145e-05, + "loss": 1.5535, + "step": 14655 + }, + { + "epoch": 0.4793356003138896, + "grad_norm": 2.847101175892745, + "learning_rate": 1.2442099638178237e-05, + "loss": 1.3581, + "step": 14660 + }, + { + "epoch": 0.4794990844886215, + "grad_norm": 3.316994374136044, + "learning_rate": 1.2436565230451312e-05, + "loss": 1.4522, + "step": 14665 + }, + { + "epoch": 0.4796625686633534, + "grad_norm": 2.9483876982449146, + "learning_rate": 1.243103002919479e-05, + "loss": 1.3791, + "step": 14670 + }, + { + "epoch": 0.47982605283808527, + "grad_norm": 3.037688310786343, + "learning_rate": 1.2425494036211351e-05, + "loss": 1.4768, + "step": 14675 + }, + { + "epoch": 0.47998953701281716, + "grad_norm": 3.5729217225251446, + "learning_rate": 1.2419957253303934e-05, + "loss": 1.4666, + "step": 14680 + }, + { + "epoch": 0.48015302118754905, + "grad_norm": 3.2489121162422747, + "learning_rate": 1.241441968227573e-05, + "loss": 1.4156, + "step": 14685 + }, + { + "epoch": 0.48031650536228093, + "grad_norm": 2.964640724473931, + "learning_rate": 1.240888132493019e-05, + "loss": 1.5819, + "step": 14690 + }, + { + "epoch": 0.4804799895370128, + "grad_norm": 3.123922357000481, + "learning_rate": 1.2403342183071022e-05, + "loss": 1.6333, + "step": 14695 + }, + { + "epoch": 0.4806434737117447, + "grad_norm": 2.8485528193226046, + "learning_rate": 1.2397802258502191e-05, + "loss": 1.3724, + "step": 14700 + }, + { + "epoch": 0.4808069578864766, + "grad_norm": 3.19624519439699, + "learning_rate": 1.2392261553027918e-05, + "loss": 1.3207, + "step": 14705 + }, + { + "epoch": 0.4809704420612085, + "grad_norm": 2.9676840193359917, + "learning_rate": 1.2386720068452667e-05, + "loss": 1.3565, + "step": 14710 + }, + { + "epoch": 0.48113392623594037, + "grad_norm": 3.172660912084661, + "learning_rate": 1.2381177806581164e-05, + "loss": 1.3862, + "step": 14715 + }, + { + "epoch": 0.48129741041067226, + "grad_norm": 3.410914769857234, + "learning_rate": 1.2375634769218394e-05, + "loss": 1.3404, + "step": 14720 + }, + { + "epoch": 0.48146089458540414, + "grad_norm": 3.147640945868806, + "learning_rate": 1.2370090958169585e-05, + "loss": 1.4881, + "step": 14725 + }, + { + "epoch": 0.48162437876013603, + "grad_norm": 3.2247846433577245, + "learning_rate": 1.2364546375240223e-05, + "loss": 1.3947, + "step": 14730 + }, + { + "epoch": 0.4817878629348679, + "grad_norm": 3.069527856649995, + "learning_rate": 1.2359001022236039e-05, + "loss": 1.4247, + "step": 14735 + }, + { + "epoch": 0.4819513471095998, + "grad_norm": 3.170728961558985, + "learning_rate": 1.2353454900963017e-05, + "loss": 1.3858, + "step": 14740 + }, + { + "epoch": 0.4821148312843317, + "grad_norm": 3.349581327757592, + "learning_rate": 1.2347908013227401e-05, + "loss": 1.4684, + "step": 14745 + }, + { + "epoch": 0.4822783154590636, + "grad_norm": 3.025158267093575, + "learning_rate": 1.2342360360835673e-05, + "loss": 1.3857, + "step": 14750 + }, + { + "epoch": 0.48244179963379547, + "grad_norm": 3.3043200714458103, + "learning_rate": 1.2336811945594562e-05, + "loss": 1.4446, + "step": 14755 + }, + { + "epoch": 0.48260528380852735, + "grad_norm": 3.7605723680500414, + "learning_rate": 1.2331262769311057e-05, + "loss": 1.4931, + "step": 14760 + }, + { + "epoch": 0.48276876798325924, + "grad_norm": 3.0684987037583755, + "learning_rate": 1.2325712833792386e-05, + "loss": 1.4713, + "step": 14765 + }, + { + "epoch": 0.4829322521579911, + "grad_norm": 3.0251157141999725, + "learning_rate": 1.2320162140846026e-05, + "loss": 1.4679, + "step": 14770 + }, + { + "epoch": 0.483095736332723, + "grad_norm": 3.096430783451541, + "learning_rate": 1.2314610692279707e-05, + "loss": 1.3915, + "step": 14775 + }, + { + "epoch": 0.4832592205074549, + "grad_norm": 2.828345560733586, + "learning_rate": 1.2309058489901396e-05, + "loss": 1.4447, + "step": 14780 + }, + { + "epoch": 0.4834227046821868, + "grad_norm": 3.0966516284993024, + "learning_rate": 1.2303505535519307e-05, + "loss": 1.4296, + "step": 14785 + }, + { + "epoch": 0.4835861888569187, + "grad_norm": 2.9615173968620967, + "learning_rate": 1.2297951830941906e-05, + "loss": 1.4707, + "step": 14790 + }, + { + "epoch": 0.48374967303165056, + "grad_norm": 3.2509668727736627, + "learning_rate": 1.2292397377977891e-05, + "loss": 1.4682, + "step": 14795 + }, + { + "epoch": 0.48391315720638245, + "grad_norm": 3.2549047361091206, + "learning_rate": 1.2286842178436222e-05, + "loss": 1.3738, + "step": 14800 + }, + { + "epoch": 0.4840766413811143, + "grad_norm": 2.9807091215608814, + "learning_rate": 1.2281286234126087e-05, + "loss": 1.3752, + "step": 14805 + }, + { + "epoch": 0.48424012555584617, + "grad_norm": 3.2395056633619426, + "learning_rate": 1.2275729546856918e-05, + "loss": 1.4743, + "step": 14810 + }, + { + "epoch": 0.48440360973057806, + "grad_norm": 3.0549034342103605, + "learning_rate": 1.2270172118438391e-05, + "loss": 1.6032, + "step": 14815 + }, + { + "epoch": 0.48456709390530994, + "grad_norm": 3.2035429596705667, + "learning_rate": 1.2264613950680429e-05, + "loss": 1.3698, + "step": 14820 + }, + { + "epoch": 0.48473057808004183, + "grad_norm": 3.2689303184815266, + "learning_rate": 1.225905504539319e-05, + "loss": 1.4611, + "step": 14825 + }, + { + "epoch": 0.4848940622547737, + "grad_norm": 3.1674615968962994, + "learning_rate": 1.225349540438707e-05, + "loss": 1.5059, + "step": 14830 + }, + { + "epoch": 0.4850575464295056, + "grad_norm": 3.322857062616097, + "learning_rate": 1.2247935029472712e-05, + "loss": 1.593, + "step": 14835 + }, + { + "epoch": 0.4852210306042375, + "grad_norm": 3.2935434077909727, + "learning_rate": 1.2242373922460993e-05, + "loss": 1.5539, + "step": 14840 + }, + { + "epoch": 0.4853845147789694, + "grad_norm": 3.1353117252216953, + "learning_rate": 1.2236812085163024e-05, + "loss": 1.4521, + "step": 14845 + }, + { + "epoch": 0.48554799895370127, + "grad_norm": 3.152668851507543, + "learning_rate": 1.2231249519390164e-05, + "loss": 1.4815, + "step": 14850 + }, + { + "epoch": 0.48571148312843315, + "grad_norm": 3.3710619745938515, + "learning_rate": 1.2225686226954008e-05, + "loss": 1.4853, + "step": 14855 + }, + { + "epoch": 0.48587496730316504, + "grad_norm": 3.0443827999166575, + "learning_rate": 1.2220122209666379e-05, + "loss": 1.4046, + "step": 14860 + }, + { + "epoch": 0.4860384514778969, + "grad_norm": 3.132849141202639, + "learning_rate": 1.221455746933934e-05, + "loss": 1.5145, + "step": 14865 + }, + { + "epoch": 0.4862019356526288, + "grad_norm": 3.1098174475283678, + "learning_rate": 1.2208992007785193e-05, + "loss": 1.4332, + "step": 14870 + }, + { + "epoch": 0.4863654198273607, + "grad_norm": 3.3480133374132923, + "learning_rate": 1.220342582681647e-05, + "loss": 1.4975, + "step": 14875 + }, + { + "epoch": 0.4865289040020926, + "grad_norm": 3.2108426407764603, + "learning_rate": 1.2197858928245945e-05, + "loss": 1.5366, + "step": 14880 + }, + { + "epoch": 0.4866923881768245, + "grad_norm": 3.1768603763310486, + "learning_rate": 1.2192291313886618e-05, + "loss": 1.5106, + "step": 14885 + }, + { + "epoch": 0.48685587235155636, + "grad_norm": 3.2236444722107698, + "learning_rate": 1.218672298555172e-05, + "loss": 1.5367, + "step": 14890 + }, + { + "epoch": 0.48701935652628825, + "grad_norm": 3.121616190359876, + "learning_rate": 1.2181153945054726e-05, + "loss": 1.4414, + "step": 14895 + }, + { + "epoch": 0.48718284070102014, + "grad_norm": 3.1525594501192664, + "learning_rate": 1.2175584194209329e-05, + "loss": 1.4868, + "step": 14900 + }, + { + "epoch": 0.487346324875752, + "grad_norm": 3.090943111955756, + "learning_rate": 1.2170013734829468e-05, + "loss": 1.482, + "step": 14905 + }, + { + "epoch": 0.4875098090504839, + "grad_norm": 3.1786164544060598, + "learning_rate": 1.2164442568729305e-05, + "loss": 1.4438, + "step": 14910 + }, + { + "epoch": 0.4876732932252158, + "grad_norm": 3.322439594346873, + "learning_rate": 1.2158870697723223e-05, + "loss": 1.5451, + "step": 14915 + }, + { + "epoch": 0.4878367773999477, + "grad_norm": 3.1188339399253038, + "learning_rate": 1.215329812362585e-05, + "loss": 1.5776, + "step": 14920 + }, + { + "epoch": 0.4880002615746796, + "grad_norm": 2.7653445490155617, + "learning_rate": 1.2147724848252039e-05, + "loss": 1.3919, + "step": 14925 + }, + { + "epoch": 0.48816374574941146, + "grad_norm": 3.074779592295586, + "learning_rate": 1.2142150873416865e-05, + "loss": 1.4064, + "step": 14930 + }, + { + "epoch": 0.48832722992414335, + "grad_norm": 3.202279756874678, + "learning_rate": 1.2136576200935637e-05, + "loss": 1.4598, + "step": 14935 + }, + { + "epoch": 0.48849071409887523, + "grad_norm": 3.036899062415323, + "learning_rate": 1.213100083262389e-05, + "loss": 1.3686, + "step": 14940 + }, + { + "epoch": 0.4886541982736071, + "grad_norm": 3.239906751442379, + "learning_rate": 1.2125424770297381e-05, + "loss": 1.4756, + "step": 14945 + }, + { + "epoch": 0.488817682448339, + "grad_norm": 3.150865355148098, + "learning_rate": 1.2119848015772102e-05, + "loss": 1.423, + "step": 14950 + }, + { + "epoch": 0.4889811666230709, + "grad_norm": 3.3753759627728406, + "learning_rate": 1.2114270570864263e-05, + "loss": 1.453, + "step": 14955 + }, + { + "epoch": 0.4891446507978028, + "grad_norm": 3.3522146409974645, + "learning_rate": 1.2108692437390302e-05, + "loss": 1.4913, + "step": 14960 + }, + { + "epoch": 0.48930813497253467, + "grad_norm": 3.127797875675738, + "learning_rate": 1.2103113617166877e-05, + "loss": 1.4625, + "step": 14965 + }, + { + "epoch": 0.48947161914726656, + "grad_norm": 3.4434009578168174, + "learning_rate": 1.209753411201088e-05, + "loss": 1.5526, + "step": 14970 + }, + { + "epoch": 0.48963510332199844, + "grad_norm": 3.2693095413450535, + "learning_rate": 1.2091953923739411e-05, + "loss": 1.4288, + "step": 14975 + }, + { + "epoch": 0.48979858749673033, + "grad_norm": 3.2873130450734838, + "learning_rate": 1.2086373054169805e-05, + "loss": 1.4731, + "step": 14980 + }, + { + "epoch": 0.4899620716714622, + "grad_norm": 3.0509284155023892, + "learning_rate": 1.2080791505119614e-05, + "loss": 1.3881, + "step": 14985 + }, + { + "epoch": 0.4901255558461941, + "grad_norm": 3.2143099331667346, + "learning_rate": 1.2075209278406611e-05, + "loss": 1.4471, + "step": 14990 + }, + { + "epoch": 0.490289040020926, + "grad_norm": 3.3079782754470273, + "learning_rate": 1.2069626375848795e-05, + "loss": 1.6643, + "step": 14995 + }, + { + "epoch": 0.4904525241956579, + "grad_norm": 3.067314190765251, + "learning_rate": 1.2064042799264373e-05, + "loss": 1.4176, + "step": 15000 + }, + { + "epoch": 0.49061600837038977, + "grad_norm": 3.145066113010778, + "learning_rate": 1.2058458550471783e-05, + "loss": 1.4622, + "step": 15005 + }, + { + "epoch": 0.49077949254512165, + "grad_norm": 3.164468193333201, + "learning_rate": 1.2052873631289678e-05, + "loss": 1.439, + "step": 15010 + }, + { + "epoch": 0.49094297671985354, + "grad_norm": 3.171837671969503, + "learning_rate": 1.2047288043536934e-05, + "loss": 1.4009, + "step": 15015 + }, + { + "epoch": 0.49110646089458543, + "grad_norm": 3.2442005715771427, + "learning_rate": 1.204170178903263e-05, + "loss": 1.4407, + "step": 15020 + }, + { + "epoch": 0.4912699450693173, + "grad_norm": 3.2114116134880275, + "learning_rate": 1.2036114869596077e-05, + "loss": 1.6943, + "step": 15025 + }, + { + "epoch": 0.4914334292440492, + "grad_norm": 3.108628468484975, + "learning_rate": 1.2030527287046801e-05, + "loss": 1.4677, + "step": 15030 + }, + { + "epoch": 0.4915969134187811, + "grad_norm": 3.280622753740283, + "learning_rate": 1.2024939043204536e-05, + "loss": 1.4829, + "step": 15035 + }, + { + "epoch": 0.4917603975935129, + "grad_norm": 2.950040816258377, + "learning_rate": 1.2019350139889238e-05, + "loss": 1.431, + "step": 15040 + }, + { + "epoch": 0.4919238817682448, + "grad_norm": 3.390150090867348, + "learning_rate": 1.2013760578921077e-05, + "loss": 1.4949, + "step": 15045 + }, + { + "epoch": 0.4920873659429767, + "grad_norm": 3.1994592646171247, + "learning_rate": 1.2008170362120433e-05, + "loss": 1.4981, + "step": 15050 + }, + { + "epoch": 0.4922508501177086, + "grad_norm": 2.979274246738422, + "learning_rate": 1.2002579491307907e-05, + "loss": 1.3792, + "step": 15055 + }, + { + "epoch": 0.49241433429244047, + "grad_norm": 3.261739296768386, + "learning_rate": 1.1996987968304306e-05, + "loss": 1.5381, + "step": 15060 + }, + { + "epoch": 0.49257781846717236, + "grad_norm": 3.1267443922456697, + "learning_rate": 1.1991395794930651e-05, + "loss": 1.3374, + "step": 15065 + }, + { + "epoch": 0.49274130264190424, + "grad_norm": 3.0808689071527713, + "learning_rate": 1.1985802973008178e-05, + "loss": 1.3976, + "step": 15070 + }, + { + "epoch": 0.49290478681663613, + "grad_norm": 3.0173136645290946, + "learning_rate": 1.1980209504358329e-05, + "loss": 1.4518, + "step": 15075 + }, + { + "epoch": 0.493068270991368, + "grad_norm": 3.0038635438835937, + "learning_rate": 1.1974615390802767e-05, + "loss": 1.4029, + "step": 15080 + }, + { + "epoch": 0.4932317551660999, + "grad_norm": 3.0933751602710404, + "learning_rate": 1.1969020634163349e-05, + "loss": 1.4415, + "step": 15085 + }, + { + "epoch": 0.4933952393408318, + "grad_norm": 3.2200718705713594, + "learning_rate": 1.1963425236262154e-05, + "loss": 1.5787, + "step": 15090 + }, + { + "epoch": 0.4935587235155637, + "grad_norm": 3.0123592244963393, + "learning_rate": 1.1957829198921466e-05, + "loss": 1.5057, + "step": 15095 + }, + { + "epoch": 0.49372220769029557, + "grad_norm": 3.3013562171069317, + "learning_rate": 1.1952232523963778e-05, + "loss": 1.5928, + "step": 15100 + }, + { + "epoch": 0.49388569186502745, + "grad_norm": 3.189702732698051, + "learning_rate": 1.1946635213211785e-05, + "loss": 1.5424, + "step": 15105 + }, + { + "epoch": 0.49404917603975934, + "grad_norm": 3.0103193697415085, + "learning_rate": 1.1941037268488403e-05, + "loss": 1.6242, + "step": 15110 + }, + { + "epoch": 0.49421266021449123, + "grad_norm": 3.1701850630146327, + "learning_rate": 1.1935438691616742e-05, + "loss": 1.4359, + "step": 15115 + }, + { + "epoch": 0.4943761443892231, + "grad_norm": 3.05457697538953, + "learning_rate": 1.1929839484420117e-05, + "loss": 1.4997, + "step": 15120 + }, + { + "epoch": 0.494539628563955, + "grad_norm": 3.2102651855864237, + "learning_rate": 1.1924239648722051e-05, + "loss": 1.5077, + "step": 15125 + }, + { + "epoch": 0.4947031127386869, + "grad_norm": 3.0230909299371227, + "learning_rate": 1.191863918634628e-05, + "loss": 1.5468, + "step": 15130 + }, + { + "epoch": 0.4948665969134188, + "grad_norm": 3.761817334826089, + "learning_rate": 1.1913038099116738e-05, + "loss": 1.5901, + "step": 15135 + }, + { + "epoch": 0.49503008108815066, + "grad_norm": 3.139612427975268, + "learning_rate": 1.1907436388857558e-05, + "loss": 1.5627, + "step": 15140 + }, + { + "epoch": 0.49519356526288255, + "grad_norm": 3.166336187087427, + "learning_rate": 1.190183405739308e-05, + "loss": 1.4686, + "step": 15145 + }, + { + "epoch": 0.49535704943761444, + "grad_norm": 3.237862544916369, + "learning_rate": 1.1896231106547847e-05, + "loss": 1.4405, + "step": 15150 + }, + { + "epoch": 0.4955205336123463, + "grad_norm": 3.260042438914589, + "learning_rate": 1.18906275381466e-05, + "loss": 1.4992, + "step": 15155 + }, + { + "epoch": 0.4956840177870782, + "grad_norm": 3.068100809297776, + "learning_rate": 1.188502335401429e-05, + "loss": 1.4111, + "step": 15160 + }, + { + "epoch": 0.4958475019618101, + "grad_norm": 3.0608359213500056, + "learning_rate": 1.1879418555976056e-05, + "loss": 1.5547, + "step": 15165 + }, + { + "epoch": 0.496010986136542, + "grad_norm": 3.269973973182607, + "learning_rate": 1.187381314585725e-05, + "loss": 1.4384, + "step": 15170 + }, + { + "epoch": 0.4961744703112739, + "grad_norm": 3.2130119659223144, + "learning_rate": 1.1868207125483408e-05, + "loss": 1.5184, + "step": 15175 + }, + { + "epoch": 0.49633795448600576, + "grad_norm": 3.0788824248179325, + "learning_rate": 1.1862600496680282e-05, + "loss": 1.4351, + "step": 15180 + }, + { + "epoch": 0.49650143866073765, + "grad_norm": 2.827730211852624, + "learning_rate": 1.1856993261273809e-05, + "loss": 1.3493, + "step": 15185 + }, + { + "epoch": 0.49666492283546954, + "grad_norm": 3.1315207683055686, + "learning_rate": 1.185138542109013e-05, + "loss": 1.4745, + "step": 15190 + }, + { + "epoch": 0.4968284070102014, + "grad_norm": 3.2031351951130893, + "learning_rate": 1.184577697795558e-05, + "loss": 1.5044, + "step": 15195 + }, + { + "epoch": 0.4969918911849333, + "grad_norm": 2.961734662168576, + "learning_rate": 1.1840167933696692e-05, + "loss": 1.486, + "step": 15200 + }, + { + "epoch": 0.4971553753596652, + "grad_norm": 3.148629163817005, + "learning_rate": 1.1834558290140193e-05, + "loss": 1.3845, + "step": 15205 + }, + { + "epoch": 0.4973188595343971, + "grad_norm": 3.2388977971956057, + "learning_rate": 1.1828948049113009e-05, + "loss": 1.5093, + "step": 15210 + }, + { + "epoch": 0.49748234370912897, + "grad_norm": 2.9819334345508155, + "learning_rate": 1.1823337212442259e-05, + "loss": 1.5033, + "step": 15215 + }, + { + "epoch": 0.49764582788386086, + "grad_norm": 3.10542979456646, + "learning_rate": 1.1817725781955252e-05, + "loss": 1.4339, + "step": 15220 + }, + { + "epoch": 0.49780931205859275, + "grad_norm": 3.0653673178300775, + "learning_rate": 1.1812113759479494e-05, + "loss": 1.4997, + "step": 15225 + }, + { + "epoch": 0.49797279623332463, + "grad_norm": 3.1072816584091143, + "learning_rate": 1.180650114684268e-05, + "loss": 1.5228, + "step": 15230 + }, + { + "epoch": 0.4981362804080565, + "grad_norm": 3.195250174382567, + "learning_rate": 1.1800887945872707e-05, + "loss": 1.3497, + "step": 15235 + }, + { + "epoch": 0.4982997645827884, + "grad_norm": 3.5699868292603782, + "learning_rate": 1.1795274158397654e-05, + "loss": 1.5419, + "step": 15240 + }, + { + "epoch": 0.4984632487575203, + "grad_norm": 3.330970944151455, + "learning_rate": 1.1789659786245795e-05, + "loss": 1.5762, + "step": 15245 + }, + { + "epoch": 0.4986267329322522, + "grad_norm": 3.0256454558345864, + "learning_rate": 1.1784044831245591e-05, + "loss": 1.3266, + "step": 15250 + }, + { + "epoch": 0.49879021710698407, + "grad_norm": 3.2402155693136008, + "learning_rate": 1.1778429295225693e-05, + "loss": 1.4697, + "step": 15255 + }, + { + "epoch": 0.49895370128171596, + "grad_norm": 3.11522253416988, + "learning_rate": 1.1772813180014953e-05, + "loss": 1.3495, + "step": 15260 + }, + { + "epoch": 0.49911718545644784, + "grad_norm": 2.9190465352397106, + "learning_rate": 1.1767196487442395e-05, + "loss": 1.3702, + "step": 15265 + }, + { + "epoch": 0.4992806696311797, + "grad_norm": 3.1132350005315472, + "learning_rate": 1.1761579219337239e-05, + "loss": 1.6231, + "step": 15270 + }, + { + "epoch": 0.49944415380591156, + "grad_norm": 3.0783400531035303, + "learning_rate": 1.175596137752889e-05, + "loss": 1.365, + "step": 15275 + }, + { + "epoch": 0.49960763798064345, + "grad_norm": 3.076520427059713, + "learning_rate": 1.1750342963846946e-05, + "loss": 1.5148, + "step": 15280 + }, + { + "epoch": 0.49977112215537534, + "grad_norm": 2.8391407347702224, + "learning_rate": 1.1744723980121182e-05, + "loss": 1.5067, + "step": 15285 + }, + { + "epoch": 0.4999346063301072, + "grad_norm": 3.148847054258218, + "learning_rate": 1.1739104428181567e-05, + "loss": 1.3562, + "step": 15290 + }, + { + "epoch": 0.5000980905048391, + "grad_norm": 3.0860336247073232, + "learning_rate": 1.173348430985825e-05, + "loss": 1.468, + "step": 15295 + }, + { + "epoch": 0.500261574679571, + "grad_norm": 3.166743782643589, + "learning_rate": 1.1727863626981564e-05, + "loss": 1.4528, + "step": 15300 + }, + { + "epoch": 0.5004250588543029, + "grad_norm": 3.0727222092333037, + "learning_rate": 1.1722242381382032e-05, + "loss": 1.5118, + "step": 15305 + }, + { + "epoch": 0.5005885430290348, + "grad_norm": 3.1931452126503066, + "learning_rate": 1.1716620574890349e-05, + "loss": 1.307, + "step": 15310 + }, + { + "epoch": 0.5007520272037667, + "grad_norm": 3.123071952363037, + "learning_rate": 1.171099820933741e-05, + "loss": 1.5919, + "step": 15315 + }, + { + "epoch": 0.5009155113784985, + "grad_norm": 3.137931715695395, + "learning_rate": 1.1705375286554273e-05, + "loss": 1.5362, + "step": 15320 + }, + { + "epoch": 0.5010789955532304, + "grad_norm": 3.1910053533696225, + "learning_rate": 1.169975180837219e-05, + "loss": 1.4097, + "step": 15325 + }, + { + "epoch": 0.5012424797279623, + "grad_norm": 3.024077962566744, + "learning_rate": 1.1694127776622586e-05, + "loss": 1.4619, + "step": 15330 + }, + { + "epoch": 0.5014059639026942, + "grad_norm": 2.9425752420633, + "learning_rate": 1.1688503193137075e-05, + "loss": 1.392, + "step": 15335 + }, + { + "epoch": 0.5015694480774261, + "grad_norm": 3.210872993818286, + "learning_rate": 1.1682878059747445e-05, + "loss": 1.457, + "step": 15340 + }, + { + "epoch": 0.501732932252158, + "grad_norm": 3.193024133637559, + "learning_rate": 1.1677252378285664e-05, + "loss": 1.3425, + "step": 15345 + }, + { + "epoch": 0.5018964164268899, + "grad_norm": 3.295357366117978, + "learning_rate": 1.1671626150583877e-05, + "loss": 1.5339, + "step": 15350 + }, + { + "epoch": 0.5020599006016218, + "grad_norm": 3.2412783748413045, + "learning_rate": 1.1665999378474408e-05, + "loss": 1.4622, + "step": 15355 + }, + { + "epoch": 0.5022233847763536, + "grad_norm": 2.9469180915508963, + "learning_rate": 1.1660372063789763e-05, + "loss": 1.334, + "step": 15360 + }, + { + "epoch": 0.5023868689510855, + "grad_norm": 3.552400319570964, + "learning_rate": 1.1654744208362616e-05, + "loss": 1.4878, + "step": 15365 + }, + { + "epoch": 0.5025503531258174, + "grad_norm": 2.9007838040875615, + "learning_rate": 1.1649115814025824e-05, + "loss": 1.3929, + "step": 15370 + }, + { + "epoch": 0.5027138373005493, + "grad_norm": 3.2962376337039987, + "learning_rate": 1.1643486882612418e-05, + "loss": 1.5367, + "step": 15375 + }, + { + "epoch": 0.5028773214752812, + "grad_norm": 3.2373565657051118, + "learning_rate": 1.16378574159556e-05, + "loss": 1.578, + "step": 15380 + }, + { + "epoch": 0.5030408056500131, + "grad_norm": 2.995223505176997, + "learning_rate": 1.1632227415888751e-05, + "loss": 1.3978, + "step": 15385 + }, + { + "epoch": 0.503204289824745, + "grad_norm": 3.0388268309217796, + "learning_rate": 1.1626596884245424e-05, + "loss": 1.4705, + "step": 15390 + }, + { + "epoch": 0.5033677739994769, + "grad_norm": 3.0895597411851417, + "learning_rate": 1.1620965822859347e-05, + "loss": 1.572, + "step": 15395 + }, + { + "epoch": 0.5035312581742087, + "grad_norm": 3.265689045264057, + "learning_rate": 1.1615334233564417e-05, + "loss": 1.567, + "step": 15400 + }, + { + "epoch": 0.5036947423489406, + "grad_norm": 3.367363221788513, + "learning_rate": 1.1609702118194705e-05, + "loss": 1.596, + "step": 15405 + }, + { + "epoch": 0.5038582265236725, + "grad_norm": 3.265740786256043, + "learning_rate": 1.160406947858445e-05, + "loss": 1.4937, + "step": 15410 + }, + { + "epoch": 0.5040217106984044, + "grad_norm": 3.0667830705517787, + "learning_rate": 1.1598436316568068e-05, + "loss": 1.5495, + "step": 15415 + }, + { + "epoch": 0.5041851948731363, + "grad_norm": 3.0205757761240384, + "learning_rate": 1.1592802633980145e-05, + "loss": 1.3969, + "step": 15420 + }, + { + "epoch": 0.5043486790478682, + "grad_norm": 3.441836043290901, + "learning_rate": 1.158716843265543e-05, + "loss": 1.5174, + "step": 15425 + }, + { + "epoch": 0.5045121632226001, + "grad_norm": 3.1393536865005975, + "learning_rate": 1.1581533714428841e-05, + "loss": 1.481, + "step": 15430 + }, + { + "epoch": 0.504675647397332, + "grad_norm": 3.1686374815043834, + "learning_rate": 1.157589848113547e-05, + "loss": 1.5078, + "step": 15435 + }, + { + "epoch": 0.5048391315720638, + "grad_norm": 3.3428945745144145, + "learning_rate": 1.1570262734610575e-05, + "loss": 1.4578, + "step": 15440 + }, + { + "epoch": 0.5050026157467957, + "grad_norm": 3.347652543094711, + "learning_rate": 1.1564626476689582e-05, + "loss": 1.4959, + "step": 15445 + }, + { + "epoch": 0.5051660999215276, + "grad_norm": 3.370382627758732, + "learning_rate": 1.1558989709208082e-05, + "loss": 1.4471, + "step": 15450 + }, + { + "epoch": 0.5053295840962595, + "grad_norm": 3.2307819179786312, + "learning_rate": 1.155335243400183e-05, + "loss": 1.5469, + "step": 15455 + }, + { + "epoch": 0.5054930682709914, + "grad_norm": 3.047644873964802, + "learning_rate": 1.1547714652906746e-05, + "loss": 1.4819, + "step": 15460 + }, + { + "epoch": 0.5056565524457233, + "grad_norm": 3.4680190926429773, + "learning_rate": 1.1542076367758922e-05, + "loss": 1.5966, + "step": 15465 + }, + { + "epoch": 0.5058200366204552, + "grad_norm": 3.243815652910945, + "learning_rate": 1.153643758039461e-05, + "loss": 1.4392, + "step": 15470 + }, + { + "epoch": 0.505983520795187, + "grad_norm": 3.2818620192160313, + "learning_rate": 1.1530798292650223e-05, + "loss": 1.3215, + "step": 15475 + }, + { + "epoch": 0.5061470049699189, + "grad_norm": 7.341321714171626, + "learning_rate": 1.1525158506362338e-05, + "loss": 1.372, + "step": 15480 + }, + { + "epoch": 0.5063104891446508, + "grad_norm": 3.090724969270655, + "learning_rate": 1.1519518223367694e-05, + "loss": 1.4195, + "step": 15485 + }, + { + "epoch": 0.5064739733193827, + "grad_norm": 3.2823545319322314, + "learning_rate": 1.1513877445503193e-05, + "loss": 1.4854, + "step": 15490 + }, + { + "epoch": 0.5066374574941146, + "grad_norm": 2.975982908148913, + "learning_rate": 1.1508236174605902e-05, + "loss": 1.4085, + "step": 15495 + }, + { + "epoch": 0.5068009416688465, + "grad_norm": 2.882335713390506, + "learning_rate": 1.1502594412513042e-05, + "loss": 1.3747, + "step": 15500 + }, + { + "epoch": 0.5069644258435784, + "grad_norm": 2.944289312432973, + "learning_rate": 1.1496952161061998e-05, + "loss": 1.4959, + "step": 15505 + }, + { + "epoch": 0.5071279100183103, + "grad_norm": 3.3288941163780126, + "learning_rate": 1.1491309422090313e-05, + "loss": 1.3866, + "step": 15510 + }, + { + "epoch": 0.5072913941930421, + "grad_norm": 3.093717611255861, + "learning_rate": 1.1485666197435685e-05, + "loss": 1.3911, + "step": 15515 + }, + { + "epoch": 0.507454878367774, + "grad_norm": 3.0163837288582607, + "learning_rate": 1.1480022488935978e-05, + "loss": 1.5376, + "step": 15520 + }, + { + "epoch": 0.5076183625425059, + "grad_norm": 3.0679769952863327, + "learning_rate": 1.1474378298429212e-05, + "loss": 1.4083, + "step": 15525 + }, + { + "epoch": 0.5077818467172378, + "grad_norm": 3.2476044661215298, + "learning_rate": 1.1468733627753557e-05, + "loss": 1.4685, + "step": 15530 + }, + { + "epoch": 0.5079453308919697, + "grad_norm": 2.8619635582120164, + "learning_rate": 1.1463088478747343e-05, + "loss": 1.4495, + "step": 15535 + }, + { + "epoch": 0.5081088150667016, + "grad_norm": 3.1518642232994227, + "learning_rate": 1.145744285324906e-05, + "loss": 1.4359, + "step": 15540 + }, + { + "epoch": 0.5082722992414335, + "grad_norm": 2.909146330184591, + "learning_rate": 1.1451796753097348e-05, + "loss": 1.3527, + "step": 15545 + }, + { + "epoch": 0.5084357834161654, + "grad_norm": 2.8655115195209464, + "learning_rate": 1.1446150180131006e-05, + "loss": 1.4126, + "step": 15550 + }, + { + "epoch": 0.5085992675908972, + "grad_norm": 3.2139045830104567, + "learning_rate": 1.1440503136188983e-05, + "loss": 1.501, + "step": 15555 + }, + { + "epoch": 0.5087627517656291, + "grad_norm": 3.2973772465497095, + "learning_rate": 1.1434855623110382e-05, + "loss": 1.4198, + "step": 15560 + }, + { + "epoch": 0.508926235940361, + "grad_norm": 3.1608441850802493, + "learning_rate": 1.1429207642734457e-05, + "loss": 1.5075, + "step": 15565 + }, + { + "epoch": 0.5090897201150929, + "grad_norm": 2.862629783121277, + "learning_rate": 1.1423559196900621e-05, + "loss": 1.3154, + "step": 15570 + }, + { + "epoch": 0.5092532042898248, + "grad_norm": 3.031433862758664, + "learning_rate": 1.1417910287448437e-05, + "loss": 1.3621, + "step": 15575 + }, + { + "epoch": 0.5094166884645567, + "grad_norm": 3.192886072619931, + "learning_rate": 1.1412260916217612e-05, + "loss": 1.5569, + "step": 15580 + }, + { + "epoch": 0.5095801726392886, + "grad_norm": 3.116309002239758, + "learning_rate": 1.1406611085048007e-05, + "loss": 1.3895, + "step": 15585 + }, + { + "epoch": 0.5097436568140205, + "grad_norm": 3.261560133316038, + "learning_rate": 1.1400960795779634e-05, + "loss": 1.49, + "step": 15590 + }, + { + "epoch": 0.5099071409887523, + "grad_norm": 3.093326980926363, + "learning_rate": 1.1395310050252659e-05, + "loss": 1.3534, + "step": 15595 + }, + { + "epoch": 0.5100706251634842, + "grad_norm": 3.129539987107065, + "learning_rate": 1.1389658850307386e-05, + "loss": 1.5309, + "step": 15600 + }, + { + "epoch": 0.5102341093382161, + "grad_norm": 3.240724176565147, + "learning_rate": 1.1384007197784272e-05, + "loss": 1.4863, + "step": 15605 + }, + { + "epoch": 0.510397593512948, + "grad_norm": 3.058920465429283, + "learning_rate": 1.1378355094523925e-05, + "loss": 1.4554, + "step": 15610 + }, + { + "epoch": 0.5105610776876799, + "grad_norm": 3.213152063169059, + "learning_rate": 1.1372702542367096e-05, + "loss": 1.4604, + "step": 15615 + }, + { + "epoch": 0.5107245618624117, + "grad_norm": 3.21269789376237, + "learning_rate": 1.1367049543154677e-05, + "loss": 1.3968, + "step": 15620 + }, + { + "epoch": 0.5108880460371436, + "grad_norm": 3.1411359322141683, + "learning_rate": 1.1361396098727721e-05, + "loss": 1.3342, + "step": 15625 + }, + { + "epoch": 0.5110515302118754, + "grad_norm": 3.7495549051796866, + "learning_rate": 1.135574221092741e-05, + "loss": 1.4271, + "step": 15630 + }, + { + "epoch": 0.5112150143866073, + "grad_norm": 3.244778349535858, + "learning_rate": 1.1350087881595083e-05, + "loss": 1.4783, + "step": 15635 + }, + { + "epoch": 0.5113784985613392, + "grad_norm": 3.331870978608927, + "learning_rate": 1.1344433112572205e-05, + "loss": 1.6499, + "step": 15640 + }, + { + "epoch": 0.5115419827360711, + "grad_norm": 3.0102105850350074, + "learning_rate": 1.1338777905700402e-05, + "loss": 1.4325, + "step": 15645 + }, + { + "epoch": 0.511705466910803, + "grad_norm": 3.1374512356323256, + "learning_rate": 1.133312226282144e-05, + "loss": 1.4685, + "step": 15650 + }, + { + "epoch": 0.5118689510855349, + "grad_norm": 3.032801845847243, + "learning_rate": 1.132746618577722e-05, + "loss": 1.4544, + "step": 15655 + }, + { + "epoch": 0.5120324352602668, + "grad_norm": 3.0209334925206353, + "learning_rate": 1.1321809676409787e-05, + "loss": 1.4335, + "step": 15660 + }, + { + "epoch": 0.5121959194349986, + "grad_norm": 3.0345119012198047, + "learning_rate": 1.1316152736561329e-05, + "loss": 1.5, + "step": 15665 + }, + { + "epoch": 0.5123594036097305, + "grad_norm": 3.084097139067053, + "learning_rate": 1.131049536807417e-05, + "loss": 1.56, + "step": 15670 + }, + { + "epoch": 0.5125228877844624, + "grad_norm": 3.2109782077722433, + "learning_rate": 1.130483757279078e-05, + "loss": 1.4982, + "step": 15675 + }, + { + "epoch": 0.5126863719591943, + "grad_norm": 3.0427928110981686, + "learning_rate": 1.1299179352553762e-05, + "loss": 1.4068, + "step": 15680 + }, + { + "epoch": 0.5128498561339262, + "grad_norm": 3.190461471655785, + "learning_rate": 1.1293520709205863e-05, + "loss": 1.4859, + "step": 15685 + }, + { + "epoch": 0.5130133403086581, + "grad_norm": 3.2624930909738725, + "learning_rate": 1.128786164458996e-05, + "loss": 1.4384, + "step": 15690 + }, + { + "epoch": 0.51317682448339, + "grad_norm": 3.123001924864268, + "learning_rate": 1.128220216054907e-05, + "loss": 1.3937, + "step": 15695 + }, + { + "epoch": 0.5133403086581219, + "grad_norm": 3.2118100691919635, + "learning_rate": 1.1276542258926355e-05, + "loss": 1.4799, + "step": 15700 + }, + { + "epoch": 0.5135037928328537, + "grad_norm": 3.26266273486375, + "learning_rate": 1.1270881941565104e-05, + "loss": 1.4592, + "step": 15705 + }, + { + "epoch": 0.5136672770075856, + "grad_norm": 3.1834900754858686, + "learning_rate": 1.126522121030874e-05, + "loss": 1.6375, + "step": 15710 + }, + { + "epoch": 0.5138307611823175, + "grad_norm": 2.825537959134732, + "learning_rate": 1.125956006700083e-05, + "loss": 1.349, + "step": 15715 + }, + { + "epoch": 0.5139942453570494, + "grad_norm": 3.125302617984938, + "learning_rate": 1.1253898513485064e-05, + "loss": 1.4821, + "step": 15720 + }, + { + "epoch": 0.5141577295317813, + "grad_norm": 2.963637929307099, + "learning_rate": 1.1248236551605276e-05, + "loss": 1.3098, + "step": 15725 + }, + { + "epoch": 0.5143212137065132, + "grad_norm": 3.1034788913803615, + "learning_rate": 1.1242574183205427e-05, + "loss": 1.4163, + "step": 15730 + }, + { + "epoch": 0.5144846978812451, + "grad_norm": 3.0741446958271954, + "learning_rate": 1.1236911410129613e-05, + "loss": 1.4819, + "step": 15735 + }, + { + "epoch": 0.514648182055977, + "grad_norm": 3.430794155271823, + "learning_rate": 1.1231248234222053e-05, + "loss": 1.3982, + "step": 15740 + }, + { + "epoch": 0.5148116662307088, + "grad_norm": 3.3349971721525007, + "learning_rate": 1.1225584657327116e-05, + "loss": 1.3992, + "step": 15745 + }, + { + "epoch": 0.5149751504054407, + "grad_norm": 3.1862599467193773, + "learning_rate": 1.121992068128928e-05, + "loss": 1.6481, + "step": 15750 + }, + { + "epoch": 0.5151386345801726, + "grad_norm": 3.1045279732682145, + "learning_rate": 1.1214256307953172e-05, + "loss": 1.4846, + "step": 15755 + }, + { + "epoch": 0.5153021187549045, + "grad_norm": 3.0417411773718506, + "learning_rate": 1.1208591539163532e-05, + "loss": 1.5331, + "step": 15760 + }, + { + "epoch": 0.5154656029296364, + "grad_norm": 3.115935581840869, + "learning_rate": 1.1202926376765239e-05, + "loss": 1.4378, + "step": 15765 + }, + { + "epoch": 0.5156290871043683, + "grad_norm": 3.7596693895376525, + "learning_rate": 1.1197260822603298e-05, + "loss": 1.5437, + "step": 15770 + }, + { + "epoch": 0.5157925712791002, + "grad_norm": 3.0770723973017815, + "learning_rate": 1.1191594878522842e-05, + "loss": 1.4458, + "step": 15775 + }, + { + "epoch": 0.515956055453832, + "grad_norm": 3.1094672915573236, + "learning_rate": 1.1185928546369132e-05, + "loss": 1.4548, + "step": 15780 + }, + { + "epoch": 0.5161195396285639, + "grad_norm": 3.2127384950876716, + "learning_rate": 1.1180261827987549e-05, + "loss": 1.4988, + "step": 15785 + }, + { + "epoch": 0.5162830238032958, + "grad_norm": 3.22228219969917, + "learning_rate": 1.1174594725223606e-05, + "loss": 1.4161, + "step": 15790 + }, + { + "epoch": 0.5164465079780277, + "grad_norm": 3.213119606938237, + "learning_rate": 1.1168927239922939e-05, + "loss": 1.5061, + "step": 15795 + }, + { + "epoch": 0.5166099921527596, + "grad_norm": 3.097639180878405, + "learning_rate": 1.1163259373931312e-05, + "loss": 1.3898, + "step": 15800 + }, + { + "epoch": 0.5167734763274915, + "grad_norm": 3.3497759447733584, + "learning_rate": 1.115759112909461e-05, + "loss": 1.3789, + "step": 15805 + }, + { + "epoch": 0.5169369605022234, + "grad_norm": 3.196573869195135, + "learning_rate": 1.1151922507258836e-05, + "loss": 1.4918, + "step": 15810 + }, + { + "epoch": 0.5171004446769553, + "grad_norm": 3.2600932620367526, + "learning_rate": 1.1146253510270129e-05, + "loss": 1.5559, + "step": 15815 + }, + { + "epoch": 0.5172639288516871, + "grad_norm": 3.4605293441066207, + "learning_rate": 1.1140584139974735e-05, + "loss": 1.4422, + "step": 15820 + }, + { + "epoch": 0.517427413026419, + "grad_norm": 3.283387457702283, + "learning_rate": 1.113491439821903e-05, + "loss": 1.4743, + "step": 15825 + }, + { + "epoch": 0.5175908972011509, + "grad_norm": 3.141284563760149, + "learning_rate": 1.1129244286849517e-05, + "loss": 1.5061, + "step": 15830 + }, + { + "epoch": 0.5177543813758828, + "grad_norm": 3.430195256452399, + "learning_rate": 1.1123573807712806e-05, + "loss": 1.5494, + "step": 15835 + }, + { + "epoch": 0.5179178655506147, + "grad_norm": 2.928309680673806, + "learning_rate": 1.1117902962655636e-05, + "loss": 1.4436, + "step": 15840 + }, + { + "epoch": 0.5180813497253466, + "grad_norm": 3.3199796583825796, + "learning_rate": 1.1112231753524858e-05, + "loss": 1.4056, + "step": 15845 + }, + { + "epoch": 0.5182448339000785, + "grad_norm": 3.1882614833577763, + "learning_rate": 1.1106560182167451e-05, + "loss": 1.3812, + "step": 15850 + }, + { + "epoch": 0.5184083180748104, + "grad_norm": 3.070424365577585, + "learning_rate": 1.1100888250430503e-05, + "loss": 1.5546, + "step": 15855 + }, + { + "epoch": 0.5185718022495422, + "grad_norm": 3.15588701823154, + "learning_rate": 1.1095215960161227e-05, + "loss": 1.5951, + "step": 15860 + }, + { + "epoch": 0.5187352864242741, + "grad_norm": 2.98224484058664, + "learning_rate": 1.1089543313206948e-05, + "loss": 1.5766, + "step": 15865 + }, + { + "epoch": 0.518898770599006, + "grad_norm": 3.051585179269464, + "learning_rate": 1.1083870311415104e-05, + "loss": 1.3192, + "step": 15870 + }, + { + "epoch": 0.5190622547737379, + "grad_norm": 2.9801780218445573, + "learning_rate": 1.1078196956633257e-05, + "loss": 1.5012, + "step": 15875 + }, + { + "epoch": 0.5192257389484698, + "grad_norm": 2.9040377977941527, + "learning_rate": 1.1072523250709078e-05, + "loss": 1.2223, + "step": 15880 + }, + { + "epoch": 0.5193892231232017, + "grad_norm": 3.2122286442787886, + "learning_rate": 1.1066849195490352e-05, + "loss": 1.5207, + "step": 15885 + }, + { + "epoch": 0.5195527072979336, + "grad_norm": 3.2158504665325363, + "learning_rate": 1.1061174792824987e-05, + "loss": 1.4393, + "step": 15890 + }, + { + "epoch": 0.5197161914726655, + "grad_norm": 3.3622777189393647, + "learning_rate": 1.105550004456099e-05, + "loss": 1.5845, + "step": 15895 + }, + { + "epoch": 0.5198796756473973, + "grad_norm": 3.265995855037312, + "learning_rate": 1.1049824952546486e-05, + "loss": 1.3457, + "step": 15900 + }, + { + "epoch": 0.5200431598221292, + "grad_norm": 3.25068289804545, + "learning_rate": 1.104414951862972e-05, + "loss": 1.5405, + "step": 15905 + }, + { + "epoch": 0.5202066439968611, + "grad_norm": 3.2583433529459707, + "learning_rate": 1.103847374465904e-05, + "loss": 1.4114, + "step": 15910 + }, + { + "epoch": 0.520370128171593, + "grad_norm": 3.1664026685944964, + "learning_rate": 1.1032797632482904e-05, + "loss": 1.4025, + "step": 15915 + }, + { + "epoch": 0.5205336123463249, + "grad_norm": 2.898913099581729, + "learning_rate": 1.1027121183949883e-05, + "loss": 1.3841, + "step": 15920 + }, + { + "epoch": 0.5206970965210568, + "grad_norm": 3.0417556041321343, + "learning_rate": 1.1021444400908656e-05, + "loss": 1.4878, + "step": 15925 + }, + { + "epoch": 0.5208605806957887, + "grad_norm": 3.050712477993021, + "learning_rate": 1.1015767285208018e-05, + "loss": 1.422, + "step": 15930 + }, + { + "epoch": 0.5210240648705206, + "grad_norm": 2.9276120664430256, + "learning_rate": 1.1010089838696862e-05, + "loss": 1.4633, + "step": 15935 + }, + { + "epoch": 0.5211875490452524, + "grad_norm": 3.186855067238765, + "learning_rate": 1.1004412063224195e-05, + "loss": 1.4075, + "step": 15940 + }, + { + "epoch": 0.5213510332199843, + "grad_norm": 3.0412347689790367, + "learning_rate": 1.099873396063913e-05, + "loss": 1.5764, + "step": 15945 + }, + { + "epoch": 0.5215145173947162, + "grad_norm": 3.1778551059658975, + "learning_rate": 1.0993055532790878e-05, + "loss": 1.4544, + "step": 15950 + }, + { + "epoch": 0.5216780015694481, + "grad_norm": 3.1668580220335447, + "learning_rate": 1.0987376781528774e-05, + "loss": 1.4066, + "step": 15955 + }, + { + "epoch": 0.52184148574418, + "grad_norm": 3.328853286598159, + "learning_rate": 1.0981697708702244e-05, + "loss": 1.348, + "step": 15960 + }, + { + "epoch": 0.5220049699189119, + "grad_norm": 2.7535524095175288, + "learning_rate": 1.0976018316160821e-05, + "loss": 1.442, + "step": 15965 + }, + { + "epoch": 0.5221684540936438, + "grad_norm": 2.9904612518174463, + "learning_rate": 1.097033860575415e-05, + "loss": 1.4976, + "step": 15970 + }, + { + "epoch": 0.5223319382683757, + "grad_norm": 2.8488794976469287, + "learning_rate": 1.0964658579331964e-05, + "loss": 1.2952, + "step": 15975 + }, + { + "epoch": 0.5224954224431075, + "grad_norm": 3.188610856991094, + "learning_rate": 1.0958978238744118e-05, + "loss": 1.4436, + "step": 15980 + }, + { + "epoch": 0.5226589066178394, + "grad_norm": 3.053739276623855, + "learning_rate": 1.0953297585840554e-05, + "loss": 1.4827, + "step": 15985 + }, + { + "epoch": 0.5228223907925713, + "grad_norm": 3.150129344404813, + "learning_rate": 1.0947616622471325e-05, + "loss": 1.4788, + "step": 15990 + }, + { + "epoch": 0.5229858749673032, + "grad_norm": 3.1227973880703224, + "learning_rate": 1.0941935350486579e-05, + "loss": 1.5486, + "step": 15995 + }, + { + "epoch": 0.5231493591420351, + "grad_norm": 3.105588870015297, + "learning_rate": 1.0936253771736565e-05, + "loss": 1.4823, + "step": 16000 + }, + { + "epoch": 0.523312843316767, + "grad_norm": 3.3406208396513755, + "learning_rate": 1.093057188807164e-05, + "loss": 1.5297, + "step": 16005 + }, + { + "epoch": 0.5234763274914989, + "grad_norm": 3.1920405893225756, + "learning_rate": 1.0924889701342251e-05, + "loss": 1.4738, + "step": 16010 + }, + { + "epoch": 0.5236398116662307, + "grad_norm": 3.0991720586763343, + "learning_rate": 1.0919207213398948e-05, + "loss": 1.3512, + "step": 16015 + }, + { + "epoch": 0.5238032958409626, + "grad_norm": 3.1990120420234214, + "learning_rate": 1.0913524426092374e-05, + "loss": 1.5274, + "step": 16020 + }, + { + "epoch": 0.5239667800156945, + "grad_norm": 3.304885865635531, + "learning_rate": 1.0907841341273279e-05, + "loss": 1.5607, + "step": 16025 + }, + { + "epoch": 0.5241302641904264, + "grad_norm": 3.129363920476118, + "learning_rate": 1.0902157960792504e-05, + "loss": 1.3824, + "step": 16030 + }, + { + "epoch": 0.5242937483651583, + "grad_norm": 3.6171631275262555, + "learning_rate": 1.0896474286500982e-05, + "loss": 1.4581, + "step": 16035 + }, + { + "epoch": 0.5244572325398902, + "grad_norm": 3.138077506048982, + "learning_rate": 1.089079032024975e-05, + "loss": 1.3416, + "step": 16040 + }, + { + "epoch": 0.5246207167146221, + "grad_norm": 3.4563747653759385, + "learning_rate": 1.0885106063889938e-05, + "loss": 1.5995, + "step": 16045 + }, + { + "epoch": 0.524784200889354, + "grad_norm": 3.0689545838823604, + "learning_rate": 1.0879421519272768e-05, + "loss": 1.3921, + "step": 16050 + }, + { + "epoch": 0.5249476850640858, + "grad_norm": 3.141312690310401, + "learning_rate": 1.0873736688249554e-05, + "loss": 1.5002, + "step": 16055 + }, + { + "epoch": 0.5251111692388177, + "grad_norm": 3.2585547382730438, + "learning_rate": 1.086805157267171e-05, + "loss": 1.4435, + "step": 16060 + }, + { + "epoch": 0.5252746534135496, + "grad_norm": 3.3156171907924152, + "learning_rate": 1.0862366174390734e-05, + "loss": 1.4498, + "step": 16065 + }, + { + "epoch": 0.5254381375882815, + "grad_norm": 3.18732056338354, + "learning_rate": 1.0856680495258227e-05, + "loss": 1.4834, + "step": 16070 + }, + { + "epoch": 0.5256016217630134, + "grad_norm": 3.6141732097730768, + "learning_rate": 1.0850994537125872e-05, + "loss": 1.4914, + "step": 16075 + }, + { + "epoch": 0.5257651059377453, + "grad_norm": 3.373652585543236, + "learning_rate": 1.0845308301845444e-05, + "loss": 1.3024, + "step": 16080 + }, + { + "epoch": 0.5259285901124771, + "grad_norm": 3.150201520514671, + "learning_rate": 1.0839621791268812e-05, + "loss": 1.4883, + "step": 16085 + }, + { + "epoch": 0.5260920742872089, + "grad_norm": 3.156319168844806, + "learning_rate": 1.083393500724794e-05, + "loss": 1.4537, + "step": 16090 + }, + { + "epoch": 0.5262555584619408, + "grad_norm": 3.2006672306524893, + "learning_rate": 1.0828247951634865e-05, + "loss": 1.3947, + "step": 16095 + }, + { + "epoch": 0.5264190426366727, + "grad_norm": 3.2489925396581874, + "learning_rate": 1.0822560626281727e-05, + "loss": 1.5271, + "step": 16100 + }, + { + "epoch": 0.5265825268114046, + "grad_norm": 3.0541739348873667, + "learning_rate": 1.0816873033040742e-05, + "loss": 1.3367, + "step": 16105 + }, + { + "epoch": 0.5267460109861365, + "grad_norm": 3.2726548701944145, + "learning_rate": 1.081118517376423e-05, + "loss": 1.5159, + "step": 16110 + }, + { + "epoch": 0.5269094951608684, + "grad_norm": 3.156677032051467, + "learning_rate": 1.080549705030458e-05, + "loss": 1.4512, + "step": 16115 + }, + { + "epoch": 0.5270729793356003, + "grad_norm": 3.1302469854756607, + "learning_rate": 1.0799808664514277e-05, + "loss": 1.3843, + "step": 16120 + }, + { + "epoch": 0.5272364635103322, + "grad_norm": 3.11227050397391, + "learning_rate": 1.0794120018245888e-05, + "loss": 1.4282, + "step": 16125 + }, + { + "epoch": 0.527399947685064, + "grad_norm": 3.31105354536987, + "learning_rate": 1.0788431113352063e-05, + "loss": 1.5136, + "step": 16130 + }, + { + "epoch": 0.5275634318597959, + "grad_norm": 2.984626094228687, + "learning_rate": 1.0782741951685545e-05, + "loss": 1.3046, + "step": 16135 + }, + { + "epoch": 0.5277269160345278, + "grad_norm": 3.3665978108249712, + "learning_rate": 1.077705253509915e-05, + "loss": 1.6285, + "step": 16140 + }, + { + "epoch": 0.5278904002092597, + "grad_norm": 2.9496890062177537, + "learning_rate": 1.0771362865445784e-05, + "loss": 1.4409, + "step": 16145 + }, + { + "epoch": 0.5280538843839916, + "grad_norm": 3.0319965529823594, + "learning_rate": 1.0765672944578436e-05, + "loss": 1.4814, + "step": 16150 + }, + { + "epoch": 0.5282173685587235, + "grad_norm": 3.2216116068139633, + "learning_rate": 1.0759982774350164e-05, + "loss": 1.5195, + "step": 16155 + }, + { + "epoch": 0.5283808527334554, + "grad_norm": 3.2912635140650464, + "learning_rate": 1.0754292356614125e-05, + "loss": 1.528, + "step": 16160 + }, + { + "epoch": 0.5285443369081873, + "grad_norm": 3.336181597707753, + "learning_rate": 1.0748601693223546e-05, + "loss": 1.5594, + "step": 16165 + }, + { + "epoch": 0.5287078210829191, + "grad_norm": 3.25476837229605, + "learning_rate": 1.0742910786031738e-05, + "loss": 1.3824, + "step": 16170 + }, + { + "epoch": 0.528871305257651, + "grad_norm": 3.4253090884299913, + "learning_rate": 1.0737219636892089e-05, + "loss": 1.4752, + "step": 16175 + }, + { + "epoch": 0.5290347894323829, + "grad_norm": 3.1719907989479137, + "learning_rate": 1.0731528247658065e-05, + "loss": 1.4946, + "step": 16180 + }, + { + "epoch": 0.5291982736071148, + "grad_norm": 3.288827985967019, + "learning_rate": 1.0725836620183216e-05, + "loss": 1.4638, + "step": 16185 + }, + { + "epoch": 0.5293617577818467, + "grad_norm": 3.11585615669962, + "learning_rate": 1.0720144756321163e-05, + "loss": 1.5985, + "step": 16190 + }, + { + "epoch": 0.5295252419565786, + "grad_norm": 2.994665916995509, + "learning_rate": 1.0714452657925609e-05, + "loss": 1.4167, + "step": 16195 + }, + { + "epoch": 0.5296887261313105, + "grad_norm": 3.14003247210282, + "learning_rate": 1.0708760326850326e-05, + "loss": 1.4734, + "step": 16200 + }, + { + "epoch": 0.5298522103060423, + "grad_norm": 3.2664245290908145, + "learning_rate": 1.070306776494917e-05, + "loss": 1.607, + "step": 16205 + }, + { + "epoch": 0.5300156944807742, + "grad_norm": 3.513197417526411, + "learning_rate": 1.069737497407607e-05, + "loss": 1.5644, + "step": 16210 + }, + { + "epoch": 0.5301791786555061, + "grad_norm": 3.0924980986259083, + "learning_rate": 1.0691681956085032e-05, + "loss": 1.5083, + "step": 16215 + }, + { + "epoch": 0.530342662830238, + "grad_norm": 3.025944461722434, + "learning_rate": 1.0685988712830124e-05, + "loss": 1.4607, + "step": 16220 + }, + { + "epoch": 0.5305061470049699, + "grad_norm": 3.095702895223248, + "learning_rate": 1.06802952461655e-05, + "loss": 1.4898, + "step": 16225 + }, + { + "epoch": 0.5306696311797018, + "grad_norm": 3.495963527906044, + "learning_rate": 1.0674601557945384e-05, + "loss": 1.5162, + "step": 16230 + }, + { + "epoch": 0.5308331153544337, + "grad_norm": 2.9512891418735068, + "learning_rate": 1.0668907650024063e-05, + "loss": 1.3335, + "step": 16235 + }, + { + "epoch": 0.5309965995291656, + "grad_norm": 3.0659291527482995, + "learning_rate": 1.0663213524255915e-05, + "loss": 1.3278, + "step": 16240 + }, + { + "epoch": 0.5311600837038974, + "grad_norm": 2.9406410688529547, + "learning_rate": 1.065751918249537e-05, + "loss": 1.5617, + "step": 16245 + }, + { + "epoch": 0.5313235678786293, + "grad_norm": 3.1177564290818602, + "learning_rate": 1.0651824626596938e-05, + "loss": 1.3896, + "step": 16250 + }, + { + "epoch": 0.5314870520533612, + "grad_norm": 3.21522983033955, + "learning_rate": 1.0646129858415197e-05, + "loss": 1.4228, + "step": 16255 + }, + { + "epoch": 0.5316505362280931, + "grad_norm": 3.091864131311399, + "learning_rate": 1.0640434879804791e-05, + "loss": 1.4863, + "step": 16260 + }, + { + "epoch": 0.531814020402825, + "grad_norm": 3.4229986385140516, + "learning_rate": 1.0634739692620435e-05, + "loss": 1.3836, + "step": 16265 + }, + { + "epoch": 0.5319775045775569, + "grad_norm": 3.001477923456309, + "learning_rate": 1.0629044298716916e-05, + "loss": 1.397, + "step": 16270 + }, + { + "epoch": 0.5321409887522888, + "grad_norm": 3.2754927934137763, + "learning_rate": 1.062334869994908e-05, + "loss": 1.4754, + "step": 16275 + }, + { + "epoch": 0.5323044729270207, + "grad_norm": 3.26903930804468, + "learning_rate": 1.0617652898171842e-05, + "loss": 1.5216, + "step": 16280 + }, + { + "epoch": 0.5324679571017525, + "grad_norm": 3.2686538128288776, + "learning_rate": 1.0611956895240188e-05, + "loss": 1.366, + "step": 16285 + }, + { + "epoch": 0.5326314412764844, + "grad_norm": 3.026478337696991, + "learning_rate": 1.060626069300917e-05, + "loss": 1.3779, + "step": 16290 + }, + { + "epoch": 0.5327949254512163, + "grad_norm": 3.1396400564250557, + "learning_rate": 1.0600564293333898e-05, + "loss": 1.4632, + "step": 16295 + }, + { + "epoch": 0.5329584096259482, + "grad_norm": 3.2284358153459016, + "learning_rate": 1.0594867698069551e-05, + "loss": 1.3824, + "step": 16300 + }, + { + "epoch": 0.5331218938006801, + "grad_norm": 3.152758082246359, + "learning_rate": 1.0589170909071366e-05, + "loss": 1.4203, + "step": 16305 + }, + { + "epoch": 0.533285377975412, + "grad_norm": 2.990870005772669, + "learning_rate": 1.0583473928194654e-05, + "loss": 1.3994, + "step": 16310 + }, + { + "epoch": 0.5334488621501439, + "grad_norm": 3.1263587153931214, + "learning_rate": 1.057777675729478e-05, + "loss": 1.4437, + "step": 16315 + }, + { + "epoch": 0.5336123463248758, + "grad_norm": 2.9852030148856326, + "learning_rate": 1.057207939822717e-05, + "loss": 1.3367, + "step": 16320 + }, + { + "epoch": 0.5337758304996076, + "grad_norm": 3.2439941000910015, + "learning_rate": 1.0566381852847321e-05, + "loss": 1.4912, + "step": 16325 + }, + { + "epoch": 0.5339393146743395, + "grad_norm": 3.1775096246292094, + "learning_rate": 1.0560684123010776e-05, + "loss": 1.3927, + "step": 16330 + }, + { + "epoch": 0.5341027988490714, + "grad_norm": 3.1534381440869086, + "learning_rate": 1.0554986210573148e-05, + "loss": 1.4829, + "step": 16335 + }, + { + "epoch": 0.5342662830238033, + "grad_norm": 3.0989597759723266, + "learning_rate": 1.0549288117390113e-05, + "loss": 1.5425, + "step": 16340 + }, + { + "epoch": 0.5344297671985352, + "grad_norm": 3.1594193700362165, + "learning_rate": 1.0543589845317394e-05, + "loss": 1.5588, + "step": 16345 + }, + { + "epoch": 0.5345932513732671, + "grad_norm": 3.1661967054220552, + "learning_rate": 1.0537891396210783e-05, + "loss": 1.4189, + "step": 16350 + }, + { + "epoch": 0.534756735547999, + "grad_norm": 3.206277491390717, + "learning_rate": 1.0532192771926121e-05, + "loss": 1.4591, + "step": 16355 + }, + { + "epoch": 0.5349202197227308, + "grad_norm": 3.283792114115111, + "learning_rate": 1.0526493974319315e-05, + "loss": 1.4901, + "step": 16360 + }, + { + "epoch": 0.5350837038974627, + "grad_norm": 3.089669135963048, + "learning_rate": 1.0520795005246318e-05, + "loss": 1.4895, + "step": 16365 + }, + { + "epoch": 0.5352471880721946, + "grad_norm": 2.8566396304035138, + "learning_rate": 1.0515095866563152e-05, + "loss": 1.4173, + "step": 16370 + }, + { + "epoch": 0.5354106722469265, + "grad_norm": 3.0795878825861367, + "learning_rate": 1.050939656012588e-05, + "loss": 1.4799, + "step": 16375 + }, + { + "epoch": 0.5355741564216584, + "grad_norm": 3.1055517505061894, + "learning_rate": 1.0503697087790629e-05, + "loss": 1.4901, + "step": 16380 + }, + { + "epoch": 0.5357376405963903, + "grad_norm": 3.034679240131745, + "learning_rate": 1.0497997451413577e-05, + "loss": 1.4967, + "step": 16385 + }, + { + "epoch": 0.5359011247711222, + "grad_norm": 3.1723424034070526, + "learning_rate": 1.0492297652850957e-05, + "loss": 1.3971, + "step": 16390 + }, + { + "epoch": 0.5360646089458541, + "grad_norm": 3.2300906873409674, + "learning_rate": 1.0486597693959054e-05, + "loss": 1.3948, + "step": 16395 + }, + { + "epoch": 0.536228093120586, + "grad_norm": 2.8628240125092392, + "learning_rate": 1.0480897576594206e-05, + "loss": 1.4215, + "step": 16400 + }, + { + "epoch": 0.5363915772953178, + "grad_norm": 3.2738494404704457, + "learning_rate": 1.0475197302612801e-05, + "loss": 1.4583, + "step": 16405 + }, + { + "epoch": 0.5365550614700497, + "grad_norm": 3.246116078392775, + "learning_rate": 1.0469496873871274e-05, + "loss": 1.432, + "step": 16410 + }, + { + "epoch": 0.5367185456447816, + "grad_norm": 3.340747400833663, + "learning_rate": 1.0463796292226116e-05, + "loss": 1.4918, + "step": 16415 + }, + { + "epoch": 0.5368820298195135, + "grad_norm": 3.1767736760872016, + "learning_rate": 1.0458095559533873e-05, + "loss": 1.4484, + "step": 16420 + }, + { + "epoch": 0.5370455139942454, + "grad_norm": 3.2834706215613605, + "learning_rate": 1.045239467765113e-05, + "loss": 1.4282, + "step": 16425 + }, + { + "epoch": 0.5372089981689773, + "grad_norm": 3.1587243329219725, + "learning_rate": 1.0446693648434525e-05, + "loss": 1.4943, + "step": 16430 + }, + { + "epoch": 0.5373724823437092, + "grad_norm": 3.2920753920764034, + "learning_rate": 1.0440992473740744e-05, + "loss": 1.4805, + "step": 16435 + }, + { + "epoch": 0.537535966518441, + "grad_norm": 2.9813887207147136, + "learning_rate": 1.0435291155426514e-05, + "loss": 1.3712, + "step": 16440 + }, + { + "epoch": 0.5376994506931729, + "grad_norm": 3.2709445305170557, + "learning_rate": 1.0429589695348626e-05, + "loss": 1.5033, + "step": 16445 + }, + { + "epoch": 0.5378629348679048, + "grad_norm": 2.9901602936840868, + "learning_rate": 1.0423888095363896e-05, + "loss": 1.5156, + "step": 16450 + }, + { + "epoch": 0.5380264190426367, + "grad_norm": 3.1504640787556717, + "learning_rate": 1.04181863573292e-05, + "loss": 1.5401, + "step": 16455 + }, + { + "epoch": 0.5381899032173686, + "grad_norm": 2.9600602654870922, + "learning_rate": 1.0412484483101455e-05, + "loss": 1.4315, + "step": 16460 + }, + { + "epoch": 0.5383533873921005, + "grad_norm": 3.4085061356253106, + "learning_rate": 1.040678247453762e-05, + "loss": 1.5301, + "step": 16465 + }, + { + "epoch": 0.5385168715668324, + "grad_norm": 3.1948754966275126, + "learning_rate": 1.0401080333494698e-05, + "loss": 1.4974, + "step": 16470 + }, + { + "epoch": 0.5386803557415643, + "grad_norm": 3.3050588367948417, + "learning_rate": 1.039537806182974e-05, + "loss": 1.3504, + "step": 16475 + }, + { + "epoch": 0.5388438399162961, + "grad_norm": 3.5506148248356397, + "learning_rate": 1.0389675661399834e-05, + "loss": 1.5483, + "step": 16480 + }, + { + "epoch": 0.539007324091028, + "grad_norm": 3.1660620271468782, + "learning_rate": 1.038397313406211e-05, + "loss": 1.4013, + "step": 16485 + }, + { + "epoch": 0.5391708082657599, + "grad_norm": 3.356270114676638, + "learning_rate": 1.0378270481673747e-05, + "loss": 1.3897, + "step": 16490 + }, + { + "epoch": 0.5393342924404918, + "grad_norm": 3.306068519388739, + "learning_rate": 1.0372567706091953e-05, + "loss": 1.5395, + "step": 16495 + }, + { + "epoch": 0.5394977766152237, + "grad_norm": 3.199208991581195, + "learning_rate": 1.036686480917399e-05, + "loss": 1.4623, + "step": 16500 + }, + { + "epoch": 0.5396612607899556, + "grad_norm": 3.0899808080877382, + "learning_rate": 1.0361161792777146e-05, + "loss": 1.4129, + "step": 16505 + }, + { + "epoch": 0.5398247449646875, + "grad_norm": 3.159572226504458, + "learning_rate": 1.0355458658758754e-05, + "loss": 1.3953, + "step": 16510 + }, + { + "epoch": 0.5399882291394194, + "grad_norm": 3.0100141330538963, + "learning_rate": 1.0349755408976183e-05, + "loss": 1.4531, + "step": 16515 + }, + { + "epoch": 0.5401517133141512, + "grad_norm": 3.1141025288636537, + "learning_rate": 1.034405204528685e-05, + "loss": 1.5555, + "step": 16520 + }, + { + "epoch": 0.5403151974888831, + "grad_norm": 3.019104260929103, + "learning_rate": 1.0338348569548193e-05, + "loss": 1.4266, + "step": 16525 + }, + { + "epoch": 0.540478681663615, + "grad_norm": 3.2806117260679244, + "learning_rate": 1.0332644983617696e-05, + "loss": 1.5618, + "step": 16530 + }, + { + "epoch": 0.5406421658383469, + "grad_norm": 3.4743265821464444, + "learning_rate": 1.032694128935288e-05, + "loss": 1.4357, + "step": 16535 + }, + { + "epoch": 0.5408056500130788, + "grad_norm": 3.3121873173593603, + "learning_rate": 1.0321237488611298e-05, + "loss": 1.4038, + "step": 16540 + }, + { + "epoch": 0.5409691341878107, + "grad_norm": 3.1963339887939726, + "learning_rate": 1.0315533583250531e-05, + "loss": 1.5107, + "step": 16545 + }, + { + "epoch": 0.5411326183625426, + "grad_norm": 3.057321186365034, + "learning_rate": 1.0309829575128212e-05, + "loss": 1.4847, + "step": 16550 + }, + { + "epoch": 0.5412961025372743, + "grad_norm": 2.9636416301244135, + "learning_rate": 1.0304125466101989e-05, + "loss": 1.4114, + "step": 16555 + }, + { + "epoch": 0.5414595867120062, + "grad_norm": 3.401097130022778, + "learning_rate": 1.0298421258029553e-05, + "loss": 1.4198, + "step": 16560 + }, + { + "epoch": 0.5416230708867381, + "grad_norm": 2.972479270443069, + "learning_rate": 1.0292716952768628e-05, + "loss": 1.4109, + "step": 16565 + }, + { + "epoch": 0.54178655506147, + "grad_norm": 3.0710251876967227, + "learning_rate": 1.0287012552176961e-05, + "loss": 1.4176, + "step": 16570 + }, + { + "epoch": 0.5419500392362019, + "grad_norm": 3.140053072890089, + "learning_rate": 1.0281308058112338e-05, + "loss": 1.3547, + "step": 16575 + }, + { + "epoch": 0.5421135234109338, + "grad_norm": 3.2444333541388137, + "learning_rate": 1.0275603472432574e-05, + "loss": 1.51, + "step": 16580 + }, + { + "epoch": 0.5422770075856657, + "grad_norm": 3.0988412752195544, + "learning_rate": 1.0269898796995512e-05, + "loss": 1.4098, + "step": 16585 + }, + { + "epoch": 0.5424404917603975, + "grad_norm": 8.16960167820411, + "learning_rate": 1.026419403365902e-05, + "loss": 1.4794, + "step": 16590 + }, + { + "epoch": 0.5426039759351294, + "grad_norm": 3.3121949857789748, + "learning_rate": 1.0258489184281008e-05, + "loss": 1.5068, + "step": 16595 + }, + { + "epoch": 0.5427674601098613, + "grad_norm": 3.1051551754276274, + "learning_rate": 1.0252784250719403e-05, + "loss": 1.3809, + "step": 16600 + }, + { + "epoch": 0.5429309442845932, + "grad_norm": 3.202943184071893, + "learning_rate": 1.024707923483216e-05, + "loss": 1.515, + "step": 16605 + }, + { + "epoch": 0.5430944284593251, + "grad_norm": 3.26534444072387, + "learning_rate": 1.0241374138477265e-05, + "loss": 1.434, + "step": 16610 + }, + { + "epoch": 0.543257912634057, + "grad_norm": 3.029446215885449, + "learning_rate": 1.0235668963512724e-05, + "loss": 1.4623, + "step": 16615 + }, + { + "epoch": 0.5434213968087889, + "grad_norm": 3.0886652607252465, + "learning_rate": 1.0229963711796576e-05, + "loss": 1.5655, + "step": 16620 + }, + { + "epoch": 0.5435848809835208, + "grad_norm": 3.0473287513303093, + "learning_rate": 1.0224258385186882e-05, + "loss": 1.5278, + "step": 16625 + }, + { + "epoch": 0.5437483651582526, + "grad_norm": 3.177892682755102, + "learning_rate": 1.0218552985541729e-05, + "loss": 1.449, + "step": 16630 + }, + { + "epoch": 0.5439118493329845, + "grad_norm": 3.1974086674966786, + "learning_rate": 1.0212847514719222e-05, + "loss": 1.4204, + "step": 16635 + }, + { + "epoch": 0.5440753335077164, + "grad_norm": 2.9060897015740075, + "learning_rate": 1.0207141974577494e-05, + "loss": 1.3899, + "step": 16640 + }, + { + "epoch": 0.5442388176824483, + "grad_norm": 3.0664956507893697, + "learning_rate": 1.0201436366974699e-05, + "loss": 1.4199, + "step": 16645 + }, + { + "epoch": 0.5444023018571802, + "grad_norm": 3.2115679562725536, + "learning_rate": 1.0195730693769017e-05, + "loss": 1.409, + "step": 16650 + }, + { + "epoch": 0.5445657860319121, + "grad_norm": 3.144545299320838, + "learning_rate": 1.0190024956818642e-05, + "loss": 1.3606, + "step": 16655 + }, + { + "epoch": 0.544729270206644, + "grad_norm": 3.1247761471362074, + "learning_rate": 1.0184319157981798e-05, + "loss": 1.505, + "step": 16660 + }, + { + "epoch": 0.5448927543813759, + "grad_norm": 3.27199464850872, + "learning_rate": 1.0178613299116717e-05, + "loss": 1.5364, + "step": 16665 + }, + { + "epoch": 0.5450562385561077, + "grad_norm": 3.2220719322356017, + "learning_rate": 1.0172907382081663e-05, + "loss": 1.5123, + "step": 16670 + }, + { + "epoch": 0.5452197227308396, + "grad_norm": 3.378761752390801, + "learning_rate": 1.0167201408734908e-05, + "loss": 1.4902, + "step": 16675 + }, + { + "epoch": 0.5453832069055715, + "grad_norm": 3.310392088384264, + "learning_rate": 1.0161495380934752e-05, + "loss": 1.4312, + "step": 16680 + }, + { + "epoch": 0.5455466910803034, + "grad_norm": 3.323515001733779, + "learning_rate": 1.0155789300539509e-05, + "loss": 1.4161, + "step": 16685 + }, + { + "epoch": 0.5457101752550353, + "grad_norm": 3.3019539723276248, + "learning_rate": 1.0150083169407506e-05, + "loss": 1.4762, + "step": 16690 + }, + { + "epoch": 0.5458736594297672, + "grad_norm": 3.2865480325163303, + "learning_rate": 1.0144376989397092e-05, + "loss": 1.3215, + "step": 16695 + }, + { + "epoch": 0.5460371436044991, + "grad_norm": 3.372677038513579, + "learning_rate": 1.0138670762366629e-05, + "loss": 1.328, + "step": 16700 + }, + { + "epoch": 0.546200627779231, + "grad_norm": 3.310142627315887, + "learning_rate": 1.0132964490174498e-05, + "loss": 1.4876, + "step": 16705 + }, + { + "epoch": 0.5463641119539628, + "grad_norm": 2.9561719274820737, + "learning_rate": 1.0127258174679089e-05, + "loss": 1.4645, + "step": 16710 + }, + { + "epoch": 0.5465275961286947, + "grad_norm": 3.2912769757441027, + "learning_rate": 1.0121551817738813e-05, + "loss": 1.5654, + "step": 16715 + }, + { + "epoch": 0.5466910803034266, + "grad_norm": 3.353139129868719, + "learning_rate": 1.0115845421212082e-05, + "loss": 1.3247, + "step": 16720 + }, + { + "epoch": 0.5468545644781585, + "grad_norm": 3.272914582124596, + "learning_rate": 1.0110138986957343e-05, + "loss": 1.5459, + "step": 16725 + }, + { + "epoch": 0.5470180486528904, + "grad_norm": 3.1676586088080962, + "learning_rate": 1.0104432516833031e-05, + "loss": 1.2935, + "step": 16730 + }, + { + "epoch": 0.5471815328276223, + "grad_norm": 3.315175980038363, + "learning_rate": 1.0098726012697608e-05, + "loss": 1.5206, + "step": 16735 + }, + { + "epoch": 0.5473450170023542, + "grad_norm": 3.3355716637151285, + "learning_rate": 1.0093019476409543e-05, + "loss": 1.4547, + "step": 16740 + }, + { + "epoch": 0.547508501177086, + "grad_norm": 2.98016117808604, + "learning_rate": 1.008731290982731e-05, + "loss": 1.3749, + "step": 16745 + }, + { + "epoch": 0.5476719853518179, + "grad_norm": 3.2647152136120487, + "learning_rate": 1.0081606314809402e-05, + "loss": 1.6341, + "step": 16750 + }, + { + "epoch": 0.5478354695265498, + "grad_norm": 3.1752464879376947, + "learning_rate": 1.0075899693214317e-05, + "loss": 1.4806, + "step": 16755 + }, + { + "epoch": 0.5479989537012817, + "grad_norm": 3.07144044856544, + "learning_rate": 1.0070193046900565e-05, + "loss": 1.4273, + "step": 16760 + }, + { + "epoch": 0.5481624378760136, + "grad_norm": 3.355852437828579, + "learning_rate": 1.0064486377726655e-05, + "loss": 1.5154, + "step": 16765 + }, + { + "epoch": 0.5483259220507455, + "grad_norm": 3.3778753592354, + "learning_rate": 1.0058779687551113e-05, + "loss": 1.3673, + "step": 16770 + }, + { + "epoch": 0.5484894062254774, + "grad_norm": 3.255719889945629, + "learning_rate": 1.0053072978232463e-05, + "loss": 1.4288, + "step": 16775 + }, + { + "epoch": 0.5486528904002093, + "grad_norm": 3.4494131277625004, + "learning_rate": 1.004736625162925e-05, + "loss": 1.4512, + "step": 16780 + }, + { + "epoch": 0.5488163745749411, + "grad_norm": 3.324705137470801, + "learning_rate": 1.0041659509600005e-05, + "loss": 1.3096, + "step": 16785 + }, + { + "epoch": 0.548979858749673, + "grad_norm": 3.051834372027565, + "learning_rate": 1.003595275400328e-05, + "loss": 1.5436, + "step": 16790 + }, + { + "epoch": 0.5491433429244049, + "grad_norm": 3.1674263578931154, + "learning_rate": 1.0030245986697622e-05, + "loss": 1.3103, + "step": 16795 + }, + { + "epoch": 0.5493068270991368, + "grad_norm": 2.8932688910470574, + "learning_rate": 1.0024539209541591e-05, + "loss": 1.3913, + "step": 16800 + }, + { + "epoch": 0.5494703112738687, + "grad_norm": 3.1132070489743264, + "learning_rate": 1.0018832424393738e-05, + "loss": 1.522, + "step": 16805 + }, + { + "epoch": 0.5496337954486006, + "grad_norm": 3.3819755344904583, + "learning_rate": 1.0013125633112626e-05, + "loss": 1.4182, + "step": 16810 + }, + { + "epoch": 0.5497972796233325, + "grad_norm": 3.3676779014556875, + "learning_rate": 1.0007418837556816e-05, + "loss": 1.4862, + "step": 16815 + }, + { + "epoch": 0.5499607637980644, + "grad_norm": 3.341756078925655, + "learning_rate": 1.0001712039584876e-05, + "loss": 1.5365, + "step": 16820 + }, + { + "epoch": 0.5501242479727962, + "grad_norm": 3.2899451355676703, + "learning_rate": 9.996005241055364e-06, + "loss": 1.5256, + "step": 16825 + }, + { + "epoch": 0.5502877321475281, + "grad_norm": 3.00649958498356, + "learning_rate": 9.990298443826848e-06, + "loss": 1.5567, + "step": 16830 + }, + { + "epoch": 0.55045121632226, + "grad_norm": 3.367559031328479, + "learning_rate": 9.984591649757891e-06, + "loss": 1.6009, + "step": 16835 + }, + { + "epoch": 0.5506147004969919, + "grad_norm": 3.267542798407377, + "learning_rate": 9.978884860707057e-06, + "loss": 1.4386, + "step": 16840 + }, + { + "epoch": 0.5507781846717238, + "grad_norm": 3.49916746767504, + "learning_rate": 9.973178078532903e-06, + "loss": 1.4967, + "step": 16845 + }, + { + "epoch": 0.5509416688464557, + "grad_norm": 3.115251481753498, + "learning_rate": 9.967471305093995e-06, + "loss": 1.4422, + "step": 16850 + }, + { + "epoch": 0.5511051530211876, + "grad_norm": 3.2308176688624908, + "learning_rate": 9.961764542248883e-06, + "loss": 1.5309, + "step": 16855 + }, + { + "epoch": 0.5512686371959195, + "grad_norm": 3.3161521157349796, + "learning_rate": 9.956057791856127e-06, + "loss": 1.5171, + "step": 16860 + }, + { + "epoch": 0.5514321213706513, + "grad_norm": 3.2351689594138553, + "learning_rate": 9.950351055774268e-06, + "loss": 1.4741, + "step": 16865 + }, + { + "epoch": 0.5515956055453832, + "grad_norm": 3.0730334652965374, + "learning_rate": 9.944644335861854e-06, + "loss": 1.4014, + "step": 16870 + }, + { + "epoch": 0.5517590897201151, + "grad_norm": 3.0413815126912294, + "learning_rate": 9.938937633977424e-06, + "loss": 1.4335, + "step": 16875 + }, + { + "epoch": 0.551922573894847, + "grad_norm": 3.050020313297915, + "learning_rate": 9.933230951979512e-06, + "loss": 1.4433, + "step": 16880 + }, + { + "epoch": 0.5520860580695789, + "grad_norm": 2.971916430141568, + "learning_rate": 9.927524291726641e-06, + "loss": 1.5113, + "step": 16885 + }, + { + "epoch": 0.5522495422443108, + "grad_norm": 3.0583206847107474, + "learning_rate": 9.921817655077335e-06, + "loss": 1.424, + "step": 16890 + }, + { + "epoch": 0.5524130264190427, + "grad_norm": 3.0517036799549655, + "learning_rate": 9.916111043890101e-06, + "loss": 1.4328, + "step": 16895 + }, + { + "epoch": 0.5525765105937746, + "grad_norm": 3.22155229052962, + "learning_rate": 9.910404460023445e-06, + "loss": 1.4378, + "step": 16900 + }, + { + "epoch": 0.5527399947685064, + "grad_norm": 3.1505026076108145, + "learning_rate": 9.90469790533586e-06, + "loss": 1.589, + "step": 16905 + }, + { + "epoch": 0.5529034789432383, + "grad_norm": 2.9097123708063815, + "learning_rate": 9.898991381685835e-06, + "loss": 1.432, + "step": 16910 + }, + { + "epoch": 0.5530669631179702, + "grad_norm": 3.0156444711898716, + "learning_rate": 9.89328489093184e-06, + "loss": 1.3302, + "step": 16915 + }, + { + "epoch": 0.5532304472927021, + "grad_norm": 3.1792073310804807, + "learning_rate": 9.887578434932343e-06, + "loss": 1.441, + "step": 16920 + }, + { + "epoch": 0.553393931467434, + "grad_norm": 3.212282762947364, + "learning_rate": 9.881872015545792e-06, + "loss": 1.4601, + "step": 16925 + }, + { + "epoch": 0.5535574156421659, + "grad_norm": 3.074110170706346, + "learning_rate": 9.876165634630633e-06, + "loss": 1.461, + "step": 16930 + }, + { + "epoch": 0.5537208998168978, + "grad_norm": 3.035855033027411, + "learning_rate": 9.870459294045295e-06, + "loss": 1.526, + "step": 16935 + }, + { + "epoch": 0.5538843839916296, + "grad_norm": 2.973328360293079, + "learning_rate": 9.86475299564819e-06, + "loss": 1.5409, + "step": 16940 + }, + { + "epoch": 0.5540478681663615, + "grad_norm": 3.3254866406149213, + "learning_rate": 9.85904674129772e-06, + "loss": 1.5723, + "step": 16945 + }, + { + "epoch": 0.5542113523410934, + "grad_norm": 2.963381096352016, + "learning_rate": 9.853340532852273e-06, + "loss": 1.4126, + "step": 16950 + }, + { + "epoch": 0.5543748365158253, + "grad_norm": 3.0678696825360343, + "learning_rate": 9.847634372170219e-06, + "loss": 1.3954, + "step": 16955 + }, + { + "epoch": 0.5545383206905572, + "grad_norm": 3.3986736630365537, + "learning_rate": 9.841928261109918e-06, + "loss": 1.4823, + "step": 16960 + }, + { + "epoch": 0.5547018048652891, + "grad_norm": 3.045354243218154, + "learning_rate": 9.836222201529712e-06, + "loss": 1.2965, + "step": 16965 + }, + { + "epoch": 0.554865289040021, + "grad_norm": 3.1121027555775673, + "learning_rate": 9.83051619528792e-06, + "loss": 1.4305, + "step": 16970 + }, + { + "epoch": 0.5550287732147529, + "grad_norm": 3.107094030275244, + "learning_rate": 9.824810244242852e-06, + "loss": 1.3256, + "step": 16975 + }, + { + "epoch": 0.5551922573894847, + "grad_norm": 3.2193999879810886, + "learning_rate": 9.819104350252792e-06, + "loss": 1.43, + "step": 16980 + }, + { + "epoch": 0.5553557415642166, + "grad_norm": 3.1468542648560383, + "learning_rate": 9.813398515176014e-06, + "loss": 1.5121, + "step": 16985 + }, + { + "epoch": 0.5555192257389485, + "grad_norm": 3.5862593342614435, + "learning_rate": 9.80769274087077e-06, + "loss": 1.4596, + "step": 16990 + }, + { + "epoch": 0.5556827099136804, + "grad_norm": 3.148554698414795, + "learning_rate": 9.801987029195287e-06, + "loss": 1.3344, + "step": 16995 + }, + { + "epoch": 0.5558461940884123, + "grad_norm": 3.0019307494013776, + "learning_rate": 9.79628138200778e-06, + "loss": 1.3575, + "step": 17000 + }, + { + "epoch": 0.5560096782631442, + "grad_norm": 3.079773436572248, + "learning_rate": 9.790575801166432e-06, + "loss": 1.418, + "step": 17005 + }, + { + "epoch": 0.5561731624378761, + "grad_norm": 3.1570855657534707, + "learning_rate": 9.784870288529413e-06, + "loss": 1.445, + "step": 17010 + }, + { + "epoch": 0.556336646612608, + "grad_norm": 3.1708702301027505, + "learning_rate": 9.779164845954874e-06, + "loss": 1.3651, + "step": 17015 + }, + { + "epoch": 0.5565001307873397, + "grad_norm": 3.4026390317014537, + "learning_rate": 9.773459475300932e-06, + "loss": 1.4878, + "step": 17020 + }, + { + "epoch": 0.5566636149620716, + "grad_norm": 2.961495212376508, + "learning_rate": 9.767754178425689e-06, + "loss": 1.4259, + "step": 17025 + }, + { + "epoch": 0.5568270991368035, + "grad_norm": 3.2379216823907675, + "learning_rate": 9.762048957187221e-06, + "loss": 1.4047, + "step": 17030 + }, + { + "epoch": 0.5569905833115354, + "grad_norm": 3.304731997559686, + "learning_rate": 9.756343813443576e-06, + "loss": 1.5572, + "step": 17035 + }, + { + "epoch": 0.5571540674862673, + "grad_norm": 3.010452133494072, + "learning_rate": 9.750638749052782e-06, + "loss": 1.2832, + "step": 17040 + }, + { + "epoch": 0.5573175516609992, + "grad_norm": 3.3143612713444326, + "learning_rate": 9.744933765872838e-06, + "loss": 1.4381, + "step": 17045 + }, + { + "epoch": 0.557481035835731, + "grad_norm": 3.588784160798702, + "learning_rate": 9.739228865761713e-06, + "loss": 1.4647, + "step": 17050 + }, + { + "epoch": 0.5576445200104629, + "grad_norm": 2.8926417365845354, + "learning_rate": 9.73352405057736e-06, + "loss": 1.4631, + "step": 17055 + }, + { + "epoch": 0.5578080041851948, + "grad_norm": 2.992457033342878, + "learning_rate": 9.727819322177696e-06, + "loss": 1.3583, + "step": 17060 + }, + { + "epoch": 0.5579714883599267, + "grad_norm": 3.2736584802133826, + "learning_rate": 9.72211468242061e-06, + "loss": 1.3953, + "step": 17065 + }, + { + "epoch": 0.5581349725346586, + "grad_norm": 3.191859184411776, + "learning_rate": 9.716410133163962e-06, + "loss": 1.3584, + "step": 17070 + }, + { + "epoch": 0.5582984567093905, + "grad_norm": 2.9271000108472283, + "learning_rate": 9.710705676265586e-06, + "loss": 1.3766, + "step": 17075 + }, + { + "epoch": 0.5584619408841224, + "grad_norm": 3.296826083832715, + "learning_rate": 9.705001313583282e-06, + "loss": 1.3124, + "step": 17080 + }, + { + "epoch": 0.5586254250588543, + "grad_norm": 3.211840968619704, + "learning_rate": 9.699297046974823e-06, + "loss": 1.5105, + "step": 17085 + }, + { + "epoch": 0.5587889092335862, + "grad_norm": 3.3462425346530678, + "learning_rate": 9.693592878297948e-06, + "loss": 1.542, + "step": 17090 + }, + { + "epoch": 0.558952393408318, + "grad_norm": 3.0620979772223396, + "learning_rate": 9.687888809410366e-06, + "loss": 1.4543, + "step": 17095 + }, + { + "epoch": 0.5591158775830499, + "grad_norm": 3.270091789892295, + "learning_rate": 9.682184842169751e-06, + "loss": 1.4256, + "step": 17100 + }, + { + "epoch": 0.5592793617577818, + "grad_norm": 3.2524024596358614, + "learning_rate": 9.676480978433746e-06, + "loss": 1.3261, + "step": 17105 + }, + { + "epoch": 0.5594428459325137, + "grad_norm": 3.138171815115836, + "learning_rate": 9.670777220059959e-06, + "loss": 1.4655, + "step": 17110 + }, + { + "epoch": 0.5596063301072456, + "grad_norm": 3.3108433996338626, + "learning_rate": 9.665073568905967e-06, + "loss": 1.4084, + "step": 17115 + }, + { + "epoch": 0.5597698142819775, + "grad_norm": 3.021880485532437, + "learning_rate": 9.659370026829307e-06, + "loss": 1.3624, + "step": 17120 + }, + { + "epoch": 0.5599332984567094, + "grad_norm": 2.817001407548006, + "learning_rate": 9.653666595687483e-06, + "loss": 1.3481, + "step": 17125 + }, + { + "epoch": 0.5600967826314412, + "grad_norm": 3.137563781687612, + "learning_rate": 9.647963277337962e-06, + "loss": 1.3851, + "step": 17130 + }, + { + "epoch": 0.5602602668061731, + "grad_norm": 3.1875861285461293, + "learning_rate": 9.642260073638178e-06, + "loss": 1.4858, + "step": 17135 + }, + { + "epoch": 0.560423750980905, + "grad_norm": 3.153505849131608, + "learning_rate": 9.636556986445522e-06, + "loss": 1.4092, + "step": 17140 + }, + { + "epoch": 0.5605872351556369, + "grad_norm": 3.387670561838539, + "learning_rate": 9.630854017617352e-06, + "loss": 1.4646, + "step": 17145 + }, + { + "epoch": 0.5607507193303688, + "grad_norm": 2.947042677792708, + "learning_rate": 9.625151169010983e-06, + "loss": 1.3973, + "step": 17150 + }, + { + "epoch": 0.5609142035051007, + "grad_norm": 3.320441498283477, + "learning_rate": 9.619448442483696e-06, + "loss": 1.4786, + "step": 17155 + }, + { + "epoch": 0.5610776876798326, + "grad_norm": 2.8930631880188455, + "learning_rate": 9.613745839892723e-06, + "loss": 1.3264, + "step": 17160 + }, + { + "epoch": 0.5612411718545645, + "grad_norm": 3.2586875463333, + "learning_rate": 9.608043363095268e-06, + "loss": 1.3977, + "step": 17165 + }, + { + "epoch": 0.5614046560292963, + "grad_norm": 3.3077284480134375, + "learning_rate": 9.602341013948488e-06, + "loss": 1.3991, + "step": 17170 + }, + { + "epoch": 0.5615681402040282, + "grad_norm": 2.818857678066059, + "learning_rate": 9.596638794309496e-06, + "loss": 1.4236, + "step": 17175 + }, + { + "epoch": 0.5617316243787601, + "grad_norm": 3.013612574981894, + "learning_rate": 9.590936706035365e-06, + "loss": 1.3105, + "step": 17180 + }, + { + "epoch": 0.561895108553492, + "grad_norm": 3.2077250663998758, + "learning_rate": 9.585234750983125e-06, + "loss": 1.5921, + "step": 17185 + }, + { + "epoch": 0.5620585927282239, + "grad_norm": 2.996801018843489, + "learning_rate": 9.579532931009767e-06, + "loss": 1.5313, + "step": 17190 + }, + { + "epoch": 0.5622220769029558, + "grad_norm": 3.2516177837817577, + "learning_rate": 9.57383124797223e-06, + "loss": 1.347, + "step": 17195 + }, + { + "epoch": 0.5623855610776877, + "grad_norm": 3.0675455761331705, + "learning_rate": 9.568129703727416e-06, + "loss": 1.4926, + "step": 17200 + }, + { + "epoch": 0.5625490452524196, + "grad_norm": 3.172056964576785, + "learning_rate": 9.562428300132174e-06, + "loss": 1.3439, + "step": 17205 + }, + { + "epoch": 0.5627125294271514, + "grad_norm": 3.2211614024112465, + "learning_rate": 9.556727039043315e-06, + "loss": 1.3945, + "step": 17210 + }, + { + "epoch": 0.5628760136018833, + "grad_norm": 3.140054568048794, + "learning_rate": 9.551025922317595e-06, + "loss": 1.4458, + "step": 17215 + }, + { + "epoch": 0.5630394977766152, + "grad_norm": 3.3603171640383627, + "learning_rate": 9.545324951811737e-06, + "loss": 1.4522, + "step": 17220 + }, + { + "epoch": 0.5632029819513471, + "grad_norm": 2.9626535647605317, + "learning_rate": 9.5396241293824e-06, + "loss": 1.486, + "step": 17225 + }, + { + "epoch": 0.563366466126079, + "grad_norm": 3.077649453270401, + "learning_rate": 9.533923456886204e-06, + "loss": 1.3559, + "step": 17230 + }, + { + "epoch": 0.5635299503008109, + "grad_norm": 3.2209159193494536, + "learning_rate": 9.528222936179719e-06, + "loss": 1.4831, + "step": 17235 + }, + { + "epoch": 0.5636934344755428, + "grad_norm": 3.0134280667505338, + "learning_rate": 9.522522569119466e-06, + "loss": 1.445, + "step": 17240 + }, + { + "epoch": 0.5638569186502747, + "grad_norm": 3.35254587774871, + "learning_rate": 9.516822357561913e-06, + "loss": 1.4174, + "step": 17245 + }, + { + "epoch": 0.5640204028250065, + "grad_norm": 3.190578223932862, + "learning_rate": 9.511122303363478e-06, + "loss": 1.4395, + "step": 17250 + }, + { + "epoch": 0.5641838869997384, + "grad_norm": 3.0827802907928765, + "learning_rate": 9.505422408380531e-06, + "loss": 1.4405, + "step": 17255 + }, + { + "epoch": 0.5643473711744703, + "grad_norm": 3.4123482325212238, + "learning_rate": 9.499722674469386e-06, + "loss": 1.4473, + "step": 17260 + }, + { + "epoch": 0.5645108553492022, + "grad_norm": 3.446650730269, + "learning_rate": 9.494023103486312e-06, + "loss": 1.4021, + "step": 17265 + }, + { + "epoch": 0.5646743395239341, + "grad_norm": 3.211157904581307, + "learning_rate": 9.488323697287515e-06, + "loss": 1.4306, + "step": 17270 + }, + { + "epoch": 0.564837823698666, + "grad_norm": 3.0542051870686806, + "learning_rate": 9.482624457729153e-06, + "loss": 1.388, + "step": 17275 + }, + { + "epoch": 0.5650013078733979, + "grad_norm": 3.188620673615776, + "learning_rate": 9.47692538666733e-06, + "loss": 1.4987, + "step": 17280 + }, + { + "epoch": 0.5651647920481297, + "grad_norm": 3.0300781343130727, + "learning_rate": 9.471226485958089e-06, + "loss": 1.276, + "step": 17285 + }, + { + "epoch": 0.5653282762228616, + "grad_norm": 3.297978994813752, + "learning_rate": 9.465527757457427e-06, + "loss": 1.471, + "step": 17290 + }, + { + "epoch": 0.5654917603975935, + "grad_norm": 3.0956314538901486, + "learning_rate": 9.459829203021281e-06, + "loss": 1.4617, + "step": 17295 + }, + { + "epoch": 0.5656552445723254, + "grad_norm": 3.5917248174991783, + "learning_rate": 9.45413082450553e-06, + "loss": 1.5269, + "step": 17300 + }, + { + "epoch": 0.5658187287470573, + "grad_norm": 3.0761386563894226, + "learning_rate": 9.448432623765993e-06, + "loss": 1.4708, + "step": 17305 + }, + { + "epoch": 0.5659822129217892, + "grad_norm": 3.1678839850383005, + "learning_rate": 9.442734602658434e-06, + "loss": 1.4741, + "step": 17310 + }, + { + "epoch": 0.5661456970965211, + "grad_norm": 3.143989003071978, + "learning_rate": 9.437036763038565e-06, + "loss": 1.3588, + "step": 17315 + }, + { + "epoch": 0.566309181271253, + "grad_norm": 3.2861953774468007, + "learning_rate": 9.431339106762027e-06, + "loss": 1.4993, + "step": 17320 + }, + { + "epoch": 0.5664726654459848, + "grad_norm": 3.4545721690471076, + "learning_rate": 9.42564163568441e-06, + "loss": 1.3925, + "step": 17325 + }, + { + "epoch": 0.5666361496207167, + "grad_norm": 3.066242875090134, + "learning_rate": 9.41994435166124e-06, + "loss": 1.3319, + "step": 17330 + }, + { + "epoch": 0.5667996337954486, + "grad_norm": 3.2929373876086276, + "learning_rate": 9.414247256547983e-06, + "loss": 1.4445, + "step": 17335 + }, + { + "epoch": 0.5669631179701805, + "grad_norm": 3.112908850904098, + "learning_rate": 9.40855035220004e-06, + "loss": 1.4496, + "step": 17340 + }, + { + "epoch": 0.5671266021449124, + "grad_norm": 3.305709410762943, + "learning_rate": 9.40285364047276e-06, + "loss": 1.4787, + "step": 17345 + }, + { + "epoch": 0.5672900863196443, + "grad_norm": 3.1630554826855306, + "learning_rate": 9.397157123221416e-06, + "loss": 1.4201, + "step": 17350 + }, + { + "epoch": 0.5674535704943762, + "grad_norm": 4.832053572491571, + "learning_rate": 9.391460802301227e-06, + "loss": 1.3377, + "step": 17355 + }, + { + "epoch": 0.5676170546691081, + "grad_norm": 3.2933719331222187, + "learning_rate": 9.385764679567345e-06, + "loss": 1.3804, + "step": 17360 + }, + { + "epoch": 0.5677805388438399, + "grad_norm": 3.4779445228329444, + "learning_rate": 9.380068756874856e-06, + "loss": 1.6185, + "step": 17365 + }, + { + "epoch": 0.5679440230185718, + "grad_norm": 3.3961375661609075, + "learning_rate": 9.374373036078785e-06, + "loss": 1.5267, + "step": 17370 + }, + { + "epoch": 0.5681075071933037, + "grad_norm": 2.8504301843834434, + "learning_rate": 9.368677519034088e-06, + "loss": 1.3526, + "step": 17375 + }, + { + "epoch": 0.5682709913680356, + "grad_norm": 3.2784189584723387, + "learning_rate": 9.362982207595655e-06, + "loss": 1.5086, + "step": 17380 + }, + { + "epoch": 0.5684344755427675, + "grad_norm": 3.267132140822307, + "learning_rate": 9.35728710361831e-06, + "loss": 1.5123, + "step": 17385 + }, + { + "epoch": 0.5685979597174994, + "grad_norm": 3.4311145433770207, + "learning_rate": 9.351592208956806e-06, + "loss": 1.4666, + "step": 17390 + }, + { + "epoch": 0.5687614438922313, + "grad_norm": 3.1670584890998557, + "learning_rate": 9.345897525465833e-06, + "loss": 1.3322, + "step": 17395 + }, + { + "epoch": 0.5689249280669632, + "grad_norm": 3.3118963607580807, + "learning_rate": 9.340203055000012e-06, + "loss": 1.4799, + "step": 17400 + }, + { + "epoch": 0.569088412241695, + "grad_norm": 3.3821251210342744, + "learning_rate": 9.334508799413889e-06, + "loss": 1.4747, + "step": 17405 + }, + { + "epoch": 0.5692518964164269, + "grad_norm": 2.982131850053979, + "learning_rate": 9.328814760561943e-06, + "loss": 1.3246, + "step": 17410 + }, + { + "epoch": 0.5694153805911588, + "grad_norm": 3.1666656890313494, + "learning_rate": 9.323120940298588e-06, + "loss": 1.426, + "step": 17415 + }, + { + "epoch": 0.5695788647658907, + "grad_norm": 2.979561382205564, + "learning_rate": 9.317427340478153e-06, + "loss": 1.3527, + "step": 17420 + }, + { + "epoch": 0.5697423489406226, + "grad_norm": 3.0838732928489883, + "learning_rate": 9.311733962954915e-06, + "loss": 1.4989, + "step": 17425 + }, + { + "epoch": 0.5699058331153545, + "grad_norm": 3.237684300901275, + "learning_rate": 9.306040809583059e-06, + "loss": 1.4838, + "step": 17430 + }, + { + "epoch": 0.5700693172900864, + "grad_norm": 5.533725853313939, + "learning_rate": 9.300347882216709e-06, + "loss": 1.3264, + "step": 17435 + }, + { + "epoch": 0.5702328014648183, + "grad_norm": 3.3087716466043493, + "learning_rate": 9.294655182709912e-06, + "loss": 1.5017, + "step": 17440 + }, + { + "epoch": 0.5703962856395501, + "grad_norm": 3.2355788917126453, + "learning_rate": 9.288962712916642e-06, + "loss": 1.5094, + "step": 17445 + }, + { + "epoch": 0.570559769814282, + "grad_norm": 3.075470404596598, + "learning_rate": 9.283270474690793e-06, + "loss": 1.4124, + "step": 17450 + }, + { + "epoch": 0.5707232539890139, + "grad_norm": 3.3134720774483997, + "learning_rate": 9.277578469886192e-06, + "loss": 1.4656, + "step": 17455 + }, + { + "epoch": 0.5708867381637458, + "grad_norm": 3.205867649120836, + "learning_rate": 9.271886700356579e-06, + "loss": 1.3447, + "step": 17460 + }, + { + "epoch": 0.5710502223384777, + "grad_norm": 3.255999918115168, + "learning_rate": 9.26619516795563e-06, + "loss": 1.4103, + "step": 17465 + }, + { + "epoch": 0.5712137065132096, + "grad_norm": 3.2812022736790585, + "learning_rate": 9.260503874536936e-06, + "loss": 1.5011, + "step": 17470 + }, + { + "epoch": 0.5713771906879415, + "grad_norm": 3.2818949659857615, + "learning_rate": 9.25481282195401e-06, + "loss": 1.5041, + "step": 17475 + }, + { + "epoch": 0.5715406748626733, + "grad_norm": 3.1591300265771145, + "learning_rate": 9.249122012060292e-06, + "loss": 1.5244, + "step": 17480 + }, + { + "epoch": 0.5717041590374051, + "grad_norm": 3.406812379572405, + "learning_rate": 9.243431446709137e-06, + "loss": 1.4474, + "step": 17485 + }, + { + "epoch": 0.571867643212137, + "grad_norm": 3.143796434578371, + "learning_rate": 9.23774112775382e-06, + "loss": 1.3604, + "step": 17490 + }, + { + "epoch": 0.5720311273868689, + "grad_norm": 2.9598717849730516, + "learning_rate": 9.232051057047544e-06, + "loss": 1.4248, + "step": 17495 + }, + { + "epoch": 0.5721946115616008, + "grad_norm": 3.354480255166785, + "learning_rate": 9.226361236443423e-06, + "loss": 1.4263, + "step": 17500 + }, + { + "epoch": 0.5723580957363327, + "grad_norm": 3.5434614209741557, + "learning_rate": 9.220671667794493e-06, + "loss": 1.6401, + "step": 17505 + }, + { + "epoch": 0.5725215799110646, + "grad_norm": 3.244148954063027, + "learning_rate": 9.214982352953706e-06, + "loss": 1.3923, + "step": 17510 + }, + { + "epoch": 0.5726850640857964, + "grad_norm": 2.9905047622054535, + "learning_rate": 9.209293293773929e-06, + "loss": 1.5445, + "step": 17515 + }, + { + "epoch": 0.5728485482605283, + "grad_norm": 3.0895688300833237, + "learning_rate": 9.203604492107957e-06, + "loss": 1.3343, + "step": 17520 + }, + { + "epoch": 0.5730120324352602, + "grad_norm": 3.198825941442979, + "learning_rate": 9.19791594980849e-06, + "loss": 1.5339, + "step": 17525 + }, + { + "epoch": 0.5731755166099921, + "grad_norm": 2.9008650426929057, + "learning_rate": 9.192227668728145e-06, + "loss": 1.4043, + "step": 17530 + }, + { + "epoch": 0.573339000784724, + "grad_norm": 2.9992645295618843, + "learning_rate": 9.186539650719454e-06, + "loss": 1.4017, + "step": 17535 + }, + { + "epoch": 0.5735024849594559, + "grad_norm": 3.2950318401389196, + "learning_rate": 9.180851897634873e-06, + "loss": 1.5539, + "step": 17540 + }, + { + "epoch": 0.5736659691341878, + "grad_norm": 3.150565059980104, + "learning_rate": 9.175164411326753e-06, + "loss": 1.426, + "step": 17545 + }, + { + "epoch": 0.5738294533089197, + "grad_norm": 3.2179483348723186, + "learning_rate": 9.169477193647378e-06, + "loss": 1.4447, + "step": 17550 + }, + { + "epoch": 0.5739929374836515, + "grad_norm": 3.0918026855952325, + "learning_rate": 9.163790246448933e-06, + "loss": 1.487, + "step": 17555 + }, + { + "epoch": 0.5741564216583834, + "grad_norm": 3.396152994248185, + "learning_rate": 9.158103571583516e-06, + "loss": 1.2279, + "step": 17560 + }, + { + "epoch": 0.5743199058331153, + "grad_norm": 3.0390880556269386, + "learning_rate": 9.152417170903138e-06, + "loss": 1.4546, + "step": 17565 + }, + { + "epoch": 0.5744833900078472, + "grad_norm": 3.4102233547700016, + "learning_rate": 9.14673104625972e-06, + "loss": 1.5544, + "step": 17570 + }, + { + "epoch": 0.5746468741825791, + "grad_norm": 3.2595404829293777, + "learning_rate": 9.141045199505095e-06, + "loss": 1.5673, + "step": 17575 + }, + { + "epoch": 0.574810358357311, + "grad_norm": 2.914118466661138, + "learning_rate": 9.135359632491003e-06, + "loss": 1.2307, + "step": 17580 + }, + { + "epoch": 0.5749738425320429, + "grad_norm": 3.1402932686740104, + "learning_rate": 9.129674347069094e-06, + "loss": 1.4619, + "step": 17585 + }, + { + "epoch": 0.5751373267067748, + "grad_norm": 3.506645912009643, + "learning_rate": 9.123989345090927e-06, + "loss": 1.5896, + "step": 17590 + }, + { + "epoch": 0.5753008108815066, + "grad_norm": 3.401782095321018, + "learning_rate": 9.118304628407967e-06, + "loss": 1.3879, + "step": 17595 + }, + { + "epoch": 0.5754642950562385, + "grad_norm": 3.0319631625016954, + "learning_rate": 9.112620198871584e-06, + "loss": 1.3678, + "step": 17600 + }, + { + "epoch": 0.5756277792309704, + "grad_norm": 2.9871558652730816, + "learning_rate": 9.106936058333063e-06, + "loss": 1.3767, + "step": 17605 + }, + { + "epoch": 0.5757912634057023, + "grad_norm": 2.945296125651828, + "learning_rate": 9.101252208643586e-06, + "loss": 1.2535, + "step": 17610 + }, + { + "epoch": 0.5759547475804342, + "grad_norm": 3.0475684006580304, + "learning_rate": 9.095568651654245e-06, + "loss": 1.4986, + "step": 17615 + }, + { + "epoch": 0.5761182317551661, + "grad_norm": 3.028761053713784, + "learning_rate": 9.089885389216033e-06, + "loss": 1.3924, + "step": 17620 + }, + { + "epoch": 0.576281715929898, + "grad_norm": 3.297899893585908, + "learning_rate": 9.08420242317985e-06, + "loss": 1.4672, + "step": 17625 + }, + { + "epoch": 0.5764452001046299, + "grad_norm": 3.0867198239098537, + "learning_rate": 9.078519755396501e-06, + "loss": 1.4836, + "step": 17630 + }, + { + "epoch": 0.5766086842793617, + "grad_norm": 3.031925284886494, + "learning_rate": 9.072837387716692e-06, + "loss": 1.4802, + "step": 17635 + }, + { + "epoch": 0.5767721684540936, + "grad_norm": 3.2004216519489512, + "learning_rate": 9.067155321991029e-06, + "loss": 1.4798, + "step": 17640 + }, + { + "epoch": 0.5769356526288255, + "grad_norm": 3.1749396953649316, + "learning_rate": 9.06147356007002e-06, + "loss": 1.3067, + "step": 17645 + }, + { + "epoch": 0.5770991368035574, + "grad_norm": 3.1175060317993015, + "learning_rate": 9.05579210380408e-06, + "loss": 1.4249, + "step": 17650 + }, + { + "epoch": 0.5772626209782893, + "grad_norm": 3.1329528452828157, + "learning_rate": 9.050110955043516e-06, + "loss": 1.4141, + "step": 17655 + }, + { + "epoch": 0.5774261051530212, + "grad_norm": 2.973537491421601, + "learning_rate": 9.04443011563854e-06, + "loss": 1.4664, + "step": 17660 + }, + { + "epoch": 0.5775895893277531, + "grad_norm": 3.156361173191675, + "learning_rate": 9.038749587439261e-06, + "loss": 1.4627, + "step": 17665 + }, + { + "epoch": 0.577753073502485, + "grad_norm": 2.97351179392273, + "learning_rate": 9.033069372295694e-06, + "loss": 1.346, + "step": 17670 + }, + { + "epoch": 0.5779165576772168, + "grad_norm": 3.2205705939303044, + "learning_rate": 9.027389472057739e-06, + "loss": 1.5352, + "step": 17675 + }, + { + "epoch": 0.5780800418519487, + "grad_norm": 3.120875123131597, + "learning_rate": 9.021709888575202e-06, + "loss": 1.4998, + "step": 17680 + }, + { + "epoch": 0.5782435260266806, + "grad_norm": 3.0763436168208504, + "learning_rate": 9.016030623697786e-06, + "loss": 1.4569, + "step": 17685 + }, + { + "epoch": 0.5784070102014125, + "grad_norm": 3.052144284566283, + "learning_rate": 9.010351679275087e-06, + "loss": 1.4924, + "step": 17690 + }, + { + "epoch": 0.5785704943761444, + "grad_norm": 3.2417196068406433, + "learning_rate": 9.004673057156597e-06, + "loss": 1.413, + "step": 17695 + }, + { + "epoch": 0.5787339785508763, + "grad_norm": 3.0999835048218194, + "learning_rate": 8.998994759191709e-06, + "loss": 1.6117, + "step": 17700 + }, + { + "epoch": 0.5788974627256082, + "grad_norm": 3.094973289459636, + "learning_rate": 8.9933167872297e-06, + "loss": 1.3427, + "step": 17705 + }, + { + "epoch": 0.57906094690034, + "grad_norm": 3.0446326459703856, + "learning_rate": 8.987639143119749e-06, + "loss": 1.5517, + "step": 17710 + }, + { + "epoch": 0.5792244310750719, + "grad_norm": 3.2394141737301707, + "learning_rate": 8.981961828710926e-06, + "loss": 1.4978, + "step": 17715 + }, + { + "epoch": 0.5793879152498038, + "grad_norm": 2.947595582442715, + "learning_rate": 8.97628484585219e-06, + "loss": 1.3119, + "step": 17720 + }, + { + "epoch": 0.5795513994245357, + "grad_norm": 3.002313725669461, + "learning_rate": 8.970608196392399e-06, + "loss": 1.4148, + "step": 17725 + }, + { + "epoch": 0.5797148835992676, + "grad_norm": 3.647986361414934, + "learning_rate": 8.964931882180297e-06, + "loss": 1.4954, + "step": 17730 + }, + { + "epoch": 0.5798783677739995, + "grad_norm": 3.2175878590072657, + "learning_rate": 8.95925590506452e-06, + "loss": 1.5079, + "step": 17735 + }, + { + "epoch": 0.5800418519487314, + "grad_norm": 3.1503960350094573, + "learning_rate": 8.953580266893597e-06, + "loss": 1.4086, + "step": 17740 + }, + { + "epoch": 0.5802053361234633, + "grad_norm": 3.3352932816154133, + "learning_rate": 8.947904969515941e-06, + "loss": 1.4182, + "step": 17745 + }, + { + "epoch": 0.5803688202981951, + "grad_norm": 3.040956917028475, + "learning_rate": 8.942230014779857e-06, + "loss": 1.4152, + "step": 17750 + }, + { + "epoch": 0.580532304472927, + "grad_norm": 3.285958207510521, + "learning_rate": 8.936555404533542e-06, + "loss": 1.452, + "step": 17755 + }, + { + "epoch": 0.5806957886476589, + "grad_norm": 3.326471400982065, + "learning_rate": 8.930881140625078e-06, + "loss": 1.5415, + "step": 17760 + }, + { + "epoch": 0.5808592728223908, + "grad_norm": 3.2595229910889434, + "learning_rate": 8.92520722490243e-06, + "loss": 1.4204, + "step": 17765 + }, + { + "epoch": 0.5810227569971227, + "grad_norm": 3.0162213425364124, + "learning_rate": 8.919533659213456e-06, + "loss": 1.352, + "step": 17770 + }, + { + "epoch": 0.5811862411718546, + "grad_norm": 3.119363512412672, + "learning_rate": 8.913860445405896e-06, + "loss": 1.446, + "step": 17775 + }, + { + "epoch": 0.5813497253465865, + "grad_norm": 3.3565924206987336, + "learning_rate": 8.908187585327376e-06, + "loss": 1.4196, + "step": 17780 + }, + { + "epoch": 0.5815132095213184, + "grad_norm": 3.32387920421157, + "learning_rate": 8.902515080825411e-06, + "loss": 1.3513, + "step": 17785 + }, + { + "epoch": 0.5816766936960502, + "grad_norm": 2.647230178714119, + "learning_rate": 8.896842933747394e-06, + "loss": 1.4511, + "step": 17790 + }, + { + "epoch": 0.5818401778707821, + "grad_norm": 2.9876355739792047, + "learning_rate": 8.891171145940605e-06, + "loss": 1.4688, + "step": 17795 + }, + { + "epoch": 0.582003662045514, + "grad_norm": 3.254963409830011, + "learning_rate": 8.885499719252205e-06, + "loss": 1.6536, + "step": 17800 + }, + { + "epoch": 0.5821671462202459, + "grad_norm": 3.026427424297842, + "learning_rate": 8.87982865552924e-06, + "loss": 1.3301, + "step": 17805 + }, + { + "epoch": 0.5823306303949778, + "grad_norm": 3.1303747801288937, + "learning_rate": 8.874157956618636e-06, + "loss": 1.4085, + "step": 17810 + }, + { + "epoch": 0.5824941145697097, + "grad_norm": 2.845889205013552, + "learning_rate": 8.8684876243672e-06, + "loss": 1.3951, + "step": 17815 + }, + { + "epoch": 0.5826575987444416, + "grad_norm": 2.9372696454281315, + "learning_rate": 8.862817660621625e-06, + "loss": 1.352, + "step": 17820 + }, + { + "epoch": 0.5828210829191734, + "grad_norm": 3.042824386804509, + "learning_rate": 8.857148067228473e-06, + "loss": 1.5836, + "step": 17825 + }, + { + "epoch": 0.5829845670939053, + "grad_norm": 3.0347071714285363, + "learning_rate": 8.851478846034193e-06, + "loss": 1.4575, + "step": 17830 + }, + { + "epoch": 0.5831480512686372, + "grad_norm": 3.1551316789179276, + "learning_rate": 8.845809998885117e-06, + "loss": 1.4437, + "step": 17835 + }, + { + "epoch": 0.5833115354433691, + "grad_norm": 3.3608874468599765, + "learning_rate": 8.840141527627442e-06, + "loss": 1.4618, + "step": 17840 + }, + { + "epoch": 0.583475019618101, + "grad_norm": 3.275984460097182, + "learning_rate": 8.834473434107256e-06, + "loss": 1.5217, + "step": 17845 + }, + { + "epoch": 0.5836385037928329, + "grad_norm": 3.157876936221489, + "learning_rate": 8.828805720170515e-06, + "loss": 1.445, + "step": 17850 + }, + { + "epoch": 0.5838019879675648, + "grad_norm": 3.206955578912974, + "learning_rate": 8.823138387663052e-06, + "loss": 1.5303, + "step": 17855 + }, + { + "epoch": 0.5839654721422967, + "grad_norm": 3.1785347749162822, + "learning_rate": 8.81747143843059e-06, + "loss": 1.4276, + "step": 17860 + }, + { + "epoch": 0.5841289563170285, + "grad_norm": 3.2827318838844137, + "learning_rate": 8.811804874318701e-06, + "loss": 1.5106, + "step": 17865 + }, + { + "epoch": 0.5842924404917604, + "grad_norm": 3.2835879486997563, + "learning_rate": 8.806138697172852e-06, + "loss": 1.5017, + "step": 17870 + }, + { + "epoch": 0.5844559246664923, + "grad_norm": 3.281620959492744, + "learning_rate": 8.800472908838378e-06, + "loss": 1.5229, + "step": 17875 + }, + { + "epoch": 0.5846194088412242, + "grad_norm": 3.1054593362969096, + "learning_rate": 8.794807511160487e-06, + "loss": 1.2813, + "step": 17880 + }, + { + "epoch": 0.5847828930159561, + "grad_norm": 3.2178926536417523, + "learning_rate": 8.789142505984264e-06, + "loss": 1.4285, + "step": 17885 + }, + { + "epoch": 0.584946377190688, + "grad_norm": 3.115350182202249, + "learning_rate": 8.783477895154658e-06, + "loss": 1.4981, + "step": 17890 + }, + { + "epoch": 0.5851098613654199, + "grad_norm": 3.654468787081664, + "learning_rate": 8.777813680516497e-06, + "loss": 1.5453, + "step": 17895 + }, + { + "epoch": 0.5852733455401518, + "grad_norm": 3.114665824519335, + "learning_rate": 8.77214986391447e-06, + "loss": 1.3787, + "step": 17900 + }, + { + "epoch": 0.5854368297148836, + "grad_norm": 3.1255490268748614, + "learning_rate": 8.766486447193153e-06, + "loss": 1.4777, + "step": 17905 + }, + { + "epoch": 0.5856003138896155, + "grad_norm": 3.0172055520885097, + "learning_rate": 8.760823432196976e-06, + "loss": 1.3812, + "step": 17910 + }, + { + "epoch": 0.5857637980643474, + "grad_norm": 3.240019948230867, + "learning_rate": 8.755160820770248e-06, + "loss": 1.4058, + "step": 17915 + }, + { + "epoch": 0.5859272822390793, + "grad_norm": 3.284319604716601, + "learning_rate": 8.749498614757139e-06, + "loss": 1.5346, + "step": 17920 + }, + { + "epoch": 0.5860907664138112, + "grad_norm": 3.205855232313159, + "learning_rate": 8.743836816001691e-06, + "loss": 1.4356, + "step": 17925 + }, + { + "epoch": 0.5862542505885431, + "grad_norm": 3.0959943886778283, + "learning_rate": 8.738175426347815e-06, + "loss": 1.4103, + "step": 17930 + }, + { + "epoch": 0.586417734763275, + "grad_norm": 3.1526131128924093, + "learning_rate": 8.732514447639289e-06, + "loss": 1.4139, + "step": 17935 + }, + { + "epoch": 0.5865812189380069, + "grad_norm": 3.194396837132353, + "learning_rate": 8.72685388171975e-06, + "loss": 1.3734, + "step": 17940 + }, + { + "epoch": 0.5867447031127387, + "grad_norm": 3.18622324928962, + "learning_rate": 8.72119373043271e-06, + "loss": 1.4271, + "step": 17945 + }, + { + "epoch": 0.5869081872874705, + "grad_norm": 3.6085693415778484, + "learning_rate": 8.715533995621539e-06, + "loss": 1.4248, + "step": 17950 + }, + { + "epoch": 0.5870716714622024, + "grad_norm": 3.3318345436611674, + "learning_rate": 8.709874679129474e-06, + "loss": 1.5445, + "step": 17955 + }, + { + "epoch": 0.5872351556369343, + "grad_norm": 3.7015493356562708, + "learning_rate": 8.704215782799615e-06, + "loss": 1.5036, + "step": 17960 + }, + { + "epoch": 0.5873986398116662, + "grad_norm": 3.011465749165187, + "learning_rate": 8.698557308474931e-06, + "loss": 1.358, + "step": 17965 + }, + { + "epoch": 0.5875621239863981, + "grad_norm": 3.7096294902684193, + "learning_rate": 8.692899257998241e-06, + "loss": 1.3446, + "step": 17970 + }, + { + "epoch": 0.58772560816113, + "grad_norm": 3.0424840607952235, + "learning_rate": 8.68724163321224e-06, + "loss": 1.4031, + "step": 17975 + }, + { + "epoch": 0.5878890923358618, + "grad_norm": 3.2218270416199113, + "learning_rate": 8.681584435959472e-06, + "loss": 1.4093, + "step": 17980 + }, + { + "epoch": 0.5880525765105937, + "grad_norm": 3.243955380693273, + "learning_rate": 8.675927668082353e-06, + "loss": 1.4222, + "step": 17985 + }, + { + "epoch": 0.5882160606853256, + "grad_norm": 3.01192083910381, + "learning_rate": 8.670271331423152e-06, + "loss": 1.4309, + "step": 17990 + }, + { + "epoch": 0.5883795448600575, + "grad_norm": 2.8893998566842205, + "learning_rate": 8.664615427823996e-06, + "loss": 1.3953, + "step": 17995 + }, + { + "epoch": 0.5885430290347894, + "grad_norm": 3.33588586284624, + "learning_rate": 8.658959959126878e-06, + "loss": 1.3951, + "step": 18000 + }, + { + "epoch": 0.5887065132095213, + "grad_norm": 3.3863871835413364, + "learning_rate": 8.653304927173645e-06, + "loss": 1.3944, + "step": 18005 + }, + { + "epoch": 0.5888699973842532, + "grad_norm": 3.4769754730064153, + "learning_rate": 8.647650333805998e-06, + "loss": 1.354, + "step": 18010 + }, + { + "epoch": 0.589033481558985, + "grad_norm": 3.1168905626132624, + "learning_rate": 8.641996180865506e-06, + "loss": 1.3852, + "step": 18015 + }, + { + "epoch": 0.5891969657337169, + "grad_norm": 3.0980025483738753, + "learning_rate": 8.636342470193585e-06, + "loss": 1.5824, + "step": 18020 + }, + { + "epoch": 0.5893604499084488, + "grad_norm": 3.1529085415081837, + "learning_rate": 8.63068920363151e-06, + "loss": 1.417, + "step": 18025 + }, + { + "epoch": 0.5895239340831807, + "grad_norm": 3.1242765419082796, + "learning_rate": 8.625036383020413e-06, + "loss": 1.3791, + "step": 18030 + }, + { + "epoch": 0.5896874182579126, + "grad_norm": 3.2047464751114227, + "learning_rate": 8.619384010201272e-06, + "loss": 1.3903, + "step": 18035 + }, + { + "epoch": 0.5898509024326445, + "grad_norm": 3.6169643896826855, + "learning_rate": 8.613732087014939e-06, + "loss": 1.4844, + "step": 18040 + }, + { + "epoch": 0.5900143866073764, + "grad_norm": 3.178816671631025, + "learning_rate": 8.608080615302097e-06, + "loss": 1.4661, + "step": 18045 + }, + { + "epoch": 0.5901778707821083, + "grad_norm": 3.232488387539473, + "learning_rate": 8.602429596903295e-06, + "loss": 1.3805, + "step": 18050 + }, + { + "epoch": 0.5903413549568401, + "grad_norm": 3.0594688555497664, + "learning_rate": 8.596779033658932e-06, + "loss": 1.3082, + "step": 18055 + }, + { + "epoch": 0.590504839131572, + "grad_norm": 3.208411376663378, + "learning_rate": 8.591128927409257e-06, + "loss": 1.2723, + "step": 18060 + }, + { + "epoch": 0.5906683233063039, + "grad_norm": 3.1519104286193134, + "learning_rate": 8.585479279994373e-06, + "loss": 1.4022, + "step": 18065 + }, + { + "epoch": 0.5908318074810358, + "grad_norm": 3.1358730555236374, + "learning_rate": 8.579830093254229e-06, + "loss": 1.3385, + "step": 18070 + }, + { + "epoch": 0.5909952916557677, + "grad_norm": 3.0994789140731096, + "learning_rate": 8.574181369028628e-06, + "loss": 1.4585, + "step": 18075 + }, + { + "epoch": 0.5911587758304996, + "grad_norm": 3.13055006713946, + "learning_rate": 8.568533109157217e-06, + "loss": 1.4242, + "step": 18080 + }, + { + "epoch": 0.5913222600052315, + "grad_norm": 3.146725856556345, + "learning_rate": 8.562885315479503e-06, + "loss": 1.4738, + "step": 18085 + }, + { + "epoch": 0.5914857441799634, + "grad_norm": 2.9751047095241905, + "learning_rate": 8.55723798983483e-06, + "loss": 1.3286, + "step": 18090 + }, + { + "epoch": 0.5916492283546952, + "grad_norm": 3.306146859928608, + "learning_rate": 8.551591134062395e-06, + "loss": 1.4914, + "step": 18095 + }, + { + "epoch": 0.5918127125294271, + "grad_norm": 3.2312516459545275, + "learning_rate": 8.54594475000124e-06, + "loss": 1.3773, + "step": 18100 + }, + { + "epoch": 0.591976196704159, + "grad_norm": 3.280254436574427, + "learning_rate": 8.54029883949025e-06, + "loss": 1.4084, + "step": 18105 + }, + { + "epoch": 0.5921396808788909, + "grad_norm": 3.1620348661538182, + "learning_rate": 8.534653404368169e-06, + "loss": 1.5177, + "step": 18110 + }, + { + "epoch": 0.5923031650536228, + "grad_norm": 3.106055089633426, + "learning_rate": 8.52900844647357e-06, + "loss": 1.5554, + "step": 18115 + }, + { + "epoch": 0.5924666492283547, + "grad_norm": 3.0193192442350774, + "learning_rate": 8.523363967644878e-06, + "loss": 1.4335, + "step": 18120 + }, + { + "epoch": 0.5926301334030866, + "grad_norm": 3.04059283888283, + "learning_rate": 8.517719969720362e-06, + "loss": 1.4743, + "step": 18125 + }, + { + "epoch": 0.5927936175778185, + "grad_norm": 3.3135174320929877, + "learning_rate": 8.512076454538136e-06, + "loss": 1.4561, + "step": 18130 + }, + { + "epoch": 0.5929571017525503, + "grad_norm": 3.2906574214603483, + "learning_rate": 8.506433423936149e-06, + "loss": 1.4675, + "step": 18135 + }, + { + "epoch": 0.5931205859272822, + "grad_norm": 3.1524867844857387, + "learning_rate": 8.500790879752205e-06, + "loss": 1.4868, + "step": 18140 + }, + { + "epoch": 0.5932840701020141, + "grad_norm": 3.0676171087810813, + "learning_rate": 8.495148823823937e-06, + "loss": 1.327, + "step": 18145 + }, + { + "epoch": 0.593447554276746, + "grad_norm": 3.0336710231670017, + "learning_rate": 8.489507257988829e-06, + "loss": 1.5044, + "step": 18150 + }, + { + "epoch": 0.5936110384514779, + "grad_norm": 3.2034478857034325, + "learning_rate": 8.483866184084197e-06, + "loss": 1.5793, + "step": 18155 + }, + { + "epoch": 0.5937745226262098, + "grad_norm": 3.2203741419390184, + "learning_rate": 8.4782256039472e-06, + "loss": 1.3973, + "step": 18160 + }, + { + "epoch": 0.5939380068009417, + "grad_norm": 3.3133004938752553, + "learning_rate": 8.47258551941484e-06, + "loss": 1.3804, + "step": 18165 + }, + { + "epoch": 0.5941014909756736, + "grad_norm": 3.2429110608956915, + "learning_rate": 8.466945932323954e-06, + "loss": 1.3614, + "step": 18170 + }, + { + "epoch": 0.5942649751504054, + "grad_norm": 3.3897088939259987, + "learning_rate": 8.461306844511216e-06, + "loss": 1.5078, + "step": 18175 + }, + { + "epoch": 0.5944284593251373, + "grad_norm": 3.3177966656878746, + "learning_rate": 8.455668257813138e-06, + "loss": 1.5435, + "step": 18180 + }, + { + "epoch": 0.5945919434998692, + "grad_norm": 3.422963886617594, + "learning_rate": 8.450030174066068e-06, + "loss": 1.4442, + "step": 18185 + }, + { + "epoch": 0.5947554276746011, + "grad_norm": 3.564797329596063, + "learning_rate": 8.444392595106197e-06, + "loss": 1.5327, + "step": 18190 + }, + { + "epoch": 0.594918911849333, + "grad_norm": 3.3398159757531416, + "learning_rate": 8.438755522769544e-06, + "loss": 1.5091, + "step": 18195 + }, + { + "epoch": 0.5950823960240649, + "grad_norm": 3.195060823224897, + "learning_rate": 8.433118958891966e-06, + "loss": 1.3136, + "step": 18200 + }, + { + "epoch": 0.5952458801987968, + "grad_norm": 3.102347258079385, + "learning_rate": 8.42748290530915e-06, + "loss": 1.6065, + "step": 18205 + }, + { + "epoch": 0.5954093643735286, + "grad_norm": 3.3251050504603494, + "learning_rate": 8.421847363856624e-06, + "loss": 1.4677, + "step": 18210 + }, + { + "epoch": 0.5955728485482605, + "grad_norm": 2.937376859114308, + "learning_rate": 8.416212336369743e-06, + "loss": 1.3233, + "step": 18215 + }, + { + "epoch": 0.5957363327229924, + "grad_norm": 3.6706700024125203, + "learning_rate": 8.410577824683703e-06, + "loss": 1.4256, + "step": 18220 + }, + { + "epoch": 0.5958998168977243, + "grad_norm": 3.0468147755338304, + "learning_rate": 8.404943830633521e-06, + "loss": 1.505, + "step": 18225 + }, + { + "epoch": 0.5960633010724562, + "grad_norm": 2.9626566338157048, + "learning_rate": 8.399310356054053e-06, + "loss": 1.2698, + "step": 18230 + }, + { + "epoch": 0.5962267852471881, + "grad_norm": 3.066587659959109, + "learning_rate": 8.393677402779983e-06, + "loss": 1.4194, + "step": 18235 + }, + { + "epoch": 0.59639026942192, + "grad_norm": 3.089061775886294, + "learning_rate": 8.388044972645825e-06, + "loss": 1.3832, + "step": 18240 + }, + { + "epoch": 0.5965537535966519, + "grad_norm": 3.053622763441495, + "learning_rate": 8.382413067485926e-06, + "loss": 1.3712, + "step": 18245 + }, + { + "epoch": 0.5967172377713837, + "grad_norm": 3.248229777581473, + "learning_rate": 8.376781689134458e-06, + "loss": 1.309, + "step": 18250 + }, + { + "epoch": 0.5968807219461156, + "grad_norm": 2.870522777622665, + "learning_rate": 8.371150839425423e-06, + "loss": 1.4248, + "step": 18255 + }, + { + "epoch": 0.5970442061208475, + "grad_norm": 3.5929995390539466, + "learning_rate": 8.365520520192651e-06, + "loss": 1.6532, + "step": 18260 + }, + { + "epoch": 0.5972076902955794, + "grad_norm": 3.381764172507215, + "learning_rate": 8.359890733269799e-06, + "loss": 1.5865, + "step": 18265 + }, + { + "epoch": 0.5973711744703113, + "grad_norm": 3.1199346988986765, + "learning_rate": 8.354261480490348e-06, + "loss": 1.5148, + "step": 18270 + }, + { + "epoch": 0.5975346586450432, + "grad_norm": 3.031839426100703, + "learning_rate": 8.348632763687617e-06, + "loss": 1.4533, + "step": 18275 + }, + { + "epoch": 0.5976981428197751, + "grad_norm": 3.012809746290847, + "learning_rate": 8.34300458469473e-06, + "loss": 1.4438, + "step": 18280 + }, + { + "epoch": 0.597861626994507, + "grad_norm": 3.1711506153648807, + "learning_rate": 8.33737694534465e-06, + "loss": 1.4845, + "step": 18285 + }, + { + "epoch": 0.5980251111692388, + "grad_norm": 3.219526240810741, + "learning_rate": 8.331749847470163e-06, + "loss": 1.4496, + "step": 18290 + }, + { + "epoch": 0.5981885953439707, + "grad_norm": 3.2289428411523975, + "learning_rate": 8.326123292903879e-06, + "loss": 1.4777, + "step": 18295 + }, + { + "epoch": 0.5983520795187026, + "grad_norm": 3.422355994374377, + "learning_rate": 8.320497283478224e-06, + "loss": 1.4462, + "step": 18300 + }, + { + "epoch": 0.5985155636934345, + "grad_norm": 3.022735814958033, + "learning_rate": 8.314871821025456e-06, + "loss": 1.3456, + "step": 18305 + }, + { + "epoch": 0.5986790478681664, + "grad_norm": 2.854768426026407, + "learning_rate": 8.309246907377645e-06, + "loss": 1.4151, + "step": 18310 + }, + { + "epoch": 0.5988425320428983, + "grad_norm": 2.9198051768938194, + "learning_rate": 8.303622544366692e-06, + "loss": 1.5155, + "step": 18315 + }, + { + "epoch": 0.5990060162176302, + "grad_norm": 3.13977656269545, + "learning_rate": 8.297998733824316e-06, + "loss": 1.4454, + "step": 18320 + }, + { + "epoch": 0.599169500392362, + "grad_norm": 3.0676187590483344, + "learning_rate": 8.292375477582048e-06, + "loss": 1.374, + "step": 18325 + }, + { + "epoch": 0.5993329845670939, + "grad_norm": 3.1192311750606754, + "learning_rate": 8.28675277747125e-06, + "loss": 1.4687, + "step": 18330 + }, + { + "epoch": 0.5994964687418258, + "grad_norm": 3.239425733082213, + "learning_rate": 8.281130635323096e-06, + "loss": 1.4401, + "step": 18335 + }, + { + "epoch": 0.5996599529165577, + "grad_norm": 3.218353961281337, + "learning_rate": 8.275509052968577e-06, + "loss": 1.4461, + "step": 18340 + }, + { + "epoch": 0.5998234370912896, + "grad_norm": 3.596804810715867, + "learning_rate": 8.269888032238508e-06, + "loss": 1.5311, + "step": 18345 + }, + { + "epoch": 0.5999869212660215, + "grad_norm": 3.3433173094289113, + "learning_rate": 8.26426757496352e-06, + "loss": 1.4661, + "step": 18350 + }, + { + "epoch": 0.6001504054407534, + "grad_norm": 3.530880145988537, + "learning_rate": 8.258647682974054e-06, + "loss": 1.4813, + "step": 18355 + }, + { + "epoch": 0.6003138896154853, + "grad_norm": 3.1226185198642455, + "learning_rate": 8.253028358100372e-06, + "loss": 1.3681, + "step": 18360 + }, + { + "epoch": 0.6004773737902172, + "grad_norm": 3.4074311751772246, + "learning_rate": 8.247409602172549e-06, + "loss": 1.4007, + "step": 18365 + }, + { + "epoch": 0.600640857964949, + "grad_norm": 3.189831413347176, + "learning_rate": 8.24179141702048e-06, + "loss": 1.4809, + "step": 18370 + }, + { + "epoch": 0.6008043421396809, + "grad_norm": 3.088816713068773, + "learning_rate": 8.236173804473869e-06, + "loss": 1.4124, + "step": 18375 + }, + { + "epoch": 0.6009678263144128, + "grad_norm": 3.1383489637139395, + "learning_rate": 8.230556766362232e-06, + "loss": 1.3308, + "step": 18380 + }, + { + "epoch": 0.6011313104891447, + "grad_norm": 3.290913992312521, + "learning_rate": 8.224940304514905e-06, + "loss": 1.4775, + "step": 18385 + }, + { + "epoch": 0.6012947946638766, + "grad_norm": 3.178460734327845, + "learning_rate": 8.21932442076103e-06, + "loss": 1.419, + "step": 18390 + }, + { + "epoch": 0.6014582788386085, + "grad_norm": 3.2547635873531955, + "learning_rate": 8.21370911692956e-06, + "loss": 1.4891, + "step": 18395 + }, + { + "epoch": 0.6016217630133404, + "grad_norm": 3.115890339231043, + "learning_rate": 8.208094394849266e-06, + "loss": 1.4102, + "step": 18400 + }, + { + "epoch": 0.6017852471880722, + "grad_norm": 3.0746071350507926, + "learning_rate": 8.202480256348723e-06, + "loss": 1.4474, + "step": 18405 + }, + { + "epoch": 0.6019487313628041, + "grad_norm": 2.9172339354126815, + "learning_rate": 8.19686670325632e-06, + "loss": 1.3916, + "step": 18410 + }, + { + "epoch": 0.602112215537536, + "grad_norm": 3.150465009801813, + "learning_rate": 8.191253737400252e-06, + "loss": 1.4718, + "step": 18415 + }, + { + "epoch": 0.6022756997122678, + "grad_norm": 3.2999091764154316, + "learning_rate": 8.185641360608525e-06, + "loss": 1.4271, + "step": 18420 + }, + { + "epoch": 0.6024391838869997, + "grad_norm": 3.276865685270224, + "learning_rate": 8.180029574708953e-06, + "loss": 1.4055, + "step": 18425 + }, + { + "epoch": 0.6026026680617316, + "grad_norm": 3.4046343449948835, + "learning_rate": 8.174418381529157e-06, + "loss": 1.4729, + "step": 18430 + }, + { + "epoch": 0.6027661522364635, + "grad_norm": 3.437312708224811, + "learning_rate": 8.168807782896566e-06, + "loss": 1.3952, + "step": 18435 + }, + { + "epoch": 0.6029296364111953, + "grad_norm": 2.989010981092972, + "learning_rate": 8.163197780638414e-06, + "loss": 1.4706, + "step": 18440 + }, + { + "epoch": 0.6030931205859272, + "grad_norm": 3.186361596717962, + "learning_rate": 8.15758837658174e-06, + "loss": 1.4324, + "step": 18445 + }, + { + "epoch": 0.6032566047606591, + "grad_norm": 3.3257040438665677, + "learning_rate": 8.15197957255339e-06, + "loss": 1.4407, + "step": 18450 + }, + { + "epoch": 0.603420088935391, + "grad_norm": 3.0792883048708126, + "learning_rate": 8.146371370380016e-06, + "loss": 1.4115, + "step": 18455 + }, + { + "epoch": 0.6035835731101229, + "grad_norm": 3.3118473478908195, + "learning_rate": 8.140763771888071e-06, + "loss": 1.5057, + "step": 18460 + }, + { + "epoch": 0.6037470572848548, + "grad_norm": 3.135387780512475, + "learning_rate": 8.135156778903811e-06, + "loss": 1.3971, + "step": 18465 + }, + { + "epoch": 0.6039105414595867, + "grad_norm": 3.1602201334502595, + "learning_rate": 8.129550393253297e-06, + "loss": 1.3908, + "step": 18470 + }, + { + "epoch": 0.6040740256343186, + "grad_norm": 3.034127136738024, + "learning_rate": 8.123944616762391e-06, + "loss": 1.3094, + "step": 18475 + }, + { + "epoch": 0.6042375098090504, + "grad_norm": 3.2742198127238438, + "learning_rate": 8.118339451256762e-06, + "loss": 1.4644, + "step": 18480 + }, + { + "epoch": 0.6044009939837823, + "grad_norm": 2.983109735649239, + "learning_rate": 8.112734898561869e-06, + "loss": 1.3966, + "step": 18485 + }, + { + "epoch": 0.6045644781585142, + "grad_norm": 3.186601130302736, + "learning_rate": 8.107130960502976e-06, + "loss": 1.3933, + "step": 18490 + }, + { + "epoch": 0.6047279623332461, + "grad_norm": 3.239832233904225, + "learning_rate": 8.101527638905154e-06, + "loss": 1.4109, + "step": 18495 + }, + { + "epoch": 0.604891446507978, + "grad_norm": 3.30704517281397, + "learning_rate": 8.095924935593265e-06, + "loss": 1.5135, + "step": 18500 + }, + { + "epoch": 0.6050549306827099, + "grad_norm": 3.1684224947355566, + "learning_rate": 8.09032285239197e-06, + "loss": 1.3618, + "step": 18505 + }, + { + "epoch": 0.6052184148574418, + "grad_norm": 3.0953132062371167, + "learning_rate": 8.084721391125735e-06, + "loss": 1.3724, + "step": 18510 + }, + { + "epoch": 0.6053818990321737, + "grad_norm": 3.130638705291305, + "learning_rate": 8.079120553618815e-06, + "loss": 1.3305, + "step": 18515 + }, + { + "epoch": 0.6055453832069055, + "grad_norm": 3.2097358700212384, + "learning_rate": 8.073520341695267e-06, + "loss": 1.3535, + "step": 18520 + }, + { + "epoch": 0.6057088673816374, + "grad_norm": 3.0794959566018623, + "learning_rate": 8.067920757178944e-06, + "loss": 1.3938, + "step": 18525 + }, + { + "epoch": 0.6058723515563693, + "grad_norm": 3.052409090800342, + "learning_rate": 8.062321801893492e-06, + "loss": 1.482, + "step": 18530 + }, + { + "epoch": 0.6060358357311012, + "grad_norm": 3.1629854632793992, + "learning_rate": 8.056723477662353e-06, + "loss": 1.4687, + "step": 18535 + }, + { + "epoch": 0.6061993199058331, + "grad_norm": 3.161955698957366, + "learning_rate": 8.051125786308766e-06, + "loss": 1.4743, + "step": 18540 + }, + { + "epoch": 0.606362804080565, + "grad_norm": 3.3116888379951654, + "learning_rate": 8.045528729655757e-06, + "loss": 1.4356, + "step": 18545 + }, + { + "epoch": 0.6065262882552969, + "grad_norm": 3.1347509217017566, + "learning_rate": 8.039932309526157e-06, + "loss": 1.4975, + "step": 18550 + }, + { + "epoch": 0.6066897724300288, + "grad_norm": 3.317447479142333, + "learning_rate": 8.034336527742579e-06, + "loss": 1.5462, + "step": 18555 + }, + { + "epoch": 0.6068532566047606, + "grad_norm": 3.498861320543137, + "learning_rate": 8.028741386127435e-06, + "loss": 1.4793, + "step": 18560 + }, + { + "epoch": 0.6070167407794925, + "grad_norm": 3.2498646751637104, + "learning_rate": 8.023146886502921e-06, + "loss": 1.3352, + "step": 18565 + }, + { + "epoch": 0.6071802249542244, + "grad_norm": 3.4212342928482182, + "learning_rate": 8.017553030691028e-06, + "loss": 1.4794, + "step": 18570 + }, + { + "epoch": 0.6073437091289563, + "grad_norm": 2.9875100017799667, + "learning_rate": 8.011959820513545e-06, + "loss": 1.5723, + "step": 18575 + }, + { + "epoch": 0.6075071933036882, + "grad_norm": 3.0401943075270865, + "learning_rate": 8.006367257792038e-06, + "loss": 1.4072, + "step": 18580 + }, + { + "epoch": 0.6076706774784201, + "grad_norm": 3.238144558522493, + "learning_rate": 8.000775344347868e-06, + "loss": 1.4886, + "step": 18585 + }, + { + "epoch": 0.607834161653152, + "grad_norm": 2.962183083147765, + "learning_rate": 7.995184082002187e-06, + "loss": 1.3795, + "step": 18590 + }, + { + "epoch": 0.6079976458278838, + "grad_norm": 3.251924171508196, + "learning_rate": 7.989593472575929e-06, + "loss": 1.4682, + "step": 18595 + }, + { + "epoch": 0.6081611300026157, + "grad_norm": 3.0040843390769867, + "learning_rate": 7.984003517889818e-06, + "loss": 1.4583, + "step": 18600 + }, + { + "epoch": 0.6083246141773476, + "grad_norm": 3.3934778494657247, + "learning_rate": 7.978414219764368e-06, + "loss": 1.4616, + "step": 18605 + }, + { + "epoch": 0.6084880983520795, + "grad_norm": 2.9770830505141737, + "learning_rate": 7.972825580019876e-06, + "loss": 1.3865, + "step": 18610 + }, + { + "epoch": 0.6086515825268114, + "grad_norm": 3.3904974037210454, + "learning_rate": 7.967237600476424e-06, + "loss": 1.3938, + "step": 18615 + }, + { + "epoch": 0.6088150667015433, + "grad_norm": 3.2687276762408555, + "learning_rate": 7.96165028295388e-06, + "loss": 1.5324, + "step": 18620 + }, + { + "epoch": 0.6089785508762752, + "grad_norm": 2.987961283941434, + "learning_rate": 7.956063629271897e-06, + "loss": 1.3937, + "step": 18625 + }, + { + "epoch": 0.6091420350510071, + "grad_norm": 3.1165058324186865, + "learning_rate": 7.950477641249911e-06, + "loss": 1.4616, + "step": 18630 + }, + { + "epoch": 0.609305519225739, + "grad_norm": 3.2341444566543833, + "learning_rate": 7.944892320707142e-06, + "loss": 1.4845, + "step": 18635 + }, + { + "epoch": 0.6094690034004708, + "grad_norm": 3.160734671567609, + "learning_rate": 7.939307669462591e-06, + "loss": 1.4495, + "step": 18640 + }, + { + "epoch": 0.6096324875752027, + "grad_norm": 3.5771533638543582, + "learning_rate": 7.933723689335043e-06, + "loss": 1.4885, + "step": 18645 + }, + { + "epoch": 0.6097959717499346, + "grad_norm": 2.9504250605279894, + "learning_rate": 7.928140382143062e-06, + "loss": 1.3287, + "step": 18650 + }, + { + "epoch": 0.6099594559246665, + "grad_norm": 3.2056871819169834, + "learning_rate": 7.922557749704996e-06, + "loss": 1.3543, + "step": 18655 + }, + { + "epoch": 0.6101229400993984, + "grad_norm": 3.1956748832491946, + "learning_rate": 7.916975793838972e-06, + "loss": 1.4974, + "step": 18660 + }, + { + "epoch": 0.6102864242741303, + "grad_norm": 3.1240316061690905, + "learning_rate": 7.911394516362896e-06, + "loss": 1.5218, + "step": 18665 + }, + { + "epoch": 0.6104499084488622, + "grad_norm": 2.9489970610004916, + "learning_rate": 7.905813919094452e-06, + "loss": 1.3892, + "step": 18670 + }, + { + "epoch": 0.610613392623594, + "grad_norm": 3.184562165626325, + "learning_rate": 7.900234003851105e-06, + "loss": 1.5226, + "step": 18675 + }, + { + "epoch": 0.6107768767983259, + "grad_norm": 3.22397428953161, + "learning_rate": 7.894654772450094e-06, + "loss": 1.4959, + "step": 18680 + }, + { + "epoch": 0.6109403609730578, + "grad_norm": 3.198283459738501, + "learning_rate": 7.889076226708446e-06, + "loss": 1.3873, + "step": 18685 + }, + { + "epoch": 0.6111038451477897, + "grad_norm": 3.0801153261221432, + "learning_rate": 7.883498368442947e-06, + "loss": 1.4187, + "step": 18690 + }, + { + "epoch": 0.6112673293225216, + "grad_norm": 3.0795350772339254, + "learning_rate": 7.87792119947017e-06, + "loss": 1.346, + "step": 18695 + }, + { + "epoch": 0.6114308134972535, + "grad_norm": 3.1989001480735846, + "learning_rate": 7.872344721606466e-06, + "loss": 1.35, + "step": 18700 + }, + { + "epoch": 0.6115942976719854, + "grad_norm": 2.902447292902399, + "learning_rate": 7.866768936667957e-06, + "loss": 1.3144, + "step": 18705 + }, + { + "epoch": 0.6117577818467173, + "grad_norm": 3.293736238690797, + "learning_rate": 7.861193846470539e-06, + "loss": 1.4322, + "step": 18710 + }, + { + "epoch": 0.6119212660214491, + "grad_norm": 3.1118839446986164, + "learning_rate": 7.855619452829882e-06, + "loss": 1.3852, + "step": 18715 + }, + { + "epoch": 0.612084750196181, + "grad_norm": 3.320857575022587, + "learning_rate": 7.850045757561427e-06, + "loss": 1.4375, + "step": 18720 + }, + { + "epoch": 0.6122482343709129, + "grad_norm": 3.3861834080707545, + "learning_rate": 7.844472762480395e-06, + "loss": 1.5203, + "step": 18725 + }, + { + "epoch": 0.6124117185456448, + "grad_norm": 3.3843184365187495, + "learning_rate": 7.838900469401772e-06, + "loss": 1.4007, + "step": 18730 + }, + { + "epoch": 0.6125752027203767, + "grad_norm": 3.0476704607698815, + "learning_rate": 7.833328880140314e-06, + "loss": 1.3609, + "step": 18735 + }, + { + "epoch": 0.6127386868951086, + "grad_norm": 3.1945436775392, + "learning_rate": 7.827757996510555e-06, + "loss": 1.5191, + "step": 18740 + }, + { + "epoch": 0.6129021710698405, + "grad_norm": 3.137228397976648, + "learning_rate": 7.822187820326793e-06, + "loss": 1.3487, + "step": 18745 + }, + { + "epoch": 0.6130656552445723, + "grad_norm": 3.0802111261824763, + "learning_rate": 7.816618353403098e-06, + "loss": 1.5031, + "step": 18750 + }, + { + "epoch": 0.6132291394193042, + "grad_norm": 3.108968400597087, + "learning_rate": 7.811049597553314e-06, + "loss": 1.4499, + "step": 18755 + }, + { + "epoch": 0.6133926235940361, + "grad_norm": 3.060168292098564, + "learning_rate": 7.80548155459104e-06, + "loss": 1.3511, + "step": 18760 + }, + { + "epoch": 0.613556107768768, + "grad_norm": 3.0395770107495004, + "learning_rate": 7.799914226329658e-06, + "loss": 1.451, + "step": 18765 + }, + { + "epoch": 0.6137195919434999, + "grad_norm": 3.471390090534357, + "learning_rate": 7.794347614582307e-06, + "loss": 1.4917, + "step": 18770 + }, + { + "epoch": 0.6138830761182318, + "grad_norm": 3.269085879650525, + "learning_rate": 7.788781721161895e-06, + "loss": 1.5737, + "step": 18775 + }, + { + "epoch": 0.6140465602929637, + "grad_norm": 3.1489370551328677, + "learning_rate": 7.783216547881101e-06, + "loss": 1.3901, + "step": 18780 + }, + { + "epoch": 0.6142100444676956, + "grad_norm": 3.3715385505955586, + "learning_rate": 7.777652096552363e-06, + "loss": 1.5576, + "step": 18785 + }, + { + "epoch": 0.6143735286424274, + "grad_norm": 3.264259107560218, + "learning_rate": 7.772088368987888e-06, + "loss": 1.4231, + "step": 18790 + }, + { + "epoch": 0.6145370128171593, + "grad_norm": 3.2284596345955054, + "learning_rate": 7.766525366999643e-06, + "loss": 1.3561, + "step": 18795 + }, + { + "epoch": 0.6147004969918912, + "grad_norm": 3.0895541149570396, + "learning_rate": 7.760963092399364e-06, + "loss": 1.3975, + "step": 18800 + }, + { + "epoch": 0.6148639811666231, + "grad_norm": 3.298613083890218, + "learning_rate": 7.755401546998546e-06, + "loss": 1.3785, + "step": 18805 + }, + { + "epoch": 0.615027465341355, + "grad_norm": 3.3006906475122197, + "learning_rate": 7.74984073260845e-06, + "loss": 1.5129, + "step": 18810 + }, + { + "epoch": 0.6151909495160869, + "grad_norm": 3.2167105971650884, + "learning_rate": 7.744280651040094e-06, + "loss": 1.3766, + "step": 18815 + }, + { + "epoch": 0.6153544336908188, + "grad_norm": 3.1657886901103782, + "learning_rate": 7.738721304104264e-06, + "loss": 1.4205, + "step": 18820 + }, + { + "epoch": 0.6155179178655507, + "grad_norm": 3.1298143074268068, + "learning_rate": 7.733162693611501e-06, + "loss": 1.3481, + "step": 18825 + }, + { + "epoch": 0.6156814020402825, + "grad_norm": 3.037915086121402, + "learning_rate": 7.727604821372107e-06, + "loss": 1.3952, + "step": 18830 + }, + { + "epoch": 0.6158448862150144, + "grad_norm": 3.2556314699900053, + "learning_rate": 7.722047689196147e-06, + "loss": 1.4402, + "step": 18835 + }, + { + "epoch": 0.6160083703897463, + "grad_norm": 2.901509845604248, + "learning_rate": 7.716491298893443e-06, + "loss": 1.3243, + "step": 18840 + }, + { + "epoch": 0.6161718545644782, + "grad_norm": 3.1398453113629126, + "learning_rate": 7.710935652273574e-06, + "loss": 1.3799, + "step": 18845 + }, + { + "epoch": 0.6163353387392101, + "grad_norm": 3.079216535818114, + "learning_rate": 7.705380751145878e-06, + "loss": 1.412, + "step": 18850 + }, + { + "epoch": 0.616498822913942, + "grad_norm": 3.384021827743491, + "learning_rate": 7.69982659731945e-06, + "loss": 1.3635, + "step": 18855 + }, + { + "epoch": 0.6166623070886739, + "grad_norm": 3.3656512053546717, + "learning_rate": 7.69427319260314e-06, + "loss": 1.2623, + "step": 18860 + }, + { + "epoch": 0.6168257912634058, + "grad_norm": 3.1704384197462407, + "learning_rate": 7.688720538805563e-06, + "loss": 1.5399, + "step": 18865 + }, + { + "epoch": 0.6169892754381376, + "grad_norm": 3.145006001917919, + "learning_rate": 7.683168637735076e-06, + "loss": 1.4657, + "step": 18870 + }, + { + "epoch": 0.6171527596128695, + "grad_norm": 3.3567896786182687, + "learning_rate": 7.677617491199797e-06, + "loss": 1.5324, + "step": 18875 + }, + { + "epoch": 0.6173162437876014, + "grad_norm": 3.3094406070162004, + "learning_rate": 7.6720671010076e-06, + "loss": 1.6457, + "step": 18880 + }, + { + "epoch": 0.6174797279623332, + "grad_norm": 3.312789881097473, + "learning_rate": 7.666517468966112e-06, + "loss": 1.4258, + "step": 18885 + }, + { + "epoch": 0.6176432121370651, + "grad_norm": 3.1853094375994893, + "learning_rate": 7.66096859688271e-06, + "loss": 1.5207, + "step": 18890 + }, + { + "epoch": 0.617806696311797, + "grad_norm": 3.328860923833532, + "learning_rate": 7.655420486564533e-06, + "loss": 1.3961, + "step": 18895 + }, + { + "epoch": 0.6179701804865289, + "grad_norm": 3.1122915875561317, + "learning_rate": 7.649873139818452e-06, + "loss": 1.4262, + "step": 18900 + }, + { + "epoch": 0.6181336646612607, + "grad_norm": 3.3354380404680994, + "learning_rate": 7.64432655845111e-06, + "loss": 1.5924, + "step": 18905 + }, + { + "epoch": 0.6182971488359926, + "grad_norm": 3.3631087577217036, + "learning_rate": 7.638780744268892e-06, + "loss": 1.509, + "step": 18910 + }, + { + "epoch": 0.6184606330107245, + "grad_norm": 3.3627630163778814, + "learning_rate": 7.633235699077932e-06, + "loss": 1.4951, + "step": 18915 + }, + { + "epoch": 0.6186241171854564, + "grad_norm": 3.2353927154357858, + "learning_rate": 7.627691424684116e-06, + "loss": 1.4129, + "step": 18920 + }, + { + "epoch": 0.6187876013601883, + "grad_norm": 3.03346063350697, + "learning_rate": 7.62214792289308e-06, + "loss": 1.5381, + "step": 18925 + }, + { + "epoch": 0.6189510855349202, + "grad_norm": 2.976698709752373, + "learning_rate": 7.616605195510201e-06, + "loss": 1.406, + "step": 18930 + }, + { + "epoch": 0.6191145697096521, + "grad_norm": 3.078460188856313, + "learning_rate": 7.611063244340617e-06, + "loss": 1.2919, + "step": 18935 + }, + { + "epoch": 0.619278053884384, + "grad_norm": 3.047334134770311, + "learning_rate": 7.605522071189204e-06, + "loss": 1.3986, + "step": 18940 + }, + { + "epoch": 0.6194415380591158, + "grad_norm": 2.9121944001006392, + "learning_rate": 7.599981677860584e-06, + "loss": 1.407, + "step": 18945 + }, + { + "epoch": 0.6196050222338477, + "grad_norm": 3.2018683171825804, + "learning_rate": 7.5944420661591266e-06, + "loss": 1.4306, + "step": 18950 + }, + { + "epoch": 0.6197685064085796, + "grad_norm": 3.267445136195565, + "learning_rate": 7.588903237888949e-06, + "loss": 1.3449, + "step": 18955 + }, + { + "epoch": 0.6199319905833115, + "grad_norm": 2.9787239789535107, + "learning_rate": 7.583365194853913e-06, + "loss": 1.4055, + "step": 18960 + }, + { + "epoch": 0.6200954747580434, + "grad_norm": 3.2222617551681023, + "learning_rate": 7.577827938857623e-06, + "loss": 1.429, + "step": 18965 + }, + { + "epoch": 0.6202589589327753, + "grad_norm": 3.241538732604451, + "learning_rate": 7.572291471703428e-06, + "loss": 1.4323, + "step": 18970 + }, + { + "epoch": 0.6204224431075072, + "grad_norm": 2.986228750305001, + "learning_rate": 7.566755795194418e-06, + "loss": 1.3162, + "step": 18975 + }, + { + "epoch": 0.620585927282239, + "grad_norm": 3.219086090574294, + "learning_rate": 7.561220911133425e-06, + "loss": 1.4674, + "step": 18980 + }, + { + "epoch": 0.6207494114569709, + "grad_norm": 3.319518408323149, + "learning_rate": 7.555686821323033e-06, + "loss": 1.3302, + "step": 18985 + }, + { + "epoch": 0.6209128956317028, + "grad_norm": 2.9637390645141313, + "learning_rate": 7.550153527565553e-06, + "loss": 1.3362, + "step": 18990 + }, + { + "epoch": 0.6210763798064347, + "grad_norm": 3.2494470458356584, + "learning_rate": 7.544621031663045e-06, + "loss": 1.3803, + "step": 18995 + }, + { + "epoch": 0.6212398639811666, + "grad_norm": 3.320823298982099, + "learning_rate": 7.539089335417308e-06, + "loss": 1.3499, + "step": 19000 + }, + { + "epoch": 0.6214033481558985, + "grad_norm": 2.996422760471694, + "learning_rate": 7.533558440629878e-06, + "loss": 1.3295, + "step": 19005 + }, + { + "epoch": 0.6215668323306304, + "grad_norm": 3.3919730222375932, + "learning_rate": 7.528028349102032e-06, + "loss": 1.5016, + "step": 19010 + }, + { + "epoch": 0.6217303165053623, + "grad_norm": 2.709068470180704, + "learning_rate": 7.522499062634788e-06, + "loss": 1.1804, + "step": 19015 + }, + { + "epoch": 0.6218938006800941, + "grad_norm": 3.219667280234325, + "learning_rate": 7.516970583028897e-06, + "loss": 1.5088, + "step": 19020 + }, + { + "epoch": 0.622057284854826, + "grad_norm": 3.1698717585729805, + "learning_rate": 7.511442912084852e-06, + "loss": 1.429, + "step": 19025 + }, + { + "epoch": 0.6222207690295579, + "grad_norm": 3.1217110745320653, + "learning_rate": 7.505916051602876e-06, + "loss": 1.5119, + "step": 19030 + }, + { + "epoch": 0.6223842532042898, + "grad_norm": 3.205986012560064, + "learning_rate": 7.500390003382932e-06, + "loss": 1.5463, + "step": 19035 + }, + { + "epoch": 0.6225477373790217, + "grad_norm": 3.4573690930656724, + "learning_rate": 7.494864769224723e-06, + "loss": 1.4926, + "step": 19040 + }, + { + "epoch": 0.6227112215537536, + "grad_norm": 3.4217675288752893, + "learning_rate": 7.489340350927681e-06, + "loss": 1.6327, + "step": 19045 + }, + { + "epoch": 0.6228747057284855, + "grad_norm": 3.0323837926614408, + "learning_rate": 7.483816750290971e-06, + "loss": 1.3573, + "step": 19050 + }, + { + "epoch": 0.6230381899032174, + "grad_norm": 3.32890435958108, + "learning_rate": 7.478293969113497e-06, + "loss": 1.4437, + "step": 19055 + }, + { + "epoch": 0.6232016740779492, + "grad_norm": 3.486708370179482, + "learning_rate": 7.472772009193891e-06, + "loss": 1.4697, + "step": 19060 + }, + { + "epoch": 0.6233651582526811, + "grad_norm": 3.3643497749406945, + "learning_rate": 7.46725087233052e-06, + "loss": 1.4269, + "step": 19065 + }, + { + "epoch": 0.623528642427413, + "grad_norm": 3.0208695302832274, + "learning_rate": 7.461730560321487e-06, + "loss": 1.3894, + "step": 19070 + }, + { + "epoch": 0.6236921266021449, + "grad_norm": 3.3563114588367746, + "learning_rate": 7.4562110749646215e-06, + "loss": 1.4593, + "step": 19075 + }, + { + "epoch": 0.6238556107768768, + "grad_norm": 3.2186543284110787, + "learning_rate": 7.45069241805748e-06, + "loss": 1.4472, + "step": 19080 + }, + { + "epoch": 0.6240190949516087, + "grad_norm": 3.0909207781891155, + "learning_rate": 7.4451745913973585e-06, + "loss": 1.4566, + "step": 19085 + }, + { + "epoch": 0.6241825791263406, + "grad_norm": 3.2513944637124212, + "learning_rate": 7.4396575967812736e-06, + "loss": 1.4975, + "step": 19090 + }, + { + "epoch": 0.6243460633010725, + "grad_norm": 3.2267485366167263, + "learning_rate": 7.4341414360059805e-06, + "loss": 1.564, + "step": 19095 + }, + { + "epoch": 0.6245095474758043, + "grad_norm": 2.9161154106751916, + "learning_rate": 7.428626110867959e-06, + "loss": 1.3541, + "step": 19100 + }, + { + "epoch": 0.6246730316505362, + "grad_norm": 3.086464662844527, + "learning_rate": 7.423111623163406e-06, + "loss": 1.4326, + "step": 19105 + }, + { + "epoch": 0.6248365158252681, + "grad_norm": 3.551635671991326, + "learning_rate": 7.417597974688261e-06, + "loss": 1.5074, + "step": 19110 + }, + { + "epoch": 0.625, + "grad_norm": 3.0999885496959707, + "learning_rate": 7.4120851672381855e-06, + "loss": 1.4038, + "step": 19115 + }, + { + "epoch": 0.6251634841747319, + "grad_norm": 3.533847075769774, + "learning_rate": 7.406573202608562e-06, + "loss": 1.4481, + "step": 19120 + }, + { + "epoch": 0.6253269683494638, + "grad_norm": 2.970838552848851, + "learning_rate": 7.401062082594506e-06, + "loss": 1.4626, + "step": 19125 + }, + { + "epoch": 0.6254904525241957, + "grad_norm": 2.9064577096018698, + "learning_rate": 7.395551808990852e-06, + "loss": 1.3483, + "step": 19130 + }, + { + "epoch": 0.6256539366989275, + "grad_norm": 2.957613716902631, + "learning_rate": 7.3900423835921595e-06, + "loss": 1.2606, + "step": 19135 + }, + { + "epoch": 0.6258174208736594, + "grad_norm": 3.434896845284954, + "learning_rate": 7.384533808192718e-06, + "loss": 1.4793, + "step": 19140 + }, + { + "epoch": 0.6259809050483913, + "grad_norm": 3.2460320986429925, + "learning_rate": 7.379026084586533e-06, + "loss": 1.4524, + "step": 19145 + }, + { + "epoch": 0.6261443892231232, + "grad_norm": 3.480229427055135, + "learning_rate": 7.373519214567335e-06, + "loss": 1.3277, + "step": 19150 + }, + { + "epoch": 0.6263078733978551, + "grad_norm": 3.243412191501369, + "learning_rate": 7.368013199928577e-06, + "loss": 1.4342, + "step": 19155 + }, + { + "epoch": 0.626471357572587, + "grad_norm": 3.4295546705636855, + "learning_rate": 7.3625080424634325e-06, + "loss": 1.5552, + "step": 19160 + }, + { + "epoch": 0.6266348417473189, + "grad_norm": 3.2148539296612952, + "learning_rate": 7.3570037439647965e-06, + "loss": 1.4452, + "step": 19165 + }, + { + "epoch": 0.6267983259220508, + "grad_norm": 3.0244920278925185, + "learning_rate": 7.351500306225285e-06, + "loss": 1.438, + "step": 19170 + }, + { + "epoch": 0.6269618100967826, + "grad_norm": 3.088362591334286, + "learning_rate": 7.345997731037233e-06, + "loss": 1.4826, + "step": 19175 + }, + { + "epoch": 0.6271252942715145, + "grad_norm": 3.250975518524783, + "learning_rate": 7.340496020192695e-06, + "loss": 1.4999, + "step": 19180 + }, + { + "epoch": 0.6272887784462464, + "grad_norm": 3.143380324138815, + "learning_rate": 7.3349951754834416e-06, + "loss": 1.3692, + "step": 19185 + }, + { + "epoch": 0.6274522626209783, + "grad_norm": 3.2961960381082656, + "learning_rate": 7.32949519870096e-06, + "loss": 1.5837, + "step": 19190 + }, + { + "epoch": 0.6276157467957102, + "grad_norm": 3.3787689907923197, + "learning_rate": 7.323996091636465e-06, + "loss": 1.4996, + "step": 19195 + }, + { + "epoch": 0.6277792309704421, + "grad_norm": 3.085838448218959, + "learning_rate": 7.318497856080877e-06, + "loss": 1.4834, + "step": 19200 + }, + { + "epoch": 0.627942715145174, + "grad_norm": 2.8761198039425135, + "learning_rate": 7.313000493824837e-06, + "loss": 1.3465, + "step": 19205 + }, + { + "epoch": 0.6281061993199059, + "grad_norm": 3.1584875853601595, + "learning_rate": 7.307504006658703e-06, + "loss": 1.3786, + "step": 19210 + }, + { + "epoch": 0.6282696834946377, + "grad_norm": 3.2375523530377, + "learning_rate": 7.302008396372542e-06, + "loss": 1.3088, + "step": 19215 + }, + { + "epoch": 0.6284331676693696, + "grad_norm": 3.216866763392655, + "learning_rate": 7.296513664756144e-06, + "loss": 1.4397, + "step": 19220 + }, + { + "epoch": 0.6285966518441015, + "grad_norm": 3.0320736215821515, + "learning_rate": 7.291019813599006e-06, + "loss": 1.505, + "step": 19225 + }, + { + "epoch": 0.6287601360188334, + "grad_norm": 2.8462517809060115, + "learning_rate": 7.285526844690342e-06, + "loss": 1.3362, + "step": 19230 + }, + { + "epoch": 0.6289236201935653, + "grad_norm": 3.1011349587661443, + "learning_rate": 7.280034759819078e-06, + "loss": 1.5884, + "step": 19235 + }, + { + "epoch": 0.6290871043682972, + "grad_norm": 3.3080198067522626, + "learning_rate": 7.274543560773847e-06, + "loss": 1.331, + "step": 19240 + }, + { + "epoch": 0.6292505885430291, + "grad_norm": 3.017182069117566, + "learning_rate": 7.269053249343003e-06, + "loss": 1.3391, + "step": 19245 + }, + { + "epoch": 0.629414072717761, + "grad_norm": 3.3798970785258033, + "learning_rate": 7.263563827314606e-06, + "loss": 1.6318, + "step": 19250 + }, + { + "epoch": 0.6295775568924928, + "grad_norm": 2.886975159606112, + "learning_rate": 7.258075296476423e-06, + "loss": 1.3324, + "step": 19255 + }, + { + "epoch": 0.6297410410672247, + "grad_norm": 3.33378793161245, + "learning_rate": 7.2525876586159375e-06, + "loss": 1.4657, + "step": 19260 + }, + { + "epoch": 0.6299045252419566, + "grad_norm": 3.5004436560703756, + "learning_rate": 7.2471009155203345e-06, + "loss": 1.359, + "step": 19265 + }, + { + "epoch": 0.6300680094166885, + "grad_norm": 3.2171656535142743, + "learning_rate": 7.241615068976513e-06, + "loss": 1.4018, + "step": 19270 + }, + { + "epoch": 0.6302314935914204, + "grad_norm": 3.2900449925626236, + "learning_rate": 7.236130120771081e-06, + "loss": 1.5826, + "step": 19275 + }, + { + "epoch": 0.6303949777661523, + "grad_norm": 2.953327507216433, + "learning_rate": 7.230646072690351e-06, + "loss": 1.3998, + "step": 19280 + }, + { + "epoch": 0.6305584619408842, + "grad_norm": 3.242759046767623, + "learning_rate": 7.225162926520343e-06, + "loss": 1.5155, + "step": 19285 + }, + { + "epoch": 0.630721946115616, + "grad_norm": 2.9220802236201866, + "learning_rate": 7.219680684046783e-06, + "loss": 1.456, + "step": 19290 + }, + { + "epoch": 0.6308854302903479, + "grad_norm": 3.485968780424567, + "learning_rate": 7.2141993470551e-06, + "loss": 1.4651, + "step": 19295 + }, + { + "epoch": 0.6310489144650798, + "grad_norm": 3.175643922278212, + "learning_rate": 7.208718917330437e-06, + "loss": 1.5574, + "step": 19300 + }, + { + "epoch": 0.6312123986398117, + "grad_norm": 2.9799683426050434, + "learning_rate": 7.203239396657637e-06, + "loss": 1.4351, + "step": 19305 + }, + { + "epoch": 0.6313758828145436, + "grad_norm": 3.1026938874088827, + "learning_rate": 7.1977607868212355e-06, + "loss": 1.4572, + "step": 19310 + }, + { + "epoch": 0.6315393669892755, + "grad_norm": 3.0356686594092257, + "learning_rate": 7.192283089605489e-06, + "loss": 1.3852, + "step": 19315 + }, + { + "epoch": 0.6317028511640074, + "grad_norm": 3.1621823093082155, + "learning_rate": 7.186806306794349e-06, + "loss": 1.4972, + "step": 19320 + }, + { + "epoch": 0.6318663353387393, + "grad_norm": 3.272431723965692, + "learning_rate": 7.181330440171468e-06, + "loss": 1.3231, + "step": 19325 + }, + { + "epoch": 0.6320298195134711, + "grad_norm": 3.2040690031488683, + "learning_rate": 7.175855491520201e-06, + "loss": 1.4537, + "step": 19330 + }, + { + "epoch": 0.632193303688203, + "grad_norm": 2.944557843690295, + "learning_rate": 7.170381462623606e-06, + "loss": 1.3268, + "step": 19335 + }, + { + "epoch": 0.6323567878629349, + "grad_norm": 3.284380221223344, + "learning_rate": 7.1649083552644375e-06, + "loss": 1.3798, + "step": 19340 + }, + { + "epoch": 0.6325202720376668, + "grad_norm": 3.2180942359556846, + "learning_rate": 7.159436171225157e-06, + "loss": 1.3719, + "step": 19345 + }, + { + "epoch": 0.6326837562123986, + "grad_norm": 3.0507727265566946, + "learning_rate": 7.153964912287919e-06, + "loss": 1.3973, + "step": 19350 + }, + { + "epoch": 0.6328472403871305, + "grad_norm": 3.368269350543319, + "learning_rate": 7.148494580234575e-06, + "loss": 1.4531, + "step": 19355 + }, + { + "epoch": 0.6330107245618624, + "grad_norm": 2.9695472288416935, + "learning_rate": 7.143025176846683e-06, + "loss": 1.3613, + "step": 19360 + }, + { + "epoch": 0.6331742087365942, + "grad_norm": 3.276934551474359, + "learning_rate": 7.1375567039054895e-06, + "loss": 1.4514, + "step": 19365 + }, + { + "epoch": 0.6333376929113261, + "grad_norm": 3.0832901761807294, + "learning_rate": 7.132089163191947e-06, + "loss": 1.3052, + "step": 19370 + }, + { + "epoch": 0.633501177086058, + "grad_norm": 3.196732501640412, + "learning_rate": 7.1266225564866956e-06, + "loss": 1.3771, + "step": 19375 + }, + { + "epoch": 0.6336646612607899, + "grad_norm": 3.0784570921376355, + "learning_rate": 7.121156885570076e-06, + "loss": 1.3487, + "step": 19380 + }, + { + "epoch": 0.6338281454355218, + "grad_norm": 3.0133953361263153, + "learning_rate": 7.115692152222125e-06, + "loss": 1.5021, + "step": 19385 + }, + { + "epoch": 0.6339916296102537, + "grad_norm": 3.369455491645415, + "learning_rate": 7.1102283582225705e-06, + "loss": 1.5502, + "step": 19390 + }, + { + "epoch": 0.6341551137849856, + "grad_norm": 3.1652868291100753, + "learning_rate": 7.104765505350835e-06, + "loss": 1.376, + "step": 19395 + }, + { + "epoch": 0.6343185979597175, + "grad_norm": 3.199848894899775, + "learning_rate": 7.099303595386038e-06, + "loss": 1.316, + "step": 19400 + }, + { + "epoch": 0.6344820821344493, + "grad_norm": 3.327282334306675, + "learning_rate": 7.093842630106991e-06, + "loss": 1.4431, + "step": 19405 + }, + { + "epoch": 0.6346455663091812, + "grad_norm": 3.1824234692185827, + "learning_rate": 7.088382611292195e-06, + "loss": 1.4664, + "step": 19410 + }, + { + "epoch": 0.6348090504839131, + "grad_norm": 3.106229484692416, + "learning_rate": 7.082923540719845e-06, + "loss": 1.4235, + "step": 19415 + }, + { + "epoch": 0.634972534658645, + "grad_norm": 3.3691647283402877, + "learning_rate": 7.0774654201678226e-06, + "loss": 1.4467, + "step": 19420 + }, + { + "epoch": 0.6351360188333769, + "grad_norm": 3.088615480019331, + "learning_rate": 7.072008251413711e-06, + "loss": 1.3487, + "step": 19425 + }, + { + "epoch": 0.6352995030081088, + "grad_norm": 3.0696462344886593, + "learning_rate": 7.066552036234771e-06, + "loss": 1.4288, + "step": 19430 + }, + { + "epoch": 0.6354629871828407, + "grad_norm": 3.0281024545428874, + "learning_rate": 7.061096776407961e-06, + "loss": 1.3816, + "step": 19435 + }, + { + "epoch": 0.6356264713575726, + "grad_norm": 3.2425821006373696, + "learning_rate": 7.055642473709923e-06, + "loss": 1.4357, + "step": 19440 + }, + { + "epoch": 0.6357899555323044, + "grad_norm": 3.231368761139602, + "learning_rate": 7.05018912991699e-06, + "loss": 1.3757, + "step": 19445 + }, + { + "epoch": 0.6359534397070363, + "grad_norm": 3.1595673397877264, + "learning_rate": 7.044736746805185e-06, + "loss": 1.3643, + "step": 19450 + }, + { + "epoch": 0.6361169238817682, + "grad_norm": 3.1400237601832854, + "learning_rate": 7.039285326150214e-06, + "loss": 1.4538, + "step": 19455 + }, + { + "epoch": 0.6362804080565001, + "grad_norm": 3.063271897507573, + "learning_rate": 7.033834869727471e-06, + "loss": 1.2268, + "step": 19460 + }, + { + "epoch": 0.636443892231232, + "grad_norm": 3.1588840141655554, + "learning_rate": 7.0283853793120375e-06, + "loss": 1.5339, + "step": 19465 + }, + { + "epoch": 0.6366073764059639, + "grad_norm": 3.2183777340830546, + "learning_rate": 7.022936856678677e-06, + "loss": 1.3999, + "step": 19470 + }, + { + "epoch": 0.6367708605806958, + "grad_norm": 3.1419609589296655, + "learning_rate": 7.017489303601839e-06, + "loss": 1.4214, + "step": 19475 + }, + { + "epoch": 0.6369343447554277, + "grad_norm": 3.242741634537613, + "learning_rate": 7.012042721855663e-06, + "loss": 1.4832, + "step": 19480 + }, + { + "epoch": 0.6370978289301595, + "grad_norm": 3.1302947645214627, + "learning_rate": 7.006597113213962e-06, + "loss": 1.4809, + "step": 19485 + }, + { + "epoch": 0.6372613131048914, + "grad_norm": 3.0682157254831846, + "learning_rate": 7.00115247945024e-06, + "loss": 1.4545, + "step": 19490 + }, + { + "epoch": 0.6374247972796233, + "grad_norm": 3.080939417167418, + "learning_rate": 6.9957088223376805e-06, + "loss": 1.4079, + "step": 19495 + }, + { + "epoch": 0.6375882814543552, + "grad_norm": 3.1641888613769673, + "learning_rate": 6.990266143649146e-06, + "loss": 1.265, + "step": 19500 + }, + { + "epoch": 0.6377517656290871, + "grad_norm": 3.4706967778461606, + "learning_rate": 6.984824445157188e-06, + "loss": 1.4288, + "step": 19505 + }, + { + "epoch": 0.637915249803819, + "grad_norm": 3.2444276698800834, + "learning_rate": 6.9793837286340345e-06, + "loss": 1.381, + "step": 19510 + }, + { + "epoch": 0.6380787339785509, + "grad_norm": 3.0945466742883694, + "learning_rate": 6.973943995851593e-06, + "loss": 1.4976, + "step": 19515 + }, + { + "epoch": 0.6382422181532827, + "grad_norm": 3.2376020269855506, + "learning_rate": 6.968505248581447e-06, + "loss": 1.4594, + "step": 19520 + }, + { + "epoch": 0.6384057023280146, + "grad_norm": 3.415659535576162, + "learning_rate": 6.963067488594868e-06, + "loss": 1.4939, + "step": 19525 + }, + { + "epoch": 0.6385691865027465, + "grad_norm": 3.2408485405805427, + "learning_rate": 6.9576307176628e-06, + "loss": 1.4515, + "step": 19530 + }, + { + "epoch": 0.6387326706774784, + "grad_norm": 3.116356217413786, + "learning_rate": 6.9521949375558635e-06, + "loss": 1.3913, + "step": 19535 + }, + { + "epoch": 0.6388961548522103, + "grad_norm": 2.943554559935838, + "learning_rate": 6.946760150044362e-06, + "loss": 1.3371, + "step": 19540 + }, + { + "epoch": 0.6390596390269422, + "grad_norm": 3.1999037670178043, + "learning_rate": 6.94132635689827e-06, + "loss": 1.3005, + "step": 19545 + }, + { + "epoch": 0.6392231232016741, + "grad_norm": 3.0110709224346643, + "learning_rate": 6.935893559887243e-06, + "loss": 1.2775, + "step": 19550 + }, + { + "epoch": 0.639386607376406, + "grad_norm": 3.5121489356203752, + "learning_rate": 6.930461760780611e-06, + "loss": 1.3731, + "step": 19555 + }, + { + "epoch": 0.6395500915511378, + "grad_norm": 3.0478853869508877, + "learning_rate": 6.9250309613473756e-06, + "loss": 1.3131, + "step": 19560 + }, + { + "epoch": 0.6397135757258697, + "grad_norm": 3.1508563959192037, + "learning_rate": 6.919601163356215e-06, + "loss": 1.4147, + "step": 19565 + }, + { + "epoch": 0.6398770599006016, + "grad_norm": 3.25166356625407, + "learning_rate": 6.9141723685754805e-06, + "loss": 1.5026, + "step": 19570 + }, + { + "epoch": 0.6400405440753335, + "grad_norm": 3.1142988542501073, + "learning_rate": 6.908744578773201e-06, + "loss": 1.3964, + "step": 19575 + }, + { + "epoch": 0.6402040282500654, + "grad_norm": 2.945609157740044, + "learning_rate": 6.903317795717073e-06, + "loss": 1.3897, + "step": 19580 + }, + { + "epoch": 0.6403675124247973, + "grad_norm": 3.020099536096274, + "learning_rate": 6.897892021174467e-06, + "loss": 1.3718, + "step": 19585 + }, + { + "epoch": 0.6405309965995292, + "grad_norm": 3.0430887195263985, + "learning_rate": 6.892467256912424e-06, + "loss": 1.3264, + "step": 19590 + }, + { + "epoch": 0.640694480774261, + "grad_norm": 3.0978426410449034, + "learning_rate": 6.887043504697657e-06, + "loss": 1.5048, + "step": 19595 + }, + { + "epoch": 0.6408579649489929, + "grad_norm": 3.1654696403476974, + "learning_rate": 6.881620766296546e-06, + "loss": 1.4112, + "step": 19600 + }, + { + "epoch": 0.6410214491237248, + "grad_norm": 2.8072844579192626, + "learning_rate": 6.876199043475151e-06, + "loss": 1.4849, + "step": 19605 + }, + { + "epoch": 0.6411849332984567, + "grad_norm": 3.5908728280262387, + "learning_rate": 6.870778337999191e-06, + "loss": 1.5496, + "step": 19610 + }, + { + "epoch": 0.6413484174731886, + "grad_norm": 3.212186042820403, + "learning_rate": 6.865358651634055e-06, + "loss": 1.5209, + "step": 19615 + }, + { + "epoch": 0.6415119016479205, + "grad_norm": 3.4317812143596, + "learning_rate": 6.8599399861448055e-06, + "loss": 1.4803, + "step": 19620 + }, + { + "epoch": 0.6416753858226524, + "grad_norm": 3.166901295058317, + "learning_rate": 6.854522343296165e-06, + "loss": 1.3105, + "step": 19625 + }, + { + "epoch": 0.6418388699973843, + "grad_norm": 3.2569397972068943, + "learning_rate": 6.849105724852531e-06, + "loss": 1.4441, + "step": 19630 + }, + { + "epoch": 0.6420023541721162, + "grad_norm": 3.4187402845314208, + "learning_rate": 6.8436901325779615e-06, + "loss": 1.469, + "step": 19635 + }, + { + "epoch": 0.642165838346848, + "grad_norm": 3.1660514095370593, + "learning_rate": 6.838275568236184e-06, + "loss": 1.479, + "step": 19640 + }, + { + "epoch": 0.6423293225215799, + "grad_norm": 3.4283109422943823, + "learning_rate": 6.832862033590586e-06, + "loss": 1.4449, + "step": 19645 + }, + { + "epoch": 0.6424928066963118, + "grad_norm": 3.519592311576967, + "learning_rate": 6.827449530404224e-06, + "loss": 1.4752, + "step": 19650 + }, + { + "epoch": 0.6426562908710437, + "grad_norm": 3.3783102299791556, + "learning_rate": 6.82203806043982e-06, + "loss": 1.3662, + "step": 19655 + }, + { + "epoch": 0.6428197750457756, + "grad_norm": 3.1168046698393246, + "learning_rate": 6.816627625459755e-06, + "loss": 1.3568, + "step": 19660 + }, + { + "epoch": 0.6429832592205075, + "grad_norm": 2.9999389288362224, + "learning_rate": 6.811218227226078e-06, + "loss": 1.3548, + "step": 19665 + }, + { + "epoch": 0.6431467433952394, + "grad_norm": 3.153088966300663, + "learning_rate": 6.805809867500494e-06, + "loss": 1.402, + "step": 19670 + }, + { + "epoch": 0.6433102275699712, + "grad_norm": 3.270068712835188, + "learning_rate": 6.800402548044375e-06, + "loss": 1.4391, + "step": 19675 + }, + { + "epoch": 0.6434737117447031, + "grad_norm": 3.322801148805754, + "learning_rate": 6.79499627061875e-06, + "loss": 1.4689, + "step": 19680 + }, + { + "epoch": 0.643637195919435, + "grad_norm": 3.398709626457033, + "learning_rate": 6.789591036984315e-06, + "loss": 1.3596, + "step": 19685 + }, + { + "epoch": 0.6438006800941669, + "grad_norm": 3.1485666925388958, + "learning_rate": 6.784186848901422e-06, + "loss": 1.3824, + "step": 19690 + }, + { + "epoch": 0.6439641642688988, + "grad_norm": 3.3343340678066093, + "learning_rate": 6.778783708130079e-06, + "loss": 1.5628, + "step": 19695 + }, + { + "epoch": 0.6441276484436307, + "grad_norm": 3.268087772180196, + "learning_rate": 6.7733816164299595e-06, + "loss": 1.4153, + "step": 19700 + }, + { + "epoch": 0.6442911326183626, + "grad_norm": 3.275315065021076, + "learning_rate": 6.76798057556039e-06, + "loss": 1.5019, + "step": 19705 + }, + { + "epoch": 0.6444546167930945, + "grad_norm": 3.3550840284594345, + "learning_rate": 6.7625805872803605e-06, + "loss": 1.5439, + "step": 19710 + }, + { + "epoch": 0.6446181009678263, + "grad_norm": 3.174195592383824, + "learning_rate": 6.757181653348512e-06, + "loss": 1.521, + "step": 19715 + }, + { + "epoch": 0.6447815851425582, + "grad_norm": 3.069999475557259, + "learning_rate": 6.751783775523152e-06, + "loss": 1.487, + "step": 19720 + }, + { + "epoch": 0.6449450693172901, + "grad_norm": 3.194909562591088, + "learning_rate": 6.746386955562224e-06, + "loss": 1.4338, + "step": 19725 + }, + { + "epoch": 0.645108553492022, + "grad_norm": 2.9513576744591, + "learning_rate": 6.74099119522335e-06, + "loss": 1.4671, + "step": 19730 + }, + { + "epoch": 0.6452720376667539, + "grad_norm": 2.99612913685896, + "learning_rate": 6.735596496263792e-06, + "loss": 1.3671, + "step": 19735 + }, + { + "epoch": 0.6454355218414858, + "grad_norm": 3.2078773587359155, + "learning_rate": 6.730202860440476e-06, + "loss": 1.6062, + "step": 19740 + }, + { + "epoch": 0.6455990060162177, + "grad_norm": 3.068303999834541, + "learning_rate": 6.724810289509973e-06, + "loss": 1.2519, + "step": 19745 + }, + { + "epoch": 0.6457624901909496, + "grad_norm": 3.3186666409299193, + "learning_rate": 6.719418785228511e-06, + "loss": 1.5787, + "step": 19750 + }, + { + "epoch": 0.6459259743656814, + "grad_norm": 3.302209240870194, + "learning_rate": 6.714028349351973e-06, + "loss": 1.5559, + "step": 19755 + }, + { + "epoch": 0.6460894585404133, + "grad_norm": 3.207555910410695, + "learning_rate": 6.708638983635893e-06, + "loss": 1.3967, + "step": 19760 + }, + { + "epoch": 0.6462529427151452, + "grad_norm": 3.143901970639308, + "learning_rate": 6.703250689835454e-06, + "loss": 1.5348, + "step": 19765 + }, + { + "epoch": 0.6464164268898771, + "grad_norm": 3.3323683372509336, + "learning_rate": 6.69786346970549e-06, + "loss": 1.4901, + "step": 19770 + }, + { + "epoch": 0.646579911064609, + "grad_norm": 2.838133213301619, + "learning_rate": 6.692477325000487e-06, + "loss": 1.4005, + "step": 19775 + }, + { + "epoch": 0.6467433952393409, + "grad_norm": 3.162949760348623, + "learning_rate": 6.6870922574745825e-06, + "loss": 1.4204, + "step": 19780 + }, + { + "epoch": 0.6469068794140728, + "grad_norm": 3.1792833730505685, + "learning_rate": 6.6817082688815595e-06, + "loss": 1.4512, + "step": 19785 + }, + { + "epoch": 0.6470703635888047, + "grad_norm": 2.9750394993564715, + "learning_rate": 6.67632536097485e-06, + "loss": 1.3413, + "step": 19790 + }, + { + "epoch": 0.6472338477635365, + "grad_norm": 3.2071800782929767, + "learning_rate": 6.670943535507538e-06, + "loss": 1.4319, + "step": 19795 + }, + { + "epoch": 0.6473973319382684, + "grad_norm": 3.1607469576122744, + "learning_rate": 6.66556279423235e-06, + "loss": 1.4234, + "step": 19800 + }, + { + "epoch": 0.6475608161130003, + "grad_norm": 3.05027071572289, + "learning_rate": 6.6601831389016605e-06, + "loss": 1.3888, + "step": 19805 + }, + { + "epoch": 0.6477243002877322, + "grad_norm": 3.306131524078093, + "learning_rate": 6.654804571267495e-06, + "loss": 1.447, + "step": 19810 + }, + { + "epoch": 0.647887784462464, + "grad_norm": 2.9480385899070387, + "learning_rate": 6.649427093081519e-06, + "loss": 1.393, + "step": 19815 + }, + { + "epoch": 0.6480512686371959, + "grad_norm": 3.3382721619528746, + "learning_rate": 6.644050706095047e-06, + "loss": 1.3969, + "step": 19820 + }, + { + "epoch": 0.6482147528119278, + "grad_norm": 3.1160429241423144, + "learning_rate": 6.638675412059032e-06, + "loss": 1.3842, + "step": 19825 + }, + { + "epoch": 0.6483782369866596, + "grad_norm": 2.9921028364236824, + "learning_rate": 6.6333012127240804e-06, + "loss": 1.3368, + "step": 19830 + }, + { + "epoch": 0.6485417211613915, + "grad_norm": 3.2627490270920525, + "learning_rate": 6.627928109840436e-06, + "loss": 1.5448, + "step": 19835 + }, + { + "epoch": 0.6487052053361234, + "grad_norm": 3.2013118430170904, + "learning_rate": 6.622556105157987e-06, + "loss": 1.4938, + "step": 19840 + }, + { + "epoch": 0.6488686895108553, + "grad_norm": 3.3195287807687155, + "learning_rate": 6.617185200426264e-06, + "loss": 1.4058, + "step": 19845 + }, + { + "epoch": 0.6490321736855872, + "grad_norm": 3.2248779248943196, + "learning_rate": 6.611815397394437e-06, + "loss": 1.4387, + "step": 19850 + }, + { + "epoch": 0.6491956578603191, + "grad_norm": 3.1641013127919337, + "learning_rate": 6.60644669781132e-06, + "loss": 1.5471, + "step": 19855 + }, + { + "epoch": 0.649359142035051, + "grad_norm": 3.194930890625412, + "learning_rate": 6.601079103425366e-06, + "loss": 1.4112, + "step": 19860 + }, + { + "epoch": 0.6495226262097828, + "grad_norm": 3.299438231985322, + "learning_rate": 6.595712615984673e-06, + "loss": 1.3333, + "step": 19865 + }, + { + "epoch": 0.6496861103845147, + "grad_norm": 3.080359572077944, + "learning_rate": 6.590347237236971e-06, + "loss": 1.4662, + "step": 19870 + }, + { + "epoch": 0.6498495945592466, + "grad_norm": 3.10979017239222, + "learning_rate": 6.5849829689296344e-06, + "loss": 1.5656, + "step": 19875 + }, + { + "epoch": 0.6500130787339785, + "grad_norm": 3.21721561045898, + "learning_rate": 6.579619812809671e-06, + "loss": 1.4241, + "step": 19880 + }, + { + "epoch": 0.6501765629087104, + "grad_norm": 3.1864316646228756, + "learning_rate": 6.574257770623731e-06, + "loss": 1.6461, + "step": 19885 + }, + { + "epoch": 0.6503400470834423, + "grad_norm": 3.0298758785669455, + "learning_rate": 6.568896844118101e-06, + "loss": 1.2688, + "step": 19890 + }, + { + "epoch": 0.6505035312581742, + "grad_norm": 2.873270944930398, + "learning_rate": 6.563537035038703e-06, + "loss": 1.4297, + "step": 19895 + }, + { + "epoch": 0.6506670154329061, + "grad_norm": 3.506226584230592, + "learning_rate": 6.558178345131097e-06, + "loss": 1.4423, + "step": 19900 + }, + { + "epoch": 0.650830499607638, + "grad_norm": 3.045125525624816, + "learning_rate": 6.552820776140474e-06, + "loss": 1.3062, + "step": 19905 + }, + { + "epoch": 0.6509939837823698, + "grad_norm": 3.126070882364486, + "learning_rate": 6.5474643298116635e-06, + "loss": 1.352, + "step": 19910 + }, + { + "epoch": 0.6511574679571017, + "grad_norm": 2.70703972841485, + "learning_rate": 6.542109007889128e-06, + "loss": 1.2624, + "step": 19915 + }, + { + "epoch": 0.6513209521318336, + "grad_norm": 3.1537155767433673, + "learning_rate": 6.5367548121169674e-06, + "loss": 1.4397, + "step": 19920 + }, + { + "epoch": 0.6514844363065655, + "grad_norm": 3.283081215628471, + "learning_rate": 6.531401744238912e-06, + "loss": 1.4819, + "step": 19925 + }, + { + "epoch": 0.6516479204812974, + "grad_norm": 3.557278169593485, + "learning_rate": 6.526049805998326e-06, + "loss": 1.5168, + "step": 19930 + }, + { + "epoch": 0.6518114046560293, + "grad_norm": 3.0627920146351633, + "learning_rate": 6.5206989991382e-06, + "loss": 1.3474, + "step": 19935 + }, + { + "epoch": 0.6519748888307612, + "grad_norm": 3.239858216721537, + "learning_rate": 6.515349325401163e-06, + "loss": 1.6394, + "step": 19940 + }, + { + "epoch": 0.652138373005493, + "grad_norm": 2.950396857257767, + "learning_rate": 6.5100007865294714e-06, + "loss": 1.4142, + "step": 19945 + }, + { + "epoch": 0.6523018571802249, + "grad_norm": 3.1922192559978377, + "learning_rate": 6.504653384265016e-06, + "loss": 1.4922, + "step": 19950 + }, + { + "epoch": 0.6524653413549568, + "grad_norm": 2.980331230497642, + "learning_rate": 6.49930712034931e-06, + "loss": 1.3846, + "step": 19955 + }, + { + "epoch": 0.6526288255296887, + "grad_norm": 3.307551651691909, + "learning_rate": 6.493961996523506e-06, + "loss": 1.4944, + "step": 19960 + }, + { + "epoch": 0.6527923097044206, + "grad_norm": 2.955618304237263, + "learning_rate": 6.488618014528379e-06, + "loss": 1.2927, + "step": 19965 + }, + { + "epoch": 0.6529557938791525, + "grad_norm": 3.144025490449479, + "learning_rate": 6.483275176104329e-06, + "loss": 1.3781, + "step": 19970 + }, + { + "epoch": 0.6531192780538844, + "grad_norm": 3.1790003210967845, + "learning_rate": 6.477933482991392e-06, + "loss": 1.4748, + "step": 19975 + }, + { + "epoch": 0.6532827622286163, + "grad_norm": 3.2604891194040255, + "learning_rate": 6.472592936929225e-06, + "loss": 1.4111, + "step": 19980 + }, + { + "epoch": 0.6534462464033481, + "grad_norm": 3.415602566591858, + "learning_rate": 6.46725353965711e-06, + "loss": 1.5041, + "step": 19985 + }, + { + "epoch": 0.65360973057808, + "grad_norm": 3.2173989065643847, + "learning_rate": 6.461915292913963e-06, + "loss": 1.3499, + "step": 19990 + }, + { + "epoch": 0.6537732147528119, + "grad_norm": 3.1492220454668014, + "learning_rate": 6.456578198438317e-06, + "loss": 1.4128, + "step": 19995 + }, + { + "epoch": 0.6539366989275438, + "grad_norm": 3.3934756379148445, + "learning_rate": 6.451242257968335e-06, + "loss": 1.499, + "step": 20000 + }, + { + "epoch": 0.6541001831022757, + "grad_norm": 3.300945541088885, + "learning_rate": 6.445907473241801e-06, + "loss": 1.5575, + "step": 20005 + }, + { + "epoch": 0.6542636672770076, + "grad_norm": 3.259621195642835, + "learning_rate": 6.44057384599612e-06, + "loss": 1.366, + "step": 20010 + }, + { + "epoch": 0.6544271514517395, + "grad_norm": 3.406108401509199, + "learning_rate": 6.435241377968328e-06, + "loss": 1.6615, + "step": 20015 + }, + { + "epoch": 0.6545906356264714, + "grad_norm": 3.318697467199518, + "learning_rate": 6.429910070895082e-06, + "loss": 1.4576, + "step": 20020 + }, + { + "epoch": 0.6547541198012032, + "grad_norm": 3.124943887397504, + "learning_rate": 6.424579926512653e-06, + "loss": 1.441, + "step": 20025 + }, + { + "epoch": 0.6549176039759351, + "grad_norm": 3.1535962473713743, + "learning_rate": 6.419250946556939e-06, + "loss": 1.5528, + "step": 20030 + }, + { + "epoch": 0.655081088150667, + "grad_norm": 3.299544611965195, + "learning_rate": 6.413923132763458e-06, + "loss": 1.3665, + "step": 20035 + }, + { + "epoch": 0.6552445723253989, + "grad_norm": 3.2359203690777063, + "learning_rate": 6.40859648686735e-06, + "loss": 1.3388, + "step": 20040 + }, + { + "epoch": 0.6554080565001308, + "grad_norm": 3.0679126098117195, + "learning_rate": 6.403271010603374e-06, + "loss": 1.446, + "step": 20045 + }, + { + "epoch": 0.6555715406748627, + "grad_norm": 3.6060637521313232, + "learning_rate": 6.397946705705905e-06, + "loss": 1.6063, + "step": 20050 + }, + { + "epoch": 0.6557350248495946, + "grad_norm": 3.265365367352464, + "learning_rate": 6.39262357390894e-06, + "loss": 1.4281, + "step": 20055 + }, + { + "epoch": 0.6558985090243264, + "grad_norm": 3.17509785810657, + "learning_rate": 6.387301616946091e-06, + "loss": 1.4539, + "step": 20060 + }, + { + "epoch": 0.6560619931990583, + "grad_norm": 3.3049953937908723, + "learning_rate": 6.38198083655059e-06, + "loss": 1.4244, + "step": 20065 + }, + { + "epoch": 0.6562254773737902, + "grad_norm": 2.9646970924212077, + "learning_rate": 6.376661234455284e-06, + "loss": 1.393, + "step": 20070 + }, + { + "epoch": 0.6563889615485221, + "grad_norm": 3.3696887301423284, + "learning_rate": 6.371342812392639e-06, + "loss": 1.5675, + "step": 20075 + }, + { + "epoch": 0.656552445723254, + "grad_norm": 2.9368218654679676, + "learning_rate": 6.3660255720947336e-06, + "loss": 1.266, + "step": 20080 + }, + { + "epoch": 0.6567159298979859, + "grad_norm": 3.0703831286008962, + "learning_rate": 6.360709515293263e-06, + "loss": 1.2721, + "step": 20085 + }, + { + "epoch": 0.6568794140727178, + "grad_norm": 3.217862608461583, + "learning_rate": 6.3553946437195345e-06, + "loss": 1.3205, + "step": 20090 + }, + { + "epoch": 0.6570428982474497, + "grad_norm": 3.131996012864299, + "learning_rate": 6.350080959104474e-06, + "loss": 1.4006, + "step": 20095 + }, + { + "epoch": 0.6572063824221815, + "grad_norm": 3.278707354189864, + "learning_rate": 6.344768463178619e-06, + "loss": 1.4373, + "step": 20100 + }, + { + "epoch": 0.6573698665969134, + "grad_norm": 3.189419730854553, + "learning_rate": 6.339457157672118e-06, + "loss": 1.4815, + "step": 20105 + }, + { + "epoch": 0.6575333507716453, + "grad_norm": 3.2064704039260157, + "learning_rate": 6.334147044314734e-06, + "loss": 1.2632, + "step": 20110 + }, + { + "epoch": 0.6576968349463772, + "grad_norm": 3.5319216793138035, + "learning_rate": 6.328838124835837e-06, + "loss": 1.4874, + "step": 20115 + }, + { + "epoch": 0.6578603191211091, + "grad_norm": 3.347443529517409, + "learning_rate": 6.323530400964415e-06, + "loss": 1.4317, + "step": 20120 + }, + { + "epoch": 0.658023803295841, + "grad_norm": 3.2553734943340666, + "learning_rate": 6.3182238744290645e-06, + "loss": 1.3459, + "step": 20125 + }, + { + "epoch": 0.6581872874705729, + "grad_norm": 3.132479365152293, + "learning_rate": 6.312918546957991e-06, + "loss": 1.3672, + "step": 20130 + }, + { + "epoch": 0.6583507716453048, + "grad_norm": 3.0660952277173195, + "learning_rate": 6.3076144202790116e-06, + "loss": 1.3791, + "step": 20135 + }, + { + "epoch": 0.6585142558200366, + "grad_norm": 3.0696763410949828, + "learning_rate": 6.302311496119544e-06, + "loss": 1.4655, + "step": 20140 + }, + { + "epoch": 0.6586777399947685, + "grad_norm": 3.41222679825323, + "learning_rate": 6.297009776206625e-06, + "loss": 1.4971, + "step": 20145 + }, + { + "epoch": 0.6588412241695004, + "grad_norm": 3.474391003987568, + "learning_rate": 6.291709262266894e-06, + "loss": 1.3557, + "step": 20150 + }, + { + "epoch": 0.6590047083442323, + "grad_norm": 3.461386113669279, + "learning_rate": 6.286409956026599e-06, + "loss": 1.5717, + "step": 20155 + }, + { + "epoch": 0.6591681925189642, + "grad_norm": 3.3154625201488086, + "learning_rate": 6.281111859211592e-06, + "loss": 1.5369, + "step": 20160 + }, + { + "epoch": 0.6593316766936961, + "grad_norm": 3.337695887269797, + "learning_rate": 6.2758149735473376e-06, + "loss": 1.396, + "step": 20165 + }, + { + "epoch": 0.659495160868428, + "grad_norm": 3.134882078434035, + "learning_rate": 6.270519300758898e-06, + "loss": 1.5178, + "step": 20170 + }, + { + "epoch": 0.6596586450431599, + "grad_norm": 3.0852217248479987, + "learning_rate": 6.2652248425709475e-06, + "loss": 1.4387, + "step": 20175 + }, + { + "epoch": 0.6598221292178917, + "grad_norm": 3.000952501581751, + "learning_rate": 6.259931600707757e-06, + "loss": 1.4028, + "step": 20180 + }, + { + "epoch": 0.6599856133926236, + "grad_norm": 3.1420928474090544, + "learning_rate": 6.254639576893209e-06, + "loss": 1.428, + "step": 20185 + }, + { + "epoch": 0.6601490975673555, + "grad_norm": 3.302997206127045, + "learning_rate": 6.249348772850783e-06, + "loss": 1.4423, + "step": 20190 + }, + { + "epoch": 0.6603125817420874, + "grad_norm": 3.5488978692325475, + "learning_rate": 6.244059190303569e-06, + "loss": 1.3962, + "step": 20195 + }, + { + "epoch": 0.6604760659168193, + "grad_norm": 3.257264325138717, + "learning_rate": 6.238770830974251e-06, + "loss": 1.4884, + "step": 20200 + }, + { + "epoch": 0.6606395500915512, + "grad_norm": 3.0928615738398864, + "learning_rate": 6.233483696585118e-06, + "loss": 1.6283, + "step": 20205 + }, + { + "epoch": 0.6608030342662831, + "grad_norm": 3.2729315503856924, + "learning_rate": 6.228197788858062e-06, + "loss": 1.4926, + "step": 20210 + }, + { + "epoch": 0.660966518441015, + "grad_norm": 3.164116175901348, + "learning_rate": 6.22291310951457e-06, + "loss": 1.295, + "step": 20215 + }, + { + "epoch": 0.6611300026157468, + "grad_norm": 3.278028423199563, + "learning_rate": 6.217629660275738e-06, + "loss": 1.3159, + "step": 20220 + }, + { + "epoch": 0.6612934867904787, + "grad_norm": 3.173929790438836, + "learning_rate": 6.212347442862252e-06, + "loss": 1.5133, + "step": 20225 + }, + { + "epoch": 0.6614569709652106, + "grad_norm": 3.441816544428913, + "learning_rate": 6.207066458994402e-06, + "loss": 1.5258, + "step": 20230 + }, + { + "epoch": 0.6616204551399425, + "grad_norm": 3.392643017141049, + "learning_rate": 6.201786710392076e-06, + "loss": 1.4562, + "step": 20235 + }, + { + "epoch": 0.6617839393146744, + "grad_norm": 3.1165506422561813, + "learning_rate": 6.196508198774754e-06, + "loss": 1.3777, + "step": 20240 + }, + { + "epoch": 0.6619474234894063, + "grad_norm": 3.4762768640728887, + "learning_rate": 6.191230925861524e-06, + "loss": 1.5687, + "step": 20245 + }, + { + "epoch": 0.6621109076641382, + "grad_norm": 3.0716984789448127, + "learning_rate": 6.1859548933710634e-06, + "loss": 1.4822, + "step": 20250 + }, + { + "epoch": 0.66227439183887, + "grad_norm": 3.2766634622338975, + "learning_rate": 6.1806801030216445e-06, + "loss": 1.4274, + "step": 20255 + }, + { + "epoch": 0.6624378760136019, + "grad_norm": 3.2218911808376824, + "learning_rate": 6.175406556531139e-06, + "loss": 1.4325, + "step": 20260 + }, + { + "epoch": 0.6626013601883338, + "grad_norm": 3.322903087141066, + "learning_rate": 6.170134255617008e-06, + "loss": 1.5163, + "step": 20265 + }, + { + "epoch": 0.6627648443630657, + "grad_norm": 2.955533231867881, + "learning_rate": 6.164863201996314e-06, + "loss": 1.398, + "step": 20270 + }, + { + "epoch": 0.6629283285377976, + "grad_norm": 2.945865998892635, + "learning_rate": 6.1595933973857125e-06, + "loss": 1.4208, + "step": 20275 + }, + { + "epoch": 0.6630918127125295, + "grad_norm": 3.2214873643448443, + "learning_rate": 6.1543248435014445e-06, + "loss": 1.528, + "step": 20280 + }, + { + "epoch": 0.6632552968872613, + "grad_norm": 3.1941977993106403, + "learning_rate": 6.149057542059354e-06, + "loss": 1.4264, + "step": 20285 + }, + { + "epoch": 0.6634187810619931, + "grad_norm": 3.383481669738255, + "learning_rate": 6.143791494774867e-06, + "loss": 1.4264, + "step": 20290 + }, + { + "epoch": 0.663582265236725, + "grad_norm": 3.1465335386981144, + "learning_rate": 6.138526703363008e-06, + "loss": 1.6052, + "step": 20295 + }, + { + "epoch": 0.6637457494114569, + "grad_norm": 3.2438568167046378, + "learning_rate": 6.133263169538393e-06, + "loss": 1.4844, + "step": 20300 + }, + { + "epoch": 0.6639092335861888, + "grad_norm": 3.1370560959430285, + "learning_rate": 6.128000895015223e-06, + "loss": 1.4194, + "step": 20305 + }, + { + "epoch": 0.6640727177609207, + "grad_norm": 3.3228352156130416, + "learning_rate": 6.122739881507294e-06, + "loss": 1.4111, + "step": 20310 + }, + { + "epoch": 0.6642362019356526, + "grad_norm": 3.1380544802305446, + "learning_rate": 6.117480130727987e-06, + "loss": 1.4168, + "step": 20315 + }, + { + "epoch": 0.6643996861103845, + "grad_norm": 3.0820165399621606, + "learning_rate": 6.1122216443902745e-06, + "loss": 1.4305, + "step": 20320 + }, + { + "epoch": 0.6645631702851164, + "grad_norm": 3.180500114142597, + "learning_rate": 6.106964424206716e-06, + "loss": 1.3241, + "step": 20325 + }, + { + "epoch": 0.6647266544598482, + "grad_norm": 3.262717442122676, + "learning_rate": 6.101708471889464e-06, + "loss": 1.4209, + "step": 20330 + }, + { + "epoch": 0.6648901386345801, + "grad_norm": 2.9702509378769966, + "learning_rate": 6.0964537891502475e-06, + "loss": 1.4361, + "step": 20335 + }, + { + "epoch": 0.665053622809312, + "grad_norm": 3.493200915329675, + "learning_rate": 6.091200377700395e-06, + "loss": 1.3384, + "step": 20340 + }, + { + "epoch": 0.6652171069840439, + "grad_norm": 3.0870849752335405, + "learning_rate": 6.085948239250805e-06, + "loss": 1.473, + "step": 20345 + }, + { + "epoch": 0.6653805911587758, + "grad_norm": 3.1756868410000885, + "learning_rate": 6.080697375511975e-06, + "loss": 1.4802, + "step": 20350 + }, + { + "epoch": 0.6655440753335077, + "grad_norm": 3.0691649309072537, + "learning_rate": 6.075447788193982e-06, + "loss": 1.4351, + "step": 20355 + }, + { + "epoch": 0.6657075595082396, + "grad_norm": 3.1864997661064085, + "learning_rate": 6.0701994790064885e-06, + "loss": 1.4679, + "step": 20360 + }, + { + "epoch": 0.6658710436829715, + "grad_norm": 3.301423323683128, + "learning_rate": 6.0649524496587385e-06, + "loss": 1.3219, + "step": 20365 + }, + { + "epoch": 0.6660345278577033, + "grad_norm": 3.4056309598405816, + "learning_rate": 6.059706701859564e-06, + "loss": 1.3797, + "step": 20370 + }, + { + "epoch": 0.6661980120324352, + "grad_norm": 3.189897393604391, + "learning_rate": 6.054462237317374e-06, + "loss": 1.4128, + "step": 20375 + }, + { + "epoch": 0.6663614962071671, + "grad_norm": 3.4212806433979677, + "learning_rate": 6.049219057740164e-06, + "loss": 1.4324, + "step": 20380 + }, + { + "epoch": 0.666524980381899, + "grad_norm": 3.0049403017361986, + "learning_rate": 6.043977164835508e-06, + "loss": 1.2851, + "step": 20385 + }, + { + "epoch": 0.6666884645566309, + "grad_norm": 2.982997587240176, + "learning_rate": 6.038736560310561e-06, + "loss": 1.3222, + "step": 20390 + }, + { + "epoch": 0.6668519487313628, + "grad_norm": 3.2413737668134686, + "learning_rate": 6.033497245872059e-06, + "loss": 1.3433, + "step": 20395 + }, + { + "epoch": 0.6670154329060947, + "grad_norm": 2.914784422858903, + "learning_rate": 6.0282592232263225e-06, + "loss": 1.3451, + "step": 20400 + }, + { + "epoch": 0.6671789170808266, + "grad_norm": 3.4217338355616818, + "learning_rate": 6.023022494079244e-06, + "loss": 1.298, + "step": 20405 + }, + { + "epoch": 0.6673424012555584, + "grad_norm": 3.3291197524354765, + "learning_rate": 6.0177870601363e-06, + "loss": 1.5444, + "step": 20410 + }, + { + "epoch": 0.6675058854302903, + "grad_norm": 2.986275639997299, + "learning_rate": 6.01255292310254e-06, + "loss": 1.374, + "step": 20415 + }, + { + "epoch": 0.6676693696050222, + "grad_norm": 3.2194252068995173, + "learning_rate": 6.0073200846825956e-06, + "loss": 1.4021, + "step": 20420 + }, + { + "epoch": 0.6678328537797541, + "grad_norm": 3.4308910706757705, + "learning_rate": 6.0020885465806735e-06, + "loss": 1.5035, + "step": 20425 + }, + { + "epoch": 0.667996337954486, + "grad_norm": 3.2923682247076997, + "learning_rate": 5.99685831050056e-06, + "loss": 1.568, + "step": 20430 + }, + { + "epoch": 0.6681598221292179, + "grad_norm": 3.0450506974986538, + "learning_rate": 5.991629378145613e-06, + "loss": 1.561, + "step": 20435 + }, + { + "epoch": 0.6683233063039498, + "grad_norm": 3.3016365318754017, + "learning_rate": 5.986401751218767e-06, + "loss": 1.5165, + "step": 20440 + }, + { + "epoch": 0.6684867904786816, + "grad_norm": 3.182350276129604, + "learning_rate": 5.981175431422532e-06, + "loss": 1.5282, + "step": 20445 + }, + { + "epoch": 0.6686502746534135, + "grad_norm": 3.332100760246114, + "learning_rate": 5.975950420458991e-06, + "loss": 1.4178, + "step": 20450 + }, + { + "epoch": 0.6688137588281454, + "grad_norm": 3.3343055880182977, + "learning_rate": 5.970726720029808e-06, + "loss": 1.3174, + "step": 20455 + }, + { + "epoch": 0.6689772430028773, + "grad_norm": 3.130407021528445, + "learning_rate": 5.965504331836209e-06, + "loss": 1.368, + "step": 20460 + }, + { + "epoch": 0.6691407271776092, + "grad_norm": 3.14480230789352, + "learning_rate": 5.960283257579e-06, + "loss": 1.4568, + "step": 20465 + }, + { + "epoch": 0.6693042113523411, + "grad_norm": 3.6843876974159384, + "learning_rate": 5.955063498958555e-06, + "loss": 1.6006, + "step": 20470 + }, + { + "epoch": 0.669467695527073, + "grad_norm": 2.911146912237667, + "learning_rate": 5.9498450576748215e-06, + "loss": 1.4394, + "step": 20475 + }, + { + "epoch": 0.6696311797018049, + "grad_norm": 3.3817695734038, + "learning_rate": 5.9446279354273205e-06, + "loss": 1.3564, + "step": 20480 + }, + { + "epoch": 0.6697946638765367, + "grad_norm": 3.0541092865696156, + "learning_rate": 5.939412133915139e-06, + "loss": 1.4997, + "step": 20485 + }, + { + "epoch": 0.6699581480512686, + "grad_norm": 3.0120119175166855, + "learning_rate": 5.934197654836937e-06, + "loss": 1.4848, + "step": 20490 + }, + { + "epoch": 0.6701216322260005, + "grad_norm": 3.240564549955931, + "learning_rate": 5.9289844998909415e-06, + "loss": 1.4534, + "step": 20495 + }, + { + "epoch": 0.6702851164007324, + "grad_norm": 3.0803976280728618, + "learning_rate": 5.923772670774948e-06, + "loss": 1.3925, + "step": 20500 + }, + { + "epoch": 0.6704486005754643, + "grad_norm": 3.1622260809741287, + "learning_rate": 5.918562169186326e-06, + "loss": 1.3908, + "step": 20505 + }, + { + "epoch": 0.6706120847501962, + "grad_norm": 3.137702649524009, + "learning_rate": 5.913352996822004e-06, + "loss": 1.4505, + "step": 20510 + }, + { + "epoch": 0.6707755689249281, + "grad_norm": 3.2528226727193887, + "learning_rate": 5.908145155378484e-06, + "loss": 1.4366, + "step": 20515 + }, + { + "epoch": 0.67093905309966, + "grad_norm": 3.2495878311133057, + "learning_rate": 5.90293864655183e-06, + "loss": 1.3169, + "step": 20520 + }, + { + "epoch": 0.6711025372743918, + "grad_norm": 3.1028400902035744, + "learning_rate": 5.8977334720376775e-06, + "loss": 1.3074, + "step": 20525 + }, + { + "epoch": 0.6712660214491237, + "grad_norm": 3.3315628008881495, + "learning_rate": 5.892529633531221e-06, + "loss": 1.4732, + "step": 20530 + }, + { + "epoch": 0.6714295056238556, + "grad_norm": 3.244265291002833, + "learning_rate": 5.887327132727225e-06, + "loss": 1.2999, + "step": 20535 + }, + { + "epoch": 0.6715929897985875, + "grad_norm": 3.4449441364568285, + "learning_rate": 5.882125971320019e-06, + "loss": 1.4359, + "step": 20540 + }, + { + "epoch": 0.6717564739733194, + "grad_norm": 3.1714623438253664, + "learning_rate": 5.87692615100349e-06, + "loss": 1.4978, + "step": 20545 + }, + { + "epoch": 0.6719199581480513, + "grad_norm": 3.1704258888763492, + "learning_rate": 5.871727673471095e-06, + "loss": 1.3683, + "step": 20550 + }, + { + "epoch": 0.6720834423227832, + "grad_norm": 3.123288705951812, + "learning_rate": 5.866530540415848e-06, + "loss": 1.3459, + "step": 20555 + }, + { + "epoch": 0.672246926497515, + "grad_norm": 4.073529893676772, + "learning_rate": 5.861334753530328e-06, + "loss": 1.2676, + "step": 20560 + }, + { + "epoch": 0.6724104106722469, + "grad_norm": 3.2280590266394458, + "learning_rate": 5.856140314506677e-06, + "loss": 1.4579, + "step": 20565 + }, + { + "epoch": 0.6725738948469788, + "grad_norm": 3.1850547785593615, + "learning_rate": 5.850947225036595e-06, + "loss": 1.4195, + "step": 20570 + }, + { + "epoch": 0.6727373790217107, + "grad_norm": 3.413823763099731, + "learning_rate": 5.845755486811346e-06, + "loss": 1.4739, + "step": 20575 + }, + { + "epoch": 0.6729008631964426, + "grad_norm": 2.8382387610397033, + "learning_rate": 5.840565101521751e-06, + "loss": 1.4027, + "step": 20580 + }, + { + "epoch": 0.6730643473711745, + "grad_norm": 3.1702187693448773, + "learning_rate": 5.835376070858192e-06, + "loss": 1.3526, + "step": 20585 + }, + { + "epoch": 0.6732278315459064, + "grad_norm": 3.2309206847171383, + "learning_rate": 5.830188396510606e-06, + "loss": 1.425, + "step": 20590 + }, + { + "epoch": 0.6733913157206383, + "grad_norm": 3.398686112622928, + "learning_rate": 5.825002080168498e-06, + "loss": 1.5046, + "step": 20595 + }, + { + "epoch": 0.6735547998953701, + "grad_norm": 3.6001333188245335, + "learning_rate": 5.819817123520917e-06, + "loss": 1.4816, + "step": 20600 + }, + { + "epoch": 0.673718284070102, + "grad_norm": 3.064073347890429, + "learning_rate": 5.8146335282564814e-06, + "loss": 1.3127, + "step": 20605 + }, + { + "epoch": 0.6738817682448339, + "grad_norm": 3.0626036037474442, + "learning_rate": 5.809451296063358e-06, + "loss": 1.6083, + "step": 20610 + }, + { + "epoch": 0.6740452524195658, + "grad_norm": 3.2040668493526763, + "learning_rate": 5.8042704286292705e-06, + "loss": 1.4246, + "step": 20615 + }, + { + "epoch": 0.6742087365942977, + "grad_norm": 3.4033581845642513, + "learning_rate": 5.7990909276415105e-06, + "loss": 1.4768, + "step": 20620 + }, + { + "epoch": 0.6743722207690296, + "grad_norm": 3.092098336095807, + "learning_rate": 5.793912794786903e-06, + "loss": 1.399, + "step": 20625 + }, + { + "epoch": 0.6745357049437615, + "grad_norm": 3.405769463414941, + "learning_rate": 5.788736031751849e-06, + "loss": 1.4667, + "step": 20630 + }, + { + "epoch": 0.6746991891184934, + "grad_norm": 3.087251731239467, + "learning_rate": 5.783560640222283e-06, + "loss": 1.5241, + "step": 20635 + }, + { + "epoch": 0.6748626732932252, + "grad_norm": 3.0937844150445937, + "learning_rate": 5.77838662188371e-06, + "loss": 1.314, + "step": 20640 + }, + { + "epoch": 0.6750261574679571, + "grad_norm": 3.2117768296707037, + "learning_rate": 5.7732139784211835e-06, + "loss": 1.45, + "step": 20645 + }, + { + "epoch": 0.675189641642689, + "grad_norm": 3.3398023229600193, + "learning_rate": 5.768042711519299e-06, + "loss": 1.4095, + "step": 20650 + }, + { + "epoch": 0.6753531258174209, + "grad_norm": 3.035976758591805, + "learning_rate": 5.76287282286222e-06, + "loss": 1.4386, + "step": 20655 + }, + { + "epoch": 0.6755166099921528, + "grad_norm": 3.3504008659430924, + "learning_rate": 5.757704314133643e-06, + "loss": 1.4373, + "step": 20660 + }, + { + "epoch": 0.6756800941668847, + "grad_norm": 3.315243252886304, + "learning_rate": 5.752537187016829e-06, + "loss": 1.38, + "step": 20665 + }, + { + "epoch": 0.6758435783416166, + "grad_norm": 3.62176083273675, + "learning_rate": 5.747371443194589e-06, + "loss": 1.4972, + "step": 20670 + }, + { + "epoch": 0.6760070625163485, + "grad_norm": 3.1024544619385286, + "learning_rate": 5.742207084349274e-06, + "loss": 1.2941, + "step": 20675 + }, + { + "epoch": 0.6761705466910803, + "grad_norm": 3.1931686701361524, + "learning_rate": 5.737044112162793e-06, + "loss": 1.3211, + "step": 20680 + }, + { + "epoch": 0.6763340308658122, + "grad_norm": 3.1134949904458784, + "learning_rate": 5.731882528316592e-06, + "loss": 1.5617, + "step": 20685 + }, + { + "epoch": 0.6764975150405441, + "grad_norm": 3.195845537706994, + "learning_rate": 5.726722334491684e-06, + "loss": 1.3317, + "step": 20690 + }, + { + "epoch": 0.676660999215276, + "grad_norm": 3.2387106700921753, + "learning_rate": 5.721563532368605e-06, + "loss": 1.5196, + "step": 20695 + }, + { + "epoch": 0.6768244833900079, + "grad_norm": 3.0765690426770913, + "learning_rate": 5.716406123627458e-06, + "loss": 1.4196, + "step": 20700 + }, + { + "epoch": 0.6769879675647398, + "grad_norm": 3.078390382541569, + "learning_rate": 5.711250109947887e-06, + "loss": 1.2523, + "step": 20705 + }, + { + "epoch": 0.6771514517394717, + "grad_norm": 3.4303708969004956, + "learning_rate": 5.706095493009072e-06, + "loss": 1.5546, + "step": 20710 + }, + { + "epoch": 0.6773149359142036, + "grad_norm": 2.9547338907892087, + "learning_rate": 5.7009422744897525e-06, + "loss": 1.3677, + "step": 20715 + }, + { + "epoch": 0.6774784200889354, + "grad_norm": 2.97365408255052, + "learning_rate": 5.695790456068198e-06, + "loss": 1.2989, + "step": 20720 + }, + { + "epoch": 0.6776419042636673, + "grad_norm": 3.2472586916155985, + "learning_rate": 5.690640039422235e-06, + "loss": 1.4295, + "step": 20725 + }, + { + "epoch": 0.6778053884383992, + "grad_norm": 3.170672009427189, + "learning_rate": 5.6854910262292294e-06, + "loss": 1.3764, + "step": 20730 + }, + { + "epoch": 0.6779688726131311, + "grad_norm": 3.165870874840406, + "learning_rate": 5.680343418166083e-06, + "loss": 1.4517, + "step": 20735 + }, + { + "epoch": 0.678132356787863, + "grad_norm": 3.344436984884924, + "learning_rate": 5.675197216909252e-06, + "loss": 1.4077, + "step": 20740 + }, + { + "epoch": 0.6782958409625949, + "grad_norm": 3.214145479875027, + "learning_rate": 5.67005242413472e-06, + "loss": 1.392, + "step": 20745 + }, + { + "epoch": 0.6784593251373267, + "grad_norm": 3.200748075350781, + "learning_rate": 5.664909041518025e-06, + "loss": 1.5212, + "step": 20750 + }, + { + "epoch": 0.6786228093120585, + "grad_norm": 3.382968987085184, + "learning_rate": 5.659767070734249e-06, + "loss": 1.4136, + "step": 20755 + }, + { + "epoch": 0.6787862934867904, + "grad_norm": 3.4164813938748786, + "learning_rate": 5.654626513457988e-06, + "loss": 1.4498, + "step": 20760 + }, + { + "epoch": 0.6789497776615223, + "grad_norm": 3.280706544880252, + "learning_rate": 5.649487371363407e-06, + "loss": 1.5148, + "step": 20765 + }, + { + "epoch": 0.6791132618362542, + "grad_norm": 3.3566017302968656, + "learning_rate": 5.644349646124199e-06, + "loss": 1.491, + "step": 20770 + }, + { + "epoch": 0.6792767460109861, + "grad_norm": 3.121715232218829, + "learning_rate": 5.639213339413587e-06, + "loss": 1.5111, + "step": 20775 + }, + { + "epoch": 0.679440230185718, + "grad_norm": 3.097957810599353, + "learning_rate": 5.634078452904353e-06, + "loss": 1.4613, + "step": 20780 + }, + { + "epoch": 0.6796037143604499, + "grad_norm": 3.3663299059242844, + "learning_rate": 5.6289449882687895e-06, + "loss": 1.3998, + "step": 20785 + }, + { + "epoch": 0.6797671985351817, + "grad_norm": 3.3611912886872757, + "learning_rate": 5.623812947178748e-06, + "loss": 1.5217, + "step": 20790 + }, + { + "epoch": 0.6799306827099136, + "grad_norm": 3.030406009443487, + "learning_rate": 5.618682331305614e-06, + "loss": 1.3981, + "step": 20795 + }, + { + "epoch": 0.6800941668846455, + "grad_norm": 3.119599560952173, + "learning_rate": 5.6135531423202915e-06, + "loss": 1.3012, + "step": 20800 + }, + { + "epoch": 0.6802576510593774, + "grad_norm": 3.1637327505051234, + "learning_rate": 5.608425381893241e-06, + "loss": 1.3851, + "step": 20805 + }, + { + "epoch": 0.6804211352341093, + "grad_norm": 3.1848000877309306, + "learning_rate": 5.603299051694442e-06, + "loss": 1.4151, + "step": 20810 + }, + { + "epoch": 0.6805846194088412, + "grad_norm": 3.0831533801439277, + "learning_rate": 5.598174153393421e-06, + "loss": 1.4169, + "step": 20815 + }, + { + "epoch": 0.6807481035835731, + "grad_norm": 3.40975325878482, + "learning_rate": 5.593050688659223e-06, + "loss": 1.4782, + "step": 20820 + }, + { + "epoch": 0.680911587758305, + "grad_norm": 3.23471863595697, + "learning_rate": 5.587928659160442e-06, + "loss": 1.3851, + "step": 20825 + }, + { + "epoch": 0.6810750719330368, + "grad_norm": 3.1219819420547656, + "learning_rate": 5.582808066565198e-06, + "loss": 1.5025, + "step": 20830 + }, + { + "epoch": 0.6812385561077687, + "grad_norm": 3.4209561808409727, + "learning_rate": 5.577688912541137e-06, + "loss": 1.3795, + "step": 20835 + }, + { + "epoch": 0.6814020402825006, + "grad_norm": 3.2313333830642352, + "learning_rate": 5.57257119875545e-06, + "loss": 1.4722, + "step": 20840 + }, + { + "epoch": 0.6815655244572325, + "grad_norm": 3.07567565857543, + "learning_rate": 5.5674549268748426e-06, + "loss": 1.3419, + "step": 20845 + }, + { + "epoch": 0.6817290086319644, + "grad_norm": 3.419847355240763, + "learning_rate": 5.562340098565562e-06, + "loss": 1.5109, + "step": 20850 + }, + { + "epoch": 0.6818924928066963, + "grad_norm": 3.105934135673401, + "learning_rate": 5.557226715493387e-06, + "loss": 1.4667, + "step": 20855 + }, + { + "epoch": 0.6820559769814282, + "grad_norm": 3.6020447626243133, + "learning_rate": 5.552114779323614e-06, + "loss": 1.5218, + "step": 20860 + }, + { + "epoch": 0.6822194611561601, + "grad_norm": 3.2900203101720202, + "learning_rate": 5.547004291721082e-06, + "loss": 1.2724, + "step": 20865 + }, + { + "epoch": 0.6823829453308919, + "grad_norm": 2.9550237922847473, + "learning_rate": 5.541895254350145e-06, + "loss": 1.3917, + "step": 20870 + }, + { + "epoch": 0.6825464295056238, + "grad_norm": 3.0647194457716926, + "learning_rate": 5.536787668874694e-06, + "loss": 1.4379, + "step": 20875 + }, + { + "epoch": 0.6827099136803557, + "grad_norm": 3.098623164710192, + "learning_rate": 5.531681536958151e-06, + "loss": 1.2718, + "step": 20880 + }, + { + "epoch": 0.6828733978550876, + "grad_norm": 3.1816157486728893, + "learning_rate": 5.5265768602634485e-06, + "loss": 1.3405, + "step": 20885 + }, + { + "epoch": 0.6830368820298195, + "grad_norm": 3.484143792755321, + "learning_rate": 5.5214736404530615e-06, + "loss": 1.3876, + "step": 20890 + }, + { + "epoch": 0.6832003662045514, + "grad_norm": 3.3584115622612245, + "learning_rate": 5.516371879188975e-06, + "loss": 1.3763, + "step": 20895 + }, + { + "epoch": 0.6833638503792833, + "grad_norm": 3.382018807963323, + "learning_rate": 5.511271578132715e-06, + "loss": 1.3964, + "step": 20900 + }, + { + "epoch": 0.6835273345540152, + "grad_norm": 3.1939818936967335, + "learning_rate": 5.506172738945327e-06, + "loss": 1.4289, + "step": 20905 + }, + { + "epoch": 0.683690818728747, + "grad_norm": 3.055133880918371, + "learning_rate": 5.501075363287369e-06, + "loss": 1.4406, + "step": 20910 + }, + { + "epoch": 0.6838543029034789, + "grad_norm": 3.170544817896109, + "learning_rate": 5.49597945281894e-06, + "loss": 1.4537, + "step": 20915 + }, + { + "epoch": 0.6840177870782108, + "grad_norm": 3.2741467967052476, + "learning_rate": 5.490885009199647e-06, + "loss": 1.4048, + "step": 20920 + }, + { + "epoch": 0.6841812712529427, + "grad_norm": 3.1791117643155853, + "learning_rate": 5.4857920340886265e-06, + "loss": 1.444, + "step": 20925 + }, + { + "epoch": 0.6843447554276746, + "grad_norm": 3.064993839098502, + "learning_rate": 5.480700529144541e-06, + "loss": 1.5079, + "step": 20930 + }, + { + "epoch": 0.6845082396024065, + "grad_norm": 3.0054493482283626, + "learning_rate": 5.475610496025561e-06, + "loss": 1.3751, + "step": 20935 + }, + { + "epoch": 0.6846717237771384, + "grad_norm": 3.443005427657363, + "learning_rate": 5.470521936389392e-06, + "loss": 1.4514, + "step": 20940 + }, + { + "epoch": 0.6848352079518703, + "grad_norm": 3.227186179230407, + "learning_rate": 5.46543485189325e-06, + "loss": 1.364, + "step": 20945 + }, + { + "epoch": 0.6849986921266021, + "grad_norm": 3.2110365448338554, + "learning_rate": 5.460349244193877e-06, + "loss": 1.5459, + "step": 20950 + }, + { + "epoch": 0.685162176301334, + "grad_norm": 3.2516180827752774, + "learning_rate": 5.455265114947524e-06, + "loss": 1.4251, + "step": 20955 + }, + { + "epoch": 0.6853256604760659, + "grad_norm": 3.612113355051276, + "learning_rate": 5.450182465809971e-06, + "loss": 1.4343, + "step": 20960 + }, + { + "epoch": 0.6854891446507978, + "grad_norm": 3.250038353663565, + "learning_rate": 5.445101298436522e-06, + "loss": 1.3692, + "step": 20965 + }, + { + "epoch": 0.6856526288255297, + "grad_norm": 2.9823980110518056, + "learning_rate": 5.4400216144819705e-06, + "loss": 1.3761, + "step": 20970 + }, + { + "epoch": 0.6858161130002616, + "grad_norm": 2.979021380384431, + "learning_rate": 5.4349434156006555e-06, + "loss": 1.2409, + "step": 20975 + }, + { + "epoch": 0.6859795971749935, + "grad_norm": 3.350067294232124, + "learning_rate": 5.429866703446424e-06, + "loss": 1.3879, + "step": 20980 + }, + { + "epoch": 0.6861430813497253, + "grad_norm": 3.0707431045505027, + "learning_rate": 5.42479147967263e-06, + "loss": 1.3979, + "step": 20985 + }, + { + "epoch": 0.6863065655244572, + "grad_norm": 3.1134287225511827, + "learning_rate": 5.419717745932156e-06, + "loss": 1.2694, + "step": 20990 + }, + { + "epoch": 0.6864700496991891, + "grad_norm": 3.26975357275247, + "learning_rate": 5.4146455038773874e-06, + "loss": 1.4565, + "step": 20995 + }, + { + "epoch": 0.686633533873921, + "grad_norm": 2.849887595489892, + "learning_rate": 5.40957475516023e-06, + "loss": 1.3173, + "step": 21000 + }, + { + "epoch": 0.6867970180486529, + "grad_norm": 3.0544520722498794, + "learning_rate": 5.404505501432109e-06, + "loss": 1.3546, + "step": 21005 + }, + { + "epoch": 0.6869605022233848, + "grad_norm": 3.285513177567724, + "learning_rate": 5.399437744343946e-06, + "loss": 1.3707, + "step": 21010 + }, + { + "epoch": 0.6871239863981167, + "grad_norm": 3.1922794849481257, + "learning_rate": 5.394371485546195e-06, + "loss": 1.4554, + "step": 21015 + }, + { + "epoch": 0.6872874705728486, + "grad_norm": 3.377484061023215, + "learning_rate": 5.389306726688803e-06, + "loss": 1.4674, + "step": 21020 + }, + { + "epoch": 0.6874509547475804, + "grad_norm": 3.157247512800755, + "learning_rate": 5.384243469421244e-06, + "loss": 1.4841, + "step": 21025 + }, + { + "epoch": 0.6876144389223123, + "grad_norm": 3.171407223243941, + "learning_rate": 5.379181715392499e-06, + "loss": 1.4615, + "step": 21030 + }, + { + "epoch": 0.6877779230970442, + "grad_norm": 3.3514692693921164, + "learning_rate": 5.37412146625105e-06, + "loss": 1.4526, + "step": 21035 + }, + { + "epoch": 0.6879414072717761, + "grad_norm": 3.2905934765589477, + "learning_rate": 5.3690627236449025e-06, + "loss": 1.5203, + "step": 21040 + }, + { + "epoch": 0.688104891446508, + "grad_norm": 3.4516036894469755, + "learning_rate": 5.36400548922156e-06, + "loss": 1.3991, + "step": 21045 + }, + { + "epoch": 0.6882683756212399, + "grad_norm": 2.9799173463468867, + "learning_rate": 5.358949764628041e-06, + "loss": 1.3797, + "step": 21050 + }, + { + "epoch": 0.6884318597959718, + "grad_norm": 3.1092925887800096, + "learning_rate": 5.353895551510877e-06, + "loss": 1.4186, + "step": 21055 + }, + { + "epoch": 0.6885953439707037, + "grad_norm": 3.10804415758157, + "learning_rate": 5.348842851516094e-06, + "loss": 1.3164, + "step": 21060 + }, + { + "epoch": 0.6887588281454355, + "grad_norm": 2.993021101762764, + "learning_rate": 5.343791666289238e-06, + "loss": 1.3694, + "step": 21065 + }, + { + "epoch": 0.6889223123201674, + "grad_norm": 3.1262765106717754, + "learning_rate": 5.33874199747535e-06, + "loss": 1.3005, + "step": 21070 + }, + { + "epoch": 0.6890857964948993, + "grad_norm": 3.1766696030412542, + "learning_rate": 5.3336938467189906e-06, + "loss": 1.3685, + "step": 21075 + }, + { + "epoch": 0.6892492806696312, + "grad_norm": 3.062926292380093, + "learning_rate": 5.328647215664211e-06, + "loss": 1.4884, + "step": 21080 + }, + { + "epoch": 0.6894127648443631, + "grad_norm": 3.1366273280015418, + "learning_rate": 5.32360210595458e-06, + "loss": 1.4287, + "step": 21085 + }, + { + "epoch": 0.689576249019095, + "grad_norm": 3.22429672683818, + "learning_rate": 5.31855851923317e-06, + "loss": 1.4794, + "step": 21090 + }, + { + "epoch": 0.6897397331938269, + "grad_norm": 3.3050812021063143, + "learning_rate": 5.313516457142545e-06, + "loss": 1.416, + "step": 21095 + }, + { + "epoch": 0.6899032173685588, + "grad_norm": 3.0797413575921198, + "learning_rate": 5.308475921324789e-06, + "loss": 1.4237, + "step": 21100 + }, + { + "epoch": 0.6900667015432906, + "grad_norm": 3.228676781712744, + "learning_rate": 5.303436913421475e-06, + "loss": 1.4945, + "step": 21105 + }, + { + "epoch": 0.6902301857180225, + "grad_norm": 3.299954497411795, + "learning_rate": 5.2983994350736865e-06, + "loss": 1.478, + "step": 21110 + }, + { + "epoch": 0.6903936698927544, + "grad_norm": 3.253754994044221, + "learning_rate": 5.293363487922011e-06, + "loss": 1.5016, + "step": 21115 + }, + { + "epoch": 0.6905571540674863, + "grad_norm": 3.3886671733580016, + "learning_rate": 5.2883290736065245e-06, + "loss": 1.3765, + "step": 21120 + }, + { + "epoch": 0.6907206382422182, + "grad_norm": 3.3058447334968077, + "learning_rate": 5.283296193766822e-06, + "loss": 1.4164, + "step": 21125 + }, + { + "epoch": 0.6908841224169501, + "grad_norm": 3.0519575443811866, + "learning_rate": 5.27826485004198e-06, + "loss": 1.353, + "step": 21130 + }, + { + "epoch": 0.691047606591682, + "grad_norm": 3.0125473996748573, + "learning_rate": 5.273235044070589e-06, + "loss": 1.4624, + "step": 21135 + }, + { + "epoch": 0.6912110907664138, + "grad_norm": 3.143183962449347, + "learning_rate": 5.2682067774907355e-06, + "loss": 1.3238, + "step": 21140 + }, + { + "epoch": 0.6913745749411457, + "grad_norm": 3.2398467796118187, + "learning_rate": 5.263180051939995e-06, + "loss": 1.5274, + "step": 21145 + }, + { + "epoch": 0.6915380591158776, + "grad_norm": 3.2625751471434343, + "learning_rate": 5.258154869055461e-06, + "loss": 1.5085, + "step": 21150 + }, + { + "epoch": 0.6917015432906095, + "grad_norm": 2.7790929944479945, + "learning_rate": 5.2531312304736995e-06, + "loss": 1.2199, + "step": 21155 + }, + { + "epoch": 0.6918650274653414, + "grad_norm": 3.057678715737522, + "learning_rate": 5.248109137830792e-06, + "loss": 1.3619, + "step": 21160 + }, + { + "epoch": 0.6920285116400733, + "grad_norm": 3.138234751796001, + "learning_rate": 5.243088592762315e-06, + "loss": 1.4149, + "step": 21165 + }, + { + "epoch": 0.6921919958148052, + "grad_norm": 3.390377286963257, + "learning_rate": 5.2380695969033345e-06, + "loss": 1.4598, + "step": 21170 + }, + { + "epoch": 0.6923554799895371, + "grad_norm": 3.1009711966736635, + "learning_rate": 5.233052151888409e-06, + "loss": 1.3763, + "step": 21175 + }, + { + "epoch": 0.692518964164269, + "grad_norm": 3.2399777775948144, + "learning_rate": 5.228036259351605e-06, + "loss": 1.535, + "step": 21180 + }, + { + "epoch": 0.6926824483390008, + "grad_norm": 3.759151239299678, + "learning_rate": 5.22302192092647e-06, + "loss": 1.4537, + "step": 21185 + }, + { + "epoch": 0.6928459325137327, + "grad_norm": 3.1595798360417744, + "learning_rate": 5.218009138246056e-06, + "loss": 1.3552, + "step": 21190 + }, + { + "epoch": 0.6930094166884646, + "grad_norm": 3.3014409136297185, + "learning_rate": 5.212997912942898e-06, + "loss": 1.4836, + "step": 21195 + }, + { + "epoch": 0.6931729008631965, + "grad_norm": 3.2833182207858806, + "learning_rate": 5.207988246649033e-06, + "loss": 1.3942, + "step": 21200 + }, + { + "epoch": 0.6933363850379284, + "grad_norm": 3.2289216509754635, + "learning_rate": 5.20298014099599e-06, + "loss": 1.5574, + "step": 21205 + }, + { + "epoch": 0.6934998692126603, + "grad_norm": 3.1346762312903307, + "learning_rate": 5.197973597614777e-06, + "loss": 1.4562, + "step": 21210 + }, + { + "epoch": 0.693663353387392, + "grad_norm": 3.137417913231582, + "learning_rate": 5.192968618135913e-06, + "loss": 1.3303, + "step": 21215 + }, + { + "epoch": 0.6938268375621239, + "grad_norm": 3.080843748148865, + "learning_rate": 5.187965204189388e-06, + "loss": 1.471, + "step": 21220 + }, + { + "epoch": 0.6939903217368558, + "grad_norm": 3.0721784711947873, + "learning_rate": 5.182963357404699e-06, + "loss": 1.3827, + "step": 21225 + }, + { + "epoch": 0.6941538059115877, + "grad_norm": 3.2685727076289752, + "learning_rate": 5.177963079410817e-06, + "loss": 1.3142, + "step": 21230 + }, + { + "epoch": 0.6943172900863196, + "grad_norm": 3.1968187413023403, + "learning_rate": 5.172964371836215e-06, + "loss": 1.3657, + "step": 21235 + }, + { + "epoch": 0.6944807742610515, + "grad_norm": 3.238151576309582, + "learning_rate": 5.167967236308853e-06, + "loss": 1.3101, + "step": 21240 + }, + { + "epoch": 0.6946442584357834, + "grad_norm": 3.4416243363755017, + "learning_rate": 5.162971674456168e-06, + "loss": 1.3935, + "step": 21245 + }, + { + "epoch": 0.6948077426105153, + "grad_norm": 3.195953395960606, + "learning_rate": 5.157977687905099e-06, + "loss": 1.3743, + "step": 21250 + }, + { + "epoch": 0.6949712267852471, + "grad_norm": 2.9642915203902924, + "learning_rate": 5.152985278282059e-06, + "loss": 1.348, + "step": 21255 + }, + { + "epoch": 0.695134710959979, + "grad_norm": 3.2501281925554624, + "learning_rate": 5.147994447212954e-06, + "loss": 1.4291, + "step": 21260 + }, + { + "epoch": 0.6952981951347109, + "grad_norm": 3.273581701504791, + "learning_rate": 5.143005196323183e-06, + "loss": 1.3896, + "step": 21265 + }, + { + "epoch": 0.6954616793094428, + "grad_norm": 3.1284963351369446, + "learning_rate": 5.138017527237613e-06, + "loss": 1.3209, + "step": 21270 + }, + { + "epoch": 0.6956251634841747, + "grad_norm": 3.2510962936670817, + "learning_rate": 5.133031441580614e-06, + "loss": 1.4556, + "step": 21275 + }, + { + "epoch": 0.6957886476589066, + "grad_norm": 3.0682445156025007, + "learning_rate": 5.128046940976024e-06, + "loss": 1.3247, + "step": 21280 + }, + { + "epoch": 0.6959521318336385, + "grad_norm": 3.083929663129531, + "learning_rate": 5.123064027047177e-06, + "loss": 1.3149, + "step": 21285 + }, + { + "epoch": 0.6961156160083704, + "grad_norm": 3.254487058938962, + "learning_rate": 5.1180827014168884e-06, + "loss": 1.4334, + "step": 21290 + }, + { + "epoch": 0.6962791001831022, + "grad_norm": 126.28668135719141, + "learning_rate": 5.113102965707449e-06, + "loss": 1.3422, + "step": 21295 + }, + { + "epoch": 0.6964425843578341, + "grad_norm": 3.7160965825136927, + "learning_rate": 5.108124821540642e-06, + "loss": 1.5979, + "step": 21300 + }, + { + "epoch": 0.696606068532566, + "grad_norm": 3.455305371743768, + "learning_rate": 5.10314827053772e-06, + "loss": 1.4999, + "step": 21305 + }, + { + "epoch": 0.6967695527072979, + "grad_norm": 3.05170440575876, + "learning_rate": 5.098173314319428e-06, + "loss": 1.2936, + "step": 21310 + }, + { + "epoch": 0.6969330368820298, + "grad_norm": 3.2741594935940537, + "learning_rate": 5.09319995450599e-06, + "loss": 1.449, + "step": 21315 + }, + { + "epoch": 0.6970965210567617, + "grad_norm": 3.1785176522726397, + "learning_rate": 5.0882281927171e-06, + "loss": 1.4802, + "step": 21320 + }, + { + "epoch": 0.6972600052314936, + "grad_norm": 3.1525584668492592, + "learning_rate": 5.083258030571949e-06, + "loss": 1.337, + "step": 21325 + }, + { + "epoch": 0.6974234894062254, + "grad_norm": 3.088531399320851, + "learning_rate": 5.078289469689186e-06, + "loss": 1.4167, + "step": 21330 + }, + { + "epoch": 0.6975869735809573, + "grad_norm": 3.2485361147809164, + "learning_rate": 5.07332251168696e-06, + "loss": 1.4496, + "step": 21335 + }, + { + "epoch": 0.6977504577556892, + "grad_norm": 3.1240759510835954, + "learning_rate": 5.068357158182877e-06, + "loss": 1.3268, + "step": 21340 + }, + { + "epoch": 0.6979139419304211, + "grad_norm": 3.042903149631404, + "learning_rate": 5.063393410794038e-06, + "loss": 1.337, + "step": 21345 + }, + { + "epoch": 0.698077426105153, + "grad_norm": 3.284048755659099, + "learning_rate": 5.058431271137015e-06, + "loss": 1.4868, + "step": 21350 + }, + { + "epoch": 0.6982409102798849, + "grad_norm": 3.2892751520351378, + "learning_rate": 5.0534707408278495e-06, + "loss": 1.4757, + "step": 21355 + }, + { + "epoch": 0.6984043944546168, + "grad_norm": 3.054362528434794, + "learning_rate": 5.0485118214820715e-06, + "loss": 1.345, + "step": 21360 + }, + { + "epoch": 0.6985678786293487, + "grad_norm": 3.248295813372091, + "learning_rate": 5.0435545147146724e-06, + "loss": 1.479, + "step": 21365 + }, + { + "epoch": 0.6987313628040805, + "grad_norm": 3.1296464765211107, + "learning_rate": 5.0385988221401286e-06, + "loss": 1.5118, + "step": 21370 + }, + { + "epoch": 0.6988948469788124, + "grad_norm": 3.2366367273335483, + "learning_rate": 5.033644745372396e-06, + "loss": 1.5426, + "step": 21375 + }, + { + "epoch": 0.6990583311535443, + "grad_norm": 3.3574711487818476, + "learning_rate": 5.028692286024881e-06, + "loss": 1.4521, + "step": 21380 + }, + { + "epoch": 0.6992218153282762, + "grad_norm": 3.0472413514296965, + "learning_rate": 5.023741445710484e-06, + "loss": 1.2758, + "step": 21385 + }, + { + "epoch": 0.6993852995030081, + "grad_norm": 3.1555057393375083, + "learning_rate": 5.01879222604158e-06, + "loss": 1.4424, + "step": 21390 + }, + { + "epoch": 0.69954878367774, + "grad_norm": 3.163669966260507, + "learning_rate": 5.013844628629996e-06, + "loss": 1.4428, + "step": 21395 + }, + { + "epoch": 0.6997122678524719, + "grad_norm": 2.949278952418196, + "learning_rate": 5.008898655087056e-06, + "loss": 1.3863, + "step": 21400 + }, + { + "epoch": 0.6998757520272038, + "grad_norm": 3.119094456020198, + "learning_rate": 5.003954307023531e-06, + "loss": 1.6033, + "step": 21405 + }, + { + "epoch": 0.7000392362019356, + "grad_norm": 2.896034841590897, + "learning_rate": 4.999011586049679e-06, + "loss": 1.4316, + "step": 21410 + }, + { + "epoch": 0.7002027203766675, + "grad_norm": 3.161232228439868, + "learning_rate": 4.994070493775227e-06, + "loss": 1.324, + "step": 21415 + }, + { + "epoch": 0.7003662045513994, + "grad_norm": 3.172934507232258, + "learning_rate": 4.98913103180936e-06, + "loss": 1.4047, + "step": 21420 + }, + { + "epoch": 0.7005296887261313, + "grad_norm": 3.366290279437188, + "learning_rate": 4.984193201760749e-06, + "loss": 1.48, + "step": 21425 + }, + { + "epoch": 0.7006931729008632, + "grad_norm": 3.289970590703026, + "learning_rate": 4.979257005237514e-06, + "loss": 1.623, + "step": 21430 + }, + { + "epoch": 0.7008566570755951, + "grad_norm": 3.4040862698227805, + "learning_rate": 4.974322443847257e-06, + "loss": 1.3899, + "step": 21435 + }, + { + "epoch": 0.701020141250327, + "grad_norm": 3.2387847376704353, + "learning_rate": 4.969389519197051e-06, + "loss": 1.3948, + "step": 21440 + }, + { + "epoch": 0.7011836254250589, + "grad_norm": 2.9083004547250892, + "learning_rate": 4.964458232893418e-06, + "loss": 1.2649, + "step": 21445 + }, + { + "epoch": 0.7013471095997907, + "grad_norm": 3.3318193261701397, + "learning_rate": 4.959528586542365e-06, + "loss": 1.4854, + "step": 21450 + }, + { + "epoch": 0.7015105937745226, + "grad_norm": 3.391804890061995, + "learning_rate": 4.95460058174935e-06, + "loss": 1.5334, + "step": 21455 + }, + { + "epoch": 0.7016740779492545, + "grad_norm": 3.1936303511898525, + "learning_rate": 4.9496742201193074e-06, + "loss": 1.3798, + "step": 21460 + }, + { + "epoch": 0.7018375621239864, + "grad_norm": 2.93809604959758, + "learning_rate": 4.9447495032566365e-06, + "loss": 1.2744, + "step": 21465 + }, + { + "epoch": 0.7020010462987183, + "grad_norm": 3.3290646405638737, + "learning_rate": 4.939826432765189e-06, + "loss": 1.4996, + "step": 21470 + }, + { + "epoch": 0.7021645304734502, + "grad_norm": 3.3048560390156867, + "learning_rate": 4.934905010248295e-06, + "loss": 1.4186, + "step": 21475 + }, + { + "epoch": 0.7023280146481821, + "grad_norm": 3.2950252316460906, + "learning_rate": 4.929985237308735e-06, + "loss": 1.4736, + "step": 21480 + }, + { + "epoch": 0.702491498822914, + "grad_norm": 3.322371142532792, + "learning_rate": 4.925067115548766e-06, + "loss": 1.3974, + "step": 21485 + }, + { + "epoch": 0.7026549829976458, + "grad_norm": 3.2078438293208955, + "learning_rate": 4.920150646570091e-06, + "loss": 1.4396, + "step": 21490 + }, + { + "epoch": 0.7028184671723777, + "grad_norm": 3.385215863203908, + "learning_rate": 4.915235831973889e-06, + "loss": 1.4223, + "step": 21495 + }, + { + "epoch": 0.7029819513471096, + "grad_norm": 3.2026480950001526, + "learning_rate": 4.910322673360797e-06, + "loss": 1.4884, + "step": 21500 + }, + { + "epoch": 0.7031454355218415, + "grad_norm": 3.0398798369341065, + "learning_rate": 4.905411172330903e-06, + "loss": 1.5125, + "step": 21505 + }, + { + "epoch": 0.7033089196965734, + "grad_norm": 3.0651128529217058, + "learning_rate": 4.900501330483771e-06, + "loss": 1.3586, + "step": 21510 + }, + { + "epoch": 0.7034724038713053, + "grad_norm": 3.1370345135059923, + "learning_rate": 4.895593149418409e-06, + "loss": 1.3831, + "step": 21515 + }, + { + "epoch": 0.7036358880460372, + "grad_norm": 2.870048688352406, + "learning_rate": 4.890686630733292e-06, + "loss": 1.3627, + "step": 21520 + }, + { + "epoch": 0.703799372220769, + "grad_norm": 2.9401196728840713, + "learning_rate": 4.8857817760263595e-06, + "loss": 1.2863, + "step": 21525 + }, + { + "epoch": 0.7039628563955009, + "grad_norm": 2.9375153619937593, + "learning_rate": 4.880878586894995e-06, + "loss": 1.41, + "step": 21530 + }, + { + "epoch": 0.7041263405702328, + "grad_norm": 3.018415768817596, + "learning_rate": 4.875977064936054e-06, + "loss": 1.4255, + "step": 21535 + }, + { + "epoch": 0.7042898247449647, + "grad_norm": 3.8861385016304317, + "learning_rate": 4.871077211745834e-06, + "loss": 1.5242, + "step": 21540 + }, + { + "epoch": 0.7044533089196966, + "grad_norm": 3.1145602025713113, + "learning_rate": 4.866179028920101e-06, + "loss": 1.4516, + "step": 21545 + }, + { + "epoch": 0.7046167930944285, + "grad_norm": 3.070873988982886, + "learning_rate": 4.861282518054078e-06, + "loss": 1.4022, + "step": 21550 + }, + { + "epoch": 0.7047802772691604, + "grad_norm": 3.0535676934651796, + "learning_rate": 4.85638768074243e-06, + "loss": 1.2726, + "step": 21555 + }, + { + "epoch": 0.7049437614438923, + "grad_norm": 3.296963049271789, + "learning_rate": 4.851494518579294e-06, + "loss": 1.4344, + "step": 21560 + }, + { + "epoch": 0.7051072456186241, + "grad_norm": 3.2040617442556387, + "learning_rate": 4.846603033158245e-06, + "loss": 1.3579, + "step": 21565 + }, + { + "epoch": 0.705270729793356, + "grad_norm": 3.197058698036215, + "learning_rate": 4.841713226072323e-06, + "loss": 1.388, + "step": 21570 + }, + { + "epoch": 0.7054342139680879, + "grad_norm": 3.17600705395378, + "learning_rate": 4.836825098914024e-06, + "loss": 1.481, + "step": 21575 + }, + { + "epoch": 0.7055976981428198, + "grad_norm": 2.9836015436583065, + "learning_rate": 4.831938653275282e-06, + "loss": 1.4788, + "step": 21580 + }, + { + "epoch": 0.7057611823175517, + "grad_norm": 3.0889711801865967, + "learning_rate": 4.827053890747501e-06, + "loss": 1.3772, + "step": 21585 + }, + { + "epoch": 0.7059246664922836, + "grad_norm": 3.0859149298643263, + "learning_rate": 4.822170812921524e-06, + "loss": 1.5183, + "step": 21590 + }, + { + "epoch": 0.7060881506670155, + "grad_norm": 3.1990010463389105, + "learning_rate": 4.817289421387646e-06, + "loss": 1.441, + "step": 21595 + }, + { + "epoch": 0.7062516348417474, + "grad_norm": 3.39884111169947, + "learning_rate": 4.8124097177356255e-06, + "loss": 1.4983, + "step": 21600 + }, + { + "epoch": 0.7064151190164792, + "grad_norm": 3.275569585503516, + "learning_rate": 4.807531703554655e-06, + "loss": 1.3491, + "step": 21605 + }, + { + "epoch": 0.7065786031912111, + "grad_norm": 3.0887859817688947, + "learning_rate": 4.802655380433389e-06, + "loss": 1.4722, + "step": 21610 + }, + { + "epoch": 0.706742087365943, + "grad_norm": 3.1999187911485545, + "learning_rate": 4.797780749959921e-06, + "loss": 1.4247, + "step": 21615 + }, + { + "epoch": 0.7069055715406749, + "grad_norm": 3.0113095450720313, + "learning_rate": 4.792907813721802e-06, + "loss": 1.3784, + "step": 21620 + }, + { + "epoch": 0.7070690557154068, + "grad_norm": 3.039715980379901, + "learning_rate": 4.788036573306032e-06, + "loss": 1.3793, + "step": 21625 + }, + { + "epoch": 0.7072325398901387, + "grad_norm": 3.121613149725068, + "learning_rate": 4.783167030299048e-06, + "loss": 1.6006, + "step": 21630 + }, + { + "epoch": 0.7073960240648706, + "grad_norm": 3.2878138212346597, + "learning_rate": 4.778299186286746e-06, + "loss": 1.4578, + "step": 21635 + }, + { + "epoch": 0.7075595082396025, + "grad_norm": 3.2714736896700916, + "learning_rate": 4.773433042854457e-06, + "loss": 1.5252, + "step": 21640 + }, + { + "epoch": 0.7077229924143343, + "grad_norm": 3.220270259447513, + "learning_rate": 4.7685686015869704e-06, + "loss": 1.3254, + "step": 21645 + }, + { + "epoch": 0.7078864765890662, + "grad_norm": 3.2024334221163726, + "learning_rate": 4.763705864068517e-06, + "loss": 1.4162, + "step": 21650 + }, + { + "epoch": 0.7080499607637981, + "grad_norm": 3.3958747472562925, + "learning_rate": 4.758844831882764e-06, + "loss": 1.3555, + "step": 21655 + }, + { + "epoch": 0.70821344493853, + "grad_norm": 3.3207623138980016, + "learning_rate": 4.75398550661284e-06, + "loss": 1.4856, + "step": 21660 + }, + { + "epoch": 0.7083769291132619, + "grad_norm": 3.0775340802119624, + "learning_rate": 4.7491278898412975e-06, + "loss": 1.3582, + "step": 21665 + }, + { + "epoch": 0.7085404132879938, + "grad_norm": 3.1903985058927353, + "learning_rate": 4.7442719831501495e-06, + "loss": 1.5446, + "step": 21670 + }, + { + "epoch": 0.7087038974627257, + "grad_norm": 3.231794999904999, + "learning_rate": 4.739417788120848e-06, + "loss": 1.5396, + "step": 21675 + }, + { + "epoch": 0.7088673816374574, + "grad_norm": 3.1542655109179787, + "learning_rate": 4.734565306334279e-06, + "loss": 1.4856, + "step": 21680 + }, + { + "epoch": 0.7090308658121893, + "grad_norm": 3.1730111634948828, + "learning_rate": 4.7297145393707846e-06, + "loss": 1.336, + "step": 21685 + }, + { + "epoch": 0.7091943499869212, + "grad_norm": 3.1027472111813195, + "learning_rate": 4.7248654888101316e-06, + "loss": 1.491, + "step": 21690 + }, + { + "epoch": 0.7093578341616531, + "grad_norm": 3.1129577520113174, + "learning_rate": 4.720018156231543e-06, + "loss": 1.483, + "step": 21695 + }, + { + "epoch": 0.709521318336385, + "grad_norm": 3.1536423790871724, + "learning_rate": 4.715172543213679e-06, + "loss": 1.3495, + "step": 21700 + }, + { + "epoch": 0.7096848025111169, + "grad_norm": 3.195420787994824, + "learning_rate": 4.710328651334628e-06, + "loss": 1.3725, + "step": 21705 + }, + { + "epoch": 0.7098482866858488, + "grad_norm": 3.0764740406817714, + "learning_rate": 4.705486482171936e-06, + "loss": 1.5176, + "step": 21710 + }, + { + "epoch": 0.7100117708605806, + "grad_norm": 3.2910973270138877, + "learning_rate": 4.700646037302571e-06, + "loss": 1.3727, + "step": 21715 + }, + { + "epoch": 0.7101752550353125, + "grad_norm": 3.3428719779213454, + "learning_rate": 4.695807318302952e-06, + "loss": 1.3698, + "step": 21720 + }, + { + "epoch": 0.7103387392100444, + "grad_norm": 3.3254801265451572, + "learning_rate": 4.690970326748934e-06, + "loss": 1.376, + "step": 21725 + }, + { + "epoch": 0.7105022233847763, + "grad_norm": 3.3525047128612355, + "learning_rate": 4.686135064215799e-06, + "loss": 1.4087, + "step": 21730 + }, + { + "epoch": 0.7106657075595082, + "grad_norm": 3.234418220716236, + "learning_rate": 4.68130153227828e-06, + "loss": 1.4593, + "step": 21735 + }, + { + "epoch": 0.7108291917342401, + "grad_norm": 3.1884002076836744, + "learning_rate": 4.6764697325105355e-06, + "loss": 1.4016, + "step": 21740 + }, + { + "epoch": 0.710992675908972, + "grad_norm": 3.3239415472952043, + "learning_rate": 4.67163966648617e-06, + "loss": 1.5768, + "step": 21745 + }, + { + "epoch": 0.7111561600837039, + "grad_norm": 3.4751881562877958, + "learning_rate": 4.66681133577821e-06, + "loss": 1.4886, + "step": 21750 + }, + { + "epoch": 0.7113196442584357, + "grad_norm": 3.298997375175645, + "learning_rate": 4.661984741959128e-06, + "loss": 1.4226, + "step": 21755 + }, + { + "epoch": 0.7114831284331676, + "grad_norm": 2.9140365555786185, + "learning_rate": 4.657159886600831e-06, + "loss": 1.3413, + "step": 21760 + }, + { + "epoch": 0.7116466126078995, + "grad_norm": 3.1783891246815044, + "learning_rate": 4.6523367712746504e-06, + "loss": 1.5225, + "step": 21765 + }, + { + "epoch": 0.7118100967826314, + "grad_norm": 3.155858331446185, + "learning_rate": 4.647515397551363e-06, + "loss": 1.4018, + "step": 21770 + }, + { + "epoch": 0.7119735809573633, + "grad_norm": 3.017444871914045, + "learning_rate": 4.642695767001164e-06, + "loss": 1.3822, + "step": 21775 + }, + { + "epoch": 0.7121370651320952, + "grad_norm": 3.1239227937091005, + "learning_rate": 4.637877881193693e-06, + "loss": 1.3986, + "step": 21780 + }, + { + "epoch": 0.7123005493068271, + "grad_norm": 3.3392941831265475, + "learning_rate": 4.633061741698023e-06, + "loss": 1.3433, + "step": 21785 + }, + { + "epoch": 0.712464033481559, + "grad_norm": 3.4479973233450045, + "learning_rate": 4.628247350082647e-06, + "loss": 1.443, + "step": 21790 + }, + { + "epoch": 0.7126275176562908, + "grad_norm": 2.833376111106377, + "learning_rate": 4.62343470791549e-06, + "loss": 1.2743, + "step": 21795 + }, + { + "epoch": 0.7127910018310227, + "grad_norm": 2.876186388719106, + "learning_rate": 4.61862381676392e-06, + "loss": 1.3665, + "step": 21800 + }, + { + "epoch": 0.7129544860057546, + "grad_norm": 3.1416176878918582, + "learning_rate": 4.613814678194719e-06, + "loss": 1.3178, + "step": 21805 + }, + { + "epoch": 0.7131179701804865, + "grad_norm": 3.137418010651965, + "learning_rate": 4.609007293774114e-06, + "loss": 1.3078, + "step": 21810 + }, + { + "epoch": 0.7132814543552184, + "grad_norm": 3.3317221154479224, + "learning_rate": 4.6042016650677435e-06, + "loss": 1.3119, + "step": 21815 + }, + { + "epoch": 0.7134449385299503, + "grad_norm": 3.129680399187033, + "learning_rate": 4.599397793640687e-06, + "loss": 1.4031, + "step": 21820 + }, + { + "epoch": 0.7136084227046822, + "grad_norm": 3.1841386984369735, + "learning_rate": 4.594595681057451e-06, + "loss": 1.4514, + "step": 21825 + }, + { + "epoch": 0.713771906879414, + "grad_norm": 3.141214753852827, + "learning_rate": 4.589795328881961e-06, + "loss": 1.4531, + "step": 21830 + }, + { + "epoch": 0.7139353910541459, + "grad_norm": 3.05579347807693, + "learning_rate": 4.584996738677578e-06, + "loss": 1.4035, + "step": 21835 + }, + { + "epoch": 0.7140988752288778, + "grad_norm": 3.0215331872689815, + "learning_rate": 4.58019991200708e-06, + "loss": 1.3558, + "step": 21840 + }, + { + "epoch": 0.7142623594036097, + "grad_norm": 3.418242488566723, + "learning_rate": 4.575404850432679e-06, + "loss": 1.3843, + "step": 21845 + }, + { + "epoch": 0.7144258435783416, + "grad_norm": 3.448530344032106, + "learning_rate": 4.570611555516012e-06, + "loss": 1.4912, + "step": 21850 + }, + { + "epoch": 0.7145893277530735, + "grad_norm": 3.100321070807498, + "learning_rate": 4.565820028818133e-06, + "loss": 1.4615, + "step": 21855 + }, + { + "epoch": 0.7147528119278054, + "grad_norm": 3.110649456897114, + "learning_rate": 4.561030271899529e-06, + "loss": 1.3712, + "step": 21860 + }, + { + "epoch": 0.7149162961025373, + "grad_norm": 3.479746534231634, + "learning_rate": 4.556242286320101e-06, + "loss": 1.4863, + "step": 21865 + }, + { + "epoch": 0.7150797802772692, + "grad_norm": 3.085374117644079, + "learning_rate": 4.551456073639185e-06, + "loss": 1.4851, + "step": 21870 + }, + { + "epoch": 0.715243264452001, + "grad_norm": 3.107143332125927, + "learning_rate": 4.546671635415528e-06, + "loss": 1.3396, + "step": 21875 + }, + { + "epoch": 0.7154067486267329, + "grad_norm": 3.7387559674481237, + "learning_rate": 4.541888973207305e-06, + "loss": 1.3226, + "step": 21880 + }, + { + "epoch": 0.7155702328014648, + "grad_norm": 3.082291004486196, + "learning_rate": 4.537108088572116e-06, + "loss": 1.3622, + "step": 21885 + }, + { + "epoch": 0.7157337169761967, + "grad_norm": 3.163830791675274, + "learning_rate": 4.532328983066974e-06, + "loss": 1.394, + "step": 21890 + }, + { + "epoch": 0.7158972011509286, + "grad_norm": 3.3006205849522794, + "learning_rate": 4.527551658248319e-06, + "loss": 1.3923, + "step": 21895 + }, + { + "epoch": 0.7160606853256605, + "grad_norm": 3.223680354463102, + "learning_rate": 4.5227761156720054e-06, + "loss": 1.4948, + "step": 21900 + }, + { + "epoch": 0.7162241695003924, + "grad_norm": 2.895463734860634, + "learning_rate": 4.518002356893311e-06, + "loss": 1.337, + "step": 21905 + }, + { + "epoch": 0.7163876536751242, + "grad_norm": 3.108306213142484, + "learning_rate": 4.513230383466938e-06, + "loss": 1.3486, + "step": 21910 + }, + { + "epoch": 0.7165511378498561, + "grad_norm": 3.2177102208845314, + "learning_rate": 4.508460196946993e-06, + "loss": 1.5051, + "step": 21915 + }, + { + "epoch": 0.716714622024588, + "grad_norm": 3.290508117900164, + "learning_rate": 4.503691798887015e-06, + "loss": 1.4766, + "step": 21920 + }, + { + "epoch": 0.7168781061993199, + "grad_norm": 3.3144219876966035, + "learning_rate": 4.49892519083995e-06, + "loss": 1.4637, + "step": 21925 + }, + { + "epoch": 0.7170415903740518, + "grad_norm": 3.0959990511196964, + "learning_rate": 4.494160374358168e-06, + "loss": 1.3942, + "step": 21930 + }, + { + "epoch": 0.7172050745487837, + "grad_norm": 3.1356007173208087, + "learning_rate": 4.489397350993454e-06, + "loss": 1.3992, + "step": 21935 + }, + { + "epoch": 0.7173685587235156, + "grad_norm": 3.1646291873341035, + "learning_rate": 4.484636122297003e-06, + "loss": 1.36, + "step": 21940 + }, + { + "epoch": 0.7175320428982475, + "grad_norm": 3.145933214880885, + "learning_rate": 4.479876689819439e-06, + "loss": 1.4745, + "step": 21945 + }, + { + "epoch": 0.7176955270729793, + "grad_norm": 3.121053222980913, + "learning_rate": 4.4751190551107825e-06, + "loss": 1.4043, + "step": 21950 + }, + { + "epoch": 0.7178590112477112, + "grad_norm": 3.1074991765937576, + "learning_rate": 4.470363219720485e-06, + "loss": 1.3463, + "step": 21955 + }, + { + "epoch": 0.7180224954224431, + "grad_norm": 3.1635935835497633, + "learning_rate": 4.465609185197407e-06, + "loss": 1.3397, + "step": 21960 + }, + { + "epoch": 0.718185979597175, + "grad_norm": 3.2393843106769045, + "learning_rate": 4.460856953089815e-06, + "loss": 1.2861, + "step": 21965 + }, + { + "epoch": 0.7183494637719069, + "grad_norm": 3.364426307271133, + "learning_rate": 4.4561065249454005e-06, + "loss": 1.4822, + "step": 21970 + }, + { + "epoch": 0.7185129479466388, + "grad_norm": 3.1613718304398177, + "learning_rate": 4.451357902311256e-06, + "loss": 1.5267, + "step": 21975 + }, + { + "epoch": 0.7186764321213707, + "grad_norm": 2.8145821658819523, + "learning_rate": 4.4466110867338944e-06, + "loss": 1.2928, + "step": 21980 + }, + { + "epoch": 0.7188399162961026, + "grad_norm": 3.154564934174321, + "learning_rate": 4.441866079759241e-06, + "loss": 1.3918, + "step": 21985 + }, + { + "epoch": 0.7190034004708344, + "grad_norm": 3.2036574460372855, + "learning_rate": 4.43712288293262e-06, + "loss": 1.3812, + "step": 21990 + }, + { + "epoch": 0.7191668846455663, + "grad_norm": 3.444539554200854, + "learning_rate": 4.432381497798782e-06, + "loss": 1.3805, + "step": 21995 + }, + { + "epoch": 0.7193303688202982, + "grad_norm": 3.495384559077217, + "learning_rate": 4.427641925901878e-06, + "loss": 1.4879, + "step": 22000 + }, + { + "epoch": 0.7194938529950301, + "grad_norm": 3.191106586941424, + "learning_rate": 4.422904168785466e-06, + "loss": 1.4654, + "step": 22005 + }, + { + "epoch": 0.719657337169762, + "grad_norm": 3.2717529379508, + "learning_rate": 4.418168227992523e-06, + "loss": 1.4451, + "step": 22010 + }, + { + "epoch": 0.7198208213444939, + "grad_norm": 3.155639966326062, + "learning_rate": 4.413434105065424e-06, + "loss": 1.39, + "step": 22015 + }, + { + "epoch": 0.7199843055192258, + "grad_norm": 3.2681276796004552, + "learning_rate": 4.4087018015459635e-06, + "loss": 1.3703, + "step": 22020 + }, + { + "epoch": 0.7201477896939577, + "grad_norm": 3.191607517402653, + "learning_rate": 4.403971318975329e-06, + "loss": 1.4461, + "step": 22025 + }, + { + "epoch": 0.7203112738686895, + "grad_norm": 3.145037732377433, + "learning_rate": 4.399242658894125e-06, + "loss": 1.4059, + "step": 22030 + }, + { + "epoch": 0.7204747580434214, + "grad_norm": 3.316539416274342, + "learning_rate": 4.394515822842367e-06, + "loss": 1.4161, + "step": 22035 + }, + { + "epoch": 0.7206382422181533, + "grad_norm": 3.0929282593547542, + "learning_rate": 4.3897908123594605e-06, + "loss": 1.5007, + "step": 22040 + }, + { + "epoch": 0.7208017263928852, + "grad_norm": 3.4052042185848017, + "learning_rate": 4.385067628984232e-06, + "loss": 1.468, + "step": 22045 + }, + { + "epoch": 0.7209652105676171, + "grad_norm": 3.3754563388391565, + "learning_rate": 4.380346274254902e-06, + "loss": 1.3053, + "step": 22050 + }, + { + "epoch": 0.721128694742349, + "grad_norm": 3.1882627885352974, + "learning_rate": 4.375626749709102e-06, + "loss": 1.3757, + "step": 22055 + }, + { + "epoch": 0.7212921789170809, + "grad_norm": 2.882573687540567, + "learning_rate": 4.3709090568838685e-06, + "loss": 1.4366, + "step": 22060 + }, + { + "epoch": 0.7214556630918127, + "grad_norm": 3.356725391997193, + "learning_rate": 4.366193197315634e-06, + "loss": 1.3106, + "step": 22065 + }, + { + "epoch": 0.7216191472665446, + "grad_norm": 3.176457578789257, + "learning_rate": 4.361479172540242e-06, + "loss": 1.4073, + "step": 22070 + }, + { + "epoch": 0.7217826314412765, + "grad_norm": 3.347165985247158, + "learning_rate": 4.35676698409293e-06, + "loss": 1.4491, + "step": 22075 + }, + { + "epoch": 0.7219461156160084, + "grad_norm": 3.229663750447218, + "learning_rate": 4.352056633508345e-06, + "loss": 1.4243, + "step": 22080 + }, + { + "epoch": 0.7221095997907403, + "grad_norm": 3.296926442051326, + "learning_rate": 4.347348122320537e-06, + "loss": 1.4974, + "step": 22085 + }, + { + "epoch": 0.7222730839654722, + "grad_norm": 3.073712953272154, + "learning_rate": 4.342641452062945e-06, + "loss": 1.5027, + "step": 22090 + }, + { + "epoch": 0.7224365681402041, + "grad_norm": 3.2523446456070686, + "learning_rate": 4.337936624268424e-06, + "loss": 1.4433, + "step": 22095 + }, + { + "epoch": 0.722600052314936, + "grad_norm": 3.0557292876904336, + "learning_rate": 4.333233640469214e-06, + "loss": 1.3626, + "step": 22100 + }, + { + "epoch": 0.7227635364896678, + "grad_norm": 3.077732836587437, + "learning_rate": 4.328532502196964e-06, + "loss": 1.3995, + "step": 22105 + }, + { + "epoch": 0.7229270206643997, + "grad_norm": 2.8213647781026143, + "learning_rate": 4.323833210982724e-06, + "loss": 1.509, + "step": 22110 + }, + { + "epoch": 0.7230905048391316, + "grad_norm": 3.134020836688395, + "learning_rate": 4.319135768356931e-06, + "loss": 1.222, + "step": 22115 + }, + { + "epoch": 0.7232539890138635, + "grad_norm": 3.1313969585062513, + "learning_rate": 4.314440175849434e-06, + "loss": 1.4092, + "step": 22120 + }, + { + "epoch": 0.7234174731885954, + "grad_norm": 3.168488662753006, + "learning_rate": 4.309746434989465e-06, + "loss": 1.449, + "step": 22125 + }, + { + "epoch": 0.7235809573633273, + "grad_norm": 3.365974188719427, + "learning_rate": 4.305054547305667e-06, + "loss": 1.4804, + "step": 22130 + }, + { + "epoch": 0.7237444415380592, + "grad_norm": 3.226287636932132, + "learning_rate": 4.300364514326067e-06, + "loss": 1.3119, + "step": 22135 + }, + { + "epoch": 0.7239079257127911, + "grad_norm": 3.4318593978568597, + "learning_rate": 4.295676337578098e-06, + "loss": 1.4331, + "step": 22140 + }, + { + "epoch": 0.7240714098875229, + "grad_norm": 3.315702993758406, + "learning_rate": 4.290990018588585e-06, + "loss": 1.4344, + "step": 22145 + }, + { + "epoch": 0.7242348940622547, + "grad_norm": 3.110825768968744, + "learning_rate": 4.2863055588837425e-06, + "loss": 1.447, + "step": 22150 + }, + { + "epoch": 0.7243983782369866, + "grad_norm": 3.326401512841179, + "learning_rate": 4.28162295998919e-06, + "loss": 1.3156, + "step": 22155 + }, + { + "epoch": 0.7245618624117185, + "grad_norm": 3.138641552557174, + "learning_rate": 4.276942223429929e-06, + "loss": 1.4656, + "step": 22160 + }, + { + "epoch": 0.7247253465864504, + "grad_norm": 3.066796863824104, + "learning_rate": 4.272263350730364e-06, + "loss": 1.419, + "step": 22165 + }, + { + "epoch": 0.7248888307611823, + "grad_norm": 3.1371446531814415, + "learning_rate": 4.267586343414294e-06, + "loss": 1.3934, + "step": 22170 + }, + { + "epoch": 0.7250523149359142, + "grad_norm": 3.4295599111672233, + "learning_rate": 4.262911203004897e-06, + "loss": 1.538, + "step": 22175 + }, + { + "epoch": 0.725215799110646, + "grad_norm": 3.3738705598705785, + "learning_rate": 4.258237931024759e-06, + "loss": 1.4005, + "step": 22180 + }, + { + "epoch": 0.7253792832853779, + "grad_norm": 2.962565519661443, + "learning_rate": 4.253566528995843e-06, + "loss": 1.3047, + "step": 22185 + }, + { + "epoch": 0.7255427674601098, + "grad_norm": 3.2145715511270696, + "learning_rate": 4.248896998439515e-06, + "loss": 1.3072, + "step": 22190 + }, + { + "epoch": 0.7257062516348417, + "grad_norm": 2.9748796229556227, + "learning_rate": 4.2442293408765276e-06, + "loss": 1.4523, + "step": 22195 + }, + { + "epoch": 0.7258697358095736, + "grad_norm": 3.249420870722478, + "learning_rate": 4.2395635578270174e-06, + "loss": 1.2231, + "step": 22200 + }, + { + "epoch": 0.7260332199843055, + "grad_norm": 3.029462128167647, + "learning_rate": 4.234899650810523e-06, + "loss": 1.4857, + "step": 22205 + }, + { + "epoch": 0.7261967041590374, + "grad_norm": 3.544674368483019, + "learning_rate": 4.230237621345962e-06, + "loss": 1.4426, + "step": 22210 + }, + { + "epoch": 0.7263601883337693, + "grad_norm": 3.2854468548986304, + "learning_rate": 4.225577470951636e-06, + "loss": 1.5201, + "step": 22215 + }, + { + "epoch": 0.7265236725085011, + "grad_norm": 3.4956819482122996, + "learning_rate": 4.220919201145252e-06, + "loss": 1.4376, + "step": 22220 + }, + { + "epoch": 0.726687156683233, + "grad_norm": 3.3738952348229834, + "learning_rate": 4.216262813443885e-06, + "loss": 1.4178, + "step": 22225 + }, + { + "epoch": 0.7268506408579649, + "grad_norm": 3.2230544842470015, + "learning_rate": 4.211608309364012e-06, + "loss": 1.3502, + "step": 22230 + }, + { + "epoch": 0.7270141250326968, + "grad_norm": 3.0969379604107865, + "learning_rate": 4.206955690421495e-06, + "loss": 1.4222, + "step": 22235 + }, + { + "epoch": 0.7271776092074287, + "grad_norm": 3.3306236160813545, + "learning_rate": 4.202304958131568e-06, + "loss": 1.3773, + "step": 22240 + }, + { + "epoch": 0.7273410933821606, + "grad_norm": 3.563411273391226, + "learning_rate": 4.197656114008869e-06, + "loss": 1.4844, + "step": 22245 + }, + { + "epoch": 0.7275045775568925, + "grad_norm": 3.1299401791565913, + "learning_rate": 4.193009159567407e-06, + "loss": 1.4631, + "step": 22250 + }, + { + "epoch": 0.7276680617316243, + "grad_norm": 3.3670302742450264, + "learning_rate": 4.188364096320583e-06, + "loss": 1.4205, + "step": 22255 + }, + { + "epoch": 0.7278315459063562, + "grad_norm": 3.1936641115446713, + "learning_rate": 4.183720925781184e-06, + "loss": 1.3305, + "step": 22260 + }, + { + "epoch": 0.7279950300810881, + "grad_norm": 3.3183295514638678, + "learning_rate": 4.179079649461371e-06, + "loss": 1.4371, + "step": 22265 + }, + { + "epoch": 0.72815851425582, + "grad_norm": 3.17948516311356, + "learning_rate": 4.174440268872699e-06, + "loss": 1.3997, + "step": 22270 + }, + { + "epoch": 0.7283219984305519, + "grad_norm": 3.103904423883962, + "learning_rate": 4.169802785526094e-06, + "loss": 1.3465, + "step": 22275 + }, + { + "epoch": 0.7284854826052838, + "grad_norm": 3.0816236090383615, + "learning_rate": 4.165167200931881e-06, + "loss": 1.4113, + "step": 22280 + }, + { + "epoch": 0.7286489667800157, + "grad_norm": 3.090585574449704, + "learning_rate": 4.160533516599745e-06, + "loss": 1.2657, + "step": 22285 + }, + { + "epoch": 0.7288124509547476, + "grad_norm": 3.246885036014256, + "learning_rate": 4.15590173403877e-06, + "loss": 1.4319, + "step": 22290 + }, + { + "epoch": 0.7289759351294794, + "grad_norm": 3.0888401711127402, + "learning_rate": 4.151271854757416e-06, + "loss": 1.4873, + "step": 22295 + }, + { + "epoch": 0.7291394193042113, + "grad_norm": 3.0976574090055773, + "learning_rate": 4.146643880263515e-06, + "loss": 1.5527, + "step": 22300 + }, + { + "epoch": 0.7293029034789432, + "grad_norm": 3.2749797410138544, + "learning_rate": 4.14201781206429e-06, + "loss": 1.3601, + "step": 22305 + }, + { + "epoch": 0.7294663876536751, + "grad_norm": 3.0793377917832014, + "learning_rate": 4.137393651666332e-06, + "loss": 1.3334, + "step": 22310 + }, + { + "epoch": 0.729629871828407, + "grad_norm": 3.1120128579611377, + "learning_rate": 4.132771400575623e-06, + "loss": 1.3373, + "step": 22315 + }, + { + "epoch": 0.7297933560031389, + "grad_norm": 3.3906486533551203, + "learning_rate": 4.128151060297517e-06, + "loss": 1.4413, + "step": 22320 + }, + { + "epoch": 0.7299568401778708, + "grad_norm": 3.2178805803364106, + "learning_rate": 4.123532632336741e-06, + "loss": 1.4756, + "step": 22325 + }, + { + "epoch": 0.7301203243526027, + "grad_norm": 3.102241779310896, + "learning_rate": 4.118916118197409e-06, + "loss": 1.409, + "step": 22330 + }, + { + "epoch": 0.7302838085273345, + "grad_norm": 3.2060961142950952, + "learning_rate": 4.114301519383e-06, + "loss": 1.3669, + "step": 22335 + }, + { + "epoch": 0.7304472927020664, + "grad_norm": 3.3469096859854814, + "learning_rate": 4.109688837396379e-06, + "loss": 1.3836, + "step": 22340 + }, + { + "epoch": 0.7306107768767983, + "grad_norm": 3.4773851129498876, + "learning_rate": 4.105078073739789e-06, + "loss": 1.5121, + "step": 22345 + }, + { + "epoch": 0.7307742610515302, + "grad_norm": 3.3755552278917835, + "learning_rate": 4.100469229914833e-06, + "loss": 1.4736, + "step": 22350 + }, + { + "epoch": 0.7309377452262621, + "grad_norm": 3.260067624813814, + "learning_rate": 4.095862307422508e-06, + "loss": 1.3497, + "step": 22355 + }, + { + "epoch": 0.731101229400994, + "grad_norm": 3.1461792763943657, + "learning_rate": 4.091257307763167e-06, + "loss": 1.4778, + "step": 22360 + }, + { + "epoch": 0.7312647135757259, + "grad_norm": 3.0248527531917633, + "learning_rate": 4.086654232436549e-06, + "loss": 1.3431, + "step": 22365 + }, + { + "epoch": 0.7314281977504578, + "grad_norm": 2.998785428575611, + "learning_rate": 4.082053082941767e-06, + "loss": 1.3744, + "step": 22370 + }, + { + "epoch": 0.7315916819251896, + "grad_norm": 3.339184692847707, + "learning_rate": 4.077453860777296e-06, + "loss": 1.572, + "step": 22375 + }, + { + "epoch": 0.7317551660999215, + "grad_norm": 2.9187151277990715, + "learning_rate": 4.072856567440997e-06, + "loss": 1.2932, + "step": 22380 + }, + { + "epoch": 0.7319186502746534, + "grad_norm": 3.202875404669824, + "learning_rate": 4.068261204430088e-06, + "loss": 1.4181, + "step": 22385 + }, + { + "epoch": 0.7320821344493853, + "grad_norm": 3.1758178474984717, + "learning_rate": 4.063667773241174e-06, + "loss": 1.3689, + "step": 22390 + }, + { + "epoch": 0.7322456186241172, + "grad_norm": 3.6228706389451424, + "learning_rate": 4.059076275370214e-06, + "loss": 1.3396, + "step": 22395 + }, + { + "epoch": 0.7324091027988491, + "grad_norm": 3.110875102666724, + "learning_rate": 4.0544867123125534e-06, + "loss": 1.38, + "step": 22400 + }, + { + "epoch": 0.732572586973581, + "grad_norm": 3.1387894622771193, + "learning_rate": 4.049899085562901e-06, + "loss": 1.3719, + "step": 22405 + }, + { + "epoch": 0.7327360711483129, + "grad_norm": 3.2629923570738075, + "learning_rate": 4.045313396615331e-06, + "loss": 1.3406, + "step": 22410 + }, + { + "epoch": 0.7328995553230447, + "grad_norm": 2.938942715485653, + "learning_rate": 4.0407296469632885e-06, + "loss": 1.4247, + "step": 22415 + }, + { + "epoch": 0.7330630394977766, + "grad_norm": 3.349623952669822, + "learning_rate": 4.036147838099594e-06, + "loss": 1.2897, + "step": 22420 + }, + { + "epoch": 0.7332265236725085, + "grad_norm": 3.3136994047514916, + "learning_rate": 4.031567971516424e-06, + "loss": 1.3723, + "step": 22425 + }, + { + "epoch": 0.7333900078472404, + "grad_norm": 3.1815257303199207, + "learning_rate": 4.026990048705334e-06, + "loss": 1.5, + "step": 22430 + }, + { + "epoch": 0.7335534920219723, + "grad_norm": 3.319778855165104, + "learning_rate": 4.022414071157237e-06, + "loss": 1.4879, + "step": 22435 + }, + { + "epoch": 0.7337169761967042, + "grad_norm": 3.334881771910853, + "learning_rate": 4.017840040362419e-06, + "loss": 1.336, + "step": 22440 + }, + { + "epoch": 0.7338804603714361, + "grad_norm": 3.2716408064555624, + "learning_rate": 4.0132679578105325e-06, + "loss": 1.2811, + "step": 22445 + }, + { + "epoch": 0.734043944546168, + "grad_norm": 3.220735235511314, + "learning_rate": 4.008697824990587e-06, + "loss": 1.415, + "step": 22450 + }, + { + "epoch": 0.7342074287208998, + "grad_norm": 3.1751958498751955, + "learning_rate": 4.0041296433909705e-06, + "loss": 1.3735, + "step": 22455 + }, + { + "epoch": 0.7343709128956317, + "grad_norm": 3.2492735583564087, + "learning_rate": 3.999563414499418e-06, + "loss": 1.5008, + "step": 22460 + }, + { + "epoch": 0.7345343970703636, + "grad_norm": 3.2987846843021353, + "learning_rate": 3.994999139803044e-06, + "loss": 1.3444, + "step": 22465 + }, + { + "epoch": 0.7346978812450955, + "grad_norm": 3.289120304920221, + "learning_rate": 3.990436820788325e-06, + "loss": 1.3355, + "step": 22470 + }, + { + "epoch": 0.7348613654198274, + "grad_norm": 3.2049894718370457, + "learning_rate": 3.985876458941087e-06, + "loss": 1.5474, + "step": 22475 + }, + { + "epoch": 0.7350248495945593, + "grad_norm": 3.06380149526031, + "learning_rate": 3.981318055746537e-06, + "loss": 1.4857, + "step": 22480 + }, + { + "epoch": 0.7351883337692912, + "grad_norm": 3.324701716453959, + "learning_rate": 3.976761612689228e-06, + "loss": 1.3817, + "step": 22485 + }, + { + "epoch": 0.735351817944023, + "grad_norm": 3.237495597959627, + "learning_rate": 3.972207131253086e-06, + "loss": 1.4587, + "step": 22490 + }, + { + "epoch": 0.7355153021187549, + "grad_norm": 3.2687926300183436, + "learning_rate": 3.967654612921397e-06, + "loss": 1.391, + "step": 22495 + }, + { + "epoch": 0.7356787862934868, + "grad_norm": 3.1579214197252012, + "learning_rate": 3.963104059176796e-06, + "loss": 1.35, + "step": 22500 + }, + { + "epoch": 0.7358422704682187, + "grad_norm": 3.1590763865514706, + "learning_rate": 3.958555471501295e-06, + "loss": 1.4401, + "step": 22505 + }, + { + "epoch": 0.7360057546429506, + "grad_norm": 3.1879991473785303, + "learning_rate": 3.954008851376252e-06, + "loss": 1.3566, + "step": 22510 + }, + { + "epoch": 0.7361692388176825, + "grad_norm": 3.532433092487378, + "learning_rate": 3.949464200282392e-06, + "loss": 1.3899, + "step": 22515 + }, + { + "epoch": 0.7363327229924144, + "grad_norm": 2.857715239592779, + "learning_rate": 3.9449215196998e-06, + "loss": 1.348, + "step": 22520 + }, + { + "epoch": 0.7364962071671463, + "grad_norm": 3.146892708241509, + "learning_rate": 3.940380811107909e-06, + "loss": 1.4901, + "step": 22525 + }, + { + "epoch": 0.7366596913418781, + "grad_norm": 3.1885479354114272, + "learning_rate": 3.935842075985523e-06, + "loss": 1.4549, + "step": 22530 + }, + { + "epoch": 0.73682317551661, + "grad_norm": 3.074632005789124, + "learning_rate": 3.931305315810791e-06, + "loss": 1.4121, + "step": 22535 + }, + { + "epoch": 0.7369866596913419, + "grad_norm": 3.3154822607860552, + "learning_rate": 3.926770532061229e-06, + "loss": 1.4045, + "step": 22540 + }, + { + "epoch": 0.7371501438660738, + "grad_norm": 3.288551478999182, + "learning_rate": 3.9222377262137015e-06, + "loss": 1.3864, + "step": 22545 + }, + { + "epoch": 0.7373136280408057, + "grad_norm": 3.210052996189202, + "learning_rate": 3.917706899744435e-06, + "loss": 1.4382, + "step": 22550 + }, + { + "epoch": 0.7374771122155376, + "grad_norm": 3.1759545102745443, + "learning_rate": 3.9131780541290085e-06, + "loss": 1.2856, + "step": 22555 + }, + { + "epoch": 0.7376405963902695, + "grad_norm": 3.5311673274453517, + "learning_rate": 3.9086511908423545e-06, + "loss": 1.3529, + "step": 22560 + }, + { + "epoch": 0.7378040805650014, + "grad_norm": 3.127748694231263, + "learning_rate": 3.904126311358765e-06, + "loss": 1.3594, + "step": 22565 + }, + { + "epoch": 0.7379675647397332, + "grad_norm": 3.175317027602986, + "learning_rate": 3.899603417151876e-06, + "loss": 1.4523, + "step": 22570 + }, + { + "epoch": 0.7381310489144651, + "grad_norm": 3.2462073885246214, + "learning_rate": 3.895082509694687e-06, + "loss": 1.3964, + "step": 22575 + }, + { + "epoch": 0.738294533089197, + "grad_norm": 3.1632834630882676, + "learning_rate": 3.890563590459549e-06, + "loss": 1.4023, + "step": 22580 + }, + { + "epoch": 0.7384580172639289, + "grad_norm": 3.341570620631577, + "learning_rate": 3.88604666091816e-06, + "loss": 1.4962, + "step": 22585 + }, + { + "epoch": 0.7386215014386608, + "grad_norm": 3.0988696268090394, + "learning_rate": 3.881531722541577e-06, + "loss": 1.4576, + "step": 22590 + }, + { + "epoch": 0.7387849856133927, + "grad_norm": 3.045359415422231, + "learning_rate": 3.877018776800199e-06, + "loss": 1.3843, + "step": 22595 + }, + { + "epoch": 0.7389484697881246, + "grad_norm": 3.1967525331726643, + "learning_rate": 3.872507825163784e-06, + "loss": 1.3549, + "step": 22600 + }, + { + "epoch": 0.7391119539628564, + "grad_norm": 3.251037975589491, + "learning_rate": 3.867998869101443e-06, + "loss": 1.5854, + "step": 22605 + }, + { + "epoch": 0.7392754381375883, + "grad_norm": 2.954500915731317, + "learning_rate": 3.863491910081627e-06, + "loss": 1.272, + "step": 22610 + }, + { + "epoch": 0.7394389223123201, + "grad_norm": 3.028742963959346, + "learning_rate": 3.858986949572147e-06, + "loss": 1.5282, + "step": 22615 + }, + { + "epoch": 0.739602406487052, + "grad_norm": 2.974275261230482, + "learning_rate": 3.854483989040154e-06, + "loss": 1.3439, + "step": 22620 + }, + { + "epoch": 0.7397658906617839, + "grad_norm": 3.1256692421949914, + "learning_rate": 3.849983029952151e-06, + "loss": 1.2621, + "step": 22625 + }, + { + "epoch": 0.7399293748365158, + "grad_norm": 3.2293895596283773, + "learning_rate": 3.845484073773996e-06, + "loss": 1.3075, + "step": 22630 + }, + { + "epoch": 0.7400928590112477, + "grad_norm": 3.510175107643017, + "learning_rate": 3.840987121970881e-06, + "loss": 1.4718, + "step": 22635 + }, + { + "epoch": 0.7402563431859795, + "grad_norm": 3.222922882836594, + "learning_rate": 3.836492176007358e-06, + "loss": 1.5257, + "step": 22640 + }, + { + "epoch": 0.7404198273607114, + "grad_norm": 3.2669238839200245, + "learning_rate": 3.831999237347324e-06, + "loss": 1.4329, + "step": 22645 + }, + { + "epoch": 0.7405833115354433, + "grad_norm": 3.4846782049858773, + "learning_rate": 3.827508307454011e-06, + "loss": 1.5033, + "step": 22650 + }, + { + "epoch": 0.7407467957101752, + "grad_norm": 3.2489943397014653, + "learning_rate": 3.823019387790011e-06, + "loss": 1.4481, + "step": 22655 + }, + { + "epoch": 0.7409102798849071, + "grad_norm": 3.1576960405960706, + "learning_rate": 3.818532479817251e-06, + "loss": 1.3774, + "step": 22660 + }, + { + "epoch": 0.741073764059639, + "grad_norm": 3.1830026938313183, + "learning_rate": 3.8140475849970116e-06, + "loss": 1.5208, + "step": 22665 + }, + { + "epoch": 0.7412372482343709, + "grad_norm": 3.394306284576994, + "learning_rate": 3.8095647047899076e-06, + "loss": 1.4792, + "step": 22670 + }, + { + "epoch": 0.7414007324091028, + "grad_norm": 3.0988190535805065, + "learning_rate": 3.8050838406559064e-06, + "loss": 1.3775, + "step": 22675 + }, + { + "epoch": 0.7415642165838346, + "grad_norm": 3.4097380707584777, + "learning_rate": 3.8006049940543187e-06, + "loss": 1.4362, + "step": 22680 + }, + { + "epoch": 0.7417277007585665, + "grad_norm": 3.237982140115447, + "learning_rate": 3.7961281664437888e-06, + "loss": 1.4615, + "step": 22685 + }, + { + "epoch": 0.7418911849332984, + "grad_norm": 2.9613093044020595, + "learning_rate": 3.7916533592823156e-06, + "loss": 1.3091, + "step": 22690 + }, + { + "epoch": 0.7420546691080303, + "grad_norm": 3.1025829015962403, + "learning_rate": 3.7871805740272283e-06, + "loss": 1.3403, + "step": 22695 + }, + { + "epoch": 0.7422181532827622, + "grad_norm": 3.113753624856864, + "learning_rate": 3.7827098121352058e-06, + "loss": 1.2756, + "step": 22700 + }, + { + "epoch": 0.7423816374574941, + "grad_norm": 3.0356303785378262, + "learning_rate": 3.778241075062271e-06, + "loss": 1.3535, + "step": 22705 + }, + { + "epoch": 0.742545121632226, + "grad_norm": 3.1742298392000285, + "learning_rate": 3.7737743642637736e-06, + "loss": 1.5041, + "step": 22710 + }, + { + "epoch": 0.7427086058069579, + "grad_norm": 3.129567370649485, + "learning_rate": 3.7693096811944185e-06, + "loss": 1.399, + "step": 22715 + }, + { + "epoch": 0.7428720899816897, + "grad_norm": 3.300702984403173, + "learning_rate": 3.764847027308238e-06, + "loss": 1.3074, + "step": 22720 + }, + { + "epoch": 0.7430355741564216, + "grad_norm": 2.870694523165477, + "learning_rate": 3.7603864040586124e-06, + "loss": 1.3293, + "step": 22725 + }, + { + "epoch": 0.7431990583311535, + "grad_norm": 3.2685708817702115, + "learning_rate": 3.755927812898261e-06, + "loss": 1.4052, + "step": 22730 + }, + { + "epoch": 0.7433625425058854, + "grad_norm": 3.151990266473492, + "learning_rate": 3.7514712552792287e-06, + "loss": 1.4939, + "step": 22735 + }, + { + "epoch": 0.7435260266806173, + "grad_norm": 3.1904494394656226, + "learning_rate": 3.747016732652917e-06, + "loss": 1.5023, + "step": 22740 + }, + { + "epoch": 0.7436895108553492, + "grad_norm": 3.2493172084974775, + "learning_rate": 3.742564246470046e-06, + "loss": 1.4172, + "step": 22745 + }, + { + "epoch": 0.7438529950300811, + "grad_norm": 3.33309945803083, + "learning_rate": 3.738113798180685e-06, + "loss": 1.4828, + "step": 22750 + }, + { + "epoch": 0.744016479204813, + "grad_norm": 3.063575733709241, + "learning_rate": 3.7336653892342402e-06, + "loss": 1.4068, + "step": 22755 + }, + { + "epoch": 0.7441799633795448, + "grad_norm": 3.122690362963393, + "learning_rate": 3.729219021079441e-06, + "loss": 1.3945, + "step": 22760 + }, + { + "epoch": 0.7443434475542767, + "grad_norm": 3.391986149591788, + "learning_rate": 3.7247746951643694e-06, + "loss": 1.4329, + "step": 22765 + }, + { + "epoch": 0.7445069317290086, + "grad_norm": 3.1827016114384623, + "learning_rate": 3.720332412936426e-06, + "loss": 1.4579, + "step": 22770 + }, + { + "epoch": 0.7446704159037405, + "grad_norm": 3.184511788858252, + "learning_rate": 3.7158921758423547e-06, + "loss": 1.3069, + "step": 22775 + }, + { + "epoch": 0.7448339000784724, + "grad_norm": 3.2384263084579668, + "learning_rate": 3.711453985328238e-06, + "loss": 1.4159, + "step": 22780 + }, + { + "epoch": 0.7449973842532043, + "grad_norm": 3.191404070963423, + "learning_rate": 3.7070178428394786e-06, + "loss": 1.3843, + "step": 22785 + }, + { + "epoch": 0.7451608684279362, + "grad_norm": 3.141713123255851, + "learning_rate": 3.702583749820825e-06, + "loss": 1.4021, + "step": 22790 + }, + { + "epoch": 0.745324352602668, + "grad_norm": 3.186841554942768, + "learning_rate": 3.6981517077163466e-06, + "loss": 1.2677, + "step": 22795 + }, + { + "epoch": 0.7454878367773999, + "grad_norm": 3.049877756411328, + "learning_rate": 3.6937217179694586e-06, + "loss": 1.3437, + "step": 22800 + }, + { + "epoch": 0.7456513209521318, + "grad_norm": 3.1397432214785956, + "learning_rate": 3.6892937820228903e-06, + "loss": 1.3335, + "step": 22805 + }, + { + "epoch": 0.7458148051268637, + "grad_norm": 2.960018742145011, + "learning_rate": 3.684867901318718e-06, + "loss": 1.4595, + "step": 22810 + }, + { + "epoch": 0.7459782893015956, + "grad_norm": 3.298050892894904, + "learning_rate": 3.6804440772983462e-06, + "loss": 1.4525, + "step": 22815 + }, + { + "epoch": 0.7461417734763275, + "grad_norm": 3.1107196445621055, + "learning_rate": 3.6760223114024984e-06, + "loss": 1.2471, + "step": 22820 + }, + { + "epoch": 0.7463052576510594, + "grad_norm": 3.179525171293746, + "learning_rate": 3.6716026050712416e-06, + "loss": 1.3122, + "step": 22825 + }, + { + "epoch": 0.7464687418257913, + "grad_norm": 3.229603583482779, + "learning_rate": 3.6671849597439626e-06, + "loss": 1.4649, + "step": 22830 + }, + { + "epoch": 0.7466322260005231, + "grad_norm": 3.1735347273670342, + "learning_rate": 3.6627693768593774e-06, + "loss": 1.4206, + "step": 22835 + }, + { + "epoch": 0.746795710175255, + "grad_norm": 2.77878237973048, + "learning_rate": 3.6583558578555412e-06, + "loss": 1.4039, + "step": 22840 + }, + { + "epoch": 0.7469591943499869, + "grad_norm": 3.496158386832613, + "learning_rate": 3.653944404169819e-06, + "loss": 1.4648, + "step": 22845 + }, + { + "epoch": 0.7471226785247188, + "grad_norm": 3.242819033897682, + "learning_rate": 3.64953501723892e-06, + "loss": 1.313, + "step": 22850 + }, + { + "epoch": 0.7472861626994507, + "grad_norm": 3.4637201942069926, + "learning_rate": 3.645127698498875e-06, + "loss": 1.3537, + "step": 22855 + }, + { + "epoch": 0.7474496468741826, + "grad_norm": 3.258417480429427, + "learning_rate": 3.6407224493850325e-06, + "loss": 1.4163, + "step": 22860 + }, + { + "epoch": 0.7476131310489145, + "grad_norm": 3.270812800241526, + "learning_rate": 3.6363192713320818e-06, + "loss": 1.4651, + "step": 22865 + }, + { + "epoch": 0.7477766152236464, + "grad_norm": 3.1115498940949373, + "learning_rate": 3.6319181657740234e-06, + "loss": 1.3907, + "step": 22870 + }, + { + "epoch": 0.7479400993983782, + "grad_norm": 3.092271173039627, + "learning_rate": 3.6275191341441927e-06, + "loss": 1.355, + "step": 22875 + }, + { + "epoch": 0.7481035835731101, + "grad_norm": 3.0896359597998653, + "learning_rate": 3.6231221778752514e-06, + "loss": 1.4003, + "step": 22880 + }, + { + "epoch": 0.748267067747842, + "grad_norm": 3.0889318935889514, + "learning_rate": 3.6187272983991705e-06, + "loss": 1.442, + "step": 22885 + }, + { + "epoch": 0.7484305519225739, + "grad_norm": 3.3193681507198893, + "learning_rate": 3.614334497147264e-06, + "loss": 1.3771, + "step": 22890 + }, + { + "epoch": 0.7485940360973058, + "grad_norm": 3.295410877219419, + "learning_rate": 3.609943775550151e-06, + "loss": 1.4731, + "step": 22895 + }, + { + "epoch": 0.7487575202720377, + "grad_norm": 3.0160817725200686, + "learning_rate": 3.6055551350377872e-06, + "loss": 1.355, + "step": 22900 + }, + { + "epoch": 0.7489210044467696, + "grad_norm": 3.3628653957674204, + "learning_rate": 3.6011685770394478e-06, + "loss": 1.3227, + "step": 22905 + }, + { + "epoch": 0.7490844886215015, + "grad_norm": 3.149189435843762, + "learning_rate": 3.59678410298372e-06, + "loss": 1.3686, + "step": 22910 + }, + { + "epoch": 0.7492479727962333, + "grad_norm": 3.417654100985508, + "learning_rate": 3.592401714298528e-06, + "loss": 1.4848, + "step": 22915 + }, + { + "epoch": 0.7494114569709652, + "grad_norm": 3.238527395461109, + "learning_rate": 3.588021412411099e-06, + "loss": 1.4259, + "step": 22920 + }, + { + "epoch": 0.7495749411456971, + "grad_norm": 3.19957475196477, + "learning_rate": 3.5836431987479992e-06, + "loss": 1.4798, + "step": 22925 + }, + { + "epoch": 0.749738425320429, + "grad_norm": 3.237181199962601, + "learning_rate": 3.5792670747350967e-06, + "loss": 1.4011, + "step": 22930 + }, + { + "epoch": 0.7499019094951609, + "grad_norm": 3.0267105976655486, + "learning_rate": 3.5748930417975937e-06, + "loss": 1.3253, + "step": 22935 + }, + { + "epoch": 0.7500653936698928, + "grad_norm": 3.3960862246564463, + "learning_rate": 3.570521101360006e-06, + "loss": 1.4948, + "step": 22940 + }, + { + "epoch": 0.7502288778446247, + "grad_norm": 3.3897031689051254, + "learning_rate": 3.566151254846164e-06, + "loss": 1.3374, + "step": 22945 + }, + { + "epoch": 0.7503923620193566, + "grad_norm": 3.330764028664144, + "learning_rate": 3.5617835036792238e-06, + "loss": 1.4391, + "step": 22950 + }, + { + "epoch": 0.7505558461940884, + "grad_norm": 3.1866453379385558, + "learning_rate": 3.5574178492816493e-06, + "loss": 1.3956, + "step": 22955 + }, + { + "epoch": 0.7507193303688203, + "grad_norm": 2.981619548957606, + "learning_rate": 3.5530542930752297e-06, + "loss": 1.4345, + "step": 22960 + }, + { + "epoch": 0.7508828145435522, + "grad_norm": 3.3132016636602164, + "learning_rate": 3.5486928364810735e-06, + "loss": 1.4736, + "step": 22965 + }, + { + "epoch": 0.7510462987182841, + "grad_norm": 3.4372381373327014, + "learning_rate": 3.544333480919592e-06, + "loss": 1.4577, + "step": 22970 + }, + { + "epoch": 0.751209782893016, + "grad_norm": 3.1462537096646206, + "learning_rate": 3.5399762278105265e-06, + "loss": 1.3912, + "step": 22975 + }, + { + "epoch": 0.7513732670677479, + "grad_norm": 3.230203217005731, + "learning_rate": 3.5356210785729226e-06, + "loss": 1.5133, + "step": 22980 + }, + { + "epoch": 0.7515367512424798, + "grad_norm": 3.2898733730621523, + "learning_rate": 3.531268034625149e-06, + "loss": 1.3864, + "step": 22985 + }, + { + "epoch": 0.7517002354172116, + "grad_norm": 3.2323796494007038, + "learning_rate": 3.5269170973848877e-06, + "loss": 1.4002, + "step": 22990 + }, + { + "epoch": 0.7518637195919435, + "grad_norm": 3.3910660713383773, + "learning_rate": 3.5225682682691265e-06, + "loss": 1.4802, + "step": 22995 + }, + { + "epoch": 0.7520272037666754, + "grad_norm": 3.020569323676633, + "learning_rate": 3.5182215486941785e-06, + "loss": 1.4424, + "step": 23000 + }, + { + "epoch": 0.7521906879414073, + "grad_norm": 3.3328285168511504, + "learning_rate": 3.513876940075658e-06, + "loss": 1.4881, + "step": 23005 + }, + { + "epoch": 0.7523541721161392, + "grad_norm": 3.2480185786399907, + "learning_rate": 3.5095344438284996e-06, + "loss": 1.4758, + "step": 23010 + }, + { + "epoch": 0.7525176562908711, + "grad_norm": 3.06855905341258, + "learning_rate": 3.5051940613669523e-06, + "loss": 1.3405, + "step": 23015 + }, + { + "epoch": 0.752681140465603, + "grad_norm": 3.175899575841915, + "learning_rate": 3.5008557941045664e-06, + "loss": 1.5437, + "step": 23020 + }, + { + "epoch": 0.7528446246403349, + "grad_norm": 3.347608007507009, + "learning_rate": 3.4965196434542135e-06, + "loss": 1.4219, + "step": 23025 + }, + { + "epoch": 0.7530081088150667, + "grad_norm": 3.610050641825617, + "learning_rate": 3.4921856108280673e-06, + "loss": 1.5345, + "step": 23030 + }, + { + "epoch": 0.7531715929897986, + "grad_norm": 3.0957417891407655, + "learning_rate": 3.4878536976376207e-06, + "loss": 1.3582, + "step": 23035 + }, + { + "epoch": 0.7533350771645305, + "grad_norm": 3.2959857796548526, + "learning_rate": 3.483523905293671e-06, + "loss": 1.4342, + "step": 23040 + }, + { + "epoch": 0.7534985613392624, + "grad_norm": 3.42404675416795, + "learning_rate": 3.479196235206319e-06, + "loss": 1.2844, + "step": 23045 + }, + { + "epoch": 0.7536620455139943, + "grad_norm": 3.2022758159426687, + "learning_rate": 3.474870688784986e-06, + "loss": 1.3765, + "step": 23050 + }, + { + "epoch": 0.7538255296887262, + "grad_norm": 3.403450504374092, + "learning_rate": 3.4705472674384e-06, + "loss": 1.3863, + "step": 23055 + }, + { + "epoch": 0.7539890138634581, + "grad_norm": 3.3696551366418945, + "learning_rate": 3.4662259725745862e-06, + "loss": 1.4371, + "step": 23060 + }, + { + "epoch": 0.75415249803819, + "grad_norm": 2.955864609595148, + "learning_rate": 3.461906805600892e-06, + "loss": 1.3066, + "step": 23065 + }, + { + "epoch": 0.7543159822129218, + "grad_norm": 3.2008900512979124, + "learning_rate": 3.457589767923956e-06, + "loss": 1.4405, + "step": 23070 + }, + { + "epoch": 0.7544794663876537, + "grad_norm": 3.091018856438525, + "learning_rate": 3.453274860949739e-06, + "loss": 1.3357, + "step": 23075 + }, + { + "epoch": 0.7546429505623855, + "grad_norm": 3.2775933240754815, + "learning_rate": 3.448962086083494e-06, + "loss": 1.3668, + "step": 23080 + }, + { + "epoch": 0.7548064347371174, + "grad_norm": 3.451367085428227, + "learning_rate": 3.4446514447297886e-06, + "loss": 1.3435, + "step": 23085 + }, + { + "epoch": 0.7549699189118493, + "grad_norm": 3.1583958098439266, + "learning_rate": 3.440342938292498e-06, + "loss": 1.3803, + "step": 23090 + }, + { + "epoch": 0.7551334030865812, + "grad_norm": 3.9345153716012415, + "learning_rate": 3.43603656817479e-06, + "loss": 1.3958, + "step": 23095 + }, + { + "epoch": 0.755296887261313, + "grad_norm": 3.237833981512648, + "learning_rate": 3.431732335779149e-06, + "loss": 1.399, + "step": 23100 + }, + { + "epoch": 0.7554603714360449, + "grad_norm": 3.1006640642458767, + "learning_rate": 3.4274302425073535e-06, + "loss": 1.3574, + "step": 23105 + }, + { + "epoch": 0.7556238556107768, + "grad_norm": 3.0180917490834265, + "learning_rate": 3.423130289760491e-06, + "loss": 1.3704, + "step": 23110 + }, + { + "epoch": 0.7557873397855087, + "grad_norm": 3.0905866473889083, + "learning_rate": 3.418832478938956e-06, + "loss": 1.4698, + "step": 23115 + }, + { + "epoch": 0.7559508239602406, + "grad_norm": 3.1999162145622395, + "learning_rate": 3.4145368114424336e-06, + "loss": 1.3662, + "step": 23120 + }, + { + "epoch": 0.7561143081349725, + "grad_norm": 3.294499380205865, + "learning_rate": 3.410243288669922e-06, + "loss": 1.5364, + "step": 23125 + }, + { + "epoch": 0.7562777923097044, + "grad_norm": 3.5637335491599487, + "learning_rate": 3.4059519120197127e-06, + "loss": 1.4685, + "step": 23130 + }, + { + "epoch": 0.7564412764844363, + "grad_norm": 3.2975943735716258, + "learning_rate": 3.401662682889402e-06, + "loss": 1.5336, + "step": 23135 + }, + { + "epoch": 0.7566047606591682, + "grad_norm": 3.3922371337418427, + "learning_rate": 3.397375602675892e-06, + "loss": 1.5364, + "step": 23140 + }, + { + "epoch": 0.7567682448339, + "grad_norm": 3.300051010175981, + "learning_rate": 3.3930906727753733e-06, + "loss": 1.2241, + "step": 23145 + }, + { + "epoch": 0.7569317290086319, + "grad_norm": 3.3316712368407826, + "learning_rate": 3.388807894583348e-06, + "loss": 1.4263, + "step": 23150 + }, + { + "epoch": 0.7570952131833638, + "grad_norm": 3.3572699335541802, + "learning_rate": 3.3845272694946076e-06, + "loss": 1.4594, + "step": 23155 + }, + { + "epoch": 0.7572586973580957, + "grad_norm": 3.1546232347900087, + "learning_rate": 3.3802487989032463e-06, + "loss": 1.3307, + "step": 23160 + }, + { + "epoch": 0.7574221815328276, + "grad_norm": 3.282756733461849, + "learning_rate": 3.375972484202664e-06, + "loss": 1.4596, + "step": 23165 + }, + { + "epoch": 0.7575856657075595, + "grad_norm": 3.3441667092377445, + "learning_rate": 3.371698326785543e-06, + "loss": 1.4251, + "step": 23170 + }, + { + "epoch": 0.7577491498822914, + "grad_norm": 3.0457097582564967, + "learning_rate": 3.36742632804388e-06, + "loss": 1.4648, + "step": 23175 + }, + { + "epoch": 0.7579126340570232, + "grad_norm": 3.14617936564455, + "learning_rate": 3.3631564893689517e-06, + "loss": 1.3247, + "step": 23180 + }, + { + "epoch": 0.7580761182317551, + "grad_norm": 3.186342714319181, + "learning_rate": 3.3588888121513485e-06, + "loss": 1.4025, + "step": 23185 + }, + { + "epoch": 0.758239602406487, + "grad_norm": 3.1610426679858894, + "learning_rate": 3.35462329778094e-06, + "loss": 1.5292, + "step": 23190 + }, + { + "epoch": 0.7584030865812189, + "grad_norm": 3.3585284113012426, + "learning_rate": 3.350359947646904e-06, + "loss": 1.5228, + "step": 23195 + }, + { + "epoch": 0.7585665707559508, + "grad_norm": 3.198547743766072, + "learning_rate": 3.3460987631377118e-06, + "loss": 1.39, + "step": 23200 + }, + { + "epoch": 0.7587300549306827, + "grad_norm": 3.156291626236949, + "learning_rate": 3.341839745641121e-06, + "loss": 1.3638, + "step": 23205 + }, + { + "epoch": 0.7588935391054146, + "grad_norm": 3.250094148804318, + "learning_rate": 3.337582896544196e-06, + "loss": 1.4766, + "step": 23210 + }, + { + "epoch": 0.7590570232801465, + "grad_norm": 3.2296854711037652, + "learning_rate": 3.33332821723328e-06, + "loss": 1.3522, + "step": 23215 + }, + { + "epoch": 0.7592205074548783, + "grad_norm": 3.05590933288308, + "learning_rate": 3.329075709094023e-06, + "loss": 1.4707, + "step": 23220 + }, + { + "epoch": 0.7593839916296102, + "grad_norm": 3.1481885801719045, + "learning_rate": 3.3248253735113643e-06, + "loss": 1.391, + "step": 23225 + }, + { + "epoch": 0.7595474758043421, + "grad_norm": 3.3500277017153, + "learning_rate": 3.32057721186953e-06, + "loss": 1.5274, + "step": 23230 + }, + { + "epoch": 0.759710959979074, + "grad_norm": 3.4275634298679774, + "learning_rate": 3.3163312255520465e-06, + "loss": 1.3743, + "step": 23235 + }, + { + "epoch": 0.7598744441538059, + "grad_norm": 3.2603000196516856, + "learning_rate": 3.312087415941725e-06, + "loss": 1.486, + "step": 23240 + }, + { + "epoch": 0.7600379283285378, + "grad_norm": 3.236747326397388, + "learning_rate": 3.307845784420667e-06, + "loss": 1.3077, + "step": 23245 + }, + { + "epoch": 0.7602014125032697, + "grad_norm": 3.1804477449748787, + "learning_rate": 3.303606332370274e-06, + "loss": 1.307, + "step": 23250 + }, + { + "epoch": 0.7603648966780016, + "grad_norm": 3.2539614506335965, + "learning_rate": 3.299369061171226e-06, + "loss": 1.4823, + "step": 23255 + }, + { + "epoch": 0.7605283808527334, + "grad_norm": 3.0642267193450343, + "learning_rate": 3.2951339722035014e-06, + "loss": 1.4533, + "step": 23260 + }, + { + "epoch": 0.7606918650274653, + "grad_norm": 3.316043422156651, + "learning_rate": 3.290901066846368e-06, + "loss": 1.3597, + "step": 23265 + }, + { + "epoch": 0.7608553492021972, + "grad_norm": 3.3633674133036564, + "learning_rate": 3.2866703464783733e-06, + "loss": 1.4192, + "step": 23270 + }, + { + "epoch": 0.7610188333769291, + "grad_norm": 3.128247210163624, + "learning_rate": 3.282441812477365e-06, + "loss": 1.4336, + "step": 23275 + }, + { + "epoch": 0.761182317551661, + "grad_norm": 3.3324065826116533, + "learning_rate": 3.278215466220467e-06, + "loss": 1.5582, + "step": 23280 + }, + { + "epoch": 0.7613458017263929, + "grad_norm": 2.9478298273592425, + "learning_rate": 3.2739913090841002e-06, + "loss": 1.3027, + "step": 23285 + }, + { + "epoch": 0.7615092859011248, + "grad_norm": 3.031596528266559, + "learning_rate": 3.2697693424439715e-06, + "loss": 1.4072, + "step": 23290 + }, + { + "epoch": 0.7616727700758567, + "grad_norm": 3.502190530865804, + "learning_rate": 3.265549567675067e-06, + "loss": 1.4162, + "step": 23295 + }, + { + "epoch": 0.7618362542505885, + "grad_norm": 3.3456530647433698, + "learning_rate": 3.261331986151669e-06, + "loss": 1.5818, + "step": 23300 + }, + { + "epoch": 0.7619997384253204, + "grad_norm": 3.471360783938714, + "learning_rate": 3.2571165992473343e-06, + "loss": 1.372, + "step": 23305 + }, + { + "epoch": 0.7621632226000523, + "grad_norm": 3.2477127874181697, + "learning_rate": 3.252903408334914e-06, + "loss": 1.3945, + "step": 23310 + }, + { + "epoch": 0.7623267067747842, + "grad_norm": 3.1872476951782662, + "learning_rate": 3.248692414786546e-06, + "loss": 1.1912, + "step": 23315 + }, + { + "epoch": 0.7624901909495161, + "grad_norm": 3.0860277043689033, + "learning_rate": 3.2444836199736394e-06, + "loss": 1.3605, + "step": 23320 + }, + { + "epoch": 0.762653675124248, + "grad_norm": 3.5374396700684043, + "learning_rate": 3.2402770252669036e-06, + "loss": 1.4721, + "step": 23325 + }, + { + "epoch": 0.7628171592989799, + "grad_norm": 3.114387868206831, + "learning_rate": 3.2360726320363158e-06, + "loss": 1.4983, + "step": 23330 + }, + { + "epoch": 0.7629806434737118, + "grad_norm": 2.993442177602959, + "learning_rate": 3.2318704416511504e-06, + "loss": 1.386, + "step": 23335 + }, + { + "epoch": 0.7631441276484436, + "grad_norm": 3.277970489340064, + "learning_rate": 3.227670455479951e-06, + "loss": 1.3104, + "step": 23340 + }, + { + "epoch": 0.7633076118231755, + "grad_norm": 3.221260704731621, + "learning_rate": 3.2234726748905555e-06, + "loss": 1.3936, + "step": 23345 + }, + { + "epoch": 0.7634710959979074, + "grad_norm": 3.0373773857502626, + "learning_rate": 3.219277101250079e-06, + "loss": 1.4666, + "step": 23350 + }, + { + "epoch": 0.7636345801726393, + "grad_norm": 3.053222361701262, + "learning_rate": 3.215083735924912e-06, + "loss": 1.376, + "step": 23355 + }, + { + "epoch": 0.7637980643473712, + "grad_norm": 3.1692368930501686, + "learning_rate": 3.2108925802807366e-06, + "loss": 1.4474, + "step": 23360 + }, + { + "epoch": 0.7639615485221031, + "grad_norm": 3.2232793948146274, + "learning_rate": 3.2067036356825043e-06, + "loss": 1.4041, + "step": 23365 + }, + { + "epoch": 0.764125032696835, + "grad_norm": 3.2313653177042263, + "learning_rate": 3.2025169034944524e-06, + "loss": 1.3396, + "step": 23370 + }, + { + "epoch": 0.7642885168715668, + "grad_norm": 3.5599923650597445, + "learning_rate": 3.198332385080103e-06, + "loss": 1.566, + "step": 23375 + }, + { + "epoch": 0.7644520010462987, + "grad_norm": 3.131379654350248, + "learning_rate": 3.1941500818022443e-06, + "loss": 1.4274, + "step": 23380 + }, + { + "epoch": 0.7646154852210306, + "grad_norm": 3.1405266370296037, + "learning_rate": 3.1899699950229547e-06, + "loss": 1.4314, + "step": 23385 + }, + { + "epoch": 0.7647789693957625, + "grad_norm": 3.211929254716548, + "learning_rate": 3.1857921261035808e-06, + "loss": 1.4217, + "step": 23390 + }, + { + "epoch": 0.7649424535704944, + "grad_norm": 3.103628468415758, + "learning_rate": 3.181616476404754e-06, + "loss": 1.3104, + "step": 23395 + }, + { + "epoch": 0.7651059377452263, + "grad_norm": 2.8062637885342014, + "learning_rate": 3.177443047286387e-06, + "loss": 1.4676, + "step": 23400 + }, + { + "epoch": 0.7652694219199582, + "grad_norm": 3.0775950295836068, + "learning_rate": 3.173271840107656e-06, + "loss": 1.3851, + "step": 23405 + }, + { + "epoch": 0.7654329060946901, + "grad_norm": 3.4155835287915903, + "learning_rate": 3.1691028562270252e-06, + "loss": 1.4194, + "step": 23410 + }, + { + "epoch": 0.765596390269422, + "grad_norm": 3.3193079791064717, + "learning_rate": 3.164936097002227e-06, + "loss": 1.4395, + "step": 23415 + }, + { + "epoch": 0.7657598744441538, + "grad_norm": 3.1446827432327455, + "learning_rate": 3.1607715637902734e-06, + "loss": 1.5748, + "step": 23420 + }, + { + "epoch": 0.7659233586188857, + "grad_norm": 3.1783602950336114, + "learning_rate": 3.156609257947457e-06, + "loss": 1.5183, + "step": 23425 + }, + { + "epoch": 0.7660868427936176, + "grad_norm": 3.3289341829409502, + "learning_rate": 3.15244918082933e-06, + "loss": 1.5179, + "step": 23430 + }, + { + "epoch": 0.7662503269683495, + "grad_norm": 3.2923883921541597, + "learning_rate": 3.148291333790735e-06, + "loss": 1.3399, + "step": 23435 + }, + { + "epoch": 0.7664138111430814, + "grad_norm": 3.3029784129555946, + "learning_rate": 3.1441357181857745e-06, + "loss": 1.3972, + "step": 23440 + }, + { + "epoch": 0.7665772953178133, + "grad_norm": 3.2624123391881854, + "learning_rate": 3.139982335367837e-06, + "loss": 1.4473, + "step": 23445 + }, + { + "epoch": 0.7667407794925452, + "grad_norm": 3.1088057305406918, + "learning_rate": 3.135831186689574e-06, + "loss": 1.3573, + "step": 23450 + }, + { + "epoch": 0.766904263667277, + "grad_norm": 3.161355976201788, + "learning_rate": 3.1316822735029105e-06, + "loss": 1.408, + "step": 23455 + }, + { + "epoch": 0.7670677478420089, + "grad_norm": 3.0000665184256197, + "learning_rate": 3.1275355971590516e-06, + "loss": 1.3572, + "step": 23460 + }, + { + "epoch": 0.7672312320167408, + "grad_norm": 3.478542261842801, + "learning_rate": 3.123391159008462e-06, + "loss": 1.4532, + "step": 23465 + }, + { + "epoch": 0.7673947161914727, + "grad_norm": 3.1433228328572613, + "learning_rate": 3.1192489604008857e-06, + "loss": 1.3952, + "step": 23470 + }, + { + "epoch": 0.7675582003662046, + "grad_norm": 3.0370601617740363, + "learning_rate": 3.11510900268534e-06, + "loss": 1.3425, + "step": 23475 + }, + { + "epoch": 0.7677216845409365, + "grad_norm": 3.4619938782392943, + "learning_rate": 3.1109712872101015e-06, + "loss": 1.4115, + "step": 23480 + }, + { + "epoch": 0.7678851687156684, + "grad_norm": 3.1304963496617093, + "learning_rate": 3.1068358153227285e-06, + "loss": 1.3191, + "step": 23485 + }, + { + "epoch": 0.7680486528904003, + "grad_norm": 3.317906921390292, + "learning_rate": 3.102702588370037e-06, + "loss": 1.3973, + "step": 23490 + }, + { + "epoch": 0.7682121370651321, + "grad_norm": 3.2781136261376864, + "learning_rate": 3.0985716076981198e-06, + "loss": 1.3216, + "step": 23495 + }, + { + "epoch": 0.768375621239864, + "grad_norm": 3.512977015392056, + "learning_rate": 3.0944428746523393e-06, + "loss": 1.4702, + "step": 23500 + }, + { + "epoch": 0.7685391054145959, + "grad_norm": 3.3937333124512703, + "learning_rate": 3.090316390577318e-06, + "loss": 1.3286, + "step": 23505 + }, + { + "epoch": 0.7687025895893278, + "grad_norm": 3.1695424374913403, + "learning_rate": 3.086192156816955e-06, + "loss": 1.4046, + "step": 23510 + }, + { + "epoch": 0.7688660737640597, + "grad_norm": 3.266691198417968, + "learning_rate": 3.0820701747144076e-06, + "loss": 1.3424, + "step": 23515 + }, + { + "epoch": 0.7690295579387916, + "grad_norm": 3.3556082107517726, + "learning_rate": 3.077950445612107e-06, + "loss": 1.5332, + "step": 23520 + }, + { + "epoch": 0.7691930421135235, + "grad_norm": 3.5395684490469366, + "learning_rate": 3.07383297085175e-06, + "loss": 1.3673, + "step": 23525 + }, + { + "epoch": 0.7693565262882553, + "grad_norm": 3.126654944054413, + "learning_rate": 3.0697177517742916e-06, + "loss": 1.3656, + "step": 23530 + }, + { + "epoch": 0.7695200104629872, + "grad_norm": 3.3003892339160448, + "learning_rate": 3.065604789719966e-06, + "loss": 1.4157, + "step": 23535 + }, + { + "epoch": 0.7696834946377191, + "grad_norm": 3.383026592561616, + "learning_rate": 3.061494086028255e-06, + "loss": 1.3199, + "step": 23540 + }, + { + "epoch": 0.7698469788124509, + "grad_norm": 3.16057267788718, + "learning_rate": 3.057385642037919e-06, + "loss": 1.3014, + "step": 23545 + }, + { + "epoch": 0.7700104629871828, + "grad_norm": 3.421374762274014, + "learning_rate": 3.0532794590869795e-06, + "loss": 1.4254, + "step": 23550 + }, + { + "epoch": 0.7701739471619147, + "grad_norm": 3.205115327941509, + "learning_rate": 3.0491755385127153e-06, + "loss": 1.322, + "step": 23555 + }, + { + "epoch": 0.7703374313366466, + "grad_norm": 3.1385527026304008, + "learning_rate": 3.0450738816516765e-06, + "loss": 1.4002, + "step": 23560 + }, + { + "epoch": 0.7705009155113784, + "grad_norm": 3.0587691693666392, + "learning_rate": 3.0409744898396687e-06, + "loss": 1.4463, + "step": 23565 + }, + { + "epoch": 0.7706643996861103, + "grad_norm": 3.143311434523343, + "learning_rate": 3.0368773644117645e-06, + "loss": 1.4085, + "step": 23570 + }, + { + "epoch": 0.7708278838608422, + "grad_norm": 3.276713849971137, + "learning_rate": 3.0327825067023007e-06, + "loss": 1.3701, + "step": 23575 + }, + { + "epoch": 0.7709913680355741, + "grad_norm": 3.1458648582574242, + "learning_rate": 3.028689918044867e-06, + "loss": 1.3805, + "step": 23580 + }, + { + "epoch": 0.771154852210306, + "grad_norm": 3.1036382511022818, + "learning_rate": 3.0245995997723244e-06, + "loss": 1.4998, + "step": 23585 + }, + { + "epoch": 0.7713183363850379, + "grad_norm": 3.3798805890219605, + "learning_rate": 3.020511553216783e-06, + "loss": 1.5727, + "step": 23590 + }, + { + "epoch": 0.7714818205597698, + "grad_norm": 3.119678036453602, + "learning_rate": 3.0164257797096265e-06, + "loss": 1.3122, + "step": 23595 + }, + { + "epoch": 0.7716453047345017, + "grad_norm": 3.234072779285612, + "learning_rate": 3.012342280581484e-06, + "loss": 1.4168, + "step": 23600 + }, + { + "epoch": 0.7718087889092335, + "grad_norm": 3.3493557368308915, + "learning_rate": 3.0082610571622552e-06, + "loss": 1.4158, + "step": 23605 + }, + { + "epoch": 0.7719722730839654, + "grad_norm": 3.0584234076411936, + "learning_rate": 3.0041821107810974e-06, + "loss": 1.3755, + "step": 23610 + }, + { + "epoch": 0.7721357572586973, + "grad_norm": 3.048002667570871, + "learning_rate": 3.0001054427664165e-06, + "loss": 1.4544, + "step": 23615 + }, + { + "epoch": 0.7722992414334292, + "grad_norm": 3.1501628627684086, + "learning_rate": 2.9960310544458904e-06, + "loss": 1.3823, + "step": 23620 + }, + { + "epoch": 0.7724627256081611, + "grad_norm": 3.3192729318989893, + "learning_rate": 2.9919589471464416e-06, + "loss": 1.4217, + "step": 23625 + }, + { + "epoch": 0.772626209782893, + "grad_norm": 3.1121333679048595, + "learning_rate": 2.9878891221942585e-06, + "loss": 1.3853, + "step": 23630 + }, + { + "epoch": 0.7727896939576249, + "grad_norm": 3.3136687919376837, + "learning_rate": 2.983821580914785e-06, + "loss": 1.4262, + "step": 23635 + }, + { + "epoch": 0.7729531781323568, + "grad_norm": 3.0328631745854704, + "learning_rate": 2.9797563246327165e-06, + "loss": 1.2687, + "step": 23640 + }, + { + "epoch": 0.7731166623070886, + "grad_norm": 3.1677976153039467, + "learning_rate": 2.9756933546720114e-06, + "loss": 1.4823, + "step": 23645 + }, + { + "epoch": 0.7732801464818205, + "grad_norm": 3.3402966900812037, + "learning_rate": 2.971632672355873e-06, + "loss": 1.5039, + "step": 23650 + }, + { + "epoch": 0.7734436306565524, + "grad_norm": 3.3144370450397287, + "learning_rate": 2.967574279006773e-06, + "loss": 1.4375, + "step": 23655 + }, + { + "epoch": 0.7736071148312843, + "grad_norm": 3.1936041971932094, + "learning_rate": 2.963518175946428e-06, + "loss": 1.5131, + "step": 23660 + }, + { + "epoch": 0.7737705990060162, + "grad_norm": 3.363003368454343, + "learning_rate": 2.9594643644958075e-06, + "loss": 1.4189, + "step": 23665 + }, + { + "epoch": 0.7739340831807481, + "grad_norm": 3.0580019057143573, + "learning_rate": 2.955412845975142e-06, + "loss": 1.3335, + "step": 23670 + }, + { + "epoch": 0.77409756735548, + "grad_norm": 3.1656439633929163, + "learning_rate": 2.9513636217039143e-06, + "loss": 1.4749, + "step": 23675 + }, + { + "epoch": 0.7742610515302119, + "grad_norm": 3.1382226710768277, + "learning_rate": 2.947316693000852e-06, + "loss": 1.4143, + "step": 23680 + }, + { + "epoch": 0.7744245357049437, + "grad_norm": 3.28722111150799, + "learning_rate": 2.943272061183948e-06, + "loss": 1.4458, + "step": 23685 + }, + { + "epoch": 0.7745880198796756, + "grad_norm": 3.066566131708615, + "learning_rate": 2.939229727570432e-06, + "loss": 1.3479, + "step": 23690 + }, + { + "epoch": 0.7747515040544075, + "grad_norm": 3.4158508209508716, + "learning_rate": 2.9351896934767953e-06, + "loss": 1.3748, + "step": 23695 + }, + { + "epoch": 0.7749149882291394, + "grad_norm": 3.15595243502785, + "learning_rate": 2.9311519602187833e-06, + "loss": 1.3721, + "step": 23700 + }, + { + "epoch": 0.7750784724038713, + "grad_norm": 3.4853921548652664, + "learning_rate": 2.927116529111379e-06, + "loss": 1.3517, + "step": 23705 + }, + { + "epoch": 0.7752419565786032, + "grad_norm": 3.302197475590212, + "learning_rate": 2.9230834014688305e-06, + "loss": 1.4485, + "step": 23710 + }, + { + "epoch": 0.7754054407533351, + "grad_norm": 3.1579800157059768, + "learning_rate": 2.919052578604622e-06, + "loss": 1.486, + "step": 23715 + }, + { + "epoch": 0.775568924928067, + "grad_norm": 3.3441007204394784, + "learning_rate": 2.9150240618314996e-06, + "loss": 1.5485, + "step": 23720 + }, + { + "epoch": 0.7757324091027988, + "grad_norm": 3.38488238768388, + "learning_rate": 2.910997852461448e-06, + "loss": 1.3986, + "step": 23725 + }, + { + "epoch": 0.7758958932775307, + "grad_norm": 3.3790258953839043, + "learning_rate": 2.906973951805706e-06, + "loss": 1.3491, + "step": 23730 + }, + { + "epoch": 0.7760593774522626, + "grad_norm": 3.243245881754523, + "learning_rate": 2.9029523611747635e-06, + "loss": 1.3753, + "step": 23735 + }, + { + "epoch": 0.7762228616269945, + "grad_norm": 3.4236957787153237, + "learning_rate": 2.8989330818783477e-06, + "loss": 1.6466, + "step": 23740 + }, + { + "epoch": 0.7763863458017264, + "grad_norm": 3.2249744650457273, + "learning_rate": 2.8949161152254456e-06, + "loss": 1.418, + "step": 23745 + }, + { + "epoch": 0.7765498299764583, + "grad_norm": 3.1774775232303796, + "learning_rate": 2.890901462524278e-06, + "loss": 1.5002, + "step": 23750 + }, + { + "epoch": 0.7767133141511902, + "grad_norm": 3.1034661684182843, + "learning_rate": 2.886889125082324e-06, + "loss": 1.3224, + "step": 23755 + }, + { + "epoch": 0.776876798325922, + "grad_norm": 3.0524429114982397, + "learning_rate": 2.882879104206304e-06, + "loss": 1.313, + "step": 23760 + }, + { + "epoch": 0.7770402825006539, + "grad_norm": 3.3282138577211797, + "learning_rate": 2.87887140120218e-06, + "loss": 1.4172, + "step": 23765 + }, + { + "epoch": 0.7772037666753858, + "grad_norm": 3.20976182026376, + "learning_rate": 2.8748660173751675e-06, + "loss": 1.4198, + "step": 23770 + }, + { + "epoch": 0.7773672508501177, + "grad_norm": 3.750576606996202, + "learning_rate": 2.870862954029715e-06, + "loss": 1.4855, + "step": 23775 + }, + { + "epoch": 0.7775307350248496, + "grad_norm": 3.2275952157746364, + "learning_rate": 2.8668622124695254e-06, + "loss": 1.367, + "step": 23780 + }, + { + "epoch": 0.7776942191995815, + "grad_norm": 3.0807913792113744, + "learning_rate": 2.8628637939975477e-06, + "loss": 1.5286, + "step": 23785 + }, + { + "epoch": 0.7778577033743134, + "grad_norm": 3.132439983947082, + "learning_rate": 2.858867699915959e-06, + "loss": 1.399, + "step": 23790 + }, + { + "epoch": 0.7780211875490453, + "grad_norm": 3.2596926850137904, + "learning_rate": 2.8548739315261984e-06, + "loss": 1.3223, + "step": 23795 + }, + { + "epoch": 0.7781846717237771, + "grad_norm": 3.0608212716046377, + "learning_rate": 2.850882490128931e-06, + "loss": 1.3147, + "step": 23800 + }, + { + "epoch": 0.778348155898509, + "grad_norm": 3.169962548824402, + "learning_rate": 2.8468933770240746e-06, + "loss": 1.2428, + "step": 23805 + }, + { + "epoch": 0.7785116400732409, + "grad_norm": 3.1762197542406025, + "learning_rate": 2.8429065935107893e-06, + "loss": 1.4652, + "step": 23810 + }, + { + "epoch": 0.7786751242479728, + "grad_norm": 3.0592862617004766, + "learning_rate": 2.8389221408874655e-06, + "loss": 1.2743, + "step": 23815 + }, + { + "epoch": 0.7788386084227047, + "grad_norm": 3.364095485327774, + "learning_rate": 2.8349400204517486e-06, + "loss": 1.4392, + "step": 23820 + }, + { + "epoch": 0.7790020925974366, + "grad_norm": 3.174383491095513, + "learning_rate": 2.8309602335005102e-06, + "loss": 1.336, + "step": 23825 + }, + { + "epoch": 0.7791655767721685, + "grad_norm": 3.7055172162635874, + "learning_rate": 2.8269827813298756e-06, + "loss": 1.38, + "step": 23830 + }, + { + "epoch": 0.7793290609469004, + "grad_norm": 3.2562035342180096, + "learning_rate": 2.8230076652352046e-06, + "loss": 1.4339, + "step": 23835 + }, + { + "epoch": 0.7794925451216322, + "grad_norm": 3.192828333605932, + "learning_rate": 2.8190348865110884e-06, + "loss": 1.3537, + "step": 23840 + }, + { + "epoch": 0.7796560292963641, + "grad_norm": 3.1930350786140544, + "learning_rate": 2.81506444645137e-06, + "loss": 1.3358, + "step": 23845 + }, + { + "epoch": 0.779819513471096, + "grad_norm": 3.0829167903423325, + "learning_rate": 2.811096346349119e-06, + "loss": 1.3154, + "step": 23850 + }, + { + "epoch": 0.7799829976458279, + "grad_norm": 3.2129917232013088, + "learning_rate": 2.807130587496656e-06, + "loss": 1.3937, + "step": 23855 + }, + { + "epoch": 0.7801464818205598, + "grad_norm": 3.288420407051476, + "learning_rate": 2.8031671711855245e-06, + "loss": 1.2746, + "step": 23860 + }, + { + "epoch": 0.7803099659952917, + "grad_norm": 3.192700168826694, + "learning_rate": 2.799206098706513e-06, + "loss": 1.3931, + "step": 23865 + }, + { + "epoch": 0.7804734501700236, + "grad_norm": 3.337267184512942, + "learning_rate": 2.7952473713496496e-06, + "loss": 1.345, + "step": 23870 + }, + { + "epoch": 0.7806369343447555, + "grad_norm": 3.34835802874087, + "learning_rate": 2.7912909904041884e-06, + "loss": 1.4726, + "step": 23875 + }, + { + "epoch": 0.7808004185194873, + "grad_norm": 3.2965448210631387, + "learning_rate": 2.7873369571586307e-06, + "loss": 1.4647, + "step": 23880 + }, + { + "epoch": 0.7809639026942192, + "grad_norm": 3.220380401442809, + "learning_rate": 2.7833852729007105e-06, + "loss": 1.3693, + "step": 23885 + }, + { + "epoch": 0.7811273868689511, + "grad_norm": 3.233812636610094, + "learning_rate": 2.7794359389173877e-06, + "loss": 1.6005, + "step": 23890 + }, + { + "epoch": 0.781290871043683, + "grad_norm": 3.2172870840538033, + "learning_rate": 2.7754889564948694e-06, + "loss": 1.5422, + "step": 23895 + }, + { + "epoch": 0.7814543552184149, + "grad_norm": 2.9994670735869926, + "learning_rate": 2.771544326918587e-06, + "loss": 1.3514, + "step": 23900 + }, + { + "epoch": 0.7816178393931468, + "grad_norm": 3.19466186852432, + "learning_rate": 2.767602051473212e-06, + "loss": 1.2985, + "step": 23905 + }, + { + "epoch": 0.7817813235678787, + "grad_norm": 3.3905101923152072, + "learning_rate": 2.7636621314426483e-06, + "loss": 1.4037, + "step": 23910 + }, + { + "epoch": 0.7819448077426105, + "grad_norm": 3.114241138270702, + "learning_rate": 2.759724568110026e-06, + "loss": 1.4804, + "step": 23915 + }, + { + "epoch": 0.7821082919173424, + "grad_norm": 3.0589654820484973, + "learning_rate": 2.75578936275772e-06, + "loss": 1.5161, + "step": 23920 + }, + { + "epoch": 0.7822717760920743, + "grad_norm": 3.0673944360379144, + "learning_rate": 2.751856516667324e-06, + "loss": 1.3089, + "step": 23925 + }, + { + "epoch": 0.7824352602668062, + "grad_norm": 3.1655793767007103, + "learning_rate": 2.747926031119673e-06, + "loss": 1.3899, + "step": 23930 + }, + { + "epoch": 0.7825987444415381, + "grad_norm": 3.138624921818799, + "learning_rate": 2.743997907394831e-06, + "loss": 1.4612, + "step": 23935 + }, + { + "epoch": 0.78276222861627, + "grad_norm": 3.2889547401924824, + "learning_rate": 2.740072146772088e-06, + "loss": 1.521, + "step": 23940 + }, + { + "epoch": 0.7829257127910019, + "grad_norm": 3.0971092111520506, + "learning_rate": 2.7361487505299724e-06, + "loss": 1.3378, + "step": 23945 + }, + { + "epoch": 0.7830891969657338, + "grad_norm": 3.17237863550813, + "learning_rate": 2.732227719946232e-06, + "loss": 1.2234, + "step": 23950 + }, + { + "epoch": 0.7832526811404656, + "grad_norm": 2.902452975301643, + "learning_rate": 2.7283090562978553e-06, + "loss": 1.3693, + "step": 23955 + }, + { + "epoch": 0.7834161653151975, + "grad_norm": 3.263952731329399, + "learning_rate": 2.7243927608610565e-06, + "loss": 1.3383, + "step": 23960 + }, + { + "epoch": 0.7835796494899294, + "grad_norm": 3.090660501148062, + "learning_rate": 2.7204788349112708e-06, + "loss": 1.3677, + "step": 23965 + }, + { + "epoch": 0.7837431336646613, + "grad_norm": 2.945806961512505, + "learning_rate": 2.7165672797231747e-06, + "loss": 1.2609, + "step": 23970 + }, + { + "epoch": 0.7839066178393932, + "grad_norm": 2.99981633779962, + "learning_rate": 2.7126580965706604e-06, + "loss": 1.2836, + "step": 23975 + }, + { + "epoch": 0.7840701020141251, + "grad_norm": 3.1941276064896456, + "learning_rate": 2.7087512867268584e-06, + "loss": 1.4202, + "step": 23980 + }, + { + "epoch": 0.784233586188857, + "grad_norm": 3.1502923538261296, + "learning_rate": 2.7048468514641145e-06, + "loss": 1.3408, + "step": 23985 + }, + { + "epoch": 0.7843970703635889, + "grad_norm": 3.344366791831169, + "learning_rate": 2.700944792054012e-06, + "loss": 1.4305, + "step": 23990 + }, + { + "epoch": 0.7845605545383207, + "grad_norm": 3.0990270083841285, + "learning_rate": 2.697045109767358e-06, + "loss": 1.4256, + "step": 23995 + }, + { + "epoch": 0.7847240387130526, + "grad_norm": 3.2970965962326293, + "learning_rate": 2.6931478058741767e-06, + "loss": 1.4831, + "step": 24000 + }, + { + "epoch": 0.7848875228877845, + "grad_norm": 3.141155627874413, + "learning_rate": 2.6892528816437326e-06, + "loss": 1.3965, + "step": 24005 + }, + { + "epoch": 0.7850510070625164, + "grad_norm": 3.349377174772886, + "learning_rate": 2.6853603383444994e-06, + "loss": 1.4972, + "step": 24010 + }, + { + "epoch": 0.7852144912372482, + "grad_norm": 3.222549325738664, + "learning_rate": 2.6814701772441863e-06, + "loss": 1.5326, + "step": 24015 + }, + { + "epoch": 0.7853779754119801, + "grad_norm": 3.276025494804105, + "learning_rate": 2.677582399609727e-06, + "loss": 1.3462, + "step": 24020 + }, + { + "epoch": 0.785541459586712, + "grad_norm": 3.126561835708292, + "learning_rate": 2.6736970067072686e-06, + "loss": 1.357, + "step": 24025 + }, + { + "epoch": 0.7857049437614438, + "grad_norm": 3.0010712344137884, + "learning_rate": 2.6698139998021956e-06, + "loss": 1.33, + "step": 24030 + }, + { + "epoch": 0.7858684279361757, + "grad_norm": 2.9954330877517283, + "learning_rate": 2.6659333801591013e-06, + "loss": 1.3891, + "step": 24035 + }, + { + "epoch": 0.7860319121109076, + "grad_norm": 3.223929693647591, + "learning_rate": 2.6620551490418122e-06, + "loss": 1.3425, + "step": 24040 + }, + { + "epoch": 0.7861953962856395, + "grad_norm": 3.391126116430986, + "learning_rate": 2.6581793077133756e-06, + "loss": 1.5202, + "step": 24045 + }, + { + "epoch": 0.7863588804603714, + "grad_norm": 3.153436240921532, + "learning_rate": 2.6543058574360514e-06, + "loss": 1.413, + "step": 24050 + }, + { + "epoch": 0.7865223646351033, + "grad_norm": 3.0402344071664764, + "learning_rate": 2.6504347994713355e-06, + "loss": 1.4065, + "step": 24055 + }, + { + "epoch": 0.7866858488098352, + "grad_norm": 3.111535223571642, + "learning_rate": 2.6465661350799276e-06, + "loss": 1.3871, + "step": 24060 + }, + { + "epoch": 0.786849332984567, + "grad_norm": 3.056880943057381, + "learning_rate": 2.6426998655217663e-06, + "loss": 1.2897, + "step": 24065 + }, + { + "epoch": 0.7870128171592989, + "grad_norm": 3.143833899476707, + "learning_rate": 2.6388359920559945e-06, + "loss": 1.3658, + "step": 24070 + }, + { + "epoch": 0.7871763013340308, + "grad_norm": 3.1720384677277442, + "learning_rate": 2.6349745159409814e-06, + "loss": 1.4374, + "step": 24075 + }, + { + "epoch": 0.7873397855087627, + "grad_norm": 3.3420884847495076, + "learning_rate": 2.6311154384343153e-06, + "loss": 1.2799, + "step": 24080 + }, + { + "epoch": 0.7875032696834946, + "grad_norm": 3.1314879933037116, + "learning_rate": 2.627258760792808e-06, + "loss": 1.3659, + "step": 24085 + }, + { + "epoch": 0.7876667538582265, + "grad_norm": 3.153562170138128, + "learning_rate": 2.623404484272478e-06, + "loss": 1.3388, + "step": 24090 + }, + { + "epoch": 0.7878302380329584, + "grad_norm": 3.0236955978393243, + "learning_rate": 2.619552610128575e-06, + "loss": 1.3928, + "step": 24095 + }, + { + "epoch": 0.7879937222076903, + "grad_norm": 3.1525263018359295, + "learning_rate": 2.615703139615554e-06, + "loss": 1.2962, + "step": 24100 + }, + { + "epoch": 0.7881572063824221, + "grad_norm": 3.045547933092587, + "learning_rate": 2.611856073987098e-06, + "loss": 1.3337, + "step": 24105 + }, + { + "epoch": 0.788320690557154, + "grad_norm": 3.4605919577955855, + "learning_rate": 2.6080114144961012e-06, + "loss": 1.435, + "step": 24110 + }, + { + "epoch": 0.7884841747318859, + "grad_norm": 3.338088687336502, + "learning_rate": 2.6041691623946718e-06, + "loss": 1.5069, + "step": 24115 + }, + { + "epoch": 0.7886476589066178, + "grad_norm": 3.5413663260732746, + "learning_rate": 2.6003293189341437e-06, + "loss": 1.3422, + "step": 24120 + }, + { + "epoch": 0.7888111430813497, + "grad_norm": 3.1585530603687695, + "learning_rate": 2.596491885365051e-06, + "loss": 1.3642, + "step": 24125 + }, + { + "epoch": 0.7889746272560816, + "grad_norm": 3.3041029799590644, + "learning_rate": 2.592656862937161e-06, + "loss": 1.4859, + "step": 24130 + }, + { + "epoch": 0.7891381114308135, + "grad_norm": 3.242959311685077, + "learning_rate": 2.588824252899438e-06, + "loss": 1.5134, + "step": 24135 + }, + { + "epoch": 0.7893015956055454, + "grad_norm": 2.97556159048036, + "learning_rate": 2.584994056500073e-06, + "loss": 1.3827, + "step": 24140 + }, + { + "epoch": 0.7894650797802772, + "grad_norm": 3.4439532395466848, + "learning_rate": 2.581166274986471e-06, + "loss": 1.4245, + "step": 24145 + }, + { + "epoch": 0.7896285639550091, + "grad_norm": 2.9697894849923046, + "learning_rate": 2.5773409096052393e-06, + "loss": 1.2966, + "step": 24150 + }, + { + "epoch": 0.789792048129741, + "grad_norm": 3.3263358743990556, + "learning_rate": 2.573517961602213e-06, + "loss": 1.5637, + "step": 24155 + }, + { + "epoch": 0.7899555323044729, + "grad_norm": 3.446185197177724, + "learning_rate": 2.5696974322224255e-06, + "loss": 1.4388, + "step": 24160 + }, + { + "epoch": 0.7901190164792048, + "grad_norm": 3.292601095477334, + "learning_rate": 2.565879322710133e-06, + "loss": 1.4885, + "step": 24165 + }, + { + "epoch": 0.7902825006539367, + "grad_norm": 3.3016323038809903, + "learning_rate": 2.5620636343088045e-06, + "loss": 1.3866, + "step": 24170 + }, + { + "epoch": 0.7904459848286686, + "grad_norm": 3.3412154356899553, + "learning_rate": 2.558250368261107e-06, + "loss": 1.479, + "step": 24175 + }, + { + "epoch": 0.7906094690034005, + "grad_norm": 3.028521477490565, + "learning_rate": 2.554439525808937e-06, + "loss": 1.3818, + "step": 24180 + }, + { + "epoch": 0.7907729531781323, + "grad_norm": 3.0945264138162503, + "learning_rate": 2.5506311081933856e-06, + "loss": 1.4071, + "step": 24185 + }, + { + "epoch": 0.7909364373528642, + "grad_norm": 3.411494836135705, + "learning_rate": 2.546825116654763e-06, + "loss": 1.5316, + "step": 24190 + }, + { + "epoch": 0.7910999215275961, + "grad_norm": 3.2679559772419537, + "learning_rate": 2.543021552432592e-06, + "loss": 1.517, + "step": 24195 + }, + { + "epoch": 0.791263405702328, + "grad_norm": 3.3700054762281813, + "learning_rate": 2.539220416765593e-06, + "loss": 1.3935, + "step": 24200 + }, + { + "epoch": 0.7914268898770599, + "grad_norm": 3.1484549863416627, + "learning_rate": 2.535421710891709e-06, + "loss": 1.3928, + "step": 24205 + }, + { + "epoch": 0.7915903740517918, + "grad_norm": 3.1812799080657674, + "learning_rate": 2.5316254360480797e-06, + "loss": 1.3385, + "step": 24210 + }, + { + "epoch": 0.7917538582265237, + "grad_norm": 3.5378448395590785, + "learning_rate": 2.527831593471063e-06, + "loss": 1.5317, + "step": 24215 + }, + { + "epoch": 0.7919173424012556, + "grad_norm": 3.1727747287212944, + "learning_rate": 2.5240401843962204e-06, + "loss": 1.4306, + "step": 24220 + }, + { + "epoch": 0.7920808265759874, + "grad_norm": 3.38942451620292, + "learning_rate": 2.5202512100583187e-06, + "loss": 1.5129, + "step": 24225 + }, + { + "epoch": 0.7922443107507193, + "grad_norm": 3.3341503867215434, + "learning_rate": 2.5164646716913367e-06, + "loss": 1.4119, + "step": 24230 + }, + { + "epoch": 0.7924077949254512, + "grad_norm": 3.059931320786837, + "learning_rate": 2.512680570528453e-06, + "loss": 1.3405, + "step": 24235 + }, + { + "epoch": 0.7925712791001831, + "grad_norm": 3.1753972999051006, + "learning_rate": 2.50889890780206e-06, + "loss": 1.4231, + "step": 24240 + }, + { + "epoch": 0.792734763274915, + "grad_norm": 3.369194303460991, + "learning_rate": 2.505119684743753e-06, + "loss": 1.3683, + "step": 24245 + }, + { + "epoch": 0.7928982474496469, + "grad_norm": 3.163762283246909, + "learning_rate": 2.5013429025843296e-06, + "loss": 1.3382, + "step": 24250 + }, + { + "epoch": 0.7930617316243788, + "grad_norm": 3.141218797817047, + "learning_rate": 2.497568562553799e-06, + "loss": 1.4032, + "step": 24255 + }, + { + "epoch": 0.7932252157991107, + "grad_norm": 3.0619941538603506, + "learning_rate": 2.4937966658813672e-06, + "loss": 1.3112, + "step": 24260 + }, + { + "epoch": 0.7933886999738425, + "grad_norm": 3.0273696700878543, + "learning_rate": 2.4900272137954527e-06, + "loss": 1.3021, + "step": 24265 + }, + { + "epoch": 0.7935521841485744, + "grad_norm": 3.0153949982106476, + "learning_rate": 2.486260207523669e-06, + "loss": 1.4822, + "step": 24270 + }, + { + "epoch": 0.7937156683233063, + "grad_norm": 3.200242937923815, + "learning_rate": 2.482495648292842e-06, + "loss": 1.2731, + "step": 24275 + }, + { + "epoch": 0.7938791524980382, + "grad_norm": 3.4238993304014724, + "learning_rate": 2.4787335373289945e-06, + "loss": 1.426, + "step": 24280 + }, + { + "epoch": 0.7940426366727701, + "grad_norm": 2.9164284784413645, + "learning_rate": 2.4749738758573517e-06, + "loss": 1.3412, + "step": 24285 + }, + { + "epoch": 0.794206120847502, + "grad_norm": 3.3707780017836444, + "learning_rate": 2.4712166651023452e-06, + "loss": 1.4967, + "step": 24290 + }, + { + "epoch": 0.7943696050222339, + "grad_norm": 3.0071221202973515, + "learning_rate": 2.4674619062876094e-06, + "loss": 1.426, + "step": 24295 + }, + { + "epoch": 0.7945330891969657, + "grad_norm": 3.2040427554912623, + "learning_rate": 2.4637096006359717e-06, + "loss": 1.441, + "step": 24300 + }, + { + "epoch": 0.7946965733716976, + "grad_norm": 3.5844819614252414, + "learning_rate": 2.4599597493694725e-06, + "loss": 1.5445, + "step": 24305 + }, + { + "epoch": 0.7948600575464295, + "grad_norm": 3.377192831500161, + "learning_rate": 2.45621235370934e-06, + "loss": 1.4714, + "step": 24310 + }, + { + "epoch": 0.7950235417211614, + "grad_norm": 3.1827517271641, + "learning_rate": 2.4524674148760108e-06, + "loss": 1.3388, + "step": 24315 + }, + { + "epoch": 0.7951870258958933, + "grad_norm": 3.1947146480610784, + "learning_rate": 2.448724934089125e-06, + "loss": 1.4509, + "step": 24320 + }, + { + "epoch": 0.7953505100706252, + "grad_norm": 3.164315208479899, + "learning_rate": 2.4449849125675097e-06, + "loss": 1.4863, + "step": 24325 + }, + { + "epoch": 0.7955139942453571, + "grad_norm": 3.1455518301223835, + "learning_rate": 2.4412473515292045e-06, + "loss": 1.4113, + "step": 24330 + }, + { + "epoch": 0.795677478420089, + "grad_norm": 3.2663660948976654, + "learning_rate": 2.4375122521914353e-06, + "loss": 1.547, + "step": 24335 + }, + { + "epoch": 0.7958409625948208, + "grad_norm": 3.5005435751108145, + "learning_rate": 2.4337796157706362e-06, + "loss": 1.5803, + "step": 24340 + }, + { + "epoch": 0.7960044467695527, + "grad_norm": 3.140612638860524, + "learning_rate": 2.4300494434824373e-06, + "loss": 1.5641, + "step": 24345 + }, + { + "epoch": 0.7961679309442846, + "grad_norm": 3.0076983534603285, + "learning_rate": 2.426321736541659e-06, + "loss": 1.3525, + "step": 24350 + }, + { + "epoch": 0.7963314151190165, + "grad_norm": 3.230836374928098, + "learning_rate": 2.4225964961623295e-06, + "loss": 1.5576, + "step": 24355 + }, + { + "epoch": 0.7964948992937484, + "grad_norm": 3.2013444586087174, + "learning_rate": 2.418873723557663e-06, + "loss": 1.5393, + "step": 24360 + }, + { + "epoch": 0.7966583834684803, + "grad_norm": 3.1854912708411582, + "learning_rate": 2.4151534199400785e-06, + "loss": 1.3557, + "step": 24365 + }, + { + "epoch": 0.7968218676432122, + "grad_norm": 3.3250871862469986, + "learning_rate": 2.4114355865211904e-06, + "loss": 1.5323, + "step": 24370 + }, + { + "epoch": 0.796985351817944, + "grad_norm": 3.117071792325645, + "learning_rate": 2.4077202245117993e-06, + "loss": 1.436, + "step": 24375 + }, + { + "epoch": 0.7971488359926759, + "grad_norm": 3.280079815500708, + "learning_rate": 2.404007335121915e-06, + "loss": 1.3408, + "step": 24380 + }, + { + "epoch": 0.7973123201674078, + "grad_norm": 3.1654160778377554, + "learning_rate": 2.4002969195607274e-06, + "loss": 1.5179, + "step": 24385 + }, + { + "epoch": 0.7974758043421397, + "grad_norm": 3.2552733119692947, + "learning_rate": 2.3965889790366337e-06, + "loss": 1.5177, + "step": 24390 + }, + { + "epoch": 0.7976392885168716, + "grad_norm": 3.085309158541442, + "learning_rate": 2.3928835147572137e-06, + "loss": 1.3313, + "step": 24395 + }, + { + "epoch": 0.7978027726916035, + "grad_norm": 3.1360000774429637, + "learning_rate": 2.389180527929251e-06, + "loss": 1.3648, + "step": 24400 + }, + { + "epoch": 0.7979662568663354, + "grad_norm": 3.6130936178958417, + "learning_rate": 2.385480019758718e-06, + "loss": 1.3948, + "step": 24405 + }, + { + "epoch": 0.7981297410410673, + "grad_norm": 3.0617755364255173, + "learning_rate": 2.3817819914507755e-06, + "loss": 1.4052, + "step": 24410 + }, + { + "epoch": 0.7982932252157992, + "grad_norm": 3.58700016959767, + "learning_rate": 2.378086444209785e-06, + "loss": 1.3559, + "step": 24415 + }, + { + "epoch": 0.798456709390531, + "grad_norm": 3.2293487754499663, + "learning_rate": 2.3743933792392914e-06, + "loss": 1.4109, + "step": 24420 + }, + { + "epoch": 0.7986201935652629, + "grad_norm": 3.1813842335133633, + "learning_rate": 2.370702797742037e-06, + "loss": 1.4552, + "step": 24425 + }, + { + "epoch": 0.7987836777399948, + "grad_norm": 3.424601518808388, + "learning_rate": 2.3670147009199586e-06, + "loss": 1.337, + "step": 24430 + }, + { + "epoch": 0.7989471619147267, + "grad_norm": 3.3484751539894715, + "learning_rate": 2.363329089974171e-06, + "loss": 1.403, + "step": 24435 + }, + { + "epoch": 0.7991106460894586, + "grad_norm": 3.6176212374223873, + "learning_rate": 2.3596459661049943e-06, + "loss": 1.4693, + "step": 24440 + }, + { + "epoch": 0.7992741302641905, + "grad_norm": 3.4438249322063146, + "learning_rate": 2.3559653305119257e-06, + "loss": 1.4369, + "step": 24445 + }, + { + "epoch": 0.7994376144389224, + "grad_norm": 3.1702894725515356, + "learning_rate": 2.352287184393661e-06, + "loss": 1.3976, + "step": 24450 + }, + { + "epoch": 0.7996010986136542, + "grad_norm": 2.970896230773802, + "learning_rate": 2.348611528948086e-06, + "loss": 1.3935, + "step": 24455 + }, + { + "epoch": 0.7997645827883861, + "grad_norm": 3.340546934973194, + "learning_rate": 2.3449383653722646e-06, + "loss": 1.4749, + "step": 24460 + }, + { + "epoch": 0.799928066963118, + "grad_norm": 3.0850693432782, + "learning_rate": 2.3412676948624615e-06, + "loss": 1.4888, + "step": 24465 + }, + { + "epoch": 0.8000915511378499, + "grad_norm": 3.287196391099547, + "learning_rate": 2.3375995186141197e-06, + "loss": 1.4282, + "step": 24470 + }, + { + "epoch": 0.8002550353125818, + "grad_norm": 3.4653026654503467, + "learning_rate": 2.333933837821877e-06, + "loss": 1.4575, + "step": 24475 + }, + { + "epoch": 0.8004185194873136, + "grad_norm": 3.19816961710101, + "learning_rate": 2.3302706536795607e-06, + "loss": 1.353, + "step": 24480 + }, + { + "epoch": 0.8005820036620455, + "grad_norm": 3.2171204864718885, + "learning_rate": 2.32660996738017e-06, + "loss": 1.4168, + "step": 24485 + }, + { + "epoch": 0.8007454878367773, + "grad_norm": 3.2719671657472436, + "learning_rate": 2.322951780115905e-06, + "loss": 1.4027, + "step": 24490 + }, + { + "epoch": 0.8009089720115092, + "grad_norm": 3.134698854605316, + "learning_rate": 2.3192960930781516e-06, + "loss": 1.4325, + "step": 24495 + }, + { + "epoch": 0.8010724561862411, + "grad_norm": 3.042693614484662, + "learning_rate": 2.3156429074574717e-06, + "loss": 1.4309, + "step": 24500 + }, + { + "epoch": 0.801235940360973, + "grad_norm": 2.927368846939408, + "learning_rate": 2.311992224443623e-06, + "loss": 1.3422, + "step": 24505 + }, + { + "epoch": 0.8013994245357049, + "grad_norm": 3.2554144473409887, + "learning_rate": 2.308344045225539e-06, + "loss": 1.4723, + "step": 24510 + }, + { + "epoch": 0.8015629087104368, + "grad_norm": 3.4912812386299894, + "learning_rate": 2.3046983709913483e-06, + "loss": 1.55, + "step": 24515 + }, + { + "epoch": 0.8017263928851687, + "grad_norm": 3.0520176664700878, + "learning_rate": 2.3010552029283504e-06, + "loss": 1.3346, + "step": 24520 + }, + { + "epoch": 0.8018898770599006, + "grad_norm": 3.457856987670365, + "learning_rate": 2.2974145422230397e-06, + "loss": 1.3189, + "step": 24525 + }, + { + "epoch": 0.8020533612346324, + "grad_norm": 3.2771524742372913, + "learning_rate": 2.293776390061093e-06, + "loss": 1.3918, + "step": 24530 + }, + { + "epoch": 0.8022168454093643, + "grad_norm": 3.41153568433213, + "learning_rate": 2.2901407476273617e-06, + "loss": 1.4364, + "step": 24535 + }, + { + "epoch": 0.8023803295840962, + "grad_norm": 3.224955972056874, + "learning_rate": 2.2865076161058907e-06, + "loss": 1.4728, + "step": 24540 + }, + { + "epoch": 0.8025438137588281, + "grad_norm": 3.3041650634371, + "learning_rate": 2.282876996679897e-06, + "loss": 1.282, + "step": 24545 + }, + { + "epoch": 0.80270729793356, + "grad_norm": 3.084931445411499, + "learning_rate": 2.2792488905317857e-06, + "loss": 1.3823, + "step": 24550 + }, + { + "epoch": 0.8028707821082919, + "grad_norm": 3.1209353165311167, + "learning_rate": 2.275623298843147e-06, + "loss": 1.4345, + "step": 24555 + }, + { + "epoch": 0.8030342662830238, + "grad_norm": 3.2404994784810737, + "learning_rate": 2.2720002227947403e-06, + "loss": 1.3719, + "step": 24560 + }, + { + "epoch": 0.8031977504577557, + "grad_norm": 2.9941571764059156, + "learning_rate": 2.268379663566519e-06, + "loss": 1.4692, + "step": 24565 + }, + { + "epoch": 0.8033612346324875, + "grad_norm": 3.395939225264196, + "learning_rate": 2.2647616223376034e-06, + "loss": 1.3773, + "step": 24570 + }, + { + "epoch": 0.8035247188072194, + "grad_norm": 3.35691586356258, + "learning_rate": 2.2611461002863054e-06, + "loss": 1.3987, + "step": 24575 + }, + { + "epoch": 0.8036882029819513, + "grad_norm": 3.4146603233860087, + "learning_rate": 2.2575330985901143e-06, + "loss": 1.4264, + "step": 24580 + }, + { + "epoch": 0.8038516871566832, + "grad_norm": 3.317831989639217, + "learning_rate": 2.2539226184256915e-06, + "loss": 1.3907, + "step": 24585 + }, + { + "epoch": 0.8040151713314151, + "grad_norm": 2.87254445097696, + "learning_rate": 2.250314660968885e-06, + "loss": 1.3416, + "step": 24590 + }, + { + "epoch": 0.804178655506147, + "grad_norm": 3.0042755991496835, + "learning_rate": 2.2467092273947145e-06, + "loss": 1.3495, + "step": 24595 + }, + { + "epoch": 0.8043421396808789, + "grad_norm": 3.551437835898312, + "learning_rate": 2.243106318877384e-06, + "loss": 1.4111, + "step": 24600 + }, + { + "epoch": 0.8045056238556108, + "grad_norm": 3.1290129113182656, + "learning_rate": 2.239505936590275e-06, + "loss": 1.3782, + "step": 24605 + }, + { + "epoch": 0.8046691080303426, + "grad_norm": 3.145798208923198, + "learning_rate": 2.2359080817059385e-06, + "loss": 1.3299, + "step": 24610 + }, + { + "epoch": 0.8048325922050745, + "grad_norm": 3.194042241022628, + "learning_rate": 2.2323127553961133e-06, + "loss": 1.4478, + "step": 24615 + }, + { + "epoch": 0.8049960763798064, + "grad_norm": 3.0841706138165796, + "learning_rate": 2.228719958831703e-06, + "loss": 1.3527, + "step": 24620 + }, + { + "epoch": 0.8051595605545383, + "grad_norm": 3.0773710282529154, + "learning_rate": 2.2251296931827958e-06, + "loss": 1.3927, + "step": 24625 + }, + { + "epoch": 0.8053230447292702, + "grad_norm": 3.0327394267314594, + "learning_rate": 2.2215419596186573e-06, + "loss": 1.3219, + "step": 24630 + }, + { + "epoch": 0.8054865289040021, + "grad_norm": 3.772883504318911, + "learning_rate": 2.2179567593077187e-06, + "loss": 1.4879, + "step": 24635 + }, + { + "epoch": 0.805650013078734, + "grad_norm": 3.5225850282001967, + "learning_rate": 2.214374093417596e-06, + "loss": 1.308, + "step": 24640 + }, + { + "epoch": 0.8058134972534658, + "grad_norm": 3.4563040554004214, + "learning_rate": 2.2107939631150723e-06, + "loss": 1.4049, + "step": 24645 + }, + { + "epoch": 0.8059769814281977, + "grad_norm": 3.191394577878618, + "learning_rate": 2.207216369566112e-06, + "loss": 1.5051, + "step": 24650 + }, + { + "epoch": 0.8061404656029296, + "grad_norm": 3.2809359785731114, + "learning_rate": 2.203641313935845e-06, + "loss": 1.4528, + "step": 24655 + }, + { + "epoch": 0.8063039497776615, + "grad_norm": 3.178712071944659, + "learning_rate": 2.2000687973885824e-06, + "loss": 1.4196, + "step": 24660 + }, + { + "epoch": 0.8064674339523934, + "grad_norm": 3.098703433454439, + "learning_rate": 2.1964988210878067e-06, + "loss": 1.468, + "step": 24665 + }, + { + "epoch": 0.8066309181271253, + "grad_norm": 3.238015307904182, + "learning_rate": 2.1929313861961677e-06, + "loss": 1.4924, + "step": 24670 + }, + { + "epoch": 0.8067944023018572, + "grad_norm": 3.3335966736547036, + "learning_rate": 2.1893664938754967e-06, + "loss": 1.3929, + "step": 24675 + }, + { + "epoch": 0.8069578864765891, + "grad_norm": 3.3950084432070664, + "learning_rate": 2.1858041452867863e-06, + "loss": 1.4386, + "step": 24680 + }, + { + "epoch": 0.807121370651321, + "grad_norm": 3.058106631023779, + "learning_rate": 2.1822443415902073e-06, + "loss": 1.4416, + "step": 24685 + }, + { + "epoch": 0.8072848548260528, + "grad_norm": 2.72297897983425, + "learning_rate": 2.178687083945109e-06, + "loss": 1.3353, + "step": 24690 + }, + { + "epoch": 0.8074483390007847, + "grad_norm": 3.381887079373083, + "learning_rate": 2.1751323735099894e-06, + "loss": 1.3943, + "step": 24695 + }, + { + "epoch": 0.8076118231755166, + "grad_norm": 3.2218218098146343, + "learning_rate": 2.171580211442538e-06, + "loss": 1.586, + "step": 24700 + }, + { + "epoch": 0.8077753073502485, + "grad_norm": 3.008478101992111, + "learning_rate": 2.1680305988996075e-06, + "loss": 1.4846, + "step": 24705 + }, + { + "epoch": 0.8079387915249804, + "grad_norm": 3.222384713611992, + "learning_rate": 2.164483537037216e-06, + "loss": 1.3039, + "step": 24710 + }, + { + "epoch": 0.8081022756997123, + "grad_norm": 3.233967958772179, + "learning_rate": 2.1609390270105614e-06, + "loss": 1.4389, + "step": 24715 + }, + { + "epoch": 0.8082657598744442, + "grad_norm": 3.4220563363919596, + "learning_rate": 2.1573970699739953e-06, + "loss": 1.5441, + "step": 24720 + }, + { + "epoch": 0.808429244049176, + "grad_norm": 2.9664920333526545, + "learning_rate": 2.1538576670810497e-06, + "loss": 1.3562, + "step": 24725 + }, + { + "epoch": 0.8085927282239079, + "grad_norm": 3.2242562328525923, + "learning_rate": 2.150320819484426e-06, + "loss": 1.3741, + "step": 24730 + }, + { + "epoch": 0.8087562123986398, + "grad_norm": 2.9307166464970957, + "learning_rate": 2.146786528335982e-06, + "loss": 1.3047, + "step": 24735 + }, + { + "epoch": 0.8089196965733717, + "grad_norm": 3.2061952109320413, + "learning_rate": 2.1432547947867543e-06, + "loss": 1.3657, + "step": 24740 + }, + { + "epoch": 0.8090831807481036, + "grad_norm": 3.0834133122830742, + "learning_rate": 2.1397256199869385e-06, + "loss": 1.3147, + "step": 24745 + }, + { + "epoch": 0.8092466649228355, + "grad_norm": 3.158627811117507, + "learning_rate": 2.1361990050859028e-06, + "loss": 1.3676, + "step": 24750 + }, + { + "epoch": 0.8094101490975674, + "grad_norm": 3.3985859794360684, + "learning_rate": 2.13267495123218e-06, + "loss": 1.2758, + "step": 24755 + }, + { + "epoch": 0.8095736332722993, + "grad_norm": 3.1974826077861223, + "learning_rate": 2.129153459573465e-06, + "loss": 1.3492, + "step": 24760 + }, + { + "epoch": 0.8097371174470311, + "grad_norm": 3.3345562424707755, + "learning_rate": 2.125634531256625e-06, + "loss": 1.3857, + "step": 24765 + }, + { + "epoch": 0.809900601621763, + "grad_norm": 3.3396066489217615, + "learning_rate": 2.1221181674276846e-06, + "loss": 1.3968, + "step": 24770 + }, + { + "epoch": 0.8100640857964949, + "grad_norm": 3.564387999074219, + "learning_rate": 2.11860436923184e-06, + "loss": 1.4569, + "step": 24775 + }, + { + "epoch": 0.8102275699712268, + "grad_norm": 3.4011089788228417, + "learning_rate": 2.115093137813451e-06, + "loss": 1.5447, + "step": 24780 + }, + { + "epoch": 0.8103910541459587, + "grad_norm": 3.148106262038899, + "learning_rate": 2.111584474316034e-06, + "loss": 1.4618, + "step": 24785 + }, + { + "epoch": 0.8105545383206906, + "grad_norm": 3.418955293379201, + "learning_rate": 2.1080783798822824e-06, + "loss": 1.3502, + "step": 24790 + }, + { + "epoch": 0.8107180224954225, + "grad_norm": 3.163846920418021, + "learning_rate": 2.104574855654037e-06, + "loss": 1.4307, + "step": 24795 + }, + { + "epoch": 0.8108815066701544, + "grad_norm": 3.1769898932918488, + "learning_rate": 2.101073902772317e-06, + "loss": 1.3073, + "step": 24800 + }, + { + "epoch": 0.8110449908448862, + "grad_norm": 3.293521613464438, + "learning_rate": 2.097575522377291e-06, + "loss": 1.4728, + "step": 24805 + }, + { + "epoch": 0.8112084750196181, + "grad_norm": 3.2931682655602788, + "learning_rate": 2.094079715608298e-06, + "loss": 1.4932, + "step": 24810 + }, + { + "epoch": 0.81137195919435, + "grad_norm": 3.1012975656747983, + "learning_rate": 2.09058648360384e-06, + "loss": 1.3376, + "step": 24815 + }, + { + "epoch": 0.8115354433690819, + "grad_norm": 3.187531534665998, + "learning_rate": 2.087095827501572e-06, + "loss": 1.4374, + "step": 24820 + }, + { + "epoch": 0.8116989275438138, + "grad_norm": 3.1938195530538502, + "learning_rate": 2.08360774843832e-06, + "loss": 1.4089, + "step": 24825 + }, + { + "epoch": 0.8118624117185457, + "grad_norm": 3.344232673575034, + "learning_rate": 2.0801222475500604e-06, + "loss": 1.5014, + "step": 24830 + }, + { + "epoch": 0.8120258958932776, + "grad_norm": 3.3006024328288093, + "learning_rate": 2.0766393259719385e-06, + "loss": 1.5221, + "step": 24835 + }, + { + "epoch": 0.8121893800680094, + "grad_norm": 3.219165183233052, + "learning_rate": 2.0731589848382583e-06, + "loss": 1.3924, + "step": 24840 + }, + { + "epoch": 0.8123528642427413, + "grad_norm": 3.3186884943427195, + "learning_rate": 2.069681225282478e-06, + "loss": 1.4533, + "step": 24845 + }, + { + "epoch": 0.8125163484174732, + "grad_norm": 3.220568203007598, + "learning_rate": 2.0662060484372225e-06, + "loss": 1.3189, + "step": 24850 + }, + { + "epoch": 0.8126798325922051, + "grad_norm": 3.3527144017634685, + "learning_rate": 2.062733455434267e-06, + "loss": 1.3635, + "step": 24855 + }, + { + "epoch": 0.812843316766937, + "grad_norm": 3.434154706664831, + "learning_rate": 2.0592634474045527e-06, + "loss": 1.4624, + "step": 24860 + }, + { + "epoch": 0.8130068009416689, + "grad_norm": 3.2079475996404976, + "learning_rate": 2.0557960254781782e-06, + "loss": 1.3701, + "step": 24865 + }, + { + "epoch": 0.8131702851164008, + "grad_norm": 3.147918060934025, + "learning_rate": 2.0523311907843933e-06, + "loss": 1.3776, + "step": 24870 + }, + { + "epoch": 0.8133337692911327, + "grad_norm": 3.34386094521325, + "learning_rate": 2.048868944451615e-06, + "loss": 1.4141, + "step": 24875 + }, + { + "epoch": 0.8134972534658645, + "grad_norm": 3.158555248419694, + "learning_rate": 2.045409287607407e-06, + "loss": 1.394, + "step": 24880 + }, + { + "epoch": 0.8136607376405964, + "grad_norm": 2.9226804692872603, + "learning_rate": 2.041952221378497e-06, + "loss": 1.3805, + "step": 24885 + }, + { + "epoch": 0.8138242218153283, + "grad_norm": 3.1340044325878105, + "learning_rate": 2.0384977468907696e-06, + "loss": 1.2668, + "step": 24890 + }, + { + "epoch": 0.8139877059900602, + "grad_norm": 3.13078937268601, + "learning_rate": 2.0350458652692593e-06, + "loss": 1.3279, + "step": 24895 + }, + { + "epoch": 0.8141511901647921, + "grad_norm": 3.206976017106166, + "learning_rate": 2.0315965776381575e-06, + "loss": 1.3653, + "step": 24900 + }, + { + "epoch": 0.814314674339524, + "grad_norm": 3.4464560665406196, + "learning_rate": 2.028149885120817e-06, + "loss": 1.3877, + "step": 24905 + }, + { + "epoch": 0.8144781585142559, + "grad_norm": 3.7562056930311747, + "learning_rate": 2.024705788839737e-06, + "loss": 1.3907, + "step": 24910 + }, + { + "epoch": 0.8146416426889878, + "grad_norm": 3.1687797305444954, + "learning_rate": 2.021264289916579e-06, + "loss": 1.4746, + "step": 24915 + }, + { + "epoch": 0.8148051268637196, + "grad_norm": 3.1352060185971062, + "learning_rate": 2.0178253894721523e-06, + "loss": 1.3907, + "step": 24920 + }, + { + "epoch": 0.8149686110384515, + "grad_norm": 3.231806190276589, + "learning_rate": 2.014389088626425e-06, + "loss": 1.3416, + "step": 24925 + }, + { + "epoch": 0.8151320952131834, + "grad_norm": 3.2020905983518464, + "learning_rate": 2.010955388498512e-06, + "loss": 1.4155, + "step": 24930 + }, + { + "epoch": 0.8152955793879153, + "grad_norm": 3.04754119697926, + "learning_rate": 2.0075242902066884e-06, + "loss": 1.5389, + "step": 24935 + }, + { + "epoch": 0.8154590635626472, + "grad_norm": 3.334834965195726, + "learning_rate": 2.0040957948683792e-06, + "loss": 1.4338, + "step": 24940 + }, + { + "epoch": 0.815622547737379, + "grad_norm": 3.3881990727173337, + "learning_rate": 2.0006699036001596e-06, + "loss": 1.4817, + "step": 24945 + }, + { + "epoch": 0.8157860319121109, + "grad_norm": 3.1589762219745916, + "learning_rate": 1.9972466175177617e-06, + "loss": 1.5065, + "step": 24950 + }, + { + "epoch": 0.8159495160868427, + "grad_norm": 3.3452257015540736, + "learning_rate": 1.9938259377360604e-06, + "loss": 1.3701, + "step": 24955 + }, + { + "epoch": 0.8161130002615746, + "grad_norm": 3.3630039982654605, + "learning_rate": 1.9904078653690905e-06, + "loss": 1.3644, + "step": 24960 + }, + { + "epoch": 0.8162764844363065, + "grad_norm": 3.265323948490837, + "learning_rate": 1.986992401530037e-06, + "loss": 1.365, + "step": 24965 + }, + { + "epoch": 0.8164399686110384, + "grad_norm": 3.3429902854407936, + "learning_rate": 1.983579547331227e-06, + "loss": 1.283, + "step": 24970 + }, + { + "epoch": 0.8166034527857703, + "grad_norm": 3.2429066501737376, + "learning_rate": 1.9801693038841498e-06, + "loss": 1.4215, + "step": 24975 + }, + { + "epoch": 0.8167669369605022, + "grad_norm": 3.1892682280004285, + "learning_rate": 1.976761672299431e-06, + "loss": 1.3421, + "step": 24980 + }, + { + "epoch": 0.8169304211352341, + "grad_norm": 3.3431759753120533, + "learning_rate": 1.9733566536868576e-06, + "loss": 1.4311, + "step": 24985 + }, + { + "epoch": 0.817093905309966, + "grad_norm": 3.0930823381324606, + "learning_rate": 1.9699542491553625e-06, + "loss": 1.3048, + "step": 24990 + }, + { + "epoch": 0.8172573894846978, + "grad_norm": 3.147463009579642, + "learning_rate": 1.9665544598130204e-06, + "loss": 1.4276, + "step": 24995 + }, + { + "epoch": 0.8174208736594297, + "grad_norm": 3.4603779043404073, + "learning_rate": 1.9631572867670633e-06, + "loss": 1.4041, + "step": 25000 + }, + { + "epoch": 0.8175843578341616, + "grad_norm": 3.3035687530761617, + "learning_rate": 1.9597627311238645e-06, + "loss": 1.2778, + "step": 25005 + }, + { + "epoch": 0.8177478420088935, + "grad_norm": 3.354006336333805, + "learning_rate": 1.9563707939889477e-06, + "loss": 1.397, + "step": 25010 + }, + { + "epoch": 0.8179113261836254, + "grad_norm": 3.3699172619929088, + "learning_rate": 1.9529814764669874e-06, + "loss": 1.4632, + "step": 25015 + }, + { + "epoch": 0.8180748103583573, + "grad_norm": 3.3646545789073508, + "learning_rate": 1.9495947796617963e-06, + "loss": 1.4634, + "step": 25020 + }, + { + "epoch": 0.8182382945330892, + "grad_norm": 3.411790797530077, + "learning_rate": 1.9462107046763435e-06, + "loss": 1.3865, + "step": 25025 + }, + { + "epoch": 0.818401778707821, + "grad_norm": 3.1021428085068576, + "learning_rate": 1.9428292526127344e-06, + "loss": 1.4901, + "step": 25030 + }, + { + "epoch": 0.8185652628825529, + "grad_norm": 3.2582123413053687, + "learning_rate": 1.9394504245722266e-06, + "loss": 1.4426, + "step": 25035 + }, + { + "epoch": 0.8187287470572848, + "grad_norm": 2.9748501701306336, + "learning_rate": 1.9360742216552265e-06, + "loss": 1.4909, + "step": 25040 + }, + { + "epoch": 0.8188922312320167, + "grad_norm": 3.05719783785625, + "learning_rate": 1.9327006449612726e-06, + "loss": 1.3703, + "step": 25045 + }, + { + "epoch": 0.8190557154067486, + "grad_norm": 3.111387682580249, + "learning_rate": 1.9293296955890637e-06, + "loss": 1.2687, + "step": 25050 + }, + { + "epoch": 0.8192191995814805, + "grad_norm": 3.319713552223432, + "learning_rate": 1.9259613746364294e-06, + "loss": 1.5334, + "step": 25055 + }, + { + "epoch": 0.8193826837562124, + "grad_norm": 2.9835509084929313, + "learning_rate": 1.9225956832003535e-06, + "loss": 1.4741, + "step": 25060 + }, + { + "epoch": 0.8195461679309443, + "grad_norm": 3.378336096139437, + "learning_rate": 1.9192326223769552e-06, + "loss": 1.3844, + "step": 25065 + }, + { + "epoch": 0.8197096521056761, + "grad_norm": 3.0641845091035687, + "learning_rate": 1.915872193261503e-06, + "loss": 1.3892, + "step": 25070 + }, + { + "epoch": 0.819873136280408, + "grad_norm": 3.5037714438466123, + "learning_rate": 1.9125143969484105e-06, + "loss": 1.5275, + "step": 25075 + }, + { + "epoch": 0.8200366204551399, + "grad_norm": 3.2250787331283575, + "learning_rate": 1.9091592345312226e-06, + "loss": 1.3711, + "step": 25080 + }, + { + "epoch": 0.8202001046298718, + "grad_norm": 3.3311918348943217, + "learning_rate": 1.9058067071026387e-06, + "loss": 1.3468, + "step": 25085 + }, + { + "epoch": 0.8203635888046037, + "grad_norm": 3.4623747668006732, + "learning_rate": 1.902456815754491e-06, + "loss": 1.518, + "step": 25090 + }, + { + "epoch": 0.8205270729793356, + "grad_norm": 3.5690488626325596, + "learning_rate": 1.8991095615777589e-06, + "loss": 1.4077, + "step": 25095 + }, + { + "epoch": 0.8206905571540675, + "grad_norm": 3.226888610295418, + "learning_rate": 1.895764945662566e-06, + "loss": 1.4236, + "step": 25100 + }, + { + "epoch": 0.8208540413287994, + "grad_norm": 3.2187029010082195, + "learning_rate": 1.892422969098162e-06, + "loss": 1.4102, + "step": 25105 + }, + { + "epoch": 0.8210175255035312, + "grad_norm": 3.3015064091160715, + "learning_rate": 1.8890836329729522e-06, + "loss": 1.4546, + "step": 25110 + }, + { + "epoch": 0.8211810096782631, + "grad_norm": 3.194393139977881, + "learning_rate": 1.8857469383744775e-06, + "loss": 1.3306, + "step": 25115 + }, + { + "epoch": 0.821344493852995, + "grad_norm": 2.994137623958473, + "learning_rate": 1.8824128863894142e-06, + "loss": 1.379, + "step": 25120 + }, + { + "epoch": 0.8215079780277269, + "grad_norm": 3.42133365996329, + "learning_rate": 1.879081478103586e-06, + "loss": 1.4588, + "step": 25125 + }, + { + "epoch": 0.8216714622024588, + "grad_norm": 3.48117776450133, + "learning_rate": 1.875752714601945e-06, + "loss": 1.4055, + "step": 25130 + }, + { + "epoch": 0.8218349463771907, + "grad_norm": 3.024518836493186, + "learning_rate": 1.8724265969685906e-06, + "loss": 1.365, + "step": 25135 + }, + { + "epoch": 0.8219984305519226, + "grad_norm": 3.158637748690525, + "learning_rate": 1.869103126286762e-06, + "loss": 1.4828, + "step": 25140 + }, + { + "epoch": 0.8221619147266545, + "grad_norm": 3.1011170274203845, + "learning_rate": 1.8657823036388255e-06, + "loss": 1.4314, + "step": 25145 + }, + { + "epoch": 0.8223253989013863, + "grad_norm": 3.3636071346905787, + "learning_rate": 1.8624641301062974e-06, + "loss": 1.4912, + "step": 25150 + }, + { + "epoch": 0.8224888830761182, + "grad_norm": 3.2733123488096005, + "learning_rate": 1.8591486067698206e-06, + "loss": 1.4022, + "step": 25155 + }, + { + "epoch": 0.8226523672508501, + "grad_norm": 3.143426631192589, + "learning_rate": 1.8558357347091816e-06, + "loss": 1.3782, + "step": 25160 + }, + { + "epoch": 0.822815851425582, + "grad_norm": 3.2860276246000217, + "learning_rate": 1.8525255150033038e-06, + "loss": 1.4127, + "step": 25165 + }, + { + "epoch": 0.8229793356003139, + "grad_norm": 3.751821007577529, + "learning_rate": 1.849217948730242e-06, + "loss": 1.5056, + "step": 25170 + }, + { + "epoch": 0.8231428197750458, + "grad_norm": 3.285069175740683, + "learning_rate": 1.8459130369671918e-06, + "loss": 1.4423, + "step": 25175 + }, + { + "epoch": 0.8233063039497777, + "grad_norm": 3.151003230283252, + "learning_rate": 1.8426107807904781e-06, + "loss": 1.3598, + "step": 25180 + }, + { + "epoch": 0.8234697881245095, + "grad_norm": 3.101088697926721, + "learning_rate": 1.8393111812755704e-06, + "loss": 1.4147, + "step": 25185 + }, + { + "epoch": 0.8236332722992414, + "grad_norm": 3.1146384281952266, + "learning_rate": 1.8360142394970616e-06, + "loss": 1.3569, + "step": 25190 + }, + { + "epoch": 0.8237967564739733, + "grad_norm": 3.4721761818181456, + "learning_rate": 1.8327199565286869e-06, + "loss": 1.5905, + "step": 25195 + }, + { + "epoch": 0.8239602406487052, + "grad_norm": 3.5856635177085394, + "learning_rate": 1.8294283334433171e-06, + "loss": 1.461, + "step": 25200 + }, + { + "epoch": 0.8241237248234371, + "grad_norm": 3.452804938234267, + "learning_rate": 1.8261393713129484e-06, + "loss": 1.4188, + "step": 25205 + }, + { + "epoch": 0.824287208998169, + "grad_norm": 3.168716265152153, + "learning_rate": 1.82285307120872e-06, + "loss": 1.3902, + "step": 25210 + }, + { + "epoch": 0.8244506931729009, + "grad_norm": 3.3725859971668157, + "learning_rate": 1.819569434200893e-06, + "loss": 1.467, + "step": 25215 + }, + { + "epoch": 0.8246141773476328, + "grad_norm": 3.0399213396392275, + "learning_rate": 1.8162884613588716e-06, + "loss": 1.4047, + "step": 25220 + }, + { + "epoch": 0.8247776615223646, + "grad_norm": 3.357706263961545, + "learning_rate": 1.813010153751189e-06, + "loss": 1.3762, + "step": 25225 + }, + { + "epoch": 0.8249411456970965, + "grad_norm": 3.348056843302862, + "learning_rate": 1.8097345124455068e-06, + "loss": 1.484, + "step": 25230 + }, + { + "epoch": 0.8251046298718284, + "grad_norm": 3.164972300630244, + "learning_rate": 1.8064615385086249e-06, + "loss": 1.357, + "step": 25235 + }, + { + "epoch": 0.8252681140465603, + "grad_norm": 3.224711016945936, + "learning_rate": 1.803191233006466e-06, + "loss": 1.3049, + "step": 25240 + }, + { + "epoch": 0.8254315982212922, + "grad_norm": 2.8944359098230805, + "learning_rate": 1.7999235970040906e-06, + "loss": 1.3753, + "step": 25245 + }, + { + "epoch": 0.8255950823960241, + "grad_norm": 3.357282284904411, + "learning_rate": 1.7966586315656908e-06, + "loss": 1.3944, + "step": 25250 + }, + { + "epoch": 0.825758566570756, + "grad_norm": 3.2571818198066245, + "learning_rate": 1.7933963377545804e-06, + "loss": 1.5106, + "step": 25255 + }, + { + "epoch": 0.8259220507454879, + "grad_norm": 2.9251253117729994, + "learning_rate": 1.7901367166332139e-06, + "loss": 1.4135, + "step": 25260 + }, + { + "epoch": 0.8260855349202197, + "grad_norm": 3.300969568179689, + "learning_rate": 1.7868797692631657e-06, + "loss": 1.4789, + "step": 25265 + }, + { + "epoch": 0.8262490190949516, + "grad_norm": 3.1615483432737097, + "learning_rate": 1.7836254967051436e-06, + "loss": 1.3954, + "step": 25270 + }, + { + "epoch": 0.8264125032696835, + "grad_norm": 3.2370698315020254, + "learning_rate": 1.7803739000189902e-06, + "loss": 1.5175, + "step": 25275 + }, + { + "epoch": 0.8265759874444154, + "grad_norm": 3.0930161315732247, + "learning_rate": 1.7771249802636637e-06, + "loss": 1.453, + "step": 25280 + }, + { + "epoch": 0.8267394716191473, + "grad_norm": 2.9875504105239856, + "learning_rate": 1.7738787384972634e-06, + "loss": 1.2124, + "step": 25285 + }, + { + "epoch": 0.8269029557938792, + "grad_norm": 3.2053753145782813, + "learning_rate": 1.7706351757770058e-06, + "loss": 1.5786, + "step": 25290 + }, + { + "epoch": 0.8270664399686111, + "grad_norm": 3.247314355610596, + "learning_rate": 1.7673942931592426e-06, + "loss": 1.4113, + "step": 25295 + }, + { + "epoch": 0.827229924143343, + "grad_norm": 3.171596971630601, + "learning_rate": 1.7641560916994515e-06, + "loss": 1.4223, + "step": 25300 + }, + { + "epoch": 0.8273934083180748, + "grad_norm": 3.122020006006003, + "learning_rate": 1.7609205724522305e-06, + "loss": 1.3937, + "step": 25305 + }, + { + "epoch": 0.8275568924928067, + "grad_norm": 3.242009617151554, + "learning_rate": 1.7576877364713174e-06, + "loss": 1.3581, + "step": 25310 + }, + { + "epoch": 0.8277203766675386, + "grad_norm": 3.2352621587849666, + "learning_rate": 1.7544575848095568e-06, + "loss": 1.3937, + "step": 25315 + }, + { + "epoch": 0.8278838608422705, + "grad_norm": 3.263872403501958, + "learning_rate": 1.751230118518935e-06, + "loss": 1.3178, + "step": 25320 + }, + { + "epoch": 0.8280473450170024, + "grad_norm": 3.285973650499929, + "learning_rate": 1.7480053386505625e-06, + "loss": 1.3365, + "step": 25325 + }, + { + "epoch": 0.8282108291917343, + "grad_norm": 3.4526528829840504, + "learning_rate": 1.7447832462546632e-06, + "loss": 1.4405, + "step": 25330 + }, + { + "epoch": 0.8283743133664662, + "grad_norm": 3.1957019061522787, + "learning_rate": 1.7415638423806014e-06, + "loss": 1.3894, + "step": 25335 + }, + { + "epoch": 0.828537797541198, + "grad_norm": 3.209665343759604, + "learning_rate": 1.7383471280768528e-06, + "loss": 1.4132, + "step": 25340 + }, + { + "epoch": 0.8287012817159299, + "grad_norm": 3.5262938298305695, + "learning_rate": 1.7351331043910236e-06, + "loss": 1.4334, + "step": 25345 + }, + { + "epoch": 0.8288647658906618, + "grad_norm": 3.023372312264315, + "learning_rate": 1.7319217723698456e-06, + "loss": 1.4073, + "step": 25350 + }, + { + "epoch": 0.8290282500653937, + "grad_norm": 3.0434417605903272, + "learning_rate": 1.7287131330591656e-06, + "loss": 1.4193, + "step": 25355 + }, + { + "epoch": 0.8291917342401256, + "grad_norm": 3.1819868849023747, + "learning_rate": 1.7255071875039653e-06, + "loss": 1.2967, + "step": 25360 + }, + { + "epoch": 0.8293552184148575, + "grad_norm": 3.3336632837814553, + "learning_rate": 1.7223039367483353e-06, + "loss": 1.3895, + "step": 25365 + }, + { + "epoch": 0.8295187025895894, + "grad_norm": 3.0018928595515293, + "learning_rate": 1.7191033818355007e-06, + "loss": 1.4347, + "step": 25370 + }, + { + "epoch": 0.8296821867643213, + "grad_norm": 3.227514473102127, + "learning_rate": 1.715905523807805e-06, + "loss": 1.4117, + "step": 25375 + }, + { + "epoch": 0.8298456709390531, + "grad_norm": 3.052098699084616, + "learning_rate": 1.7127103637067077e-06, + "loss": 1.2951, + "step": 25380 + }, + { + "epoch": 0.830009155113785, + "grad_norm": 3.1636068496437133, + "learning_rate": 1.7095179025727982e-06, + "loss": 1.4465, + "step": 25385 + }, + { + "epoch": 0.8301726392885169, + "grad_norm": 3.7522436265786783, + "learning_rate": 1.7063281414457788e-06, + "loss": 1.4899, + "step": 25390 + }, + { + "epoch": 0.8303361234632488, + "grad_norm": 2.9823958746203356, + "learning_rate": 1.70314108136448e-06, + "loss": 1.2897, + "step": 25395 + }, + { + "epoch": 0.8304996076379807, + "grad_norm": 3.0677397508984803, + "learning_rate": 1.699956723366849e-06, + "loss": 1.3479, + "step": 25400 + }, + { + "epoch": 0.8306630918127126, + "grad_norm": 3.2854646387681874, + "learning_rate": 1.696775068489951e-06, + "loss": 1.4298, + "step": 25405 + }, + { + "epoch": 0.8308265759874444, + "grad_norm": 3.1340663063861616, + "learning_rate": 1.6935961177699766e-06, + "loss": 1.2191, + "step": 25410 + }, + { + "epoch": 0.8309900601621762, + "grad_norm": 3.1463463112875583, + "learning_rate": 1.690419872242227e-06, + "loss": 1.3706, + "step": 25415 + }, + { + "epoch": 0.8311535443369081, + "grad_norm": 3.027881381358751, + "learning_rate": 1.6872463329411303e-06, + "loss": 1.3846, + "step": 25420 + }, + { + "epoch": 0.83131702851164, + "grad_norm": 3.164693948733145, + "learning_rate": 1.684075500900233e-06, + "loss": 1.4057, + "step": 25425 + }, + { + "epoch": 0.8314805126863719, + "grad_norm": 3.037153963693109, + "learning_rate": 1.6809073771521922e-06, + "loss": 1.393, + "step": 25430 + }, + { + "epoch": 0.8316439968611038, + "grad_norm": 3.5152387308617437, + "learning_rate": 1.677741962728795e-06, + "loss": 1.4577, + "step": 25435 + }, + { + "epoch": 0.8318074810358357, + "grad_norm": 3.2582080754822065, + "learning_rate": 1.674579258660931e-06, + "loss": 1.3604, + "step": 25440 + }, + { + "epoch": 0.8319709652105676, + "grad_norm": 3.4614913255064184, + "learning_rate": 1.6714192659786232e-06, + "loss": 1.3053, + "step": 25445 + }, + { + "epoch": 0.8321344493852995, + "grad_norm": 3.074557009459645, + "learning_rate": 1.6682619857109972e-06, + "loss": 1.4074, + "step": 25450 + }, + { + "epoch": 0.8322979335600313, + "grad_norm": 3.2372542556704813, + "learning_rate": 1.6651074188863058e-06, + "loss": 1.497, + "step": 25455 + }, + { + "epoch": 0.8324614177347632, + "grad_norm": 3.0197625449875334, + "learning_rate": 1.6619555665319154e-06, + "loss": 1.3744, + "step": 25460 + }, + { + "epoch": 0.8326249019094951, + "grad_norm": 3.420510971109518, + "learning_rate": 1.6588064296743022e-06, + "loss": 1.4135, + "step": 25465 + }, + { + "epoch": 0.832788386084227, + "grad_norm": 3.124082119684191, + "learning_rate": 1.6556600093390685e-06, + "loss": 1.3408, + "step": 25470 + }, + { + "epoch": 0.8329518702589589, + "grad_norm": 3.057276990023619, + "learning_rate": 1.6525163065509197e-06, + "loss": 1.4092, + "step": 25475 + }, + { + "epoch": 0.8331153544336908, + "grad_norm": 3.2432839447504116, + "learning_rate": 1.6493753223336883e-06, + "loss": 1.3666, + "step": 25480 + }, + { + "epoch": 0.8332788386084227, + "grad_norm": 3.0137040833801145, + "learning_rate": 1.646237057710316e-06, + "loss": 1.4636, + "step": 25485 + }, + { + "epoch": 0.8334423227831546, + "grad_norm": 3.1999359318726865, + "learning_rate": 1.6431015137028538e-06, + "loss": 1.4724, + "step": 25490 + }, + { + "epoch": 0.8336058069578864, + "grad_norm": 3.4083001479360964, + "learning_rate": 1.6399686913324776e-06, + "loss": 1.373, + "step": 25495 + }, + { + "epoch": 0.8337692911326183, + "grad_norm": 3.2617287391472938, + "learning_rate": 1.6368385916194651e-06, + "loss": 1.3796, + "step": 25500 + }, + { + "epoch": 0.8339327753073502, + "grad_norm": 3.2740180587502286, + "learning_rate": 1.633711215583217e-06, + "loss": 1.4321, + "step": 25505 + }, + { + "epoch": 0.8340962594820821, + "grad_norm": 3.494733898214945, + "learning_rate": 1.6305865642422424e-06, + "loss": 1.3459, + "step": 25510 + }, + { + "epoch": 0.834259743656814, + "grad_norm": 3.6224847451665703, + "learning_rate": 1.6274646386141647e-06, + "loss": 1.4674, + "step": 25515 + }, + { + "epoch": 0.8344232278315459, + "grad_norm": 3.2726025875483584, + "learning_rate": 1.6243454397157143e-06, + "loss": 1.4297, + "step": 25520 + }, + { + "epoch": 0.8345867120062778, + "grad_norm": 3.250525365764784, + "learning_rate": 1.6212289685627436e-06, + "loss": 1.4559, + "step": 25525 + }, + { + "epoch": 0.8347501961810097, + "grad_norm": 3.211434947641507, + "learning_rate": 1.618115226170205e-06, + "loss": 1.4521, + "step": 25530 + }, + { + "epoch": 0.8349136803557415, + "grad_norm": 2.988087878356549, + "learning_rate": 1.6150042135521737e-06, + "loss": 1.3417, + "step": 25535 + }, + { + "epoch": 0.8350771645304734, + "grad_norm": 3.196893547791209, + "learning_rate": 1.611895931721824e-06, + "loss": 1.4023, + "step": 25540 + }, + { + "epoch": 0.8352406487052053, + "grad_norm": 3.548493127959279, + "learning_rate": 1.608790381691452e-06, + "loss": 1.4017, + "step": 25545 + }, + { + "epoch": 0.8354041328799372, + "grad_norm": 3.2185979747149127, + "learning_rate": 1.60568756447246e-06, + "loss": 1.3383, + "step": 25550 + }, + { + "epoch": 0.8355676170546691, + "grad_norm": 3.4979693642994376, + "learning_rate": 1.6025874810753562e-06, + "loss": 1.3439, + "step": 25555 + }, + { + "epoch": 0.835731101229401, + "grad_norm": 3.342723921669084, + "learning_rate": 1.5994901325097644e-06, + "loss": 1.503, + "step": 25560 + }, + { + "epoch": 0.8358945854041329, + "grad_norm": 3.39626379765256, + "learning_rate": 1.5963955197844116e-06, + "loss": 1.4068, + "step": 25565 + }, + { + "epoch": 0.8360580695788647, + "grad_norm": 3.1746687463144707, + "learning_rate": 1.5933036439071404e-06, + "loss": 1.4413, + "step": 25570 + }, + { + "epoch": 0.8362215537535966, + "grad_norm": 3.410535140683863, + "learning_rate": 1.5902145058849006e-06, + "loss": 1.4099, + "step": 25575 + }, + { + "epoch": 0.8363850379283285, + "grad_norm": 3.4014015931596138, + "learning_rate": 1.5871281067237432e-06, + "loss": 1.4212, + "step": 25580 + }, + { + "epoch": 0.8365485221030604, + "grad_norm": 3.1617030077536072, + "learning_rate": 1.5840444474288396e-06, + "loss": 1.3554, + "step": 25585 + }, + { + "epoch": 0.8367120062777923, + "grad_norm": 3.4027869751975994, + "learning_rate": 1.5809635290044555e-06, + "loss": 1.4837, + "step": 25590 + }, + { + "epoch": 0.8368754904525242, + "grad_norm": 3.385186767178006, + "learning_rate": 1.5778853524539762e-06, + "loss": 1.4548, + "step": 25595 + }, + { + "epoch": 0.8370389746272561, + "grad_norm": 3.1662611264713543, + "learning_rate": 1.5748099187798826e-06, + "loss": 1.3721, + "step": 25600 + }, + { + "epoch": 0.837202458801988, + "grad_norm": 3.048067888121385, + "learning_rate": 1.5717372289837719e-06, + "loss": 1.305, + "step": 25605 + }, + { + "epoch": 0.8373659429767198, + "grad_norm": 3.322457048692188, + "learning_rate": 1.5686672840663454e-06, + "loss": 1.3744, + "step": 25610 + }, + { + "epoch": 0.8375294271514517, + "grad_norm": 3.167543136262108, + "learning_rate": 1.5656000850274044e-06, + "loss": 1.3628, + "step": 25615 + }, + { + "epoch": 0.8376929113261836, + "grad_norm": 3.257722430032996, + "learning_rate": 1.5625356328658658e-06, + "loss": 1.4609, + "step": 25620 + }, + { + "epoch": 0.8378563955009155, + "grad_norm": 3.2815659845458685, + "learning_rate": 1.55947392857974e-06, + "loss": 1.3813, + "step": 25625 + }, + { + "epoch": 0.8380198796756474, + "grad_norm": 3.329132268999937, + "learning_rate": 1.5564149731661538e-06, + "loss": 1.4374, + "step": 25630 + }, + { + "epoch": 0.8381833638503793, + "grad_norm": 3.272187829313214, + "learning_rate": 1.5533587676213347e-06, + "loss": 1.4618, + "step": 25635 + }, + { + "epoch": 0.8383468480251112, + "grad_norm": 3.217092648621602, + "learning_rate": 1.550305312940611e-06, + "loss": 1.3259, + "step": 25640 + }, + { + "epoch": 0.8385103321998431, + "grad_norm": 3.145274129382854, + "learning_rate": 1.5472546101184206e-06, + "loss": 1.514, + "step": 25645 + }, + { + "epoch": 0.8386738163745749, + "grad_norm": 3.3048640184163705, + "learning_rate": 1.5442066601482985e-06, + "loss": 1.3742, + "step": 25650 + }, + { + "epoch": 0.8388373005493068, + "grad_norm": 3.229433332941046, + "learning_rate": 1.5411614640228912e-06, + "loss": 1.3872, + "step": 25655 + }, + { + "epoch": 0.8390007847240387, + "grad_norm": 3.325518565821298, + "learning_rate": 1.5381190227339448e-06, + "loss": 1.3731, + "step": 25660 + }, + { + "epoch": 0.8391642688987706, + "grad_norm": 3.271126861711878, + "learning_rate": 1.535079337272305e-06, + "loss": 1.4158, + "step": 25665 + }, + { + "epoch": 0.8393277530735025, + "grad_norm": 3.272327780306728, + "learning_rate": 1.5320424086279263e-06, + "loss": 1.3458, + "step": 25670 + }, + { + "epoch": 0.8394912372482344, + "grad_norm": 3.2253550601025602, + "learning_rate": 1.5290082377898585e-06, + "loss": 1.3343, + "step": 25675 + }, + { + "epoch": 0.8396547214229663, + "grad_norm": 3.455706831353958, + "learning_rate": 1.525976825746257e-06, + "loss": 1.4532, + "step": 25680 + }, + { + "epoch": 0.8398182055976982, + "grad_norm": 3.141868400349303, + "learning_rate": 1.5229481734843832e-06, + "loss": 1.3852, + "step": 25685 + }, + { + "epoch": 0.83998168977243, + "grad_norm": 3.701988188216934, + "learning_rate": 1.519922281990588e-06, + "loss": 1.5404, + "step": 25690 + }, + { + "epoch": 0.8401451739471619, + "grad_norm": 3.214652762199681, + "learning_rate": 1.5168991522503363e-06, + "loss": 1.403, + "step": 25695 + }, + { + "epoch": 0.8403086581218938, + "grad_norm": 3.1854353358997503, + "learning_rate": 1.5138787852481828e-06, + "loss": 1.381, + "step": 25700 + }, + { + "epoch": 0.8404721422966257, + "grad_norm": 3.361009682885652, + "learning_rate": 1.510861181967791e-06, + "loss": 1.3398, + "step": 25705 + }, + { + "epoch": 0.8406356264713576, + "grad_norm": 3.228957126545961, + "learning_rate": 1.5078463433919154e-06, + "loss": 1.4261, + "step": 25710 + }, + { + "epoch": 0.8407991106460895, + "grad_norm": 3.2127795499121725, + "learning_rate": 1.5048342705024178e-06, + "loss": 1.3527, + "step": 25715 + }, + { + "epoch": 0.8409625948208214, + "grad_norm": 3.11942815169524, + "learning_rate": 1.5018249642802596e-06, + "loss": 1.3452, + "step": 25720 + }, + { + "epoch": 0.8411260789955533, + "grad_norm": 3.3818796310054924, + "learning_rate": 1.498818425705495e-06, + "loss": 1.4789, + "step": 25725 + }, + { + "epoch": 0.8412895631702851, + "grad_norm": 3.160131295649916, + "learning_rate": 1.4958146557572772e-06, + "loss": 1.3041, + "step": 25730 + }, + { + "epoch": 0.841453047345017, + "grad_norm": 3.202191150119686, + "learning_rate": 1.4928136554138662e-06, + "loss": 1.5398, + "step": 25735 + }, + { + "epoch": 0.8416165315197489, + "grad_norm": 3.2697714059362073, + "learning_rate": 1.4898154256526087e-06, + "loss": 1.4149, + "step": 25740 + }, + { + "epoch": 0.8417800156944808, + "grad_norm": 2.8562120975591516, + "learning_rate": 1.4868199674499596e-06, + "loss": 1.376, + "step": 25745 + }, + { + "epoch": 0.8419434998692127, + "grad_norm": 3.495480180978293, + "learning_rate": 1.4838272817814614e-06, + "loss": 1.4751, + "step": 25750 + }, + { + "epoch": 0.8421069840439446, + "grad_norm": 3.3224897864587883, + "learning_rate": 1.4808373696217626e-06, + "loss": 1.3866, + "step": 25755 + }, + { + "epoch": 0.8422704682186765, + "grad_norm": 3.0129224948119093, + "learning_rate": 1.4778502319446042e-06, + "loss": 1.319, + "step": 25760 + }, + { + "epoch": 0.8424339523934083, + "grad_norm": 3.1014407670591475, + "learning_rate": 1.4748658697228202e-06, + "loss": 1.3399, + "step": 25765 + }, + { + "epoch": 0.8425974365681402, + "grad_norm": 3.5917328173975775, + "learning_rate": 1.4718842839283486e-06, + "loss": 1.5684, + "step": 25770 + }, + { + "epoch": 0.8427609207428721, + "grad_norm": 3.2614447766625716, + "learning_rate": 1.468905475532214e-06, + "loss": 1.3494, + "step": 25775 + }, + { + "epoch": 0.842924404917604, + "grad_norm": 3.37264433949869, + "learning_rate": 1.4659294455045447e-06, + "loss": 1.5702, + "step": 25780 + }, + { + "epoch": 0.8430878890923359, + "grad_norm": 3.1049115772825115, + "learning_rate": 1.462956194814562e-06, + "loss": 1.3958, + "step": 25785 + }, + { + "epoch": 0.8432513732670678, + "grad_norm": 3.260259009549709, + "learning_rate": 1.459985724430577e-06, + "loss": 1.536, + "step": 25790 + }, + { + "epoch": 0.8434148574417997, + "grad_norm": 3.2639258569573477, + "learning_rate": 1.4570180353200036e-06, + "loss": 1.413, + "step": 25795 + }, + { + "epoch": 0.8435783416165316, + "grad_norm": 3.33886600192753, + "learning_rate": 1.4540531284493397e-06, + "loss": 1.3247, + "step": 25800 + }, + { + "epoch": 0.8437418257912634, + "grad_norm": 3.366869163571299, + "learning_rate": 1.4510910047841864e-06, + "loss": 1.3889, + "step": 25805 + }, + { + "epoch": 0.8439053099659953, + "grad_norm": 2.9812052899737216, + "learning_rate": 1.4481316652892363e-06, + "loss": 1.3728, + "step": 25810 + }, + { + "epoch": 0.8440687941407272, + "grad_norm": 3.6664598349861315, + "learning_rate": 1.4451751109282707e-06, + "loss": 1.3717, + "step": 25815 + }, + { + "epoch": 0.8442322783154591, + "grad_norm": 3.488910947158186, + "learning_rate": 1.4422213426641696e-06, + "loss": 1.3785, + "step": 25820 + }, + { + "epoch": 0.844395762490191, + "grad_norm": 3.022659828557933, + "learning_rate": 1.4392703614588988e-06, + "loss": 1.3469, + "step": 25825 + }, + { + "epoch": 0.8445592466649229, + "grad_norm": 2.7418198914842535, + "learning_rate": 1.4363221682735229e-06, + "loss": 1.4011, + "step": 25830 + }, + { + "epoch": 0.8447227308396548, + "grad_norm": 3.0984647327897896, + "learning_rate": 1.4333767640681994e-06, + "loss": 1.3217, + "step": 25835 + }, + { + "epoch": 0.8448862150143867, + "grad_norm": 3.15300257983403, + "learning_rate": 1.430434149802169e-06, + "loss": 1.4148, + "step": 25840 + }, + { + "epoch": 0.8450496991891185, + "grad_norm": 3.1752732699023984, + "learning_rate": 1.4274943264337737e-06, + "loss": 1.34, + "step": 25845 + }, + { + "epoch": 0.8452131833638504, + "grad_norm": 3.1734801646597703, + "learning_rate": 1.4245572949204379e-06, + "loss": 1.44, + "step": 25850 + }, + { + "epoch": 0.8453766675385823, + "grad_norm": 3.4780979273368158, + "learning_rate": 1.4216230562186839e-06, + "loss": 1.4635, + "step": 25855 + }, + { + "epoch": 0.8455401517133142, + "grad_norm": 3.1231362969356113, + "learning_rate": 1.4186916112841186e-06, + "loss": 1.338, + "step": 25860 + }, + { + "epoch": 0.8457036358880461, + "grad_norm": 3.321100216215149, + "learning_rate": 1.4157629610714428e-06, + "loss": 1.4766, + "step": 25865 + }, + { + "epoch": 0.845867120062778, + "grad_norm": 3.0465507049969416, + "learning_rate": 1.4128371065344481e-06, + "loss": 1.2216, + "step": 25870 + }, + { + "epoch": 0.8460306042375099, + "grad_norm": 3.0127641473655964, + "learning_rate": 1.409914048626011e-06, + "loss": 1.2938, + "step": 25875 + }, + { + "epoch": 0.8461940884122416, + "grad_norm": 3.2976826051811505, + "learning_rate": 1.4069937882981022e-06, + "loss": 1.5436, + "step": 25880 + }, + { + "epoch": 0.8463575725869735, + "grad_norm": 3.027800150919116, + "learning_rate": 1.4040763265017765e-06, + "loss": 1.4159, + "step": 25885 + }, + { + "epoch": 0.8465210567617054, + "grad_norm": 3.250032126195255, + "learning_rate": 1.4011616641871795e-06, + "loss": 1.2869, + "step": 25890 + }, + { + "epoch": 0.8466845409364373, + "grad_norm": 3.1971229050095595, + "learning_rate": 1.3982498023035495e-06, + "loss": 1.2924, + "step": 25895 + }, + { + "epoch": 0.8468480251111692, + "grad_norm": 3.214920243727794, + "learning_rate": 1.3953407417992038e-06, + "loss": 1.3257, + "step": 25900 + }, + { + "epoch": 0.8470115092859011, + "grad_norm": 3.2443461172011907, + "learning_rate": 1.3924344836215564e-06, + "loss": 1.3639, + "step": 25905 + }, + { + "epoch": 0.847174993460633, + "grad_norm": 3.320677731903728, + "learning_rate": 1.3895310287170993e-06, + "loss": 1.4062, + "step": 25910 + }, + { + "epoch": 0.8473384776353649, + "grad_norm": 3.4026436376168285, + "learning_rate": 1.3866303780314195e-06, + "loss": 1.4844, + "step": 25915 + }, + { + "epoch": 0.8475019618100967, + "grad_norm": 3.6186460842181143, + "learning_rate": 1.3837325325091899e-06, + "loss": 1.4258, + "step": 25920 + }, + { + "epoch": 0.8476654459848286, + "grad_norm": 3.586465125449427, + "learning_rate": 1.3808374930941637e-06, + "loss": 1.6103, + "step": 25925 + }, + { + "epoch": 0.8478289301595605, + "grad_norm": 3.2080362854989914, + "learning_rate": 1.3779452607291887e-06, + "loss": 1.2848, + "step": 25930 + }, + { + "epoch": 0.8479924143342924, + "grad_norm": 3.165945152490533, + "learning_rate": 1.3750558363561916e-06, + "loss": 1.4557, + "step": 25935 + }, + { + "epoch": 0.8481558985090243, + "grad_norm": 3.3222076805766725, + "learning_rate": 1.3721692209161842e-06, + "loss": 1.4534, + "step": 25940 + }, + { + "epoch": 0.8483193826837562, + "grad_norm": 3.2366887173662326, + "learning_rate": 1.3692854153492729e-06, + "loss": 1.2707, + "step": 25945 + }, + { + "epoch": 0.8484828668584881, + "grad_norm": 3.2045830119104877, + "learning_rate": 1.3664044205946359e-06, + "loss": 1.3615, + "step": 25950 + }, + { + "epoch": 0.84864635103322, + "grad_norm": 3.3002460979207755, + "learning_rate": 1.3635262375905456e-06, + "loss": 1.316, + "step": 25955 + }, + { + "epoch": 0.8488098352079518, + "grad_norm": 3.335507082785736, + "learning_rate": 1.360650867274358e-06, + "loss": 1.4441, + "step": 25960 + }, + { + "epoch": 0.8489733193826837, + "grad_norm": 3.2374095301955337, + "learning_rate": 1.3577783105825071e-06, + "loss": 1.3966, + "step": 25965 + }, + { + "epoch": 0.8491368035574156, + "grad_norm": 3.3589112545882824, + "learning_rate": 1.354908568450517e-06, + "loss": 1.5278, + "step": 25970 + }, + { + "epoch": 0.8493002877321475, + "grad_norm": 3.3615112403885927, + "learning_rate": 1.3520416418129889e-06, + "loss": 1.4916, + "step": 25975 + }, + { + "epoch": 0.8494637719068794, + "grad_norm": 3.1033319871335294, + "learning_rate": 1.349177531603616e-06, + "loss": 1.2788, + "step": 25980 + }, + { + "epoch": 0.8496272560816113, + "grad_norm": 3.2538944511266763, + "learning_rate": 1.346316238755162e-06, + "loss": 1.3507, + "step": 25985 + }, + { + "epoch": 0.8497907402563432, + "grad_norm": 2.773683859812438, + "learning_rate": 1.343457764199485e-06, + "loss": 1.2652, + "step": 25990 + }, + { + "epoch": 0.849954224431075, + "grad_norm": 3.4325325716607167, + "learning_rate": 1.3406021088675203e-06, + "loss": 1.4436, + "step": 25995 + }, + { + "epoch": 0.8501177086058069, + "grad_norm": 3.0879678453318986, + "learning_rate": 1.3377492736892805e-06, + "loss": 1.2829, + "step": 26000 + }, + { + "epoch": 0.8502811927805388, + "grad_norm": 3.0198871024514293, + "learning_rate": 1.3348992595938693e-06, + "loss": 1.3717, + "step": 26005 + }, + { + "epoch": 0.8504446769552707, + "grad_norm": 3.2795575374920793, + "learning_rate": 1.3320520675094607e-06, + "loss": 1.3787, + "step": 26010 + }, + { + "epoch": 0.8506081611300026, + "grad_norm": 3.574725975609397, + "learning_rate": 1.3292076983633196e-06, + "loss": 1.4639, + "step": 26015 + }, + { + "epoch": 0.8507716453047345, + "grad_norm": 3.2987352010626907, + "learning_rate": 1.3263661530817885e-06, + "loss": 1.3521, + "step": 26020 + }, + { + "epoch": 0.8509351294794664, + "grad_norm": 3.3981552577413257, + "learning_rate": 1.3235274325902847e-06, + "loss": 1.4868, + "step": 26025 + }, + { + "epoch": 0.8510986136541983, + "grad_norm": 3.299921649634068, + "learning_rate": 1.3206915378133145e-06, + "loss": 1.4681, + "step": 26030 + }, + { + "epoch": 0.8512620978289301, + "grad_norm": 3.250585769772484, + "learning_rate": 1.3178584696744556e-06, + "loss": 1.3777, + "step": 26035 + }, + { + "epoch": 0.851425582003662, + "grad_norm": 3.0925002178183165, + "learning_rate": 1.315028229096369e-06, + "loss": 1.4612, + "step": 26040 + }, + { + "epoch": 0.8515890661783939, + "grad_norm": 3.4179210054460536, + "learning_rate": 1.3122008170007983e-06, + "loss": 1.5845, + "step": 26045 + }, + { + "epoch": 0.8517525503531258, + "grad_norm": 3.1469103191311603, + "learning_rate": 1.3093762343085592e-06, + "loss": 1.5168, + "step": 26050 + }, + { + "epoch": 0.8519160345278577, + "grad_norm": 3.3157046976122166, + "learning_rate": 1.3065544819395504e-06, + "loss": 1.4248, + "step": 26055 + }, + { + "epoch": 0.8520795187025896, + "grad_norm": 3.2401350359514267, + "learning_rate": 1.3037355608127456e-06, + "loss": 1.4912, + "step": 26060 + }, + { + "epoch": 0.8522430028773215, + "grad_norm": 3.3175447372823665, + "learning_rate": 1.3009194718462004e-06, + "loss": 1.4693, + "step": 26065 + }, + { + "epoch": 0.8524064870520534, + "grad_norm": 3.3024340993881043, + "learning_rate": 1.2981062159570468e-06, + "loss": 1.461, + "step": 26070 + }, + { + "epoch": 0.8525699712267852, + "grad_norm": 3.400975153942639, + "learning_rate": 1.2952957940614896e-06, + "loss": 1.4308, + "step": 26075 + }, + { + "epoch": 0.8527334554015171, + "grad_norm": 3.0574910909687767, + "learning_rate": 1.292488207074819e-06, + "loss": 1.5041, + "step": 26080 + }, + { + "epoch": 0.852896939576249, + "grad_norm": 3.186692608064967, + "learning_rate": 1.2896834559113936e-06, + "loss": 1.4732, + "step": 26085 + }, + { + "epoch": 0.8530604237509809, + "grad_norm": 3.2492628519126487, + "learning_rate": 1.2868815414846525e-06, + "loss": 1.3946, + "step": 26090 + }, + { + "epoch": 0.8532239079257128, + "grad_norm": 3.2557944930535982, + "learning_rate": 1.2840824647071137e-06, + "loss": 1.2537, + "step": 26095 + }, + { + "epoch": 0.8533873921004447, + "grad_norm": 3.0565045371466266, + "learning_rate": 1.2812862264903636e-06, + "loss": 1.2999, + "step": 26100 + }, + { + "epoch": 0.8535508762751766, + "grad_norm": 2.9896921761637567, + "learning_rate": 1.2784928277450737e-06, + "loss": 1.4762, + "step": 26105 + }, + { + "epoch": 0.8537143604499084, + "grad_norm": 2.9088873312043626, + "learning_rate": 1.2757022693809795e-06, + "loss": 1.2848, + "step": 26110 + }, + { + "epoch": 0.8538778446246403, + "grad_norm": 3.3364849722834706, + "learning_rate": 1.272914552306903e-06, + "loss": 1.4658, + "step": 26115 + }, + { + "epoch": 0.8540413287993722, + "grad_norm": 3.1835441726695346, + "learning_rate": 1.2701296774307303e-06, + "loss": 1.2983, + "step": 26120 + }, + { + "epoch": 0.8542048129741041, + "grad_norm": 3.3427015732235246, + "learning_rate": 1.2673476456594292e-06, + "loss": 1.4164, + "step": 26125 + }, + { + "epoch": 0.854368297148836, + "grad_norm": 3.109156603530514, + "learning_rate": 1.2645684578990424e-06, + "loss": 1.4264, + "step": 26130 + }, + { + "epoch": 0.8545317813235679, + "grad_norm": 3.0612938091475947, + "learning_rate": 1.2617921150546796e-06, + "loss": 1.3716, + "step": 26135 + }, + { + "epoch": 0.8546952654982998, + "grad_norm": 3.4529360933484026, + "learning_rate": 1.2590186180305263e-06, + "loss": 1.4011, + "step": 26140 + }, + { + "epoch": 0.8548587496730317, + "grad_norm": 3.2652100100721917, + "learning_rate": 1.2562479677298478e-06, + "loss": 1.4316, + "step": 26145 + }, + { + "epoch": 0.8550222338477635, + "grad_norm": 3.282926395277342, + "learning_rate": 1.253480165054971e-06, + "loss": 1.4428, + "step": 26150 + }, + { + "epoch": 0.8551857180224954, + "grad_norm": 3.348764735119112, + "learning_rate": 1.2507152109073062e-06, + "loss": 1.4142, + "step": 26155 + }, + { + "epoch": 0.8553492021972273, + "grad_norm": 3.1207820628082312, + "learning_rate": 1.2479531061873274e-06, + "loss": 1.3564, + "step": 26160 + }, + { + "epoch": 0.8555126863719592, + "grad_norm": 3.081786257571568, + "learning_rate": 1.2451938517945861e-06, + "loss": 1.4144, + "step": 26165 + }, + { + "epoch": 0.8556761705466911, + "grad_norm": 3.340925588735302, + "learning_rate": 1.242437448627707e-06, + "loss": 1.3019, + "step": 26170 + }, + { + "epoch": 0.855839654721423, + "grad_norm": 3.1795676836053337, + "learning_rate": 1.2396838975843772e-06, + "loss": 1.4837, + "step": 26175 + }, + { + "epoch": 0.8560031388961549, + "grad_norm": 3.2045343347424398, + "learning_rate": 1.2369331995613664e-06, + "loss": 1.2527, + "step": 26180 + }, + { + "epoch": 0.8561666230708868, + "grad_norm": 2.9213520308681624, + "learning_rate": 1.2341853554545036e-06, + "loss": 1.3193, + "step": 26185 + }, + { + "epoch": 0.8563301072456186, + "grad_norm": 3.247508778120075, + "learning_rate": 1.2314403661586993e-06, + "loss": 1.4512, + "step": 26190 + }, + { + "epoch": 0.8564935914203505, + "grad_norm": 3.4332604860613607, + "learning_rate": 1.2286982325679287e-06, + "loss": 1.3861, + "step": 26195 + }, + { + "epoch": 0.8566570755950824, + "grad_norm": 3.28969636310085, + "learning_rate": 1.225958955575235e-06, + "loss": 1.412, + "step": 26200 + }, + { + "epoch": 0.8568205597698143, + "grad_norm": 3.3491372101057597, + "learning_rate": 1.2232225360727356e-06, + "loss": 1.4004, + "step": 26205 + }, + { + "epoch": 0.8569840439445462, + "grad_norm": 3.232918581054415, + "learning_rate": 1.2204889749516136e-06, + "loss": 1.4568, + "step": 26210 + }, + { + "epoch": 0.8571475281192781, + "grad_norm": 3.2892899986746755, + "learning_rate": 1.2177582731021242e-06, + "loss": 1.3231, + "step": 26215 + }, + { + "epoch": 0.85731101229401, + "grad_norm": 3.1915162496031724, + "learning_rate": 1.215030431413592e-06, + "loss": 1.4828, + "step": 26220 + }, + { + "epoch": 0.8574744964687419, + "grad_norm": 3.2171719762975117, + "learning_rate": 1.2123054507744036e-06, + "loss": 1.4475, + "step": 26225 + }, + { + "epoch": 0.8576379806434737, + "grad_norm": 3.3608935898589527, + "learning_rate": 1.209583332072023e-06, + "loss": 1.4198, + "step": 26230 + }, + { + "epoch": 0.8578014648182056, + "grad_norm": 3.3113643185115933, + "learning_rate": 1.206864076192973e-06, + "loss": 1.3847, + "step": 26235 + }, + { + "epoch": 0.8579649489929375, + "grad_norm": 3.4654446004984507, + "learning_rate": 1.2041476840228538e-06, + "loss": 1.3792, + "step": 26240 + }, + { + "epoch": 0.8581284331676694, + "grad_norm": 3.0675483498772143, + "learning_rate": 1.2014341564463227e-06, + "loss": 1.4079, + "step": 26245 + }, + { + "epoch": 0.8582919173424013, + "grad_norm": 3.137001334336431, + "learning_rate": 1.1987234943471115e-06, + "loss": 1.3222, + "step": 26250 + }, + { + "epoch": 0.8584554015171332, + "grad_norm": 3.539482151293414, + "learning_rate": 1.1960156986080185e-06, + "loss": 1.4527, + "step": 26255 + }, + { + "epoch": 0.8586188856918651, + "grad_norm": 3.034672986033231, + "learning_rate": 1.1933107701109026e-06, + "loss": 1.4033, + "step": 26260 + }, + { + "epoch": 0.858782369866597, + "grad_norm": 3.2364828921054367, + "learning_rate": 1.190608709736696e-06, + "loss": 1.416, + "step": 26265 + }, + { + "epoch": 0.8589458540413288, + "grad_norm": 3.4310385146305067, + "learning_rate": 1.18790951836539e-06, + "loss": 1.4766, + "step": 26270 + }, + { + "epoch": 0.8591093382160607, + "grad_norm": 3.2849047553527835, + "learning_rate": 1.1852131968760471e-06, + "loss": 1.4754, + "step": 26275 + }, + { + "epoch": 0.8592728223907926, + "grad_norm": 3.5072354295165415, + "learning_rate": 1.1825197461467952e-06, + "loss": 1.3265, + "step": 26280 + }, + { + "epoch": 0.8594363065655245, + "grad_norm": 3.250648129147009, + "learning_rate": 1.1798291670548222e-06, + "loss": 1.3442, + "step": 26285 + }, + { + "epoch": 0.8595997907402564, + "grad_norm": 3.320105708949314, + "learning_rate": 1.1771414604763853e-06, + "loss": 1.425, + "step": 26290 + }, + { + "epoch": 0.8597632749149883, + "grad_norm": 3.1366714397313253, + "learning_rate": 1.174456627286803e-06, + "loss": 1.4095, + "step": 26295 + }, + { + "epoch": 0.8599267590897202, + "grad_norm": 3.253097079466394, + "learning_rate": 1.1717746683604603e-06, + "loss": 1.4157, + "step": 26300 + }, + { + "epoch": 0.860090243264452, + "grad_norm": 3.247008821912276, + "learning_rate": 1.1690955845708085e-06, + "loss": 1.3068, + "step": 26305 + }, + { + "epoch": 0.8602537274391839, + "grad_norm": 3.5223173057106725, + "learning_rate": 1.1664193767903554e-06, + "loss": 1.592, + "step": 26310 + }, + { + "epoch": 0.8604172116139158, + "grad_norm": 3.0440976806832096, + "learning_rate": 1.1637460458906802e-06, + "loss": 1.4087, + "step": 26315 + }, + { + "epoch": 0.8605806957886477, + "grad_norm": 3.1185642015584274, + "learning_rate": 1.1610755927424167e-06, + "loss": 1.3382, + "step": 26320 + }, + { + "epoch": 0.8607441799633796, + "grad_norm": 3.2932869076775453, + "learning_rate": 1.158408018215268e-06, + "loss": 1.3202, + "step": 26325 + }, + { + "epoch": 0.8609076641381115, + "grad_norm": 3.2718363410301916, + "learning_rate": 1.1557433231780003e-06, + "loss": 1.3299, + "step": 26330 + }, + { + "epoch": 0.8610711483128434, + "grad_norm": 3.2793607907231377, + "learning_rate": 1.153081508498436e-06, + "loss": 1.3595, + "step": 26335 + }, + { + "epoch": 0.8612346324875753, + "grad_norm": 3.1372740006977686, + "learning_rate": 1.1504225750434662e-06, + "loss": 1.409, + "step": 26340 + }, + { + "epoch": 0.861398116662307, + "grad_norm": 3.2017587604829383, + "learning_rate": 1.147766523679038e-06, + "loss": 1.4322, + "step": 26345 + }, + { + "epoch": 0.8615616008370389, + "grad_norm": 3.0354540962949086, + "learning_rate": 1.145113355270161e-06, + "loss": 1.4713, + "step": 26350 + }, + { + "epoch": 0.8617250850117708, + "grad_norm": 3.3229208315207956, + "learning_rate": 1.1424630706809102e-06, + "loss": 1.3909, + "step": 26355 + }, + { + "epoch": 0.8618885691865027, + "grad_norm": 3.4044143597222654, + "learning_rate": 1.1398156707744168e-06, + "loss": 1.3937, + "step": 26360 + }, + { + "epoch": 0.8620520533612346, + "grad_norm": 3.2549434195739457, + "learning_rate": 1.137171156412873e-06, + "loss": 1.3907, + "step": 26365 + }, + { + "epoch": 0.8622155375359665, + "grad_norm": 3.3007656932482745, + "learning_rate": 1.1345295284575364e-06, + "loss": 1.4633, + "step": 26370 + }, + { + "epoch": 0.8623790217106984, + "grad_norm": 3.3174763345059994, + "learning_rate": 1.1318907877687146e-06, + "loss": 1.337, + "step": 26375 + }, + { + "epoch": 0.8625425058854302, + "grad_norm": 3.0608952884692666, + "learning_rate": 1.129254935205787e-06, + "loss": 1.3283, + "step": 26380 + }, + { + "epoch": 0.8627059900601621, + "grad_norm": 3.498600954958598, + "learning_rate": 1.1266219716271808e-06, + "loss": 1.4745, + "step": 26385 + }, + { + "epoch": 0.862869474234894, + "grad_norm": 3.3891425403828825, + "learning_rate": 1.1239918978903929e-06, + "loss": 1.4583, + "step": 26390 + }, + { + "epoch": 0.8630329584096259, + "grad_norm": 3.2258912792935974, + "learning_rate": 1.121364714851968e-06, + "loss": 1.3716, + "step": 26395 + }, + { + "epoch": 0.8631964425843578, + "grad_norm": 3.2167982491312297, + "learning_rate": 1.118740423367518e-06, + "loss": 1.4246, + "step": 26400 + }, + { + "epoch": 0.8633599267590897, + "grad_norm": 3.159433985424205, + "learning_rate": 1.116119024291714e-06, + "loss": 1.3412, + "step": 26405 + }, + { + "epoch": 0.8635234109338216, + "grad_norm": 2.972658749061181, + "learning_rate": 1.1135005184782754e-06, + "loss": 1.3106, + "step": 26410 + }, + { + "epoch": 0.8636868951085535, + "grad_norm": 3.045640530730883, + "learning_rate": 1.1108849067799898e-06, + "loss": 1.3012, + "step": 26415 + }, + { + "epoch": 0.8638503792832853, + "grad_norm": 3.0248104395225925, + "learning_rate": 1.1082721900486948e-06, + "loss": 1.3236, + "step": 26420 + }, + { + "epoch": 0.8640138634580172, + "grad_norm": 3.2745680340304983, + "learning_rate": 1.1056623691352896e-06, + "loss": 1.4293, + "step": 26425 + }, + { + "epoch": 0.8641773476327491, + "grad_norm": 3.320247253372017, + "learning_rate": 1.1030554448897302e-06, + "loss": 1.5139, + "step": 26430 + }, + { + "epoch": 0.864340831807481, + "grad_norm": 3.0274905862228985, + "learning_rate": 1.1004514181610248e-06, + "loss": 1.3226, + "step": 26435 + }, + { + "epoch": 0.8645043159822129, + "grad_norm": 3.2811385428127595, + "learning_rate": 1.0978502897972453e-06, + "loss": 1.3444, + "step": 26440 + }, + { + "epoch": 0.8646678001569448, + "grad_norm": 3.320367333798463, + "learning_rate": 1.0952520606455108e-06, + "loss": 1.377, + "step": 26445 + }, + { + "epoch": 0.8648312843316767, + "grad_norm": 3.0667961053544595, + "learning_rate": 1.092656731552003e-06, + "loss": 1.4285, + "step": 26450 + }, + { + "epoch": 0.8649947685064086, + "grad_norm": 3.2335716644080517, + "learning_rate": 1.0900643033619596e-06, + "loss": 1.4058, + "step": 26455 + }, + { + "epoch": 0.8651582526811404, + "grad_norm": 3.096506866782771, + "learning_rate": 1.087474776919667e-06, + "loss": 1.4653, + "step": 26460 + }, + { + "epoch": 0.8653217368558723, + "grad_norm": 2.80646945000673, + "learning_rate": 1.084888153068473e-06, + "loss": 1.3996, + "step": 26465 + }, + { + "epoch": 0.8654852210306042, + "grad_norm": 3.2823785413675073, + "learning_rate": 1.0823044326507758e-06, + "loss": 1.3038, + "step": 26470 + }, + { + "epoch": 0.8656487052053361, + "grad_norm": 3.059390516277784, + "learning_rate": 1.0797236165080306e-06, + "loss": 1.3506, + "step": 26475 + }, + { + "epoch": 0.865812189380068, + "grad_norm": 3.109069532008803, + "learning_rate": 1.0771457054807466e-06, + "loss": 1.3379, + "step": 26480 + }, + { + "epoch": 0.8659756735547999, + "grad_norm": 3.3576407351080637, + "learning_rate": 1.0745707004084849e-06, + "loss": 1.3973, + "step": 26485 + }, + { + "epoch": 0.8661391577295318, + "grad_norm": 3.3628393661062423, + "learning_rate": 1.0719986021298644e-06, + "loss": 1.4354, + "step": 26490 + }, + { + "epoch": 0.8663026419042636, + "grad_norm": 3.2378792491283552, + "learning_rate": 1.069429411482551e-06, + "loss": 1.4306, + "step": 26495 + }, + { + "epoch": 0.8664661260789955, + "grad_norm": 3.1847434725106862, + "learning_rate": 1.0668631293032694e-06, + "loss": 1.3529, + "step": 26500 + }, + { + "epoch": 0.8666296102537274, + "grad_norm": 3.4067251734679216, + "learning_rate": 1.0642997564277924e-06, + "loss": 1.4376, + "step": 26505 + }, + { + "epoch": 0.8667930944284593, + "grad_norm": 3.2051808771867694, + "learning_rate": 1.0617392936909498e-06, + "loss": 1.484, + "step": 26510 + }, + { + "epoch": 0.8669565786031912, + "grad_norm": 3.255762551948714, + "learning_rate": 1.0591817419266237e-06, + "loss": 1.3302, + "step": 26515 + }, + { + "epoch": 0.8671200627779231, + "grad_norm": 3.311280506659311, + "learning_rate": 1.056627101967741e-06, + "loss": 1.3971, + "step": 26520 + }, + { + "epoch": 0.867283546952655, + "grad_norm": 3.105254343484362, + "learning_rate": 1.0540753746462896e-06, + "loss": 1.4977, + "step": 26525 + }, + { + "epoch": 0.8674470311273869, + "grad_norm": 3.2480221748939004, + "learning_rate": 1.0515265607933012e-06, + "loss": 1.2943, + "step": 26530 + }, + { + "epoch": 0.8676105153021187, + "grad_norm": 3.0124722486493836, + "learning_rate": 1.0489806612388632e-06, + "loss": 1.317, + "step": 26535 + }, + { + "epoch": 0.8677739994768506, + "grad_norm": 3.286179729642748, + "learning_rate": 1.0464376768121154e-06, + "loss": 1.2701, + "step": 26540 + }, + { + "epoch": 0.8679374836515825, + "grad_norm": 3.5427289150410703, + "learning_rate": 1.043897608341241e-06, + "loss": 1.3583, + "step": 26545 + }, + { + "epoch": 0.8681009678263144, + "grad_norm": 3.2996858716993436, + "learning_rate": 1.041360456653483e-06, + "loss": 1.3871, + "step": 26550 + }, + { + "epoch": 0.8682644520010463, + "grad_norm": 3.268361225718052, + "learning_rate": 1.0388262225751266e-06, + "loss": 1.3065, + "step": 26555 + }, + { + "epoch": 0.8684279361757782, + "grad_norm": 2.9164731432672544, + "learning_rate": 1.0362949069315087e-06, + "loss": 1.1783, + "step": 26560 + }, + { + "epoch": 0.8685914203505101, + "grad_norm": 3.3407316132414993, + "learning_rate": 1.0337665105470196e-06, + "loss": 1.3777, + "step": 26565 + }, + { + "epoch": 0.868754904525242, + "grad_norm": 3.3032533309046923, + "learning_rate": 1.031241034245093e-06, + "loss": 1.3805, + "step": 26570 + }, + { + "epoch": 0.8689183886999738, + "grad_norm": 3.190875422112921, + "learning_rate": 1.028718478848215e-06, + "loss": 1.3808, + "step": 26575 + }, + { + "epoch": 0.8690818728747057, + "grad_norm": 3.3722509722540837, + "learning_rate": 1.0261988451779247e-06, + "loss": 1.4014, + "step": 26580 + }, + { + "epoch": 0.8692453570494376, + "grad_norm": 3.225995326955981, + "learning_rate": 1.0236821340547986e-06, + "loss": 1.3662, + "step": 26585 + }, + { + "epoch": 0.8694088412241695, + "grad_norm": 3.258930622536097, + "learning_rate": 1.0211683462984745e-06, + "loss": 1.3423, + "step": 26590 + }, + { + "epoch": 0.8695723253989014, + "grad_norm": 3.1345615378340117, + "learning_rate": 1.0186574827276242e-06, + "loss": 1.349, + "step": 26595 + }, + { + "epoch": 0.8697358095736333, + "grad_norm": 3.4050561176061906, + "learning_rate": 1.016149544159979e-06, + "loss": 1.5305, + "step": 26600 + }, + { + "epoch": 0.8698992937483652, + "grad_norm": 3.1317959360660867, + "learning_rate": 1.0136445314123145e-06, + "loss": 1.3511, + "step": 26605 + }, + { + "epoch": 0.870062777923097, + "grad_norm": 3.1323903574022145, + "learning_rate": 1.011142445300447e-06, + "loss": 1.3174, + "step": 26610 + }, + { + "epoch": 0.8702262620978289, + "grad_norm": 3.4228650769474367, + "learning_rate": 1.0086432866392503e-06, + "loss": 1.3956, + "step": 26615 + }, + { + "epoch": 0.8703897462725608, + "grad_norm": 3.2143979929437427, + "learning_rate": 1.0061470562426334e-06, + "loss": 1.4088, + "step": 26620 + }, + { + "epoch": 0.8705532304472927, + "grad_norm": 3.200497498748417, + "learning_rate": 1.0036537549235602e-06, + "loss": 1.2709, + "step": 26625 + }, + { + "epoch": 0.8707167146220246, + "grad_norm": 3.629445398413691, + "learning_rate": 1.0011633834940393e-06, + "loss": 1.5027, + "step": 26630 + }, + { + "epoch": 0.8708801987967565, + "grad_norm": 2.985561231028981, + "learning_rate": 9.986759427651215e-07, + "loss": 1.3052, + "step": 26635 + }, + { + "epoch": 0.8710436829714884, + "grad_norm": 3.0513920378445816, + "learning_rate": 9.96191433546907e-07, + "loss": 1.3969, + "step": 26640 + }, + { + "epoch": 0.8712071671462203, + "grad_norm": 3.18448556808669, + "learning_rate": 9.93709856648537e-07, + "loss": 1.3926, + "step": 26645 + }, + { + "epoch": 0.8713706513209521, + "grad_norm": 3.4459531127325578, + "learning_rate": 9.912312128782032e-07, + "loss": 1.3346, + "step": 26650 + }, + { + "epoch": 0.871534135495684, + "grad_norm": 3.0638694023865574, + "learning_rate": 9.887555030431362e-07, + "loss": 1.3699, + "step": 26655 + }, + { + "epoch": 0.8716976196704159, + "grad_norm": 3.1505746292169734, + "learning_rate": 9.862827279496157e-07, + "loss": 1.4738, + "step": 26660 + }, + { + "epoch": 0.8718611038451478, + "grad_norm": 3.5825639648305896, + "learning_rate": 9.83812888402965e-07, + "loss": 1.3976, + "step": 26665 + }, + { + "epoch": 0.8720245880198797, + "grad_norm": 2.9435270567745753, + "learning_rate": 9.81345985207549e-07, + "loss": 1.2858, + "step": 26670 + }, + { + "epoch": 0.8721880721946116, + "grad_norm": 3.3032827924408394, + "learning_rate": 9.78882019166778e-07, + "loss": 1.4109, + "step": 26675 + }, + { + "epoch": 0.8723515563693435, + "grad_norm": 3.295269703678187, + "learning_rate": 9.764209910831046e-07, + "loss": 1.4264, + "step": 26680 + }, + { + "epoch": 0.8725150405440754, + "grad_norm": 3.104750716284595, + "learning_rate": 9.739629017580254e-07, + "loss": 1.3189, + "step": 26685 + }, + { + "epoch": 0.8726785247188072, + "grad_norm": 3.2168433156913765, + "learning_rate": 9.715077519920825e-07, + "loss": 1.4504, + "step": 26690 + }, + { + "epoch": 0.8728420088935391, + "grad_norm": 3.2538545128809186, + "learning_rate": 9.690555425848536e-07, + "loss": 1.4289, + "step": 26695 + }, + { + "epoch": 0.873005493068271, + "grad_norm": 3.1173684918275484, + "learning_rate": 9.66606274334968e-07, + "loss": 1.3048, + "step": 26700 + }, + { + "epoch": 0.8731689772430029, + "grad_norm": 3.2184101150576976, + "learning_rate": 9.641599480400875e-07, + "loss": 1.4064, + "step": 26705 + }, + { + "epoch": 0.8733324614177348, + "grad_norm": 3.3225868853816016, + "learning_rate": 9.61716564496924e-07, + "loss": 1.3742, + "step": 26710 + }, + { + "epoch": 0.8734959455924667, + "grad_norm": 3.3622499260697003, + "learning_rate": 9.592761245012272e-07, + "loss": 1.4734, + "step": 26715 + }, + { + "epoch": 0.8736594297671986, + "grad_norm": 3.3647047275576174, + "learning_rate": 9.56838628847787e-07, + "loss": 1.3784, + "step": 26720 + }, + { + "epoch": 0.8738229139419305, + "grad_norm": 3.11080984917068, + "learning_rate": 9.544040783304387e-07, + "loss": 1.3906, + "step": 26725 + }, + { + "epoch": 0.8739863981166623, + "grad_norm": 3.2497063356763842, + "learning_rate": 9.519724737420532e-07, + "loss": 1.3854, + "step": 26730 + }, + { + "epoch": 0.8741498822913942, + "grad_norm": 3.3918796437511682, + "learning_rate": 9.495438158745451e-07, + "loss": 1.4063, + "step": 26735 + }, + { + "epoch": 0.8743133664661261, + "grad_norm": 3.2362525070090777, + "learning_rate": 9.471181055188705e-07, + "loss": 1.4619, + "step": 26740 + }, + { + "epoch": 0.874476850640858, + "grad_norm": 3.4994517919102544, + "learning_rate": 9.446953434650207e-07, + "loss": 1.3594, + "step": 26745 + }, + { + "epoch": 0.8746403348155899, + "grad_norm": 3.1915243665162367, + "learning_rate": 9.422755305020348e-07, + "loss": 1.2806, + "step": 26750 + }, + { + "epoch": 0.8748038189903218, + "grad_norm": 3.3972227688781875, + "learning_rate": 9.39858667417981e-07, + "loss": 1.364, + "step": 26755 + }, + { + "epoch": 0.8749673031650537, + "grad_norm": 3.161015249116548, + "learning_rate": 9.374447549999765e-07, + "loss": 1.3146, + "step": 26760 + }, + { + "epoch": 0.8751307873397856, + "grad_norm": 3.3287210757533203, + "learning_rate": 9.350337940341725e-07, + "loss": 1.3898, + "step": 26765 + }, + { + "epoch": 0.8752942715145174, + "grad_norm": 3.282263608234752, + "learning_rate": 9.326257853057564e-07, + "loss": 1.3122, + "step": 26770 + }, + { + "epoch": 0.8754577556892493, + "grad_norm": 3.0764041865524856, + "learning_rate": 9.30220729598963e-07, + "loss": 1.2527, + "step": 26775 + }, + { + "epoch": 0.8756212398639812, + "grad_norm": 3.3681083180833307, + "learning_rate": 9.278186276970558e-07, + "loss": 1.4877, + "step": 26780 + }, + { + "epoch": 0.8757847240387131, + "grad_norm": 3.068116570791245, + "learning_rate": 9.254194803823424e-07, + "loss": 1.374, + "step": 26785 + }, + { + "epoch": 0.875948208213445, + "grad_norm": 3.014185447736701, + "learning_rate": 9.230232884361678e-07, + "loss": 1.5033, + "step": 26790 + }, + { + "epoch": 0.8761116923881769, + "grad_norm": 3.2325445130709465, + "learning_rate": 9.2063005263891e-07, + "loss": 1.3563, + "step": 26795 + }, + { + "epoch": 0.8762751765629088, + "grad_norm": 3.259113765713001, + "learning_rate": 9.182397737699899e-07, + "loss": 1.458, + "step": 26800 + }, + { + "epoch": 0.8764386607376407, + "grad_norm": 3.087606941144992, + "learning_rate": 9.158524526078594e-07, + "loss": 1.4029, + "step": 26805 + }, + { + "epoch": 0.8766021449123724, + "grad_norm": 3.316856776562646, + "learning_rate": 9.13468089930013e-07, + "loss": 1.4064, + "step": 26810 + }, + { + "epoch": 0.8767656290871043, + "grad_norm": 3.4880560419922184, + "learning_rate": 9.110866865129809e-07, + "loss": 1.4369, + "step": 26815 + }, + { + "epoch": 0.8769291132618362, + "grad_norm": 3.3480324528606933, + "learning_rate": 9.087082431323247e-07, + "loss": 1.4209, + "step": 26820 + }, + { + "epoch": 0.8770925974365681, + "grad_norm": 3.243272273676783, + "learning_rate": 9.063327605626471e-07, + "loss": 1.337, + "step": 26825 + }, + { + "epoch": 0.8772560816113, + "grad_norm": 3.0510856961435957, + "learning_rate": 9.039602395775815e-07, + "loss": 1.4084, + "step": 26830 + }, + { + "epoch": 0.8774195657860319, + "grad_norm": 3.2477870143645267, + "learning_rate": 9.015906809498032e-07, + "loss": 1.4297, + "step": 26835 + }, + { + "epoch": 0.8775830499607638, + "grad_norm": 3.1848914442606686, + "learning_rate": 8.992240854510192e-07, + "loss": 1.4732, + "step": 26840 + }, + { + "epoch": 0.8777465341354956, + "grad_norm": 3.1365432231833763, + "learning_rate": 8.968604538519688e-07, + "loss": 1.2895, + "step": 26845 + }, + { + "epoch": 0.8779100183102275, + "grad_norm": 3.432337445085939, + "learning_rate": 8.944997869224326e-07, + "loss": 1.4648, + "step": 26850 + }, + { + "epoch": 0.8780735024849594, + "grad_norm": 3.2102014769016747, + "learning_rate": 8.921420854312191e-07, + "loss": 1.4024, + "step": 26855 + }, + { + "epoch": 0.8782369866596913, + "grad_norm": 3.4099054602509753, + "learning_rate": 8.897873501461741e-07, + "loss": 1.5025, + "step": 26860 + }, + { + "epoch": 0.8784004708344232, + "grad_norm": 3.0866618550131855, + "learning_rate": 8.874355818341807e-07, + "loss": 1.3293, + "step": 26865 + }, + { + "epoch": 0.8785639550091551, + "grad_norm": 3.319104326380867, + "learning_rate": 8.850867812611475e-07, + "loss": 1.3847, + "step": 26870 + }, + { + "epoch": 0.878727439183887, + "grad_norm": 3.445180057079768, + "learning_rate": 8.827409491920247e-07, + "loss": 1.5116, + "step": 26875 + }, + { + "epoch": 0.8788909233586188, + "grad_norm": 3.0952618070903264, + "learning_rate": 8.803980863907902e-07, + "loss": 1.3223, + "step": 26880 + }, + { + "epoch": 0.8790544075333507, + "grad_norm": 3.195077724934337, + "learning_rate": 8.780581936204569e-07, + "loss": 1.3045, + "step": 26885 + }, + { + "epoch": 0.8792178917080826, + "grad_norm": 3.236667326991905, + "learning_rate": 8.757212716430741e-07, + "loss": 1.3391, + "step": 26890 + }, + { + "epoch": 0.8793813758828145, + "grad_norm": 3.1130050496138377, + "learning_rate": 8.733873212197141e-07, + "loss": 1.5636, + "step": 26895 + }, + { + "epoch": 0.8795448600575464, + "grad_norm": 3.0628611568714406, + "learning_rate": 8.710563431104935e-07, + "loss": 1.3325, + "step": 26900 + }, + { + "epoch": 0.8797083442322783, + "grad_norm": 3.319861910550957, + "learning_rate": 8.687283380745504e-07, + "loss": 1.5384, + "step": 26905 + }, + { + "epoch": 0.8798718284070102, + "grad_norm": 3.2270348462254077, + "learning_rate": 8.664033068700628e-07, + "loss": 1.3122, + "step": 26910 + }, + { + "epoch": 0.8800353125817421, + "grad_norm": 3.1880287038161694, + "learning_rate": 8.640812502542317e-07, + "loss": 1.341, + "step": 26915 + }, + { + "epoch": 0.880198796756474, + "grad_norm": 3.162046077515798, + "learning_rate": 8.617621689832956e-07, + "loss": 1.3726, + "step": 26920 + }, + { + "epoch": 0.8803622809312058, + "grad_norm": 3.549494322657461, + "learning_rate": 8.594460638125268e-07, + "loss": 1.622, + "step": 26925 + }, + { + "epoch": 0.8805257651059377, + "grad_norm": 3.2999726280456327, + "learning_rate": 8.571329354962177e-07, + "loss": 1.6407, + "step": 26930 + }, + { + "epoch": 0.8806892492806696, + "grad_norm": 3.3441353219289107, + "learning_rate": 8.548227847877022e-07, + "loss": 1.4675, + "step": 26935 + }, + { + "epoch": 0.8808527334554015, + "grad_norm": 3.1503435727692612, + "learning_rate": 8.525156124393364e-07, + "loss": 1.4725, + "step": 26940 + }, + { + "epoch": 0.8810162176301334, + "grad_norm": 3.5150276116258707, + "learning_rate": 8.502114192025112e-07, + "loss": 1.3821, + "step": 26945 + }, + { + "epoch": 0.8811797018048653, + "grad_norm": 3.090038726150177, + "learning_rate": 8.479102058276478e-07, + "loss": 1.2972, + "step": 26950 + }, + { + "epoch": 0.8813431859795972, + "grad_norm": 3.2711926931730924, + "learning_rate": 8.456119730641909e-07, + "loss": 1.359, + "step": 26955 + }, + { + "epoch": 0.881506670154329, + "grad_norm": 3.385409332555148, + "learning_rate": 8.433167216606242e-07, + "loss": 1.4923, + "step": 26960 + }, + { + "epoch": 0.8816701543290609, + "grad_norm": 3.0460429121294217, + "learning_rate": 8.410244523644506e-07, + "loss": 1.3339, + "step": 26965 + }, + { + "epoch": 0.8818336385037928, + "grad_norm": 3.1127773851843354, + "learning_rate": 8.387351659222054e-07, + "loss": 1.3117, + "step": 26970 + }, + { + "epoch": 0.8819971226785247, + "grad_norm": 3.0626921942461687, + "learning_rate": 8.364488630794565e-07, + "loss": 1.3268, + "step": 26975 + }, + { + "epoch": 0.8821606068532566, + "grad_norm": 3.4334697425583545, + "learning_rate": 8.341655445807928e-07, + "loss": 1.3568, + "step": 26980 + }, + { + "epoch": 0.8823240910279885, + "grad_norm": 3.056727094194524, + "learning_rate": 8.318852111698383e-07, + "loss": 1.3883, + "step": 26985 + }, + { + "epoch": 0.8824875752027204, + "grad_norm": 2.982153656989579, + "learning_rate": 8.296078635892412e-07, + "loss": 1.4797, + "step": 26990 + }, + { + "epoch": 0.8826510593774523, + "grad_norm": 3.239140055504256, + "learning_rate": 8.273335025806773e-07, + "loss": 1.3966, + "step": 26995 + }, + { + "epoch": 0.8828145435521841, + "grad_norm": 2.9677247984696007, + "learning_rate": 8.250621288848504e-07, + "loss": 1.2611, + "step": 27000 + }, + { + "epoch": 0.882978027726916, + "grad_norm": 3.2329136184273133, + "learning_rate": 8.227937432414912e-07, + "loss": 1.3126, + "step": 27005 + }, + { + "epoch": 0.8831415119016479, + "grad_norm": 3.1925339570027513, + "learning_rate": 8.205283463893555e-07, + "loss": 1.3714, + "step": 27010 + }, + { + "epoch": 0.8833049960763798, + "grad_norm": 3.3840314951244634, + "learning_rate": 8.182659390662329e-07, + "loss": 1.4032, + "step": 27015 + }, + { + "epoch": 0.8834684802511117, + "grad_norm": 3.170244292782025, + "learning_rate": 8.160065220089275e-07, + "loss": 1.3634, + "step": 27020 + }, + { + "epoch": 0.8836319644258436, + "grad_norm": 3.2670963921576703, + "learning_rate": 8.137500959532807e-07, + "loss": 1.4137, + "step": 27025 + }, + { + "epoch": 0.8837954486005755, + "grad_norm": 3.5899945579239256, + "learning_rate": 8.114966616341524e-07, + "loss": 1.5094, + "step": 27030 + }, + { + "epoch": 0.8839589327753073, + "grad_norm": 3.203116561020436, + "learning_rate": 8.092462197854345e-07, + "loss": 1.3662, + "step": 27035 + }, + { + "epoch": 0.8841224169500392, + "grad_norm": 3.3753586291247375, + "learning_rate": 8.069987711400351e-07, + "loss": 1.3499, + "step": 27040 + }, + { + "epoch": 0.8842859011247711, + "grad_norm": 3.271190239493998, + "learning_rate": 8.047543164298977e-07, + "loss": 1.4172, + "step": 27045 + }, + { + "epoch": 0.884449385299503, + "grad_norm": 3.391739948487886, + "learning_rate": 8.025128563859863e-07, + "loss": 1.4927, + "step": 27050 + }, + { + "epoch": 0.8846128694742349, + "grad_norm": 3.2028187760676565, + "learning_rate": 8.00274391738286e-07, + "loss": 1.491, + "step": 27055 + }, + { + "epoch": 0.8847763536489668, + "grad_norm": 3.267059140345124, + "learning_rate": 7.980389232158148e-07, + "loss": 1.381, + "step": 27060 + }, + { + "epoch": 0.8849398378236987, + "grad_norm": 3.2726640313455184, + "learning_rate": 7.958064515466046e-07, + "loss": 1.3857, + "step": 27065 + }, + { + "epoch": 0.8851033219984306, + "grad_norm": 3.297239108130755, + "learning_rate": 7.935769774577196e-07, + "loss": 1.4146, + "step": 27070 + }, + { + "epoch": 0.8852668061731624, + "grad_norm": 3.254239836252439, + "learning_rate": 7.913505016752465e-07, + "loss": 1.2987, + "step": 27075 + }, + { + "epoch": 0.8854302903478943, + "grad_norm": 3.2824911979550855, + "learning_rate": 7.8912702492429e-07, + "loss": 1.3878, + "step": 27080 + }, + { + "epoch": 0.8855937745226262, + "grad_norm": 3.4941612126556256, + "learning_rate": 7.869065479289861e-07, + "loss": 1.3959, + "step": 27085 + }, + { + "epoch": 0.8857572586973581, + "grad_norm": 3.2187352379498515, + "learning_rate": 7.846890714124867e-07, + "loss": 1.269, + "step": 27090 + }, + { + "epoch": 0.88592074287209, + "grad_norm": 3.2549540340847263, + "learning_rate": 7.824745960969704e-07, + "loss": 1.321, + "step": 27095 + }, + { + "epoch": 0.8860842270468219, + "grad_norm": 3.1242891001876822, + "learning_rate": 7.802631227036395e-07, + "loss": 1.3603, + "step": 27100 + }, + { + "epoch": 0.8862477112215538, + "grad_norm": 2.966704014932098, + "learning_rate": 7.780546519527121e-07, + "loss": 1.2426, + "step": 27105 + }, + { + "epoch": 0.8864111953962857, + "grad_norm": 3.1621359608805792, + "learning_rate": 7.758491845634386e-07, + "loss": 1.3279, + "step": 27110 + }, + { + "epoch": 0.8865746795710175, + "grad_norm": 3.119571866981738, + "learning_rate": 7.73646721254081e-07, + "loss": 1.4161, + "step": 27115 + }, + { + "epoch": 0.8867381637457494, + "grad_norm": 3.230017880386487, + "learning_rate": 7.714472627419301e-07, + "loss": 1.386, + "step": 27120 + }, + { + "epoch": 0.8869016479204813, + "grad_norm": 3.3567413557480292, + "learning_rate": 7.692508097432971e-07, + "loss": 1.4614, + "step": 27125 + }, + { + "epoch": 0.8870651320952132, + "grad_norm": 3.2331472481844528, + "learning_rate": 7.670573629735078e-07, + "loss": 1.3074, + "step": 27130 + }, + { + "epoch": 0.8872286162699451, + "grad_norm": 3.204893716707464, + "learning_rate": 7.648669231469208e-07, + "loss": 1.3619, + "step": 27135 + }, + { + "epoch": 0.887392100444677, + "grad_norm": 3.2159269220657625, + "learning_rate": 7.62679490976902e-07, + "loss": 1.4398, + "step": 27140 + }, + { + "epoch": 0.8875555846194089, + "grad_norm": 3.3749331410132175, + "learning_rate": 7.604950671758482e-07, + "loss": 1.3964, + "step": 27145 + }, + { + "epoch": 0.8877190687941408, + "grad_norm": 3.2288088890461513, + "learning_rate": 7.583136524551738e-07, + "loss": 1.3513, + "step": 27150 + }, + { + "epoch": 0.8878825529688726, + "grad_norm": 3.1104173090765745, + "learning_rate": 7.561352475253092e-07, + "loss": 1.3602, + "step": 27155 + }, + { + "epoch": 0.8880460371436045, + "grad_norm": 3.1398175429640087, + "learning_rate": 7.539598530957104e-07, + "loss": 1.3232, + "step": 27160 + }, + { + "epoch": 0.8882095213183364, + "grad_norm": 3.3493705042841264, + "learning_rate": 7.517874698748461e-07, + "loss": 1.4586, + "step": 27165 + }, + { + "epoch": 0.8883730054930683, + "grad_norm": 3.3429903131835648, + "learning_rate": 7.496180985702128e-07, + "loss": 1.3898, + "step": 27170 + }, + { + "epoch": 0.8885364896678002, + "grad_norm": 3.140801459813885, + "learning_rate": 7.474517398883185e-07, + "loss": 1.3168, + "step": 27175 + }, + { + "epoch": 0.8886999738425321, + "grad_norm": 3.134525225156784, + "learning_rate": 7.452883945346934e-07, + "loss": 1.4252, + "step": 27180 + }, + { + "epoch": 0.888863458017264, + "grad_norm": 3.079950939318261, + "learning_rate": 7.431280632138882e-07, + "loss": 1.4555, + "step": 27185 + }, + { + "epoch": 0.8890269421919959, + "grad_norm": 3.386661272075087, + "learning_rate": 7.409707466294669e-07, + "loss": 1.4693, + "step": 27190 + }, + { + "epoch": 0.8891904263667277, + "grad_norm": 3.0761732659960415, + "learning_rate": 7.388164454840152e-07, + "loss": 1.3646, + "step": 27195 + }, + { + "epoch": 0.8893539105414596, + "grad_norm": 3.4096233983884456, + "learning_rate": 7.366651604791398e-07, + "loss": 1.4471, + "step": 27200 + }, + { + "epoch": 0.8895173947161915, + "grad_norm": 3.2630336783282066, + "learning_rate": 7.345168923154567e-07, + "loss": 1.4328, + "step": 27205 + }, + { + "epoch": 0.8896808788909234, + "grad_norm": 3.2830315669660854, + "learning_rate": 7.323716416926086e-07, + "loss": 1.3712, + "step": 27210 + }, + { + "epoch": 0.8898443630656553, + "grad_norm": 3.2898047187459665, + "learning_rate": 7.302294093092466e-07, + "loss": 1.4896, + "step": 27215 + }, + { + "epoch": 0.8900078472403872, + "grad_norm": 3.316705102806722, + "learning_rate": 7.28090195863046e-07, + "loss": 1.3414, + "step": 27220 + }, + { + "epoch": 0.8901713314151191, + "grad_norm": 3.291702108448565, + "learning_rate": 7.259540020506972e-07, + "loss": 1.3828, + "step": 27225 + }, + { + "epoch": 0.890334815589851, + "grad_norm": 3.625277565624663, + "learning_rate": 7.238208285679027e-07, + "loss": 1.5564, + "step": 27230 + }, + { + "epoch": 0.8904982997645828, + "grad_norm": 6.7815245693121335, + "learning_rate": 7.216906761093889e-07, + "loss": 1.4567, + "step": 27235 + }, + { + "epoch": 0.8906617839393147, + "grad_norm": 3.1740789532588005, + "learning_rate": 7.19563545368891e-07, + "loss": 1.3516, + "step": 27240 + }, + { + "epoch": 0.8908252681140466, + "grad_norm": 3.5296467389392894, + "learning_rate": 7.174394370391635e-07, + "loss": 1.3891, + "step": 27245 + }, + { + "epoch": 0.8909887522887785, + "grad_norm": 3.3109841546754986, + "learning_rate": 7.1531835181198e-07, + "loss": 1.4566, + "step": 27250 + }, + { + "epoch": 0.8911522364635104, + "grad_norm": 3.389805664590186, + "learning_rate": 7.132002903781221e-07, + "loss": 1.4376, + "step": 27255 + }, + { + "epoch": 0.8913157206382423, + "grad_norm": 3.0750665521173217, + "learning_rate": 7.110852534273938e-07, + "loss": 1.3429, + "step": 27260 + }, + { + "epoch": 0.8914792048129742, + "grad_norm": 3.297858617970889, + "learning_rate": 7.089732416486062e-07, + "loss": 1.4677, + "step": 27265 + }, + { + "epoch": 0.891642688987706, + "grad_norm": 3.0665881674220246, + "learning_rate": 7.068642557295935e-07, + "loss": 1.3328, + "step": 27270 + }, + { + "epoch": 0.8918061731624378, + "grad_norm": 3.4077748154491903, + "learning_rate": 7.047582963572008e-07, + "loss": 1.4707, + "step": 27275 + }, + { + "epoch": 0.8919696573371697, + "grad_norm": 3.2076740640884656, + "learning_rate": 7.026553642172851e-07, + "loss": 1.2973, + "step": 27280 + }, + { + "epoch": 0.8921331415119016, + "grad_norm": 3.207460244579169, + "learning_rate": 7.005554599947229e-07, + "loss": 1.4101, + "step": 27285 + }, + { + "epoch": 0.8922966256866335, + "grad_norm": 3.236995111082473, + "learning_rate": 6.984585843733982e-07, + "loss": 1.359, + "step": 27290 + }, + { + "epoch": 0.8924601098613654, + "grad_norm": 3.514753295675323, + "learning_rate": 6.96364738036216e-07, + "loss": 1.5554, + "step": 27295 + }, + { + "epoch": 0.8926235940360973, + "grad_norm": 3.350576722926601, + "learning_rate": 6.942739216650863e-07, + "loss": 1.3702, + "step": 27300 + }, + { + "epoch": 0.8927870782108291, + "grad_norm": 3.420759255553588, + "learning_rate": 6.921861359409387e-07, + "loss": 1.6051, + "step": 27305 + }, + { + "epoch": 0.892950562385561, + "grad_norm": 3.215003427919983, + "learning_rate": 6.901013815437152e-07, + "loss": 1.3451, + "step": 27310 + }, + { + "epoch": 0.8931140465602929, + "grad_norm": 3.2450519467430623, + "learning_rate": 6.88019659152368e-07, + "loss": 1.3753, + "step": 27315 + }, + { + "epoch": 0.8932775307350248, + "grad_norm": 3.022787207359544, + "learning_rate": 6.859409694448637e-07, + "loss": 1.373, + "step": 27320 + }, + { + "epoch": 0.8934410149097567, + "grad_norm": 3.2883230898000595, + "learning_rate": 6.838653130981787e-07, + "loss": 1.35, + "step": 27325 + }, + { + "epoch": 0.8936044990844886, + "grad_norm": 3.187754219483417, + "learning_rate": 6.817926907883044e-07, + "loss": 1.3027, + "step": 27330 + }, + { + "epoch": 0.8937679832592205, + "grad_norm": 3.3145020039329367, + "learning_rate": 6.797231031902462e-07, + "loss": 1.4277, + "step": 27335 + }, + { + "epoch": 0.8939314674339524, + "grad_norm": 3.1824880507112603, + "learning_rate": 6.776565509780131e-07, + "loss": 1.3131, + "step": 27340 + }, + { + "epoch": 0.8940949516086842, + "grad_norm": 3.3653691630810534, + "learning_rate": 6.755930348246342e-07, + "loss": 1.3376, + "step": 27345 + }, + { + "epoch": 0.8942584357834161, + "grad_norm": 3.265296902599262, + "learning_rate": 6.735325554021432e-07, + "loss": 1.2311, + "step": 27350 + }, + { + "epoch": 0.894421919958148, + "grad_norm": 3.067483133853957, + "learning_rate": 6.7147511338159e-07, + "loss": 1.2412, + "step": 27355 + }, + { + "epoch": 0.8945854041328799, + "grad_norm": 3.1455399387020218, + "learning_rate": 6.694207094330329e-07, + "loss": 1.264, + "step": 27360 + }, + { + "epoch": 0.8947488883076118, + "grad_norm": 2.851574141642203, + "learning_rate": 6.673693442255402e-07, + "loss": 1.2707, + "step": 27365 + }, + { + "epoch": 0.8949123724823437, + "grad_norm": 3.398207038158719, + "learning_rate": 6.653210184271919e-07, + "loss": 1.2891, + "step": 27370 + }, + { + "epoch": 0.8950758566570756, + "grad_norm": 3.1185087442939197, + "learning_rate": 6.632757327050765e-07, + "loss": 1.384, + "step": 27375 + }, + { + "epoch": 0.8952393408318075, + "grad_norm": 3.113436378952219, + "learning_rate": 6.612334877252946e-07, + "loss": 1.3694, + "step": 27380 + }, + { + "epoch": 0.8954028250065393, + "grad_norm": 3.4689831600300796, + "learning_rate": 6.591942841529553e-07, + "loss": 1.3004, + "step": 27385 + }, + { + "epoch": 0.8955663091812712, + "grad_norm": 3.1706444428268545, + "learning_rate": 6.571581226521751e-07, + "loss": 1.3709, + "step": 27390 + }, + { + "epoch": 0.8957297933560031, + "grad_norm": 3.1960749998130744, + "learning_rate": 6.551250038860834e-07, + "loss": 1.2536, + "step": 27395 + }, + { + "epoch": 0.895893277530735, + "grad_norm": 3.932162229137474, + "learning_rate": 6.530949285168198e-07, + "loss": 1.4631, + "step": 27400 + }, + { + "epoch": 0.8960567617054669, + "grad_norm": 3.079178479998164, + "learning_rate": 6.510678972055251e-07, + "loss": 1.4185, + "step": 27405 + }, + { + "epoch": 0.8962202458801988, + "grad_norm": 3.1968603326910996, + "learning_rate": 6.490439106123592e-07, + "loss": 1.3203, + "step": 27410 + }, + { + "epoch": 0.8963837300549307, + "grad_norm": 3.2817643076945044, + "learning_rate": 6.470229693964791e-07, + "loss": 1.4391, + "step": 27415 + }, + { + "epoch": 0.8965472142296625, + "grad_norm": 3.1382894417815277, + "learning_rate": 6.450050742160596e-07, + "loss": 1.2876, + "step": 27420 + }, + { + "epoch": 0.8967106984043944, + "grad_norm": 3.1020418276654955, + "learning_rate": 6.429902257282794e-07, + "loss": 1.3348, + "step": 27425 + }, + { + "epoch": 0.8968741825791263, + "grad_norm": 3.1995174106327178, + "learning_rate": 6.409784245893247e-07, + "loss": 1.4936, + "step": 27430 + }, + { + "epoch": 0.8970376667538582, + "grad_norm": 3.4192917152582396, + "learning_rate": 6.389696714543902e-07, + "loss": 1.418, + "step": 27435 + }, + { + "epoch": 0.8972011509285901, + "grad_norm": 3.246636802331199, + "learning_rate": 6.369639669776762e-07, + "loss": 1.4004, + "step": 27440 + }, + { + "epoch": 0.897364635103322, + "grad_norm": 3.461101494661023, + "learning_rate": 6.349613118123943e-07, + "loss": 1.4075, + "step": 27445 + }, + { + "epoch": 0.8975281192780539, + "grad_norm": 3.1108315838638587, + "learning_rate": 6.329617066107563e-07, + "loss": 1.3766, + "step": 27450 + }, + { + "epoch": 0.8976916034527858, + "grad_norm": 3.14777594124927, + "learning_rate": 6.309651520239878e-07, + "loss": 1.4287, + "step": 27455 + }, + { + "epoch": 0.8978550876275176, + "grad_norm": 3.1597668620521437, + "learning_rate": 6.289716487023179e-07, + "loss": 1.3846, + "step": 27460 + }, + { + "epoch": 0.8980185718022495, + "grad_norm": 3.297500352813614, + "learning_rate": 6.269811972949791e-07, + "loss": 1.5568, + "step": 27465 + }, + { + "epoch": 0.8981820559769814, + "grad_norm": 3.1930948607810845, + "learning_rate": 6.249937984502153e-07, + "loss": 1.286, + "step": 27470 + }, + { + "epoch": 0.8983455401517133, + "grad_norm": 3.3037625812794147, + "learning_rate": 6.23009452815272e-07, + "loss": 1.4185, + "step": 27475 + }, + { + "epoch": 0.8985090243264452, + "grad_norm": 3.2180587513930496, + "learning_rate": 6.210281610364021e-07, + "loss": 1.3848, + "step": 27480 + }, + { + "epoch": 0.8986725085011771, + "grad_norm": 3.4359116222683084, + "learning_rate": 6.190499237588655e-07, + "loss": 1.3679, + "step": 27485 + }, + { + "epoch": 0.898835992675909, + "grad_norm": 3.254933416036957, + "learning_rate": 6.170747416269219e-07, + "loss": 1.4228, + "step": 27490 + }, + { + "epoch": 0.8989994768506409, + "grad_norm": 3.383835247688548, + "learning_rate": 6.151026152838447e-07, + "loss": 1.4222, + "step": 27495 + }, + { + "epoch": 0.8991629610253727, + "grad_norm": 3.2978972151611146, + "learning_rate": 6.131335453719022e-07, + "loss": 1.4796, + "step": 27500 + }, + { + "epoch": 0.8993264452001046, + "grad_norm": 3.224466140376938, + "learning_rate": 6.111675325323752e-07, + "loss": 1.3657, + "step": 27505 + }, + { + "epoch": 0.8994899293748365, + "grad_norm": 3.3902065480734804, + "learning_rate": 6.092045774055466e-07, + "loss": 1.4987, + "step": 27510 + }, + { + "epoch": 0.8996534135495684, + "grad_norm": 3.353190817128012, + "learning_rate": 6.072446806306997e-07, + "loss": 1.4785, + "step": 27515 + }, + { + "epoch": 0.8998168977243003, + "grad_norm": 3.2135524531459105, + "learning_rate": 6.052878428461284e-07, + "loss": 1.3447, + "step": 27520 + }, + { + "epoch": 0.8999803818990322, + "grad_norm": 3.129390792348118, + "learning_rate": 6.033340646891239e-07, + "loss": 1.3862, + "step": 27525 + }, + { + "epoch": 0.9001438660737641, + "grad_norm": 3.27644393393541, + "learning_rate": 6.01383346795985e-07, + "loss": 1.5549, + "step": 27530 + }, + { + "epoch": 0.900307350248496, + "grad_norm": 3.2776970166022266, + "learning_rate": 5.994356898020137e-07, + "loss": 1.3763, + "step": 27535 + }, + { + "epoch": 0.9004708344232278, + "grad_norm": 3.379364498143357, + "learning_rate": 5.974910943415113e-07, + "loss": 1.3734, + "step": 27540 + }, + { + "epoch": 0.9006343185979597, + "grad_norm": 3.488629333060288, + "learning_rate": 5.95549561047789e-07, + "loss": 1.6936, + "step": 27545 + }, + { + "epoch": 0.9007978027726916, + "grad_norm": 3.1936644609355507, + "learning_rate": 5.936110905531522e-07, + "loss": 1.4188, + "step": 27550 + }, + { + "epoch": 0.9009612869474235, + "grad_norm": 3.0235348449360933, + "learning_rate": 5.916756834889181e-07, + "loss": 1.3861, + "step": 27555 + }, + { + "epoch": 0.9011247711221554, + "grad_norm": 3.279220089637672, + "learning_rate": 5.897433404853969e-07, + "loss": 1.3342, + "step": 27560 + }, + { + "epoch": 0.9012882552968873, + "grad_norm": 3.303854911741634, + "learning_rate": 5.878140621719064e-07, + "loss": 1.365, + "step": 27565 + }, + { + "epoch": 0.9014517394716192, + "grad_norm": 3.2736879658524534, + "learning_rate": 5.858878491767683e-07, + "loss": 1.4635, + "step": 27570 + }, + { + "epoch": 0.901615223646351, + "grad_norm": 3.4036416945722165, + "learning_rate": 5.839647021272987e-07, + "loss": 1.324, + "step": 27575 + }, + { + "epoch": 0.9017787078210829, + "grad_norm": 3.2986361289260433, + "learning_rate": 5.820446216498232e-07, + "loss": 1.5066, + "step": 27580 + }, + { + "epoch": 0.9019421919958148, + "grad_norm": 3.4302386358638075, + "learning_rate": 5.801276083696639e-07, + "loss": 1.3891, + "step": 27585 + }, + { + "epoch": 0.9021056761705467, + "grad_norm": 3.3133586935857506, + "learning_rate": 5.782136629111424e-07, + "loss": 1.3728, + "step": 27590 + }, + { + "epoch": 0.9022691603452786, + "grad_norm": 3.352004170312453, + "learning_rate": 5.763027858975867e-07, + "loss": 1.5457, + "step": 27595 + }, + { + "epoch": 0.9024326445200105, + "grad_norm": 3.170971726794196, + "learning_rate": 5.743949779513214e-07, + "loss": 1.338, + "step": 27600 + }, + { + "epoch": 0.9025961286947424, + "grad_norm": 3.267664872245142, + "learning_rate": 5.724902396936726e-07, + "loss": 1.4192, + "step": 27605 + }, + { + "epoch": 0.9027596128694743, + "grad_norm": 3.1571946692435766, + "learning_rate": 5.705885717449688e-07, + "loss": 1.3014, + "step": 27610 + }, + { + "epoch": 0.9029230970442061, + "grad_norm": 3.723208954534775, + "learning_rate": 5.686899747245345e-07, + "loss": 1.384, + "step": 27615 + }, + { + "epoch": 0.903086581218938, + "grad_norm": 3.2372233660682994, + "learning_rate": 5.667944492506982e-07, + "loss": 1.3482, + "step": 27620 + }, + { + "epoch": 0.9032500653936699, + "grad_norm": 3.259153742343141, + "learning_rate": 5.649019959407842e-07, + "loss": 1.6088, + "step": 27625 + }, + { + "epoch": 0.9034135495684018, + "grad_norm": 3.012357650204813, + "learning_rate": 5.630126154111182e-07, + "loss": 1.3495, + "step": 27630 + }, + { + "epoch": 0.9035770337431337, + "grad_norm": 3.4149082658832826, + "learning_rate": 5.61126308277028e-07, + "loss": 1.5585, + "step": 27635 + }, + { + "epoch": 0.9037405179178656, + "grad_norm": 3.1962743060399452, + "learning_rate": 5.592430751528333e-07, + "loss": 1.3042, + "step": 27640 + }, + { + "epoch": 0.9039040020925975, + "grad_norm": 3.129912334560322, + "learning_rate": 5.573629166518613e-07, + "loss": 1.275, + "step": 27645 + }, + { + "epoch": 0.9040674862673294, + "grad_norm": 3.1199612538427592, + "learning_rate": 5.5548583338643e-07, + "loss": 1.3941, + "step": 27650 + }, + { + "epoch": 0.9042309704420612, + "grad_norm": 3.21193498258873, + "learning_rate": 5.536118259678602e-07, + "loss": 1.3891, + "step": 27655 + }, + { + "epoch": 0.9043944546167931, + "grad_norm": 3.3204765830159895, + "learning_rate": 5.517408950064729e-07, + "loss": 1.428, + "step": 27660 + }, + { + "epoch": 0.904557938791525, + "grad_norm": 3.179249734526584, + "learning_rate": 5.498730411115805e-07, + "loss": 1.3785, + "step": 27665 + }, + { + "epoch": 0.9047214229662569, + "grad_norm": 3.091599432154558, + "learning_rate": 5.48008264891502e-07, + "loss": 1.2756, + "step": 27670 + }, + { + "epoch": 0.9048849071409888, + "grad_norm": 3.358661690983638, + "learning_rate": 5.461465669535437e-07, + "loss": 1.289, + "step": 27675 + }, + { + "epoch": 0.9050483913157207, + "grad_norm": 3.2804737768376016, + "learning_rate": 5.442879479040186e-07, + "loss": 1.4392, + "step": 27680 + }, + { + "epoch": 0.9052118754904526, + "grad_norm": 3.437245836896061, + "learning_rate": 5.424324083482335e-07, + "loss": 1.3787, + "step": 27685 + }, + { + "epoch": 0.9053753596651845, + "grad_norm": 3.2287581761338475, + "learning_rate": 5.405799488904906e-07, + "loss": 1.4098, + "step": 27690 + }, + { + "epoch": 0.9055388438399163, + "grad_norm": 3.0538997941859587, + "learning_rate": 5.387305701340917e-07, + "loss": 1.4535, + "step": 27695 + }, + { + "epoch": 0.9057023280146482, + "grad_norm": 3.3239101328807794, + "learning_rate": 5.368842726813328e-07, + "loss": 1.3516, + "step": 27700 + }, + { + "epoch": 0.9058658121893801, + "grad_norm": 3.305983557841726, + "learning_rate": 5.350410571335107e-07, + "loss": 1.4398, + "step": 27705 + }, + { + "epoch": 0.906029296364112, + "grad_norm": 3.1506681912054635, + "learning_rate": 5.332009240909119e-07, + "loss": 1.3618, + "step": 27710 + }, + { + "epoch": 0.9061927805388439, + "grad_norm": 3.3673177556316682, + "learning_rate": 5.313638741528237e-07, + "loss": 1.3337, + "step": 27715 + }, + { + "epoch": 0.9063562647135758, + "grad_norm": 3.4295598793415025, + "learning_rate": 5.295299079175309e-07, + "loss": 1.4226, + "step": 27720 + }, + { + "epoch": 0.9065197488883077, + "grad_norm": 3.0951024264030313, + "learning_rate": 5.276990259823068e-07, + "loss": 1.4186, + "step": 27725 + }, + { + "epoch": 0.9066832330630396, + "grad_norm": 3.20532996071351, + "learning_rate": 5.258712289434298e-07, + "loss": 1.4864, + "step": 27730 + }, + { + "epoch": 0.9068467172377714, + "grad_norm": 3.3408927609388708, + "learning_rate": 5.240465173961639e-07, + "loss": 1.3469, + "step": 27735 + }, + { + "epoch": 0.9070102014125033, + "grad_norm": 3.0702751757008633, + "learning_rate": 5.22224891934775e-07, + "loss": 1.4016, + "step": 27740 + }, + { + "epoch": 0.9071736855872351, + "grad_norm": 3.147710321389792, + "learning_rate": 5.204063531525238e-07, + "loss": 1.2855, + "step": 27745 + }, + { + "epoch": 0.907337169761967, + "grad_norm": 3.4495891819745363, + "learning_rate": 5.185909016416613e-07, + "loss": 1.4198, + "step": 27750 + }, + { + "epoch": 0.9075006539366989, + "grad_norm": 3.24116499397124, + "learning_rate": 5.167785379934365e-07, + "loss": 1.5066, + "step": 27755 + }, + { + "epoch": 0.9076641381114308, + "grad_norm": 3.0323846664370313, + "learning_rate": 5.149692627980907e-07, + "loss": 1.4314, + "step": 27760 + }, + { + "epoch": 0.9078276222861626, + "grad_norm": 3.2306165853631468, + "learning_rate": 5.131630766448614e-07, + "loss": 1.3448, + "step": 27765 + }, + { + "epoch": 0.9079911064608945, + "grad_norm": 3.267901717660111, + "learning_rate": 5.113599801219804e-07, + "loss": 1.4403, + "step": 27770 + }, + { + "epoch": 0.9081545906356264, + "grad_norm": 3.084662565440458, + "learning_rate": 5.095599738166701e-07, + "loss": 1.4139, + "step": 27775 + }, + { + "epoch": 0.9083180748103583, + "grad_norm": 3.4922930606675626, + "learning_rate": 5.077630583151505e-07, + "loss": 1.4809, + "step": 27780 + }, + { + "epoch": 0.9084815589850902, + "grad_norm": 3.4872156666706418, + "learning_rate": 5.059692342026301e-07, + "loss": 1.6147, + "step": 27785 + }, + { + "epoch": 0.9086450431598221, + "grad_norm": 3.344253735774814, + "learning_rate": 5.04178502063315e-07, + "loss": 1.3515, + "step": 27790 + }, + { + "epoch": 0.908808527334554, + "grad_norm": 3.420153236928917, + "learning_rate": 5.023908624804053e-07, + "loss": 1.5533, + "step": 27795 + }, + { + "epoch": 0.9089720115092859, + "grad_norm": 3.11611758522088, + "learning_rate": 5.006063160360863e-07, + "loss": 1.5205, + "step": 27800 + }, + { + "epoch": 0.9091354956840177, + "grad_norm": 3.306540633121428, + "learning_rate": 4.988248633115444e-07, + "loss": 1.3957, + "step": 27805 + }, + { + "epoch": 0.9092989798587496, + "grad_norm": 3.225807477168097, + "learning_rate": 4.970465048869555e-07, + "loss": 1.3637, + "step": 27810 + }, + { + "epoch": 0.9094624640334815, + "grad_norm": 3.4495177724363555, + "learning_rate": 4.95271241341484e-07, + "loss": 1.4928, + "step": 27815 + }, + { + "epoch": 0.9096259482082134, + "grad_norm": 3.2606122322536537, + "learning_rate": 4.93499073253294e-07, + "loss": 1.5186, + "step": 27820 + }, + { + "epoch": 0.9097894323829453, + "grad_norm": 3.299577386166593, + "learning_rate": 4.91730001199534e-07, + "loss": 1.4317, + "step": 27825 + }, + { + "epoch": 0.9099529165576772, + "grad_norm": 3.1909479606979896, + "learning_rate": 4.899640257563498e-07, + "loss": 1.366, + "step": 27830 + }, + { + "epoch": 0.9101164007324091, + "grad_norm": 3.3041417675149267, + "learning_rate": 4.882011474988746e-07, + "loss": 1.442, + "step": 27835 + }, + { + "epoch": 0.910279884907141, + "grad_norm": 3.3327352582345275, + "learning_rate": 4.864413670012335e-07, + "loss": 1.4575, + "step": 27840 + }, + { + "epoch": 0.9104433690818728, + "grad_norm": 3.2076844135163616, + "learning_rate": 4.846846848365483e-07, + "loss": 1.5197, + "step": 27845 + }, + { + "epoch": 0.9106068532566047, + "grad_norm": 3.1450220187168063, + "learning_rate": 4.829311015769234e-07, + "loss": 1.2965, + "step": 27850 + }, + { + "epoch": 0.9107703374313366, + "grad_norm": 3.4327845171545217, + "learning_rate": 4.811806177934597e-07, + "loss": 1.3559, + "step": 27855 + }, + { + "epoch": 0.9109338216060685, + "grad_norm": 3.0622214273662958, + "learning_rate": 4.794332340562457e-07, + "loss": 1.3335, + "step": 27860 + }, + { + "epoch": 0.9110973057808004, + "grad_norm": 3.1641619286381135, + "learning_rate": 4.776889509343619e-07, + "loss": 1.3378, + "step": 27865 + }, + { + "epoch": 0.9112607899555323, + "grad_norm": 3.3595451164163417, + "learning_rate": 4.759477689958802e-07, + "loss": 1.445, + "step": 27870 + }, + { + "epoch": 0.9114242741302642, + "grad_norm": 3.3035057401750847, + "learning_rate": 4.7420968880785846e-07, + "loss": 1.4119, + "step": 27875 + }, + { + "epoch": 0.911587758304996, + "grad_norm": 3.3695487610961736, + "learning_rate": 4.724747109363481e-07, + "loss": 1.4804, + "step": 27880 + }, + { + "epoch": 0.9117512424797279, + "grad_norm": 3.08817405858584, + "learning_rate": 4.707428359463884e-07, + "loss": 1.2611, + "step": 27885 + }, + { + "epoch": 0.9119147266544598, + "grad_norm": 3.2189550670380993, + "learning_rate": 4.6901406440200804e-07, + "loss": 1.403, + "step": 27890 + }, + { + "epoch": 0.9120782108291917, + "grad_norm": 3.1692935744436954, + "learning_rate": 4.6728839686622894e-07, + "loss": 1.3676, + "step": 27895 + }, + { + "epoch": 0.9122416950039236, + "grad_norm": 3.3870298622339257, + "learning_rate": 4.6556583390105383e-07, + "loss": 1.4687, + "step": 27900 + }, + { + "epoch": 0.9124051791786555, + "grad_norm": 3.3825815940775232, + "learning_rate": 4.6384637606748274e-07, + "loss": 1.385, + "step": 27905 + }, + { + "epoch": 0.9125686633533874, + "grad_norm": 3.066694251194544, + "learning_rate": 4.621300239254989e-07, + "loss": 1.3789, + "step": 27910 + }, + { + "epoch": 0.9127321475281193, + "grad_norm": 3.188955832574244, + "learning_rate": 4.6041677803407735e-07, + "loss": 1.5181, + "step": 27915 + }, + { + "epoch": 0.9128956317028512, + "grad_norm": 3.2199146415202335, + "learning_rate": 4.587066389511807e-07, + "loss": 1.3814, + "step": 27920 + }, + { + "epoch": 0.913059115877583, + "grad_norm": 3.121292526091169, + "learning_rate": 4.5699960723375794e-07, + "loss": 1.4402, + "step": 27925 + }, + { + "epoch": 0.9132226000523149, + "grad_norm": 3.2169004114713955, + "learning_rate": 4.5529568343774977e-07, + "loss": 1.1813, + "step": 27930 + }, + { + "epoch": 0.9133860842270468, + "grad_norm": 3.1418891537863924, + "learning_rate": 4.5359486811807904e-07, + "loss": 1.3601, + "step": 27935 + }, + { + "epoch": 0.9135495684017787, + "grad_norm": 3.125822731222413, + "learning_rate": 4.5189716182866164e-07, + "loss": 1.4837, + "step": 27940 + }, + { + "epoch": 0.9137130525765106, + "grad_norm": 3.3380355936515604, + "learning_rate": 4.502025651223996e-07, + "loss": 1.4018, + "step": 27945 + }, + { + "epoch": 0.9138765367512425, + "grad_norm": 3.053926449045742, + "learning_rate": 4.4851107855118036e-07, + "loss": 1.4008, + "step": 27950 + }, + { + "epoch": 0.9140400209259744, + "grad_norm": 3.053093842701314, + "learning_rate": 4.4682270266588e-07, + "loss": 1.3447, + "step": 27955 + }, + { + "epoch": 0.9142035051007062, + "grad_norm": 3.2947361634036287, + "learning_rate": 4.451374380163609e-07, + "loss": 1.4797, + "step": 27960 + }, + { + "epoch": 0.9143669892754381, + "grad_norm": 3.048049183171731, + "learning_rate": 4.4345528515147286e-07, + "loss": 1.413, + "step": 27965 + }, + { + "epoch": 0.91453047345017, + "grad_norm": 3.2849797606923317, + "learning_rate": 4.41776244619051e-07, + "loss": 1.4765, + "step": 27970 + }, + { + "epoch": 0.9146939576249019, + "grad_norm": 3.27198124816327, + "learning_rate": 4.4010031696591906e-07, + "loss": 1.4497, + "step": 27975 + }, + { + "epoch": 0.9148574417996338, + "grad_norm": 3.0967809690389787, + "learning_rate": 4.384275027378848e-07, + "loss": 1.3909, + "step": 27980 + }, + { + "epoch": 0.9150209259743657, + "grad_norm": 3.253580623040376, + "learning_rate": 4.3675780247974254e-07, + "loss": 1.3933, + "step": 27985 + }, + { + "epoch": 0.9151844101490976, + "grad_norm": 3.469358001967804, + "learning_rate": 4.35091216735275e-07, + "loss": 1.443, + "step": 27990 + }, + { + "epoch": 0.9153478943238295, + "grad_norm": 3.4713416256938596, + "learning_rate": 4.334277460472447e-07, + "loss": 1.488, + "step": 27995 + }, + { + "epoch": 0.9155113784985613, + "grad_norm": 3.2698925015752707, + "learning_rate": 4.3176739095740607e-07, + "loss": 1.3718, + "step": 28000 + }, + { + "epoch": 0.9156748626732932, + "grad_norm": 3.2068735968881863, + "learning_rate": 4.301101520064954e-07, + "loss": 1.4289, + "step": 28005 + }, + { + "epoch": 0.9158383468480251, + "grad_norm": 3.332691612482211, + "learning_rate": 4.2845602973423326e-07, + "loss": 1.357, + "step": 28010 + }, + { + "epoch": 0.916001831022757, + "grad_norm": 3.2128050187201262, + "learning_rate": 4.268050246793276e-07, + "loss": 1.4199, + "step": 28015 + }, + { + "epoch": 0.9161653151974889, + "grad_norm": 3.1118543921045565, + "learning_rate": 4.2515713737947274e-07, + "loss": 1.2681, + "step": 28020 + }, + { + "epoch": 0.9163287993722208, + "grad_norm": 3.406693444917827, + "learning_rate": 4.235123683713405e-07, + "loss": 1.3638, + "step": 28025 + }, + { + "epoch": 0.9164922835469527, + "grad_norm": 3.372842196218152, + "learning_rate": 4.218707181905968e-07, + "loss": 1.3669, + "step": 28030 + }, + { + "epoch": 0.9166557677216846, + "grad_norm": 2.9990280593461027, + "learning_rate": 4.20232187371884e-07, + "loss": 1.4681, + "step": 28035 + }, + { + "epoch": 0.9168192518964164, + "grad_norm": 3.4244215222933776, + "learning_rate": 4.185967764488308e-07, + "loss": 1.4352, + "step": 28040 + }, + { + "epoch": 0.9169827360711483, + "grad_norm": 3.277352886335518, + "learning_rate": 4.1696448595405335e-07, + "loss": 1.4099, + "step": 28045 + }, + { + "epoch": 0.9171462202458802, + "grad_norm": 3.2260486631643346, + "learning_rate": 4.153353164191454e-07, + "loss": 1.3849, + "step": 28050 + }, + { + "epoch": 0.9173097044206121, + "grad_norm": 3.271638675638849, + "learning_rate": 4.137092683746913e-07, + "loss": 1.4347, + "step": 28055 + }, + { + "epoch": 0.917473188595344, + "grad_norm": 3.495970106332848, + "learning_rate": 4.120863423502508e-07, + "loss": 1.3708, + "step": 28060 + }, + { + "epoch": 0.9176366727700759, + "grad_norm": 3.106370537773875, + "learning_rate": 4.1046653887437335e-07, + "loss": 1.4078, + "step": 28065 + }, + { + "epoch": 0.9178001569448078, + "grad_norm": 3.133893962122323, + "learning_rate": 4.088498584745915e-07, + "loss": 1.4061, + "step": 28070 + }, + { + "epoch": 0.9179636411195397, + "grad_norm": 3.2251133964338634, + "learning_rate": 4.072363016774139e-07, + "loss": 1.3201, + "step": 28075 + }, + { + "epoch": 0.9181271252942715, + "grad_norm": 3.0160886092308252, + "learning_rate": 4.056258690083403e-07, + "loss": 1.3119, + "step": 28080 + }, + { + "epoch": 0.9182906094690034, + "grad_norm": 3.309640419011912, + "learning_rate": 4.040185609918457e-07, + "loss": 1.3072, + "step": 28085 + }, + { + "epoch": 0.9184540936437353, + "grad_norm": 3.1246396773750957, + "learning_rate": 4.024143781513945e-07, + "loss": 1.4075, + "step": 28090 + }, + { + "epoch": 0.9186175778184672, + "grad_norm": 3.3025200970966293, + "learning_rate": 4.008133210094267e-07, + "loss": 1.4072, + "step": 28095 + }, + { + "epoch": 0.9187810619931991, + "grad_norm": 3.086608978474607, + "learning_rate": 3.9921539008736965e-07, + "loss": 1.4192, + "step": 28100 + }, + { + "epoch": 0.918944546167931, + "grad_norm": 3.323735718410055, + "learning_rate": 3.9762058590562924e-07, + "loss": 1.3897, + "step": 28105 + }, + { + "epoch": 0.9191080303426629, + "grad_norm": 3.250714588471603, + "learning_rate": 3.960289089835934e-07, + "loss": 1.3906, + "step": 28110 + }, + { + "epoch": 0.9192715145173947, + "grad_norm": 3.148213464820904, + "learning_rate": 3.9444035983963513e-07, + "loss": 1.3299, + "step": 28115 + }, + { + "epoch": 0.9194349986921266, + "grad_norm": 3.0586126834026297, + "learning_rate": 3.928549389911018e-07, + "loss": 1.4277, + "step": 28120 + }, + { + "epoch": 0.9195984828668585, + "grad_norm": 3.2528896401593976, + "learning_rate": 3.912726469543282e-07, + "loss": 1.3627, + "step": 28125 + }, + { + "epoch": 0.9197619670415904, + "grad_norm": 3.4687474725144516, + "learning_rate": 3.8969348424463093e-07, + "loss": 1.3813, + "step": 28130 + }, + { + "epoch": 0.9199254512163223, + "grad_norm": 3.3111322739966558, + "learning_rate": 3.8811745137629865e-07, + "loss": 1.3968, + "step": 28135 + }, + { + "epoch": 0.9200889353910542, + "grad_norm": 3.2541951857455182, + "learning_rate": 3.8654454886261295e-07, + "loss": 1.4724, + "step": 28140 + }, + { + "epoch": 0.9202524195657861, + "grad_norm": 3.4768007625436654, + "learning_rate": 3.849747772158241e-07, + "loss": 1.5998, + "step": 28145 + }, + { + "epoch": 0.920415903740518, + "grad_norm": 3.6846063029301273, + "learning_rate": 3.8340813694717096e-07, + "loss": 1.4272, + "step": 28150 + }, + { + "epoch": 0.9205793879152498, + "grad_norm": 3.220679014028703, + "learning_rate": 3.8184462856687086e-07, + "loss": 1.381, + "step": 28155 + }, + { + "epoch": 0.9207428720899817, + "grad_norm": 3.2421139301341255, + "learning_rate": 3.802842525841177e-07, + "loss": 1.3585, + "step": 28160 + }, + { + "epoch": 0.9209063562647136, + "grad_norm": 3.308703641208201, + "learning_rate": 3.787270095070905e-07, + "loss": 1.3813, + "step": 28165 + }, + { + "epoch": 0.9210698404394455, + "grad_norm": 3.1972786461167013, + "learning_rate": 3.771728998429425e-07, + "loss": 1.314, + "step": 28170 + }, + { + "epoch": 0.9212333246141774, + "grad_norm": 3.350775870491425, + "learning_rate": 3.756219240978098e-07, + "loss": 1.5364, + "step": 28175 + }, + { + "epoch": 0.9213968087889093, + "grad_norm": 3.470081043253315, + "learning_rate": 3.740740827768097e-07, + "loss": 1.5224, + "step": 28180 + }, + { + "epoch": 0.9215602929636412, + "grad_norm": 3.3537224149750853, + "learning_rate": 3.7252937638403206e-07, + "loss": 1.4198, + "step": 28185 + }, + { + "epoch": 0.9217237771383731, + "grad_norm": 3.1500708664869825, + "learning_rate": 3.7098780542255355e-07, + "loss": 1.3272, + "step": 28190 + }, + { + "epoch": 0.9218872613131049, + "grad_norm": 3.3607999386990284, + "learning_rate": 3.6944937039442355e-07, + "loss": 1.3873, + "step": 28195 + }, + { + "epoch": 0.9220507454878368, + "grad_norm": 3.2644889571059954, + "learning_rate": 3.679140718006735e-07, + "loss": 1.3172, + "step": 28200 + }, + { + "epoch": 0.9222142296625687, + "grad_norm": 3.276532323989339, + "learning_rate": 3.6638191014131466e-07, + "loss": 1.4164, + "step": 28205 + }, + { + "epoch": 0.9223777138373005, + "grad_norm": 3.335647562029394, + "learning_rate": 3.6485288591533e-07, + "loss": 1.4302, + "step": 28210 + }, + { + "epoch": 0.9225411980120324, + "grad_norm": 3.4899753649912517, + "learning_rate": 3.6332699962068894e-07, + "loss": 1.4418, + "step": 28215 + }, + { + "epoch": 0.9227046821867643, + "grad_norm": 3.3398199660822114, + "learning_rate": 3.6180425175433407e-07, + "loss": 1.3654, + "step": 28220 + }, + { + "epoch": 0.9228681663614962, + "grad_norm": 3.4224839757647927, + "learning_rate": 3.6028464281218643e-07, + "loss": 1.3798, + "step": 28225 + }, + { + "epoch": 0.923031650536228, + "grad_norm": 3.113504052743817, + "learning_rate": 3.5876817328914795e-07, + "loss": 1.356, + "step": 28230 + }, + { + "epoch": 0.9231951347109599, + "grad_norm": 3.2879313438572195, + "learning_rate": 3.5725484367909124e-07, + "loss": 1.4095, + "step": 28235 + }, + { + "epoch": 0.9233586188856918, + "grad_norm": 3.2103732781724936, + "learning_rate": 3.557446544748755e-07, + "loss": 1.3288, + "step": 28240 + }, + { + "epoch": 0.9235221030604237, + "grad_norm": 3.1377458987338596, + "learning_rate": 3.5423760616832834e-07, + "loss": 1.4113, + "step": 28245 + }, + { + "epoch": 0.9236855872351556, + "grad_norm": 3.5570589018382415, + "learning_rate": 3.527336992502606e-07, + "loss": 1.424, + "step": 28250 + }, + { + "epoch": 0.9238490714098875, + "grad_norm": 3.358914964646138, + "learning_rate": 3.512329342104581e-07, + "loss": 1.3813, + "step": 28255 + }, + { + "epoch": 0.9240125555846194, + "grad_norm": 3.180875496085508, + "learning_rate": 3.4973531153768115e-07, + "loss": 1.4323, + "step": 28260 + }, + { + "epoch": 0.9241760397593513, + "grad_norm": 3.370758458123275, + "learning_rate": 3.482408317196717e-07, + "loss": 1.4184, + "step": 28265 + }, + { + "epoch": 0.9243395239340831, + "grad_norm": 3.2213095838700383, + "learning_rate": 3.4674949524314385e-07, + "loss": 1.4029, + "step": 28270 + }, + { + "epoch": 0.924503008108815, + "grad_norm": 3.1717481845814373, + "learning_rate": 3.4526130259378785e-07, + "loss": 1.4331, + "step": 28275 + }, + { + "epoch": 0.9246664922835469, + "grad_norm": 3.252029463665494, + "learning_rate": 3.4377625425627394e-07, + "loss": 1.41, + "step": 28280 + }, + { + "epoch": 0.9248299764582788, + "grad_norm": 3.419471224272623, + "learning_rate": 3.422943507142451e-07, + "loss": 1.3193, + "step": 28285 + }, + { + "epoch": 0.9249934606330107, + "grad_norm": 3.3594683765890703, + "learning_rate": 3.4081559245032094e-07, + "loss": 1.5167, + "step": 28290 + }, + { + "epoch": 0.9251569448077426, + "grad_norm": 3.552474453629219, + "learning_rate": 3.3933997994609615e-07, + "loss": 1.5054, + "step": 28295 + }, + { + "epoch": 0.9253204289824745, + "grad_norm": 3.2269860326459514, + "learning_rate": 3.3786751368214186e-07, + "loss": 1.354, + "step": 28300 + }, + { + "epoch": 0.9254839131572064, + "grad_norm": 3.4767897898498408, + "learning_rate": 3.363981941380057e-07, + "loss": 1.5089, + "step": 28305 + }, + { + "epoch": 0.9256473973319382, + "grad_norm": 3.456872338793987, + "learning_rate": 3.3493202179220696e-07, + "loss": 1.4327, + "step": 28310 + }, + { + "epoch": 0.9258108815066701, + "grad_norm": 3.157181258403921, + "learning_rate": 3.334689971222449e-07, + "loss": 1.361, + "step": 28315 + }, + { + "epoch": 0.925974365681402, + "grad_norm": 3.125092382891561, + "learning_rate": 3.3200912060458724e-07, + "loss": 1.5232, + "step": 28320 + }, + { + "epoch": 0.9261378498561339, + "grad_norm": 3.044099328002125, + "learning_rate": 3.305523927146814e-07, + "loss": 1.3785, + "step": 28325 + }, + { + "epoch": 0.9263013340308658, + "grad_norm": 2.92212809666568, + "learning_rate": 3.290988139269502e-07, + "loss": 1.4635, + "step": 28330 + }, + { + "epoch": 0.9264648182055977, + "grad_norm": 3.233764377391884, + "learning_rate": 3.2764838471478486e-07, + "loss": 1.5227, + "step": 28335 + }, + { + "epoch": 0.9266283023803296, + "grad_norm": 3.4306032537221727, + "learning_rate": 3.2620110555055763e-07, + "loss": 1.3048, + "step": 28340 + }, + { + "epoch": 0.9267917865550614, + "grad_norm": 3.473930264681285, + "learning_rate": 3.2475697690560913e-07, + "loss": 1.4605, + "step": 28345 + }, + { + "epoch": 0.9269552707297933, + "grad_norm": 3.498968997929204, + "learning_rate": 3.233159992502599e-07, + "loss": 1.4879, + "step": 28350 + }, + { + "epoch": 0.9271187549045252, + "grad_norm": 3.30852735717157, + "learning_rate": 3.218781730537979e-07, + "loss": 1.4413, + "step": 28355 + }, + { + "epoch": 0.9272822390792571, + "grad_norm": 3.2411954711764994, + "learning_rate": 3.2044349878449064e-07, + "loss": 1.357, + "step": 28360 + }, + { + "epoch": 0.927445723253989, + "grad_norm": 3.448986113667746, + "learning_rate": 3.1901197690957454e-07, + "loss": 1.3649, + "step": 28365 + }, + { + "epoch": 0.9276092074287209, + "grad_norm": 3.2621093165893957, + "learning_rate": 3.1758360789526213e-07, + "loss": 1.4031, + "step": 28370 + }, + { + "epoch": 0.9277726916034528, + "grad_norm": 3.442378656587889, + "learning_rate": 3.1615839220673796e-07, + "loss": 1.3475, + "step": 28375 + }, + { + "epoch": 0.9279361757781847, + "grad_norm": 3.1841432693487786, + "learning_rate": 3.1473633030815964e-07, + "loss": 1.3269, + "step": 28380 + }, + { + "epoch": 0.9280996599529165, + "grad_norm": 3.073424422294416, + "learning_rate": 3.133174226626579e-07, + "loss": 1.3737, + "step": 28385 + }, + { + "epoch": 0.9282631441276484, + "grad_norm": 3.3367790711315792, + "learning_rate": 3.1190166973233627e-07, + "loss": 1.5448, + "step": 28390 + }, + { + "epoch": 0.9284266283023803, + "grad_norm": 3.2294308403014758, + "learning_rate": 3.1048907197827047e-07, + "loss": 1.3199, + "step": 28395 + }, + { + "epoch": 0.9285901124771122, + "grad_norm": 3.1113303087307713, + "learning_rate": 3.0907962986051034e-07, + "loss": 1.3218, + "step": 28400 + }, + { + "epoch": 0.9287535966518441, + "grad_norm": 3.3408277046643193, + "learning_rate": 3.076733438380752e-07, + "loss": 1.5673, + "step": 28405 + }, + { + "epoch": 0.928917080826576, + "grad_norm": 3.1498401673664698, + "learning_rate": 3.0627021436895774e-07, + "loss": 1.3808, + "step": 28410 + }, + { + "epoch": 0.9290805650013079, + "grad_norm": 3.4081413109835275, + "learning_rate": 3.048702419101257e-07, + "loss": 1.5015, + "step": 28415 + }, + { + "epoch": 0.9292440491760398, + "grad_norm": 3.349212724952457, + "learning_rate": 3.034734269175121e-07, + "loss": 1.4431, + "step": 28420 + }, + { + "epoch": 0.9294075333507716, + "grad_norm": 3.4200388731401112, + "learning_rate": 3.020797698460265e-07, + "loss": 1.5058, + "step": 28425 + }, + { + "epoch": 0.9295710175255035, + "grad_norm": 3.4114952453434766, + "learning_rate": 3.0068927114955016e-07, + "loss": 1.4009, + "step": 28430 + }, + { + "epoch": 0.9297345017002354, + "grad_norm": 3.389532940232414, + "learning_rate": 2.99301931280932e-07, + "loss": 1.328, + "step": 28435 + }, + { + "epoch": 0.9298979858749673, + "grad_norm": 3.237896527811042, + "learning_rate": 2.9791775069199834e-07, + "loss": 1.4637, + "step": 28440 + }, + { + "epoch": 0.9300614700496992, + "grad_norm": 3.511373242826744, + "learning_rate": 2.9653672983353863e-07, + "loss": 1.4571, + "step": 28445 + }, + { + "epoch": 0.9302249542244311, + "grad_norm": 3.4010468727411736, + "learning_rate": 2.951588691553198e-07, + "loss": 1.3872, + "step": 28450 + }, + { + "epoch": 0.930388438399163, + "grad_norm": 3.1145209640999876, + "learning_rate": 2.9378416910607834e-07, + "loss": 1.2802, + "step": 28455 + }, + { + "epoch": 0.9305519225738949, + "grad_norm": 3.1963297485263094, + "learning_rate": 2.924126301335184e-07, + "loss": 1.4574, + "step": 28460 + }, + { + "epoch": 0.9307154067486267, + "grad_norm": 3.3691065934842648, + "learning_rate": 2.9104425268431825e-07, + "loss": 1.4578, + "step": 28465 + }, + { + "epoch": 0.9308788909233586, + "grad_norm": 3.139885533825459, + "learning_rate": 2.896790372041225e-07, + "loss": 1.4881, + "step": 28470 + }, + { + "epoch": 0.9310423750980905, + "grad_norm": 3.3910080364958426, + "learning_rate": 2.8831698413754994e-07, + "loss": 1.3585, + "step": 28475 + }, + { + "epoch": 0.9312058592728224, + "grad_norm": 3.3054224414807725, + "learning_rate": 2.8695809392818907e-07, + "loss": 1.4471, + "step": 28480 + }, + { + "epoch": 0.9313693434475543, + "grad_norm": 3.2393209156725487, + "learning_rate": 2.856023670185948e-07, + "loss": 1.4399, + "step": 28485 + }, + { + "epoch": 0.9315328276222862, + "grad_norm": 3.164801764174204, + "learning_rate": 2.842498038502961e-07, + "loss": 1.2907, + "step": 28490 + }, + { + "epoch": 0.9316963117970181, + "grad_norm": 3.2580733721891315, + "learning_rate": 2.8290040486378843e-07, + "loss": 1.4906, + "step": 28495 + }, + { + "epoch": 0.93185979597175, + "grad_norm": 3.211153069492485, + "learning_rate": 2.8155417049853915e-07, + "loss": 1.3497, + "step": 28500 + }, + { + "epoch": 0.9320232801464818, + "grad_norm": 3.1774813420762245, + "learning_rate": 2.80211101192982e-07, + "loss": 1.3961, + "step": 28505 + }, + { + "epoch": 0.9321867643212137, + "grad_norm": 3.231060573950707, + "learning_rate": 2.7887119738452263e-07, + "loss": 1.3344, + "step": 28510 + }, + { + "epoch": 0.9323502484959456, + "grad_norm": 3.3632243098311108, + "learning_rate": 2.7753445950953526e-07, + "loss": 1.4356, + "step": 28515 + }, + { + "epoch": 0.9325137326706775, + "grad_norm": 3.202088611092328, + "learning_rate": 2.7620088800336287e-07, + "loss": 1.3267, + "step": 28520 + }, + { + "epoch": 0.9326772168454094, + "grad_norm": 3.2448453865358906, + "learning_rate": 2.7487048330031683e-07, + "loss": 1.433, + "step": 28525 + }, + { + "epoch": 0.9328407010201413, + "grad_norm": 3.5237424157337194, + "learning_rate": 2.735432458336762e-07, + "loss": 1.4138, + "step": 28530 + }, + { + "epoch": 0.9330041851948732, + "grad_norm": 3.3073287874137565, + "learning_rate": 2.722191760356896e-07, + "loss": 1.4043, + "step": 28535 + }, + { + "epoch": 0.933167669369605, + "grad_norm": 3.0604028959795233, + "learning_rate": 2.7089827433757763e-07, + "loss": 1.3516, + "step": 28540 + }, + { + "epoch": 0.9333311535443369, + "grad_norm": 3.2880921159775918, + "learning_rate": 2.695805411695218e-07, + "loss": 1.2998, + "step": 28545 + }, + { + "epoch": 0.9334946377190688, + "grad_norm": 3.29405354669556, + "learning_rate": 2.682659769606777e-07, + "loss": 1.3589, + "step": 28550 + }, + { + "epoch": 0.9336581218938007, + "grad_norm": 2.9325287943859224, + "learning_rate": 2.669545821391639e-07, + "loss": 1.4146, + "step": 28555 + }, + { + "epoch": 0.9338216060685326, + "grad_norm": 3.377816067733593, + "learning_rate": 2.656463571320722e-07, + "loss": 1.4103, + "step": 28560 + }, + { + "epoch": 0.9339850902432645, + "grad_norm": 3.210452855331698, + "learning_rate": 2.6434130236546063e-07, + "loss": 1.4639, + "step": 28565 + }, + { + "epoch": 0.9341485744179964, + "grad_norm": 3.297717519352969, + "learning_rate": 2.630394182643492e-07, + "loss": 1.4714, + "step": 28570 + }, + { + "epoch": 0.9343120585927283, + "grad_norm": 3.111544689423473, + "learning_rate": 2.6174070525273433e-07, + "loss": 1.4011, + "step": 28575 + }, + { + "epoch": 0.9344755427674601, + "grad_norm": 3.1796151443118066, + "learning_rate": 2.6044516375357097e-07, + "loss": 1.338, + "step": 28580 + }, + { + "epoch": 0.934639026942192, + "grad_norm": 3.345085685154409, + "learning_rate": 2.5915279418878724e-07, + "loss": 1.4635, + "step": 28585 + }, + { + "epoch": 0.9348025111169239, + "grad_norm": 3.070608212390199, + "learning_rate": 2.578635969792764e-07, + "loss": 1.3466, + "step": 28590 + }, + { + "epoch": 0.9349659952916558, + "grad_norm": 3.2818360455121147, + "learning_rate": 2.565775725448982e-07, + "loss": 1.2434, + "step": 28595 + }, + { + "epoch": 0.9351294794663877, + "grad_norm": 3.180803341290728, + "learning_rate": 2.5529472130447984e-07, + "loss": 1.3577, + "step": 28600 + }, + { + "epoch": 0.9352929636411196, + "grad_norm": 3.201310484802818, + "learning_rate": 2.5401504367581266e-07, + "loss": 1.3223, + "step": 28605 + }, + { + "epoch": 0.9354564478158515, + "grad_norm": 3.4295202661892343, + "learning_rate": 2.527385400756577e-07, + "loss": 1.3832, + "step": 28610 + }, + { + "epoch": 0.9356199319905834, + "grad_norm": 3.1813970831403373, + "learning_rate": 2.514652109197413e-07, + "loss": 1.3759, + "step": 28615 + }, + { + "epoch": 0.9357834161653152, + "grad_norm": 3.3442187457497212, + "learning_rate": 2.501950566227551e-07, + "loss": 1.4389, + "step": 28620 + }, + { + "epoch": 0.9359469003400471, + "grad_norm": 3.21331215387764, + "learning_rate": 2.4892807759835716e-07, + "loss": 1.3792, + "step": 28625 + }, + { + "epoch": 0.936110384514779, + "grad_norm": 3.1159484759227998, + "learning_rate": 2.476642742591695e-07, + "loss": 1.3912, + "step": 28630 + }, + { + "epoch": 0.9362738686895109, + "grad_norm": 3.0782692348720104, + "learning_rate": 2.464036470167852e-07, + "loss": 1.2433, + "step": 28635 + }, + { + "epoch": 0.9364373528642428, + "grad_norm": 3.6221614070845596, + "learning_rate": 2.4514619628175917e-07, + "loss": 1.4724, + "step": 28640 + }, + { + "epoch": 0.9366008370389747, + "grad_norm": 3.183076983969111, + "learning_rate": 2.438919224636105e-07, + "loss": 1.3255, + "step": 28645 + }, + { + "epoch": 0.9367643212137066, + "grad_norm": 3.2859639816666926, + "learning_rate": 2.4264082597082685e-07, + "loss": 1.4845, + "step": 28650 + }, + { + "epoch": 0.9369278053884385, + "grad_norm": 3.199473335579126, + "learning_rate": 2.413929072108578e-07, + "loss": 1.4684, + "step": 28655 + }, + { + "epoch": 0.9370912895631703, + "grad_norm": 3.1733565300251367, + "learning_rate": 2.4014816659012283e-07, + "loss": 1.3728, + "step": 28660 + }, + { + "epoch": 0.9372547737379022, + "grad_norm": 3.4555150231969187, + "learning_rate": 2.3890660451400207e-07, + "loss": 1.4998, + "step": 28665 + }, + { + "epoch": 0.9374182579126341, + "grad_norm": 3.1468131153355867, + "learning_rate": 2.3766822138684086e-07, + "loss": 1.3317, + "step": 28670 + }, + { + "epoch": 0.9375817420873659, + "grad_norm": 3.294580477965974, + "learning_rate": 2.3643301761195226e-07, + "loss": 1.3113, + "step": 28675 + }, + { + "epoch": 0.9377452262620978, + "grad_norm": 3.387597569294976, + "learning_rate": 2.3520099359160997e-07, + "loss": 1.4258, + "step": 28680 + }, + { + "epoch": 0.9379087104368297, + "grad_norm": 3.2348887280881606, + "learning_rate": 2.3397214972705418e-07, + "loss": 1.3654, + "step": 28685 + }, + { + "epoch": 0.9380721946115615, + "grad_norm": 3.189273924638315, + "learning_rate": 2.327464864184914e-07, + "loss": 1.3551, + "step": 28690 + }, + { + "epoch": 0.9382356787862934, + "grad_norm": 3.2072910757828565, + "learning_rate": 2.315240040650879e-07, + "loss": 1.381, + "step": 28695 + }, + { + "epoch": 0.9383991629610253, + "grad_norm": 3.354827352676209, + "learning_rate": 2.3030470306497744e-07, + "loss": 1.4184, + "step": 28700 + }, + { + "epoch": 0.9385626471357572, + "grad_norm": 3.2261171852893216, + "learning_rate": 2.2908858381525568e-07, + "loss": 1.4938, + "step": 28705 + }, + { + "epoch": 0.9387261313104891, + "grad_norm": 3.381163549193489, + "learning_rate": 2.2787564671198247e-07, + "loss": 1.5879, + "step": 28710 + }, + { + "epoch": 0.938889615485221, + "grad_norm": 3.063863337236595, + "learning_rate": 2.2666589215018297e-07, + "loss": 1.3914, + "step": 28715 + }, + { + "epoch": 0.9390530996599529, + "grad_norm": 3.144094169457995, + "learning_rate": 2.2545932052384422e-07, + "loss": 1.3844, + "step": 28720 + }, + { + "epoch": 0.9392165838346848, + "grad_norm": 3.367570048228001, + "learning_rate": 2.2425593222591746e-07, + "loss": 1.5422, + "step": 28725 + }, + { + "epoch": 0.9393800680094166, + "grad_norm": 3.4289471074335305, + "learning_rate": 2.2305572764831473e-07, + "loss": 1.3451, + "step": 28730 + }, + { + "epoch": 0.9395435521841485, + "grad_norm": 3.54502294341222, + "learning_rate": 2.218587071819156e-07, + "loss": 1.5305, + "step": 28735 + }, + { + "epoch": 0.9397070363588804, + "grad_norm": 3.211822353600199, + "learning_rate": 2.2066487121655933e-07, + "loss": 1.3754, + "step": 28740 + }, + { + "epoch": 0.9398705205336123, + "grad_norm": 3.0835515519316936, + "learning_rate": 2.1947422014104936e-07, + "loss": 1.2457, + "step": 28745 + }, + { + "epoch": 0.9400340047083442, + "grad_norm": 3.0644512259727668, + "learning_rate": 2.1828675434315106e-07, + "loss": 1.328, + "step": 28750 + }, + { + "epoch": 0.9401974888830761, + "grad_norm": 3.1697912424178867, + "learning_rate": 2.1710247420959286e-07, + "loss": 1.423, + "step": 28755 + }, + { + "epoch": 0.940360973057808, + "grad_norm": 3.2001651829120306, + "learning_rate": 2.1592138012606735e-07, + "loss": 1.2686, + "step": 28760 + }, + { + "epoch": 0.9405244572325399, + "grad_norm": 3.1011423719266697, + "learning_rate": 2.1474347247722572e-07, + "loss": 1.4035, + "step": 28765 + }, + { + "epoch": 0.9406879414072717, + "grad_norm": 3.1284421254081725, + "learning_rate": 2.1356875164668445e-07, + "loss": 1.3455, + "step": 28770 + }, + { + "epoch": 0.9408514255820036, + "grad_norm": 3.2229673857843926, + "learning_rate": 2.1239721801702196e-07, + "loss": 1.5022, + "step": 28775 + }, + { + "epoch": 0.9410149097567355, + "grad_norm": 3.364934500383338, + "learning_rate": 2.1122887196977747e-07, + "loss": 1.3534, + "step": 28780 + }, + { + "epoch": 0.9411783939314674, + "grad_norm": 3.2092799317750833, + "learning_rate": 2.1006371388545331e-07, + "loss": 1.3353, + "step": 28785 + }, + { + "epoch": 0.9413418781061993, + "grad_norm": 3.2832244093743315, + "learning_rate": 2.089017441435115e-07, + "loss": 1.4522, + "step": 28790 + }, + { + "epoch": 0.9415053622809312, + "grad_norm": 3.206486906595861, + "learning_rate": 2.077429631223782e-07, + "loss": 1.2823, + "step": 28795 + }, + { + "epoch": 0.9416688464556631, + "grad_norm": 3.1257943787159532, + "learning_rate": 2.065873711994415e-07, + "loss": 1.3451, + "step": 28800 + }, + { + "epoch": 0.941832330630395, + "grad_norm": 3.2820975238159775, + "learning_rate": 2.0543496875104596e-07, + "loss": 1.2891, + "step": 28805 + }, + { + "epoch": 0.9419958148051268, + "grad_norm": 3.0286008060525207, + "learning_rate": 2.0428575615250357e-07, + "loss": 1.2877, + "step": 28810 + }, + { + "epoch": 0.9421592989798587, + "grad_norm": 3.360742314374481, + "learning_rate": 2.031397337780827e-07, + "loss": 1.3134, + "step": 28815 + }, + { + "epoch": 0.9423227831545906, + "grad_norm": 3.018271179135086, + "learning_rate": 2.0199690200101596e-07, + "loss": 1.3009, + "step": 28820 + }, + { + "epoch": 0.9424862673293225, + "grad_norm": 3.0869463645850663, + "learning_rate": 2.0085726119349669e-07, + "loss": 1.3772, + "step": 28825 + }, + { + "epoch": 0.9426497515040544, + "grad_norm": 2.9694276526091743, + "learning_rate": 1.9972081172667578e-07, + "loss": 1.4872, + "step": 28830 + }, + { + "epoch": 0.9428132356787863, + "grad_norm": 3.31566948696778, + "learning_rate": 1.985875539706672e-07, + "loss": 1.3513, + "step": 28835 + }, + { + "epoch": 0.9429767198535182, + "grad_norm": 3.279806721217125, + "learning_rate": 1.974574882945468e-07, + "loss": 1.3444, + "step": 28840 + }, + { + "epoch": 0.94314020402825, + "grad_norm": 3.277309404464013, + "learning_rate": 1.963306150663491e-07, + "loss": 1.5155, + "step": 28845 + }, + { + "epoch": 0.9433036882029819, + "grad_norm": 3.36234580773404, + "learning_rate": 1.9520693465306829e-07, + "loss": 1.3407, + "step": 28850 + }, + { + "epoch": 0.9434671723777138, + "grad_norm": 3.1913358965808287, + "learning_rate": 1.940864474206583e-07, + "loss": 1.1997, + "step": 28855 + }, + { + "epoch": 0.9436306565524457, + "grad_norm": 3.35992025251665, + "learning_rate": 1.9296915373403614e-07, + "loss": 1.404, + "step": 28860 + }, + { + "epoch": 0.9437941407271776, + "grad_norm": 3.1374862530943766, + "learning_rate": 1.918550539570785e-07, + "loss": 1.4031, + "step": 28865 + }, + { + "epoch": 0.9439576249019095, + "grad_norm": 3.0815480216630124, + "learning_rate": 1.9074414845261625e-07, + "loss": 1.4663, + "step": 28870 + }, + { + "epoch": 0.9441211090766414, + "grad_norm": 3.3061601220394046, + "learning_rate": 1.8963643758244888e-07, + "loss": 1.4844, + "step": 28875 + }, + { + "epoch": 0.9442845932513733, + "grad_norm": 3.108017877218345, + "learning_rate": 1.885319217073256e-07, + "loss": 1.362, + "step": 28880 + }, + { + "epoch": 0.9444480774261051, + "grad_norm": 3.170104747656387, + "learning_rate": 1.8743060118696422e-07, + "loss": 1.3348, + "step": 28885 + }, + { + "epoch": 0.944611561600837, + "grad_norm": 3.0904933215265076, + "learning_rate": 1.8633247638003672e-07, + "loss": 1.3365, + "step": 28890 + }, + { + "epoch": 0.9447750457755689, + "grad_norm": 3.2546492431800296, + "learning_rate": 1.8523754764417367e-07, + "loss": 1.2968, + "step": 28895 + }, + { + "epoch": 0.9449385299503008, + "grad_norm": 3.082032518730421, + "learning_rate": 1.8414581533596877e-07, + "loss": 1.2903, + "step": 28900 + }, + { + "epoch": 0.9451020141250327, + "grad_norm": 3.3124562986187467, + "learning_rate": 1.8305727981097198e-07, + "loss": 1.3732, + "step": 28905 + }, + { + "epoch": 0.9452654982997646, + "grad_norm": 2.981564982415001, + "learning_rate": 1.81971941423692e-07, + "loss": 1.3226, + "step": 28910 + }, + { + "epoch": 0.9454289824744965, + "grad_norm": 3.1885415974752647, + "learning_rate": 1.808898005275972e-07, + "loss": 1.4435, + "step": 28915 + }, + { + "epoch": 0.9455924666492284, + "grad_norm": 3.3326011171608223, + "learning_rate": 1.798108574751145e-07, + "loss": 1.2858, + "step": 28920 + }, + { + "epoch": 0.9457559508239602, + "grad_norm": 3.200415647716205, + "learning_rate": 1.7873511261762956e-07, + "loss": 1.328, + "step": 28925 + }, + { + "epoch": 0.9459194349986921, + "grad_norm": 3.2178346111494744, + "learning_rate": 1.776625663054843e-07, + "loss": 1.3497, + "step": 28930 + }, + { + "epoch": 0.946082919173424, + "grad_norm": 3.610236985016628, + "learning_rate": 1.7659321888798487e-07, + "loss": 1.5313, + "step": 28935 + }, + { + "epoch": 0.9462464033481559, + "grad_norm": 3.0504326063228207, + "learning_rate": 1.7552707071338605e-07, + "loss": 1.4237, + "step": 28940 + }, + { + "epoch": 0.9464098875228878, + "grad_norm": 3.1409358110831382, + "learning_rate": 1.7446412212891006e-07, + "loss": 1.3267, + "step": 28945 + }, + { + "epoch": 0.9465733716976197, + "grad_norm": 3.323152122854552, + "learning_rate": 1.7340437348073335e-07, + "loss": 1.3756, + "step": 28950 + }, + { + "epoch": 0.9467368558723516, + "grad_norm": 3.3300088696492236, + "learning_rate": 1.723478251139876e-07, + "loss": 1.4772, + "step": 28955 + }, + { + "epoch": 0.9469003400470835, + "grad_norm": 3.1450673776789873, + "learning_rate": 1.712944773727665e-07, + "loss": 1.3345, + "step": 28960 + }, + { + "epoch": 0.9470638242218153, + "grad_norm": 3.3801491445314804, + "learning_rate": 1.702443306001178e-07, + "loss": 1.4398, + "step": 28965 + }, + { + "epoch": 0.9472273083965472, + "grad_norm": 3.2401011300245197, + "learning_rate": 1.6919738513805128e-07, + "loss": 1.285, + "step": 28970 + }, + { + "epoch": 0.9473907925712791, + "grad_norm": 2.8738594551537973, + "learning_rate": 1.6815364132752975e-07, + "loss": 1.4179, + "step": 28975 + }, + { + "epoch": 0.947554276746011, + "grad_norm": 3.2544449033296767, + "learning_rate": 1.6711309950847466e-07, + "loss": 1.4941, + "step": 28980 + }, + { + "epoch": 0.9477177609207429, + "grad_norm": 3.2237136753880327, + "learning_rate": 1.6607576001976712e-07, + "loss": 1.4147, + "step": 28985 + }, + { + "epoch": 0.9478812450954748, + "grad_norm": 2.952848654344409, + "learning_rate": 1.6504162319924021e-07, + "loss": 1.2553, + "step": 28990 + }, + { + "epoch": 0.9480447292702067, + "grad_norm": 2.6690424493230593, + "learning_rate": 1.6401068938368902e-07, + "loss": 1.2393, + "step": 28995 + }, + { + "epoch": 0.9482082134449386, + "grad_norm": 3.2085783833622687, + "learning_rate": 1.6298295890886273e-07, + "loss": 1.3457, + "step": 29000 + }, + { + "epoch": 0.9483716976196704, + "grad_norm": 3.185927607360007, + "learning_rate": 1.6195843210946806e-07, + "loss": 1.3218, + "step": 29005 + }, + { + "epoch": 0.9485351817944023, + "grad_norm": 3.207363765067752, + "learning_rate": 1.6093710931916917e-07, + "loss": 1.4696, + "step": 29010 + }, + { + "epoch": 0.9486986659691342, + "grad_norm": 3.2485507618655514, + "learning_rate": 1.5991899087058338e-07, + "loss": 1.3379, + "step": 29015 + }, + { + "epoch": 0.9488621501438661, + "grad_norm": 2.9638737580063435, + "learning_rate": 1.5890407709528988e-07, + "loss": 1.35, + "step": 29020 + }, + { + "epoch": 0.949025634318598, + "grad_norm": 3.0617121222078847, + "learning_rate": 1.5789236832381872e-07, + "loss": 1.3851, + "step": 29025 + }, + { + "epoch": 0.9491891184933299, + "grad_norm": 3.073924081925635, + "learning_rate": 1.5688386488565966e-07, + "loss": 1.3126, + "step": 29030 + }, + { + "epoch": 0.9493526026680618, + "grad_norm": 3.1442813705630335, + "learning_rate": 1.5587856710925998e-07, + "loss": 1.4241, + "step": 29035 + }, + { + "epoch": 0.9495160868427936, + "grad_norm": 3.0811188454869267, + "learning_rate": 1.5487647532201667e-07, + "loss": 1.2851, + "step": 29040 + }, + { + "epoch": 0.9496795710175255, + "grad_norm": 3.236124587046036, + "learning_rate": 1.5387758985028755e-07, + "loss": 1.3264, + "step": 29045 + }, + { + "epoch": 0.9498430551922574, + "grad_norm": 3.3510773664841085, + "learning_rate": 1.5288191101938686e-07, + "loss": 1.4109, + "step": 29050 + }, + { + "epoch": 0.9500065393669893, + "grad_norm": 3.4452252459483352, + "learning_rate": 1.5188943915358078e-07, + "loss": 1.3867, + "step": 29055 + }, + { + "epoch": 0.9501700235417212, + "grad_norm": 3.0070892049012534, + "learning_rate": 1.5090017457609408e-07, + "loss": 1.307, + "step": 29060 + }, + { + "epoch": 0.9503335077164531, + "grad_norm": 2.857980780348485, + "learning_rate": 1.4991411760910568e-07, + "loss": 1.2909, + "step": 29065 + }, + { + "epoch": 0.950496991891185, + "grad_norm": 3.051179223116454, + "learning_rate": 1.4893126857374985e-07, + "loss": 1.3715, + "step": 29070 + }, + { + "epoch": 0.9506604760659169, + "grad_norm": 3.3038458428997033, + "learning_rate": 1.4795162779011829e-07, + "loss": 1.3441, + "step": 29075 + }, + { + "epoch": 0.9508239602406487, + "grad_norm": 3.213704104475932, + "learning_rate": 1.4697519557725359e-07, + "loss": 1.5258, + "step": 29080 + }, + { + "epoch": 0.9509874444153806, + "grad_norm": 3.355329759541301, + "learning_rate": 1.4600197225315805e-07, + "loss": 1.4037, + "step": 29085 + }, + { + "epoch": 0.9511509285901125, + "grad_norm": 3.1771654668172884, + "learning_rate": 1.4503195813478365e-07, + "loss": 1.3359, + "step": 29090 + }, + { + "epoch": 0.9513144127648444, + "grad_norm": 3.044291200435128, + "learning_rate": 1.4406515353804328e-07, + "loss": 1.2356, + "step": 29095 + }, + { + "epoch": 0.9514778969395763, + "grad_norm": 3.004447145666331, + "learning_rate": 1.431015587777995e-07, + "loss": 1.3948, + "step": 29100 + }, + { + "epoch": 0.9516413811143082, + "grad_norm": 3.1140309668857866, + "learning_rate": 1.4214117416787243e-07, + "loss": 1.4456, + "step": 29105 + }, + { + "epoch": 0.9518048652890401, + "grad_norm": 3.1948374667382, + "learning_rate": 1.4118400002103628e-07, + "loss": 1.3411, + "step": 29110 + }, + { + "epoch": 0.951968349463772, + "grad_norm": 3.307705043199918, + "learning_rate": 1.4023003664901835e-07, + "loss": 1.3577, + "step": 29115 + }, + { + "epoch": 0.9521318336385038, + "grad_norm": 3.4857885103816693, + "learning_rate": 1.3927928436250015e-07, + "loss": 1.3485, + "step": 29120 + }, + { + "epoch": 0.9522953178132357, + "grad_norm": 3.2388125126153313, + "learning_rate": 1.3833174347112067e-07, + "loss": 1.4548, + "step": 29125 + }, + { + "epoch": 0.9524588019879676, + "grad_norm": 3.1619696438240683, + "learning_rate": 1.373874142834697e-07, + "loss": 1.3133, + "step": 29130 + }, + { + "epoch": 0.9526222861626995, + "grad_norm": 3.1593766531479734, + "learning_rate": 1.3644629710709233e-07, + "loss": 1.3846, + "step": 29135 + }, + { + "epoch": 0.9527857703374313, + "grad_norm": 3.148228205750718, + "learning_rate": 1.3550839224848677e-07, + "loss": 1.4251, + "step": 29140 + }, + { + "epoch": 0.9529492545121632, + "grad_norm": 3.2616479176754347, + "learning_rate": 1.3457370001310643e-07, + "loss": 1.2713, + "step": 29145 + }, + { + "epoch": 0.9531127386868951, + "grad_norm": 3.244186268078288, + "learning_rate": 1.336422207053567e-07, + "loss": 1.3449, + "step": 29150 + }, + { + "epoch": 0.9532762228616269, + "grad_norm": 3.1488673538592735, + "learning_rate": 1.3271395462859826e-07, + "loss": 1.3774, + "step": 29155 + }, + { + "epoch": 0.9534397070363588, + "grad_norm": 3.3864485679000267, + "learning_rate": 1.3178890208514484e-07, + "loss": 1.3831, + "step": 29160 + }, + { + "epoch": 0.9536031912110907, + "grad_norm": 3.2952314439993793, + "learning_rate": 1.308670633762632e-07, + "loss": 1.3716, + "step": 29165 + }, + { + "epoch": 0.9537666753858226, + "grad_norm": 3.3466429165619793, + "learning_rate": 1.2994843880217323e-07, + "loss": 1.3452, + "step": 29170 + }, + { + "epoch": 0.9539301595605545, + "grad_norm": 3.3476203715785813, + "learning_rate": 1.2903302866204782e-07, + "loss": 1.3567, + "step": 29175 + }, + { + "epoch": 0.9540936437352864, + "grad_norm": 3.1652429799401465, + "learning_rate": 1.2812083325401514e-07, + "loss": 1.3206, + "step": 29180 + }, + { + "epoch": 0.9542571279100183, + "grad_norm": 3.7024050612622026, + "learning_rate": 1.2721185287515537e-07, + "loss": 1.4027, + "step": 29185 + }, + { + "epoch": 0.9544206120847502, + "grad_norm": 3.247360705738891, + "learning_rate": 1.2630608782149834e-07, + "loss": 1.4432, + "step": 29190 + }, + { + "epoch": 0.954584096259482, + "grad_norm": 3.224857538532301, + "learning_rate": 1.2540353838803254e-07, + "loss": 1.4658, + "step": 29195 + }, + { + "epoch": 0.9547475804342139, + "grad_norm": 3.045107718531212, + "learning_rate": 1.2450420486869398e-07, + "loss": 1.355, + "step": 29200 + }, + { + "epoch": 0.9549110646089458, + "grad_norm": 3.3800968276988472, + "learning_rate": 1.23608087556375e-07, + "loss": 1.4094, + "step": 29205 + }, + { + "epoch": 0.9550745487836777, + "grad_norm": 3.3837196331999784, + "learning_rate": 1.2271518674291882e-07, + "loss": 1.428, + "step": 29210 + }, + { + "epoch": 0.9552380329584096, + "grad_norm": 3.1916596551798655, + "learning_rate": 1.2182550271912064e-07, + "loss": 1.2578, + "step": 29215 + }, + { + "epoch": 0.9554015171331415, + "grad_norm": 3.4039894245655025, + "learning_rate": 1.2093903577472864e-07, + "loss": 1.3825, + "step": 29220 + }, + { + "epoch": 0.9555650013078734, + "grad_norm": 3.126038614089172, + "learning_rate": 1.2005578619844417e-07, + "loss": 1.2912, + "step": 29225 + }, + { + "epoch": 0.9557284854826053, + "grad_norm": 3.2418842693140166, + "learning_rate": 1.1917575427792038e-07, + "loss": 1.2308, + "step": 29230 + }, + { + "epoch": 0.9558919696573371, + "grad_norm": 3.2203775572370925, + "learning_rate": 1.1829894029976141e-07, + "loss": 1.312, + "step": 29235 + }, + { + "epoch": 0.956055453832069, + "grad_norm": 3.4021346515322057, + "learning_rate": 1.1742534454952326e-07, + "loss": 1.4954, + "step": 29240 + }, + { + "epoch": 0.9562189380068009, + "grad_norm": 3.149408685990815, + "learning_rate": 1.1655496731171612e-07, + "loss": 1.3514, + "step": 29245 + }, + { + "epoch": 0.9563824221815328, + "grad_norm": 3.168750603611439, + "learning_rate": 1.1568780886979991e-07, + "loss": 1.4433, + "step": 29250 + }, + { + "epoch": 0.9565459063562647, + "grad_norm": 3.2411589602516293, + "learning_rate": 1.1482386950618652e-07, + "loss": 1.2984, + "step": 29255 + }, + { + "epoch": 0.9567093905309966, + "grad_norm": 3.0096136360829715, + "learning_rate": 1.1396314950224085e-07, + "loss": 1.4349, + "step": 29260 + }, + { + "epoch": 0.9568728747057285, + "grad_norm": 3.3362091354186894, + "learning_rate": 1.1310564913827759e-07, + "loss": 1.3438, + "step": 29265 + }, + { + "epoch": 0.9570363588804603, + "grad_norm": 3.2719478323577422, + "learning_rate": 1.1225136869356335e-07, + "loss": 1.3565, + "step": 29270 + }, + { + "epoch": 0.9571998430551922, + "grad_norm": 3.1883734953725305, + "learning_rate": 1.1140030844631667e-07, + "loss": 1.3662, + "step": 29275 + }, + { + "epoch": 0.9573633272299241, + "grad_norm": 3.108284925541571, + "learning_rate": 1.1055246867370695e-07, + "loss": 1.3773, + "step": 29280 + }, + { + "epoch": 0.957526811404656, + "grad_norm": 3.243775060340855, + "learning_rate": 1.0970784965185555e-07, + "loss": 1.2941, + "step": 29285 + }, + { + "epoch": 0.9576902955793879, + "grad_norm": 3.2977351397877848, + "learning_rate": 1.0886645165583242e-07, + "loss": 1.4178, + "step": 29290 + }, + { + "epoch": 0.9578537797541198, + "grad_norm": 3.266573082845503, + "learning_rate": 1.0802827495966283e-07, + "loss": 1.4095, + "step": 29295 + }, + { + "epoch": 0.9580172639288517, + "grad_norm": 3.235328755783824, + "learning_rate": 1.0719331983631842e-07, + "loss": 1.3927, + "step": 29300 + }, + { + "epoch": 0.9581807481035836, + "grad_norm": 3.3657853251445866, + "learning_rate": 1.0636158655772277e-07, + "loss": 1.4843, + "step": 29305 + }, + { + "epoch": 0.9583442322783154, + "grad_norm": 3.3466930286666416, + "learning_rate": 1.0553307539475477e-07, + "loss": 1.4011, + "step": 29310 + }, + { + "epoch": 0.9585077164530473, + "grad_norm": 3.1806467031623904, + "learning_rate": 1.0470778661723635e-07, + "loss": 1.3441, + "step": 29315 + }, + { + "epoch": 0.9586712006277792, + "grad_norm": 3.2222507847358552, + "learning_rate": 1.0388572049394586e-07, + "loss": 1.5666, + "step": 29320 + }, + { + "epoch": 0.9588346848025111, + "grad_norm": 3.099549980338132, + "learning_rate": 1.0306687729260912e-07, + "loss": 1.2753, + "step": 29325 + }, + { + "epoch": 0.958998168977243, + "grad_norm": 3.040370528598551, + "learning_rate": 1.0225125727990393e-07, + "loss": 1.3128, + "step": 29330 + }, + { + "epoch": 0.9591616531519749, + "grad_norm": 3.3160711910242857, + "learning_rate": 1.0143886072145892e-07, + "loss": 1.3602, + "step": 29335 + }, + { + "epoch": 0.9593251373267068, + "grad_norm": 3.162885073111507, + "learning_rate": 1.0062968788184912e-07, + "loss": 1.4369, + "step": 29340 + }, + { + "epoch": 0.9594886215014387, + "grad_norm": 3.4299840210215424, + "learning_rate": 9.982373902460374e-08, + "loss": 1.4219, + "step": 29345 + }, + { + "epoch": 0.9596521056761705, + "grad_norm": 3.1572279777158183, + "learning_rate": 9.902101441220057e-08, + "loss": 1.4605, + "step": 29350 + }, + { + "epoch": 0.9598155898509024, + "grad_norm": 3.3050865601609214, + "learning_rate": 9.822151430606608e-08, + "loss": 1.3279, + "step": 29355 + }, + { + "epoch": 0.9599790740256343, + "grad_norm": 3.436710587256859, + "learning_rate": 9.742523896658085e-08, + "loss": 1.3827, + "step": 29360 + }, + { + "epoch": 0.9601425582003662, + "grad_norm": 3.0218323803089944, + "learning_rate": 9.663218865306966e-08, + "loss": 1.2417, + "step": 29365 + }, + { + "epoch": 0.9603060423750981, + "grad_norm": 3.1626194480707928, + "learning_rate": 9.584236362381038e-08, + "loss": 1.362, + "step": 29370 + }, + { + "epoch": 0.96046952654983, + "grad_norm": 3.191624581221797, + "learning_rate": 9.505576413602946e-08, + "loss": 1.4106, + "step": 29375 + }, + { + "epoch": 0.9606330107245619, + "grad_norm": 3.141567578712392, + "learning_rate": 9.427239044590309e-08, + "loss": 1.3197, + "step": 29380 + }, + { + "epoch": 0.9607964948992938, + "grad_norm": 3.27912122662435, + "learning_rate": 9.349224280855718e-08, + "loss": 1.4614, + "step": 29385 + }, + { + "epoch": 0.9609599790740256, + "grad_norm": 3.175698732079475, + "learning_rate": 9.271532147806628e-08, + "loss": 1.3454, + "step": 29390 + }, + { + "epoch": 0.9611234632487575, + "grad_norm": 3.2858639916837715, + "learning_rate": 9.194162670745466e-08, + "loss": 1.3894, + "step": 29395 + }, + { + "epoch": 0.9612869474234894, + "grad_norm": 3.1881841478884456, + "learning_rate": 9.117115874869631e-08, + "loss": 1.4208, + "step": 29400 + }, + { + "epoch": 0.9614504315982213, + "grad_norm": 3.198701471934895, + "learning_rate": 9.040391785271385e-08, + "loss": 1.4091, + "step": 29405 + }, + { + "epoch": 0.9616139157729532, + "grad_norm": 3.0980536977998807, + "learning_rate": 8.963990426937852e-08, + "loss": 1.4265, + "step": 29410 + }, + { + "epoch": 0.9617773999476851, + "grad_norm": 3.3681423367947474, + "learning_rate": 8.887911824750905e-08, + "loss": 1.3564, + "step": 29415 + }, + { + "epoch": 0.961940884122417, + "grad_norm": 3.168725559835737, + "learning_rate": 8.812156003487837e-08, + "loss": 1.4016, + "step": 29420 + }, + { + "epoch": 0.9621043682971488, + "grad_norm": 3.1901862289264127, + "learning_rate": 8.736722987820245e-08, + "loss": 1.336, + "step": 29425 + }, + { + "epoch": 0.9622678524718807, + "grad_norm": 3.088268677458012, + "learning_rate": 8.661612802314811e-08, + "loss": 1.3754, + "step": 29430 + }, + { + "epoch": 0.9624313366466126, + "grad_norm": 2.9469220825972036, + "learning_rate": 8.58682547143308e-08, + "loss": 1.3275, + "step": 29435 + }, + { + "epoch": 0.9625948208213445, + "grad_norm": 3.382159282224757, + "learning_rate": 8.512361019531456e-08, + "loss": 1.4773, + "step": 29440 + }, + { + "epoch": 0.9627583049960764, + "grad_norm": 3.1292413960182532, + "learning_rate": 8.43821947086132e-08, + "loss": 1.2945, + "step": 29445 + }, + { + "epoch": 0.9629217891708083, + "grad_norm": 3.2092054322448473, + "learning_rate": 8.364400849568688e-08, + "loss": 1.3257, + "step": 29450 + }, + { + "epoch": 0.9630852733455402, + "grad_norm": 3.0237661405356056, + "learning_rate": 8.29090517969433e-08, + "loss": 1.2605, + "step": 29455 + }, + { + "epoch": 0.9632487575202721, + "grad_norm": 3.3133562844688766, + "learning_rate": 8.217732485174101e-08, + "loss": 1.3654, + "step": 29460 + }, + { + "epoch": 0.963412241695004, + "grad_norm": 2.885973539325255, + "learning_rate": 8.144882789838604e-08, + "loss": 1.4197, + "step": 29465 + }, + { + "epoch": 0.9635757258697358, + "grad_norm": 3.1960559998159814, + "learning_rate": 8.072356117413193e-08, + "loss": 1.46, + "step": 29470 + }, + { + "epoch": 0.9637392100444677, + "grad_norm": 3.3066235848385443, + "learning_rate": 8.000152491517865e-08, + "loss": 1.4098, + "step": 29475 + }, + { + "epoch": 0.9639026942191996, + "grad_norm": 3.12796595478336, + "learning_rate": 7.928271935667697e-08, + "loss": 1.4488, + "step": 29480 + }, + { + "epoch": 0.9640661783939315, + "grad_norm": 3.157230909053608, + "learning_rate": 7.85671447327252e-08, + "loss": 1.4171, + "step": 29485 + }, + { + "epoch": 0.9642296625686634, + "grad_norm": 3.3602065969217483, + "learning_rate": 7.78548012763669e-08, + "loss": 1.4121, + "step": 29490 + }, + { + "epoch": 0.9643931467433953, + "grad_norm": 3.0734705256275827, + "learning_rate": 7.714568921959543e-08, + "loss": 1.2451, + "step": 29495 + }, + { + "epoch": 0.9645566309181272, + "grad_norm": 3.093824375341345, + "learning_rate": 7.643980879335155e-08, + "loss": 1.4145, + "step": 29500 + }, + { + "epoch": 0.964720115092859, + "grad_norm": 3.1701404672206324, + "learning_rate": 7.573716022752254e-08, + "loss": 1.2274, + "step": 29505 + }, + { + "epoch": 0.9648835992675909, + "grad_norm": 3.866942177519845, + "learning_rate": 7.503774375094419e-08, + "loss": 1.3716, + "step": 29510 + }, + { + "epoch": 0.9650470834423228, + "grad_norm": 3.4365530274677503, + "learning_rate": 7.434155959139988e-08, + "loss": 1.3127, + "step": 29515 + }, + { + "epoch": 0.9652105676170547, + "grad_norm": 3.2711495124104197, + "learning_rate": 7.364860797561823e-08, + "loss": 1.4935, + "step": 29520 + }, + { + "epoch": 0.9653740517917866, + "grad_norm": 3.1175582820432037, + "learning_rate": 7.295888912927762e-08, + "loss": 1.4984, + "step": 29525 + }, + { + "epoch": 0.9655375359665185, + "grad_norm": 3.068663267990995, + "learning_rate": 7.22724032770028e-08, + "loss": 1.2667, + "step": 29530 + }, + { + "epoch": 0.9657010201412504, + "grad_norm": 3.421329279418242, + "learning_rate": 7.158915064236494e-08, + "loss": 1.5188, + "step": 29535 + }, + { + "epoch": 0.9658645043159823, + "grad_norm": 3.320386401589732, + "learning_rate": 7.09091314478827e-08, + "loss": 1.4449, + "step": 29540 + }, + { + "epoch": 0.9660279884907141, + "grad_norm": 3.4881872796135407, + "learning_rate": 7.023234591502225e-08, + "loss": 1.526, + "step": 29545 + }, + { + "epoch": 0.966191472665446, + "grad_norm": 3.3670889663048236, + "learning_rate": 6.95587942641951e-08, + "loss": 1.4302, + "step": 29550 + }, + { + "epoch": 0.9663549568401779, + "grad_norm": 3.4333235721539466, + "learning_rate": 6.888847671476128e-08, + "loss": 1.496, + "step": 29555 + }, + { + "epoch": 0.9665184410149098, + "grad_norm": 3.2035751924791716, + "learning_rate": 6.822139348502622e-08, + "loss": 1.4846, + "step": 29560 + }, + { + "epoch": 0.9666819251896417, + "grad_norm": 3.5971398450260708, + "learning_rate": 6.755754479224274e-08, + "loss": 1.4573, + "step": 29565 + }, + { + "epoch": 0.9668454093643736, + "grad_norm": 3.340260660351165, + "learning_rate": 6.689693085261129e-08, + "loss": 1.4265, + "step": 29570 + }, + { + "epoch": 0.9670088935391055, + "grad_norm": 3.224319159079206, + "learning_rate": 6.623955188127529e-08, + "loss": 1.3454, + "step": 29575 + }, + { + "epoch": 0.9671723777138374, + "grad_norm": 3.332547397343014, + "learning_rate": 6.558540809232905e-08, + "loss": 1.4469, + "step": 29580 + }, + { + "epoch": 0.9673358618885692, + "grad_norm": 3.3208844200921135, + "learning_rate": 6.493449969880994e-08, + "loss": 1.2998, + "step": 29585 + }, + { + "epoch": 0.9674993460633011, + "grad_norm": 3.262315011694942, + "learning_rate": 6.428682691270393e-08, + "loss": 1.3289, + "step": 29590 + }, + { + "epoch": 0.967662830238033, + "grad_norm": 3.1841203973493917, + "learning_rate": 6.364238994494121e-08, + "loss": 1.3896, + "step": 29595 + }, + { + "epoch": 0.9678263144127649, + "grad_norm": 3.1744821544137722, + "learning_rate": 6.300118900539942e-08, + "loss": 1.3243, + "step": 29600 + }, + { + "epoch": 0.9679897985874967, + "grad_norm": 3.1676061242386386, + "learning_rate": 6.236322430290154e-08, + "loss": 1.3739, + "step": 29605 + }, + { + "epoch": 0.9681532827622286, + "grad_norm": 3.0174069184404395, + "learning_rate": 6.172849604521803e-08, + "loss": 1.3446, + "step": 29610 + }, + { + "epoch": 0.9683167669369604, + "grad_norm": 3.313487060684974, + "learning_rate": 6.109700443906352e-08, + "loss": 1.2836, + "step": 29615 + }, + { + "epoch": 0.9684802511116923, + "grad_norm": 3.090030075694633, + "learning_rate": 6.046874969010019e-08, + "loss": 1.3806, + "step": 29620 + }, + { + "epoch": 0.9686437352864242, + "grad_norm": 3.115256201285166, + "learning_rate": 5.984373200293436e-08, + "loss": 1.4225, + "step": 29625 + }, + { + "epoch": 0.9688072194611561, + "grad_norm": 3.0982892435786025, + "learning_rate": 5.922195158111988e-08, + "loss": 1.284, + "step": 29630 + }, + { + "epoch": 0.968970703635888, + "grad_norm": 3.455595828667322, + "learning_rate": 5.8603408627154745e-08, + "loss": 1.3073, + "step": 29635 + }, + { + "epoch": 0.9691341878106199, + "grad_norm": 2.9918690317881573, + "learning_rate": 5.798810334248228e-08, + "loss": 1.3675, + "step": 29640 + }, + { + "epoch": 0.9692976719853518, + "grad_norm": 3.1354243083899695, + "learning_rate": 5.737603592749441e-08, + "loss": 1.316, + "step": 29645 + }, + { + "epoch": 0.9694611561600837, + "grad_norm": 3.236200856517364, + "learning_rate": 5.676720658152501e-08, + "loss": 1.3465, + "step": 29650 + }, + { + "epoch": 0.9696246403348155, + "grad_norm": 3.167943539872876, + "learning_rate": 5.616161550285659e-08, + "loss": 1.3297, + "step": 29655 + }, + { + "epoch": 0.9697881245095474, + "grad_norm": 3.3102631823293285, + "learning_rate": 5.5559262888713604e-08, + "loss": 1.4168, + "step": 29660 + }, + { + "epoch": 0.9699516086842793, + "grad_norm": 4.38895762388277, + "learning_rate": 5.496014893526691e-08, + "loss": 1.4225, + "step": 29665 + }, + { + "epoch": 0.9701150928590112, + "grad_norm": 3.146810197846011, + "learning_rate": 5.4364273837635984e-08, + "loss": 1.3183, + "step": 29670 + }, + { + "epoch": 0.9702785770337431, + "grad_norm": 3.3967374272420243, + "learning_rate": 5.377163778987893e-08, + "loss": 1.5251, + "step": 29675 + }, + { + "epoch": 0.970442061208475, + "grad_norm": 3.490810292206503, + "learning_rate": 5.318224098500691e-08, + "loss": 1.5005, + "step": 29680 + }, + { + "epoch": 0.9706055453832069, + "grad_norm": 3.359152725749823, + "learning_rate": 5.259608361496971e-08, + "loss": 1.4132, + "step": 29685 + }, + { + "epoch": 0.9707690295579388, + "grad_norm": 3.2604272169917556, + "learning_rate": 5.201316587066352e-08, + "loss": 1.5443, + "step": 29690 + }, + { + "epoch": 0.9709325137326706, + "grad_norm": 3.303184175552404, + "learning_rate": 5.143348794193315e-08, + "loss": 1.5304, + "step": 29695 + }, + { + "epoch": 0.9710959979074025, + "grad_norm": 3.225196872986748, + "learning_rate": 5.0857050017563135e-08, + "loss": 1.2542, + "step": 29700 + }, + { + "epoch": 0.9712594820821344, + "grad_norm": 3.4716775859416846, + "learning_rate": 5.028385228528665e-08, + "loss": 1.4542, + "step": 29705 + }, + { + "epoch": 0.9714229662568663, + "grad_norm": 3.2430922597303846, + "learning_rate": 4.9713894931779914e-08, + "loss": 1.4973, + "step": 29710 + }, + { + "epoch": 0.9715864504315982, + "grad_norm": 3.2577525581881353, + "learning_rate": 4.914717814266223e-08, + "loss": 1.4819, + "step": 29715 + }, + { + "epoch": 0.9717499346063301, + "grad_norm": 3.2600736599096805, + "learning_rate": 4.858370210250263e-08, + "loss": 1.5058, + "step": 29720 + }, + { + "epoch": 0.971913418781062, + "grad_norm": 3.27517395215344, + "learning_rate": 4.802346699480875e-08, + "loss": 1.3918, + "step": 29725 + }, + { + "epoch": 0.9720769029557939, + "grad_norm": 3.062209076935492, + "learning_rate": 4.746647300203688e-08, + "loss": 1.4352, + "step": 29730 + }, + { + "epoch": 0.9722403871305257, + "grad_norm": 3.377451136057462, + "learning_rate": 4.691272030558525e-08, + "loss": 1.422, + "step": 29735 + }, + { + "epoch": 0.9724038713052576, + "grad_norm": 3.3610176576176904, + "learning_rate": 4.636220908579736e-08, + "loss": 1.3925, + "step": 29740 + }, + { + "epoch": 0.9725673554799895, + "grad_norm": 3.0489048535076146, + "learning_rate": 4.5814939521963143e-08, + "loss": 1.354, + "step": 29745 + }, + { + "epoch": 0.9727308396547214, + "grad_norm": 3.1711776077227647, + "learning_rate": 4.527091179231335e-08, + "loss": 1.3527, + "step": 29750 + }, + { + "epoch": 0.9728943238294533, + "grad_norm": 3.181968237519425, + "learning_rate": 4.473012607402516e-08, + "loss": 1.4677, + "step": 29755 + }, + { + "epoch": 0.9730578080041852, + "grad_norm": 3.304534208827277, + "learning_rate": 4.419258254321879e-08, + "loss": 1.3762, + "step": 29760 + }, + { + "epoch": 0.9732212921789171, + "grad_norm": 3.1820456709588267, + "learning_rate": 4.365828137495864e-08, + "loss": 1.3325, + "step": 29765 + }, + { + "epoch": 0.973384776353649, + "grad_norm": 3.036784597468968, + "learning_rate": 4.312722274325443e-08, + "loss": 1.2945, + "step": 29770 + }, + { + "epoch": 0.9735482605283808, + "grad_norm": 3.3202458983033027, + "learning_rate": 4.25994068210589e-08, + "loss": 1.3525, + "step": 29775 + }, + { + "epoch": 0.9737117447031127, + "grad_norm": 3.030138508738172, + "learning_rate": 4.2074833780267886e-08, + "loss": 1.3405, + "step": 29780 + }, + { + "epoch": 0.9738752288778446, + "grad_norm": 3.4954766103291903, + "learning_rate": 4.1553503791722516e-08, + "loss": 1.3001, + "step": 29785 + }, + { + "epoch": 0.9740387130525765, + "grad_norm": 3.2420658111050353, + "learning_rate": 4.1035417025206966e-08, + "loss": 1.2952, + "step": 29790 + }, + { + "epoch": 0.9742021972273084, + "grad_norm": 3.2774816403830904, + "learning_rate": 4.052057364944961e-08, + "loss": 1.378, + "step": 29795 + }, + { + "epoch": 0.9743656814020403, + "grad_norm": 3.268504456205084, + "learning_rate": 4.000897383212188e-08, + "loss": 1.4292, + "step": 29800 + }, + { + "epoch": 0.9745291655767722, + "grad_norm": 3.043246678722577, + "learning_rate": 3.950061773984048e-08, + "loss": 1.2465, + "step": 29805 + }, + { + "epoch": 0.974692649751504, + "grad_norm": 3.096904603520359, + "learning_rate": 3.8995505538164115e-08, + "loss": 1.4408, + "step": 29810 + }, + { + "epoch": 0.9748561339262359, + "grad_norm": 3.1668246705153194, + "learning_rate": 3.849363739159451e-08, + "loss": 1.4537, + "step": 29815 + }, + { + "epoch": 0.9750196181009678, + "grad_norm": 3.3263857530710474, + "learning_rate": 3.799501346357759e-08, + "loss": 1.5173, + "step": 29820 + }, + { + "epoch": 0.9751831022756997, + "grad_norm": 3.0461679604914433, + "learning_rate": 3.7499633916504575e-08, + "loss": 1.3969, + "step": 29825 + }, + { + "epoch": 0.9753465864504316, + "grad_norm": 3.299365745749444, + "learning_rate": 3.7007498911708626e-08, + "loss": 1.2838, + "step": 29830 + }, + { + "epoch": 0.9755100706251635, + "grad_norm": 3.0788068210565864, + "learning_rate": 3.651860860946377e-08, + "loss": 1.3259, + "step": 29835 + }, + { + "epoch": 0.9756735547998954, + "grad_norm": 3.504470629456325, + "learning_rate": 3.603296316899152e-08, + "loss": 1.467, + "step": 29840 + }, + { + "epoch": 0.9758370389746273, + "grad_norm": 3.623892982559108, + "learning_rate": 3.5550562748453146e-08, + "loss": 1.5503, + "step": 29845 + }, + { + "epoch": 0.9760005231493591, + "grad_norm": 3.1745762695884077, + "learning_rate": 3.50714075049563e-08, + "loss": 1.3692, + "step": 29850 + }, + { + "epoch": 0.976164007324091, + "grad_norm": 3.106257570685193, + "learning_rate": 3.45954975945495e-08, + "loss": 1.4028, + "step": 29855 + }, + { + "epoch": 0.9763274914988229, + "grad_norm": 3.3940061360317992, + "learning_rate": 3.412283317222542e-08, + "loss": 1.3659, + "step": 29860 + }, + { + "epoch": 0.9764909756735548, + "grad_norm": 3.1689853829345482, + "learning_rate": 3.36534143919176e-08, + "loss": 1.3713, + "step": 29865 + }, + { + "epoch": 0.9766544598482867, + "grad_norm": 3.2622530820596123, + "learning_rate": 3.318724140650598e-08, + "loss": 1.4376, + "step": 29870 + }, + { + "epoch": 0.9768179440230186, + "grad_norm": 3.159029755042722, + "learning_rate": 3.2724314367810204e-08, + "loss": 1.3609, + "step": 29875 + }, + { + "epoch": 0.9769814281977505, + "grad_norm": 3.085889261257453, + "learning_rate": 3.226463342659636e-08, + "loss": 1.3014, + "step": 29880 + }, + { + "epoch": 0.9771449123724824, + "grad_norm": 3.4079015849308116, + "learning_rate": 3.180819873256913e-08, + "loss": 1.4797, + "step": 29885 + }, + { + "epoch": 0.9773083965472142, + "grad_norm": 3.4192238678337707, + "learning_rate": 3.1355010434378495e-08, + "loss": 1.4658, + "step": 29890 + }, + { + "epoch": 0.9774718807219461, + "grad_norm": 3.5026977157814745, + "learning_rate": 3.090506867961862e-08, + "loss": 1.375, + "step": 29895 + }, + { + "epoch": 0.977635364896678, + "grad_norm": 3.1603136677954393, + "learning_rate": 3.045837361482229e-08, + "loss": 1.3346, + "step": 29900 + }, + { + "epoch": 0.9777988490714099, + "grad_norm": 3.421390899170272, + "learning_rate": 3.001492538546869e-08, + "loss": 1.3701, + "step": 29905 + }, + { + "epoch": 0.9779623332461418, + "grad_norm": 3.319199008365997, + "learning_rate": 2.957472413597673e-08, + "loss": 1.3934, + "step": 29910 + }, + { + "epoch": 0.9781258174208737, + "grad_norm": 2.9859897841314598, + "learning_rate": 2.913777000970952e-08, + "loss": 1.368, + "step": 29915 + }, + { + "epoch": 0.9782893015956056, + "grad_norm": 3.142473520399317, + "learning_rate": 2.8704063148973227e-08, + "loss": 1.3796, + "step": 29920 + }, + { + "epoch": 0.9784527857703375, + "grad_norm": 3.1705123507581883, + "learning_rate": 2.8273603695013752e-08, + "loss": 1.3502, + "step": 29925 + }, + { + "epoch": 0.9786162699450693, + "grad_norm": 3.380864493622303, + "learning_rate": 2.7846391788023397e-08, + "loss": 1.4842, + "step": 29930 + }, + { + "epoch": 0.9787797541198012, + "grad_norm": 3.1383207038551144, + "learning_rate": 2.74224275671342e-08, + "loss": 1.263, + "step": 29935 + }, + { + "epoch": 0.9789432382945331, + "grad_norm": 3.0841782330937146, + "learning_rate": 2.7001711170419055e-08, + "loss": 1.3131, + "step": 29940 + }, + { + "epoch": 0.979106722469265, + "grad_norm": 3.1523297465550106, + "learning_rate": 2.6584242734897236e-08, + "loss": 1.4225, + "step": 29945 + }, + { + "epoch": 0.9792702066439969, + "grad_norm": 3.3959828801853775, + "learning_rate": 2.6170022396525553e-08, + "loss": 1.4165, + "step": 29950 + }, + { + "epoch": 0.9794336908187288, + "grad_norm": 3.231460693220705, + "learning_rate": 2.575905029020831e-08, + "loss": 1.3661, + "step": 29955 + }, + { + "epoch": 0.9795971749934607, + "grad_norm": 3.3645416127865753, + "learning_rate": 2.5351326549787336e-08, + "loss": 1.5297, + "step": 29960 + }, + { + "epoch": 0.9797606591681925, + "grad_norm": 3.4132916083837963, + "learning_rate": 2.4946851308048635e-08, + "loss": 1.3975, + "step": 29965 + }, + { + "epoch": 0.9799241433429244, + "grad_norm": 3.0796327736258102, + "learning_rate": 2.4545624696719062e-08, + "loss": 1.3386, + "step": 29970 + }, + { + "epoch": 0.9800876275176563, + "grad_norm": 3.2919792709219444, + "learning_rate": 2.414764684646853e-08, + "loss": 1.3862, + "step": 29975 + }, + { + "epoch": 0.9802511116923882, + "grad_norm": 3.264059288302199, + "learning_rate": 2.3752917886910032e-08, + "loss": 1.427, + "step": 29980 + }, + { + "epoch": 0.9804145958671201, + "grad_norm": 3.1645311179135494, + "learning_rate": 2.3361437946596287e-08, + "loss": 1.3562, + "step": 29985 + }, + { + "epoch": 0.980578080041852, + "grad_norm": 3.3419073450109744, + "learning_rate": 2.2973207153023093e-08, + "loss": 1.4327, + "step": 29990 + }, + { + "epoch": 0.9807415642165839, + "grad_norm": 3.398739599706025, + "learning_rate": 2.258822563262597e-08, + "loss": 1.4639, + "step": 29995 + }, + { + "epoch": 0.9809050483913158, + "grad_norm": 3.493581255116629, + "learning_rate": 2.2206493510785744e-08, + "loss": 1.4122, + "step": 30000 + }, + { + "epoch": 0.9810685325660476, + "grad_norm": 3.304369953680327, + "learning_rate": 2.1828010911822962e-08, + "loss": 1.2886, + "step": 30005 + }, + { + "epoch": 0.9812320167407795, + "grad_norm": 3.1855968729033544, + "learning_rate": 2.1452777959000137e-08, + "loss": 1.3268, + "step": 30010 + }, + { + "epoch": 0.9813955009155114, + "grad_norm": 3.0357850413651244, + "learning_rate": 2.108079477452063e-08, + "loss": 1.3717, + "step": 30015 + }, + { + "epoch": 0.9815589850902433, + "grad_norm": 3.2511018280905803, + "learning_rate": 2.0712061479530865e-08, + "loss": 1.4843, + "step": 30020 + }, + { + "epoch": 0.9817224692649752, + "grad_norm": 3.1785449008999356, + "learning_rate": 2.0346578194119227e-08, + "loss": 1.3226, + "step": 30025 + }, + { + "epoch": 0.9818859534397071, + "grad_norm": 3.2470239992691745, + "learning_rate": 1.9984345037312724e-08, + "loss": 1.2861, + "step": 30030 + }, + { + "epoch": 0.982049437614439, + "grad_norm": 3.089521699158079, + "learning_rate": 1.962536212708255e-08, + "loss": 1.4089, + "step": 30035 + }, + { + "epoch": 0.9822129217891709, + "grad_norm": 3.6450196703187845, + "learning_rate": 1.9269629580341842e-08, + "loss": 1.5268, + "step": 30040 + }, + { + "epoch": 0.9823764059639027, + "grad_norm": 3.2930811414212404, + "learning_rate": 1.891714751294238e-08, + "loss": 1.4728, + "step": 30045 + }, + { + "epoch": 0.9825398901386346, + "grad_norm": 3.1810120723745214, + "learning_rate": 1.8567916039679e-08, + "loss": 1.3884, + "step": 30050 + }, + { + "epoch": 0.9827033743133665, + "grad_norm": 3.2382402068702127, + "learning_rate": 1.8221935274288504e-08, + "loss": 1.541, + "step": 30055 + }, + { + "epoch": 0.9828668584880984, + "grad_norm": 3.349060234539652, + "learning_rate": 1.7879205329448535e-08, + "loss": 1.4326, + "step": 30060 + }, + { + "epoch": 0.9830303426628303, + "grad_norm": 3.1807568182870747, + "learning_rate": 1.7539726316778694e-08, + "loss": 1.4106, + "step": 30065 + }, + { + "epoch": 0.9831938268375622, + "grad_norm": 3.4165739271476734, + "learning_rate": 1.72034983468361e-08, + "loss": 1.3986, + "step": 30070 + }, + { + "epoch": 0.983357311012294, + "grad_norm": 3.5392774449685493, + "learning_rate": 1.6870521529124272e-08, + "loss": 1.4328, + "step": 30075 + }, + { + "epoch": 0.9835207951870258, + "grad_norm": 3.0990507125573674, + "learning_rate": 1.6540795972085354e-08, + "loss": 1.2263, + "step": 30080 + }, + { + "epoch": 0.9836842793617577, + "grad_norm": 3.20479871057285, + "learning_rate": 1.6214321783102337e-08, + "loss": 1.3449, + "step": 30085 + }, + { + "epoch": 0.9838477635364896, + "grad_norm": 3.12505905703931, + "learning_rate": 1.589109906850017e-08, + "loss": 1.283, + "step": 30090 + }, + { + "epoch": 0.9840112477112215, + "grad_norm": 3.164970158741551, + "learning_rate": 1.557112793354354e-08, + "loss": 1.3852, + "step": 30095 + }, + { + "epoch": 0.9841747318859534, + "grad_norm": 3.1311061686566615, + "learning_rate": 1.5254408482441306e-08, + "loss": 1.3427, + "step": 30100 + }, + { + "epoch": 0.9843382160606853, + "grad_norm": 3.3855396220066885, + "learning_rate": 1.4940940818338745e-08, + "loss": 1.4683, + "step": 30105 + }, + { + "epoch": 0.9845017002354172, + "grad_norm": 3.292409137194223, + "learning_rate": 1.463072504332752e-08, + "loss": 1.3705, + "step": 30110 + }, + { + "epoch": 0.984665184410149, + "grad_norm": 3.195486633333364, + "learning_rate": 1.4323761258434599e-08, + "loss": 1.4691, + "step": 30115 + }, + { + "epoch": 0.9848286685848809, + "grad_norm": 3.16498511282881, + "learning_rate": 1.4020049563632232e-08, + "loss": 1.3411, + "step": 30120 + }, + { + "epoch": 0.9849921527596128, + "grad_norm": 3.1102748717293083, + "learning_rate": 1.371959005783019e-08, + "loss": 1.4215, + "step": 30125 + }, + { + "epoch": 0.9851556369343447, + "grad_norm": 3.4881976721424435, + "learning_rate": 1.342238283888242e-08, + "loss": 1.4857, + "step": 30130 + }, + { + "epoch": 0.9853191211090766, + "grad_norm": 3.2039273816430858, + "learning_rate": 1.3128428003582605e-08, + "loss": 1.5442, + "step": 30135 + }, + { + "epoch": 0.9854826052838085, + "grad_norm": 3.2998993776315544, + "learning_rate": 1.2837725647661947e-08, + "loss": 1.5229, + "step": 30140 + }, + { + "epoch": 0.9856460894585404, + "grad_norm": 3.2003447564488554, + "learning_rate": 1.2550275865798046e-08, + "loss": 1.4783, + "step": 30145 + }, + { + "epoch": 0.9858095736332723, + "grad_norm": 3.2659358008648867, + "learning_rate": 1.2266078751603794e-08, + "loss": 1.3013, + "step": 30150 + }, + { + "epoch": 0.9859730578080041, + "grad_norm": 3.191710414672456, + "learning_rate": 1.1985134397636266e-08, + "loss": 1.4234, + "step": 30155 + }, + { + "epoch": 0.986136541982736, + "grad_norm": 3.127396366427738, + "learning_rate": 1.1707442895393384e-08, + "loss": 1.2615, + "step": 30160 + }, + { + "epoch": 0.9863000261574679, + "grad_norm": 3.278639874164814, + "learning_rate": 1.1433004335310582e-08, + "loss": 1.4647, + "step": 30165 + }, + { + "epoch": 0.9864635103321998, + "grad_norm": 3.1088179138295007, + "learning_rate": 1.1161818806765257e-08, + "loss": 1.4086, + "step": 30170 + }, + { + "epoch": 0.9866269945069317, + "grad_norm": 3.2541592308580336, + "learning_rate": 1.0893886398078979e-08, + "loss": 1.5185, + "step": 30175 + }, + { + "epoch": 0.9867904786816636, + "grad_norm": 3.2592198101569845, + "learning_rate": 1.0629207196507506e-08, + "loss": 1.3491, + "step": 30180 + }, + { + "epoch": 0.9869539628563955, + "grad_norm": 3.2255610900690446, + "learning_rate": 1.0367781288252998e-08, + "loss": 1.3107, + "step": 30185 + }, + { + "epoch": 0.9871174470311274, + "grad_norm": 3.286083229911037, + "learning_rate": 1.0109608758452905e-08, + "loss": 1.4008, + "step": 30190 + }, + { + "epoch": 0.9872809312058592, + "grad_norm": 3.086518630165403, + "learning_rate": 9.854689691189967e-09, + "loss": 1.3172, + "step": 30195 + }, + { + "epoch": 0.9874444153805911, + "grad_norm": 3.4719718151144403, + "learning_rate": 9.603024169483333e-09, + "loss": 1.4142, + "step": 30200 + }, + { + "epoch": 0.987607899555323, + "grad_norm": 2.9472735714172402, + "learning_rate": 9.354612275296326e-09, + "loss": 1.3889, + "step": 30205 + }, + { + "epoch": 0.9877713837300549, + "grad_norm": 3.496379275019599, + "learning_rate": 9.109454089528679e-09, + "loss": 1.4109, + "step": 30210 + }, + { + "epoch": 0.9879348679047868, + "grad_norm": 3.338235669782584, + "learning_rate": 8.867549692022082e-09, + "loss": 1.4389, + "step": 30215 + }, + { + "epoch": 0.9880983520795187, + "grad_norm": 3.2697086580587906, + "learning_rate": 8.628899161561288e-09, + "loss": 1.2821, + "step": 30220 + }, + { + "epoch": 0.9882618362542506, + "grad_norm": 3.2018629662315488, + "learning_rate": 8.393502575867463e-09, + "loss": 1.3929, + "step": 30225 + }, + { + "epoch": 0.9884253204289825, + "grad_norm": 5.025817549559419, + "learning_rate": 8.161360011602614e-09, + "loss": 1.3399, + "step": 30230 + }, + { + "epoch": 0.9885888046037143, + "grad_norm": 3.3045423624327226, + "learning_rate": 7.932471544371822e-09, + "loss": 1.2794, + "step": 30235 + }, + { + "epoch": 0.9887522887784462, + "grad_norm": 3.5054759409497365, + "learning_rate": 7.706837248716569e-09, + "loss": 1.3682, + "step": 30240 + }, + { + "epoch": 0.9889157729531781, + "grad_norm": 3.0952772744098804, + "learning_rate": 7.48445719812141e-09, + "loss": 1.2598, + "step": 30245 + }, + { + "epoch": 0.98907925712791, + "grad_norm": 3.555099971555164, + "learning_rate": 7.265331465010628e-09, + "loss": 1.5786, + "step": 30250 + }, + { + "epoch": 0.9892427413026419, + "grad_norm": 3.1979828576532143, + "learning_rate": 7.0494601207471425e-09, + "loss": 1.4013, + "step": 30255 + }, + { + "epoch": 0.9894062254773738, + "grad_norm": 3.353915581880548, + "learning_rate": 6.836843235635826e-09, + "loss": 1.3312, + "step": 30260 + }, + { + "epoch": 0.9895697096521057, + "grad_norm": 3.000042504929294, + "learning_rate": 6.627480878920179e-09, + "loss": 1.3234, + "step": 30265 + }, + { + "epoch": 0.9897331938268376, + "grad_norm": 3.1762056405429866, + "learning_rate": 6.421373118783436e-09, + "loss": 1.4067, + "step": 30270 + }, + { + "epoch": 0.9898966780015694, + "grad_norm": 3.309100405544345, + "learning_rate": 6.2185200223519034e-09, + "loss": 1.3598, + "step": 30275 + }, + { + "epoch": 0.9900601621763013, + "grad_norm": 3.279105394486695, + "learning_rate": 6.018921655688293e-09, + "loss": 1.3312, + "step": 30280 + }, + { + "epoch": 0.9902236463510332, + "grad_norm": 3.0501308104581883, + "learning_rate": 5.82257808379727e-09, + "loss": 1.267, + "step": 30285 + }, + { + "epoch": 0.9903871305257651, + "grad_norm": 3.502217932416888, + "learning_rate": 5.629489370624352e-09, + "loss": 1.5273, + "step": 30290 + }, + { + "epoch": 0.990550614700497, + "grad_norm": 3.1704782179596815, + "learning_rate": 5.439655579051461e-09, + "loss": 1.3618, + "step": 30295 + }, + { + "epoch": 0.9907140988752289, + "grad_norm": 3.165762515906695, + "learning_rate": 5.253076770904697e-09, + "loss": 1.4218, + "step": 30300 + }, + { + "epoch": 0.9908775830499608, + "grad_norm": 3.1637296907919454, + "learning_rate": 5.069753006947675e-09, + "loss": 1.3447, + "step": 30305 + }, + { + "epoch": 0.9910410672246927, + "grad_norm": 3.282266305943697, + "learning_rate": 4.88968434688375e-09, + "loss": 1.2593, + "step": 30310 + }, + { + "epoch": 0.9912045513994245, + "grad_norm": 3.4130445285803117, + "learning_rate": 4.712870849358231e-09, + "loss": 1.3956, + "step": 30315 + }, + { + "epoch": 0.9913680355741564, + "grad_norm": 3.2502215803680703, + "learning_rate": 4.539312571953946e-09, + "loss": 1.4675, + "step": 30320 + }, + { + "epoch": 0.9915315197488883, + "grad_norm": 3.3826966184219396, + "learning_rate": 4.3690095711945716e-09, + "loss": 1.3782, + "step": 30325 + }, + { + "epoch": 0.9916950039236202, + "grad_norm": 3.1948586348967867, + "learning_rate": 4.201961902544626e-09, + "loss": 1.3837, + "step": 30330 + }, + { + "epoch": 0.9918584880983521, + "grad_norm": 3.5003001647665744, + "learning_rate": 4.038169620406152e-09, + "loss": 1.5308, + "step": 30335 + }, + { + "epoch": 0.992021972273084, + "grad_norm": 3.6216279758766015, + "learning_rate": 3.877632778123141e-09, + "loss": 1.546, + "step": 30340 + }, + { + "epoch": 0.9921854564478159, + "grad_norm": 3.261115137091128, + "learning_rate": 3.7203514279782195e-09, + "loss": 1.4241, + "step": 30345 + }, + { + "epoch": 0.9923489406225477, + "grad_norm": 3.3411789892387405, + "learning_rate": 3.566325621193745e-09, + "loss": 1.38, + "step": 30350 + }, + { + "epoch": 0.9925124247972796, + "grad_norm": 3.3437128406814702, + "learning_rate": 3.415555407931814e-09, + "loss": 1.3958, + "step": 30355 + }, + { + "epoch": 0.9926759089720115, + "grad_norm": 3.7488367791929678, + "learning_rate": 3.2680408372964824e-09, + "loss": 1.3316, + "step": 30360 + }, + { + "epoch": 0.9928393931467434, + "grad_norm": 3.2219579218798153, + "learning_rate": 3.1237819573282093e-09, + "loss": 1.3043, + "step": 30365 + }, + { + "epoch": 0.9930028773214753, + "grad_norm": 3.10681001328819, + "learning_rate": 2.9827788150083025e-09, + "loss": 1.4636, + "step": 30370 + }, + { + "epoch": 0.9931663614962072, + "grad_norm": 3.079177936649035, + "learning_rate": 2.8450314562589176e-09, + "loss": 1.3806, + "step": 30375 + }, + { + "epoch": 0.9933298456709391, + "grad_norm": 3.2166747525734323, + "learning_rate": 2.710539925939726e-09, + "loss": 1.3644, + "step": 30380 + }, + { + "epoch": 0.993493329845671, + "grad_norm": 3.195780794701971, + "learning_rate": 2.579304267852356e-09, + "loss": 1.4779, + "step": 30385 + }, + { + "epoch": 0.9936568140204028, + "grad_norm": 3.343416680817022, + "learning_rate": 2.4513245247381746e-09, + "loss": 1.3791, + "step": 30390 + }, + { + "epoch": 0.9938202981951347, + "grad_norm": 3.4336950090886664, + "learning_rate": 2.3266007382749535e-09, + "loss": 1.4755, + "step": 30395 + }, + { + "epoch": 0.9939837823698666, + "grad_norm": 3.141328412892026, + "learning_rate": 2.2051329490824225e-09, + "loss": 1.376, + "step": 30400 + }, + { + "epoch": 0.9941472665445985, + "grad_norm": 2.8730456929682626, + "learning_rate": 2.0869211967200486e-09, + "loss": 1.3994, + "step": 30405 + }, + { + "epoch": 0.9943107507193304, + "grad_norm": 3.3692519252886006, + "learning_rate": 1.971965519687036e-09, + "loss": 1.4034, + "step": 30410 + }, + { + "epoch": 0.9944742348940623, + "grad_norm": 3.1301477389775596, + "learning_rate": 1.8602659554223246e-09, + "loss": 1.3965, + "step": 30415 + }, + { + "epoch": 0.9946377190687942, + "grad_norm": 3.0260033852002817, + "learning_rate": 1.7518225403012623e-09, + "loss": 1.5187, + "step": 30420 + }, + { + "epoch": 0.994801203243526, + "grad_norm": 3.2911635684022182, + "learning_rate": 1.6466353096433741e-09, + "loss": 1.3607, + "step": 30425 + }, + { + "epoch": 0.9949646874182579, + "grad_norm": 3.349022994661782, + "learning_rate": 1.5447042977034809e-09, + "loss": 1.4661, + "step": 30430 + }, + { + "epoch": 0.9951281715929898, + "grad_norm": 3.116794923559155, + "learning_rate": 1.446029537680582e-09, + "loss": 1.4253, + "step": 30435 + }, + { + "epoch": 0.9952916557677217, + "grad_norm": 3.2145617206912225, + "learning_rate": 1.3506110617089728e-09, + "loss": 1.4165, + "step": 30440 + }, + { + "epoch": 0.9954551399424536, + "grad_norm": 3.1625995020839777, + "learning_rate": 1.2584489008649058e-09, + "loss": 1.3924, + "step": 30445 + }, + { + "epoch": 0.9956186241171855, + "grad_norm": 3.3441555826249387, + "learning_rate": 1.1695430851621502e-09, + "loss": 1.5259, + "step": 30450 + }, + { + "epoch": 0.9957821082919174, + "grad_norm": 3.33658988323461, + "learning_rate": 1.0838936435564328e-09, + "loss": 1.3853, + "step": 30455 + }, + { + "epoch": 0.9959455924666493, + "grad_norm": 3.0166911186005003, + "learning_rate": 1.0015006039409969e-09, + "loss": 1.3836, + "step": 30460 + }, + { + "epoch": 0.9961090766413812, + "grad_norm": 3.3253028840501537, + "learning_rate": 9.223639931499328e-10, + "loss": 1.3467, + "step": 30465 + }, + { + "epoch": 0.996272560816113, + "grad_norm": 3.205862770601422, + "learning_rate": 8.464838369548478e-10, + "loss": 1.3365, + "step": 30470 + }, + { + "epoch": 0.9964360449908449, + "grad_norm": 3.119811648677019, + "learning_rate": 7.738601600693063e-10, + "loss": 1.3999, + "step": 30475 + }, + { + "epoch": 0.9965995291655768, + "grad_norm": 3.4435975574998094, + "learning_rate": 7.044929861443895e-10, + "loss": 1.3904, + "step": 30480 + }, + { + "epoch": 0.9967630133403087, + "grad_norm": 2.84749060204318, + "learning_rate": 6.383823377709153e-10, + "loss": 1.2873, + "step": 30485 + }, + { + "epoch": 0.9969264975150406, + "grad_norm": 3.357415636658497, + "learning_rate": 5.755282364805493e-10, + "loss": 1.4048, + "step": 30490 + }, + { + "epoch": 0.9970899816897725, + "grad_norm": 3.283649237590372, + "learning_rate": 5.159307027435834e-10, + "loss": 1.5006, + "step": 30495 + }, + { + "epoch": 0.9972534658645044, + "grad_norm": 3.351368124325966, + "learning_rate": 4.595897559678264e-10, + "loss": 1.3404, + "step": 30500 + }, + { + "epoch": 0.9974169500392362, + "grad_norm": 3.353420691131299, + "learning_rate": 4.065054145030445e-10, + "loss": 1.3791, + "step": 30505 + }, + { + "epoch": 0.9975804342139681, + "grad_norm": 3.252704248234707, + "learning_rate": 3.566776956365203e-10, + "loss": 1.4901, + "step": 30510 + }, + { + "epoch": 0.9977439183887, + "grad_norm": 3.2967967316661633, + "learning_rate": 3.1010661559860434e-10, + "loss": 1.4651, + "step": 30515 + }, + { + "epoch": 0.9979074025634319, + "grad_norm": 3.207301574912762, + "learning_rate": 2.667921895538328e-10, + "loss": 1.4035, + "step": 30520 + }, + { + "epoch": 0.9980708867381638, + "grad_norm": 3.2578108729678728, + "learning_rate": 2.2673443160980968e-10, + "loss": 1.485, + "step": 30525 + }, + { + "epoch": 0.9982343709128957, + "grad_norm": 3.0900534832125355, + "learning_rate": 1.8993335481165554e-10, + "loss": 1.3352, + "step": 30530 + }, + { + "epoch": 0.9983978550876276, + "grad_norm": 3.01622335623775, + "learning_rate": 1.563889711442279e-10, + "loss": 1.4281, + "step": 30535 + }, + { + "epoch": 0.9985613392623593, + "grad_norm": 3.2487064088692623, + "learning_rate": 1.261012915343418e-10, + "loss": 1.4709, + "step": 30540 + }, + { + "epoch": 0.9987248234370912, + "grad_norm": 3.0545583157092433, + "learning_rate": 9.907032584299814e-11, + "loss": 1.5467, + "step": 30545 + }, + { + "epoch": 0.9988883076118231, + "grad_norm": 3.5172694325811267, + "learning_rate": 7.529608287537571e-11, + "loss": 1.4498, + "step": 30550 + }, + { + "epoch": 0.999051791786555, + "grad_norm": 3.472450349475894, + "learning_rate": 5.4778570374169895e-11, + "loss": 1.4529, + "step": 30555 + }, + { + "epoch": 0.9992152759612869, + "grad_norm": 3.3601377493714892, + "learning_rate": 3.7517795020702854e-11, + "loss": 1.3502, + "step": 30560 + }, + { + "epoch": 0.9993787601360188, + "grad_norm": 3.3815905053004967, + "learning_rate": 2.3513762437143982e-11, + "loss": 1.5253, + "step": 30565 + }, + { + "epoch": 0.9995422443107507, + "grad_norm": 3.2911930057624104, + "learning_rate": 1.2766477183179249e-11, + "loss": 1.5669, + "step": 30570 + }, + { + "epoch": 0.9997057284854826, + "grad_norm": 2.9041935783310686, + "learning_rate": 5.275942760452068e-12, + "loss": 1.289, + "step": 30575 + }, + { + "epoch": 0.9998692126602144, + "grad_norm": 3.0830178882253843, + "learning_rate": 1.0421616059019813e-12, + "loss": 1.2863, + "step": 30580 + }, + { + "epoch": 1.0, + "eval_loss": 1.4073771238327026, + "eval_runtime": 474.2976, + "eval_samples_per_second": 28.545, + "eval_steps_per_second": 7.137, + "step": 30584 + }, + { + "epoch": 1.0, + "step": 30584, + "total_flos": 107559842217984.0, + "train_loss": 1.251657804362661, + "train_runtime": 20444.6838, + "train_samples_per_second": 5.984, + "train_steps_per_second": 1.496 + } + ], + "logging_steps": 5, + "max_steps": 30584, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 107559842217984.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}