diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00125, + "grad_norm": 5.53125, + "learning_rate": 4.999980723453676e-05, + "loss": 0.6803, + "step": 5 + }, + { + "epoch": 0.0025, + "grad_norm": 3.0, + "learning_rate": 4.9999228941119745e-05, + "loss": 0.6176, + "step": 10 + }, + { + "epoch": 0.00375, + "grad_norm": 1.8984375, + "learning_rate": 4.999826512866693e-05, + "loss": 0.5313, + "step": 15 + }, + { + "epoch": 0.005, + "grad_norm": 1.6015625, + "learning_rate": 4.999691581204152e-05, + "loss": 0.5659, + "step": 20 + }, + { + "epoch": 0.00625, + "grad_norm": 1.5703125, + "learning_rate": 4.9995181012051625e-05, + "loss": 0.5822, + "step": 25 + }, + { + "epoch": 0.0075, + "grad_norm": 1.515625, + "learning_rate": 4.9993060755450015e-05, + "loss": 0.5849, + "step": 30 + }, + { + "epoch": 0.00875, + "grad_norm": 1.4375, + "learning_rate": 4.999055507493368e-05, + "loss": 0.55, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 1.515625, + "learning_rate": 4.998766400914329e-05, + "loss": 0.5323, + "step": 40 + }, + { + "epoch": 0.01125, + "grad_norm": 1.6640625, + "learning_rate": 4.9984387602662675e-05, + "loss": 0.5775, + "step": 45 + }, + { + "epoch": 0.0125, + "grad_norm": 1.8359375, + "learning_rate": 4.9980725906018074e-05, + "loss": 0.5523, + "step": 50 + }, + { + "epoch": 0.01375, + "grad_norm": 1.3984375, + "learning_rate": 4.9976678975677376e-05, + "loss": 0.6089, + "step": 55 + }, + { + "epoch": 0.015, + "grad_norm": 1.40625, + "learning_rate": 4.9972246874049254e-05, + "loss": 0.5445, + "step": 60 + }, + { + "epoch": 0.01625, + "grad_norm": 1.375, + "learning_rate": 4.996742966948219e-05, + "loss": 0.5256, + "step": 65 + }, + { + "epoch": 0.0175, + "grad_norm": 1.359375, + "learning_rate": 4.9962227436263453e-05, + "loss": 0.5118, + "step": 70 + }, + { + "epoch": 0.01875, + "grad_norm": 1.2421875, + "learning_rate": 4.9956640254617906e-05, + "loss": 0.5458, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.484375, + "learning_rate": 4.995066821070679e-05, + "loss": 0.5946, + "step": 80 + }, + { + "epoch": 0.02125, + "grad_norm": 1.4375, + "learning_rate": 4.994431139662643e-05, + "loss": 0.515, + "step": 85 + }, + { + "epoch": 0.0225, + "grad_norm": 1.4140625, + "learning_rate": 4.9937569910406756e-05, + "loss": 0.5501, + "step": 90 + }, + { + "epoch": 0.02375, + "grad_norm": 1.265625, + "learning_rate": 4.9930443856009826e-05, + "loss": 0.5475, + "step": 95 + }, + { + "epoch": 0.025, + "grad_norm": 1.46875, + "learning_rate": 4.99229333433282e-05, + "loss": 0.5625, + "step": 100 + }, + { + "epoch": 0.02625, + "grad_norm": 1.578125, + "learning_rate": 4.9915038488183295e-05, + "loss": 0.5627, + "step": 105 + }, + { + "epoch": 0.0275, + "grad_norm": 1.640625, + "learning_rate": 4.990675941232353e-05, + "loss": 0.5561, + "step": 110 + }, + { + "epoch": 0.02875, + "grad_norm": 1.3046875, + "learning_rate": 4.989809624342251e-05, + "loss": 0.5254, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 1.0859375, + "learning_rate": 4.9889049115077005e-05, + "loss": 0.5184, + "step": 120 + }, + { + "epoch": 0.03125, + "grad_norm": 1.390625, + "learning_rate": 4.987961816680492e-05, + "loss": 0.5563, + "step": 125 + }, + { + "epoch": 0.0325, + "grad_norm": 1.6875, + "learning_rate": 4.9869803544043166e-05, + "loss": 0.5536, + "step": 130 + }, + { + "epoch": 0.03375, + "grad_norm": 1.109375, + "learning_rate": 4.985960539814535e-05, + "loss": 0.5544, + "step": 135 + }, + { + "epoch": 0.035, + "grad_norm": 1.453125, + "learning_rate": 4.98490238863795e-05, + "loss": 0.5117, + "step": 140 + }, + { + "epoch": 0.03625, + "grad_norm": 1.421875, + "learning_rate": 4.983805917192561e-05, + "loss": 0.5125, + "step": 145 + }, + { + "epoch": 0.0375, + "grad_norm": 1.1875, + "learning_rate": 4.982671142387316e-05, + "loss": 0.5563, + "step": 150 + }, + { + "epoch": 0.03875, + "grad_norm": 1.375, + "learning_rate": 4.9814980817218447e-05, + "loss": 0.5408, + "step": 155 + }, + { + "epoch": 0.04, + "grad_norm": 1.1796875, + "learning_rate": 4.980286753286195e-05, + "loss": 0.5249, + "step": 160 + }, + { + "epoch": 0.04125, + "grad_norm": 1.546875, + "learning_rate": 4.979037175760548e-05, + "loss": 0.546, + "step": 165 + }, + { + "epoch": 0.0425, + "grad_norm": 1.28125, + "learning_rate": 4.9777493684149375e-05, + "loss": 0.5019, + "step": 170 + }, + { + "epoch": 0.04375, + "grad_norm": 1.453125, + "learning_rate": 4.976423351108943e-05, + "loss": 0.5364, + "step": 175 + }, + { + "epoch": 0.045, + "grad_norm": 1.5234375, + "learning_rate": 4.975059144291394e-05, + "loss": 0.5504, + "step": 180 + }, + { + "epoch": 0.04625, + "grad_norm": 1.2109375, + "learning_rate": 4.973656769000046e-05, + "loss": 0.4682, + "step": 185 + }, + { + "epoch": 0.0475, + "grad_norm": 1.328125, + "learning_rate": 4.972216246861262e-05, + "loss": 0.5262, + "step": 190 + }, + { + "epoch": 0.04875, + "grad_norm": 1.09375, + "learning_rate": 4.9707376000896736e-05, + "loss": 0.5343, + "step": 195 + }, + { + "epoch": 0.05, + "grad_norm": 1.3671875, + "learning_rate": 4.9692208514878444e-05, + "loss": 0.5171, + "step": 200 + }, + { + "epoch": 0.05125, + "grad_norm": 1.1640625, + "learning_rate": 4.967666024445914e-05, + "loss": 0.5454, + "step": 205 + }, + { + "epoch": 0.0525, + "grad_norm": 1.25, + "learning_rate": 4.966073142941239e-05, + "loss": 0.5378, + "step": 210 + }, + { + "epoch": 0.05375, + "grad_norm": 1.140625, + "learning_rate": 4.9644422315380225e-05, + "loss": 0.4792, + "step": 215 + }, + { + "epoch": 0.055, + "grad_norm": 1.46875, + "learning_rate": 4.962773315386935e-05, + "loss": 0.5336, + "step": 220 + }, + { + "epoch": 0.05625, + "grad_norm": 1.359375, + "learning_rate": 4.9610664202247294e-05, + "loss": 0.5293, + "step": 225 + }, + { + "epoch": 0.0575, + "grad_norm": 1.0546875, + "learning_rate": 4.9593215723738404e-05, + "loss": 0.4678, + "step": 230 + }, + { + "epoch": 0.05875, + "grad_norm": 1.421875, + "learning_rate": 4.957538798741979e-05, + "loss": 0.549, + "step": 235 + }, + { + "epoch": 0.06, + "grad_norm": 1.3046875, + "learning_rate": 4.9557181268217227e-05, + "loss": 0.5642, + "step": 240 + }, + { + "epoch": 0.06125, + "grad_norm": 1.2734375, + "learning_rate": 4.953859584690082e-05, + "loss": 0.5544, + "step": 245 + }, + { + "epoch": 0.0625, + "grad_norm": 1.25, + "learning_rate": 4.951963201008076e-05, + "loss": 0.5604, + "step": 250 + }, + { + "epoch": 0.06375, + "grad_norm": 1.15625, + "learning_rate": 4.9500290050202894e-05, + "loss": 0.5349, + "step": 255 + }, + { + "epoch": 0.065, + "grad_norm": 1.5234375, + "learning_rate": 4.9480570265544144e-05, + "loss": 0.5393, + "step": 260 + }, + { + "epoch": 0.06625, + "grad_norm": 1.2109375, + "learning_rate": 4.9460472960208e-05, + "loss": 0.527, + "step": 265 + }, + { + "epoch": 0.0675, + "grad_norm": 1.203125, + "learning_rate": 4.943999844411977e-05, + "loss": 0.4982, + "step": 270 + }, + { + "epoch": 0.06875, + "grad_norm": 0.9765625, + "learning_rate": 4.9419147033021814e-05, + "loss": 0.4377, + "step": 275 + }, + { + "epoch": 0.07, + "grad_norm": 1.453125, + "learning_rate": 4.939791904846869e-05, + "loss": 0.4919, + "step": 280 + }, + { + "epoch": 0.07125, + "grad_norm": 1.7421875, + "learning_rate": 4.937631481782218e-05, + "loss": 0.5107, + "step": 285 + }, + { + "epoch": 0.0725, + "grad_norm": 1.4921875, + "learning_rate": 4.935433467424624e-05, + "loss": 0.5611, + "step": 290 + }, + { + "epoch": 0.07375, + "grad_norm": 1.8984375, + "learning_rate": 4.9331978956701875e-05, + "loss": 0.534, + "step": 295 + }, + { + "epoch": 0.075, + "grad_norm": 1.3515625, + "learning_rate": 4.9309248009941914e-05, + "loss": 0.5319, + "step": 300 + }, + { + "epoch": 0.07625, + "grad_norm": 1.2265625, + "learning_rate": 4.928614218450568e-05, + "loss": 0.4805, + "step": 305 + }, + { + "epoch": 0.0775, + "grad_norm": 1.390625, + "learning_rate": 4.9262661836713564e-05, + "loss": 0.4656, + "step": 310 + }, + { + "epoch": 0.07875, + "grad_norm": 1.0546875, + "learning_rate": 4.923880732866159e-05, + "loss": 0.5328, + "step": 315 + }, + { + "epoch": 0.08, + "grad_norm": 1.0, + "learning_rate": 4.9214579028215776e-05, + "loss": 0.4949, + "step": 320 + }, + { + "epoch": 0.08125, + "grad_norm": 1.2109375, + "learning_rate": 4.9189977309006495e-05, + "loss": 0.5222, + "step": 325 + }, + { + "epoch": 0.0825, + "grad_norm": 1.2421875, + "learning_rate": 4.916500255042268e-05, + "loss": 0.5029, + "step": 330 + }, + { + "epoch": 0.08375, + "grad_norm": 1.0546875, + "learning_rate": 4.9139655137606015e-05, + "loss": 0.5188, + "step": 335 + }, + { + "epoch": 0.085, + "grad_norm": 1.109375, + "learning_rate": 4.9113935461444955e-05, + "loss": 0.5651, + "step": 340 + }, + { + "epoch": 0.08625, + "grad_norm": 1.046875, + "learning_rate": 4.908784391856872e-05, + "loss": 0.4586, + "step": 345 + }, + { + "epoch": 0.0875, + "grad_norm": 1.0625, + "learning_rate": 4.906138091134118e-05, + "loss": 0.539, + "step": 350 + }, + { + "epoch": 0.08875, + "grad_norm": 1.3828125, + "learning_rate": 4.9034546847854656e-05, + "loss": 0.5331, + "step": 355 + }, + { + "epoch": 0.09, + "grad_norm": 1.015625, + "learning_rate": 4.900734214192358e-05, + "loss": 0.4227, + "step": 360 + }, + { + "epoch": 0.09125, + "grad_norm": 1.28125, + "learning_rate": 4.897976721307819e-05, + "loss": 0.5005, + "step": 365 + }, + { + "epoch": 0.0925, + "grad_norm": 1.1171875, + "learning_rate": 4.8951822486557986e-05, + "loss": 0.5294, + "step": 370 + }, + { + "epoch": 0.09375, + "grad_norm": 1.375, + "learning_rate": 4.892350839330522e-05, + "loss": 0.5729, + "step": 375 + }, + { + "epoch": 0.095, + "grad_norm": 0.95703125, + "learning_rate": 4.8894825369958255e-05, + "loss": 0.4837, + "step": 380 + }, + { + "epoch": 0.09625, + "grad_norm": 1.5078125, + "learning_rate": 4.8865773858844776e-05, + "loss": 0.5266, + "step": 385 + }, + { + "epoch": 0.0975, + "grad_norm": 1.375, + "learning_rate": 4.8836354307975026e-05, + "loss": 0.5329, + "step": 390 + }, + { + "epoch": 0.09875, + "grad_norm": 1.4921875, + "learning_rate": 4.880656717103489e-05, + "loss": 0.5096, + "step": 395 + }, + { + "epoch": 0.1, + "grad_norm": 1.171875, + "learning_rate": 4.877641290737884e-05, + "loss": 0.4919, + "step": 400 + }, + { + "epoch": 0.10125, + "grad_norm": 1.078125, + "learning_rate": 4.874589198202294e-05, + "loss": 0.4633, + "step": 405 + }, + { + "epoch": 0.1025, + "grad_norm": 1.4296875, + "learning_rate": 4.8715004865637614e-05, + "loss": 0.4981, + "step": 410 + }, + { + "epoch": 0.10375, + "grad_norm": 1.109375, + "learning_rate": 4.868375203454041e-05, + "loss": 0.4699, + "step": 415 + }, + { + "epoch": 0.105, + "grad_norm": 1.1796875, + "learning_rate": 4.8652133970688636e-05, + "loss": 0.5086, + "step": 420 + }, + { + "epoch": 0.10625, + "grad_norm": 1.1015625, + "learning_rate": 4.862015116167196e-05, + "loss": 0.5406, + "step": 425 + }, + { + "epoch": 0.1075, + "grad_norm": 1.640625, + "learning_rate": 4.8587804100704845e-05, + "loss": 0.5456, + "step": 430 + }, + { + "epoch": 0.10875, + "grad_norm": 1.1875, + "learning_rate": 4.8555093286618995e-05, + "loss": 0.5107, + "step": 435 + }, + { + "epoch": 0.11, + "grad_norm": 1.34375, + "learning_rate": 4.852201922385564e-05, + "loss": 0.4078, + "step": 440 + }, + { + "epoch": 0.11125, + "grad_norm": 1.3046875, + "learning_rate": 4.848858242245773e-05, + "loss": 0.4958, + "step": 445 + }, + { + "epoch": 0.1125, + "grad_norm": 1.4765625, + "learning_rate": 4.8454783398062106e-05, + "loss": 0.4822, + "step": 450 + }, + { + "epoch": 0.11375, + "grad_norm": 1.546875, + "learning_rate": 4.8420622671891533e-05, + "loss": 0.5489, + "step": 455 + }, + { + "epoch": 0.115, + "grad_norm": 1.0703125, + "learning_rate": 4.838610077074669e-05, + "loss": 0.4884, + "step": 460 + }, + { + "epoch": 0.11625, + "grad_norm": 1.609375, + "learning_rate": 4.835121822699796e-05, + "loss": 0.529, + "step": 465 + }, + { + "epoch": 0.1175, + "grad_norm": 1.234375, + "learning_rate": 4.8315975578577355e-05, + "loss": 0.5414, + "step": 470 + }, + { + "epoch": 0.11875, + "grad_norm": 1.5078125, + "learning_rate": 4.828037336897009e-05, + "loss": 0.4749, + "step": 475 + }, + { + "epoch": 0.12, + "grad_norm": 0.9765625, + "learning_rate": 4.8244412147206284e-05, + "loss": 0.5205, + "step": 480 + }, + { + "epoch": 0.12125, + "grad_norm": 1.2890625, + "learning_rate": 4.820809246785247e-05, + "loss": 0.5343, + "step": 485 + }, + { + "epoch": 0.1225, + "grad_norm": 1.3828125, + "learning_rate": 4.817141489100302e-05, + "loss": 0.5324, + "step": 490 + }, + { + "epoch": 0.12375, + "grad_norm": 1.546875, + "learning_rate": 4.8134379982271556e-05, + "loss": 0.5451, + "step": 495 + }, + { + "epoch": 0.125, + "grad_norm": 1.390625, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.5428, + "step": 500 + }, + { + "epoch": 0.12625, + "grad_norm": 1.0546875, + "learning_rate": 4.805924045916067e-05, + "loss": 0.5002, + "step": 505 + }, + { + "epoch": 0.1275, + "grad_norm": 1.2890625, + "learning_rate": 4.8021137003525664e-05, + "loss": 0.5277, + "step": 510 + }, + { + "epoch": 0.12875, + "grad_norm": 1.3125, + "learning_rate": 4.7982678533479555e-05, + "loss": 0.5185, + "step": 515 + }, + { + "epoch": 0.13, + "grad_norm": 1.3125, + "learning_rate": 4.794386564209953e-05, + "loss": 0.501, + "step": 520 + }, + { + "epoch": 0.13125, + "grad_norm": 1.4453125, + "learning_rate": 4.7904698927928406e-05, + "loss": 0.4903, + "step": 525 + }, + { + "epoch": 0.1325, + "grad_norm": 1.1484375, + "learning_rate": 4.7865178994965344e-05, + "loss": 0.4764, + "step": 530 + }, + { + "epoch": 0.13375, + "grad_norm": 1.3515625, + "learning_rate": 4.782530645265661e-05, + "loss": 0.5046, + "step": 535 + }, + { + "epoch": 0.135, + "grad_norm": 1.25, + "learning_rate": 4.7785081915886134e-05, + "loss": 0.4849, + "step": 540 + }, + { + "epoch": 0.13625, + "grad_norm": 1.4375, + "learning_rate": 4.7744506004966025e-05, + "loss": 0.4874, + "step": 545 + }, + { + "epoch": 0.1375, + "grad_norm": 1.1796875, + "learning_rate": 4.7703579345627035e-05, + "loss": 0.5632, + "step": 550 + }, + { + "epoch": 0.13875, + "grad_norm": 1.1953125, + "learning_rate": 4.766230256900887e-05, + "loss": 0.4894, + "step": 555 + }, + { + "epoch": 0.14, + "grad_norm": 0.9140625, + "learning_rate": 4.762067631165049e-05, + "loss": 0.4819, + "step": 560 + }, + { + "epoch": 0.14125, + "grad_norm": 1.3203125, + "learning_rate": 4.7578701215480284e-05, + "loss": 0.4872, + "step": 565 + }, + { + "epoch": 0.1425, + "grad_norm": 1.6328125, + "learning_rate": 4.753637792780614e-05, + "loss": 0.5274, + "step": 570 + }, + { + "epoch": 0.14375, + "grad_norm": 1.359375, + "learning_rate": 4.749370710130554e-05, + "loss": 0.5052, + "step": 575 + }, + { + "epoch": 0.145, + "grad_norm": 1.2109375, + "learning_rate": 4.745068939401539e-05, + "loss": 0.4819, + "step": 580 + }, + { + "epoch": 0.14625, + "grad_norm": 1.4296875, + "learning_rate": 4.740732546932197e-05, + "loss": 0.5159, + "step": 585 + }, + { + "epoch": 0.1475, + "grad_norm": 1.203125, + "learning_rate": 4.7363615995950626e-05, + "loss": 0.5338, + "step": 590 + }, + { + "epoch": 0.14875, + "grad_norm": 1.1171875, + "learning_rate": 4.7319561647955526e-05, + "loss": 0.4797, + "step": 595 + }, + { + "epoch": 0.15, + "grad_norm": 1.2578125, + "learning_rate": 4.72751631047092e-05, + "loss": 0.5453, + "step": 600 + }, + { + "epoch": 0.15125, + "grad_norm": 1.234375, + "learning_rate": 4.7230421050892116e-05, + "loss": 0.5009, + "step": 605 + }, + { + "epoch": 0.1525, + "grad_norm": 1.5625, + "learning_rate": 4.718533617648209e-05, + "loss": 0.4602, + "step": 610 + }, + { + "epoch": 0.15375, + "grad_norm": 1.484375, + "learning_rate": 4.713990917674365e-05, + "loss": 0.5399, + "step": 615 + }, + { + "epoch": 0.155, + "grad_norm": 1.3828125, + "learning_rate": 4.709414075221734e-05, + "loss": 0.5006, + "step": 620 + }, + { + "epoch": 0.15625, + "grad_norm": 1.0546875, + "learning_rate": 4.7048031608708876e-05, + "loss": 0.4784, + "step": 625 + }, + { + "epoch": 0.1575, + "grad_norm": 1.1953125, + "learning_rate": 4.7001582457278304e-05, + "loss": 0.4764, + "step": 630 + }, + { + "epoch": 0.15875, + "grad_norm": 1.03125, + "learning_rate": 4.695479401422898e-05, + "loss": 0.5003, + "step": 635 + }, + { + "epoch": 0.16, + "grad_norm": 1.9921875, + "learning_rate": 4.690766700109659e-05, + "loss": 0.4586, + "step": 640 + }, + { + "epoch": 0.16125, + "grad_norm": 1.5234375, + "learning_rate": 4.686020214463798e-05, + "loss": 0.5272, + "step": 645 + }, + { + "epoch": 0.1625, + "grad_norm": 1.3203125, + "learning_rate": 4.681240017681993e-05, + "loss": 0.5626, + "step": 650 + }, + { + "epoch": 0.16375, + "grad_norm": 1.3984375, + "learning_rate": 4.676426183480794e-05, + "loss": 0.5696, + "step": 655 + }, + { + "epoch": 0.165, + "grad_norm": 1.1640625, + "learning_rate": 4.671578786095478e-05, + "loss": 0.5391, + "step": 660 + }, + { + "epoch": 0.16625, + "grad_norm": 1.1484375, + "learning_rate": 4.6666979002789105e-05, + "loss": 0.5195, + "step": 665 + }, + { + "epoch": 0.1675, + "grad_norm": 1.5, + "learning_rate": 4.661783601300388e-05, + "loss": 0.4973, + "step": 670 + }, + { + "epoch": 0.16875, + "grad_norm": 1.2890625, + "learning_rate": 4.65683596494448e-05, + "loss": 0.4741, + "step": 675 + }, + { + "epoch": 0.17, + "grad_norm": 1.265625, + "learning_rate": 4.65185506750986e-05, + "loss": 0.4684, + "step": 680 + }, + { + "epoch": 0.17125, + "grad_norm": 1.5390625, + "learning_rate": 4.646840985808126e-05, + "loss": 0.5307, + "step": 685 + }, + { + "epoch": 0.1725, + "grad_norm": 1.296875, + "learning_rate": 4.6417937971626245e-05, + "loss": 0.5154, + "step": 690 + }, + { + "epoch": 0.17375, + "grad_norm": 1.265625, + "learning_rate": 4.636713579407245e-05, + "loss": 0.5348, + "step": 695 + }, + { + "epoch": 0.175, + "grad_norm": 1.1640625, + "learning_rate": 4.6316004108852305e-05, + "loss": 0.477, + "step": 700 + }, + { + "epoch": 0.17625, + "grad_norm": 1.1640625, + "learning_rate": 4.6264543704479655e-05, + "loss": 0.4989, + "step": 705 + }, + { + "epoch": 0.1775, + "grad_norm": 1.2421875, + "learning_rate": 4.6212755374537596e-05, + "loss": 0.5109, + "step": 710 + }, + { + "epoch": 0.17875, + "grad_norm": 1.0625, + "learning_rate": 4.616063991766623e-05, + "loss": 0.48, + "step": 715 + }, + { + "epoch": 0.18, + "grad_norm": 1.3984375, + "learning_rate": 4.610819813755038e-05, + "loss": 0.5151, + "step": 720 + }, + { + "epoch": 0.18125, + "grad_norm": 0.9453125, + "learning_rate": 4.6055430842907167e-05, + "loss": 0.5235, + "step": 725 + }, + { + "epoch": 0.1825, + "grad_norm": 1.5078125, + "learning_rate": 4.600233884747355e-05, + "loss": 0.5006, + "step": 730 + }, + { + "epoch": 0.18375, + "grad_norm": 1.234375, + "learning_rate": 4.594892296999378e-05, + "loss": 0.479, + "step": 735 + }, + { + "epoch": 0.185, + "grad_norm": 1.015625, + "learning_rate": 4.5895184034206765e-05, + "loss": 0.4807, + "step": 740 + }, + { + "epoch": 0.18625, + "grad_norm": 1.40625, + "learning_rate": 4.5841122868833364e-05, + "loss": 0.5189, + "step": 745 + }, + { + "epoch": 0.1875, + "grad_norm": 1.1953125, + "learning_rate": 4.5786740307563636e-05, + "loss": 0.4768, + "step": 750 + }, + { + "epoch": 0.18875, + "grad_norm": 1.5234375, + "learning_rate": 4.573203718904394e-05, + "loss": 0.4747, + "step": 755 + }, + { + "epoch": 0.19, + "grad_norm": 1.5703125, + "learning_rate": 4.567701435686404e-05, + "loss": 0.4756, + "step": 760 + }, + { + "epoch": 0.19125, + "grad_norm": 1.1171875, + "learning_rate": 4.562167265954409e-05, + "loss": 0.5102, + "step": 765 + }, + { + "epoch": 0.1925, + "grad_norm": 1.953125, + "learning_rate": 4.55660129505215e-05, + "loss": 0.5229, + "step": 770 + }, + { + "epoch": 0.19375, + "grad_norm": 1.203125, + "learning_rate": 4.551003608813784e-05, + "loss": 0.5103, + "step": 775 + }, + { + "epoch": 0.195, + "grad_norm": 1.21875, + "learning_rate": 4.545374293562559e-05, + "loss": 0.5216, + "step": 780 + }, + { + "epoch": 0.19625, + "grad_norm": 1.078125, + "learning_rate": 4.5397134361094786e-05, + "loss": 0.5039, + "step": 785 + }, + { + "epoch": 0.1975, + "grad_norm": 1.3828125, + "learning_rate": 4.534021123751968e-05, + "loss": 0.4834, + "step": 790 + }, + { + "epoch": 0.19875, + "grad_norm": 1.3828125, + "learning_rate": 4.528297444272525e-05, + "loss": 0.4386, + "step": 795 + }, + { + "epoch": 0.2, + "grad_norm": 1.328125, + "learning_rate": 4.522542485937369e-05, + "loss": 0.5097, + "step": 800 + }, + { + "epoch": 0.20125, + "grad_norm": 1.125, + "learning_rate": 4.516756337495075e-05, + "loss": 0.5574, + "step": 805 + }, + { + "epoch": 0.2025, + "grad_norm": 1.1953125, + "learning_rate": 4.5109390881752114e-05, + "loss": 0.5492, + "step": 810 + }, + { + "epoch": 0.20375, + "grad_norm": 1.109375, + "learning_rate": 4.5050908276869586e-05, + "loss": 0.5281, + "step": 815 + }, + { + "epoch": 0.205, + "grad_norm": 1.6484375, + "learning_rate": 4.499211646217727e-05, + "loss": 0.5042, + "step": 820 + }, + { + "epoch": 0.20625, + "grad_norm": 0.9765625, + "learning_rate": 4.493301634431768e-05, + "loss": 0.4746, + "step": 825 + }, + { + "epoch": 0.2075, + "grad_norm": 1.3046875, + "learning_rate": 4.487360883468775e-05, + "loss": 0.5611, + "step": 830 + }, + { + "epoch": 0.20875, + "grad_norm": 1.703125, + "learning_rate": 4.481389484942478e-05, + "loss": 0.5058, + "step": 835 + }, + { + "epoch": 0.21, + "grad_norm": 1.0703125, + "learning_rate": 4.4753875309392266e-05, + "loss": 0.4352, + "step": 840 + }, + { + "epoch": 0.21125, + "grad_norm": 1.125, + "learning_rate": 4.469355114016577e-05, + "loss": 0.4849, + "step": 845 + }, + { + "epoch": 0.2125, + "grad_norm": 1.1171875, + "learning_rate": 4.463292327201862e-05, + "loss": 0.5195, + "step": 850 + }, + { + "epoch": 0.21375, + "grad_norm": 1.09375, + "learning_rate": 4.4571992639907545e-05, + "loss": 0.3864, + "step": 855 + }, + { + "epoch": 0.215, + "grad_norm": 1.2578125, + "learning_rate": 4.451076018345825e-05, + "loss": 0.4903, + "step": 860 + }, + { + "epoch": 0.21625, + "grad_norm": 1.21875, + "learning_rate": 4.444922684695097e-05, + "loss": 0.5126, + "step": 865 + }, + { + "epoch": 0.2175, + "grad_norm": 1.1796875, + "learning_rate": 4.4387393579305865e-05, + "loss": 0.4958, + "step": 870 + }, + { + "epoch": 0.21875, + "grad_norm": 1.5078125, + "learning_rate": 4.4325261334068426e-05, + "loss": 0.5307, + "step": 875 + }, + { + "epoch": 0.22, + "grad_norm": 1.5, + "learning_rate": 4.426283106939474e-05, + "loss": 0.5048, + "step": 880 + }, + { + "epoch": 0.22125, + "grad_norm": 1.0, + "learning_rate": 4.4200103748036695e-05, + "loss": 0.4757, + "step": 885 + }, + { + "epoch": 0.2225, + "grad_norm": 1.359375, + "learning_rate": 4.4137080337327205e-05, + "loss": 0.5525, + "step": 890 + }, + { + "epoch": 0.22375, + "grad_norm": 1.5390625, + "learning_rate": 4.407376180916522e-05, + "loss": 0.4781, + "step": 895 + }, + { + "epoch": 0.225, + "grad_norm": 1.28125, + "learning_rate": 4.401014914000078e-05, + "loss": 0.4797, + "step": 900 + }, + { + "epoch": 0.22625, + "grad_norm": 1.484375, + "learning_rate": 4.3946243310819926e-05, + "loss": 0.5529, + "step": 905 + }, + { + "epoch": 0.2275, + "grad_norm": 1.265625, + "learning_rate": 4.3882045307129594e-05, + "loss": 0.4669, + "step": 910 + }, + { + "epoch": 0.22875, + "grad_norm": 1.234375, + "learning_rate": 4.3817556118942425e-05, + "loss": 0.5328, + "step": 915 + }, + { + "epoch": 0.23, + "grad_norm": 1.4140625, + "learning_rate": 4.375277674076149e-05, + "loss": 0.5347, + "step": 920 + }, + { + "epoch": 0.23125, + "grad_norm": 1.15625, + "learning_rate": 4.3687708171564925e-05, + "loss": 0.4615, + "step": 925 + }, + { + "epoch": 0.2325, + "grad_norm": 1.1328125, + "learning_rate": 4.3622351414790554e-05, + "loss": 0.5132, + "step": 930 + }, + { + "epoch": 0.23375, + "grad_norm": 1.1171875, + "learning_rate": 4.355670747832042e-05, + "loss": 0.4063, + "step": 935 + }, + { + "epoch": 0.235, + "grad_norm": 1.1171875, + "learning_rate": 4.349077737446525e-05, + "loss": 0.493, + "step": 940 + }, + { + "epoch": 0.23625, + "grad_norm": 1.078125, + "learning_rate": 4.3424562119948776e-05, + "loss": 0.4826, + "step": 945 + }, + { + "epoch": 0.2375, + "grad_norm": 1.0546875, + "learning_rate": 4.335806273589214e-05, + "loss": 0.4726, + "step": 950 + }, + { + "epoch": 0.23875, + "grad_norm": 1.59375, + "learning_rate": 4.329128024779812e-05, + "loss": 0.4672, + "step": 955 + }, + { + "epoch": 0.24, + "grad_norm": 0.91796875, + "learning_rate": 4.3224215685535294e-05, + "loss": 0.4467, + "step": 960 + }, + { + "epoch": 0.24125, + "grad_norm": 1.484375, + "learning_rate": 4.315687008332217e-05, + "loss": 0.5019, + "step": 965 + }, + { + "epoch": 0.2425, + "grad_norm": 1.2265625, + "learning_rate": 4.3089244479711236e-05, + "loss": 0.526, + "step": 970 + }, + { + "epoch": 0.24375, + "grad_norm": 1.2890625, + "learning_rate": 4.302133991757297e-05, + "loss": 0.509, + "step": 975 + }, + { + "epoch": 0.245, + "grad_norm": 0.953125, + "learning_rate": 4.295315744407972e-05, + "loss": 0.447, + "step": 980 + }, + { + "epoch": 0.24625, + "grad_norm": 1.2578125, + "learning_rate": 4.2884698110689575e-05, + "loss": 0.4927, + "step": 985 + }, + { + "epoch": 0.2475, + "grad_norm": 1.578125, + "learning_rate": 4.281596297313013e-05, + "loss": 0.4891, + "step": 990 + }, + { + "epoch": 0.24875, + "grad_norm": 2.21875, + "learning_rate": 4.274695309138226e-05, + "loss": 0.5046, + "step": 995 + }, + { + "epoch": 0.25, + "grad_norm": 1.1796875, + "learning_rate": 4.267766952966369e-05, + "loss": 0.4642, + "step": 1000 + }, + { + "epoch": 0.25125, + "grad_norm": 1.109375, + "learning_rate": 4.260811335641266e-05, + "loss": 0.4396, + "step": 1005 + }, + { + "epoch": 0.2525, + "grad_norm": 1.2265625, + "learning_rate": 4.25382856442714e-05, + "loss": 0.4386, + "step": 1010 + }, + { + "epoch": 0.25375, + "grad_norm": 1.546875, + "learning_rate": 4.2468187470069607e-05, + "loss": 0.5335, + "step": 1015 + }, + { + "epoch": 0.255, + "grad_norm": 1.59375, + "learning_rate": 4.2397819914807856e-05, + "loss": 0.4703, + "step": 1020 + }, + { + "epoch": 0.25625, + "grad_norm": 1.0625, + "learning_rate": 4.23271840636409e-05, + "loss": 0.5011, + "step": 1025 + }, + { + "epoch": 0.2575, + "grad_norm": 1.046875, + "learning_rate": 4.225628100586093e-05, + "loss": 0.5234, + "step": 1030 + }, + { + "epoch": 0.25875, + "grad_norm": 1.0078125, + "learning_rate": 4.218511183488082e-05, + "loss": 0.4749, + "step": 1035 + }, + { + "epoch": 0.26, + "grad_norm": 1.34375, + "learning_rate": 4.211367764821722e-05, + "loss": 0.5361, + "step": 1040 + }, + { + "epoch": 0.26125, + "grad_norm": 1.1875, + "learning_rate": 4.2041979547473665e-05, + "loss": 0.4458, + "step": 1045 + }, + { + "epoch": 0.2625, + "grad_norm": 1.140625, + "learning_rate": 4.197001863832355e-05, + "loss": 0.4517, + "step": 1050 + }, + { + "epoch": 0.26375, + "grad_norm": 0.9921875, + "learning_rate": 4.189779603049312e-05, + "loss": 0.4571, + "step": 1055 + }, + { + "epoch": 0.265, + "grad_norm": 1.15625, + "learning_rate": 4.182531283774434e-05, + "loss": 0.487, + "step": 1060 + }, + { + "epoch": 0.26625, + "grad_norm": 1.140625, + "learning_rate": 4.17525701778577e-05, + "loss": 0.5186, + "step": 1065 + }, + { + "epoch": 0.2675, + "grad_norm": 1.2578125, + "learning_rate": 4.1679569172614996e-05, + "loss": 0.4815, + "step": 1070 + }, + { + "epoch": 0.26875, + "grad_norm": 0.98046875, + "learning_rate": 4.1606310947782044e-05, + "loss": 0.4563, + "step": 1075 + }, + { + "epoch": 0.27, + "grad_norm": 1.0, + "learning_rate": 4.1532796633091296e-05, + "loss": 0.4585, + "step": 1080 + }, + { + "epoch": 0.27125, + "grad_norm": 1.1640625, + "learning_rate": 4.1459027362224436e-05, + "loss": 0.4846, + "step": 1085 + }, + { + "epoch": 0.2725, + "grad_norm": 1.03125, + "learning_rate": 4.138500427279485e-05, + "loss": 0.505, + "step": 1090 + }, + { + "epoch": 0.27375, + "grad_norm": 1.2421875, + "learning_rate": 4.1310728506330174e-05, + "loss": 0.4765, + "step": 1095 + }, + { + "epoch": 0.275, + "grad_norm": 1.5546875, + "learning_rate": 4.123620120825459e-05, + "loss": 0.5105, + "step": 1100 + }, + { + "epoch": 0.27625, + "grad_norm": 0.94140625, + "learning_rate": 4.116142352787125e-05, + "loss": 0.4193, + "step": 1105 + }, + { + "epoch": 0.2775, + "grad_norm": 1.203125, + "learning_rate": 4.1086396618344476e-05, + "loss": 0.4953, + "step": 1110 + }, + { + "epoch": 0.27875, + "grad_norm": 1.265625, + "learning_rate": 4.101112163668203e-05, + "loss": 0.4572, + "step": 1115 + }, + { + "epoch": 0.28, + "grad_norm": 1.1015625, + "learning_rate": 4.093559974371725e-05, + "loss": 0.4247, + "step": 1120 + }, + { + "epoch": 0.28125, + "grad_norm": 1.484375, + "learning_rate": 4.085983210409114e-05, + "loss": 0.483, + "step": 1125 + }, + { + "epoch": 0.2825, + "grad_norm": 0.98828125, + "learning_rate": 4.0783819886234445e-05, + "loss": 0.4787, + "step": 1130 + }, + { + "epoch": 0.28375, + "grad_norm": 1.015625, + "learning_rate": 4.0707564262349595e-05, + "loss": 0.4891, + "step": 1135 + }, + { + "epoch": 0.285, + "grad_norm": 1.5859375, + "learning_rate": 4.063106640839264e-05, + "loss": 0.503, + "step": 1140 + }, + { + "epoch": 0.28625, + "grad_norm": 1.4140625, + "learning_rate": 4.05543275040551e-05, + "loss": 0.5003, + "step": 1145 + }, + { + "epoch": 0.2875, + "grad_norm": 1.328125, + "learning_rate": 4.047734873274586e-05, + "loss": 0.5444, + "step": 1150 + }, + { + "epoch": 0.28875, + "grad_norm": 0.86328125, + "learning_rate": 4.040013128157275e-05, + "loss": 0.4193, + "step": 1155 + }, + { + "epoch": 0.29, + "grad_norm": 1.1328125, + "learning_rate": 4.0322676341324415e-05, + "loss": 0.497, + "step": 1160 + }, + { + "epoch": 0.29125, + "grad_norm": 0.94140625, + "learning_rate": 4.024498510645185e-05, + "loss": 0.377, + "step": 1165 + }, + { + "epoch": 0.2925, + "grad_norm": 1.2421875, + "learning_rate": 4.0167058775049996e-05, + "loss": 0.5118, + "step": 1170 + }, + { + "epoch": 0.29375, + "grad_norm": 1.59375, + "learning_rate": 4.008889854883929e-05, + "loss": 0.4941, + "step": 1175 + }, + { + "epoch": 0.295, + "grad_norm": 1.15625, + "learning_rate": 4.0010505633147106e-05, + "loss": 0.5302, + "step": 1180 + }, + { + "epoch": 0.29625, + "grad_norm": 1.1796875, + "learning_rate": 3.993188123688918e-05, + "loss": 0.5273, + "step": 1185 + }, + { + "epoch": 0.2975, + "grad_norm": 1.4921875, + "learning_rate": 3.985302657255097e-05, + "loss": 0.463, + "step": 1190 + }, + { + "epoch": 0.29875, + "grad_norm": 1.4453125, + "learning_rate": 3.977394285616893e-05, + "loss": 0.5116, + "step": 1195 + }, + { + "epoch": 0.3, + "grad_norm": 1.0078125, + "learning_rate": 3.969463130731183e-05, + "loss": 0.5089, + "step": 1200 + }, + { + "epoch": 0.30125, + "grad_norm": 0.890625, + "learning_rate": 3.961509314906184e-05, + "loss": 0.5043, + "step": 1205 + }, + { + "epoch": 0.3025, + "grad_norm": 1.2265625, + "learning_rate": 3.953532960799577e-05, + "loss": 0.4877, + "step": 1210 + }, + { + "epoch": 0.30375, + "grad_norm": 0.953125, + "learning_rate": 3.9455341914166075e-05, + "loss": 0.5368, + "step": 1215 + }, + { + "epoch": 0.305, + "grad_norm": 1.1015625, + "learning_rate": 3.937513130108197e-05, + "loss": 0.4303, + "step": 1220 + }, + { + "epoch": 0.30625, + "grad_norm": 0.953125, + "learning_rate": 3.9294699005690305e-05, + "loss": 0.4978, + "step": 1225 + }, + { + "epoch": 0.3075, + "grad_norm": 1.4453125, + "learning_rate": 3.92140462683566e-05, + "loss": 0.4898, + "step": 1230 + }, + { + "epoch": 0.30875, + "grad_norm": 1.3828125, + "learning_rate": 3.913317433284582e-05, + "loss": 0.4307, + "step": 1235 + }, + { + "epoch": 0.31, + "grad_norm": 1.078125, + "learning_rate": 3.905208444630327e-05, + "loss": 0.4599, + "step": 1240 + }, + { + "epoch": 0.31125, + "grad_norm": 0.94921875, + "learning_rate": 3.897077785923529e-05, + "loss": 0.4449, + "step": 1245 + }, + { + "epoch": 0.3125, + "grad_norm": 1.1796875, + "learning_rate": 3.888925582549006e-05, + "loss": 0.4508, + "step": 1250 + }, + { + "epoch": 0.31375, + "grad_norm": 1.0703125, + "learning_rate": 3.880751960223817e-05, + "loss": 0.4523, + "step": 1255 + }, + { + "epoch": 0.315, + "grad_norm": 1.203125, + "learning_rate": 3.87255704499533e-05, + "loss": 0.4782, + "step": 1260 + }, + { + "epoch": 0.31625, + "grad_norm": 1.09375, + "learning_rate": 3.864340963239275e-05, + "loss": 0.4821, + "step": 1265 + }, + { + "epoch": 0.3175, + "grad_norm": 1.0625, + "learning_rate": 3.856103841657797e-05, + "loss": 0.393, + "step": 1270 + }, + { + "epoch": 0.31875, + "grad_norm": 1.1328125, + "learning_rate": 3.847845807277502e-05, + "loss": 0.4731, + "step": 1275 + }, + { + "epoch": 0.32, + "grad_norm": 0.94140625, + "learning_rate": 3.8395669874474915e-05, + "loss": 0.4644, + "step": 1280 + }, + { + "epoch": 0.32125, + "grad_norm": 1.171875, + "learning_rate": 3.831267509837414e-05, + "loss": 0.5069, + "step": 1285 + }, + { + "epoch": 0.3225, + "grad_norm": 1.1328125, + "learning_rate": 3.822947502435477e-05, + "loss": 0.4767, + "step": 1290 + }, + { + "epoch": 0.32375, + "grad_norm": 1.296875, + "learning_rate": 3.814607093546489e-05, + "loss": 0.472, + "step": 1295 + }, + { + "epoch": 0.325, + "grad_norm": 1.109375, + "learning_rate": 3.8062464117898724e-05, + "loss": 0.4598, + "step": 1300 + }, + { + "epoch": 0.32625, + "grad_norm": 1.71875, + "learning_rate": 3.7978655860976824e-05, + "loss": 0.4794, + "step": 1305 + }, + { + "epoch": 0.3275, + "grad_norm": 1.046875, + "learning_rate": 3.789464745712619e-05, + "loss": 0.4728, + "step": 1310 + }, + { + "epoch": 0.32875, + "grad_norm": 1.2421875, + "learning_rate": 3.7810440201860334e-05, + "loss": 0.4535, + "step": 1315 + }, + { + "epoch": 0.33, + "grad_norm": 1.2578125, + "learning_rate": 3.7726035393759285e-05, + "loss": 0.4646, + "step": 1320 + }, + { + "epoch": 0.33125, + "grad_norm": 1.109375, + "learning_rate": 3.764143433444962e-05, + "loss": 0.4597, + "step": 1325 + }, + { + "epoch": 0.3325, + "grad_norm": 0.9921875, + "learning_rate": 3.755663832858432e-05, + "loss": 0.516, + "step": 1330 + }, + { + "epoch": 0.33375, + "grad_norm": 1.078125, + "learning_rate": 3.747164868382269e-05, + "loss": 0.4492, + "step": 1335 + }, + { + "epoch": 0.335, + "grad_norm": 1.4765625, + "learning_rate": 3.7386466710810194e-05, + "loss": 0.4644, + "step": 1340 + }, + { + "epoch": 0.33625, + "grad_norm": 1.03125, + "learning_rate": 3.730109372315822e-05, + "loss": 0.5028, + "step": 1345 + }, + { + "epoch": 0.3375, + "grad_norm": 1.359375, + "learning_rate": 3.721553103742388e-05, + "loss": 0.424, + "step": 1350 + }, + { + "epoch": 0.33875, + "grad_norm": 1.1015625, + "learning_rate": 3.71297799730896e-05, + "loss": 0.4592, + "step": 1355 + }, + { + "epoch": 0.34, + "grad_norm": 1.0390625, + "learning_rate": 3.704384185254288e-05, + "loss": 0.4678, + "step": 1360 + }, + { + "epoch": 0.34125, + "grad_norm": 1.0703125, + "learning_rate": 3.695771800105586e-05, + "loss": 0.4809, + "step": 1365 + }, + { + "epoch": 0.3425, + "grad_norm": 1.171875, + "learning_rate": 3.6871409746764865e-05, + "loss": 0.5093, + "step": 1370 + }, + { + "epoch": 0.34375, + "grad_norm": 1.5625, + "learning_rate": 3.678491842064995e-05, + "loss": 0.4937, + "step": 1375 + }, + { + "epoch": 0.345, + "grad_norm": 1.5625, + "learning_rate": 3.6698245356514335e-05, + "loss": 0.4107, + "step": 1380 + }, + { + "epoch": 0.34625, + "grad_norm": 1.0, + "learning_rate": 3.661139189096391e-05, + "loss": 0.4578, + "step": 1385 + }, + { + "epoch": 0.3475, + "grad_norm": 0.8359375, + "learning_rate": 3.652435936338656e-05, + "loss": 0.3964, + "step": 1390 + }, + { + "epoch": 0.34875, + "grad_norm": 1.1171875, + "learning_rate": 3.6437149115931514e-05, + "loss": 0.5011, + "step": 1395 + }, + { + "epoch": 0.35, + "grad_norm": 1.28125, + "learning_rate": 3.634976249348867e-05, + "loss": 0.494, + "step": 1400 + }, + { + "epoch": 0.35125, + "grad_norm": 1.125, + "learning_rate": 3.626220084366786e-05, + "loss": 0.4773, + "step": 1405 + }, + { + "epoch": 0.3525, + "grad_norm": 1.0390625, + "learning_rate": 3.6174465516778035e-05, + "loss": 0.4338, + "step": 1410 + }, + { + "epoch": 0.35375, + "grad_norm": 0.99609375, + "learning_rate": 3.608655786580647e-05, + "loss": 0.4538, + "step": 1415 + }, + { + "epoch": 0.355, + "grad_norm": 1.1875, + "learning_rate": 3.599847924639788e-05, + "loss": 0.4537, + "step": 1420 + }, + { + "epoch": 0.35625, + "grad_norm": 1.0078125, + "learning_rate": 3.591023101683355e-05, + "loss": 0.448, + "step": 1425 + }, + { + "epoch": 0.3575, + "grad_norm": 1.125, + "learning_rate": 3.582181453801036e-05, + "loss": 0.4645, + "step": 1430 + }, + { + "epoch": 0.35875, + "grad_norm": 1.6015625, + "learning_rate": 3.5733231173419754e-05, + "loss": 0.4578, + "step": 1435 + }, + { + "epoch": 0.36, + "grad_norm": 1.0, + "learning_rate": 3.564448228912682e-05, + "loss": 0.4704, + "step": 1440 + }, + { + "epoch": 0.36125, + "grad_norm": 1.8671875, + "learning_rate": 3.555556925374914e-05, + "loss": 0.4383, + "step": 1445 + }, + { + "epoch": 0.3625, + "grad_norm": 1.2734375, + "learning_rate": 3.54664934384357e-05, + "loss": 0.4192, + "step": 1450 + }, + { + "epoch": 0.36375, + "grad_norm": 0.94921875, + "learning_rate": 3.5377256216845785e-05, + "loss": 0.5063, + "step": 1455 + }, + { + "epoch": 0.365, + "grad_norm": 1.203125, + "learning_rate": 3.528785896512772e-05, + "loss": 0.4711, + "step": 1460 + }, + { + "epoch": 0.36625, + "grad_norm": 1.15625, + "learning_rate": 3.519830306189773e-05, + "loss": 0.4494, + "step": 1465 + }, + { + "epoch": 0.3675, + "grad_norm": 1.1796875, + "learning_rate": 3.510858988821863e-05, + "loss": 0.4972, + "step": 1470 + }, + { + "epoch": 0.36875, + "grad_norm": 1.4765625, + "learning_rate": 3.5018720827578524e-05, + "loss": 0.4312, + "step": 1475 + }, + { + "epoch": 0.37, + "grad_norm": 0.99609375, + "learning_rate": 3.4928697265869515e-05, + "loss": 0.4267, + "step": 1480 + }, + { + "epoch": 0.37125, + "grad_norm": 1.390625, + "learning_rate": 3.483852059136629e-05, + "loss": 0.4563, + "step": 1485 + }, + { + "epoch": 0.3725, + "grad_norm": 0.99609375, + "learning_rate": 3.474819219470471e-05, + "loss": 0.4642, + "step": 1490 + }, + { + "epoch": 0.37375, + "grad_norm": 1.1796875, + "learning_rate": 3.4657713468860405e-05, + "loss": 0.414, + "step": 1495 + }, + { + "epoch": 0.375, + "grad_norm": 1.140625, + "learning_rate": 3.456708580912725e-05, + "loss": 0.4919, + "step": 1500 + }, + { + "epoch": 0.37625, + "grad_norm": 1.25, + "learning_rate": 3.447631061309587e-05, + "loss": 0.5023, + "step": 1505 + }, + { + "epoch": 0.3775, + "grad_norm": 1.140625, + "learning_rate": 3.438538928063208e-05, + "loss": 0.469, + "step": 1510 + }, + { + "epoch": 0.37875, + "grad_norm": 1.0859375, + "learning_rate": 3.4294323213855305e-05, + "loss": 0.4322, + "step": 1515 + }, + { + "epoch": 0.38, + "grad_norm": 0.9296875, + "learning_rate": 3.4203113817116957e-05, + "loss": 0.4393, + "step": 1520 + }, + { + "epoch": 0.38125, + "grad_norm": 1.1015625, + "learning_rate": 3.411176249697875e-05, + "loss": 0.4005, + "step": 1525 + }, + { + "epoch": 0.3825, + "grad_norm": 1.4609375, + "learning_rate": 3.402027066219105e-05, + "loss": 0.4094, + "step": 1530 + }, + { + "epoch": 0.38375, + "grad_norm": 1.4921875, + "learning_rate": 3.392863972367114e-05, + "loss": 0.4474, + "step": 1535 + }, + { + "epoch": 0.385, + "grad_norm": 1.265625, + "learning_rate": 3.383687109448143e-05, + "loss": 0.399, + "step": 1540 + }, + { + "epoch": 0.38625, + "grad_norm": 1.3515625, + "learning_rate": 3.374496618980772e-05, + "loss": 0.4342, + "step": 1545 + }, + { + "epoch": 0.3875, + "grad_norm": 1.2890625, + "learning_rate": 3.365292642693732e-05, + "loss": 0.4847, + "step": 1550 + }, + { + "epoch": 0.38875, + "grad_norm": 1.046875, + "learning_rate": 3.356075322523725e-05, + "loss": 0.4343, + "step": 1555 + }, + { + "epoch": 0.39, + "grad_norm": 0.9921875, + "learning_rate": 3.346844800613229e-05, + "loss": 0.498, + "step": 1560 + }, + { + "epoch": 0.39125, + "grad_norm": 1.0703125, + "learning_rate": 3.33760121930831e-05, + "loss": 0.4737, + "step": 1565 + }, + { + "epoch": 0.3925, + "grad_norm": 0.99609375, + "learning_rate": 3.3283447211564276e-05, + "loss": 0.4965, + "step": 1570 + }, + { + "epoch": 0.39375, + "grad_norm": 1.3359375, + "learning_rate": 3.319075448904234e-05, + "loss": 0.4626, + "step": 1575 + }, + { + "epoch": 0.395, + "grad_norm": 1.1640625, + "learning_rate": 3.309793545495374e-05, + "loss": 0.5161, + "step": 1580 + }, + { + "epoch": 0.39625, + "grad_norm": 1.21875, + "learning_rate": 3.3004991540682796e-05, + "loss": 0.4371, + "step": 1585 + }, + { + "epoch": 0.3975, + "grad_norm": 1.109375, + "learning_rate": 3.2911924179539656e-05, + "loss": 0.4427, + "step": 1590 + }, + { + "epoch": 0.39875, + "grad_norm": 0.93359375, + "learning_rate": 3.281873480673815e-05, + "loss": 0.4318, + "step": 1595 + }, + { + "epoch": 0.4, + "grad_norm": 1.046875, + "learning_rate": 3.272542485937369e-05, + "loss": 0.4756, + "step": 1600 + }, + { + "epoch": 0.40125, + "grad_norm": 1.0390625, + "learning_rate": 3.2631995776401094e-05, + "loss": 0.4507, + "step": 1605 + }, + { + "epoch": 0.4025, + "grad_norm": 1.03125, + "learning_rate": 3.253844899861239e-05, + "loss": 0.4444, + "step": 1610 + }, + { + "epoch": 0.40375, + "grad_norm": 1.046875, + "learning_rate": 3.244478596861464e-05, + "loss": 0.4291, + "step": 1615 + }, + { + "epoch": 0.405, + "grad_norm": 1.125, + "learning_rate": 3.23510081308076e-05, + "loss": 0.4615, + "step": 1620 + }, + { + "epoch": 0.40625, + "grad_norm": 1.0546875, + "learning_rate": 3.225711693136156e-05, + "loss": 0.4347, + "step": 1625 + }, + { + "epoch": 0.4075, + "grad_norm": 1.3046875, + "learning_rate": 3.2163113818194964e-05, + "loss": 0.4349, + "step": 1630 + }, + { + "epoch": 0.40875, + "grad_norm": 1.609375, + "learning_rate": 3.206900024095208e-05, + "loss": 0.4814, + "step": 1635 + }, + { + "epoch": 0.41, + "grad_norm": 1.3828125, + "learning_rate": 3.1974777650980735e-05, + "loss": 0.502, + "step": 1640 + }, + { + "epoch": 0.41125, + "grad_norm": 1.5234375, + "learning_rate": 3.188044750130979e-05, + "loss": 0.4457, + "step": 1645 + }, + { + "epoch": 0.4125, + "grad_norm": 1.140625, + "learning_rate": 3.178601124662686e-05, + "loss": 0.505, + "step": 1650 + }, + { + "epoch": 0.41375, + "grad_norm": 1.109375, + "learning_rate": 3.169147034325582e-05, + "loss": 0.4941, + "step": 1655 + }, + { + "epoch": 0.415, + "grad_norm": 1.078125, + "learning_rate": 3.1596826249134324e-05, + "loss": 0.4524, + "step": 1660 + }, + { + "epoch": 0.41625, + "grad_norm": 1.1015625, + "learning_rate": 3.150208042379142e-05, + "loss": 0.4826, + "step": 1665 + }, + { + "epoch": 0.4175, + "grad_norm": 1.109375, + "learning_rate": 3.140723432832492e-05, + "loss": 0.4101, + "step": 1670 + }, + { + "epoch": 0.41875, + "grad_norm": 1.3359375, + "learning_rate": 3.131228942537895e-05, + "loss": 0.4068, + "step": 1675 + }, + { + "epoch": 0.42, + "grad_norm": 1.53125, + "learning_rate": 3.121724717912138e-05, + "loss": 0.4341, + "step": 1680 + }, + { + "epoch": 0.42125, + "grad_norm": 1.2109375, + "learning_rate": 3.112210905522119e-05, + "loss": 0.4197, + "step": 1685 + }, + { + "epoch": 0.4225, + "grad_norm": 1.3046875, + "learning_rate": 3.102687652082597e-05, + "loss": 0.4257, + "step": 1690 + }, + { + "epoch": 0.42375, + "grad_norm": 1.25, + "learning_rate": 3.0931551044539194e-05, + "loss": 0.4513, + "step": 1695 + }, + { + "epoch": 0.425, + "grad_norm": 1.21875, + "learning_rate": 3.083613409639764e-05, + "loss": 0.4757, + "step": 1700 + }, + { + "epoch": 0.42625, + "grad_norm": 1.2421875, + "learning_rate": 3.0740627147848675e-05, + "loss": 0.441, + "step": 1705 + }, + { + "epoch": 0.4275, + "grad_norm": 1.2265625, + "learning_rate": 3.06450316717276e-05, + "loss": 0.4249, + "step": 1710 + }, + { + "epoch": 0.42875, + "grad_norm": 1.1953125, + "learning_rate": 3.05493491422349e-05, + "loss": 0.4189, + "step": 1715 + }, + { + "epoch": 0.43, + "grad_norm": 1.3203125, + "learning_rate": 3.045358103491357e-05, + "loss": 0.4315, + "step": 1720 + }, + { + "epoch": 0.43125, + "grad_norm": 1.15625, + "learning_rate": 3.035772882662627e-05, + "loss": 0.4641, + "step": 1725 + }, + { + "epoch": 0.4325, + "grad_norm": 1.390625, + "learning_rate": 3.026179399553264e-05, + "loss": 0.4701, + "step": 1730 + }, + { + "epoch": 0.43375, + "grad_norm": 1.421875, + "learning_rate": 3.0165778021066453e-05, + "loss": 0.4827, + "step": 1735 + }, + { + "epoch": 0.435, + "grad_norm": 1.3203125, + "learning_rate": 3.0069682383912813e-05, + "loss": 0.4439, + "step": 1740 + }, + { + "epoch": 0.43625, + "grad_norm": 1.28125, + "learning_rate": 2.9973508565985313e-05, + "loss": 0.4916, + "step": 1745 + }, + { + "epoch": 0.4375, + "grad_norm": 1.4140625, + "learning_rate": 2.9877258050403212e-05, + "loss": 0.464, + "step": 1750 + }, + { + "epoch": 0.43875, + "grad_norm": 1.0078125, + "learning_rate": 2.9780932321468515e-05, + "loss": 0.4105, + "step": 1755 + }, + { + "epoch": 0.44, + "grad_norm": 1.3125, + "learning_rate": 2.9684532864643122e-05, + "loss": 0.4312, + "step": 1760 + }, + { + "epoch": 0.44125, + "grad_norm": 1.1875, + "learning_rate": 2.9588061166525914e-05, + "loss": 0.4465, + "step": 1765 + }, + { + "epoch": 0.4425, + "grad_norm": 1.5, + "learning_rate": 2.949151871482982e-05, + "loss": 0.4136, + "step": 1770 + }, + { + "epoch": 0.44375, + "grad_norm": 1.21875, + "learning_rate": 2.9394906998358868e-05, + "loss": 0.4107, + "step": 1775 + }, + { + "epoch": 0.445, + "grad_norm": 0.98828125, + "learning_rate": 2.929822750698524e-05, + "loss": 0.4327, + "step": 1780 + }, + { + "epoch": 0.44625, + "grad_norm": 1.2890625, + "learning_rate": 2.92014817316263e-05, + "loss": 0.4597, + "step": 1785 + }, + { + "epoch": 0.4475, + "grad_norm": 1.046875, + "learning_rate": 2.9104671164221576e-05, + "loss": 0.4685, + "step": 1790 + }, + { + "epoch": 0.44875, + "grad_norm": 1.046875, + "learning_rate": 2.9007797297709782e-05, + "loss": 0.451, + "step": 1795 + }, + { + "epoch": 0.45, + "grad_norm": 1.40625, + "learning_rate": 2.8910861626005776e-05, + "loss": 0.4101, + "step": 1800 + }, + { + "epoch": 0.45125, + "grad_norm": 1.2109375, + "learning_rate": 2.8813865643977526e-05, + "loss": 0.4775, + "step": 1805 + }, + { + "epoch": 0.4525, + "grad_norm": 1.234375, + "learning_rate": 2.871681084742308e-05, + "loss": 0.4588, + "step": 1810 + }, + { + "epoch": 0.45375, + "grad_norm": 1.265625, + "learning_rate": 2.8619698733047447e-05, + "loss": 0.4476, + "step": 1815 + }, + { + "epoch": 0.455, + "grad_norm": 1.140625, + "learning_rate": 2.8522530798439567e-05, + "loss": 0.4375, + "step": 1820 + }, + { + "epoch": 0.45625, + "grad_norm": 1.015625, + "learning_rate": 2.8425308542049206e-05, + "loss": 0.422, + "step": 1825 + }, + { + "epoch": 0.4575, + "grad_norm": 0.94921875, + "learning_rate": 2.832803346316381e-05, + "loss": 0.4887, + "step": 1830 + }, + { + "epoch": 0.45875, + "grad_norm": 1.1796875, + "learning_rate": 2.8230707061885443e-05, + "loss": 0.4136, + "step": 1835 + }, + { + "epoch": 0.46, + "grad_norm": 1.140625, + "learning_rate": 2.8133330839107608e-05, + "loss": 0.4236, + "step": 1840 + }, + { + "epoch": 0.46125, + "grad_norm": 1.078125, + "learning_rate": 2.803590629649212e-05, + "loss": 0.4983, + "step": 1845 + }, + { + "epoch": 0.4625, + "grad_norm": 1.265625, + "learning_rate": 2.7938434936445945e-05, + "loss": 0.4988, + "step": 1850 + }, + { + "epoch": 0.46375, + "grad_norm": 0.8828125, + "learning_rate": 2.784091826209803e-05, + "loss": 0.4337, + "step": 1855 + }, + { + "epoch": 0.465, + "grad_norm": 1.1640625, + "learning_rate": 2.774335777727613e-05, + "loss": 0.4574, + "step": 1860 + }, + { + "epoch": 0.46625, + "grad_norm": 0.95703125, + "learning_rate": 2.764575498648362e-05, + "loss": 0.4606, + "step": 1865 + }, + { + "epoch": 0.4675, + "grad_norm": 1.1328125, + "learning_rate": 2.754811139487625e-05, + "loss": 0.4489, + "step": 1870 + }, + { + "epoch": 0.46875, + "grad_norm": 1.1171875, + "learning_rate": 2.7450428508239024e-05, + "loss": 0.4016, + "step": 1875 + }, + { + "epoch": 0.47, + "grad_norm": 1.1328125, + "learning_rate": 2.7352707832962865e-05, + "loss": 0.4191, + "step": 1880 + }, + { + "epoch": 0.47125, + "grad_norm": 1.2109375, + "learning_rate": 2.725495087602148e-05, + "loss": 0.5397, + "step": 1885 + }, + { + "epoch": 0.4725, + "grad_norm": 1.4609375, + "learning_rate": 2.7157159144948092e-05, + "loss": 0.4646, + "step": 1890 + }, + { + "epoch": 0.47375, + "grad_norm": 0.9453125, + "learning_rate": 2.7059334147812142e-05, + "loss": 0.4443, + "step": 1895 + }, + { + "epoch": 0.475, + "grad_norm": 0.95703125, + "learning_rate": 2.6961477393196126e-05, + "loss": 0.4943, + "step": 1900 + }, + { + "epoch": 0.47625, + "grad_norm": 0.8046875, + "learning_rate": 2.6863590390172243e-05, + "loss": 0.4654, + "step": 1905 + }, + { + "epoch": 0.4775, + "grad_norm": 1.2890625, + "learning_rate": 2.6765674648279172e-05, + "loss": 0.4517, + "step": 1910 + }, + { + "epoch": 0.47875, + "grad_norm": 1.1875, + "learning_rate": 2.666773167749878e-05, + "loss": 0.4525, + "step": 1915 + }, + { + "epoch": 0.48, + "grad_norm": 1.09375, + "learning_rate": 2.656976298823284e-05, + "loss": 0.4676, + "step": 1920 + }, + { + "epoch": 0.48125, + "grad_norm": 1.4609375, + "learning_rate": 2.6471770091279724e-05, + "loss": 0.495, + "step": 1925 + }, + { + "epoch": 0.4825, + "grad_norm": 1.0234375, + "learning_rate": 2.637375449781115e-05, + "loss": 0.4322, + "step": 1930 + }, + { + "epoch": 0.48375, + "grad_norm": 1.1328125, + "learning_rate": 2.627571771934879e-05, + "loss": 0.4147, + "step": 1935 + }, + { + "epoch": 0.485, + "grad_norm": 1.078125, + "learning_rate": 2.6177661267741065e-05, + "loss": 0.4204, + "step": 1940 + }, + { + "epoch": 0.48625, + "grad_norm": 0.98828125, + "learning_rate": 2.607958665513976e-05, + "loss": 0.4245, + "step": 1945 + }, + { + "epoch": 0.4875, + "grad_norm": 0.95703125, + "learning_rate": 2.598149539397672e-05, + "loss": 0.4582, + "step": 1950 + }, + { + "epoch": 0.48875, + "grad_norm": 1.46875, + "learning_rate": 2.5883388996940534e-05, + "loss": 0.4445, + "step": 1955 + }, + { + "epoch": 0.49, + "grad_norm": 1.25, + "learning_rate": 2.578526897695321e-05, + "loss": 0.4533, + "step": 1960 + }, + { + "epoch": 0.49125, + "grad_norm": 1.0390625, + "learning_rate": 2.5687136847146838e-05, + "loss": 0.4334, + "step": 1965 + }, + { + "epoch": 0.4925, + "grad_norm": 0.89453125, + "learning_rate": 2.558899412084026e-05, + "loss": 0.434, + "step": 1970 + }, + { + "epoch": 0.49375, + "grad_norm": 1.25, + "learning_rate": 2.5490842311515707e-05, + "loss": 0.4257, + "step": 1975 + }, + { + "epoch": 0.495, + "grad_norm": 1.6953125, + "learning_rate": 2.539268293279552e-05, + "loss": 0.4503, + "step": 1980 + }, + { + "epoch": 0.49625, + "grad_norm": 1.234375, + "learning_rate": 2.529451749841873e-05, + "loss": 0.5045, + "step": 1985 + }, + { + "epoch": 0.4975, + "grad_norm": 1.3046875, + "learning_rate": 2.5196347522217784e-05, + "loss": 0.4307, + "step": 1990 + }, + { + "epoch": 0.49875, + "grad_norm": 1.0078125, + "learning_rate": 2.509817451809515e-05, + "loss": 0.4701, + "step": 1995 + }, + { + "epoch": 0.5, + "grad_norm": 1.3046875, + "learning_rate": 2.5e-05, + "loss": 0.4573, + "step": 2000 + }, + { + "epoch": 0.50125, + "grad_norm": 1.2421875, + "learning_rate": 2.4901825481904855e-05, + "loss": 0.4304, + "step": 2005 + }, + { + "epoch": 0.5025, + "grad_norm": 1.578125, + "learning_rate": 2.480365247778223e-05, + "loss": 0.4334, + "step": 2010 + }, + { + "epoch": 0.50375, + "grad_norm": 1.0390625, + "learning_rate": 2.4705482501581266e-05, + "loss": 0.4507, + "step": 2015 + }, + { + "epoch": 0.505, + "grad_norm": 1.3984375, + "learning_rate": 2.460731706720449e-05, + "loss": 0.4555, + "step": 2020 + }, + { + "epoch": 0.50625, + "grad_norm": 1.1328125, + "learning_rate": 2.4509157688484295e-05, + "loss": 0.4791, + "step": 2025 + }, + { + "epoch": 0.5075, + "grad_norm": 1.171875, + "learning_rate": 2.4411005879159753e-05, + "loss": 0.4324, + "step": 2030 + }, + { + "epoch": 0.50875, + "grad_norm": 1.3828125, + "learning_rate": 2.4312863152853165e-05, + "loss": 0.4534, + "step": 2035 + }, + { + "epoch": 0.51, + "grad_norm": 1.5546875, + "learning_rate": 2.4214731023046793e-05, + "loss": 0.4411, + "step": 2040 + }, + { + "epoch": 0.51125, + "grad_norm": 1.3359375, + "learning_rate": 2.4116611003059472e-05, + "loss": 0.4333, + "step": 2045 + }, + { + "epoch": 0.5125, + "grad_norm": 1.46875, + "learning_rate": 2.4018504606023293e-05, + "loss": 0.4231, + "step": 2050 + }, + { + "epoch": 0.51375, + "grad_norm": 1.4140625, + "learning_rate": 2.392041334486024e-05, + "loss": 0.3752, + "step": 2055 + }, + { + "epoch": 0.515, + "grad_norm": 1.1328125, + "learning_rate": 2.3822338732258937e-05, + "loss": 0.4876, + "step": 2060 + }, + { + "epoch": 0.51625, + "grad_norm": 0.91796875, + "learning_rate": 2.3724282280651214e-05, + "loss": 0.3989, + "step": 2065 + }, + { + "epoch": 0.5175, + "grad_norm": 0.859375, + "learning_rate": 2.3626245502188864e-05, + "loss": 0.4102, + "step": 2070 + }, + { + "epoch": 0.51875, + "grad_norm": 0.9921875, + "learning_rate": 2.3528229908720272e-05, + "loss": 0.3997, + "step": 2075 + }, + { + "epoch": 0.52, + "grad_norm": 0.71484375, + "learning_rate": 2.3430237011767167e-05, + "loss": 0.4009, + "step": 2080 + }, + { + "epoch": 0.52125, + "grad_norm": 1.046875, + "learning_rate": 2.3332268322501228e-05, + "loss": 0.4769, + "step": 2085 + }, + { + "epoch": 0.5225, + "grad_norm": 1.078125, + "learning_rate": 2.323432535172084e-05, + "loss": 0.4405, + "step": 2090 + }, + { + "epoch": 0.52375, + "grad_norm": 1.0625, + "learning_rate": 2.313640960982776e-05, + "loss": 0.4436, + "step": 2095 + }, + { + "epoch": 0.525, + "grad_norm": 1.09375, + "learning_rate": 2.303852260680388e-05, + "loss": 0.4027, + "step": 2100 + }, + { + "epoch": 0.52625, + "grad_norm": 1.2890625, + "learning_rate": 2.294066585218786e-05, + "loss": 0.4086, + "step": 2105 + }, + { + "epoch": 0.5275, + "grad_norm": 1.046875, + "learning_rate": 2.284284085505192e-05, + "loss": 0.4262, + "step": 2110 + }, + { + "epoch": 0.52875, + "grad_norm": 1.484375, + "learning_rate": 2.274504912397852e-05, + "loss": 0.4605, + "step": 2115 + }, + { + "epoch": 0.53, + "grad_norm": 1.2734375, + "learning_rate": 2.2647292167037144e-05, + "loss": 0.4534, + "step": 2120 + }, + { + "epoch": 0.53125, + "grad_norm": 0.890625, + "learning_rate": 2.2549571491760986e-05, + "loss": 0.3628, + "step": 2125 + }, + { + "epoch": 0.5325, + "grad_norm": 1.3046875, + "learning_rate": 2.2451888605123754e-05, + "loss": 0.4879, + "step": 2130 + }, + { + "epoch": 0.53375, + "grad_norm": 0.96875, + "learning_rate": 2.2354245013516393e-05, + "loss": 0.4517, + "step": 2135 + }, + { + "epoch": 0.535, + "grad_norm": 1.1640625, + "learning_rate": 2.225664222272387e-05, + "loss": 0.4303, + "step": 2140 + }, + { + "epoch": 0.53625, + "grad_norm": 0.83203125, + "learning_rate": 2.2159081737901975e-05, + "loss": 0.4172, + "step": 2145 + }, + { + "epoch": 0.5375, + "grad_norm": 1.125, + "learning_rate": 2.2061565063554064e-05, + "loss": 0.4169, + "step": 2150 + }, + { + "epoch": 0.53875, + "grad_norm": 1.25, + "learning_rate": 2.1964093703507893e-05, + "loss": 0.4839, + "step": 2155 + }, + { + "epoch": 0.54, + "grad_norm": 0.8359375, + "learning_rate": 2.186666916089239e-05, + "loss": 0.3919, + "step": 2160 + }, + { + "epoch": 0.54125, + "grad_norm": 1.2421875, + "learning_rate": 2.1769292938114563e-05, + "loss": 0.4435, + "step": 2165 + }, + { + "epoch": 0.5425, + "grad_norm": 1.3046875, + "learning_rate": 2.1671966536836196e-05, + "loss": 0.4902, + "step": 2170 + }, + { + "epoch": 0.54375, + "grad_norm": 1.296875, + "learning_rate": 2.1574691457950803e-05, + "loss": 0.4667, + "step": 2175 + }, + { + "epoch": 0.545, + "grad_norm": 1.1015625, + "learning_rate": 2.1477469201560435e-05, + "loss": 0.3795, + "step": 2180 + }, + { + "epoch": 0.54625, + "grad_norm": 1.140625, + "learning_rate": 2.1380301266952556e-05, + "loss": 0.4658, + "step": 2185 + }, + { + "epoch": 0.5475, + "grad_norm": 1.6171875, + "learning_rate": 2.1283189152576925e-05, + "loss": 0.4589, + "step": 2190 + }, + { + "epoch": 0.54875, + "grad_norm": 1.3828125, + "learning_rate": 2.118613435602248e-05, + "loss": 0.4394, + "step": 2195 + }, + { + "epoch": 0.55, + "grad_norm": 1.0390625, + "learning_rate": 2.1089138373994223e-05, + "loss": 0.4321, + "step": 2200 + }, + { + "epoch": 0.55125, + "grad_norm": 1.3046875, + "learning_rate": 2.0992202702290227e-05, + "loss": 0.4084, + "step": 2205 + }, + { + "epoch": 0.5525, + "grad_norm": 1.2890625, + "learning_rate": 2.089532883577843e-05, + "loss": 0.4489, + "step": 2210 + }, + { + "epoch": 0.55375, + "grad_norm": 1.2421875, + "learning_rate": 2.0798518268373706e-05, + "loss": 0.4403, + "step": 2215 + }, + { + "epoch": 0.555, + "grad_norm": 1.1796875, + "learning_rate": 2.070177249301476e-05, + "loss": 0.4286, + "step": 2220 + }, + { + "epoch": 0.55625, + "grad_norm": 1.28125, + "learning_rate": 2.0605093001641138e-05, + "loss": 0.4557, + "step": 2225 + }, + { + "epoch": 0.5575, + "grad_norm": 1.171875, + "learning_rate": 2.0508481285170186e-05, + "loss": 0.4686, + "step": 2230 + }, + { + "epoch": 0.55875, + "grad_norm": 1.375, + "learning_rate": 2.04119388334741e-05, + "loss": 0.4402, + "step": 2235 + }, + { + "epoch": 0.56, + "grad_norm": 1.1953125, + "learning_rate": 2.031546713535688e-05, + "loss": 0.3973, + "step": 2240 + }, + { + "epoch": 0.56125, + "grad_norm": 1.1015625, + "learning_rate": 2.0219067678531494e-05, + "loss": 0.4349, + "step": 2245 + }, + { + "epoch": 0.5625, + "grad_norm": 1.1015625, + "learning_rate": 2.0122741949596797e-05, + "loss": 0.4329, + "step": 2250 + }, + { + "epoch": 0.56375, + "grad_norm": 1.1875, + "learning_rate": 2.002649143401469e-05, + "loss": 0.4402, + "step": 2255 + }, + { + "epoch": 0.565, + "grad_norm": 1.0546875, + "learning_rate": 1.9930317616087196e-05, + "loss": 0.4342, + "step": 2260 + }, + { + "epoch": 0.56625, + "grad_norm": 0.98046875, + "learning_rate": 1.9834221978933543e-05, + "loss": 0.4537, + "step": 2265 + }, + { + "epoch": 0.5675, + "grad_norm": 1.2890625, + "learning_rate": 1.9738206004467363e-05, + "loss": 0.4597, + "step": 2270 + }, + { + "epoch": 0.56875, + "grad_norm": 1.4296875, + "learning_rate": 1.9642271173373737e-05, + "loss": 0.4372, + "step": 2275 + }, + { + "epoch": 0.57, + "grad_norm": 1.171875, + "learning_rate": 1.9546418965086442e-05, + "loss": 0.4062, + "step": 2280 + }, + { + "epoch": 0.57125, + "grad_norm": 1.453125, + "learning_rate": 1.9450650857765102e-05, + "loss": 0.4698, + "step": 2285 + }, + { + "epoch": 0.5725, + "grad_norm": 1.140625, + "learning_rate": 1.935496832827241e-05, + "loss": 0.4312, + "step": 2290 + }, + { + "epoch": 0.57375, + "grad_norm": 1.21875, + "learning_rate": 1.925937285215133e-05, + "loss": 0.4643, + "step": 2295 + }, + { + "epoch": 0.575, + "grad_norm": 1.1015625, + "learning_rate": 1.9163865903602374e-05, + "loss": 0.4256, + "step": 2300 + }, + { + "epoch": 0.57625, + "grad_norm": 1.1171875, + "learning_rate": 1.9068448955460805e-05, + "loss": 0.3879, + "step": 2305 + }, + { + "epoch": 0.5775, + "grad_norm": 1.5390625, + "learning_rate": 1.897312347917404e-05, + "loss": 0.4048, + "step": 2310 + }, + { + "epoch": 0.57875, + "grad_norm": 1.03125, + "learning_rate": 1.8877890944778815e-05, + "loss": 0.4572, + "step": 2315 + }, + { + "epoch": 0.58, + "grad_norm": 1.1171875, + "learning_rate": 1.8782752820878634e-05, + "loss": 0.4634, + "step": 2320 + }, + { + "epoch": 0.58125, + "grad_norm": 1.2265625, + "learning_rate": 1.868771057462105e-05, + "loss": 0.3985, + "step": 2325 + }, + { + "epoch": 0.5825, + "grad_norm": 1.1328125, + "learning_rate": 1.8592765671675084e-05, + "loss": 0.4463, + "step": 2330 + }, + { + "epoch": 0.58375, + "grad_norm": 0.91015625, + "learning_rate": 1.8497919576208585e-05, + "loss": 0.4083, + "step": 2335 + }, + { + "epoch": 0.585, + "grad_norm": 1.4921875, + "learning_rate": 1.8403173750865685e-05, + "loss": 0.3929, + "step": 2340 + }, + { + "epoch": 0.58625, + "grad_norm": 1.390625, + "learning_rate": 1.830852965674419e-05, + "loss": 0.4659, + "step": 2345 + }, + { + "epoch": 0.5875, + "grad_norm": 1.2109375, + "learning_rate": 1.8213988753373146e-05, + "loss": 0.3986, + "step": 2350 + }, + { + "epoch": 0.58875, + "grad_norm": 1.046875, + "learning_rate": 1.8119552498690215e-05, + "loss": 0.4043, + "step": 2355 + }, + { + "epoch": 0.59, + "grad_norm": 2.84375, + "learning_rate": 1.802522234901927e-05, + "loss": 0.449, + "step": 2360 + }, + { + "epoch": 0.59125, + "grad_norm": 1.40625, + "learning_rate": 1.793099975904791e-05, + "loss": 0.4178, + "step": 2365 + }, + { + "epoch": 0.5925, + "grad_norm": 1.390625, + "learning_rate": 1.783688618180504e-05, + "loss": 0.4422, + "step": 2370 + }, + { + "epoch": 0.59375, + "grad_norm": 0.94921875, + "learning_rate": 1.7742883068638447e-05, + "loss": 0.4666, + "step": 2375 + }, + { + "epoch": 0.595, + "grad_norm": 1.015625, + "learning_rate": 1.7648991869192405e-05, + "loss": 0.4226, + "step": 2380 + }, + { + "epoch": 0.59625, + "grad_norm": 1.40625, + "learning_rate": 1.7555214031385375e-05, + "loss": 0.408, + "step": 2385 + }, + { + "epoch": 0.5975, + "grad_norm": 1.7109375, + "learning_rate": 1.746155100138761e-05, + "loss": 0.4778, + "step": 2390 + }, + { + "epoch": 0.59875, + "grad_norm": 1.25, + "learning_rate": 1.7368004223598912e-05, + "loss": 0.4059, + "step": 2395 + }, + { + "epoch": 0.6, + "grad_norm": 1.296875, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.4398, + "step": 2400 + }, + { + "epoch": 0.60125, + "grad_norm": 1.1015625, + "learning_rate": 1.7181265193261865e-05, + "loss": 0.482, + "step": 2405 + }, + { + "epoch": 0.6025, + "grad_norm": 0.84765625, + "learning_rate": 1.7088075820460346e-05, + "loss": 0.4192, + "step": 2410 + }, + { + "epoch": 0.60375, + "grad_norm": 1.3125, + "learning_rate": 1.6995008459317206e-05, + "loss": 0.4748, + "step": 2415 + }, + { + "epoch": 0.605, + "grad_norm": 1.5703125, + "learning_rate": 1.690206454504627e-05, + "loss": 0.4276, + "step": 2420 + }, + { + "epoch": 0.60625, + "grad_norm": 1.125, + "learning_rate": 1.6809245510957665e-05, + "loss": 0.3548, + "step": 2425 + }, + { + "epoch": 0.6075, + "grad_norm": 1.0078125, + "learning_rate": 1.6716552788435724e-05, + "loss": 0.4122, + "step": 2430 + }, + { + "epoch": 0.60875, + "grad_norm": 1.1484375, + "learning_rate": 1.66239878069169e-05, + "loss": 0.432, + "step": 2435 + }, + { + "epoch": 0.61, + "grad_norm": 1.1015625, + "learning_rate": 1.6531551993867717e-05, + "loss": 0.4467, + "step": 2440 + }, + { + "epoch": 0.61125, + "grad_norm": 1.3828125, + "learning_rate": 1.643924677476276e-05, + "loss": 0.4652, + "step": 2445 + }, + { + "epoch": 0.6125, + "grad_norm": 1.1328125, + "learning_rate": 1.6347073573062672e-05, + "loss": 0.4024, + "step": 2450 + }, + { + "epoch": 0.61375, + "grad_norm": 1.1953125, + "learning_rate": 1.6255033810192282e-05, + "loss": 0.3974, + "step": 2455 + }, + { + "epoch": 0.615, + "grad_norm": 1.09375, + "learning_rate": 1.6163128905518578e-05, + "loss": 0.3891, + "step": 2460 + }, + { + "epoch": 0.61625, + "grad_norm": 1.328125, + "learning_rate": 1.6071360276328874e-05, + "loss": 0.3499, + "step": 2465 + }, + { + "epoch": 0.6175, + "grad_norm": 1.109375, + "learning_rate": 1.5979729337808955e-05, + "loss": 0.4386, + "step": 2470 + }, + { + "epoch": 0.61875, + "grad_norm": 1.1328125, + "learning_rate": 1.588823750302126e-05, + "loss": 0.4494, + "step": 2475 + }, + { + "epoch": 0.62, + "grad_norm": 1.4296875, + "learning_rate": 1.5796886182883053e-05, + "loss": 0.4025, + "step": 2480 + }, + { + "epoch": 0.62125, + "grad_norm": 1.6796875, + "learning_rate": 1.57056767861447e-05, + "loss": 0.4192, + "step": 2485 + }, + { + "epoch": 0.6225, + "grad_norm": 1.3046875, + "learning_rate": 1.561461071936792e-05, + "loss": 0.4509, + "step": 2490 + }, + { + "epoch": 0.62375, + "grad_norm": 1.2109375, + "learning_rate": 1.552368938690414e-05, + "loss": 0.3897, + "step": 2495 + }, + { + "epoch": 0.625, + "grad_norm": 1.28125, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.473, + "step": 2500 + }, + { + "epoch": 0.62625, + "grad_norm": 1.1328125, + "learning_rate": 1.5342286531139605e-05, + "loss": 0.4333, + "step": 2505 + }, + { + "epoch": 0.6275, + "grad_norm": 1.0546875, + "learning_rate": 1.5251807805295302e-05, + "loss": 0.4245, + "step": 2510 + }, + { + "epoch": 0.62875, + "grad_norm": 1.1953125, + "learning_rate": 1.5161479408633713e-05, + "loss": 0.4342, + "step": 2515 + }, + { + "epoch": 0.63, + "grad_norm": 1.0703125, + "learning_rate": 1.5071302734130489e-05, + "loss": 0.3951, + "step": 2520 + }, + { + "epoch": 0.63125, + "grad_norm": 1.2890625, + "learning_rate": 1.498127917242148e-05, + "loss": 0.4367, + "step": 2525 + }, + { + "epoch": 0.6325, + "grad_norm": 1.2578125, + "learning_rate": 1.4891410111781378e-05, + "loss": 0.4766, + "step": 2530 + }, + { + "epoch": 0.63375, + "grad_norm": 1.3828125, + "learning_rate": 1.4801696938102272e-05, + "loss": 0.373, + "step": 2535 + }, + { + "epoch": 0.635, + "grad_norm": 1.3046875, + "learning_rate": 1.4712141034872282e-05, + "loss": 0.3804, + "step": 2540 + }, + { + "epoch": 0.63625, + "grad_norm": 1.09375, + "learning_rate": 1.4622743783154223e-05, + "loss": 0.4206, + "step": 2545 + }, + { + "epoch": 0.6375, + "grad_norm": 1.1875, + "learning_rate": 1.4533506561564306e-05, + "loss": 0.4585, + "step": 2550 + }, + { + "epoch": 0.63875, + "grad_norm": 0.84765625, + "learning_rate": 1.4444430746250867e-05, + "loss": 0.3796, + "step": 2555 + }, + { + "epoch": 0.64, + "grad_norm": 1.0625, + "learning_rate": 1.4355517710873184e-05, + "loss": 0.4296, + "step": 2560 + }, + { + "epoch": 0.64125, + "grad_norm": 1.6796875, + "learning_rate": 1.4266768826580257e-05, + "loss": 0.5008, + "step": 2565 + }, + { + "epoch": 0.6425, + "grad_norm": 1.0625, + "learning_rate": 1.4178185461989662e-05, + "loss": 0.3952, + "step": 2570 + }, + { + "epoch": 0.64375, + "grad_norm": 1.0703125, + "learning_rate": 1.4089768983166444e-05, + "loss": 0.4494, + "step": 2575 + }, + { + "epoch": 0.645, + "grad_norm": 0.88671875, + "learning_rate": 1.4001520753602121e-05, + "loss": 0.3944, + "step": 2580 + }, + { + "epoch": 0.64625, + "grad_norm": 1.328125, + "learning_rate": 1.3913442134193544e-05, + "loss": 0.4276, + "step": 2585 + }, + { + "epoch": 0.6475, + "grad_norm": 1.4296875, + "learning_rate": 1.3825534483221974e-05, + "loss": 0.4433, + "step": 2590 + }, + { + "epoch": 0.64875, + "grad_norm": 1.1640625, + "learning_rate": 1.3737799156332143e-05, + "loss": 0.3992, + "step": 2595 + }, + { + "epoch": 0.65, + "grad_norm": 0.96875, + "learning_rate": 1.3650237506511331e-05, + "loss": 0.4488, + "step": 2600 + }, + { + "epoch": 0.65125, + "grad_norm": 1.3828125, + "learning_rate": 1.3562850884068487e-05, + "loss": 0.4243, + "step": 2605 + }, + { + "epoch": 0.6525, + "grad_norm": 1.1171875, + "learning_rate": 1.3475640636613446e-05, + "loss": 0.3477, + "step": 2610 + }, + { + "epoch": 0.65375, + "grad_norm": 1.2734375, + "learning_rate": 1.3388608109036086e-05, + "loss": 0.4413, + "step": 2615 + }, + { + "epoch": 0.655, + "grad_norm": 1.0625, + "learning_rate": 1.330175464348567e-05, + "loss": 0.4487, + "step": 2620 + }, + { + "epoch": 0.65625, + "grad_norm": 1.28125, + "learning_rate": 1.3215081579350058e-05, + "loss": 0.4122, + "step": 2625 + }, + { + "epoch": 0.6575, + "grad_norm": 1.09375, + "learning_rate": 1.312859025323514e-05, + "loss": 0.424, + "step": 2630 + }, + { + "epoch": 0.65875, + "grad_norm": 1.078125, + "learning_rate": 1.3042281998944151e-05, + "loss": 0.4013, + "step": 2635 + }, + { + "epoch": 0.66, + "grad_norm": 1.9375, + "learning_rate": 1.2956158147457115e-05, + "loss": 0.5066, + "step": 2640 + }, + { + "epoch": 0.66125, + "grad_norm": 1.25, + "learning_rate": 1.2870220026910407e-05, + "loss": 0.3935, + "step": 2645 + }, + { + "epoch": 0.6625, + "grad_norm": 1.21875, + "learning_rate": 1.2784468962576136e-05, + "loss": 0.4039, + "step": 2650 + }, + { + "epoch": 0.66375, + "grad_norm": 1.1875, + "learning_rate": 1.2698906276841776e-05, + "loss": 0.4817, + "step": 2655 + }, + { + "epoch": 0.665, + "grad_norm": 0.99609375, + "learning_rate": 1.261353328918981e-05, + "loss": 0.3917, + "step": 2660 + }, + { + "epoch": 0.66625, + "grad_norm": 1.2421875, + "learning_rate": 1.2528351316177319e-05, + "loss": 0.425, + "step": 2665 + }, + { + "epoch": 0.6675, + "grad_norm": 1.015625, + "learning_rate": 1.2443361671415687e-05, + "loss": 0.4234, + "step": 2670 + }, + { + "epoch": 0.66875, + "grad_norm": 1.1171875, + "learning_rate": 1.235856566555039e-05, + "loss": 0.4414, + "step": 2675 + }, + { + "epoch": 0.67, + "grad_norm": 1.40625, + "learning_rate": 1.2273964606240718e-05, + "loss": 0.4563, + "step": 2680 + }, + { + "epoch": 0.67125, + "grad_norm": 1.15625, + "learning_rate": 1.2189559798139682e-05, + "loss": 0.4132, + "step": 2685 + }, + { + "epoch": 0.6725, + "grad_norm": 1.765625, + "learning_rate": 1.2105352542873815e-05, + "loss": 0.4317, + "step": 2690 + }, + { + "epoch": 0.67375, + "grad_norm": 0.9296875, + "learning_rate": 1.2021344139023186e-05, + "loss": 0.4073, + "step": 2695 + }, + { + "epoch": 0.675, + "grad_norm": 1.4765625, + "learning_rate": 1.1937535882101281e-05, + "loss": 0.4147, + "step": 2700 + }, + { + "epoch": 0.67625, + "grad_norm": 1.5625, + "learning_rate": 1.1853929064535111e-05, + "loss": 0.4394, + "step": 2705 + }, + { + "epoch": 0.6775, + "grad_norm": 1.40625, + "learning_rate": 1.1770524975645238e-05, + "loss": 0.461, + "step": 2710 + }, + { + "epoch": 0.67875, + "grad_norm": 1.0, + "learning_rate": 1.1687324901625879e-05, + "loss": 0.4279, + "step": 2715 + }, + { + "epoch": 0.68, + "grad_norm": 0.9765625, + "learning_rate": 1.1604330125525079e-05, + "loss": 0.4201, + "step": 2720 + }, + { + "epoch": 0.68125, + "grad_norm": 0.98828125, + "learning_rate": 1.1521541927224994e-05, + "loss": 0.4392, + "step": 2725 + }, + { + "epoch": 0.6825, + "grad_norm": 1.140625, + "learning_rate": 1.1438961583422037e-05, + "loss": 0.4064, + "step": 2730 + }, + { + "epoch": 0.68375, + "grad_norm": 1.1796875, + "learning_rate": 1.1356590367607252e-05, + "loss": 0.4081, + "step": 2735 + }, + { + "epoch": 0.685, + "grad_norm": 1.1171875, + "learning_rate": 1.1274429550046704e-05, + "loss": 0.4629, + "step": 2740 + }, + { + "epoch": 0.68625, + "grad_norm": 0.9453125, + "learning_rate": 1.1192480397761837e-05, + "loss": 0.3942, + "step": 2745 + }, + { + "epoch": 0.6875, + "grad_norm": 0.9765625, + "learning_rate": 1.1110744174509952e-05, + "loss": 0.4581, + "step": 2750 + }, + { + "epoch": 0.68875, + "grad_norm": 1.1015625, + "learning_rate": 1.1029222140764712e-05, + "loss": 0.4079, + "step": 2755 + }, + { + "epoch": 0.69, + "grad_norm": 1.1328125, + "learning_rate": 1.0947915553696742e-05, + "loss": 0.3924, + "step": 2760 + }, + { + "epoch": 0.69125, + "grad_norm": 0.94921875, + "learning_rate": 1.0866825667154182e-05, + "loss": 0.3715, + "step": 2765 + }, + { + "epoch": 0.6925, + "grad_norm": 1.09375, + "learning_rate": 1.07859537316434e-05, + "loss": 0.4238, + "step": 2770 + }, + { + "epoch": 0.69375, + "grad_norm": 1.3203125, + "learning_rate": 1.0705300994309697e-05, + "loss": 0.4465, + "step": 2775 + }, + { + "epoch": 0.695, + "grad_norm": 1.046875, + "learning_rate": 1.0624868698918045e-05, + "loss": 0.4295, + "step": 2780 + }, + { + "epoch": 0.69625, + "grad_norm": 1.140625, + "learning_rate": 1.0544658085833919e-05, + "loss": 0.4527, + "step": 2785 + }, + { + "epoch": 0.6975, + "grad_norm": 1.1015625, + "learning_rate": 1.0464670392004235e-05, + "loss": 0.4721, + "step": 2790 + }, + { + "epoch": 0.69875, + "grad_norm": 1.234375, + "learning_rate": 1.0384906850938166e-05, + "loss": 0.4632, + "step": 2795 + }, + { + "epoch": 0.7, + "grad_norm": 1.03125, + "learning_rate": 1.0305368692688174e-05, + "loss": 0.4382, + "step": 2800 + }, + { + "epoch": 0.70125, + "grad_norm": 1.046875, + "learning_rate": 1.0226057143831064e-05, + "loss": 0.4699, + "step": 2805 + }, + { + "epoch": 0.7025, + "grad_norm": 1.4296875, + "learning_rate": 1.0146973427449038e-05, + "loss": 0.4368, + "step": 2810 + }, + { + "epoch": 0.70375, + "grad_norm": 0.9765625, + "learning_rate": 1.0068118763110824e-05, + "loss": 0.4513, + "step": 2815 + }, + { + "epoch": 0.705, + "grad_norm": 1.0, + "learning_rate": 9.989494366852904e-06, + "loss": 0.3863, + "step": 2820 + }, + { + "epoch": 0.70625, + "grad_norm": 1.0390625, + "learning_rate": 9.911101451160715e-06, + "loss": 0.3907, + "step": 2825 + }, + { + "epoch": 0.7075, + "grad_norm": 1.078125, + "learning_rate": 9.832941224950012e-06, + "loss": 0.4537, + "step": 2830 + }, + { + "epoch": 0.70875, + "grad_norm": 1.3125, + "learning_rate": 9.755014893548157e-06, + "loss": 0.4082, + "step": 2835 + }, + { + "epoch": 0.71, + "grad_norm": 1.1015625, + "learning_rate": 9.677323658675594e-06, + "loss": 0.3992, + "step": 2840 + }, + { + "epoch": 0.71125, + "grad_norm": 1.3125, + "learning_rate": 9.599868718427257e-06, + "loss": 0.4512, + "step": 2845 + }, + { + "epoch": 0.7125, + "grad_norm": 0.8984375, + "learning_rate": 9.522651267254149e-06, + "loss": 0.419, + "step": 2850 + }, + { + "epoch": 0.71375, + "grad_norm": 1.3671875, + "learning_rate": 9.445672495944899e-06, + "loss": 0.4542, + "step": 2855 + }, + { + "epoch": 0.715, + "grad_norm": 1.1640625, + "learning_rate": 9.368933591607378e-06, + "loss": 0.4554, + "step": 2860 + }, + { + "epoch": 0.71625, + "grad_norm": 1.3359375, + "learning_rate": 9.292435737650407e-06, + "loss": 0.4158, + "step": 2865 + }, + { + "epoch": 0.7175, + "grad_norm": 0.8671875, + "learning_rate": 9.216180113765558e-06, + "loss": 0.4145, + "step": 2870 + }, + { + "epoch": 0.71875, + "grad_norm": 1.1640625, + "learning_rate": 9.140167895908867e-06, + "loss": 0.4276, + "step": 2875 + }, + { + "epoch": 0.72, + "grad_norm": 1.125, + "learning_rate": 9.064400256282757e-06, + "loss": 0.4477, + "step": 2880 + }, + { + "epoch": 0.72125, + "grad_norm": 1.1796875, + "learning_rate": 8.988878363317979e-06, + "loss": 0.4563, + "step": 2885 + }, + { + "epoch": 0.7225, + "grad_norm": 1.234375, + "learning_rate": 8.913603381655528e-06, + "loss": 0.4396, + "step": 2890 + }, + { + "epoch": 0.72375, + "grad_norm": 1.5390625, + "learning_rate": 8.838576472128756e-06, + "loss": 0.4831, + "step": 2895 + }, + { + "epoch": 0.725, + "grad_norm": 1.03125, + "learning_rate": 8.763798791745411e-06, + "loss": 0.4437, + "step": 2900 + }, + { + "epoch": 0.72625, + "grad_norm": 1.140625, + "learning_rate": 8.689271493669837e-06, + "loss": 0.4639, + "step": 2905 + }, + { + "epoch": 0.7275, + "grad_norm": 1.421875, + "learning_rate": 8.614995727205156e-06, + "loss": 0.4215, + "step": 2910 + }, + { + "epoch": 0.72875, + "grad_norm": 1.28125, + "learning_rate": 8.540972637775572e-06, + "loss": 0.4615, + "step": 2915 + }, + { + "epoch": 0.73, + "grad_norm": 1.0703125, + "learning_rate": 8.467203366908707e-06, + "loss": 0.4043, + "step": 2920 + }, + { + "epoch": 0.73125, + "grad_norm": 1.3046875, + "learning_rate": 8.393689052217966e-06, + "loss": 0.4634, + "step": 2925 + }, + { + "epoch": 0.7325, + "grad_norm": 1.328125, + "learning_rate": 8.320430827385003e-06, + "loss": 0.4411, + "step": 2930 + }, + { + "epoch": 0.73375, + "grad_norm": 1.2265625, + "learning_rate": 8.24742982214231e-06, + "loss": 0.3556, + "step": 2935 + }, + { + "epoch": 0.735, + "grad_norm": 1.0234375, + "learning_rate": 8.174687162255672e-06, + "loss": 0.4456, + "step": 2940 + }, + { + "epoch": 0.73625, + "grad_norm": 0.8984375, + "learning_rate": 8.102203969506886e-06, + "loss": 0.4254, + "step": 2945 + }, + { + "epoch": 0.7375, + "grad_norm": 1.234375, + "learning_rate": 8.029981361676456e-06, + "loss": 0.427, + "step": 2950 + }, + { + "epoch": 0.73875, + "grad_norm": 1.0390625, + "learning_rate": 7.958020452526346e-06, + "loss": 0.4323, + "step": 2955 + }, + { + "epoch": 0.74, + "grad_norm": 1.125, + "learning_rate": 7.886322351782783e-06, + "loss": 0.3968, + "step": 2960 + }, + { + "epoch": 0.74125, + "grad_norm": 1.0234375, + "learning_rate": 7.814888165119186e-06, + "loss": 0.4628, + "step": 2965 + }, + { + "epoch": 0.7425, + "grad_norm": 1.0078125, + "learning_rate": 7.743718994139071e-06, + "loss": 0.4388, + "step": 2970 + }, + { + "epoch": 0.74375, + "grad_norm": 1.34375, + "learning_rate": 7.672815936359107e-06, + "loss": 0.4029, + "step": 2975 + }, + { + "epoch": 0.745, + "grad_norm": 1.0703125, + "learning_rate": 7.602180085192143e-06, + "loss": 0.4214, + "step": 2980 + }, + { + "epoch": 0.74625, + "grad_norm": 1.4296875, + "learning_rate": 7.531812529930398e-06, + "loss": 0.4165, + "step": 2985 + }, + { + "epoch": 0.7475, + "grad_norm": 1.1171875, + "learning_rate": 7.461714355728608e-06, + "loss": 0.4016, + "step": 2990 + }, + { + "epoch": 0.74875, + "grad_norm": 1.203125, + "learning_rate": 7.391886643587342e-06, + "loss": 0.4527, + "step": 2995 + }, + { + "epoch": 0.75, + "grad_norm": 1.15625, + "learning_rate": 7.3223304703363135e-06, + "loss": 0.4143, + "step": 3000 + }, + { + "epoch": 0.75125, + "grad_norm": 1.328125, + "learning_rate": 7.253046908617747e-06, + "loss": 0.4667, + "step": 3005 + }, + { + "epoch": 0.7525, + "grad_norm": 1.2734375, + "learning_rate": 7.184037026869867e-06, + "loss": 0.4032, + "step": 3010 + }, + { + "epoch": 0.75375, + "grad_norm": 1.0859375, + "learning_rate": 7.115301889310427e-06, + "loss": 0.433, + "step": 3015 + }, + { + "epoch": 0.755, + "grad_norm": 1.046875, + "learning_rate": 7.046842555920283e-06, + "loss": 0.4017, + "step": 3020 + }, + { + "epoch": 0.75625, + "grad_norm": 1.0546875, + "learning_rate": 6.9786600824270296e-06, + "loss": 0.4006, + "step": 3025 + }, + { + "epoch": 0.7575, + "grad_norm": 1.3515625, + "learning_rate": 6.91075552028877e-06, + "loss": 0.4536, + "step": 3030 + }, + { + "epoch": 0.75875, + "grad_norm": 1.4609375, + "learning_rate": 6.84312991667784e-06, + "loss": 0.4295, + "step": 3035 + }, + { + "epoch": 0.76, + "grad_norm": 0.9375, + "learning_rate": 6.775784314464717e-06, + "loss": 0.4216, + "step": 3040 + }, + { + "epoch": 0.76125, + "grad_norm": 1.1328125, + "learning_rate": 6.708719752201884e-06, + "loss": 0.4071, + "step": 3045 + }, + { + "epoch": 0.7625, + "grad_norm": 0.87109375, + "learning_rate": 6.641937264107867e-06, + "loss": 0.4518, + "step": 3050 + }, + { + "epoch": 0.76375, + "grad_norm": 1.453125, + "learning_rate": 6.575437880051233e-06, + "loss": 0.4776, + "step": 3055 + }, + { + "epoch": 0.765, + "grad_norm": 1.1796875, + "learning_rate": 6.509222625534755e-06, + "loss": 0.4084, + "step": 3060 + }, + { + "epoch": 0.76625, + "grad_norm": 1.4609375, + "learning_rate": 6.443292521679578e-06, + "loss": 0.4825, + "step": 3065 + }, + { + "epoch": 0.7675, + "grad_norm": 1.171875, + "learning_rate": 6.377648585209456e-06, + "loss": 0.4788, + "step": 3070 + }, + { + "epoch": 0.76875, + "grad_norm": 1.1953125, + "learning_rate": 6.312291828435077e-06, + "loss": 0.4077, + "step": 3075 + }, + { + "epoch": 0.77, + "grad_norm": 0.98046875, + "learning_rate": 6.247223259238511e-06, + "loss": 0.4103, + "step": 3080 + }, + { + "epoch": 0.77125, + "grad_norm": 1.21875, + "learning_rate": 6.182443881057576e-06, + "loss": 0.4401, + "step": 3085 + }, + { + "epoch": 0.7725, + "grad_norm": 1.3984375, + "learning_rate": 6.117954692870412e-06, + "loss": 0.4628, + "step": 3090 + }, + { + "epoch": 0.77375, + "grad_norm": 1.15625, + "learning_rate": 6.053756689180082e-06, + "loss": 0.3789, + "step": 3095 + }, + { + "epoch": 0.775, + "grad_norm": 0.9609375, + "learning_rate": 5.989850859999227e-06, + "loss": 0.4261, + "step": 3100 + }, + { + "epoch": 0.77625, + "grad_norm": 1.3359375, + "learning_rate": 5.926238190834779e-06, + "loss": 0.4548, + "step": 3105 + }, + { + "epoch": 0.7775, + "grad_norm": 1.125, + "learning_rate": 5.8629196626728e-06, + "loss": 0.4496, + "step": 3110 + }, + { + "epoch": 0.77875, + "grad_norm": 0.984375, + "learning_rate": 5.7998962519633045e-06, + "loss": 0.3764, + "step": 3115 + }, + { + "epoch": 0.78, + "grad_norm": 1.6015625, + "learning_rate": 5.737168930605272e-06, + "loss": 0.3888, + "step": 3120 + }, + { + "epoch": 0.78125, + "grad_norm": 1.015625, + "learning_rate": 5.674738665931575e-06, + "loss": 0.4209, + "step": 3125 + }, + { + "epoch": 0.7825, + "grad_norm": 1.3203125, + "learning_rate": 5.612606420694141e-06, + "loss": 0.4727, + "step": 3130 + }, + { + "epoch": 0.78375, + "grad_norm": 1.0546875, + "learning_rate": 5.550773153049046e-06, + "loss": 0.4365, + "step": 3135 + }, + { + "epoch": 0.785, + "grad_norm": 1.2421875, + "learning_rate": 5.489239816541755e-06, + "loss": 0.4403, + "step": 3140 + }, + { + "epoch": 0.78625, + "grad_norm": 1.390625, + "learning_rate": 5.428007360092463e-06, + "loss": 0.4521, + "step": 3145 + }, + { + "epoch": 0.7875, + "grad_norm": 1.25, + "learning_rate": 5.367076727981382e-06, + "loss": 0.4657, + "step": 3150 + }, + { + "epoch": 0.78875, + "grad_norm": 1.015625, + "learning_rate": 5.306448859834228e-06, + "loss": 0.4367, + "step": 3155 + }, + { + "epoch": 0.79, + "grad_norm": 1.1015625, + "learning_rate": 5.24612469060774e-06, + "loss": 0.4053, + "step": 3160 + }, + { + "epoch": 0.79125, + "grad_norm": 1.515625, + "learning_rate": 5.186105150575232e-06, + "loss": 0.3926, + "step": 3165 + }, + { + "epoch": 0.7925, + "grad_norm": 1.34375, + "learning_rate": 5.12639116531225e-06, + "loss": 0.4534, + "step": 3170 + }, + { + "epoch": 0.79375, + "grad_norm": 1.3125, + "learning_rate": 5.066983655682325e-06, + "loss": 0.4551, + "step": 3175 + }, + { + "epoch": 0.795, + "grad_norm": 1.375, + "learning_rate": 5.007883537822736e-06, + "loss": 0.4066, + "step": 3180 + }, + { + "epoch": 0.79625, + "grad_norm": 0.75, + "learning_rate": 4.949091723130425e-06, + "loss": 0.4247, + "step": 3185 + }, + { + "epoch": 0.7975, + "grad_norm": 1.6640625, + "learning_rate": 4.890609118247888e-06, + "loss": 0.4215, + "step": 3190 + }, + { + "epoch": 0.79875, + "grad_norm": 1.4296875, + "learning_rate": 4.832436625049256e-06, + "loss": 0.4385, + "step": 3195 + }, + { + "epoch": 0.8, + "grad_norm": 0.98046875, + "learning_rate": 4.7745751406263165e-06, + "loss": 0.4393, + "step": 3200 + }, + { + "epoch": 0.80125, + "grad_norm": 0.9140625, + "learning_rate": 4.717025557274749e-06, + "loss": 0.42, + "step": 3205 + }, + { + "epoch": 0.8025, + "grad_norm": 0.90234375, + "learning_rate": 4.659788762480327e-06, + "loss": 0.3758, + "step": 3210 + }, + { + "epoch": 0.80375, + "grad_norm": 1.1640625, + "learning_rate": 4.602865638905224e-06, + "loss": 0.4448, + "step": 3215 + }, + { + "epoch": 0.805, + "grad_norm": 1.078125, + "learning_rate": 4.54625706437441e-06, + "loss": 0.4453, + "step": 3220 + }, + { + "epoch": 0.80625, + "grad_norm": 1.3125, + "learning_rate": 4.48996391186216e-06, + "loss": 0.4359, + "step": 3225 + }, + { + "epoch": 0.8075, + "grad_norm": 1.3515625, + "learning_rate": 4.433987049478508e-06, + "loss": 0.3974, + "step": 3230 + }, + { + "epoch": 0.80875, + "grad_norm": 1.125, + "learning_rate": 4.378327340455915e-06, + "loss": 0.4194, + "step": 3235 + }, + { + "epoch": 0.81, + "grad_norm": 0.87890625, + "learning_rate": 4.322985643135952e-06, + "loss": 0.4214, + "step": 3240 + }, + { + "epoch": 0.81125, + "grad_norm": 1.1640625, + "learning_rate": 4.267962810956061e-06, + "loss": 0.3592, + "step": 3245 + }, + { + "epoch": 0.8125, + "grad_norm": 1.2421875, + "learning_rate": 4.213259692436367e-06, + "loss": 0.3997, + "step": 3250 + }, + { + "epoch": 0.81375, + "grad_norm": 1.484375, + "learning_rate": 4.158877131166641e-06, + "loss": 0.4471, + "step": 3255 + }, + { + "epoch": 0.815, + "grad_norm": 0.953125, + "learning_rate": 4.104815965793249e-06, + "loss": 0.4293, + "step": 3260 + }, + { + "epoch": 0.81625, + "grad_norm": 1.203125, + "learning_rate": 4.051077030006228e-06, + "loss": 0.4562, + "step": 3265 + }, + { + "epoch": 0.8175, + "grad_norm": 1.0703125, + "learning_rate": 3.9976611525264525e-06, + "loss": 0.434, + "step": 3270 + }, + { + "epoch": 0.81875, + "grad_norm": 0.8671875, + "learning_rate": 3.944569157092839e-06, + "loss": 0.4524, + "step": 3275 + }, + { + "epoch": 0.82, + "grad_norm": 1.296875, + "learning_rate": 3.891801862449629e-06, + "loss": 0.4498, + "step": 3280 + }, + { + "epoch": 0.82125, + "grad_norm": 1.0625, + "learning_rate": 3.839360082333771e-06, + "loss": 0.4329, + "step": 3285 + }, + { + "epoch": 0.8225, + "grad_norm": 1.375, + "learning_rate": 3.7872446254624104e-06, + "loss": 0.3884, + "step": 3290 + }, + { + "epoch": 0.82375, + "grad_norm": 1.046875, + "learning_rate": 3.735456295520348e-06, + "loss": 0.4114, + "step": 3295 + }, + { + "epoch": 0.825, + "grad_norm": 1.3125, + "learning_rate": 3.6839958911476957e-06, + "loss": 0.4404, + "step": 3300 + }, + { + "epoch": 0.82625, + "grad_norm": 1.171875, + "learning_rate": 3.6328642059275524e-06, + "loss": 0.4548, + "step": 3305 + }, + { + "epoch": 0.8275, + "grad_norm": 0.92578125, + "learning_rate": 3.5820620283737616e-06, + "loss": 0.4648, + "step": 3310 + }, + { + "epoch": 0.82875, + "grad_norm": 0.875, + "learning_rate": 3.5315901419187363e-06, + "loss": 0.4233, + "step": 3315 + }, + { + "epoch": 0.83, + "grad_norm": 0.953125, + "learning_rate": 3.4814493249014116e-06, + "loss": 0.4021, + "step": 3320 + }, + { + "epoch": 0.83125, + "grad_norm": 1.0546875, + "learning_rate": 3.431640350555204e-06, + "loss": 0.4732, + "step": 3325 + }, + { + "epoch": 0.8325, + "grad_norm": 1.0546875, + "learning_rate": 3.382163986996126e-06, + "loss": 0.4174, + "step": 3330 + }, + { + "epoch": 0.83375, + "grad_norm": 1.3984375, + "learning_rate": 3.3330209972108976e-06, + "loss": 0.4284, + "step": 3335 + }, + { + "epoch": 0.835, + "grad_norm": 1.109375, + "learning_rate": 3.284212139045223e-06, + "loss": 0.4183, + "step": 3340 + }, + { + "epoch": 0.83625, + "grad_norm": 1.390625, + "learning_rate": 3.2357381651920648e-06, + "loss": 0.3996, + "step": 3345 + }, + { + "epoch": 0.8375, + "grad_norm": 1.2578125, + "learning_rate": 3.187599823180071e-06, + "loss": 0.4317, + "step": 3350 + }, + { + "epoch": 0.83875, + "grad_norm": 1.21875, + "learning_rate": 3.139797855362031e-06, + "loss": 0.4341, + "step": 3355 + }, + { + "epoch": 0.84, + "grad_norm": 1.59375, + "learning_rate": 3.092332998903416e-06, + "loss": 0.4516, + "step": 3360 + }, + { + "epoch": 0.84125, + "grad_norm": 1.0703125, + "learning_rate": 3.0452059857710186e-06, + "loss": 0.4371, + "step": 3365 + }, + { + "epoch": 0.8425, + "grad_norm": 0.9921875, + "learning_rate": 2.9984175427217016e-06, + "loss": 0.4346, + "step": 3370 + }, + { + "epoch": 0.84375, + "grad_norm": 0.96875, + "learning_rate": 2.9519683912911266e-06, + "loss": 0.3893, + "step": 3375 + }, + { + "epoch": 0.845, + "grad_norm": 1.046875, + "learning_rate": 2.9058592477826636e-06, + "loss": 0.4086, + "step": 3380 + }, + { + "epoch": 0.84625, + "grad_norm": 0.9375, + "learning_rate": 2.860090823256359e-06, + "loss": 0.4211, + "step": 3385 + }, + { + "epoch": 0.8475, + "grad_norm": 1.296875, + "learning_rate": 2.8146638235179213e-06, + "loss": 0.422, + "step": 3390 + }, + { + "epoch": 0.84875, + "grad_norm": 1.0234375, + "learning_rate": 2.769578949107893e-06, + "loss": 0.4117, + "step": 3395 + }, + { + "epoch": 0.85, + "grad_norm": 0.90625, + "learning_rate": 2.7248368952908053e-06, + "loss": 0.3836, + "step": 3400 + }, + { + "epoch": 0.85125, + "grad_norm": 1.046875, + "learning_rate": 2.6804383520444815e-06, + "loss": 0.3996, + "step": 3405 + }, + { + "epoch": 0.8525, + "grad_norm": 1.234375, + "learning_rate": 2.6363840040493747e-06, + "loss": 0.4007, + "step": 3410 + }, + { + "epoch": 0.85375, + "grad_norm": 1.0625, + "learning_rate": 2.5926745306780324e-06, + "loss": 0.4431, + "step": 3415 + }, + { + "epoch": 0.855, + "grad_norm": 1.09375, + "learning_rate": 2.5493106059846116e-06, + "loss": 0.4013, + "step": 3420 + }, + { + "epoch": 0.85625, + "grad_norm": 1.3125, + "learning_rate": 2.506292898694468e-06, + "loss": 0.4748, + "step": 3425 + }, + { + "epoch": 0.8575, + "grad_norm": 1.3828125, + "learning_rate": 2.4636220721938554e-06, + "loss": 0.4454, + "step": 3430 + }, + { + "epoch": 0.85875, + "grad_norm": 1.546875, + "learning_rate": 2.421298784519724e-06, + "loss": 0.3844, + "step": 3435 + }, + { + "epoch": 0.86, + "grad_norm": 1.296875, + "learning_rate": 2.379323688349516e-06, + "loss": 0.4772, + "step": 3440 + }, + { + "epoch": 0.86125, + "grad_norm": 1.09375, + "learning_rate": 2.3376974309911343e-06, + "loss": 0.4668, + "step": 3445 + }, + { + "epoch": 0.8625, + "grad_norm": 1.359375, + "learning_rate": 2.296420654372966e-06, + "loss": 0.4191, + "step": 3450 + }, + { + "epoch": 0.86375, + "grad_norm": 0.8828125, + "learning_rate": 2.2554939950339747e-06, + "loss": 0.3971, + "step": 3455 + }, + { + "epoch": 0.865, + "grad_norm": 1.296875, + "learning_rate": 2.2149180841138676e-06, + "loss": 0.4282, + "step": 3460 + }, + { + "epoch": 0.86625, + "grad_norm": 1.0, + "learning_rate": 2.1746935473433928e-06, + "loss": 0.4406, + "step": 3465 + }, + { + "epoch": 0.8675, + "grad_norm": 1.3828125, + "learning_rate": 2.1348210050346595e-06, + "loss": 0.3914, + "step": 3470 + }, + { + "epoch": 0.86875, + "grad_norm": 1.1015625, + "learning_rate": 2.0953010720716037e-06, + "loss": 0.3676, + "step": 3475 + }, + { + "epoch": 0.87, + "grad_norm": 1.234375, + "learning_rate": 2.0561343579004715e-06, + "loss": 0.3973, + "step": 3480 + }, + { + "epoch": 0.87125, + "grad_norm": 1.4375, + "learning_rate": 2.0173214665204555e-06, + "loss": 0.4067, + "step": 3485 + }, + { + "epoch": 0.8725, + "grad_norm": 1.2109375, + "learning_rate": 1.9788629964743455e-06, + "loss": 0.4279, + "step": 3490 + }, + { + "epoch": 0.87375, + "grad_norm": 1.3359375, + "learning_rate": 1.940759540839329e-06, + "loss": 0.4449, + "step": 3495 + }, + { + "epoch": 0.875, + "grad_norm": 1.2890625, + "learning_rate": 1.9030116872178316e-06, + "loss": 0.4364, + "step": 3500 + }, + { + "epoch": 0.87625, + "grad_norm": 1.1015625, + "learning_rate": 1.8656200177284505e-06, + "loss": 0.3991, + "step": 3505 + }, + { + "epoch": 0.8775, + "grad_norm": 1.109375, + "learning_rate": 1.8285851089969802e-06, + "loss": 0.3922, + "step": 3510 + }, + { + "epoch": 0.87875, + "grad_norm": 1.015625, + "learning_rate": 1.7919075321475325e-06, + "loss": 0.4559, + "step": 3515 + }, + { + "epoch": 0.88, + "grad_norm": 1.15625, + "learning_rate": 1.7555878527937164e-06, + "loss": 0.3947, + "step": 3520 + }, + { + "epoch": 0.88125, + "grad_norm": 0.98828125, + "learning_rate": 1.7196266310299108e-06, + "loss": 0.3853, + "step": 3525 + }, + { + "epoch": 0.8825, + "grad_norm": 1.1015625, + "learning_rate": 1.6840244214226502e-06, + "loss": 0.4429, + "step": 3530 + }, + { + "epoch": 0.88375, + "grad_norm": 1.2109375, + "learning_rate": 1.6487817730020365e-06, + "loss": 0.4092, + "step": 3535 + }, + { + "epoch": 0.885, + "grad_norm": 1.046875, + "learning_rate": 1.6138992292533183e-06, + "loss": 0.4348, + "step": 3540 + }, + { + "epoch": 0.88625, + "grad_norm": 0.92578125, + "learning_rate": 1.579377328108464e-06, + "loss": 0.4362, + "step": 3545 + }, + { + "epoch": 0.8875, + "grad_norm": 1.3359375, + "learning_rate": 1.5452166019378989e-06, + "loss": 0.431, + "step": 3550 + }, + { + "epoch": 0.88875, + "grad_norm": 1.0703125, + "learning_rate": 1.5114175775422762e-06, + "loss": 0.4164, + "step": 3555 + }, + { + "epoch": 0.89, + "grad_norm": 1.296875, + "learning_rate": 1.4779807761443636e-06, + "loss": 0.4154, + "step": 3560 + }, + { + "epoch": 0.89125, + "grad_norm": 1.3046875, + "learning_rate": 1.4449067133810056e-06, + "loss": 0.4108, + "step": 3565 + }, + { + "epoch": 0.8925, + "grad_norm": 1.1796875, + "learning_rate": 1.4121958992951629e-06, + "loss": 0.4024, + "step": 3570 + }, + { + "epoch": 0.89375, + "grad_norm": 1.0703125, + "learning_rate": 1.379848838328049e-06, + "loss": 0.4041, + "step": 3575 + }, + { + "epoch": 0.895, + "grad_norm": 1.2109375, + "learning_rate": 1.3478660293113676e-06, + "loss": 0.4291, + "step": 3580 + }, + { + "epoch": 0.89625, + "grad_norm": 1.0703125, + "learning_rate": 1.3162479654595938e-06, + "loss": 0.4561, + "step": 3585 + }, + { + "epoch": 0.8975, + "grad_norm": 2.03125, + "learning_rate": 1.284995134362385e-06, + "loss": 0.4599, + "step": 3590 + }, + { + "epoch": 0.89875, + "grad_norm": 0.92578125, + "learning_rate": 1.2541080179770571e-06, + "loss": 0.371, + "step": 3595 + }, + { + "epoch": 0.9, + "grad_norm": 1.1171875, + "learning_rate": 1.2235870926211619e-06, + "loss": 0.428, + "step": 3600 + }, + { + "epoch": 0.90125, + "grad_norm": 1.1796875, + "learning_rate": 1.193432828965113e-06, + "loss": 0.4093, + "step": 3605 + }, + { + "epoch": 0.9025, + "grad_norm": 1.0078125, + "learning_rate": 1.16364569202497e-06, + "loss": 0.4123, + "step": 3610 + }, + { + "epoch": 0.90375, + "grad_norm": 1.0625, + "learning_rate": 1.134226141155223e-06, + "loss": 0.4212, + "step": 3615 + }, + { + "epoch": 0.905, + "grad_norm": 0.99609375, + "learning_rate": 1.105174630041747e-06, + "loss": 0.4379, + "step": 3620 + }, + { + "epoch": 0.90625, + "grad_norm": 1.28125, + "learning_rate": 1.0764916066947794e-06, + "loss": 0.4788, + "step": 3625 + }, + { + "epoch": 0.9075, + "grad_norm": 1.1875, + "learning_rate": 1.0481775134420225e-06, + "loss": 0.4523, + "step": 3630 + }, + { + "epoch": 0.90875, + "grad_norm": 1.5078125, + "learning_rate": 1.020232786921821e-06, + "loss": 0.4141, + "step": 3635 + }, + { + "epoch": 0.91, + "grad_norm": 1.0078125, + "learning_rate": 9.926578580764234e-07, + "loss": 0.3628, + "step": 3640 + }, + { + "epoch": 0.91125, + "grad_norm": 1.109375, + "learning_rate": 9.654531521453513e-07, + "loss": 0.4142, + "step": 3645 + }, + { + "epoch": 0.9125, + "grad_norm": 1.3046875, + "learning_rate": 9.386190886588208e-07, + "loss": 0.4348, + "step": 3650 + }, + { + "epoch": 0.91375, + "grad_norm": 1.6796875, + "learning_rate": 9.121560814312813e-07, + "loss": 0.4138, + "step": 3655 + }, + { + "epoch": 0.915, + "grad_norm": 1.078125, + "learning_rate": 8.860645385550481e-07, + "loss": 0.4332, + "step": 3660 + }, + { + "epoch": 0.91625, + "grad_norm": 1.1484375, + "learning_rate": 8.603448623939858e-07, + "loss": 0.4577, + "step": 3665 + }, + { + "epoch": 0.9175, + "grad_norm": 1.1484375, + "learning_rate": 8.349974495773183e-07, + "loss": 0.4456, + "step": 3670 + }, + { + "epoch": 0.91875, + "grad_norm": 1.28125, + "learning_rate": 8.10022690993506e-07, + "loss": 0.3946, + "step": 3675 + }, + { + "epoch": 0.92, + "grad_norm": 1.3203125, + "learning_rate": 7.854209717842231e-07, + "loss": 0.478, + "step": 3680 + }, + { + "epoch": 0.92125, + "grad_norm": 1.1171875, + "learning_rate": 7.611926713384121e-07, + "loss": 0.3592, + "step": 3685 + }, + { + "epoch": 0.9225, + "grad_norm": 1.234375, + "learning_rate": 7.373381632864384e-07, + "loss": 0.4425, + "step": 3690 + }, + { + "epoch": 0.92375, + "grad_norm": 1.1796875, + "learning_rate": 7.138578154943288e-07, + "loss": 0.4219, + "step": 3695 + }, + { + "epoch": 0.925, + "grad_norm": 1.3125, + "learning_rate": 6.907519900580861e-07, + "loss": 0.4419, + "step": 3700 + }, + { + "epoch": 0.92625, + "grad_norm": 1.1796875, + "learning_rate": 6.680210432981254e-07, + "loss": 0.3983, + "step": 3705 + }, + { + "epoch": 0.9275, + "grad_norm": 1.046875, + "learning_rate": 6.456653257537665e-07, + "loss": 0.4417, + "step": 3710 + }, + { + "epoch": 0.92875, + "grad_norm": 1.03125, + "learning_rate": 6.2368518217783e-07, + "loss": 0.4469, + "step": 3715 + }, + { + "epoch": 0.93, + "grad_norm": 1.0234375, + "learning_rate": 6.020809515313142e-07, + "loss": 0.435, + "step": 3720 + }, + { + "epoch": 0.93125, + "grad_norm": 1.6875, + "learning_rate": 5.808529669781904e-07, + "loss": 0.3856, + "step": 3725 + }, + { + "epoch": 0.9325, + "grad_norm": 1.1875, + "learning_rate": 5.600015558802352e-07, + "loss": 0.4587, + "step": 3730 + }, + { + "epoch": 0.93375, + "grad_norm": 1.203125, + "learning_rate": 5.39527039792001e-07, + "loss": 0.4325, + "step": 3735 + }, + { + "epoch": 0.935, + "grad_norm": 1.484375, + "learning_rate": 5.194297344558536e-07, + "loss": 0.4166, + "step": 3740 + }, + { + "epoch": 0.93625, + "grad_norm": 1.2890625, + "learning_rate": 4.997099497971114e-07, + "loss": 0.4347, + "step": 3745 + }, + { + "epoch": 0.9375, + "grad_norm": 0.95703125, + "learning_rate": 4.803679899192392e-07, + "loss": 0.4252, + "step": 3750 + }, + { + "epoch": 0.93875, + "grad_norm": 1.1875, + "learning_rate": 4.614041530991903e-07, + "loss": 0.4036, + "step": 3755 + }, + { + "epoch": 0.94, + "grad_norm": 1.375, + "learning_rate": 4.4281873178278475e-07, + "loss": 0.4359, + "step": 3760 + }, + { + "epoch": 0.94125, + "grad_norm": 1.359375, + "learning_rate": 4.246120125802111e-07, + "loss": 0.4566, + "step": 3765 + }, + { + "epoch": 0.9425, + "grad_norm": 1.0078125, + "learning_rate": 4.067842762616014e-07, + "loss": 0.4226, + "step": 3770 + }, + { + "epoch": 0.94375, + "grad_norm": 1.1953125, + "learning_rate": 3.8933579775271013e-07, + "loss": 0.3903, + "step": 3775 + }, + { + "epoch": 0.945, + "grad_norm": 1.125, + "learning_rate": 3.7226684613065333e-07, + "loss": 0.4119, + "step": 3780 + }, + { + "epoch": 0.94625, + "grad_norm": 1.3515625, + "learning_rate": 3.555776846197817e-07, + "loss": 0.4268, + "step": 3785 + }, + { + "epoch": 0.9475, + "grad_norm": 0.9765625, + "learning_rate": 3.3926857058761417e-07, + "loss": 0.405, + "step": 3790 + }, + { + "epoch": 0.94875, + "grad_norm": 1.2734375, + "learning_rate": 3.233397555408607e-07, + "loss": 0.4161, + "step": 3795 + }, + { + "epoch": 0.95, + "grad_norm": 1.2578125, + "learning_rate": 3.077914851215585e-07, + "loss": 0.4293, + "step": 3800 + }, + { + "epoch": 0.95125, + "grad_norm": 1.3671875, + "learning_rate": 2.92623999103267e-07, + "loss": 0.4066, + "step": 3805 + }, + { + "epoch": 0.9525, + "grad_norm": 1.0859375, + "learning_rate": 2.778375313873871e-07, + "loss": 0.4309, + "step": 3810 + }, + { + "epoch": 0.95375, + "grad_norm": 1.2421875, + "learning_rate": 2.634323099995395e-07, + "loss": 0.4623, + "step": 3815 + }, + { + "epoch": 0.955, + "grad_norm": 1.6015625, + "learning_rate": 2.494085570860616e-07, + "loss": 0.3977, + "step": 3820 + }, + { + "epoch": 0.95625, + "grad_norm": 1.28125, + "learning_rate": 2.3576648891056875e-07, + "loss": 0.4135, + "step": 3825 + }, + { + "epoch": 0.9575, + "grad_norm": 1.1640625, + "learning_rate": 2.2250631585063186e-07, + "loss": 0.3874, + "step": 3830 + }, + { + "epoch": 0.95875, + "grad_norm": 1.21875, + "learning_rate": 2.0962824239451894e-07, + "loss": 0.4494, + "step": 3835 + }, + { + "epoch": 0.96, + "grad_norm": 1.1328125, + "learning_rate": 1.9713246713805588e-07, + "loss": 0.3946, + "step": 3840 + }, + { + "epoch": 0.96125, + "grad_norm": 0.85546875, + "learning_rate": 1.8501918278155393e-07, + "loss": 0.4312, + "step": 3845 + }, + { + "epoch": 0.9625, + "grad_norm": 1.359375, + "learning_rate": 1.732885761268427e-07, + "loss": 0.4453, + "step": 3850 + }, + { + "epoch": 0.96375, + "grad_norm": 0.984375, + "learning_rate": 1.619408280743917e-07, + "loss": 0.3943, + "step": 3855 + }, + { + "epoch": 0.965, + "grad_norm": 0.90234375, + "learning_rate": 1.509761136205101e-07, + "loss": 0.4144, + "step": 3860 + }, + { + "epoch": 0.96625, + "grad_norm": 1.046875, + "learning_rate": 1.4039460185465703e-07, + "loss": 0.4026, + "step": 3865 + }, + { + "epoch": 0.9675, + "grad_norm": 1.0625, + "learning_rate": 1.3019645595683806e-07, + "loss": 0.432, + "step": 3870 + }, + { + "epoch": 0.96875, + "grad_norm": 1.2890625, + "learning_rate": 1.2038183319507955e-07, + "loss": 0.4384, + "step": 3875 + }, + { + "epoch": 0.97, + "grad_norm": 1.328125, + "learning_rate": 1.109508849230001e-07, + "loss": 0.4622, + "step": 3880 + }, + { + "epoch": 0.97125, + "grad_norm": 1.265625, + "learning_rate": 1.0190375657749274e-07, + "loss": 0.4136, + "step": 3885 + }, + { + "epoch": 0.9725, + "grad_norm": 1.359375, + "learning_rate": 9.324058767646859e-08, + "loss": 0.4417, + "step": 3890 + }, + { + "epoch": 0.97375, + "grad_norm": 1.0625, + "learning_rate": 8.496151181670852e-08, + "loss": 0.4316, + "step": 3895 + }, + { + "epoch": 0.975, + "grad_norm": 0.8515625, + "learning_rate": 7.706665667180091e-08, + "loss": 0.4246, + "step": 3900 + }, + { + "epoch": 0.97625, + "grad_norm": 1.3515625, + "learning_rate": 6.955614399018206e-08, + "loss": 0.4481, + "step": 3905 + }, + { + "epoch": 0.9775, + "grad_norm": 1.1171875, + "learning_rate": 6.243008959324892e-08, + "loss": 0.4544, + "step": 3910 + }, + { + "epoch": 0.97875, + "grad_norm": 1.3671875, + "learning_rate": 5.568860337357151e-08, + "loss": 0.4005, + "step": 3915 + }, + { + "epoch": 0.98, + "grad_norm": 1.265625, + "learning_rate": 4.9331789293211026e-08, + "loss": 0.4955, + "step": 3920 + }, + { + "epoch": 0.98125, + "grad_norm": 1.5546875, + "learning_rate": 4.335974538210441e-08, + "loss": 0.4102, + "step": 3925 + }, + { + "epoch": 0.9825, + "grad_norm": 1.2421875, + "learning_rate": 3.7772563736551694e-08, + "loss": 0.4542, + "step": 3930 + }, + { + "epoch": 0.98375, + "grad_norm": 1.2421875, + "learning_rate": 3.2570330517811555e-08, + "loss": 0.4691, + "step": 3935 + }, + { + "epoch": 0.985, + "grad_norm": 0.9140625, + "learning_rate": 2.7753125950752413e-08, + "loss": 0.4155, + "step": 3940 + }, + { + "epoch": 0.98625, + "grad_norm": 1.3125, + "learning_rate": 2.3321024322625617e-08, + "loss": 0.4268, + "step": 3945 + }, + { + "epoch": 0.9875, + "grad_norm": 1.0546875, + "learning_rate": 1.9274093981927478e-08, + "loss": 0.4098, + "step": 3950 + }, + { + "epoch": 0.98875, + "grad_norm": 1.171875, + "learning_rate": 1.5612397337325113e-08, + "loss": 0.3965, + "step": 3955 + }, + { + "epoch": 0.99, + "grad_norm": 1.2421875, + "learning_rate": 1.233599085671e-08, + "loss": 0.4047, + "step": 3960 + }, + { + "epoch": 0.99125, + "grad_norm": 1.3125, + "learning_rate": 9.444925066329213e-09, + "loss": 0.4358, + "step": 3965 + }, + { + "epoch": 0.9925, + "grad_norm": 0.88671875, + "learning_rate": 6.939244549986068e-09, + "loss": 0.465, + "step": 3970 + }, + { + "epoch": 0.99375, + "grad_norm": 1.109375, + "learning_rate": 4.818987948379539e-09, + "loss": 0.4483, + "step": 3975 + }, + { + "epoch": 0.995, + "grad_norm": 1.46875, + "learning_rate": 3.0841879584853073e-09, + "loss": 0.4986, + "step": 3980 + }, + { + "epoch": 0.99625, + "grad_norm": 1.2734375, + "learning_rate": 1.7348713330672671e-09, + "loss": 0.4121, + "step": 3985 + }, + { + "epoch": 0.9975, + "grad_norm": 1.078125, + "learning_rate": 7.710588802584129e-10, + "loss": 0.3764, + "step": 3990 + }, + { + "epoch": 0.99875, + "grad_norm": 1.125, + "learning_rate": 1.9276546323609978e-10, + "loss": 0.4443, + "step": 3995 + }, + { + "epoch": 1.0, + "grad_norm": 0.8515625, + "learning_rate": 0.0, + "loss": 0.4357, + "step": 4000 + } + ], + "logging_steps": 5, + "max_steps": 4000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.184022656018907e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}