diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4501 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9929408114188193, + "eval_steps": 500, + "global_step": 3200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 20.729074478149414, + "learning_rate": 2.5e-06, + "loss": 8.0612, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 16.320600509643555, + "learning_rate": 5e-06, + "loss": 7.3007, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 17.508378982543945, + "learning_rate": 7.5e-06, + "loss": 7.7541, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 18.7609920501709, + "learning_rate": 1e-05, + "loss": 7.0762, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 10.039741516113281, + "learning_rate": 1.25e-05, + "loss": 6.3794, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 10.681583404541016, + "learning_rate": 1.5e-05, + "loss": 5.7463, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 8.521218299865723, + "learning_rate": 1.75e-05, + "loss": 5.1425, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 8.024609565734863, + "learning_rate": 2e-05, + "loss": 4.8565, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 6.419050216674805, + "learning_rate": 2.25e-05, + "loss": 4.4552, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 7.1052398681640625, + "learning_rate": 2.5e-05, + "loss": 4.1432, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 7.79315710067749, + "learning_rate": 2.7500000000000004e-05, + "loss": 3.9919, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 5.008393287658691, + "learning_rate": 3e-05, + "loss": 3.3339, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 8.750615119934082, + "learning_rate": 3.2500000000000004e-05, + "loss": 3.3154, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 5.283076286315918, + "learning_rate": 3.5e-05, + "loss": 2.8296, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 6.005578517913818, + "learning_rate": 3.7500000000000003e-05, + "loss": 2.8239, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 7.009499549865723, + "learning_rate": 4e-05, + "loss": 3.0532, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 5.712557315826416, + "learning_rate": 4.25e-05, + "loss": 2.8819, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 4.914234638214111, + "learning_rate": 4.5e-05, + "loss": 2.8031, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 7.396793842315674, + "learning_rate": 4.75e-05, + "loss": 2.6904, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 5.087535381317139, + "learning_rate": 5e-05, + "loss": 2.772, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 6.230583190917969, + "learning_rate": 4.9999683566063894e-05, + "loss": 2.6301, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 4.741369724273682, + "learning_rate": 4.9998734272266e-05, + "loss": 2.5966, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 4.758203506469727, + "learning_rate": 4.9997152142637426e-05, + "loss": 2.4406, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 4.093080997467041, + "learning_rate": 4.999493721722933e-05, + "loss": 2.6457, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 5.253550052642822, + "learning_rate": 4.999208955211192e-05, + "loss": 2.5449, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 5.3556294441223145, + "learning_rate": 4.998860921937302e-05, + "loss": 2.5182, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 3.888378620147705, + "learning_rate": 4.998449630711627e-05, + "loss": 2.6575, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 4.9733967781066895, + "learning_rate": 4.997975091945886e-05, + "loss": 2.5669, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 3.3941574096679688, + "learning_rate": 4.997437317652894e-05, + "loss": 2.5628, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 3.743703842163086, + "learning_rate": 4.996836321446253e-05, + "loss": 2.6051, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 3.359017848968506, + "learning_rate": 4.99617211854001e-05, + "loss": 2.2357, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 4.703392028808594, + "learning_rate": 4.995444725748274e-05, + "loss": 2.4146, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 4.182121753692627, + "learning_rate": 4.994654161484784e-05, + "loss": 2.4228, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 4.623451232910156, + "learning_rate": 4.993800445762451e-05, + "loss": 2.4149, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 3.7832231521606445, + "learning_rate": 4.992883600192844e-05, + "loss": 2.4566, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 3.907249689102173, + "learning_rate": 4.991903647985646e-05, + "loss": 2.403, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 3.4823191165924072, + "learning_rate": 4.990860613948071e-05, + "loss": 2.518, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 4.531657695770264, + "learning_rate": 4.989754524484225e-05, + "loss": 2.4007, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 4.945577621459961, + "learning_rate": 4.988585407594449e-05, + "loss": 2.3891, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 3.9174554347991943, + "learning_rate": 4.9873532928746036e-05, + "loss": 2.2904, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 3.8385236263275146, + "learning_rate": 4.986058211515321e-05, + "loss": 2.2802, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 4.326376914978027, + "learning_rate": 4.9847001963012176e-05, + "loss": 2.295, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 5.581832408905029, + "learning_rate": 4.9832792816100605e-05, + "loss": 2.4895, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 3.5401458740234375, + "learning_rate": 4.981795503411901e-05, + "loss": 2.3254, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 4.960626125335693, + "learning_rate": 4.9802488992681594e-05, + "loss": 2.2977, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 4.908995628356934, + "learning_rate": 4.978639508330681e-05, + "loss": 2.3534, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 4.865789890289307, + "learning_rate": 4.976967371340736e-05, + "loss": 2.3781, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 4.27896785736084, + "learning_rate": 4.975232530627998e-05, + "loss": 2.3221, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 3.9018704891204834, + "learning_rate": 4.973435030109463e-05, + "loss": 2.407, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 3.4363269805908203, + "learning_rate": 4.971574915288345e-05, + "loss": 2.3857, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 4.802529335021973, + "learning_rate": 4.9696522332529205e-05, + "loss": 2.183, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 4.064101696014404, + "learning_rate": 4.967667032675337e-05, + "loss": 2.2134, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 5.066267490386963, + "learning_rate": 4.965619363810381e-05, + "loss": 2.2722, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 4.149215221405029, + "learning_rate": 4.9635092784942064e-05, + "loss": 2.3393, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 3.8846592903137207, + "learning_rate": 4.9613368301430194e-05, + "loss": 2.2163, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 4.181525230407715, + "learning_rate": 4.9591020737517335e-05, + "loss": 2.4478, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 3.1801464557647705, + "learning_rate": 4.956805065892568e-05, + "loss": 2.2887, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 5.8738250732421875, + "learning_rate": 4.954445864713622e-05, + "loss": 2.29, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 4.968664646148682, + "learning_rate": 4.9520245299374014e-05, + "loss": 2.2801, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 5.4960784912109375, + "learning_rate": 4.949541122859305e-05, + "loss": 2.3109, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 3.6677656173706055, + "learning_rate": 4.9469957063460747e-05, + "loss": 2.2748, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 2.90336275100708, + "learning_rate": 4.944388344834205e-05, + "loss": 2.2016, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 3.515296459197998, + "learning_rate": 4.9417191043283086e-05, + "loss": 2.3607, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 3.070936679840088, + "learning_rate": 4.938988052399447e-05, + "loss": 2.3314, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 3.801671028137207, + "learning_rate": 4.936195258183422e-05, + "loss": 2.2395, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 4.183629035949707, + "learning_rate": 4.933340792379023e-05, + "loss": 2.4527, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 3.9023029804229736, + "learning_rate": 4.930424727246238e-05, + "loss": 2.2828, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 3.6366467475891113, + "learning_rate": 4.927447136604424e-05, + "loss": 2.2859, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 3.219228506088257, + "learning_rate": 4.924408095830439e-05, + "loss": 2.3497, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 3.768355369567871, + "learning_rate": 4.921307681856735e-05, + "loss": 2.1229, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 3.8723647594451904, + "learning_rate": 4.9181459731694054e-05, + "loss": 2.3544, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 3.512420892715454, + "learning_rate": 4.914923049806207e-05, + "loss": 1.9489, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 3.77095627784729, + "learning_rate": 4.911638993354524e-05, + "loss": 2.2499, + "step": 365 + }, + { + "epoch": 0.11, + "grad_norm": 3.8103721141815186, + "learning_rate": 4.90829388694931e-05, + "loss": 2.1032, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 3.6579902172088623, + "learning_rate": 4.9048878152709785e-05, + "loss": 2.2104, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 5.087968826293945, + "learning_rate": 4.901420864543265e-05, + "loss": 2.2601, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 3.773608684539795, + "learning_rate": 4.8978931225310375e-05, + "loss": 2.1831, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 6.229213714599609, + "learning_rate": 4.8943046785380795e-05, + "loss": 2.2507, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 5.113283634185791, + "learning_rate": 4.890655623404828e-05, + "loss": 2.2868, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 3.9976158142089844, + "learning_rate": 4.8869460495060726e-05, + "loss": 2.264, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 4.450018405914307, + "learning_rate": 4.883176050748619e-05, + "loss": 2.2319, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 3.610208511352539, + "learning_rate": 4.879345722568911e-05, + "loss": 2.1011, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 3.5385842323303223, + "learning_rate": 4.875455161930614e-05, + "loss": 2.2372, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 3.5152907371520996, + "learning_rate": 4.871504467322162e-05, + "loss": 2.3424, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 3.0804309844970703, + "learning_rate": 4.867493738754263e-05, + "loss": 1.9902, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 4.568037033081055, + "learning_rate": 4.8634230777573655e-05, + "loss": 2.216, + "step": 430 + }, + { + "epoch": 0.13, + "grad_norm": 3.0766966342926025, + "learning_rate": 4.859292587379094e-05, + "loss": 2.2049, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 3.8717846870422363, + "learning_rate": 4.855102372181634e-05, + "loss": 2.179, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 3.963639497756958, + "learning_rate": 4.8508525382390876e-05, + "loss": 2.3567, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 3.3204896450042725, + "learning_rate": 4.8465431931347904e-05, + "loss": 2.1157, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 4.884645938873291, + "learning_rate": 4.842174445958585e-05, + "loss": 2.192, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 5.058561325073242, + "learning_rate": 4.837746407304061e-05, + "loss": 2.2785, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 4.240612983703613, + "learning_rate": 4.833259189265753e-05, + "loss": 2.3115, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 3.628058433532715, + "learning_rate": 4.8287129054363076e-05, + "loss": 2.3267, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 3.4856207370758057, + "learning_rate": 4.8241076709036036e-05, + "loss": 2.1803, + "step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 4.317348480224609, + "learning_rate": 4.8194436022478404e-05, + "loss": 2.1224, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 3.6160759925842285, + "learning_rate": 4.814720817538585e-05, + "loss": 2.1848, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 3.2244794368743896, + "learning_rate": 4.809939436331786e-05, + "loss": 2.2176, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 3.645427942276001, + "learning_rate": 4.805099579666748e-05, + "loss": 2.1778, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 3.9020988941192627, + "learning_rate": 4.800201370063059e-05, + "loss": 2.2817, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 4.484887599945068, + "learning_rate": 4.7952449315174996e-05, + "loss": 1.9207, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 4.281662464141846, + "learning_rate": 4.790230389500901e-05, + "loss": 2.2251, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 3.9683914184570312, + "learning_rate": 4.785157870954961e-05, + "loss": 2.22, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 3.39128041267395, + "learning_rate": 4.780027504289042e-05, + "loss": 2.3237, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 3.148158550262451, + "learning_rate": 4.774839419376914e-05, + "loss": 2.1838, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 4.339906692504883, + "learning_rate": 4.769593747553468e-05, + "loss": 2.0075, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 3.3067688941955566, + "learning_rate": 4.764290621611388e-05, + "loss": 2.1666, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 4.491573810577393, + "learning_rate": 4.758930175797797e-05, + "loss": 2.3295, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 3.894711494445801, + "learning_rate": 4.753512545810851e-05, + "loss": 2.1021, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 2.7983195781707764, + "learning_rate": 4.7480378687963114e-05, + "loss": 2.2335, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 3.40674090385437, + "learning_rate": 4.7425062833440634e-05, + "loss": 2.0456, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 3.834815263748169, + "learning_rate": 4.736917929484616e-05, + "loss": 2.3161, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 3.8907999992370605, + "learning_rate": 4.731272948685554e-05, + "loss": 2.1104, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 3.7746763229370117, + "learning_rate": 4.725571483847958e-05, + "loss": 2.0498, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 4.495760917663574, + "learning_rate": 4.719813679302784e-05, + "loss": 2.231, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 3.9231886863708496, + "learning_rate": 4.713999680807211e-05, + "loss": 2.1878, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 4.197574138641357, + "learning_rate": 4.708129635540955e-05, + "loss": 2.1897, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 4.721147060394287, + "learning_rate": 4.702203692102539e-05, + "loss": 2.1359, + "step": 590 + }, + { + "epoch": 0.18, + "grad_norm": 2.4958722591400146, + "learning_rate": 4.696222000505529e-05, + "loss": 2.1873, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 3.5209269523620605, + "learning_rate": 4.6901847121747455e-05, + "loss": 2.0386, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 3.6823954582214355, + "learning_rate": 4.6840919799424186e-05, + "loss": 2.0325, + "step": 605 + }, + { + "epoch": 0.19, + "grad_norm": 4.033428192138672, + "learning_rate": 4.677943958044329e-05, + "loss": 2.13, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 3.907592535018921, + "learning_rate": 4.671740802115897e-05, + "loss": 2.0553, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 3.318100690841675, + "learning_rate": 4.665482669188248e-05, + "loss": 2.0218, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 4.057621479034424, + "learning_rate": 4.659169717684232e-05, + "loss": 2.1056, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 4.882345199584961, + "learning_rate": 4.6528021074144165e-05, + "loss": 2.1249, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 4.954129219055176, + "learning_rate": 4.646379999573039e-05, + "loss": 2.1942, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 4.156874656677246, + "learning_rate": 4.639903556733931e-05, + "loss": 2.175, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 4.1573710441589355, + "learning_rate": 4.633372942846393e-05, + "loss": 2.0856, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 5.385977745056152, + "learning_rate": 4.6267883232310575e-05, + "loss": 2.2399, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 4.143659591674805, + "learning_rate": 4.620149864575689e-05, + "loss": 2.17, + "step": 655 + }, + { + "epoch": 0.2, + "grad_norm": 3.286294460296631, + "learning_rate": 4.613457734930978e-05, + "loss": 2.0458, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 4.520682334899902, + "learning_rate": 4.606712103706278e-05, + "loss": 2.1244, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 3.6921236515045166, + "learning_rate": 4.59991314166532e-05, + "loss": 2.0801, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 3.1880507469177246, + "learning_rate": 4.593061020921889e-05, + "loss": 2.3062, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 3.380157709121704, + "learning_rate": 4.586155914935469e-05, + "loss": 2.0267, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 3.0647785663604736, + "learning_rate": 4.57919799850685e-05, + "loss": 2.1566, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 3.353318691253662, + "learning_rate": 4.5721874477737006e-05, + "loss": 2.0618, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 3.342336654663086, + "learning_rate": 4.5651244402061144e-05, + "loss": 1.9534, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 4.064236640930176, + "learning_rate": 4.558009154602115e-05, + "loss": 2.1573, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 3.5223772525787354, + "learning_rate": 4.550841771083129e-05, + "loss": 2.0089, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 4.3469557762146, + "learning_rate": 4.543622471089426e-05, + "loss": 2.1214, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 3.922893762588501, + "learning_rate": 4.536351437375526e-05, + "loss": 2.0982, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 3.053823947906494, + "learning_rate": 4.529028854005576e-05, + "loss": 2.0791, + "step": 720 + }, + { + "epoch": 0.22, + "grad_norm": 3.636437177658081, + "learning_rate": 4.521654906348687e-05, + "loss": 2.1326, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 4.3226318359375, + "learning_rate": 4.51422978107424e-05, + "loss": 2.2037, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 4.59119987487793, + "learning_rate": 4.506753666147163e-05, + "loss": 2.1187, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 5.592061996459961, + "learning_rate": 4.499226750823177e-05, + "loss": 2.3031, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 4.18353271484375, + "learning_rate": 4.491649225643996e-05, + "loss": 2.0337, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 3.2864906787872314, + "learning_rate": 4.484021282432509e-05, + "loss": 2.0575, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 3.3072474002838135, + "learning_rate": 4.476343114287924e-05, + "loss": 2.0173, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 4.088031768798828, + "learning_rate": 4.468614915580879e-05, + "loss": 2.1929, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 4.264316082000732, + "learning_rate": 4.4608368819485204e-05, + "loss": 2.0457, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 4.678459644317627, + "learning_rate": 4.453009210289551e-05, + "loss": 2.031, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 3.3418045043945312, + "learning_rate": 4.445132098759249e-05, + "loss": 2.1464, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 3.89583420753479, + "learning_rate": 4.4372057467644455e-05, + "loss": 2.1509, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 2.6973416805267334, + "learning_rate": 4.4292303549584816e-05, + "loss": 2.072, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 4.848878383636475, + "learning_rate": 4.421206125236128e-05, + "loss": 2.166, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 3.48630428314209, + "learning_rate": 4.4131332607284706e-05, + "loss": 1.9686, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 3.4183597564697266, + "learning_rate": 4.405011965797775e-05, + "loss": 2.0781, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 3.5883586406707764, + "learning_rate": 4.3968424460323047e-05, + "loss": 2.0631, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 3.683375835418701, + "learning_rate": 4.388624908241124e-05, + "loss": 2.0533, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 3.0786943435668945, + "learning_rate": 4.3803595604488595e-05, + "loss": 1.8946, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 3.2280662059783936, + "learning_rate": 4.372046611890434e-05, + "loss": 2.0221, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 3.1918365955352783, + "learning_rate": 4.36368627300577e-05, + "loss": 2.0023, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 4.814984321594238, + "learning_rate": 4.3552787554344634e-05, + "loss": 2.0967, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 5.989580154418945, + "learning_rate": 4.346824272010423e-05, + "loss": 1.9698, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 3.2674803733825684, + "learning_rate": 4.338323036756488e-05, + "loss": 2.0381, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 3.6016860008239746, + "learning_rate": 4.3297752648790035e-05, + "loss": 2.0444, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 4.092184543609619, + "learning_rate": 4.321181172762379e-05, + "loss": 2.1514, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 3.5366742610931396, + "learning_rate": 4.312540977963604e-05, + "loss": 2.0518, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 4.222804069519043, + "learning_rate": 4.303854899206749e-05, + "loss": 1.9858, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 4.207810401916504, + "learning_rate": 4.295123156377419e-05, + "loss": 2.0067, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 3.15069842338562, + "learning_rate": 4.2863459705171945e-05, + "loss": 1.9234, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 3.337561845779419, + "learning_rate": 4.2775235638180344e-05, + "loss": 1.974, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 5.987912178039551, + "learning_rate": 4.2686561596166487e-05, + "loss": 2.1928, + "step": 880 + }, + { + "epoch": 0.27, + "grad_norm": 3.9456374645233154, + "learning_rate": 4.259743982388845e-05, + "loss": 2.023, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 4.308691501617432, + "learning_rate": 4.250787257743851e-05, + "loss": 2.1075, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 3.699410915374756, + "learning_rate": 4.2417862124185955e-05, + "loss": 2.0471, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 4.254593372344971, + "learning_rate": 4.232741074271977e-05, + "loss": 2.0331, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 3.2899739742279053, + "learning_rate": 4.2236520722790855e-05, + "loss": 2.0153, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 5.5724616050720215, + "learning_rate": 4.214519436525418e-05, + "loss": 2.1466, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 3.673755168914795, + "learning_rate": 4.2053433982010436e-05, + "loss": 2.1062, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 4.009172439575195, + "learning_rate": 4.1961241895947554e-05, + "loss": 2.013, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 3.0359890460968018, + "learning_rate": 4.1868620440881925e-05, + "loss": 2.1153, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 4.953378200531006, + "learning_rate": 4.177557196149927e-05, + "loss": 2.0847, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 3.580415964126587, + "learning_rate": 4.168209881329531e-05, + "loss": 1.9907, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 3.3144888877868652, + "learning_rate": 4.1588203362516153e-05, + "loss": 2.0741, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 4.115612983703613, + "learning_rate": 4.149388798609836e-05, + "loss": 1.9596, + "step": 945 + }, + { + "epoch": 0.29, + "grad_norm": 5.178717613220215, + "learning_rate": 4.1399155071608774e-05, + "loss": 2.142, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 3.350316286087036, + "learning_rate": 4.1304007017184146e-05, + "loss": 2.06, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 4.030082702636719, + "learning_rate": 4.120844623147033e-05, + "loss": 2.0618, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 5.1543707847595215, + "learning_rate": 4.1112475133561376e-05, + "loss": 2.3692, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 3.9695091247558594, + "learning_rate": 4.101609615293827e-05, + "loss": 2.0065, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 3.1106691360473633, + "learning_rate": 4.0919311729407416e-05, + "loss": 2.0318, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 3.532636880874634, + "learning_rate": 4.0822124313038904e-05, + "loss": 2.139, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 4.04263162612915, + "learning_rate": 4.072453636410448e-05, + "loss": 2.1352, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 4.174222946166992, + "learning_rate": 4.0626550353015236e-05, + "loss": 2.0269, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 4.390026569366455, + "learning_rate": 4.052816876025912e-05, + "loss": 2.0775, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 4.04339075088501, + "learning_rate": 4.042939407633808e-05, + "loss": 2.0042, + "step": 1000 + }, + { + "epoch": 0.31, + "grad_norm": 3.5550975799560547, + "learning_rate": 4.03302288017051e-05, + "loss": 1.9624, + "step": 1005 + }, + { + "epoch": 0.31, + "grad_norm": 4.015019416809082, + "learning_rate": 4.023067544670082e-05, + "loss": 2.142, + "step": 1010 + }, + { + "epoch": 0.31, + "grad_norm": 3.452937126159668, + "learning_rate": 4.013073653149005e-05, + "loss": 2.0798, + "step": 1015 + }, + { + "epoch": 0.32, + "grad_norm": 4.2777509689331055, + "learning_rate": 4.0030414585997925e-05, + "loss": 2.0245, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 5.5015459060668945, + "learning_rate": 3.99297121498459e-05, + "loss": 2.0897, + "step": 1025 + }, + { + "epoch": 0.32, + "grad_norm": 4.524988651275635, + "learning_rate": 3.982863177228743e-05, + "loss": 2.182, + "step": 1030 + }, + { + "epoch": 0.32, + "grad_norm": 4.300734043121338, + "learning_rate": 3.972717601214345e-05, + "loss": 2.0477, + "step": 1035 + }, + { + "epoch": 0.32, + "grad_norm": 3.456317186355591, + "learning_rate": 3.962534743773761e-05, + "loss": 2.1261, + "step": 1040 + }, + { + "epoch": 0.32, + "grad_norm": 3.567162275314331, + "learning_rate": 3.9523148626831234e-05, + "loss": 2.119, + "step": 1045 + }, + { + "epoch": 0.33, + "grad_norm": 3.5200531482696533, + "learning_rate": 3.942058216655808e-05, + "loss": 1.9731, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 4.380658149719238, + "learning_rate": 3.931765065335886e-05, + "loss": 1.9642, + "step": 1055 + }, + { + "epoch": 0.33, + "grad_norm": 4.44472074508667, + "learning_rate": 3.921435669291547e-05, + "loss": 1.8666, + "step": 1060 + }, + { + "epoch": 0.33, + "grad_norm": 5.24396276473999, + "learning_rate": 3.9110702900085064e-05, + "loss": 2.0983, + "step": 1065 + }, + { + "epoch": 0.33, + "grad_norm": 4.166001319885254, + "learning_rate": 3.900669189883386e-05, + "loss": 1.9032, + "step": 1070 + }, + { + "epoch": 0.33, + "grad_norm": 3.893059730529785, + "learning_rate": 3.890232632217071e-05, + "loss": 1.9269, + "step": 1075 + }, + { + "epoch": 0.34, + "grad_norm": 3.5707895755767822, + "learning_rate": 3.879760881208042e-05, + "loss": 1.9055, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 4.270632743835449, + "learning_rate": 3.869254201945692e-05, + "loss": 1.9936, + "step": 1085 + }, + { + "epoch": 0.34, + "grad_norm": 4.152591228485107, + "learning_rate": 3.858712860403608e-05, + "loss": 2.1007, + "step": 1090 + }, + { + "epoch": 0.34, + "grad_norm": 3.5370168685913086, + "learning_rate": 3.848137123432848e-05, + "loss": 2.1225, + "step": 1095 + }, + { + "epoch": 0.34, + "grad_norm": 3.657259941101074, + "learning_rate": 3.837527258755177e-05, + "loss": 1.9526, + "step": 1100 + }, + { + "epoch": 0.34, + "grad_norm": 4.236551761627197, + "learning_rate": 3.8268835349562946e-05, + "loss": 1.9357, + "step": 1105 + }, + { + "epoch": 0.34, + "grad_norm": 3.312053680419922, + "learning_rate": 3.816206221479034e-05, + "loss": 1.9833, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 3.346323013305664, + "learning_rate": 3.8054955886165427e-05, + "loss": 1.9351, + "step": 1115 + }, + { + "epoch": 0.35, + "grad_norm": 3.557433843612671, + "learning_rate": 3.7947519075054364e-05, + "loss": 2.0037, + "step": 1120 + }, + { + "epoch": 0.35, + "grad_norm": 3.824169635772705, + "learning_rate": 3.7839754501189406e-05, + "loss": 2.1035, + "step": 1125 + }, + { + "epoch": 0.35, + "grad_norm": 4.1984968185424805, + "learning_rate": 3.7731664892600004e-05, + "loss": 1.9416, + "step": 1130 + }, + { + "epoch": 0.35, + "grad_norm": 2.998347520828247, + "learning_rate": 3.762325298554379e-05, + "loss": 1.9615, + "step": 1135 + }, + { + "epoch": 0.35, + "grad_norm": 4.985104560852051, + "learning_rate": 3.751452152443728e-05, + "loss": 1.912, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 3.560026168823242, + "learning_rate": 3.74054732617864e-05, + "loss": 1.9317, + "step": 1145 + }, + { + "epoch": 0.36, + "grad_norm": 3.894937515258789, + "learning_rate": 3.7296110958116844e-05, + "loss": 1.9516, + "step": 1150 + }, + { + "epoch": 0.36, + "grad_norm": 3.1330158710479736, + "learning_rate": 3.718643738190414e-05, + "loss": 1.8787, + "step": 1155 + }, + { + "epoch": 0.36, + "grad_norm": 3.924584150314331, + "learning_rate": 3.707645530950361e-05, + "loss": 1.9294, + "step": 1160 + }, + { + "epoch": 0.36, + "grad_norm": 3.2176225185394287, + "learning_rate": 3.6966167525080056e-05, + "loss": 2.1003, + "step": 1165 + }, + { + "epoch": 0.36, + "grad_norm": 3.9685873985290527, + "learning_rate": 3.6855576820537277e-05, + "loss": 1.9088, + "step": 1170 + }, + { + "epoch": 0.36, + "grad_norm": 4.544212818145752, + "learning_rate": 3.674468599544746e-05, + "loss": 2.0211, + "step": 1175 + }, + { + "epoch": 0.37, + "grad_norm": 3.6609127521514893, + "learning_rate": 3.663349785698021e-05, + "loss": 2.0021, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 4.17726469039917, + "learning_rate": 3.6522015219831546e-05, + "loss": 2.0828, + "step": 1185 + }, + { + "epoch": 0.37, + "grad_norm": 3.6899638175964355, + "learning_rate": 3.641024090615265e-05, + "loss": 1.9462, + "step": 1190 + }, + { + "epoch": 0.37, + "grad_norm": 3.7764229774475098, + "learning_rate": 3.62981777454784e-05, + "loss": 2.0825, + "step": 1195 + }, + { + "epoch": 0.37, + "grad_norm": 4.037018775939941, + "learning_rate": 3.6185828574655766e-05, + "loss": 1.8715, + "step": 1200 + }, + { + "epoch": 0.37, + "grad_norm": 3.727513074874878, + "learning_rate": 3.607319623777196e-05, + "loss": 1.9394, + "step": 1205 + }, + { + "epoch": 0.38, + "grad_norm": 4.162086009979248, + "learning_rate": 3.59602835860825e-05, + "loss": 1.89, + "step": 1210 + }, + { + "epoch": 0.38, + "grad_norm": 3.546518564224243, + "learning_rate": 3.5847093477938956e-05, + "loss": 1.8102, + "step": 1215 + }, + { + "epoch": 0.38, + "grad_norm": 4.054803371429443, + "learning_rate": 3.5733628778716646e-05, + "loss": 1.8825, + "step": 1220 + }, + { + "epoch": 0.38, + "grad_norm": 3.638885498046875, + "learning_rate": 3.5619892360742075e-05, + "loss": 2.0755, + "step": 1225 + }, + { + "epoch": 0.38, + "grad_norm": 3.433565378189087, + "learning_rate": 3.5505887103220254e-05, + "loss": 2.0261, + "step": 1230 + }, + { + "epoch": 0.38, + "grad_norm": 3.5785629749298096, + "learning_rate": 3.5391615892161754e-05, + "loss": 2.1362, + "step": 1235 + }, + { + "epoch": 0.38, + "grad_norm": 3.4514031410217285, + "learning_rate": 3.527708162030971e-05, + "loss": 1.8821, + "step": 1240 + }, + { + "epoch": 0.39, + "grad_norm": 4.2519073486328125, + "learning_rate": 3.516228718706656e-05, + "loss": 2.112, + "step": 1245 + }, + { + "epoch": 0.39, + "grad_norm": 3.0281126499176025, + "learning_rate": 3.504723549842066e-05, + "loss": 1.8516, + "step": 1250 + }, + { + "epoch": 0.39, + "grad_norm": 3.3636157512664795, + "learning_rate": 3.4931929466872685e-05, + "loss": 1.9612, + "step": 1255 + }, + { + "epoch": 0.39, + "grad_norm": 3.7413578033447266, + "learning_rate": 3.481637201136197e-05, + "loss": 1.9865, + "step": 1260 + }, + { + "epoch": 0.39, + "grad_norm": 3.007408618927002, + "learning_rate": 3.4700566057192544e-05, + "loss": 1.9493, + "step": 1265 + }, + { + "epoch": 0.39, + "grad_norm": 4.331480979919434, + "learning_rate": 3.4584514535959114e-05, + "loss": 2.1174, + "step": 1270 + }, + { + "epoch": 0.4, + "grad_norm": 4.286431312561035, + "learning_rate": 3.446822038547287e-05, + "loss": 1.883, + "step": 1275 + }, + { + "epoch": 0.4, + "grad_norm": 3.356170177459717, + "learning_rate": 3.435168654968706e-05, + "loss": 1.9707, + "step": 1280 + }, + { + "epoch": 0.4, + "grad_norm": 3.436434507369995, + "learning_rate": 3.423491597862251e-05, + "loss": 1.8922, + "step": 1285 + }, + { + "epoch": 0.4, + "grad_norm": 3.307274580001831, + "learning_rate": 3.411791162829294e-05, + "loss": 2.0583, + "step": 1290 + }, + { + "epoch": 0.4, + "grad_norm": 4.032553195953369, + "learning_rate": 3.4000676460630126e-05, + "loss": 2.0121, + "step": 1295 + }, + { + "epoch": 0.4, + "grad_norm": 3.4915122985839844, + "learning_rate": 3.3883213443408903e-05, + "loss": 1.9361, + "step": 1300 + }, + { + "epoch": 0.4, + "grad_norm": 3.969005823135376, + "learning_rate": 3.3765525550172066e-05, + "loss": 1.8782, + "step": 1305 + }, + { + "epoch": 0.41, + "grad_norm": 3.772780179977417, + "learning_rate": 3.364761576015507e-05, + "loss": 2.0914, + "step": 1310 + }, + { + "epoch": 0.41, + "grad_norm": 2.9640040397644043, + "learning_rate": 3.352948705821065e-05, + "loss": 1.9143, + "step": 1315 + }, + { + "epoch": 0.41, + "grad_norm": 5.698980331420898, + "learning_rate": 3.341114243473319e-05, + "loss": 1.9417, + "step": 1320 + }, + { + "epoch": 0.41, + "grad_norm": 3.4275810718536377, + "learning_rate": 3.3292584885583114e-05, + "loss": 1.9053, + "step": 1325 + }, + { + "epoch": 0.41, + "grad_norm": 3.2752602100372314, + "learning_rate": 3.317381741201097e-05, + "loss": 2.0126, + "step": 1330 + }, + { + "epoch": 0.41, + "grad_norm": 4.166382312774658, + "learning_rate": 3.305484302058148e-05, + "loss": 1.9256, + "step": 1335 + }, + { + "epoch": 0.42, + "grad_norm": 3.7549707889556885, + "learning_rate": 3.293566472309746e-05, + "loss": 2.0742, + "step": 1340 + }, + { + "epoch": 0.42, + "grad_norm": 3.449774980545044, + "learning_rate": 3.2816285536523515e-05, + "loss": 1.9322, + "step": 1345 + }, + { + "epoch": 0.42, + "grad_norm": 3.590756416320801, + "learning_rate": 3.269670848290973e-05, + "loss": 1.9619, + "step": 1350 + }, + { + "epoch": 0.42, + "grad_norm": 4.403102874755859, + "learning_rate": 3.2576936589315124e-05, + "loss": 1.9513, + "step": 1355 + }, + { + "epoch": 0.42, + "grad_norm": 4.1176676750183105, + "learning_rate": 3.245697288773102e-05, + "loss": 2.0274, + "step": 1360 + }, + { + "epoch": 0.42, + "grad_norm": 4.0299859046936035, + "learning_rate": 3.233682041500433e-05, + "loss": 1.9853, + "step": 1365 + }, + { + "epoch": 0.43, + "grad_norm": 4.306421279907227, + "learning_rate": 3.2216482212760646e-05, + "loss": 1.949, + "step": 1370 + }, + { + "epoch": 0.43, + "grad_norm": 3.9233736991882324, + "learning_rate": 3.209596132732725e-05, + "loss": 1.9009, + "step": 1375 + }, + { + "epoch": 0.43, + "grad_norm": 3.82336163520813, + "learning_rate": 3.197526080965598e-05, + "loss": 2.1035, + "step": 1380 + }, + { + "epoch": 0.43, + "grad_norm": 3.946753740310669, + "learning_rate": 3.185438371524605e-05, + "loss": 1.9775, + "step": 1385 + }, + { + "epoch": 0.43, + "grad_norm": 4.122159481048584, + "learning_rate": 3.173333310406662e-05, + "loss": 1.7694, + "step": 1390 + }, + { + "epoch": 0.43, + "grad_norm": 3.5491435527801514, + "learning_rate": 3.161211204047943e-05, + "loss": 2.0022, + "step": 1395 + }, + { + "epoch": 0.43, + "grad_norm": 4.0456438064575195, + "learning_rate": 3.1490723593161096e-05, + "loss": 2.1332, + "step": 1400 + }, + { + "epoch": 0.44, + "grad_norm": 3.476616621017456, + "learning_rate": 3.1369170835025594e-05, + "loss": 1.9567, + "step": 1405 + }, + { + "epoch": 0.44, + "grad_norm": 3.3506128787994385, + "learning_rate": 3.124745684314633e-05, + "loss": 2.1015, + "step": 1410 + }, + { + "epoch": 0.44, + "grad_norm": 3.737765312194824, + "learning_rate": 3.112558469867829e-05, + "loss": 1.9677, + "step": 1415 + }, + { + "epoch": 0.44, + "grad_norm": 3.6628215312957764, + "learning_rate": 3.100355748678009e-05, + "loss": 2.1167, + "step": 1420 + }, + { + "epoch": 0.44, + "grad_norm": 3.3631627559661865, + "learning_rate": 3.0881378296535784e-05, + "loss": 1.928, + "step": 1425 + }, + { + "epoch": 0.44, + "grad_norm": 4.281042575836182, + "learning_rate": 3.075905022087675e-05, + "loss": 1.9394, + "step": 1430 + }, + { + "epoch": 0.45, + "grad_norm": 3.994631290435791, + "learning_rate": 3.063657635650335e-05, + "loss": 1.8533, + "step": 1435 + }, + { + "epoch": 0.45, + "grad_norm": 5.131731033325195, + "learning_rate": 3.0513959803806526e-05, + "loss": 1.9484, + "step": 1440 + }, + { + "epoch": 0.45, + "grad_norm": 3.4644176959991455, + "learning_rate": 3.039120366678937e-05, + "loss": 1.9492, + "step": 1445 + }, + { + "epoch": 0.45, + "grad_norm": 3.832453966140747, + "learning_rate": 3.0268311052988473e-05, + "loss": 1.869, + "step": 1450 + }, + { + "epoch": 0.45, + "grad_norm": 3.8497562408447266, + "learning_rate": 3.0145285073395334e-05, + "loss": 1.8965, + "step": 1455 + }, + { + "epoch": 0.45, + "grad_norm": 3.4898972511291504, + "learning_rate": 3.0022128842377534e-05, + "loss": 2.0029, + "step": 1460 + }, + { + "epoch": 0.45, + "grad_norm": 4.340991020202637, + "learning_rate": 2.9898845477599963e-05, + "loss": 1.9139, + "step": 1465 + }, + { + "epoch": 0.46, + "grad_norm": 5.687810897827148, + "learning_rate": 2.9775438099945836e-05, + "loss": 2.0196, + "step": 1470 + }, + { + "epoch": 0.46, + "grad_norm": 3.468388795852661, + "learning_rate": 2.965190983343774e-05, + "loss": 2.0382, + "step": 1475 + }, + { + "epoch": 0.46, + "grad_norm": 3.2167277336120605, + "learning_rate": 2.9528263805158524e-05, + "loss": 2.0924, + "step": 1480 + }, + { + "epoch": 0.46, + "grad_norm": 4.481842041015625, + "learning_rate": 2.940450314517214e-05, + "loss": 2.0535, + "step": 1485 + }, + { + "epoch": 0.46, + "grad_norm": 4.334501266479492, + "learning_rate": 2.92806309864444e-05, + "loss": 1.9523, + "step": 1490 + }, + { + "epoch": 0.46, + "grad_norm": 4.137599945068359, + "learning_rate": 2.9156650464763713e-05, + "loss": 2.0247, + "step": 1495 + }, + { + "epoch": 0.47, + "grad_norm": 3.5023269653320312, + "learning_rate": 2.9032564718661603e-05, + "loss": 2.0151, + "step": 1500 + }, + { + "epoch": 0.47, + "grad_norm": 4.225565433502197, + "learning_rate": 2.8908376889333376e-05, + "loss": 1.9438, + "step": 1505 + }, + { + "epoch": 0.47, + "grad_norm": 3.86175799369812, + "learning_rate": 2.8784090120558515e-05, + "loss": 2.0108, + "step": 1510 + }, + { + "epoch": 0.47, + "grad_norm": 2.7544214725494385, + "learning_rate": 2.865970755862114e-05, + "loss": 1.943, + "step": 1515 + }, + { + "epoch": 0.47, + "grad_norm": 3.8477399349212646, + "learning_rate": 2.8535232352230345e-05, + "loss": 1.891, + "step": 1520 + }, + { + "epoch": 0.47, + "grad_norm": 3.7875800132751465, + "learning_rate": 2.8410667652440482e-05, + "loss": 1.9343, + "step": 1525 + }, + { + "epoch": 0.47, + "grad_norm": 3.8977842330932617, + "learning_rate": 2.828601661257142e-05, + "loss": 1.8978, + "step": 1530 + }, + { + "epoch": 0.48, + "grad_norm": 3.39017915725708, + "learning_rate": 2.8161282388128696e-05, + "loss": 1.9368, + "step": 1535 + }, + { + "epoch": 0.48, + "grad_norm": 4.3148322105407715, + "learning_rate": 2.8036468136723627e-05, + "loss": 1.9393, + "step": 1540 + }, + { + "epoch": 0.48, + "grad_norm": 3.528031587600708, + "learning_rate": 2.7911577017993412e-05, + "loss": 1.831, + "step": 1545 + }, + { + "epoch": 0.48, + "grad_norm": 4.506915092468262, + "learning_rate": 2.778661219352111e-05, + "loss": 2.1384, + "step": 1550 + }, + { + "epoch": 0.48, + "grad_norm": 4.252208709716797, + "learning_rate": 2.766157682675562e-05, + "loss": 1.9593, + "step": 1555 + }, + { + "epoch": 0.48, + "grad_norm": 3.718641996383667, + "learning_rate": 2.753647408293161e-05, + "loss": 1.9347, + "step": 1560 + }, + { + "epoch": 0.49, + "grad_norm": 3.7793309688568115, + "learning_rate": 2.7411307128989368e-05, + "loss": 1.9519, + "step": 1565 + }, + { + "epoch": 0.49, + "grad_norm": 3.7921085357666016, + "learning_rate": 2.728607913349464e-05, + "loss": 1.8966, + "step": 1570 + }, + { + "epoch": 0.49, + "grad_norm": 3.735579252243042, + "learning_rate": 2.7160793266558443e-05, + "loss": 1.8972, + "step": 1575 + }, + { + "epoch": 0.49, + "grad_norm": 4.979485511779785, + "learning_rate": 2.7035452699756768e-05, + "loss": 1.9879, + "step": 1580 + }, + { + "epoch": 0.49, + "grad_norm": 3.672161102294922, + "learning_rate": 2.6910060606050324e-05, + "loss": 1.895, + "step": 1585 + }, + { + "epoch": 0.49, + "grad_norm": 3.2381715774536133, + "learning_rate": 2.6784620159704222e-05, + "loss": 1.9259, + "step": 1590 + }, + { + "epoch": 0.49, + "grad_norm": 5.407585620880127, + "learning_rate": 2.6659134536207587e-05, + "loss": 1.9021, + "step": 1595 + }, + { + "epoch": 0.5, + "grad_norm": 3.894399642944336, + "learning_rate": 2.6533606912193216e-05, + "loss": 2.0666, + "step": 1600 + }, + { + "epoch": 0.5, + "grad_norm": 3.4516754150390625, + "learning_rate": 2.6408040465357097e-05, + "loss": 1.9388, + "step": 1605 + }, + { + "epoch": 0.5, + "grad_norm": 5.389581203460693, + "learning_rate": 2.628243837437806e-05, + "loss": 1.9731, + "step": 1610 + }, + { + "epoch": 0.5, + "grad_norm": 3.623656988143921, + "learning_rate": 2.6156803818837204e-05, + "loss": 1.8931, + "step": 1615 + }, + { + "epoch": 0.5, + "grad_norm": 3.5042312145233154, + "learning_rate": 2.6031139979137492e-05, + "loss": 1.8365, + "step": 1620 + }, + { + "epoch": 0.5, + "grad_norm": 5.07073974609375, + "learning_rate": 2.59054500364232e-05, + "loss": 2.0215, + "step": 1625 + }, + { + "epoch": 0.51, + "grad_norm": 4.199176788330078, + "learning_rate": 2.5779737172499396e-05, + "loss": 1.967, + "step": 1630 + }, + { + "epoch": 0.51, + "grad_norm": 4.009402751922607, + "learning_rate": 2.565400456975138e-05, + "loss": 2.0154, + "step": 1635 + }, + { + "epoch": 0.51, + "grad_norm": 3.114271640777588, + "learning_rate": 2.552825541106414e-05, + "loss": 1.9405, + "step": 1640 + }, + { + "epoch": 0.51, + "grad_norm": 3.4758782386779785, + "learning_rate": 2.540249287974178e-05, + "loss": 1.94, + "step": 1645 + }, + { + "epoch": 0.51, + "grad_norm": 6.038011074066162, + "learning_rate": 2.527672015942693e-05, + "loss": 2.1653, + "step": 1650 + }, + { + "epoch": 0.51, + "grad_norm": 3.370410203933716, + "learning_rate": 2.5150940434020132e-05, + "loss": 1.9588, + "step": 1655 + }, + { + "epoch": 0.52, + "grad_norm": 3.766829252243042, + "learning_rate": 2.5025156887599288e-05, + "loss": 1.8133, + "step": 1660 + }, + { + "epoch": 0.52, + "grad_norm": 3.650520086288452, + "learning_rate": 2.489937270433901e-05, + "loss": 1.9111, + "step": 1665 + }, + { + "epoch": 0.52, + "grad_norm": 3.1080238819122314, + "learning_rate": 2.4773591068430018e-05, + "loss": 1.8758, + "step": 1670 + }, + { + "epoch": 0.52, + "grad_norm": 3.3637783527374268, + "learning_rate": 2.4647815163998585e-05, + "loss": 1.7589, + "step": 1675 + }, + { + "epoch": 0.52, + "grad_norm": 4.043179988861084, + "learning_rate": 2.452204817502587e-05, + "loss": 1.9339, + "step": 1680 + }, + { + "epoch": 0.52, + "grad_norm": 4.033404350280762, + "learning_rate": 2.4396293285267327e-05, + "loss": 1.9412, + "step": 1685 + }, + { + "epoch": 0.52, + "grad_norm": 4.043616771697998, + "learning_rate": 2.427055367817214e-05, + "loss": 1.8728, + "step": 1690 + }, + { + "epoch": 0.53, + "grad_norm": 4.840696811676025, + "learning_rate": 2.4144832536802628e-05, + "loss": 1.9966, + "step": 1695 + }, + { + "epoch": 0.53, + "grad_norm": 4.977992057800293, + "learning_rate": 2.4019133043753628e-05, + "loss": 1.9621, + "step": 1700 + }, + { + "epoch": 0.53, + "grad_norm": 3.1471240520477295, + "learning_rate": 2.3893458381071964e-05, + "loss": 2.0315, + "step": 1705 + }, + { + "epoch": 0.53, + "grad_norm": 5.21504020690918, + "learning_rate": 2.376781173017589e-05, + "loss": 1.9859, + "step": 1710 + }, + { + "epoch": 0.53, + "grad_norm": 3.4117472171783447, + "learning_rate": 2.3642196271774568e-05, + "loss": 1.905, + "step": 1715 + }, + { + "epoch": 0.53, + "grad_norm": 3.8640167713165283, + "learning_rate": 2.3516615185787494e-05, + "loss": 2.0321, + "step": 1720 + }, + { + "epoch": 0.54, + "grad_norm": 3.5830259323120117, + "learning_rate": 2.3391071651264064e-05, + "loss": 1.9936, + "step": 1725 + }, + { + "epoch": 0.54, + "grad_norm": 5.528283596038818, + "learning_rate": 2.3265568846303054e-05, + "loss": 1.8955, + "step": 1730 + }, + { + "epoch": 0.54, + "grad_norm": 3.968691110610962, + "learning_rate": 2.3140109947972204e-05, + "loss": 1.9137, + "step": 1735 + }, + { + "epoch": 0.54, + "grad_norm": 3.56799054145813, + "learning_rate": 2.3014698132227735e-05, + "loss": 1.9854, + "step": 1740 + }, + { + "epoch": 0.54, + "grad_norm": 4.353531360626221, + "learning_rate": 2.2889336573834027e-05, + "loss": 1.8967, + "step": 1745 + }, + { + "epoch": 0.54, + "grad_norm": 3.8630661964416504, + "learning_rate": 2.276402844628317e-05, + "loss": 1.8833, + "step": 1750 + }, + { + "epoch": 0.54, + "grad_norm": 3.5117268562316895, + "learning_rate": 2.2638776921714696e-05, + "loss": 1.8493, + "step": 1755 + }, + { + "epoch": 0.55, + "grad_norm": 4.000200271606445, + "learning_rate": 2.251358517083524e-05, + "loss": 1.8717, + "step": 1760 + }, + { + "epoch": 0.55, + "grad_norm": 3.0542423725128174, + "learning_rate": 2.2388456362838283e-05, + "loss": 1.9941, + "step": 1765 + }, + { + "epoch": 0.55, + "grad_norm": 4.117686748504639, + "learning_rate": 2.2263393665323907e-05, + "loss": 2.0925, + "step": 1770 + }, + { + "epoch": 0.55, + "grad_norm": 5.376316070556641, + "learning_rate": 2.2138400244218665e-05, + "loss": 2.0568, + "step": 1775 + }, + { + "epoch": 0.55, + "grad_norm": 3.879211187362671, + "learning_rate": 2.2013479263695368e-05, + "loss": 1.9256, + "step": 1780 + }, + { + "epoch": 0.55, + "grad_norm": 4.660920143127441, + "learning_rate": 2.1888633886093017e-05, + "loss": 2.092, + "step": 1785 + }, + { + "epoch": 0.56, + "grad_norm": 3.143937587738037, + "learning_rate": 2.176386727183676e-05, + "loss": 1.7624, + "step": 1790 + }, + { + "epoch": 0.56, + "grad_norm": 4.354220390319824, + "learning_rate": 2.1639182579357846e-05, + "loss": 1.8961, + "step": 1795 + }, + { + "epoch": 0.56, + "grad_norm": 5.339317798614502, + "learning_rate": 2.151458296501374e-05, + "loss": 1.9361, + "step": 1800 + }, + { + "epoch": 0.56, + "grad_norm": 3.080310344696045, + "learning_rate": 2.139007158300814e-05, + "loss": 1.8459, + "step": 1805 + }, + { + "epoch": 0.56, + "grad_norm": 3.5018744468688965, + "learning_rate": 2.126565158531119e-05, + "loss": 1.9086, + "step": 1810 + }, + { + "epoch": 0.56, + "grad_norm": 5.1605072021484375, + "learning_rate": 2.1141326121579638e-05, + "loss": 1.9395, + "step": 1815 + }, + { + "epoch": 0.56, + "grad_norm": 4.0767998695373535, + "learning_rate": 2.1017098339077176e-05, + "loss": 2.005, + "step": 1820 + }, + { + "epoch": 0.57, + "grad_norm": 4.308762073516846, + "learning_rate": 2.0892971382594694e-05, + "loss": 1.8772, + "step": 1825 + }, + { + "epoch": 0.57, + "grad_norm": 3.049802541732788, + "learning_rate": 2.0768948394370702e-05, + "loss": 1.9591, + "step": 1830 + }, + { + "epoch": 0.57, + "grad_norm": 3.853872060775757, + "learning_rate": 2.0645032514011773e-05, + "loss": 1.8408, + "step": 1835 + }, + { + "epoch": 0.57, + "grad_norm": 3.8186545372009277, + "learning_rate": 2.052122687841311e-05, + "loss": 1.9765, + "step": 1840 + }, + { + "epoch": 0.57, + "grad_norm": 3.238193988800049, + "learning_rate": 2.0397534621679075e-05, + "loss": 1.931, + "step": 1845 + }, + { + "epoch": 0.57, + "grad_norm": 3.316253662109375, + "learning_rate": 2.0273958875043874e-05, + "loss": 1.9787, + "step": 1850 + }, + { + "epoch": 0.58, + "grad_norm": 4.303181171417236, + "learning_rate": 2.0150502766792298e-05, + "loss": 1.9991, + "step": 1855 + }, + { + "epoch": 0.58, + "grad_norm": 3.6812000274658203, + "learning_rate": 2.0027169422180546e-05, + "loss": 1.8782, + "step": 1860 + }, + { + "epoch": 0.58, + "grad_norm": 5.033133506774902, + "learning_rate": 1.990396196335706e-05, + "loss": 1.8406, + "step": 1865 + }, + { + "epoch": 0.58, + "grad_norm": 4.612210750579834, + "learning_rate": 1.9780883509283526e-05, + "loss": 2.0226, + "step": 1870 + }, + { + "epoch": 0.58, + "grad_norm": 4.63312292098999, + "learning_rate": 1.9657937175655922e-05, + "loss": 1.9403, + "step": 1875 + }, + { + "epoch": 0.58, + "grad_norm": 3.5263733863830566, + "learning_rate": 1.9535126074825647e-05, + "loss": 1.9812, + "step": 1880 + }, + { + "epoch": 0.58, + "grad_norm": 3.100794792175293, + "learning_rate": 1.941245331572068e-05, + "loss": 1.8332, + "step": 1885 + }, + { + "epoch": 0.59, + "grad_norm": 4.041380405426025, + "learning_rate": 1.9289922003766962e-05, + "loss": 1.9352, + "step": 1890 + }, + { + "epoch": 0.59, + "grad_norm": 3.329756736755371, + "learning_rate": 1.9167535240809703e-05, + "loss": 1.9084, + "step": 1895 + }, + { + "epoch": 0.59, + "grad_norm": 3.596053123474121, + "learning_rate": 1.904529612503493e-05, + "loss": 1.8971, + "step": 1900 + }, + { + "epoch": 0.59, + "grad_norm": 3.9134511947631836, + "learning_rate": 1.8923207750890992e-05, + "loss": 2.0642, + "step": 1905 + }, + { + "epoch": 0.59, + "grad_norm": 3.707994222640991, + "learning_rate": 1.8801273209010284e-05, + "loss": 1.8276, + "step": 1910 + }, + { + "epoch": 0.59, + "grad_norm": 4.338993072509766, + "learning_rate": 1.8679495586130952e-05, + "loss": 1.9576, + "step": 1915 + }, + { + "epoch": 0.6, + "grad_norm": 3.758429765701294, + "learning_rate": 1.8557877965018817e-05, + "loss": 1.9956, + "step": 1920 + }, + { + "epoch": 0.6, + "grad_norm": 3.7816905975341797, + "learning_rate": 1.843642342438928e-05, + "loss": 1.9079, + "step": 1925 + }, + { + "epoch": 0.6, + "grad_norm": 5.009194850921631, + "learning_rate": 1.8315135038829406e-05, + "loss": 1.9509, + "step": 1930 + }, + { + "epoch": 0.6, + "grad_norm": 3.4465157985687256, + "learning_rate": 1.8194015878720084e-05, + "loss": 2.0019, + "step": 1935 + }, + { + "epoch": 0.6, + "grad_norm": 3.6948273181915283, + "learning_rate": 1.8073069010158334e-05, + "loss": 2.0043, + "step": 1940 + }, + { + "epoch": 0.6, + "grad_norm": 3.3850791454315186, + "learning_rate": 1.795229749487965e-05, + "loss": 1.9031, + "step": 1945 + }, + { + "epoch": 0.61, + "grad_norm": 5.051716327667236, + "learning_rate": 1.7831704390180498e-05, + "loss": 1.8958, + "step": 1950 + }, + { + "epoch": 0.61, + "grad_norm": 2.8910887241363525, + "learning_rate": 1.7711292748840943e-05, + "loss": 1.8856, + "step": 1955 + }, + { + "epoch": 0.61, + "grad_norm": 3.8123810291290283, + "learning_rate": 1.759106561904737e-05, + "loss": 1.8229, + "step": 1960 + }, + { + "epoch": 0.61, + "grad_norm": 4.154626369476318, + "learning_rate": 1.747102604431528e-05, + "loss": 1.9509, + "step": 1965 + }, + { + "epoch": 0.61, + "grad_norm": 4.20812463760376, + "learning_rate": 1.7351177063412276e-05, + "loss": 1.9501, + "step": 1970 + }, + { + "epoch": 0.61, + "grad_norm": 3.2041704654693604, + "learning_rate": 1.723152171028114e-05, + "loss": 1.9888, + "step": 1975 + }, + { + "epoch": 0.61, + "grad_norm": 3.133105754852295, + "learning_rate": 1.7112063013963044e-05, + "loss": 2.0086, + "step": 1980 + }, + { + "epoch": 0.62, + "grad_norm": 4.227274417877197, + "learning_rate": 1.6992803998520794e-05, + "loss": 1.9373, + "step": 1985 + }, + { + "epoch": 0.62, + "grad_norm": 3.2231645584106445, + "learning_rate": 1.6873747682962394e-05, + "loss": 1.7439, + "step": 1990 + }, + { + "epoch": 0.62, + "grad_norm": 2.90924334526062, + "learning_rate": 1.67548970811645e-05, + "loss": 1.8914, + "step": 1995 + }, + { + "epoch": 0.62, + "grad_norm": 3.2363147735595703, + "learning_rate": 1.6636255201796237e-05, + "loss": 1.9674, + "step": 2000 + }, + { + "epoch": 0.62, + "grad_norm": 4.925014019012451, + "learning_rate": 1.6517825048242936e-05, + "loss": 1.8693, + "step": 2005 + }, + { + "epoch": 0.62, + "grad_norm": 3.2326242923736572, + "learning_rate": 1.6399609618530183e-05, + "loss": 1.8776, + "step": 2010 + }, + { + "epoch": 0.63, + "grad_norm": 3.984081506729126, + "learning_rate": 1.6281611905247855e-05, + "loss": 1.881, + "step": 2015 + }, + { + "epoch": 0.63, + "grad_norm": 3.8823959827423096, + "learning_rate": 1.6163834895474445e-05, + "loss": 1.9769, + "step": 2020 + }, + { + "epoch": 0.63, + "grad_norm": 4.131060600280762, + "learning_rate": 1.604628157070136e-05, + "loss": 1.9811, + "step": 2025 + }, + { + "epoch": 0.63, + "grad_norm": 4.516271591186523, + "learning_rate": 1.5928954906757515e-05, + "loss": 1.995, + "step": 2030 + }, + { + "epoch": 0.63, + "grad_norm": 3.9269816875457764, + "learning_rate": 1.5811857873733942e-05, + "loss": 1.8224, + "step": 2035 + }, + { + "epoch": 0.63, + "grad_norm": 3.7068333625793457, + "learning_rate": 1.5694993435908646e-05, + "loss": 1.8799, + "step": 2040 + }, + { + "epoch": 0.63, + "grad_norm": 4.0933756828308105, + "learning_rate": 1.557836455167157e-05, + "loss": 1.9251, + "step": 2045 + }, + { + "epoch": 0.64, + "grad_norm": 4.189598560333252, + "learning_rate": 1.546197417344965e-05, + "loss": 2.032, + "step": 2050 + }, + { + "epoch": 0.64, + "grad_norm": 3.609545946121216, + "learning_rate": 1.5345825247632135e-05, + "loss": 1.9399, + "step": 2055 + }, + { + "epoch": 0.64, + "grad_norm": 3.9929699897766113, + "learning_rate": 1.5229920714495948e-05, + "loss": 1.8803, + "step": 2060 + }, + { + "epoch": 0.64, + "grad_norm": 3.578582286834717, + "learning_rate": 1.5114263508131327e-05, + "loss": 1.8303, + "step": 2065 + }, + { + "epoch": 0.64, + "grad_norm": 3.167156457901001, + "learning_rate": 1.499885655636746e-05, + "loss": 2.0741, + "step": 2070 + }, + { + "epoch": 0.64, + "grad_norm": 3.376950263977051, + "learning_rate": 1.4883702780698433e-05, + "loss": 1.8935, + "step": 2075 + }, + { + "epoch": 0.65, + "grad_norm": 7.022952556610107, + "learning_rate": 1.4768805096209231e-05, + "loss": 1.9285, + "step": 2080 + }, + { + "epoch": 0.65, + "grad_norm": 4.465900897979736, + "learning_rate": 1.4654166411502002e-05, + "loss": 1.9464, + "step": 2085 + }, + { + "epoch": 0.65, + "grad_norm": 2.990349292755127, + "learning_rate": 1.4539789628622347e-05, + "loss": 1.8252, + "step": 2090 + }, + { + "epoch": 0.65, + "grad_norm": 3.1683619022369385, + "learning_rate": 1.4425677642985924e-05, + "loss": 1.8346, + "step": 2095 + }, + { + "epoch": 0.65, + "grad_norm": 3.782841444015503, + "learning_rate": 1.4311833343305097e-05, + "loss": 1.8584, + "step": 2100 + }, + { + "epoch": 0.65, + "grad_norm": 3.302788257598877, + "learning_rate": 1.4198259611515886e-05, + "loss": 1.9615, + "step": 2105 + }, + { + "epoch": 0.65, + "grad_norm": 4.179065227508545, + "learning_rate": 1.4084959322704893e-05, + "loss": 2.0387, + "step": 2110 + }, + { + "epoch": 0.66, + "grad_norm": 3.3860225677490234, + "learning_rate": 1.3971935345036657e-05, + "loss": 1.7267, + "step": 2115 + }, + { + "epoch": 0.66, + "grad_norm": 4.326015472412109, + "learning_rate": 1.3859190539680927e-05, + "loss": 1.9828, + "step": 2120 + }, + { + "epoch": 0.66, + "grad_norm": 3.4805123805999756, + "learning_rate": 1.3746727760740328e-05, + "loss": 1.8873, + "step": 2125 + }, + { + "epoch": 0.66, + "grad_norm": 2.8176207542419434, + "learning_rate": 1.3634549855178028e-05, + "loss": 2.0302, + "step": 2130 + }, + { + "epoch": 0.66, + "grad_norm": 2.756837844848633, + "learning_rate": 1.3522659662745723e-05, + "loss": 1.9893, + "step": 2135 + }, + { + "epoch": 0.66, + "grad_norm": 4.258969783782959, + "learning_rate": 1.3411060015911734e-05, + "loss": 1.847, + "step": 2140 + }, + { + "epoch": 0.67, + "grad_norm": 5.707541465759277, + "learning_rate": 1.32997537397893e-05, + "loss": 1.8802, + "step": 2145 + }, + { + "epoch": 0.67, + "grad_norm": 3.7876532077789307, + "learning_rate": 1.3188743652065083e-05, + "loss": 1.9015, + "step": 2150 + }, + { + "epoch": 0.67, + "grad_norm": 3.91947340965271, + "learning_rate": 1.3078032562927788e-05, + "loss": 1.8293, + "step": 2155 + }, + { + "epoch": 0.67, + "grad_norm": 4.129434108734131, + "learning_rate": 1.296762327499707e-05, + "loss": 1.786, + "step": 2160 + }, + { + "epoch": 0.67, + "grad_norm": 3.0605030059814453, + "learning_rate": 1.2857518583252587e-05, + "loss": 1.9754, + "step": 2165 + }, + { + "epoch": 0.67, + "grad_norm": 3.6712772846221924, + "learning_rate": 1.2747721274963214e-05, + "loss": 1.8931, + "step": 2170 + }, + { + "epoch": 0.67, + "grad_norm": 3.6777453422546387, + "learning_rate": 1.2638234129616488e-05, + "loss": 1.9122, + "step": 2175 + }, + { + "epoch": 0.68, + "grad_norm": 3.1498284339904785, + "learning_rate": 1.2529059918848296e-05, + "loss": 1.8041, + "step": 2180 + }, + { + "epoch": 0.68, + "grad_norm": 3.7665841579437256, + "learning_rate": 1.2420201406372662e-05, + "loss": 1.7802, + "step": 2185 + }, + { + "epoch": 0.68, + "grad_norm": 3.147603988647461, + "learning_rate": 1.2311661347911783e-05, + "loss": 1.9658, + "step": 2190 + }, + { + "epoch": 0.68, + "grad_norm": 3.327116012573242, + "learning_rate": 1.220344249112629e-05, + "loss": 1.8795, + "step": 2195 + }, + { + "epoch": 0.68, + "grad_norm": 3.689382553100586, + "learning_rate": 1.2095547575545686e-05, + "loss": 1.942, + "step": 2200 + }, + { + "epoch": 0.68, + "grad_norm": 3.967803955078125, + "learning_rate": 1.1987979332499011e-05, + "loss": 1.8653, + "step": 2205 + }, + { + "epoch": 0.69, + "grad_norm": 3.113976001739502, + "learning_rate": 1.1880740485045649e-05, + "loss": 1.8737, + "step": 2210 + }, + { + "epoch": 0.69, + "grad_norm": 3.3383049964904785, + "learning_rate": 1.1773833747906471e-05, + "loss": 1.9163, + "step": 2215 + }, + { + "epoch": 0.69, + "grad_norm": 3.971327304840088, + "learning_rate": 1.1667261827395035e-05, + "loss": 2.0355, + "step": 2220 + }, + { + "epoch": 0.69, + "grad_norm": 3.8071823120117188, + "learning_rate": 1.1561027421349117e-05, + "loss": 1.7467, + "step": 2225 + }, + { + "epoch": 0.69, + "grad_norm": 3.7409048080444336, + "learning_rate": 1.145513321906243e-05, + "loss": 1.847, + "step": 2230 + }, + { + "epoch": 0.69, + "grad_norm": 5.195309162139893, + "learning_rate": 1.1349581901216514e-05, + "loss": 2.0805, + "step": 2235 + }, + { + "epoch": 0.7, + "grad_norm": 2.922433376312256, + "learning_rate": 1.1244376139812867e-05, + "loss": 1.7545, + "step": 2240 + }, + { + "epoch": 0.7, + "grad_norm": 5.311805725097656, + "learning_rate": 1.1139518598105358e-05, + "loss": 1.9093, + "step": 2245 + }, + { + "epoch": 0.7, + "grad_norm": 3.9856057167053223, + "learning_rate": 1.1035011930532771e-05, + "loss": 1.8777, + "step": 2250 + }, + { + "epoch": 0.7, + "grad_norm": 3.006605386734009, + "learning_rate": 1.0930858782651585e-05, + "loss": 1.9631, + "step": 2255 + }, + { + "epoch": 0.7, + "grad_norm": 3.3158912658691406, + "learning_rate": 1.0827061791069045e-05, + "loss": 1.8097, + "step": 2260 + }, + { + "epoch": 0.7, + "grad_norm": 4.086146831512451, + "learning_rate": 1.0723623583376392e-05, + "loss": 1.9171, + "step": 2265 + }, + { + "epoch": 0.7, + "grad_norm": 4.822931289672852, + "learning_rate": 1.062054677808238e-05, + "loss": 2.1704, + "step": 2270 + }, + { + "epoch": 0.71, + "grad_norm": 3.8096282482147217, + "learning_rate": 1.0517833984546923e-05, + "loss": 1.9599, + "step": 2275 + }, + { + "epoch": 0.71, + "grad_norm": 5.096799373626709, + "learning_rate": 1.0415487802915133e-05, + "loss": 1.9463, + "step": 2280 + }, + { + "epoch": 0.71, + "grad_norm": 3.9913666248321533, + "learning_rate": 1.0313510824051393e-05, + "loss": 1.9045, + "step": 2285 + }, + { + "epoch": 0.71, + "grad_norm": 3.0718228816986084, + "learning_rate": 1.0211905629473866e-05, + "loss": 1.7678, + "step": 2290 + }, + { + "epoch": 0.71, + "grad_norm": 5.186037540435791, + "learning_rate": 1.0110674791289079e-05, + "loss": 1.9355, + "step": 2295 + }, + { + "epoch": 0.71, + "grad_norm": 3.739786386489868, + "learning_rate": 1.0009820872126835e-05, + "loss": 2.015, + "step": 2300 + }, + { + "epoch": 0.72, + "grad_norm": 3.730051040649414, + "learning_rate": 9.909346425075335e-06, + "loss": 1.9639, + "step": 2305 + }, + { + "epoch": 0.72, + "grad_norm": 4.366475582122803, + "learning_rate": 9.809253993616569e-06, + "loss": 2.1142, + "step": 2310 + }, + { + "epoch": 0.72, + "grad_norm": 2.9198176860809326, + "learning_rate": 9.709546111561913e-06, + "loss": 1.8616, + "step": 2315 + }, + { + "epoch": 0.72, + "grad_norm": 3.5179014205932617, + "learning_rate": 9.610225302987961e-06, + "loss": 1.8651, + "step": 2320 + }, + { + "epoch": 0.72, + "grad_norm": 3.9303548336029053, + "learning_rate": 9.511294082172653e-06, + "loss": 2.0002, + "step": 2325 + }, + { + "epoch": 0.72, + "grad_norm": 3.435821771621704, + "learning_rate": 9.412754953531663e-06, + "loss": 1.8817, + "step": 2330 + }, + { + "epoch": 0.72, + "grad_norm": 4.4535932540893555, + "learning_rate": 9.314610411554925e-06, + "loss": 1.8213, + "step": 2335 + }, + { + "epoch": 0.73, + "grad_norm": 3.345769166946411, + "learning_rate": 9.216862940743529e-06, + "loss": 1.8374, + "step": 2340 + }, + { + "epoch": 0.73, + "grad_norm": 4.314777851104736, + "learning_rate": 9.119515015546836e-06, + "loss": 2.0438, + "step": 2345 + }, + { + "epoch": 0.73, + "grad_norm": 4.599632263183594, + "learning_rate": 9.02256910029983e-06, + "loss": 1.8459, + "step": 2350 + }, + { + "epoch": 0.73, + "grad_norm": 3.590637683868408, + "learning_rate": 8.926027649160704e-06, + "loss": 1.8009, + "step": 2355 + }, + { + "epoch": 0.73, + "grad_norm": 3.119189500808716, + "learning_rate": 8.82989310604877e-06, + "loss": 1.9651, + "step": 2360 + }, + { + "epoch": 0.73, + "grad_norm": 3.1386303901672363, + "learning_rate": 8.734167904582566e-06, + "loss": 1.7791, + "step": 2365 + }, + { + "epoch": 0.74, + "grad_norm": 3.6528995037078857, + "learning_rate": 8.638854468018296e-06, + "loss": 1.9259, + "step": 2370 + }, + { + "epoch": 0.74, + "grad_norm": 4.182424545288086, + "learning_rate": 8.543955209188412e-06, + "loss": 1.8853, + "step": 2375 + }, + { + "epoch": 0.74, + "grad_norm": 5.662861347198486, + "learning_rate": 8.449472530440612e-06, + "loss": 1.9349, + "step": 2380 + }, + { + "epoch": 0.74, + "grad_norm": 4.169982433319092, + "learning_rate": 8.355408823576951e-06, + "loss": 1.9554, + "step": 2385 + }, + { + "epoch": 0.74, + "grad_norm": 3.808478832244873, + "learning_rate": 8.261766469793373e-06, + "loss": 1.8309, + "step": 2390 + }, + { + "epoch": 0.74, + "grad_norm": 3.801201343536377, + "learning_rate": 8.168547839619352e-06, + "loss": 1.8714, + "step": 2395 + }, + { + "epoch": 0.74, + "grad_norm": 3.8212218284606934, + "learning_rate": 8.075755292857933e-06, + "loss": 1.844, + "step": 2400 + }, + { + "epoch": 0.75, + "grad_norm": 4.7147650718688965, + "learning_rate": 7.983391178525979e-06, + "loss": 1.9004, + "step": 2405 + }, + { + "epoch": 0.75, + "grad_norm": 3.4768807888031006, + "learning_rate": 7.89145783479471e-06, + "loss": 1.947, + "step": 2410 + }, + { + "epoch": 0.75, + "grad_norm": 3.307199478149414, + "learning_rate": 7.799957588930523e-06, + "loss": 1.9069, + "step": 2415 + }, + { + "epoch": 0.75, + "grad_norm": 4.613658905029297, + "learning_rate": 7.708892757236047e-06, + "loss": 1.917, + "step": 2420 + }, + { + "epoch": 0.75, + "grad_norm": 2.8293955326080322, + "learning_rate": 7.618265644991535e-06, + "loss": 1.8854, + "step": 2425 + }, + { + "epoch": 0.75, + "grad_norm": 3.302823066711426, + "learning_rate": 7.528078546396481e-06, + "loss": 2.0073, + "step": 2430 + }, + { + "epoch": 0.76, + "grad_norm": 2.862478494644165, + "learning_rate": 7.438333744511591e-06, + "loss": 1.9243, + "step": 2435 + }, + { + "epoch": 0.76, + "grad_norm": 4.1902899742126465, + "learning_rate": 7.3490335112009225e-06, + "loss": 1.8696, + "step": 2440 + }, + { + "epoch": 0.76, + "grad_norm": 3.4848709106445312, + "learning_rate": 7.260180107074438e-06, + "loss": 2.0236, + "step": 2445 + }, + { + "epoch": 0.76, + "grad_norm": 2.9219446182250977, + "learning_rate": 7.171775781430712e-06, + "loss": 1.9218, + "step": 2450 + }, + { + "epoch": 0.76, + "grad_norm": 3.458622694015503, + "learning_rate": 7.083822772200058e-06, + "loss": 1.9155, + "step": 2455 + }, + { + "epoch": 0.76, + "grad_norm": 3.5859556198120117, + "learning_rate": 6.996323305887822e-06, + "loss": 1.9701, + "step": 2460 + }, + { + "epoch": 0.76, + "grad_norm": 3.7645373344421387, + "learning_rate": 6.909279597518048e-06, + "loss": 1.9555, + "step": 2465 + }, + { + "epoch": 0.77, + "grad_norm": 5.934003829956055, + "learning_rate": 6.822693850577385e-06, + "loss": 1.9963, + "step": 2470 + }, + { + "epoch": 0.77, + "grad_norm": 4.152750015258789, + "learning_rate": 6.7365682569593496e-06, + "loss": 1.8777, + "step": 2475 + }, + { + "epoch": 0.77, + "grad_norm": 3.7498714923858643, + "learning_rate": 6.6509049969087715e-06, + "loss": 1.9313, + "step": 2480 + }, + { + "epoch": 0.77, + "grad_norm": 2.86311411857605, + "learning_rate": 6.565706238966671e-06, + "loss": 1.7692, + "step": 2485 + }, + { + "epoch": 0.77, + "grad_norm": 4.296627521514893, + "learning_rate": 6.480974139915297e-06, + "loss": 1.942, + "step": 2490 + }, + { + "epoch": 0.77, + "grad_norm": 3.102341890335083, + "learning_rate": 6.396710844723597e-06, + "loss": 1.9011, + "step": 2495 + }, + { + "epoch": 0.78, + "grad_norm": 4.467423439025879, + "learning_rate": 6.312918486492855e-06, + "loss": 1.8276, + "step": 2500 + }, + { + "epoch": 0.78, + "grad_norm": 4.662038803100586, + "learning_rate": 6.229599186402729e-06, + "loss": 1.8927, + "step": 2505 + }, + { + "epoch": 0.78, + "grad_norm": 6.194324493408203, + "learning_rate": 6.146755053657541e-06, + "loss": 1.8046, + "step": 2510 + }, + { + "epoch": 0.78, + "grad_norm": 3.2271151542663574, + "learning_rate": 6.064388185432898e-06, + "loss": 1.7897, + "step": 2515 + }, + { + "epoch": 0.78, + "grad_norm": 3.0152978897094727, + "learning_rate": 5.9825006668225905e-06, + "loss": 1.8203, + "step": 2520 + }, + { + "epoch": 0.78, + "grad_norm": 3.5677027702331543, + "learning_rate": 5.901094570785798e-06, + "loss": 1.9312, + "step": 2525 + }, + { + "epoch": 0.79, + "grad_norm": 3.464501142501831, + "learning_rate": 5.820171958094628e-06, + "loss": 1.9227, + "step": 2530 + }, + { + "epoch": 0.79, + "grad_norm": 4.184050559997559, + "learning_rate": 5.73973487728196e-06, + "loss": 1.8542, + "step": 2535 + }, + { + "epoch": 0.79, + "grad_norm": 3.7280945777893066, + "learning_rate": 5.659785364589556e-06, + "loss": 2.0387, + "step": 2540 + }, + { + "epoch": 0.79, + "grad_norm": 3.863532543182373, + "learning_rate": 5.580325443916526e-06, + "loss": 1.8824, + "step": 2545 + }, + { + "epoch": 0.79, + "grad_norm": 3.403118133544922, + "learning_rate": 5.501357126768117e-06, + "loss": 1.8999, + "step": 2550 + }, + { + "epoch": 0.79, + "grad_norm": 3.203178644180298, + "learning_rate": 5.422882412204766e-06, + "loss": 2.0521, + "step": 2555 + }, + { + "epoch": 0.79, + "grad_norm": 3.8374898433685303, + "learning_rate": 5.344903286791494e-06, + "loss": 1.8838, + "step": 2560 + }, + { + "epoch": 0.8, + "grad_norm": 3.570945978164673, + "learning_rate": 5.267421724547627e-06, + "loss": 1.9615, + "step": 2565 + }, + { + "epoch": 0.8, + "grad_norm": 6.397089004516602, + "learning_rate": 5.1904396868968195e-06, + "loss": 1.9624, + "step": 2570 + }, + { + "epoch": 0.8, + "grad_norm": 3.234090805053711, + "learning_rate": 5.113959122617412e-06, + "loss": 1.9239, + "step": 2575 + }, + { + "epoch": 0.8, + "grad_norm": 3.1682183742523193, + "learning_rate": 5.037981967793076e-06, + "loss": 1.8498, + "step": 2580 + }, + { + "epoch": 0.8, + "grad_norm": 4.0839152336120605, + "learning_rate": 4.9625101457638376e-06, + "loss": 1.9856, + "step": 2585 + }, + { + "epoch": 0.8, + "grad_norm": 3.629542589187622, + "learning_rate": 4.887545567077337e-06, + "loss": 1.8867, + "step": 2590 + }, + { + "epoch": 0.81, + "grad_norm": 4.0674638748168945, + "learning_rate": 4.8130901294405255e-06, + "loss": 2.0402, + "step": 2595 + }, + { + "epoch": 0.81, + "grad_norm": 3.093059539794922, + "learning_rate": 4.739145717671572e-06, + "loss": 1.9107, + "step": 2600 + }, + { + "epoch": 0.81, + "grad_norm": 6.425740718841553, + "learning_rate": 4.665714203652177e-06, + "loss": 1.8893, + "step": 2605 + }, + { + "epoch": 0.81, + "grad_norm": 3.764960765838623, + "learning_rate": 4.592797446280178e-06, + "loss": 1.8649, + "step": 2610 + }, + { + "epoch": 0.81, + "grad_norm": 3.2027156352996826, + "learning_rate": 4.520397291422501e-06, + "loss": 1.991, + "step": 2615 + }, + { + "epoch": 0.81, + "grad_norm": 4.535457134246826, + "learning_rate": 4.448515571868434e-06, + "loss": 1.8798, + "step": 2620 + }, + { + "epoch": 0.81, + "grad_norm": 3.6848881244659424, + "learning_rate": 4.3771541072832045e-06, + "loss": 1.9349, + "step": 2625 + }, + { + "epoch": 0.82, + "grad_norm": 3.817534923553467, + "learning_rate": 4.306314704161937e-06, + "loss": 1.8637, + "step": 2630 + }, + { + "epoch": 0.82, + "grad_norm": 3.4655098915100098, + "learning_rate": 4.23599915578394e-06, + "loss": 1.8615, + "step": 2635 + }, + { + "epoch": 0.82, + "grad_norm": 2.829066276550293, + "learning_rate": 4.16620924216726e-06, + "loss": 1.7928, + "step": 2640 + }, + { + "epoch": 0.82, + "grad_norm": 4.525213241577148, + "learning_rate": 4.096946730023662e-06, + "loss": 1.903, + "step": 2645 + }, + { + "epoch": 0.82, + "grad_norm": 3.8306119441986084, + "learning_rate": 4.028213372713904e-06, + "loss": 1.9473, + "step": 2650 + }, + { + "epoch": 0.82, + "grad_norm": 4.448178768157959, + "learning_rate": 3.960010910203319e-06, + "loss": 1.959, + "step": 2655 + }, + { + "epoch": 0.83, + "grad_norm": 3.6487441062927246, + "learning_rate": 3.892341069017808e-06, + "loss": 1.9932, + "step": 2660 + }, + { + "epoch": 0.83, + "grad_norm": 3.487689256668091, + "learning_rate": 3.825205562200101e-06, + "loss": 1.9578, + "step": 2665 + }, + { + "epoch": 0.83, + "grad_norm": 3.0234782695770264, + "learning_rate": 3.75860608926642e-06, + "loss": 1.9083, + "step": 2670 + }, + { + "epoch": 0.83, + "grad_norm": 3.328275203704834, + "learning_rate": 3.69254433616342e-06, + "loss": 2.0128, + "step": 2675 + }, + { + "epoch": 0.83, + "grad_norm": 2.9996497631073, + "learning_rate": 3.627021975225553e-06, + "loss": 1.633, + "step": 2680 + }, + { + "epoch": 0.83, + "grad_norm": 3.9526045322418213, + "learning_rate": 3.562040665132715e-06, + "loss": 1.8948, + "step": 2685 + }, + { + "epoch": 0.83, + "grad_norm": 4.027220249176025, + "learning_rate": 3.4976020508682344e-06, + "loss": 1.8918, + "step": 2690 + }, + { + "epoch": 0.84, + "grad_norm": 4.6429829597473145, + "learning_rate": 3.4337077636772547e-06, + "loss": 1.8865, + "step": 2695 + }, + { + "epoch": 0.84, + "grad_norm": 4.5367865562438965, + "learning_rate": 3.3703594210254487e-06, + "loss": 1.895, + "step": 2700 + }, + { + "epoch": 0.84, + "grad_norm": 3.4687774181365967, + "learning_rate": 3.3075586265580494e-06, + "loss": 1.8908, + "step": 2705 + }, + { + "epoch": 0.84, + "grad_norm": 4.654914855957031, + "learning_rate": 3.24530697005925e-06, + "loss": 1.7785, + "step": 2710 + }, + { + "epoch": 0.84, + "grad_norm": 4.516482353210449, + "learning_rate": 3.183606027411998e-06, + "loss": 1.7936, + "step": 2715 + }, + { + "epoch": 0.84, + "grad_norm": 4.209545135498047, + "learning_rate": 3.1224573605580648e-06, + "loss": 1.9851, + "step": 2720 + }, + { + "epoch": 0.85, + "grad_norm": 4.1666178703308105, + "learning_rate": 3.061862517458519e-06, + "loss": 1.858, + "step": 2725 + }, + { + "epoch": 0.85, + "grad_norm": 5.190033912658691, + "learning_rate": 3.001823032054532e-06, + "loss": 1.9802, + "step": 2730 + }, + { + "epoch": 0.85, + "grad_norm": 4.3511528968811035, + "learning_rate": 2.942340424228554e-06, + "loss": 1.9403, + "step": 2735 + }, + { + "epoch": 0.85, + "grad_norm": 4.630067348480225, + "learning_rate": 2.8834161997658565e-06, + "loss": 1.7726, + "step": 2740 + }, + { + "epoch": 0.85, + "grad_norm": 3.705087184906006, + "learning_rate": 2.825051850316371e-06, + "loss": 1.8286, + "step": 2745 + }, + { + "epoch": 0.85, + "grad_norm": 3.315842628479004, + "learning_rate": 2.767248853356971e-06, + "loss": 1.8397, + "step": 2750 + }, + { + "epoch": 0.85, + "grad_norm": 5.60033655166626, + "learning_rate": 2.710008672154035e-06, + "loss": 1.994, + "step": 2755 + }, + { + "epoch": 0.86, + "grad_norm": 4.465238571166992, + "learning_rate": 2.65333275572644e-06, + "loss": 1.9824, + "step": 2760 + }, + { + "epoch": 0.86, + "grad_norm": 3.8040528297424316, + "learning_rate": 2.5972225388088497e-06, + "loss": 1.8507, + "step": 2765 + }, + { + "epoch": 0.86, + "grad_norm": 3.2600059509277344, + "learning_rate": 2.5416794418154035e-06, + "loss": 1.992, + "step": 2770 + }, + { + "epoch": 0.86, + "grad_norm": 4.9075703620910645, + "learning_rate": 2.486704870803763e-06, + "loss": 1.8189, + "step": 2775 + }, + { + "epoch": 0.86, + "grad_norm": 4.047214508056641, + "learning_rate": 2.432300217439526e-06, + "loss": 1.9156, + "step": 2780 + }, + { + "epoch": 0.86, + "grad_norm": 4.082090854644775, + "learning_rate": 2.3784668589609814e-06, + "loss": 1.8582, + "step": 2785 + }, + { + "epoch": 0.87, + "grad_norm": 3.8980605602264404, + "learning_rate": 2.3252061581442496e-06, + "loss": 1.8418, + "step": 2790 + }, + { + "epoch": 0.87, + "grad_norm": 4.5113372802734375, + "learning_rate": 2.2725194632687795e-06, + "loss": 1.8942, + "step": 2795 + }, + { + "epoch": 0.87, + "grad_norm": 4.78348445892334, + "learning_rate": 2.220408108083244e-06, + "loss": 1.868, + "step": 2800 + }, + { + "epoch": 0.87, + "grad_norm": 3.327033281326294, + "learning_rate": 2.1688734117717295e-06, + "loss": 1.9177, + "step": 2805 + }, + { + "epoch": 0.87, + "grad_norm": 3.6453311443328857, + "learning_rate": 2.117916678920384e-06, + "loss": 1.8282, + "step": 2810 + }, + { + "epoch": 0.87, + "grad_norm": 3.0697853565216064, + "learning_rate": 2.0675391994843695e-06, + "loss": 1.8374, + "step": 2815 + }, + { + "epoch": 0.88, + "grad_norm": 3.6173019409179688, + "learning_rate": 2.017742248755225e-06, + "loss": 1.9797, + "step": 2820 + }, + { + "epoch": 0.88, + "grad_norm": 3.858684539794922, + "learning_rate": 1.9685270873285505e-06, + "loss": 1.9083, + "step": 2825 + }, + { + "epoch": 0.88, + "grad_norm": 3.6615593433380127, + "learning_rate": 1.9198949610721273e-06, + "loss": 2.0119, + "step": 2830 + }, + { + "epoch": 0.88, + "grad_norm": 4.125614643096924, + "learning_rate": 1.8718471010943623e-06, + "loss": 1.8927, + "step": 2835 + }, + { + "epoch": 0.88, + "grad_norm": 3.79669451713562, + "learning_rate": 1.8243847237131406e-06, + "loss": 1.8407, + "step": 2840 + }, + { + "epoch": 0.88, + "grad_norm": 3.5093576908111572, + "learning_rate": 1.7775090304250065e-06, + "loss": 1.9293, + "step": 2845 + }, + { + "epoch": 0.88, + "grad_norm": 3.6266543865203857, + "learning_rate": 1.7312212078747781e-06, + "loss": 1.6496, + "step": 2850 + }, + { + "epoch": 0.89, + "grad_norm": 4.086301326751709, + "learning_rate": 1.6855224278254812e-06, + "loss": 1.9496, + "step": 2855 + }, + { + "epoch": 0.89, + "grad_norm": 3.14742374420166, + "learning_rate": 1.6404138471286966e-06, + "loss": 1.8646, + "step": 2860 + }, + { + "epoch": 0.89, + "grad_norm": 2.868939161300659, + "learning_rate": 1.5958966076952992e-06, + "loss": 1.9593, + "step": 2865 + }, + { + "epoch": 0.89, + "grad_norm": 3.424562931060791, + "learning_rate": 1.5519718364665009e-06, + "loss": 1.7344, + "step": 2870 + }, + { + "epoch": 0.89, + "grad_norm": 3.9741764068603516, + "learning_rate": 1.5086406453853646e-06, + "loss": 1.7876, + "step": 2875 + }, + { + "epoch": 0.89, + "grad_norm": 4.209314346313477, + "learning_rate": 1.4659041313686366e-06, + "loss": 2.1263, + "step": 2880 + }, + { + "epoch": 0.9, + "grad_norm": 4.095180034637451, + "learning_rate": 1.4237633762789942e-06, + "loss": 1.7563, + "step": 2885 + }, + { + "epoch": 0.9, + "grad_norm": 4.4438066482543945, + "learning_rate": 1.3822194468976284e-06, + "loss": 1.8099, + "step": 2890 + }, + { + "epoch": 0.9, + "grad_norm": 4.844168663024902, + "learning_rate": 1.3412733948972688e-06, + "loss": 1.8867, + "step": 2895 + }, + { + "epoch": 0.9, + "grad_norm": 3.2806739807128906, + "learning_rate": 1.300926256815546e-06, + "loss": 1.9385, + "step": 2900 + }, + { + "epoch": 0.9, + "grad_norm": 3.7914087772369385, + "learning_rate": 1.2611790540287633e-06, + "loss": 1.7425, + "step": 2905 + }, + { + "epoch": 0.9, + "grad_norm": 4.138453960418701, + "learning_rate": 1.2220327927260161e-06, + "loss": 1.9172, + "step": 2910 + }, + { + "epoch": 0.9, + "grad_norm": 3.3346848487854004, + "learning_rate": 1.1834884638837613e-06, + "loss": 1.9754, + "step": 2915 + }, + { + "epoch": 0.91, + "grad_norm": 3.6204893589019775, + "learning_rate": 1.1455470432406829e-06, + "loss": 1.7101, + "step": 2920 + }, + { + "epoch": 0.91, + "grad_norm": 4.972575664520264, + "learning_rate": 1.108209491273035e-06, + "loss": 1.8861, + "step": 2925 + }, + { + "epoch": 0.91, + "grad_norm": 3.620809316635132, + "learning_rate": 1.0714767531702973e-06, + "loss": 1.8525, + "step": 2930 + }, + { + "epoch": 0.91, + "grad_norm": 3.33205509185791, + "learning_rate": 1.035349758811263e-06, + "loss": 1.8453, + "step": 2935 + }, + { + "epoch": 0.91, + "grad_norm": 3.7018685340881348, + "learning_rate": 9.998294227404863e-07, + "loss": 2.0806, + "step": 2940 + }, + { + "epoch": 0.91, + "grad_norm": 4.9941864013671875, + "learning_rate": 9.649166441451557e-07, + "loss": 1.94, + "step": 2945 + }, + { + "epoch": 0.92, + "grad_norm": 4.217085361480713, + "learning_rate": 9.306123068323097e-07, + "loss": 1.9168, + "step": 2950 + }, + { + "epoch": 0.92, + "grad_norm": 3.2208547592163086, + "learning_rate": 8.969172792064634e-07, + "loss": 1.8819, + "step": 2955 + }, + { + "epoch": 0.92, + "grad_norm": 3.9018375873565674, + "learning_rate": 8.638324142476284e-07, + "loss": 1.9311, + "step": 2960 + }, + { + "epoch": 0.92, + "grad_norm": 3.776543140411377, + "learning_rate": 8.313585494897385e-07, + "loss": 1.762, + "step": 2965 + }, + { + "epoch": 0.92, + "grad_norm": 6.1161603927612305, + "learning_rate": 7.994965069994142e-07, + "loss": 1.8604, + "step": 2970 + }, + { + "epoch": 0.92, + "grad_norm": 3.6044158935546875, + "learning_rate": 7.682470933551761e-07, + "loss": 1.7736, + "step": 2975 + }, + { + "epoch": 0.92, + "grad_norm": 4.38954496383667, + "learning_rate": 7.376110996270281e-07, + "loss": 1.9429, + "step": 2980 + }, + { + "epoch": 0.93, + "grad_norm": 4.361955165863037, + "learning_rate": 7.075893013564123e-07, + "loss": 1.8157, + "step": 2985 + }, + { + "epoch": 0.93, + "grad_norm": 3.799809217453003, + "learning_rate": 6.781824585365915e-07, + "loss": 1.9094, + "step": 2990 + }, + { + "epoch": 0.93, + "grad_norm": 4.269566059112549, + "learning_rate": 6.493913155934117e-07, + "loss": 1.9207, + "step": 2995 + }, + { + "epoch": 0.93, + "grad_norm": 4.451285362243652, + "learning_rate": 6.212166013664422e-07, + "loss": 1.6652, + "step": 3000 + }, + { + "epoch": 0.93, + "grad_norm": 3.91097092628479, + "learning_rate": 5.93659029090543e-07, + "loss": 1.9185, + "step": 3005 + }, + { + "epoch": 0.93, + "grad_norm": 3.952296257019043, + "learning_rate": 5.667192963778017e-07, + "loss": 1.7982, + "step": 3010 + }, + { + "epoch": 0.94, + "grad_norm": 3.8603575229644775, + "learning_rate": 5.403980851998669e-07, + "loss": 1.8665, + "step": 3015 + }, + { + "epoch": 0.94, + "grad_norm": 4.040564060211182, + "learning_rate": 5.146960618706981e-07, + "loss": 1.8744, + "step": 3020 + }, + { + "epoch": 0.94, + "grad_norm": 3.266788959503174, + "learning_rate": 4.896138770296876e-07, + "loss": 1.8463, + "step": 3025 + }, + { + "epoch": 0.94, + "grad_norm": 3.374309539794922, + "learning_rate": 4.6515216562519615e-07, + "loss": 1.8195, + "step": 3030 + }, + { + "epoch": 0.94, + "grad_norm": 3.7271621227264404, + "learning_rate": 4.41311546898468e-07, + "loss": 1.788, + "step": 3035 + }, + { + "epoch": 0.94, + "grad_norm": 3.1484320163726807, + "learning_rate": 4.180926243679689e-07, + "loss": 1.8316, + "step": 3040 + }, + { + "epoch": 0.94, + "grad_norm": 3.443974256515503, + "learning_rate": 3.954959858141066e-07, + "loss": 1.9071, + "step": 3045 + }, + { + "epoch": 0.95, + "grad_norm": 3.8171606063842773, + "learning_rate": 3.735222032643426e-07, + "loss": 2.1321, + "step": 3050 + }, + { + "epoch": 0.95, + "grad_norm": 3.141526699066162, + "learning_rate": 3.521718329787177e-07, + "loss": 1.8597, + "step": 3055 + }, + { + "epoch": 0.95, + "grad_norm": 3.848994255065918, + "learning_rate": 3.314454154357688e-07, + "loss": 1.9906, + "step": 3060 + }, + { + "epoch": 0.95, + "grad_norm": 3.9238314628601074, + "learning_rate": 3.1134347531884267e-07, + "loss": 1.9433, + "step": 3065 + }, + { + "epoch": 0.95, + "grad_norm": 4.169834136962891, + "learning_rate": 2.9186652150282603e-07, + "loss": 1.7679, + "step": 3070 + }, + { + "epoch": 0.95, + "grad_norm": 6.12147331237793, + "learning_rate": 2.7301504704125016e-07, + "loss": 1.6556, + "step": 3075 + }, + { + "epoch": 0.96, + "grad_norm": 3.5053157806396484, + "learning_rate": 2.547895291538177e-07, + "loss": 1.9142, + "step": 3080 + }, + { + "epoch": 0.96, + "grad_norm": 4.274362087249756, + "learning_rate": 2.371904292143151e-07, + "loss": 1.8754, + "step": 3085 + }, + { + "epoch": 0.96, + "grad_norm": 3.843151569366455, + "learning_rate": 2.2021819273894127e-07, + "loss": 1.7239, + "step": 3090 + }, + { + "epoch": 0.96, + "grad_norm": 3.5693886280059814, + "learning_rate": 2.0387324937502505e-07, + "loss": 1.8063, + "step": 3095 + }, + { + "epoch": 0.96, + "grad_norm": 4.155526161193848, + "learning_rate": 1.8815601289014496e-07, + "loss": 1.8008, + "step": 3100 + }, + { + "epoch": 0.96, + "grad_norm": 4.957355499267578, + "learning_rate": 1.730668811616598e-07, + "loss": 1.9108, + "step": 3105 + }, + { + "epoch": 0.97, + "grad_norm": 5.035935878753662, + "learning_rate": 1.5860623616664184e-07, + "loss": 2.0325, + "step": 3110 + }, + { + "epoch": 0.97, + "grad_norm": 4.176791667938232, + "learning_rate": 1.4477444397219542e-07, + "loss": 1.8947, + "step": 3115 + }, + { + "epoch": 0.97, + "grad_norm": 3.648829460144043, + "learning_rate": 1.3157185472619516e-07, + "loss": 1.8535, + "step": 3120 + }, + { + "epoch": 0.97, + "grad_norm": 3.8320178985595703, + "learning_rate": 1.1899880264842068e-07, + "loss": 1.8678, + "step": 3125 + }, + { + "epoch": 0.97, + "grad_norm": 3.046886682510376, + "learning_rate": 1.0705560602210784e-07, + "loss": 1.8263, + "step": 3130 + }, + { + "epoch": 0.97, + "grad_norm": 5.341119766235352, + "learning_rate": 9.574256718586639e-08, + "loss": 1.9319, + "step": 3135 + }, + { + "epoch": 0.97, + "grad_norm": 3.0084095001220703, + "learning_rate": 8.505997252605258e-08, + "loss": 1.7669, + "step": 3140 + }, + { + "epoch": 0.98, + "grad_norm": 3.5134646892547607, + "learning_rate": 7.500809246950569e-08, + "loss": 1.824, + "step": 3145 + }, + { + "epoch": 0.98, + "grad_norm": 3.576869249343872, + "learning_rate": 6.558718147670339e-08, + "loss": 1.8971, + "step": 3150 + }, + { + "epoch": 0.98, + "grad_norm": 3.1408050060272217, + "learning_rate": 5.679747803531699e-08, + "loss": 1.9365, + "step": 3155 + }, + { + "epoch": 0.98, + "grad_norm": 4.063467979431152, + "learning_rate": 4.863920465418836e-08, + "loss": 1.8272, + "step": 3160 + }, + { + "epoch": 0.98, + "grad_norm": 3.66452693939209, + "learning_rate": 4.111256785767903e-08, + "loss": 1.7885, + "step": 3165 + }, + { + "epoch": 0.98, + "grad_norm": 3.7975409030914307, + "learning_rate": 3.421775818045481e-08, + "loss": 1.879, + "step": 3170 + }, + { + "epoch": 0.99, + "grad_norm": 4.497860908508301, + "learning_rate": 2.7954950162656367e-08, + "loss": 1.828, + "step": 3175 + }, + { + "epoch": 0.99, + "grad_norm": 3.815382242202759, + "learning_rate": 2.2324302345483327e-08, + "loss": 1.9715, + "step": 3180 + }, + { + "epoch": 0.99, + "grad_norm": 5.165794849395752, + "learning_rate": 1.7325957267180782e-08, + "loss": 1.8856, + "step": 3185 + }, + { + "epoch": 0.99, + "grad_norm": 4.661296367645264, + "learning_rate": 1.2960041459425532e-08, + "loss": 1.9542, + "step": 3190 + }, + { + "epoch": 0.99, + "grad_norm": 4.152047157287598, + "learning_rate": 9.226665444136973e-09, + "loss": 1.9453, + "step": 3195 + }, + { + "epoch": 0.99, + "grad_norm": 3.161618232727051, + "learning_rate": 6.1259237306599e-09, + "loss": 1.7805, + "step": 3200 + } + ], + "logging_steps": 5, + "max_steps": 3222, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "total_flos": 4.797270917531566e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}