diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5705 @@ +{ + "best_metric": 2.4689557552337646, + "best_model_checkpoint": "./output/training_results/C017_random_sample_llama3-8b-base_pretrain_20240504_182259/checkpoint-800", + "epoch": 4.0, + "eval_steps": 200, + "global_step": 3944, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0010141987829614604, + "grad_norm": 4.267137538119642, + "learning_rate": 7.5e-07, + "loss": 2.7134, + "step": 1 + }, + { + "epoch": 0.005070993914807302, + "grad_norm": 4.879489677016923, + "learning_rate": 2.25e-06, + "loss": 2.7254, + "step": 5 + }, + { + "epoch": 0.010141987829614604, + "grad_norm": 2.7621009561709564, + "learning_rate": 6e-06, + "loss": 2.707, + "step": 10 + }, + { + "epoch": 0.015212981744421906, + "grad_norm": 2.404100845677231, + "learning_rate": 9e-06, + "loss": 2.6421, + "step": 15 + }, + { + "epoch": 0.02028397565922921, + "grad_norm": 2.4429846538599254, + "learning_rate": 1.275e-05, + "loss": 2.6682, + "step": 20 + }, + { + "epoch": 0.02535496957403651, + "grad_norm": 2.8575493026010625, + "learning_rate": 1.4916395742870319e-05, + "loss": 2.6639, + "step": 25 + }, + { + "epoch": 0.030425963488843813, + "grad_norm": 2.4347171369214538, + "learning_rate": 1.4709241308404976e-05, + "loss": 2.6624, + "step": 30 + }, + { + "epoch": 0.035496957403651115, + "grad_norm": 2.5792627004512942, + "learning_rate": 1.4504714365262738e-05, + "loss": 2.6351, + "step": 35 + }, + { + "epoch": 0.04056795131845842, + "grad_norm": 2.1789139866654366, + "learning_rate": 1.4302784881547452e-05, + "loss": 2.6055, + "step": 40 + }, + { + "epoch": 0.04563894523326572, + "grad_norm": 2.232485210798856, + "learning_rate": 1.4103423130872168e-05, + "loss": 2.5938, + "step": 45 + }, + { + "epoch": 0.05070993914807302, + "grad_norm": 2.2896589926745814, + "learning_rate": 1.390659968963626e-05, + "loss": 2.6334, + "step": 50 + }, + { + "epoch": 0.055780933062880324, + "grad_norm": 2.7780457428021985, + "learning_rate": 1.3712285434323396e-05, + "loss": 2.646, + "step": 55 + }, + { + "epoch": 0.060851926977687626, + "grad_norm": 1.9399001575023072, + "learning_rate": 1.352045153882017e-05, + "loss": 2.6182, + "step": 60 + }, + { + "epoch": 0.06592292089249494, + "grad_norm": 1.9083156579424998, + "learning_rate": 1.3331069471755332e-05, + "loss": 2.6056, + "step": 65 + }, + { + "epoch": 0.07099391480730223, + "grad_norm": 2.2298396560554683, + "learning_rate": 1.314411099385942e-05, + "loss": 2.6043, + "step": 70 + }, + { + "epoch": 0.07606490872210954, + "grad_norm": 1.9661711744318215, + "learning_rate": 1.2959548155344706e-05, + "loss": 2.6321, + "step": 75 + }, + { + "epoch": 0.08113590263691683, + "grad_norm": 2.1260634398939438, + "learning_rate": 1.2777353293305311e-05, + "loss": 2.5744, + "step": 80 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 2.171189842092272, + "learning_rate": 1.2597499029137354e-05, + "loss": 2.6102, + "step": 85 + }, + { + "epoch": 0.09127789046653144, + "grad_norm": 2.118995328928547, + "learning_rate": 1.2419958265979023e-05, + "loss": 2.6056, + "step": 90 + }, + { + "epoch": 0.09634888438133875, + "grad_norm": 2.1743656445294466, + "learning_rate": 1.2244704186170414e-05, + "loss": 2.591, + "step": 95 + }, + { + "epoch": 0.10141987829614604, + "grad_norm": 2.100620832387391, + "learning_rate": 1.2106129489565247e-05, + "loss": 2.6461, + "step": 100 + }, + { + "epoch": 0.10649087221095335, + "grad_norm": 2.02911049207023, + "learning_rate": 1.1934924740853141e-05, + "loss": 2.5878, + "step": 105 + }, + { + "epoch": 0.11156186612576065, + "grad_norm": 2.12870974325018, + "learning_rate": 1.1765933050017452e-05, + "loss": 2.5793, + "step": 110 + }, + { + "epoch": 0.11663286004056796, + "grad_norm": 1.9038783159180614, + "learning_rate": 1.1599128637544344e-05, + "loss": 2.5612, + "step": 115 + }, + { + "epoch": 0.12170385395537525, + "grad_norm": 1.9647399779959451, + "learning_rate": 1.1434485991200533e-05, + "loss": 2.6083, + "step": 120 + }, + { + "epoch": 0.12677484787018256, + "grad_norm": 1.88937427094592, + "learning_rate": 1.1271979863605386e-05, + "loss": 2.5561, + "step": 125 + }, + { + "epoch": 0.13184584178498987, + "grad_norm": 1.8208051471693176, + "learning_rate": 1.111158526982193e-05, + "loss": 2.5884, + "step": 130 + }, + { + "epoch": 0.13691683569979715, + "grad_norm": 1.771422341312915, + "learning_rate": 1.0953277484966689e-05, + "loss": 2.5509, + "step": 135 + }, + { + "epoch": 0.14198782961460446, + "grad_norm": 1.8296701053391813, + "learning_rate": 1.0797032041838185e-05, + "loss": 2.5784, + "step": 140 + }, + { + "epoch": 0.14705882352941177, + "grad_norm": 1.8139046565289612, + "learning_rate": 1.0642824728564022e-05, + "loss": 2.5624, + "step": 145 + }, + { + "epoch": 0.15212981744421908, + "grad_norm": 1.9862915107502803, + "learning_rate": 1.0490631586266381e-05, + "loss": 2.6007, + "step": 150 + }, + { + "epoch": 0.15720081135902636, + "grad_norm": 1.8392246134083736, + "learning_rate": 1.0340428906745863e-05, + "loss": 2.5775, + "step": 155 + }, + { + "epoch": 0.16227180527383367, + "grad_norm": 1.9250085841598776, + "learning_rate": 1.0192193230183505e-05, + "loss": 2.6045, + "step": 160 + }, + { + "epoch": 0.16734279918864098, + "grad_norm": 2.1119936162911825, + "learning_rate": 1.0045901342860905e-05, + "loss": 2.5838, + "step": 165 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 1.9416866546338962, + "learning_rate": 9.901530274898272e-06, + "loss": 2.5643, + "step": 170 + }, + { + "epoch": 0.17748478701825557, + "grad_norm": 1.871570899679003, + "learning_rate": 9.75905729801036e-06, + "loss": 2.5549, + "step": 175 + }, + { + "epoch": 0.18255578093306288, + "grad_norm": 2.0672616615182897, + "learning_rate": 9.61845992328009e-06, + "loss": 2.561, + "step": 180 + }, + { + "epoch": 0.1876267748478702, + "grad_norm": 1.8373271363293353, + "learning_rate": 9.479715898949807e-06, + "loss": 2.5728, + "step": 185 + }, + { + "epoch": 0.1926977687626775, + "grad_norm": 1.9497106449021773, + "learning_rate": 9.342803208230014e-06, + "loss": 2.5535, + "step": 190 + }, + { + "epoch": 0.19776876267748478, + "grad_norm": 1.913646357656738, + "learning_rate": 9.207700067125492e-06, + "loss": 2.5411, + "step": 195 + }, + { + "epoch": 0.2028397565922921, + "grad_norm": 1.7027113982701332, + "learning_rate": 9.074384922278684e-06, + "loss": 2.5442, + "step": 200 + }, + { + "epoch": 0.2028397565922921, + "eval_loss": 2.55521821975708, + "eval_runtime": 81.0607, + "eval_samples_per_second": 86.429, + "eval_steps_per_second": 0.679, + "step": 200 + }, + { + "epoch": 0.2079107505070994, + "grad_norm": 1.753576639879344, + "learning_rate": 8.942836448830213e-06, + "loss": 2.5264, + "step": 205 + }, + { + "epoch": 0.2129817444219067, + "grad_norm": 1.7785092188900598, + "learning_rate": 8.813033548296443e-06, + "loss": 2.5645, + "step": 210 + }, + { + "epoch": 0.21805273833671399, + "grad_norm": 1.7915296631060966, + "learning_rate": 8.684955346463971e-06, + "loss": 2.555, + "step": 215 + }, + { + "epoch": 0.2231237322515213, + "grad_norm": 1.7452346531223148, + "learning_rate": 8.558581191300906e-06, + "loss": 2.6118, + "step": 220 + }, + { + "epoch": 0.2281947261663286, + "grad_norm": 2.339774136223256, + "learning_rate": 8.433890650884857e-06, + "loss": 2.5284, + "step": 225 + }, + { + "epoch": 0.2332657200811359, + "grad_norm": 1.7961229516339332, + "learning_rate": 8.310863511347508e-06, + "loss": 2.558, + "step": 230 + }, + { + "epoch": 0.2383367139959432, + "grad_norm": 2.000305491613022, + "learning_rate": 8.189479774835651e-06, + "loss": 2.5312, + "step": 235 + }, + { + "epoch": 0.2434077079107505, + "grad_norm": 1.9162907270104979, + "learning_rate": 8.069719657488614e-06, + "loss": 2.4983, + "step": 240 + }, + { + "epoch": 0.2484787018255578, + "grad_norm": 1.9447544732938296, + "learning_rate": 7.951563587431902e-06, + "loss": 2.5462, + "step": 245 + }, + { + "epoch": 0.2535496957403651, + "grad_norm": 1.8244106804572084, + "learning_rate": 7.834992202787018e-06, + "loss": 2.5354, + "step": 250 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 1.714609238517639, + "learning_rate": 7.719986349697309e-06, + "loss": 2.5386, + "step": 255 + }, + { + "epoch": 0.26369168356997974, + "grad_norm": 1.795436681758725, + "learning_rate": 7.606527080369728e-06, + "loss": 2.5388, + "step": 260 + }, + { + "epoch": 0.268762677484787, + "grad_norm": 1.7081706265027667, + "learning_rate": 7.494595651132443e-06, + "loss": 2.568, + "step": 265 + }, + { + "epoch": 0.2738336713995943, + "grad_norm": 1.6958291617768828, + "learning_rate": 7.384173520508138e-06, + "loss": 2.5489, + "step": 270 + }, + { + "epoch": 0.2789046653144016, + "grad_norm": 1.6677502189962874, + "learning_rate": 7.275242347302937e-06, + "loss": 2.5666, + "step": 275 + }, + { + "epoch": 0.2839756592292089, + "grad_norm": 1.6916519769077745, + "learning_rate": 7.167783988710829e-06, + "loss": 2.5161, + "step": 280 + }, + { + "epoch": 0.28904665314401623, + "grad_norm": 1.9276199368209956, + "learning_rate": 7.061780498433485e-06, + "loss": 2.5461, + "step": 285 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 1.721858200785338, + "learning_rate": 6.957214124815376e-06, + "loss": 2.56, + "step": 290 + }, + { + "epoch": 0.29918864097363085, + "grad_norm": 1.7023218265873687, + "learning_rate": 6.854067308994081e-06, + "loss": 2.5252, + "step": 295 + }, + { + "epoch": 0.30425963488843816, + "grad_norm": 1.7702063060142263, + "learning_rate": 6.752322683065677e-06, + "loss": 2.5365, + "step": 300 + }, + { + "epoch": 0.3093306288032454, + "grad_norm": 1.807175965887596, + "learning_rate": 6.651963068265119e-06, + "loss": 2.5351, + "step": 305 + }, + { + "epoch": 0.3144016227180527, + "grad_norm": 1.7687398862728192, + "learning_rate": 6.5529714731614995e-06, + "loss": 2.5184, + "step": 310 + }, + { + "epoch": 0.31947261663286003, + "grad_norm": 1.808664958617461, + "learning_rate": 6.455331091868087e-06, + "loss": 2.5062, + "step": 315 + }, + { + "epoch": 0.32454361054766734, + "grad_norm": 1.9021979000655393, + "learning_rate": 6.359025302267049e-06, + "loss": 2.5225, + "step": 320 + }, + { + "epoch": 0.32961460446247465, + "grad_norm": 1.704473391712384, + "learning_rate": 6.264037664248752e-06, + "loss": 2.5233, + "step": 325 + }, + { + "epoch": 0.33468559837728196, + "grad_norm": 1.751379362669565, + "learning_rate": 6.17035191796554e-06, + "loss": 2.4854, + "step": 330 + }, + { + "epoch": 0.33975659229208927, + "grad_norm": 1.6980009285341724, + "learning_rate": 6.077951982099886e-06, + "loss": 2.5008, + "step": 335 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 1.6987141788770321, + "learning_rate": 5.986821952146847e-06, + "loss": 2.5438, + "step": 340 + }, + { + "epoch": 0.34989858012170383, + "grad_norm": 1.6781775461943316, + "learning_rate": 5.89694609871067e-06, + "loss": 2.5417, + "step": 345 + }, + { + "epoch": 0.35496957403651114, + "grad_norm": 1.7326892052245193, + "learning_rate": 5.808308865815513e-06, + "loss": 2.5185, + "step": 350 + }, + { + "epoch": 0.36004056795131845, + "grad_norm": 1.743645811121294, + "learning_rate": 5.720894869230136e-06, + "loss": 2.5094, + "step": 355 + }, + { + "epoch": 0.36511156186612576, + "grad_norm": 1.7256678519147217, + "learning_rate": 5.634688894806482e-06, + "loss": 2.5316, + "step": 360 + }, + { + "epoch": 0.37018255578093306, + "grad_norm": 1.6209115792712339, + "learning_rate": 5.549675896832072e-06, + "loss": 2.5164, + "step": 365 + }, + { + "epoch": 0.3752535496957404, + "grad_norm": 1.6497735310259896, + "learning_rate": 5.465840996396076e-06, + "loss": 2.5363, + "step": 370 + }, + { + "epoch": 0.3803245436105477, + "grad_norm": 1.665747208014539, + "learning_rate": 5.383169479769005e-06, + "loss": 2.5015, + "step": 375 + }, + { + "epoch": 0.385395537525355, + "grad_norm": 1.8360023746562857, + "learning_rate": 5.301646796795905e-06, + "loss": 2.4465, + "step": 380 + }, + { + "epoch": 0.39046653144016225, + "grad_norm": 1.721788501212322, + "learning_rate": 5.221258559302969e-06, + "loss": 2.5104, + "step": 385 + }, + { + "epoch": 0.39553752535496955, + "grad_norm": 1.7896539066797603, + "learning_rate": 5.141990539517474e-06, + "loss": 2.5406, + "step": 390 + }, + { + "epoch": 0.40060851926977686, + "grad_norm": 1.7026594592165973, + "learning_rate": 5.0638286685009445e-06, + "loss": 2.5403, + "step": 395 + }, + { + "epoch": 0.4056795131845842, + "grad_norm": 1.7666645373608338, + "learning_rate": 4.986759034595453e-06, + "loss": 2.5376, + "step": 400 + }, + { + "epoch": 0.4056795131845842, + "eval_loss": 2.509550094604492, + "eval_runtime": 81.0126, + "eval_samples_per_second": 86.48, + "eval_steps_per_second": 0.679, + "step": 400 + }, + { + "epoch": 0.4107505070993915, + "grad_norm": 1.702454460655481, + "learning_rate": 4.910767881882966e-06, + "loss": 2.5017, + "step": 405 + }, + { + "epoch": 0.4158215010141988, + "grad_norm": 1.6625424708509573, + "learning_rate": 4.83584160865765e-06, + "loss": 2.5271, + "step": 410 + }, + { + "epoch": 0.4208924949290061, + "grad_norm": 1.6622717975288752, + "learning_rate": 4.761966765911026e-06, + "loss": 2.5238, + "step": 415 + }, + { + "epoch": 0.4259634888438134, + "grad_norm": 1.6256800857720881, + "learning_rate": 4.689130055829907e-06, + "loss": 2.5191, + "step": 420 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 1.7950911413498376, + "learning_rate": 4.617318330307044e-06, + "loss": 2.4909, + "step": 425 + }, + { + "epoch": 0.43610547667342797, + "grad_norm": 1.5866160053351177, + "learning_rate": 4.5465185894642715e-06, + "loss": 2.5128, + "step": 430 + }, + { + "epoch": 0.4411764705882353, + "grad_norm": 1.6754882575554404, + "learning_rate": 4.476717980188313e-06, + "loss": 2.5028, + "step": 435 + }, + { + "epoch": 0.4462474645030426, + "grad_norm": 1.6606915353792953, + "learning_rate": 4.407903794678819e-06, + "loss": 2.5207, + "step": 440 + }, + { + "epoch": 0.4513184584178499, + "grad_norm": 1.8160247477825882, + "learning_rate": 4.340063469008923e-06, + "loss": 2.5017, + "step": 445 + }, + { + "epoch": 0.4563894523326572, + "grad_norm": 1.7663094048322825, + "learning_rate": 4.2731845816978475e-06, + "loss": 2.5021, + "step": 450 + }, + { + "epoch": 0.4614604462474645, + "grad_norm": 1.7799998175038592, + "learning_rate": 4.207254852295854e-06, + "loss": 2.4953, + "step": 455 + }, + { + "epoch": 0.4665314401622718, + "grad_norm": 1.6715645487953392, + "learning_rate": 4.142262139981073e-06, + "loss": 2.4435, + "step": 460 + }, + { + "epoch": 0.4716024340770791, + "grad_norm": 1.7256265015398793, + "learning_rate": 4.078194442168494e-06, + "loss": 2.5146, + "step": 465 + }, + { + "epoch": 0.4766734279918864, + "grad_norm": 1.6662015811964308, + "learning_rate": 4.015039893130705e-06, + "loss": 2.5187, + "step": 470 + }, + { + "epoch": 0.4817444219066937, + "grad_norm": 1.7649431318197315, + "learning_rate": 3.952786762630535e-06, + "loss": 2.5223, + "step": 475 + }, + { + "epoch": 0.486815415821501, + "grad_norm": 1.679617464261057, + "learning_rate": 3.891423454565385e-06, + "loss": 2.4394, + "step": 480 + }, + { + "epoch": 0.4918864097363083, + "grad_norm": 1.6233085596184735, + "learning_rate": 3.830938505623211e-06, + "loss": 2.512, + "step": 485 + }, + { + "epoch": 0.4969574036511156, + "grad_norm": 1.7195900327055993, + "learning_rate": 3.7713205839500707e-06, + "loss": 2.4649, + "step": 490 + }, + { + "epoch": 0.5020283975659229, + "grad_norm": 1.7034828407083669, + "learning_rate": 3.7125584878291374e-06, + "loss": 2.497, + "step": 495 + }, + { + "epoch": 0.5070993914807302, + "grad_norm": 1.7618287486879018, + "learning_rate": 3.6546411443711164e-06, + "loss": 2.5353, + "step": 500 + }, + { + "epoch": 0.5121703853955375, + "grad_norm": 1.6191614066287776, + "learning_rate": 3.597557608215969e-06, + "loss": 2.5052, + "step": 505 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 1.6450813134062763, + "learning_rate": 3.54129706024587e-06, + "loss": 2.5106, + "step": 510 + }, + { + "epoch": 0.5223123732251521, + "grad_norm": 1.7767916102532666, + "learning_rate": 3.4858488063093135e-06, + "loss": 2.4651, + "step": 515 + }, + { + "epoch": 0.5273833671399595, + "grad_norm": 1.6720237829560067, + "learning_rate": 3.431202275956285e-06, + "loss": 2.4908, + "step": 520 + }, + { + "epoch": 0.5324543610547667, + "grad_norm": 1.6484154917054958, + "learning_rate": 3.3773470211844283e-06, + "loss": 2.4856, + "step": 525 + }, + { + "epoch": 0.537525354969574, + "grad_norm": 1.651838194240797, + "learning_rate": 3.324272715196116e-06, + "loss": 2.4675, + "step": 530 + }, + { + "epoch": 0.5425963488843814, + "grad_norm": 1.6241151521510617, + "learning_rate": 3.2719691511663524e-06, + "loss": 2.4896, + "step": 535 + }, + { + "epoch": 0.5476673427991886, + "grad_norm": 1.6894175077795812, + "learning_rate": 3.2204262410214273e-06, + "loss": 2.4556, + "step": 540 + }, + { + "epoch": 0.552738336713996, + "grad_norm": 1.6686417855987385, + "learning_rate": 3.1696340142282437e-06, + "loss": 2.5062, + "step": 545 + }, + { + "epoch": 0.5578093306288032, + "grad_norm": 1.7200856267540612, + "learning_rate": 3.119582616594238e-06, + "loss": 2.4878, + "step": 550 + }, + { + "epoch": 0.5628803245436106, + "grad_norm": 1.672252633477676, + "learning_rate": 3.0702623090778174e-06, + "loss": 2.5077, + "step": 555 + }, + { + "epoch": 0.5679513184584178, + "grad_norm": 1.7008466667698958, + "learning_rate": 3.021663466609246e-06, + "loss": 2.4837, + "step": 560 + }, + { + "epoch": 0.5730223123732252, + "grad_norm": 1.6805676799462346, + "learning_rate": 2.973776576921883e-06, + "loss": 2.5062, + "step": 565 + }, + { + "epoch": 0.5780933062880325, + "grad_norm": 1.6136103005628197, + "learning_rate": 2.9265922393937183e-06, + "loss": 2.5035, + "step": 570 + }, + { + "epoch": 0.5831643002028397, + "grad_norm": 1.6014078073339035, + "learning_rate": 2.880101163899116e-06, + "loss": 2.5101, + "step": 575 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 1.7220406203120746, + "learning_rate": 2.8342941696706994e-06, + "loss": 2.5217, + "step": 580 + }, + { + "epoch": 0.5933062880324543, + "grad_norm": 1.6605964063316545, + "learning_rate": 2.789162184171294e-06, + "loss": 2.4756, + "step": 585 + }, + { + "epoch": 0.5983772819472617, + "grad_norm": 1.6566249973518374, + "learning_rate": 2.7446962419758632e-06, + "loss": 2.4739, + "step": 590 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 1.6340883136536262, + "learning_rate": 2.700887483663357e-06, + "loss": 2.4869, + "step": 595 + }, + { + "epoch": 0.6085192697768763, + "grad_norm": 1.6233109361058542, + "learning_rate": 2.657727154718401e-06, + "loss": 2.4487, + "step": 600 + }, + { + "epoch": 0.6085192697768763, + "eval_loss": 2.4831416606903076, + "eval_runtime": 80.984, + "eval_samples_per_second": 86.511, + "eval_steps_per_second": 0.679, + "step": 600 + }, + { + "epoch": 0.6135902636916836, + "grad_norm": 1.616769928055098, + "learning_rate": 2.615206604442756e-06, + "loss": 2.4638, + "step": 605 + }, + { + "epoch": 0.6186612576064908, + "grad_norm": 1.6396235170920117, + "learning_rate": 2.5733172848764733e-06, + "loss": 2.4891, + "step": 610 + }, + { + "epoch": 0.6237322515212982, + "grad_norm": 1.5936144163067276, + "learning_rate": 2.5320507497286705e-06, + "loss": 2.4902, + "step": 615 + }, + { + "epoch": 0.6288032454361054, + "grad_norm": 1.6679977682798468, + "learning_rate": 2.491398653317866e-06, + "loss": 2.4695, + "step": 620 + }, + { + "epoch": 0.6338742393509128, + "grad_norm": 1.7008178983911084, + "learning_rate": 2.4513527495217875e-06, + "loss": 2.4626, + "step": 625 + }, + { + "epoch": 0.6389452332657201, + "grad_norm": 1.610985443276998, + "learning_rate": 2.4119048907365937e-06, + "loss": 2.4934, + "step": 630 + }, + { + "epoch": 0.6440162271805274, + "grad_norm": 1.6323121910464156, + "learning_rate": 2.3730470268454385e-06, + "loss": 2.4819, + "step": 635 + }, + { + "epoch": 0.6490872210953347, + "grad_norm": 1.6525382291119861, + "learning_rate": 2.3347712041962997e-06, + "loss": 2.5046, + "step": 640 + }, + { + "epoch": 0.654158215010142, + "grad_norm": 1.6380351817927594, + "learning_rate": 2.297069564589013e-06, + "loss": 2.4864, + "step": 645 + }, + { + "epoch": 0.6592292089249493, + "grad_norm": 1.6579813340009797, + "learning_rate": 2.259934344271433e-06, + "loss": 2.4715, + "step": 650 + }, + { + "epoch": 0.6643002028397565, + "grad_norm": 1.7919239246160015, + "learning_rate": 2.22335787294466e-06, + "loss": 2.4972, + "step": 655 + }, + { + "epoch": 0.6693711967545639, + "grad_norm": 1.586961409961355, + "learning_rate": 2.18733257277726e-06, + "loss": 2.4787, + "step": 660 + }, + { + "epoch": 0.6744421906693712, + "grad_norm": 1.684301176230389, + "learning_rate": 2.1518509574284106e-06, + "loss": 2.4158, + "step": 665 + }, + { + "epoch": 0.6795131845841785, + "grad_norm": 1.6178388175493554, + "learning_rate": 2.123852145211829e-06, + "loss": 2.5152, + "step": 670 + }, + { + "epoch": 0.6845841784989858, + "grad_norm": 1.704137336957441, + "learning_rate": 2.089330585293108e-06, + "loss": 2.4807, + "step": 675 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 1.653288753563856, + "learning_rate": 2.055332226962747e-06, + "loss": 2.4781, + "step": 680 + }, + { + "epoch": 0.6947261663286004, + "grad_norm": 1.6910190620923418, + "learning_rate": 2.0218499227907136e-06, + "loss": 2.5114, + "step": 685 + }, + { + "epoch": 0.6997971602434077, + "grad_norm": 1.6297896630103186, + "learning_rate": 1.988876612270826e-06, + "loss": 2.4963, + "step": 690 + }, + { + "epoch": 0.704868154158215, + "grad_norm": 1.6254042637268307, + "learning_rate": 1.9564053208943578e-06, + "loss": 2.4651, + "step": 695 + }, + { + "epoch": 0.7099391480730223, + "grad_norm": 1.849820644961665, + "learning_rate": 1.924429159232111e-06, + "loss": 2.4625, + "step": 700 + }, + { + "epoch": 0.7150101419878296, + "grad_norm": 1.6947938784926828, + "learning_rate": 1.892941322024907e-06, + "loss": 2.4683, + "step": 705 + }, + { + "epoch": 0.7200811359026369, + "grad_norm": 1.6500218076608433, + "learning_rate": 1.861935087282421e-06, + "loss": 2.474, + "step": 710 + }, + { + "epoch": 0.7251521298174443, + "grad_norm": 1.5695461599237197, + "learning_rate": 1.8314038153902991e-06, + "loss": 2.4626, + "step": 715 + }, + { + "epoch": 0.7302231237322515, + "grad_norm": 1.661274439764298, + "learning_rate": 1.8013409482254947e-06, + "loss": 2.4901, + "step": 720 + }, + { + "epoch": 0.7352941176470589, + "grad_norm": 1.5971717624468098, + "learning_rate": 1.7717400082797614e-06, + "loss": 2.498, + "step": 725 + }, + { + "epoch": 0.7403651115618661, + "grad_norm": 1.6006841184664817, + "learning_rate": 1.7425945977912387e-06, + "loss": 2.5096, + "step": 730 + }, + { + "epoch": 0.7454361054766734, + "grad_norm": 1.8078007149616142, + "learning_rate": 1.7138983978840686e-06, + "loss": 2.4733, + "step": 735 + }, + { + "epoch": 0.7505070993914807, + "grad_norm": 1.6080637102108633, + "learning_rate": 1.685645167715982e-06, + "loss": 2.4645, + "step": 740 + }, + { + "epoch": 0.755578093306288, + "grad_norm": 1.6034092883417612, + "learning_rate": 1.6578287436337897e-06, + "loss": 2.4874, + "step": 745 + }, + { + "epoch": 0.7606490872210954, + "grad_norm": 1.6562691168973722, + "learning_rate": 1.6304430383367233e-06, + "loss": 2.5147, + "step": 750 + }, + { + "epoch": 0.7657200811359026, + "grad_norm": 1.631836734297837, + "learning_rate": 1.6034820400475576e-06, + "loss": 2.449, + "step": 755 + }, + { + "epoch": 0.77079107505071, + "grad_norm": 2.633902381426751, + "learning_rate": 1.5769398116914607e-06, + "loss": 2.4502, + "step": 760 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 1.6338196504524252, + "learning_rate": 1.550810490082507e-06, + "loss": 2.4375, + "step": 765 + }, + { + "epoch": 0.7809330628803245, + "grad_norm": 1.6881605246261733, + "learning_rate": 1.5250882851177956e-06, + "loss": 2.4623, + "step": 770 + }, + { + "epoch": 0.7860040567951319, + "grad_norm": 1.7430128340035491, + "learning_rate": 1.4997674789791142e-06, + "loss": 2.4592, + "step": 775 + }, + { + "epoch": 0.7910750507099391, + "grad_norm": 1.6974037503954427, + "learning_rate": 1.4748424253420905e-06, + "loss": 2.5001, + "step": 780 + }, + { + "epoch": 0.7961460446247465, + "grad_norm": 1.6057434981804433, + "learning_rate": 1.4503075485927704e-06, + "loss": 2.4603, + "step": 785 + }, + { + "epoch": 0.8012170385395537, + "grad_norm": 1.5564356238507298, + "learning_rate": 1.4261573430515669e-06, + "loss": 2.4357, + "step": 790 + }, + { + "epoch": 0.8062880324543611, + "grad_norm": 1.7042405076576008, + "learning_rate": 1.4023863722045201e-06, + "loss": 2.4747, + "step": 795 + }, + { + "epoch": 0.8113590263691683, + "grad_norm": 1.5640034942530554, + "learning_rate": 1.3789892679418134e-06, + "loss": 2.5324, + "step": 800 + }, + { + "epoch": 0.8113590263691683, + "eval_loss": 2.4689557552337646, + "eval_runtime": 81.0232, + "eval_samples_per_second": 86.469, + "eval_steps_per_second": 0.679, + "step": 800 + }, + { + "epoch": 0.8164300202839757, + "grad_norm": 1.7227060519078905, + "learning_rate": 1.3559607298034838e-06, + "loss": 2.4806, + "step": 805 + }, + { + "epoch": 0.821501014198783, + "grad_norm": 1.5855673393298833, + "learning_rate": 1.333295524232277e-06, + "loss": 2.4642, + "step": 810 + }, + { + "epoch": 0.8265720081135902, + "grad_norm": 1.8155636812941185, + "learning_rate": 1.310988483833583e-06, + "loss": 2.4746, + "step": 815 + }, + { + "epoch": 0.8316430020283976, + "grad_norm": 1.6824796691575312, + "learning_rate": 1.289034506642401e-06, + "loss": 2.5168, + "step": 820 + }, + { + "epoch": 0.8367139959432048, + "grad_norm": 1.6084122349859742, + "learning_rate": 1.2674285553972776e-06, + "loss": 2.4112, + "step": 825 + }, + { + "epoch": 0.8417849898580122, + "grad_norm": 1.6807591569306923, + "learning_rate": 1.2461656568211607e-06, + "loss": 2.4555, + "step": 830 + }, + { + "epoch": 0.8468559837728195, + "grad_norm": 1.64520194930749, + "learning_rate": 1.2252409009091154e-06, + "loss": 2.5222, + "step": 835 + }, + { + "epoch": 0.8519269776876268, + "grad_norm": 1.642941398726877, + "learning_rate": 1.2046494402228485e-06, + "loss": 2.4607, + "step": 840 + }, + { + "epoch": 0.8569979716024341, + "grad_norm": 1.6323907187692908, + "learning_rate": 1.1843864891919843e-06, + "loss": 2.4724, + "step": 845 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.6489728444762863, + "learning_rate": 1.1644473234220412e-06, + "loss": 2.483, + "step": 850 + }, + { + "epoch": 0.8671399594320487, + "grad_norm": 1.5735584816022383, + "learning_rate": 1.1448272790090529e-06, + "loss": 2.4423, + "step": 855 + }, + { + "epoch": 0.8722109533468559, + "grad_norm": 1.6290164674794758, + "learning_rate": 1.1255217518607806e-06, + "loss": 2.4745, + "step": 860 + }, + { + "epoch": 0.8772819472616633, + "grad_norm": 1.9631129344699565, + "learning_rate": 1.1065261970244678e-06, + "loss": 2.4595, + "step": 865 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 1.8876833985138877, + "learning_rate": 1.0878361280210782e-06, + "loss": 2.4761, + "step": 870 + }, + { + "epoch": 0.8874239350912779, + "grad_norm": 1.7449962901668679, + "learning_rate": 1.0694471161859696e-06, + "loss": 2.4726, + "step": 875 + }, + { + "epoch": 0.8924949290060852, + "grad_norm": 1.6608657901001447, + "learning_rate": 1.051354790015952e-06, + "loss": 2.4817, + "step": 880 + }, + { + "epoch": 0.8975659229208925, + "grad_norm": 1.6370419920913908, + "learning_rate": 1.0335548345226733e-06, + "loss": 2.4861, + "step": 885 + }, + { + "epoch": 0.9026369168356998, + "grad_norm": 1.6266725844295284, + "learning_rate": 1.016042990592287e-06, + "loss": 2.4437, + "step": 890 + }, + { + "epoch": 0.907707910750507, + "grad_norm": 1.5909779389607082, + "learning_rate": 9.988150543513476e-07, + "loss": 2.4605, + "step": 895 + }, + { + "epoch": 0.9127789046653144, + "grad_norm": 1.5796802186393568, + "learning_rate": 9.818668765388872e-07, + "loss": 2.4863, + "step": 900 + }, + { + "epoch": 0.9178498985801217, + "grad_norm": 1.5779871460684796, + "learning_rate": 9.651943618846152e-07, + "loss": 2.4514, + "step": 905 + }, + { + "epoch": 0.922920892494929, + "grad_norm": 1.605102383763968, + "learning_rate": 9.487934684931995e-07, + "loss": 2.474, + "step": 910 + }, + { + "epoch": 0.9279918864097363, + "grad_norm": 1.6069103870683263, + "learning_rate": 9.326602072345758e-07, + "loss": 2.4828, + "step": 915 + }, + { + "epoch": 0.9330628803245437, + "grad_norm": 1.6236038441464034, + "learning_rate": 9.167906411402357e-07, + "loss": 2.4501, + "step": 920 + }, + { + "epoch": 0.9381338742393509, + "grad_norm": 1.6140284100171378, + "learning_rate": 9.011808848054445e-07, + "loss": 2.4441, + "step": 925 + }, + { + "epoch": 0.9432048681541582, + "grad_norm": 1.9823289784825078, + "learning_rate": 8.858271037973411e-07, + "loss": 2.4834, + "step": 930 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 1.7094985628575186, + "learning_rate": 8.707255140688767e-07, + "loss": 2.4428, + "step": 935 + }, + { + "epoch": 0.9533468559837728, + "grad_norm": 1.5851821971427773, + "learning_rate": 8.558723813785198e-07, + "loss": 2.4459, + "step": 940 + }, + { + "epoch": 0.9584178498985801, + "grad_norm": 1.8489283203955083, + "learning_rate": 8.412640207157327e-07, + "loss": 2.4671, + "step": 945 + }, + { + "epoch": 0.9634888438133874, + "grad_norm": 1.565327828926634, + "learning_rate": 8.268967957320976e-07, + "loss": 2.4762, + "step": 950 + }, + { + "epoch": 0.9685598377281948, + "grad_norm": 1.5753092524917698, + "learning_rate": 8.127671181781262e-07, + "loss": 2.487, + "step": 955 + }, + { + "epoch": 0.973630831643002, + "grad_norm": 1.5627741498336793, + "learning_rate": 7.988714473456279e-07, + "loss": 2.4899, + "step": 960 + }, + { + "epoch": 0.9787018255578094, + "grad_norm": 1.7322054425536324, + "learning_rate": 7.852062895156654e-07, + "loss": 2.4328, + "step": 965 + }, + { + "epoch": 0.9837728194726166, + "grad_norm": 1.5912533141539165, + "learning_rate": 7.717681974119764e-07, + "loss": 2.4887, + "step": 970 + }, + { + "epoch": 0.9888438133874239, + "grad_norm": 1.7127177872013957, + "learning_rate": 7.585537696598922e-07, + "loss": 2.4414, + "step": 975 + }, + { + "epoch": 0.9939148073022313, + "grad_norm": 1.6239111267541033, + "learning_rate": 7.455596502506312e-07, + "loss": 2.4962, + "step": 980 + }, + { + "epoch": 0.9989858012170385, + "grad_norm": 1.6117561424503084, + "learning_rate": 7.327825280109957e-07, + "loss": 2.4738, + "step": 985 + }, + { + "epoch": 1.0040567951318458, + "grad_norm": 1.9019039739296713, + "learning_rate": 7.20219136078357e-07, + "loss": 2.27, + "step": 990 + }, + { + "epoch": 1.0091277890466532, + "grad_norm": 1.7075178009820928, + "learning_rate": 7.078662513809528e-07, + "loss": 2.3072, + "step": 995 + }, + { + "epoch": 1.0141987829614605, + "grad_norm": 1.7844249258995124, + "learning_rate": 6.957206941233838e-07, + "loss": 2.265, + "step": 1000 + }, + { + "epoch": 1.0141987829614605, + "eval_loss": 2.473280668258667, + "eval_runtime": 81.0085, + "eval_samples_per_second": 86.485, + "eval_steps_per_second": 0.679, + "step": 1000 + }, + { + "epoch": 1.0192697768762677, + "grad_norm": 1.833316481131949, + "learning_rate": 6.837793272773345e-07, + "loss": 2.3069, + "step": 1005 + }, + { + "epoch": 1.024340770791075, + "grad_norm": 1.7388775994426842, + "learning_rate": 6.720390560774066e-07, + "loss": 2.266, + "step": 1010 + }, + { + "epoch": 1.0294117647058822, + "grad_norm": 1.6270190329782648, + "learning_rate": 6.604968275220875e-07, + "loss": 2.2664, + "step": 1015 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 1.7956207149367391, + "learning_rate": 6.491496298797458e-07, + "loss": 2.2394, + "step": 1020 + }, + { + "epoch": 1.039553752535497, + "grad_norm": 1.6994135825189252, + "learning_rate": 6.379944921996764e-07, + "loss": 2.2727, + "step": 1025 + }, + { + "epoch": 1.0446247464503042, + "grad_norm": 1.677197538792328, + "learning_rate": 6.270284838280882e-07, + "loss": 2.2072, + "step": 1030 + }, + { + "epoch": 1.0496957403651115, + "grad_norm": 1.719327046611783, + "learning_rate": 6.162487139290532e-07, + "loss": 2.3021, + "step": 1035 + }, + { + "epoch": 1.054766734279919, + "grad_norm": 1.7292340128968464, + "learning_rate": 6.056523310103172e-07, + "loss": 2.2737, + "step": 1040 + }, + { + "epoch": 1.0598377281947262, + "grad_norm": 1.7428974260955565, + "learning_rate": 5.95236522453988e-07, + "loss": 2.2556, + "step": 1045 + }, + { + "epoch": 1.0649087221095335, + "grad_norm": 1.694959472171586, + "learning_rate": 5.849985140519998e-07, + "loss": 2.2992, + "step": 1050 + }, + { + "epoch": 1.0699797160243407, + "grad_norm": 1.7439692178448947, + "learning_rate": 5.749355695463754e-07, + "loss": 2.2557, + "step": 1055 + }, + { + "epoch": 1.075050709939148, + "grad_norm": 1.7558636029085997, + "learning_rate": 5.650449901741813e-07, + "loss": 2.2474, + "step": 1060 + }, + { + "epoch": 1.0801217038539555, + "grad_norm": 1.785367595963534, + "learning_rate": 5.553241142171985e-07, + "loss": 2.267, + "step": 1065 + }, + { + "epoch": 1.0851926977687627, + "grad_norm": 1.7537584511707027, + "learning_rate": 5.45770316556211e-07, + "loss": 2.2823, + "step": 1070 + }, + { + "epoch": 1.09026369168357, + "grad_norm": 1.6825060417395732, + "learning_rate": 5.363810082299148e-07, + "loss": 2.2525, + "step": 1075 + }, + { + "epoch": 1.0953346855983772, + "grad_norm": 1.7339475460772475, + "learning_rate": 5.27153635998387e-07, + "loss": 2.3006, + "step": 1080 + }, + { + "epoch": 1.1004056795131847, + "grad_norm": 1.6977028436147512, + "learning_rate": 5.180856819110773e-07, + "loss": 2.2862, + "step": 1085 + }, + { + "epoch": 1.105476673427992, + "grad_norm": 1.7119437312783958, + "learning_rate": 5.091746628792904e-07, + "loss": 2.243, + "step": 1090 + }, + { + "epoch": 1.1105476673427992, + "grad_norm": 1.7918277133466605, + "learning_rate": 5.004181302531108e-07, + "loss": 2.2653, + "step": 1095 + }, + { + "epoch": 1.1156186612576064, + "grad_norm": 1.7198038075584687, + "learning_rate": 4.918136694027396e-07, + "loss": 2.2741, + "step": 1100 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 1.7122122501534425, + "learning_rate": 4.833588993041994e-07, + "loss": 2.2757, + "step": 1105 + }, + { + "epoch": 1.1257606490872212, + "grad_norm": 1.6934117050919777, + "learning_rate": 4.750514721293719e-07, + "loss": 2.2484, + "step": 1110 + }, + { + "epoch": 1.1308316430020284, + "grad_norm": 1.8096755323665539, + "learning_rate": 4.6688907284032994e-07, + "loss": 2.2329, + "step": 1115 + }, + { + "epoch": 1.1359026369168357, + "grad_norm": 1.7732841203420067, + "learning_rate": 4.588694187879258e-07, + "loss": 2.2636, + "step": 1120 + }, + { + "epoch": 1.140973630831643, + "grad_norm": 1.70514589311023, + "learning_rate": 4.5099025931459913e-07, + "loss": 2.2778, + "step": 1125 + }, + { + "epoch": 1.1460446247464504, + "grad_norm": 1.7135354540773058, + "learning_rate": 4.4324937536136735e-07, + "loss": 2.2905, + "step": 1130 + }, + { + "epoch": 1.1511156186612577, + "grad_norm": 1.6901713268949445, + "learning_rate": 4.3564457907896125e-07, + "loss": 2.302, + "step": 1135 + }, + { + "epoch": 1.156186612576065, + "grad_norm": 1.7350424488382163, + "learning_rate": 4.281737134430704e-07, + "loss": 2.2441, + "step": 1140 + }, + { + "epoch": 1.1612576064908722, + "grad_norm": 1.7433418190612922, + "learning_rate": 4.208346518736604e-07, + "loss": 2.2639, + "step": 1145 + }, + { + "epoch": 1.1663286004056794, + "grad_norm": 1.7278183208713844, + "learning_rate": 4.136252978583281e-07, + "loss": 2.272, + "step": 1150 + }, + { + "epoch": 1.171399594320487, + "grad_norm": 1.7049575091462312, + "learning_rate": 4.0654358457965706e-07, + "loss": 2.2822, + "step": 1155 + }, + { + "epoch": 1.1764705882352942, + "grad_norm": 1.7614119208994081, + "learning_rate": 3.995874745465392e-07, + "loss": 2.2882, + "step": 1160 + }, + { + "epoch": 1.1815415821501014, + "grad_norm": 1.7783667378053016, + "learning_rate": 3.927549592294267e-07, + "loss": 2.2779, + "step": 1165 + }, + { + "epoch": 1.1866125760649087, + "grad_norm": 1.7857803604726208, + "learning_rate": 3.8604405869947905e-07, + "loss": 2.2504, + "step": 1170 + }, + { + "epoch": 1.1916835699797161, + "grad_norm": 1.7894737586957659, + "learning_rate": 3.794528212715714e-07, + "loss": 2.2896, + "step": 1175 + }, + { + "epoch": 1.1967545638945234, + "grad_norm": 1.7605294591830605, + "learning_rate": 3.7297932315112855e-07, + "loss": 2.2803, + "step": 1180 + }, + { + "epoch": 1.2018255578093306, + "grad_norm": 1.7037189312181982, + "learning_rate": 3.6662166808475126e-07, + "loss": 2.2595, + "step": 1185 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.802568691083643, + "learning_rate": 3.6037798701460037e-07, + "loss": 2.3097, + "step": 1190 + }, + { + "epoch": 1.2119675456389452, + "grad_norm": 1.7227242510965723, + "learning_rate": 3.5424643773650545e-07, + "loss": 2.2473, + "step": 1195 + }, + { + "epoch": 1.2170385395537526, + "grad_norm": 1.7126735182979083, + "learning_rate": 3.482252045617637e-07, + "loss": 2.3002, + "step": 1200 + }, + { + "epoch": 1.2170385395537526, + "eval_loss": 2.4735846519470215, + "eval_runtime": 81.0924, + "eval_samples_per_second": 86.395, + "eval_steps_per_second": 0.678, + "step": 1200 + }, + { + "epoch": 1.2221095334685599, + "grad_norm": 1.7418672417675343, + "learning_rate": 3.423124979825969e-07, + "loss": 2.2259, + "step": 1205 + }, + { + "epoch": 1.2271805273833671, + "grad_norm": 1.7536106052680211, + "learning_rate": 3.365065543412324e-07, + "loss": 2.2625, + "step": 1210 + }, + { + "epoch": 1.2322515212981744, + "grad_norm": 1.6738354256007202, + "learning_rate": 3.3080563550257607e-07, + "loss": 2.2762, + "step": 1215 + }, + { + "epoch": 1.2373225152129819, + "grad_norm": 1.7304199756653005, + "learning_rate": 3.2520802853044393e-07, + "loss": 2.2864, + "step": 1220 + }, + { + "epoch": 1.2423935091277891, + "grad_norm": 1.761088776037141, + "learning_rate": 3.197120453673215e-07, + "loss": 2.2665, + "step": 1225 + }, + { + "epoch": 1.2474645030425964, + "grad_norm": 1.7101358055188194, + "learning_rate": 3.143160225176168e-07, + "loss": 2.2775, + "step": 1230 + }, + { + "epoch": 1.2525354969574036, + "grad_norm": 1.7571854143932952, + "learning_rate": 3.0901832073437713e-07, + "loss": 2.2979, + "step": 1235 + }, + { + "epoch": 1.2576064908722109, + "grad_norm": 1.7216743809437804, + "learning_rate": 3.0381732470943653e-07, + "loss": 2.3094, + "step": 1240 + }, + { + "epoch": 1.2626774847870181, + "grad_norm": 1.6935950803242086, + "learning_rate": 2.9871144276696387e-07, + "loss": 2.2707, + "step": 1245 + }, + { + "epoch": 1.2677484787018256, + "grad_norm": 1.7158452472154153, + "learning_rate": 2.9369910656037903e-07, + "loss": 2.2532, + "step": 1250 + }, + { + "epoch": 1.2728194726166329, + "grad_norm": 1.7587458046328184, + "learning_rate": 2.8877877077260676e-07, + "loss": 2.2968, + "step": 1255 + }, + { + "epoch": 1.2778904665314401, + "grad_norm": 1.7348605445713965, + "learning_rate": 2.839489128196406e-07, + "loss": 2.2596, + "step": 1260 + }, + { + "epoch": 1.2829614604462476, + "grad_norm": 1.6962275978449755, + "learning_rate": 2.7920803255737635e-07, + "loss": 2.2579, + "step": 1265 + }, + { + "epoch": 1.2880324543610548, + "grad_norm": 1.7562952815143784, + "learning_rate": 2.7455465199170286e-07, + "loss": 2.2518, + "step": 1270 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 1.6974150722131578, + "learning_rate": 2.699873149917968e-07, + "loss": 2.2504, + "step": 1275 + }, + { + "epoch": 1.2981744421906694, + "grad_norm": 1.7036916845012207, + "learning_rate": 2.655045870066172e-07, + "loss": 2.2861, + "step": 1280 + }, + { + "epoch": 1.3032454361054766, + "grad_norm": 1.7486208966066876, + "learning_rate": 2.6110505478454324e-07, + "loss": 2.2467, + "step": 1285 + }, + { + "epoch": 1.3083164300202839, + "grad_norm": 1.712258524308874, + "learning_rate": 2.5678732609615423e-07, + "loss": 2.2515, + "step": 1290 + }, + { + "epoch": 1.3133874239350913, + "grad_norm": 1.7341023622582277, + "learning_rate": 2.525500294600939e-07, + "loss": 2.2757, + "step": 1295 + }, + { + "epoch": 1.3184584178498986, + "grad_norm": 1.889990239211246, + "learning_rate": 2.4839181387201796e-07, + "loss": 2.2791, + "step": 1300 + }, + { + "epoch": 1.3235294117647058, + "grad_norm": 1.798861207791198, + "learning_rate": 2.4431134853656976e-07, + "loss": 2.2817, + "step": 1305 + }, + { + "epoch": 1.3286004056795133, + "grad_norm": 1.7472239831698717, + "learning_rate": 2.4030732260238086e-07, + "loss": 2.2521, + "step": 1310 + }, + { + "epoch": 1.3336713995943206, + "grad_norm": 1.782522588407923, + "learning_rate": 2.3637844490004408e-07, + "loss": 2.2316, + "step": 1315 + }, + { + "epoch": 1.3387423935091278, + "grad_norm": 1.6996053792107884, + "learning_rate": 2.325234436830538e-07, + "loss": 2.2734, + "step": 1320 + }, + { + "epoch": 1.343813387423935, + "grad_norm": 1.7994805518930097, + "learning_rate": 2.2874106637166403e-07, + "loss": 2.2484, + "step": 1325 + }, + { + "epoch": 1.3488843813387423, + "grad_norm": 1.7489331509437775, + "learning_rate": 2.2503007929965749e-07, + "loss": 2.28, + "step": 1330 + }, + { + "epoch": 1.3539553752535496, + "grad_norm": 1.7160678233869127, + "learning_rate": 2.2138926746397777e-07, + "loss": 2.2565, + "step": 1335 + }, + { + "epoch": 1.359026369168357, + "grad_norm": 1.814687918697313, + "learning_rate": 2.178174342772177e-07, + "loss": 2.2517, + "step": 1340 + }, + { + "epoch": 1.3640973630831643, + "grad_norm": 1.6987256946879317, + "learning_rate": 2.143134013229167e-07, + "loss": 2.2672, + "step": 1345 + }, + { + "epoch": 1.3691683569979716, + "grad_norm": 1.7371785897491874, + "learning_rate": 2.1087600811366032e-07, + "loss": 2.2628, + "step": 1350 + }, + { + "epoch": 1.3742393509127788, + "grad_norm": 1.745926263655127, + "learning_rate": 2.075041118519355e-07, + "loss": 2.2532, + "step": 1355 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 1.700613383279488, + "learning_rate": 2.0419658719373504e-07, + "loss": 2.2617, + "step": 1360 + }, + { + "epoch": 1.3843813387423936, + "grad_norm": 1.691103098158946, + "learning_rate": 2.009523260148652e-07, + "loss": 2.2391, + "step": 1365 + }, + { + "epoch": 1.3894523326572008, + "grad_norm": 1.6917956046319294, + "learning_rate": 1.977702371799498e-07, + "loss": 2.2973, + "step": 1370 + }, + { + "epoch": 1.394523326572008, + "grad_norm": 1.7504566996070137, + "learning_rate": 1.946492463140869e-07, + "loss": 2.3102, + "step": 1375 + }, + { + "epoch": 1.3995943204868153, + "grad_norm": 1.838843879022522, + "learning_rate": 1.9158829557714903e-07, + "loss": 2.2819, + "step": 1380 + }, + { + "epoch": 1.4046653144016228, + "grad_norm": 1.7034157869918263, + "learning_rate": 1.8858634344068625e-07, + "loss": 2.2463, + "step": 1385 + }, + { + "epoch": 1.40973630831643, + "grad_norm": 1.7726664220307162, + "learning_rate": 1.8564236446742146e-07, + "loss": 2.2458, + "step": 1390 + }, + { + "epoch": 1.4148073022312373, + "grad_norm": 1.7584441947795304, + "learning_rate": 1.8275534909329853e-07, + "loss": 2.2663, + "step": 1395 + }, + { + "epoch": 1.4198782961460445, + "grad_norm": 1.7548926938859895, + "learning_rate": 1.7992430341207304e-07, + "loss": 2.29, + "step": 1400 + }, + { + "epoch": 1.4198782961460445, + "eval_loss": 2.4734323024749756, + "eval_runtime": 81.002, + "eval_samples_per_second": 86.492, + "eval_steps_per_second": 0.679, + "step": 1400 + }, + { + "epoch": 1.424949290060852, + "grad_norm": 1.691411914276979, + "learning_rate": 1.7714824896240595e-07, + "loss": 2.2565, + "step": 1405 + }, + { + "epoch": 1.4300202839756593, + "grad_norm": 1.7523279327159709, + "learning_rate": 1.7442622251745125e-07, + "loss": 2.2582, + "step": 1410 + }, + { + "epoch": 1.4350912778904665, + "grad_norm": 1.6844227513504313, + "learning_rate": 1.717572758768978e-07, + "loss": 2.2416, + "step": 1415 + }, + { + "epoch": 1.4401622718052738, + "grad_norm": 2.2030630647830245, + "learning_rate": 1.6914047566145662e-07, + "loss": 2.2289, + "step": 1420 + }, + { + "epoch": 1.445233265720081, + "grad_norm": 1.7795541841017355, + "learning_rate": 1.6657490310975468e-07, + "loss": 2.2841, + "step": 1425 + }, + { + "epoch": 1.4503042596348885, + "grad_norm": 1.8134633165357201, + "learning_rate": 1.6405965387762636e-07, + "loss": 2.2542, + "step": 1430 + }, + { + "epoch": 1.4553752535496958, + "grad_norm": 1.7604092301048675, + "learning_rate": 1.615938378397648e-07, + "loss": 2.2493, + "step": 1435 + }, + { + "epoch": 1.460446247464503, + "grad_norm": 1.8595724042593027, + "learning_rate": 1.5917657889372315e-07, + "loss": 2.2484, + "step": 1440 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 1.7081713686615858, + "learning_rate": 1.568070147662311e-07, + "loss": 2.2744, + "step": 1445 + }, + { + "epoch": 1.4705882352941178, + "grad_norm": 14.41030902656843, + "learning_rate": 1.5448429682181186e-07, + "loss": 2.2609, + "step": 1450 + }, + { + "epoch": 1.475659229208925, + "grad_norm": 1.7702111899429174, + "learning_rate": 1.5220758987367309e-07, + "loss": 2.2955, + "step": 1455 + }, + { + "epoch": 1.4807302231237323, + "grad_norm": 1.7932941724173908, + "learning_rate": 1.4997607199684964e-07, + "loss": 2.2478, + "step": 1460 + }, + { + "epoch": 1.4858012170385395, + "grad_norm": 1.7327449633169845, + "learning_rate": 1.477889343435765e-07, + "loss": 2.2713, + "step": 1465 + }, + { + "epoch": 1.4908722109533468, + "grad_norm": 1.7047486187689578, + "learning_rate": 1.456453809608691e-07, + "loss": 2.2586, + "step": 1470 + }, + { + "epoch": 1.495943204868154, + "grad_norm": 1.7085975289965103, + "learning_rate": 1.4354462861028889e-07, + "loss": 2.2602, + "step": 1475 + }, + { + "epoch": 1.5010141987829615, + "grad_norm": 1.7708851051604204, + "learning_rate": 1.414859065898731e-07, + "loss": 2.2913, + "step": 1480 + }, + { + "epoch": 1.5060851926977687, + "grad_norm": 1.6849008491575197, + "learning_rate": 1.3946845655820588e-07, + "loss": 2.2129, + "step": 1485 + }, + { + "epoch": 1.5111561866125762, + "grad_norm": 1.6770410018579935, + "learning_rate": 1.374915323606102e-07, + "loss": 2.2641, + "step": 1490 + }, + { + "epoch": 1.5162271805273835, + "grad_norm": 1.7333889728562109, + "learning_rate": 1.3555439985743863e-07, + "loss": 2.3096, + "step": 1495 + }, + { + "epoch": 1.5212981744421907, + "grad_norm": 1.7381149429179856, + "learning_rate": 1.3365633675444236e-07, + "loss": 2.2449, + "step": 1500 + }, + { + "epoch": 1.526369168356998, + "grad_norm": 1.7508604376509869, + "learning_rate": 1.317966324351968e-07, + "loss": 2.3006, + "step": 1505 + }, + { + "epoch": 1.5314401622718052, + "grad_norm": 1.731173156378831, + "learning_rate": 1.2997458779556342e-07, + "loss": 2.2721, + "step": 1510 + }, + { + "epoch": 1.5365111561866125, + "grad_norm": 1.7880722742651989, + "learning_rate": 1.2818951508016706e-07, + "loss": 2.2839, + "step": 1515 + }, + { + "epoch": 1.5415821501014197, + "grad_norm": 1.766456825336907, + "learning_rate": 1.264407377208682e-07, + "loss": 2.2542, + "step": 1520 + }, + { + "epoch": 1.5466531440162272, + "grad_norm": 1.793293076179441, + "learning_rate": 1.2472759017720967e-07, + "loss": 2.2345, + "step": 1525 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 1.7255231286858488, + "learning_rate": 1.2304941777881816e-07, + "loss": 2.2587, + "step": 1530 + }, + { + "epoch": 1.556795131845842, + "grad_norm": 1.7107497208562314, + "learning_rate": 1.214055765697399e-07, + "loss": 2.2587, + "step": 1535 + }, + { + "epoch": 1.5618661257606492, + "grad_norm": 1.7448234273922532, + "learning_rate": 1.197954331546911e-07, + "loss": 2.2493, + "step": 1540 + }, + { + "epoch": 1.5669371196754565, + "grad_norm": 1.713933005233849, + "learning_rate": 1.1821836454720342e-07, + "loss": 2.3028, + "step": 1545 + }, + { + "epoch": 1.5720081135902637, + "grad_norm": 1.8430768650069782, + "learning_rate": 1.1667375801964492e-07, + "loss": 2.2595, + "step": 1550 + }, + { + "epoch": 1.577079107505071, + "grad_norm": 1.7903141506679578, + "learning_rate": 1.15161010955097e-07, + "loss": 2.2555, + "step": 1555 + }, + { + "epoch": 1.5821501014198782, + "grad_norm": 1.810165731715535, + "learning_rate": 1.136795307010685e-07, + "loss": 2.2728, + "step": 1560 + }, + { + "epoch": 1.5872210953346855, + "grad_norm": 1.7357274884238136, + "learning_rate": 1.1222873442502753e-07, + "loss": 2.2741, + "step": 1565 + }, + { + "epoch": 1.592292089249493, + "grad_norm": 1.7545984913046129, + "learning_rate": 1.108080489717326e-07, + "loss": 2.2609, + "step": 1570 + }, + { + "epoch": 1.5973630831643002, + "grad_norm": 1.8639925458297812, + "learning_rate": 1.0941691072234387e-07, + "loss": 2.2349, + "step": 1575 + }, + { + "epoch": 1.6024340770791075, + "grad_norm": 1.7125402909483072, + "learning_rate": 1.080547654552963e-07, + "loss": 2.2929, + "step": 1580 + }, + { + "epoch": 1.607505070993915, + "grad_norm": 1.7300627575439524, + "learning_rate": 1.0672106820891631e-07, + "loss": 2.2823, + "step": 1585 + }, + { + "epoch": 1.6125760649087222, + "grad_norm": 1.7190554348875562, + "learning_rate": 1.0541528314576339e-07, + "loss": 2.2708, + "step": 1590 + }, + { + "epoch": 1.6176470588235294, + "grad_norm": 1.724918915538896, + "learning_rate": 1.04136883418679e-07, + "loss": 2.2491, + "step": 1595 + }, + { + "epoch": 1.6227180527383367, + "grad_norm": 1.7342048226287368, + "learning_rate": 1.0288535103852444e-07, + "loss": 2.2566, + "step": 1600 + }, + { + "epoch": 1.6227180527383367, + "eval_loss": 2.472487688064575, + "eval_runtime": 81.0795, + "eval_samples_per_second": 86.409, + "eval_steps_per_second": 0.678, + "step": 1600 + }, + { + "epoch": 1.627789046653144, + "grad_norm": 1.752725508386252, + "learning_rate": 1.0166017674359012e-07, + "loss": 2.2115, + "step": 1605 + }, + { + "epoch": 1.6328600405679512, + "grad_norm": 1.7053034674622713, + "learning_rate": 1.0046085987065856e-07, + "loss": 2.2349, + "step": 1610 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 1.6910767224745546, + "learning_rate": 9.928690822770361e-08, + "loss": 2.2661, + "step": 1615 + }, + { + "epoch": 1.643002028397566, + "grad_norm": 1.9415101732879068, + "learning_rate": 9.81378379682085e-08, + "loss": 2.2355, + "step": 1620 + }, + { + "epoch": 1.6480730223123732, + "grad_norm": 1.7692640477521646, + "learning_rate": 9.70131734670856e-08, + "loss": 2.2605, + "step": 1625 + }, + { + "epoch": 1.6531440162271807, + "grad_norm": 1.7825871200246013, + "learning_rate": 9.59124471981808e-08, + "loss": 2.2842, + "step": 1630 + }, + { + "epoch": 1.658215010141988, + "grad_norm": 1.805395258521555, + "learning_rate": 9.483519961334607e-08, + "loss": 2.2543, + "step": 1635 + }, + { + "epoch": 1.6632860040567952, + "grad_norm": 1.7151309029731219, + "learning_rate": 9.378097902306157e-08, + "loss": 2.2507, + "step": 1640 + }, + { + "epoch": 1.6683569979716024, + "grad_norm": 1.7662462146082336, + "learning_rate": 9.274934147859458e-08, + "loss": 2.2822, + "step": 1645 + }, + { + "epoch": 1.6734279918864097, + "grad_norm": 1.7065430440445857, + "learning_rate": 9.173985065567343e-08, + "loss": 2.2727, + "step": 1650 + }, + { + "epoch": 1.678498985801217, + "grad_norm": 1.8167004072102202, + "learning_rate": 9.075207773966592e-08, + "loss": 2.2582, + "step": 1655 + }, + { + "epoch": 1.6835699797160242, + "grad_norm": 1.7276973068156511, + "learning_rate": 8.978560131224021e-08, + "loss": 2.2451, + "step": 1660 + }, + { + "epoch": 1.6886409736308317, + "grad_norm": 1.7787413203893692, + "learning_rate": 8.88400072394981e-08, + "loss": 2.2421, + "step": 1665 + }, + { + "epoch": 1.693711967545639, + "grad_norm": 0.8868153668800921, + "learning_rate": 8.791488856155857e-08, + "loss": 2.2354, + "step": 1670 + }, + { + "epoch": 1.6987829614604464, + "grad_norm": 1.6998265742091707, + "learning_rate": 8.700984538358205e-08, + "loss": 2.264, + "step": 1675 + }, + { + "epoch": 1.7038539553752536, + "grad_norm": 1.7045446815412617, + "learning_rate": 8.612448476821393e-08, + "loss": 2.2775, + "step": 1680 + }, + { + "epoch": 1.708924949290061, + "grad_norm": 1.7898247009022359, + "learning_rate": 8.525842062943714e-08, + "loss": 2.2733, + "step": 1685 + }, + { + "epoch": 1.7139959432048681, + "grad_norm": 1.7604334600933766, + "learning_rate": 8.441127362781345e-08, + "loss": 2.2704, + "step": 1690 + }, + { + "epoch": 1.7190669371196754, + "grad_norm": 1.8108867949678853, + "learning_rate": 8.358267106710315e-08, + "loss": 2.2626, + "step": 1695 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 1.6881452920332736, + "learning_rate": 8.277224679224312e-08, + "loss": 2.2694, + "step": 1700 + }, + { + "epoch": 1.72920892494929, + "grad_norm": 1.7530216839199022, + "learning_rate": 8.197964108867328e-08, + "loss": 2.2622, + "step": 1705 + }, + { + "epoch": 1.7342799188640974, + "grad_norm": 1.7278497657123897, + "learning_rate": 8.12045005829916e-08, + "loss": 2.2471, + "step": 1710 + }, + { + "epoch": 1.7393509127789046, + "grad_norm": 1.8213327178561642, + "learning_rate": 8.044647814492792e-08, + "loss": 2.2313, + "step": 1715 + }, + { + "epoch": 1.744421906693712, + "grad_norm": 1.8304362576609268, + "learning_rate": 7.970523279061717e-08, + "loss": 2.2738, + "step": 1720 + }, + { + "epoch": 1.7494929006085194, + "grad_norm": 1.7718300765439339, + "learning_rate": 7.898042958716228e-08, + "loss": 2.2308, + "step": 1725 + }, + { + "epoch": 1.7545638945233266, + "grad_norm": 1.7305535723288619, + "learning_rate": 7.827173955846786e-08, + "loss": 2.2513, + "step": 1730 + }, + { + "epoch": 1.7596348884381339, + "grad_norm": 1.7402125464421778, + "learning_rate": 7.757883959233495e-08, + "loss": 2.2429, + "step": 1735 + }, + { + "epoch": 1.7647058823529411, + "grad_norm": 1.8175975710441392, + "learning_rate": 7.690141234879847e-08, + "loss": 2.288, + "step": 1740 + }, + { + "epoch": 1.7697768762677484, + "grad_norm": 1.851991292226803, + "learning_rate": 7.623914616969753e-08, + "loss": 2.2644, + "step": 1745 + }, + { + "epoch": 1.7748478701825556, + "grad_norm": 1.6602366231900278, + "learning_rate": 7.559173498946088e-08, + "loss": 2.2733, + "step": 1750 + }, + { + "epoch": 1.779918864097363, + "grad_norm": 1.7034994512549433, + "learning_rate": 7.495887824709769e-08, + "loss": 2.2674, + "step": 1755 + }, + { + "epoch": 1.7849898580121704, + "grad_norm": 1.7102833212058115, + "learning_rate": 7.434028079937624e-08, + "loss": 2.2752, + "step": 1760 + }, + { + "epoch": 1.7900608519269778, + "grad_norm": 2.1016603731428067, + "learning_rate": 7.373565283518085e-08, + "loss": 2.2726, + "step": 1765 + }, + { + "epoch": 1.795131845841785, + "grad_norm": 1.7876491597075783, + "learning_rate": 7.314470979103019e-08, + "loss": 2.2188, + "step": 1770 + }, + { + "epoch": 1.8002028397565923, + "grad_norm": 1.7984832581935817, + "learning_rate": 7.256717226774701e-08, + "loss": 2.2772, + "step": 1775 + }, + { + "epoch": 1.8052738336713996, + "grad_norm": 1.7621637378160073, + "learning_rate": 7.200276594826329e-08, + "loss": 2.2466, + "step": 1780 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 1.7255493399444854, + "learning_rate": 7.145122151655066e-08, + "loss": 2.2633, + "step": 1785 + }, + { + "epoch": 1.815415821501014, + "grad_norm": 1.7774418294615342, + "learning_rate": 7.101906869364121e-08, + "loss": 2.2966, + "step": 1790 + }, + { + "epoch": 1.8204868154158214, + "grad_norm": 1.7397631305330485, + "learning_rate": 7.049001264123894e-08, + "loss": 2.2644, + "step": 1795 + }, + { + "epoch": 1.8255578093306288, + "grad_norm": 1.7641738767791946, + "learning_rate": 6.997309032084255e-08, + "loss": 2.3052, + "step": 1800 + }, + { + "epoch": 1.8255578093306288, + "eval_loss": 2.4720866680145264, + "eval_runtime": 81.0596, + "eval_samples_per_second": 86.43, + "eval_steps_per_second": 0.679, + "step": 1800 + }, + { + "epoch": 1.830628803245436, + "grad_norm": 1.730995593445214, + "learning_rate": 6.946805070044455e-08, + "loss": 2.2748, + "step": 1805 + }, + { + "epoch": 1.8356997971602436, + "grad_norm": 1.708076665562477, + "learning_rate": 6.897464737518235e-08, + "loss": 2.2709, + "step": 1810 + }, + { + "epoch": 1.8407707910750508, + "grad_norm": 1.7961247246527527, + "learning_rate": 6.849263849253629e-08, + "loss": 2.2756, + "step": 1815 + }, + { + "epoch": 1.845841784989858, + "grad_norm": 1.7873259024447121, + "learning_rate": 6.802178667856782e-08, + "loss": 2.2619, + "step": 1820 + }, + { + "epoch": 1.8509127789046653, + "grad_norm": 1.7208578483390204, + "learning_rate": 6.756185896518329e-08, + "loss": 2.2563, + "step": 1825 + }, + { + "epoch": 1.8559837728194726, + "grad_norm": 1.6824119656694438, + "learning_rate": 6.711262671841385e-08, + "loss": 2.2524, + "step": 1830 + }, + { + "epoch": 1.8610547667342798, + "grad_norm": 1.717042060961093, + "learning_rate": 6.667386556769717e-08, + "loss": 2.3135, + "step": 1835 + }, + { + "epoch": 1.866125760649087, + "grad_norm": 1.736419652896857, + "learning_rate": 6.624535533615173e-08, + "loss": 2.288, + "step": 1840 + }, + { + "epoch": 1.8711967545638946, + "grad_norm": 1.75637188785577, + "learning_rate": 6.582687997182971e-08, + "loss": 2.2392, + "step": 1845 + }, + { + "epoch": 1.8762677484787018, + "grad_norm": 1.7282509939601418, + "learning_rate": 6.54182274799391e-08, + "loss": 2.2662, + "step": 1850 + }, + { + "epoch": 1.8813387423935093, + "grad_norm": 1.7060962855685544, + "learning_rate": 6.501918985602177e-08, + "loss": 2.2935, + "step": 1855 + }, + { + "epoch": 1.8864097363083165, + "grad_norm": 1.7581616823404618, + "learning_rate": 6.462956302007797e-08, + "loss": 2.2478, + "step": 1860 + }, + { + "epoch": 1.8914807302231238, + "grad_norm": 1.7987997676993257, + "learning_rate": 6.424914675162432e-08, + "loss": 2.2853, + "step": 1865 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 1.7116689993633696, + "learning_rate": 6.387774462567602e-08, + "loss": 2.2503, + "step": 1870 + }, + { + "epoch": 1.9016227180527383, + "grad_norm": 1.7086258587789072, + "learning_rate": 6.351516394964051e-08, + "loss": 2.2822, + "step": 1875 + }, + { + "epoch": 1.9066937119675456, + "grad_norm": 1.8235148496074345, + "learning_rate": 6.31612157011135e-08, + "loss": 2.2879, + "step": 1880 + }, + { + "epoch": 1.9117647058823528, + "grad_norm": 1.7448709638927917, + "learning_rate": 6.281571446656485e-08, + "loss": 2.2586, + "step": 1885 + }, + { + "epoch": 1.9168356997971603, + "grad_norm": 1.7421662505581106, + "learning_rate": 6.247847838090545e-08, + "loss": 2.2791, + "step": 1890 + }, + { + "epoch": 1.9219066937119675, + "grad_norm": 1.825830026911039, + "learning_rate": 6.21493290679226e-08, + "loss": 2.2385, + "step": 1895 + }, + { + "epoch": 1.9269776876267748, + "grad_norm": 1.796187481606512, + "learning_rate": 6.182809158157558e-08, + "loss": 2.2756, + "step": 1900 + }, + { + "epoch": 1.9320486815415823, + "grad_norm": 1.7552941496595575, + "learning_rate": 6.151459434813879e-08, + "loss": 2.2587, + "step": 1905 + }, + { + "epoch": 1.9371196754563895, + "grad_norm": 1.7522494947057408, + "learning_rate": 6.120866910918446e-08, + "loss": 2.2585, + "step": 1910 + }, + { + "epoch": 1.9421906693711968, + "grad_norm": 1.7522459962159465, + "learning_rate": 6.091015086539273e-08, + "loss": 2.251, + "step": 1915 + }, + { + "epoch": 1.947261663286004, + "grad_norm": 1.702096284758162, + "learning_rate": 6.061887782118077e-08, + "loss": 2.285, + "step": 1920 + }, + { + "epoch": 1.9523326572008113, + "grad_norm": 1.7643281133012019, + "learning_rate": 6.033469133013957e-08, + "loss": 2.2846, + "step": 1925 + }, + { + "epoch": 1.9574036511156185, + "grad_norm": 1.6926355627529537, + "learning_rate": 6.005743584126981e-08, + "loss": 2.2124, + "step": 1930 + }, + { + "epoch": 1.962474645030426, + "grad_norm": 1.6991484085258466, + "learning_rate": 5.984051918509233e-08, + "loss": 2.2919, + "step": 1935 + }, + { + "epoch": 1.9675456389452333, + "grad_norm": 1.6959402402475394, + "learning_rate": 5.957535718971899e-08, + "loss": 2.2133, + "step": 1940 + }, + { + "epoch": 1.9726166328600405, + "grad_norm": 1.7435422008262311, + "learning_rate": 5.931670667334593e-08, + "loss": 2.2272, + "step": 1945 + }, + { + "epoch": 1.977687626774848, + "grad_norm": 1.7235339509485863, + "learning_rate": 5.906442337098544e-08, + "loss": 2.2566, + "step": 1950 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 1.8046591422600013, + "learning_rate": 5.881836586579961e-08, + "loss": 2.295, + "step": 1955 + }, + { + "epoch": 1.9878296146044625, + "grad_norm": 1.8447312096680564, + "learning_rate": 5.8578395539777033e-08, + "loss": 2.29, + "step": 1960 + }, + { + "epoch": 1.9929006085192698, + "grad_norm": 1.6943108398877464, + "learning_rate": 5.834437652514426e-08, + "loss": 2.2188, + "step": 1965 + }, + { + "epoch": 1.997971602434077, + "grad_norm": 1.7174652428188777, + "learning_rate": 5.811617565650129e-08, + "loss": 2.2692, + "step": 1970 + }, + { + "epoch": 2.0030425963488843, + "grad_norm": 1.6831299340128894, + "learning_rate": 5.7893662423673665e-08, + "loss": 2.2025, + "step": 1975 + }, + { + "epoch": 2.0081135902636915, + "grad_norm": 1.826795197065323, + "learning_rate": 5.767670892527061e-08, + "loss": 2.2579, + "step": 1980 + }, + { + "epoch": 2.0131845841784988, + "grad_norm": 1.7520235012361185, + "learning_rate": 5.746518982294192e-08, + "loss": 2.2388, + "step": 1985 + }, + { + "epoch": 2.0182555780933065, + "grad_norm": 1.8440219249964744, + "learning_rate": 5.72589822963234e-08, + "loss": 2.2582, + "step": 1990 + }, + { + "epoch": 2.0233265720081137, + "grad_norm": 1.7151060194819, + "learning_rate": 5.705796599866345e-08, + "loss": 2.2156, + "step": 1995 + }, + { + "epoch": 2.028397565922921, + "grad_norm": 1.7333738899068507, + "learning_rate": 5.686202301312118e-08, + "loss": 2.2702, + "step": 2000 + }, + { + "epoch": 2.028397565922921, + "eval_loss": 2.4733877182006836, + "eval_runtime": 81.1205, + "eval_samples_per_second": 86.365, + "eval_steps_per_second": 0.678, + "step": 2000 + }, + { + "epoch": 2.0334685598377282, + "grad_norm": 1.7637474983877708, + "learning_rate": 5.667103780972823e-08, + "loss": 2.2378, + "step": 2005 + }, + { + "epoch": 2.0385395537525355, + "grad_norm": 1.7730571315134518, + "learning_rate": 5.648489720300554e-08, + "loss": 2.2513, + "step": 2010 + }, + { + "epoch": 2.0436105476673427, + "grad_norm": 1.774271074894755, + "learning_rate": 5.630349031022691e-08, + "loss": 2.2518, + "step": 2015 + }, + { + "epoch": 2.04868154158215, + "grad_norm": 1.6997020509374097, + "learning_rate": 5.6126708510320976e-08, + "loss": 2.2464, + "step": 2020 + }, + { + "epoch": 2.0537525354969572, + "grad_norm": 1.7833382557650153, + "learning_rate": 5.595444540340353e-08, + "loss": 2.2317, + "step": 2025 + }, + { + "epoch": 2.0588235294117645, + "grad_norm": 1.7296871432561252, + "learning_rate": 5.578659677093205e-08, + "loss": 2.231, + "step": 2030 + }, + { + "epoch": 2.063894523326572, + "grad_norm": 1.7166463945290173, + "learning_rate": 5.562306053647459e-08, + "loss": 2.2347, + "step": 2035 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 1.7948324654757548, + "learning_rate": 5.546373672708482e-08, + "loss": 2.2458, + "step": 2040 + }, + { + "epoch": 2.0740365111561867, + "grad_norm": 1.745646645076283, + "learning_rate": 5.530852743527571e-08, + "loss": 2.2504, + "step": 2045 + }, + { + "epoch": 2.079107505070994, + "grad_norm": 1.7778201657756552, + "learning_rate": 5.515733678158393e-08, + "loss": 2.26, + "step": 2050 + }, + { + "epoch": 2.084178498985801, + "grad_norm": 1.7226724662159607, + "learning_rate": 5.5010070877717374e-08, + "loss": 2.24, + "step": 2055 + }, + { + "epoch": 2.0892494929006085, + "grad_norm": 1.737085412071484, + "learning_rate": 5.486663779027808e-08, + "loss": 2.2138, + "step": 2060 + }, + { + "epoch": 2.0943204868154157, + "grad_norm": 1.7680067007098665, + "learning_rate": 5.4726947505053265e-08, + "loss": 2.2688, + "step": 2065 + }, + { + "epoch": 2.099391480730223, + "grad_norm": 1.7414742255329991, + "learning_rate": 5.459091189186688e-08, + "loss": 2.2591, + "step": 2070 + }, + { + "epoch": 2.1044624746450302, + "grad_norm": 1.7804223600059563, + "learning_rate": 5.4458444669984314e-08, + "loss": 2.2337, + "step": 2075 + }, + { + "epoch": 2.109533468559838, + "grad_norm": 1.7481822321590552, + "learning_rate": 5.432946137406314e-08, + "loss": 2.2792, + "step": 2080 + }, + { + "epoch": 2.114604462474645, + "grad_norm": 1.7497391573214505, + "learning_rate": 5.420387932064249e-08, + "loss": 2.2927, + "step": 2085 + }, + { + "epoch": 2.1196754563894524, + "grad_norm": 1.7279168540890797, + "learning_rate": 5.408161757516413e-08, + "loss": 2.2451, + "step": 2090 + }, + { + "epoch": 2.1247464503042597, + "grad_norm": 1.7394662730899328, + "learning_rate": 5.396259691951805e-08, + "loss": 2.2424, + "step": 2095 + }, + { + "epoch": 2.129817444219067, + "grad_norm": 1.77875077601377, + "learning_rate": 5.384673982010568e-08, + "loss": 2.2402, + "step": 2100 + }, + { + "epoch": 2.134888438133874, + "grad_norm": 1.7319261658863345, + "learning_rate": 5.373397039641377e-08, + "loss": 2.2287, + "step": 2105 + }, + { + "epoch": 2.1399594320486814, + "grad_norm": 1.751571162082358, + "learning_rate": 5.362421439009217e-08, + "loss": 2.2334, + "step": 2110 + }, + { + "epoch": 2.1450304259634887, + "grad_norm": 1.8093044605440316, + "learning_rate": 5.351739913452874e-08, + "loss": 2.271, + "step": 2115 + }, + { + "epoch": 2.150101419878296, + "grad_norm": 1.8469881188013633, + "learning_rate": 5.341345352491468e-08, + "loss": 2.2284, + "step": 2120 + }, + { + "epoch": 2.1551724137931036, + "grad_norm": 1.7711139740473771, + "learning_rate": 5.331230798879373e-08, + "loss": 2.2644, + "step": 2125 + }, + { + "epoch": 2.160243407707911, + "grad_norm": 1.7271859975777568, + "learning_rate": 5.3213894457088646e-08, + "loss": 2.2378, + "step": 2130 + }, + { + "epoch": 2.165314401622718, + "grad_norm": 1.8925272013685321, + "learning_rate": 5.3118146335598536e-08, + "loss": 2.265, + "step": 2135 + }, + { + "epoch": 2.1703853955375254, + "grad_norm": 1.7527393142771752, + "learning_rate": 5.3024998476960626e-08, + "loss": 2.2183, + "step": 2140 + }, + { + "epoch": 2.1754563894523327, + "grad_norm": 1.7698628867396988, + "learning_rate": 5.293438715307019e-08, + "loss": 2.233, + "step": 2145 + }, + { + "epoch": 2.18052738336714, + "grad_norm": 1.724950058777004, + "learning_rate": 5.2846250027952295e-08, + "loss": 2.249, + "step": 2150 + }, + { + "epoch": 2.185598377281947, + "grad_norm": 1.9072718835854334, + "learning_rate": 5.276052613107927e-08, + "loss": 2.2342, + "step": 2155 + }, + { + "epoch": 2.1906693711967544, + "grad_norm": 1.7983471937343785, + "learning_rate": 5.2677155831127696e-08, + "loss": 2.2707, + "step": 2160 + }, + { + "epoch": 2.1957403651115617, + "grad_norm": 1.7092533410568467, + "learning_rate": 5.259608081016899e-08, + "loss": 2.2479, + "step": 2165 + }, + { + "epoch": 2.2008113590263694, + "grad_norm": 1.7921254707864127, + "learning_rate": 5.2517244038287416e-08, + "loss": 2.229, + "step": 2170 + }, + { + "epoch": 2.2058823529411766, + "grad_norm": 1.75489401951672, + "learning_rate": 5.244058974861976e-08, + "loss": 2.2772, + "step": 2175 + }, + { + "epoch": 2.210953346855984, + "grad_norm": 1.8175479517709452, + "learning_rate": 5.236606341281078e-08, + "loss": 2.2356, + "step": 2180 + }, + { + "epoch": 2.216024340770791, + "grad_norm": 1.808556074117745, + "learning_rate": 5.229361171687859e-08, + "loss": 2.2553, + "step": 2185 + }, + { + "epoch": 2.2210953346855984, + "grad_norm": 1.7664667006627157, + "learning_rate": 5.2223182537484316e-08, + "loss": 2.2719, + "step": 2190 + }, + { + "epoch": 2.2261663286004056, + "grad_norm": 1.7502392717778497, + "learning_rate": 5.2154724918600314e-08, + "loss": 2.2583, + "step": 2195 + }, + { + "epoch": 2.231237322515213, + "grad_norm": 1.7242967584463027, + "learning_rate": 5.208818904857144e-08, + "loss": 2.2411, + "step": 2200 + }, + { + "epoch": 2.231237322515213, + "eval_loss": 2.474597930908203, + "eval_runtime": 81.0438, + "eval_samples_per_second": 86.447, + "eval_steps_per_second": 0.679, + "step": 2200 + }, + { + "epoch": 2.23630831643002, + "grad_norm": 1.760326712726159, + "learning_rate": 5.202352623756371e-08, + "loss": 2.2356, + "step": 2205 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 1.7625638663030738, + "learning_rate": 5.1960688895395006e-08, + "loss": 2.2441, + "step": 2210 + }, + { + "epoch": 2.2464503042596347, + "grad_norm": 1.7518142596486186, + "learning_rate": 5.189963050974238e-08, + "loss": 2.2674, + "step": 2215 + }, + { + "epoch": 2.2515212981744424, + "grad_norm": 1.8040378121090448, + "learning_rate": 5.184030562472053e-08, + "loss": 2.2233, + "step": 2220 + }, + { + "epoch": 2.2565922920892496, + "grad_norm": 1.769147010660197, + "learning_rate": 5.1782669819826294e-08, + "loss": 2.2445, + "step": 2225 + }, + { + "epoch": 2.261663286004057, + "grad_norm": 1.802360281392845, + "learning_rate": 5.1726679689243875e-08, + "loss": 2.234, + "step": 2230 + }, + { + "epoch": 2.266734279918864, + "grad_norm": 1.763707867667644, + "learning_rate": 5.1672292821505586e-08, + "loss": 2.2132, + "step": 2235 + }, + { + "epoch": 2.2718052738336714, + "grad_norm": 1.75034581686763, + "learning_rate": 5.161946777950308e-08, + "loss": 2.2381, + "step": 2240 + }, + { + "epoch": 2.2768762677484786, + "grad_norm": 1.7401836199474783, + "learning_rate": 5.1568164080844036e-08, + "loss": 2.2416, + "step": 2245 + }, + { + "epoch": 2.281947261663286, + "grad_norm": 1.7713650977668527, + "learning_rate": 5.1518342178549174e-08, + "loss": 2.224, + "step": 2250 + }, + { + "epoch": 2.287018255578093, + "grad_norm": 1.7671231076913356, + "learning_rate": 5.146996344208486e-08, + "loss": 2.2183, + "step": 2255 + }, + { + "epoch": 2.292089249492901, + "grad_norm": 1.7464419032652747, + "learning_rate": 5.142299013872629e-08, + "loss": 2.2419, + "step": 2260 + }, + { + "epoch": 2.297160243407708, + "grad_norm": 1.7990294085116565, + "learning_rate": 5.1377385415246445e-08, + "loss": 2.2311, + "step": 2265 + }, + { + "epoch": 2.3022312373225153, + "grad_norm": 1.7543351264072877, + "learning_rate": 5.1333113279926185e-08, + "loss": 2.238, + "step": 2270 + }, + { + "epoch": 2.3073022312373226, + "grad_norm": 1.6898279670163325, + "learning_rate": 5.129013858488057e-08, + "loss": 2.2308, + "step": 2275 + }, + { + "epoch": 2.31237322515213, + "grad_norm": 1.7334567047607963, + "learning_rate": 5.124842700869695e-08, + "loss": 2.3031, + "step": 2280 + }, + { + "epoch": 2.317444219066937, + "grad_norm": 1.760983319309442, + "learning_rate": 5.120794503938012e-08, + "loss": 2.2455, + "step": 2285 + }, + { + "epoch": 2.3225152129817443, + "grad_norm": 1.7621675205518297, + "learning_rate": 5.116865995760006e-08, + "loss": 2.228, + "step": 2290 + }, + { + "epoch": 2.3275862068965516, + "grad_norm": 1.8080633887862172, + "learning_rate": 5.113053982023768e-08, + "loss": 2.284, + "step": 2295 + }, + { + "epoch": 2.332657200811359, + "grad_norm": 1.7592998081055247, + "learning_rate": 5.1093553444224286e-08, + "loss": 2.2196, + "step": 2300 + }, + { + "epoch": 2.337728194726166, + "grad_norm": 1.7831607571885368, + "learning_rate": 5.105767039067024e-08, + "loss": 2.269, + "step": 2305 + }, + { + "epoch": 2.342799188640974, + "grad_norm": 1.7176459519033709, + "learning_rate": 5.102286094927856e-08, + "loss": 2.2435, + "step": 2310 + }, + { + "epoch": 2.347870182555781, + "grad_norm": 1.7512756209003166, + "learning_rate": 5.098909612303925e-08, + "loss": 2.2579, + "step": 2315 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 1.7419259056225642, + "learning_rate": 5.095634761319991e-08, + "loss": 2.268, + "step": 2320 + }, + { + "epoch": 2.3580121703853956, + "grad_norm": 1.7461469979215953, + "learning_rate": 5.092458780450876e-08, + "loss": 2.2252, + "step": 2325 + }, + { + "epoch": 2.363083164300203, + "grad_norm": 1.745083473021831, + "learning_rate": 5.089378975072569e-08, + "loss": 2.2591, + "step": 2330 + }, + { + "epoch": 2.36815415821501, + "grad_norm": 1.8343705825023535, + "learning_rate": 5.086392716039744e-08, + "loss": 2.2626, + "step": 2335 + }, + { + "epoch": 2.3732251521298173, + "grad_norm": 1.7515682941502182, + "learning_rate": 5.0834974382892763e-08, + "loss": 2.2378, + "step": 2340 + }, + { + "epoch": 2.3782961460446246, + "grad_norm": 1.772483228062822, + "learning_rate": 5.080690639469371e-08, + "loss": 2.2906, + "step": 2345 + }, + { + "epoch": 2.3833671399594323, + "grad_norm": 1.8298309311035177, + "learning_rate": 5.077969878593903e-08, + "loss": 2.2782, + "step": 2350 + }, + { + "epoch": 2.3884381338742395, + "grad_norm": 1.778228901931638, + "learning_rate": 5.0753327747215805e-08, + "loss": 2.2687, + "step": 2355 + }, + { + "epoch": 2.393509127789047, + "grad_norm": 1.9355725485663295, + "learning_rate": 5.0727770056595594e-08, + "loss": 2.25, + "step": 2360 + }, + { + "epoch": 2.398580121703854, + "grad_norm": 1.7876677525732199, + "learning_rate": 5.070300306691114e-08, + "loss": 2.2811, + "step": 2365 + }, + { + "epoch": 2.4036511156186613, + "grad_norm": 1.766450812020173, + "learning_rate": 5.067900469327011e-08, + "loss": 2.265, + "step": 2370 + }, + { + "epoch": 2.4087221095334685, + "grad_norm": 1.6988211316677768, + "learning_rate": 5.065575340080193e-08, + "loss": 2.2458, + "step": 2375 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 1.777565241311822, + "learning_rate": 5.063322819263436e-08, + "loss": 2.289, + "step": 2380 + }, + { + "epoch": 2.418864097363083, + "grad_norm": 1.766648317811343, + "learning_rate": 5.061140859809592e-08, + "loss": 2.2263, + "step": 2385 + }, + { + "epoch": 2.4239350912778903, + "grad_norm": 1.760808570512941, + "learning_rate": 5.059027466114087e-08, + "loss": 2.2371, + "step": 2390 + }, + { + "epoch": 2.4290060851926976, + "grad_norm": 1.7497881623660254, + "learning_rate": 5.056980692899308e-08, + "loss": 2.2186, + "step": 2395 + }, + { + "epoch": 2.4340770791075053, + "grad_norm": 1.904368651484495, + "learning_rate": 5.0549986441005356e-08, + "loss": 2.2413, + "step": 2400 + }, + { + "epoch": 2.4340770791075053, + "eval_loss": 2.4748759269714355, + "eval_runtime": 81.0832, + "eval_samples_per_second": 86.405, + "eval_steps_per_second": 0.678, + "step": 2400 + }, + { + "epoch": 2.4391480730223125, + "grad_norm": 1.7410363640013542, + "learning_rate": 5.053079471773089e-08, + "loss": 2.2531, + "step": 2405 + }, + { + "epoch": 2.4442190669371198, + "grad_norm": 1.7518018775000213, + "learning_rate": 5.0512213750203305e-08, + "loss": 2.2473, + "step": 2410 + }, + { + "epoch": 2.449290060851927, + "grad_norm": 1.7662222396602074, + "learning_rate": 5.049422598942212e-08, + "loss": 2.2389, + "step": 2415 + }, + { + "epoch": 2.4543610547667343, + "grad_norm": 1.780666367007688, + "learning_rate": 5.0476814336040274e-08, + "loss": 2.197, + "step": 2420 + }, + { + "epoch": 2.4594320486815415, + "grad_norm": 1.7499711395815145, + "learning_rate": 5.04599621302504e-08, + "loss": 2.2261, + "step": 2425 + }, + { + "epoch": 2.464503042596349, + "grad_norm": 1.7882713122146334, + "learning_rate": 5.04436531418668e-08, + "loss": 2.2393, + "step": 2430 + }, + { + "epoch": 2.469574036511156, + "grad_norm": 1.75643986036064, + "learning_rate": 5.042787156059982e-08, + "loss": 2.2439, + "step": 2435 + }, + { + "epoch": 2.4746450304259637, + "grad_norm": 1.7353199942499, + "learning_rate": 5.041260198651953e-08, + "loss": 2.2275, + "step": 2440 + }, + { + "epoch": 2.479716024340771, + "grad_norm": 1.7683236873580634, + "learning_rate": 5.039782942070575e-08, + "loss": 2.2378, + "step": 2445 + }, + { + "epoch": 2.4847870182555782, + "grad_norm": 1.7482878827223234, + "learning_rate": 5.038353925608112e-08, + "loss": 2.2655, + "step": 2450 + }, + { + "epoch": 2.4898580121703855, + "grad_norm": 1.7553465772492238, + "learning_rate": 5.036971726842454e-08, + "loss": 2.2509, + "step": 2455 + }, + { + "epoch": 2.4949290060851927, + "grad_norm": 1.7194051175937297, + "learning_rate": 5.035634960756173e-08, + "loss": 2.2246, + "step": 2460 + }, + { + "epoch": 2.5, + "grad_norm": 1.780820717878673, + "learning_rate": 5.0345973520341744e-08, + "loss": 2.3116, + "step": 2465 + }, + { + "epoch": 2.5050709939148073, + "grad_norm": 1.7092302368812895, + "learning_rate": 5.0333389906255366e-08, + "loss": 2.2434, + "step": 2470 + }, + { + "epoch": 2.5101419878296145, + "grad_norm": 1.6995993050400164, + "learning_rate": 5.03212237555571e-08, + "loss": 2.234, + "step": 2475 + }, + { + "epoch": 2.5152129817444218, + "grad_norm": 1.7916125090755124, + "learning_rate": 5.030946256214713e-08, + "loss": 2.2365, + "step": 2480 + }, + { + "epoch": 2.520283975659229, + "grad_norm": 1.743409123646943, + "learning_rate": 5.0298094154063516e-08, + "loss": 2.2778, + "step": 2485 + }, + { + "epoch": 2.5253549695740363, + "grad_norm": 1.7989761193864806, + "learning_rate": 5.028710668564437e-08, + "loss": 2.2698, + "step": 2490 + }, + { + "epoch": 2.530425963488844, + "grad_norm": 1.768436463277154, + "learning_rate": 5.027648862984817e-08, + "loss": 2.2295, + "step": 2495 + }, + { + "epoch": 2.535496957403651, + "grad_norm": 1.7762161444449078, + "learning_rate": 5.026622877072948e-08, + "loss": 2.2772, + "step": 2500 + }, + { + "epoch": 2.5405679513184585, + "grad_norm": 1.7325943514517332, + "learning_rate": 5.0256316196067565e-08, + "loss": 2.2326, + "step": 2505 + }, + { + "epoch": 2.5456389452332657, + "grad_norm": 1.7568007182157335, + "learning_rate": 5.024674029014512e-08, + "loss": 2.2575, + "step": 2510 + }, + { + "epoch": 2.550709939148073, + "grad_norm": 1.7465474101311085, + "learning_rate": 5.023749072667476e-08, + "loss": 2.2398, + "step": 2515 + }, + { + "epoch": 2.5557809330628802, + "grad_norm": 1.7105972624166814, + "learning_rate": 5.022855746187064e-08, + "loss": 2.2348, + "step": 2520 + }, + { + "epoch": 2.5608519269776875, + "grad_norm": 1.759196327867933, + "learning_rate": 5.021993072766265e-08, + "loss": 2.2302, + "step": 2525 + }, + { + "epoch": 2.565922920892495, + "grad_norm": 1.7618696598564434, + "learning_rate": 5.0211601025050875e-08, + "loss": 2.2783, + "step": 2530 + }, + { + "epoch": 2.5709939148073024, + "grad_norm": 1.7357397604845723, + "learning_rate": 5.020355911759782e-08, + "loss": 2.2399, + "step": 2535 + }, + { + "epoch": 2.5760649087221097, + "grad_norm": 1.7797963559349856, + "learning_rate": 5.019579602505595e-08, + "loss": 2.3119, + "step": 2540 + }, + { + "epoch": 2.581135902636917, + "grad_norm": 1.7476476267237637, + "learning_rate": 5.0188303017128396e-08, + "loss": 2.2362, + "step": 2545 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 1.7871655712678034, + "learning_rate": 5.018107160736018e-08, + "loss": 2.2684, + "step": 2550 + }, + { + "epoch": 2.5912778904665315, + "grad_norm": 1.8564365985849263, + "learning_rate": 5.0174093547158035e-08, + "loss": 2.2683, + "step": 2555 + }, + { + "epoch": 2.5963488843813387, + "grad_norm": 1.7498854511370805, + "learning_rate": 5.016736081993624e-08, + "loss": 2.2518, + "step": 2560 + }, + { + "epoch": 2.601419878296146, + "grad_norm": 1.7966977533010748, + "learning_rate": 5.016086563538651e-08, + "loss": 2.2218, + "step": 2565 + }, + { + "epoch": 2.606490872210953, + "grad_norm": 1.7558979371137615, + "learning_rate": 5.015460042386951e-08, + "loss": 2.2658, + "step": 2570 + }, + { + "epoch": 2.6115618661257605, + "grad_norm": 1.7805268954368878, + "learning_rate": 5.014855783092602e-08, + "loss": 2.2324, + "step": 2575 + }, + { + "epoch": 2.6166328600405677, + "grad_norm": 1.7547744035144406, + "learning_rate": 5.0142730711905564e-08, + "loss": 2.2635, + "step": 2580 + }, + { + "epoch": 2.6217038539553754, + "grad_norm": 1.7892043381738651, + "learning_rate": 5.013711212671024e-08, + "loss": 2.2174, + "step": 2585 + }, + { + "epoch": 2.6267748478701827, + "grad_norm": 1.7661048483256172, + "learning_rate": 5.013169533465201e-08, + "loss": 2.2411, + "step": 2590 + }, + { + "epoch": 2.63184584178499, + "grad_norm": 1.7714992602824393, + "learning_rate": 5.012647378942108e-08, + "loss": 2.2379, + "step": 2595 + }, + { + "epoch": 2.636916835699797, + "grad_norm": 1.757980523509378, + "learning_rate": 5.0121441134163554e-08, + "loss": 2.216, + "step": 2600 + }, + { + "epoch": 2.636916835699797, + "eval_loss": 2.4749209880828857, + "eval_runtime": 81.0391, + "eval_samples_per_second": 86.452, + "eval_steps_per_second": 0.679, + "step": 2600 + }, + { + "epoch": 2.6419878296146044, + "grad_norm": 1.8485215916583273, + "learning_rate": 5.011659119666631e-08, + "loss": 2.2233, + "step": 2605 + }, + { + "epoch": 2.6470588235294117, + "grad_norm": 1.7863067371305124, + "learning_rate": 5.0111917984647157e-08, + "loss": 2.244, + "step": 2610 + }, + { + "epoch": 2.652129817444219, + "grad_norm": 1.7146816358296353, + "learning_rate": 5.010741568114834e-08, + "loss": 2.2351, + "step": 2615 + }, + { + "epoch": 2.6572008113590266, + "grad_norm": 1.831188399230356, + "learning_rate": 5.0103078640031516e-08, + "loss": 2.2269, + "step": 2620 + }, + { + "epoch": 2.662271805273834, + "grad_norm": 1.7724728214531387, + "learning_rate": 5.009890138157231e-08, + "loss": 2.2075, + "step": 2625 + }, + { + "epoch": 2.667342799188641, + "grad_norm": 1.782021890238949, + "learning_rate": 5.009487858815262e-08, + "loss": 2.217, + "step": 2630 + }, + { + "epoch": 2.6724137931034484, + "grad_norm": 1.7481328251498853, + "learning_rate": 5.0091005100048845e-08, + "loss": 2.2719, + "step": 2635 + }, + { + "epoch": 2.6774847870182557, + "grad_norm": 1.7906104909059064, + "learning_rate": 5.0087275911314286e-08, + "loss": 2.236, + "step": 2640 + }, + { + "epoch": 2.682555780933063, + "grad_norm": 1.7602535674283515, + "learning_rate": 5.008368616575389e-08, + "loss": 2.2479, + "step": 2645 + }, + { + "epoch": 2.68762677484787, + "grad_norm": 1.775336072092801, + "learning_rate": 5.00802311529897e-08, + "loss": 2.2651, + "step": 2650 + }, + { + "epoch": 2.6926977687626774, + "grad_norm": 1.7553544981528668, + "learning_rate": 5.00769063046152e-08, + "loss": 2.2695, + "step": 2655 + }, + { + "epoch": 2.6977687626774847, + "grad_norm": 1.827043155040219, + "learning_rate": 5.0073707190436947e-08, + "loss": 2.2565, + "step": 2660 + }, + { + "epoch": 2.702839756592292, + "grad_norm": 1.7286161050152862, + "learning_rate": 5.00706295148018e-08, + "loss": 2.2447, + "step": 2665 + }, + { + "epoch": 2.707910750507099, + "grad_norm": 1.818175461042268, + "learning_rate": 5.0067669113008144e-08, + "loss": 2.2437, + "step": 2670 + }, + { + "epoch": 2.7129817444219064, + "grad_norm": 1.8017061603291116, + "learning_rate": 5.006482194779946e-08, + "loss": 2.2557, + "step": 2675 + }, + { + "epoch": 2.718052738336714, + "grad_norm": 1.7866064039916518, + "learning_rate": 5.006208410593867e-08, + "loss": 2.2752, + "step": 2680 + }, + { + "epoch": 2.7231237322515214, + "grad_norm": 1.7655940160674672, + "learning_rate": 5.0059451794861766e-08, + "loss": 2.2834, + "step": 2685 + }, + { + "epoch": 2.7281947261663286, + "grad_norm": 1.7936324116014108, + "learning_rate": 5.005692133940906e-08, + "loss": 2.2634, + "step": 2690 + }, + { + "epoch": 2.733265720081136, + "grad_norm": 1.7857563825463283, + "learning_rate": 5.00544891786327e-08, + "loss": 2.2741, + "step": 2695 + }, + { + "epoch": 2.738336713995943, + "grad_norm": 1.7472045814339527, + "learning_rate": 5.005215186267882e-08, + "loss": 2.2644, + "step": 2700 + }, + { + "epoch": 2.7434077079107504, + "grad_norm": 1.8795177703424921, + "learning_rate": 5.0049906049743e-08, + "loss": 2.3007, + "step": 2705 + }, + { + "epoch": 2.7484787018255576, + "grad_norm": 1.8521743861085576, + "learning_rate": 5.004774850309745e-08, + "loss": 2.2366, + "step": 2710 + }, + { + "epoch": 2.7535496957403653, + "grad_norm": 1.7735396381086006, + "learning_rate": 5.0045676088188616e-08, + "loss": 2.2481, + "step": 2715 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 1.750426755759642, + "learning_rate": 5.004368576980381e-08, + "loss": 2.2235, + "step": 2720 + }, + { + "epoch": 2.76369168356998, + "grad_norm": 1.7041388090684644, + "learning_rate": 5.004177460930539e-08, + "loss": 2.2231, + "step": 2725 + }, + { + "epoch": 2.768762677484787, + "grad_norm": 1.8140115420681437, + "learning_rate": 5.003993976193124e-08, + "loss": 2.2138, + "step": 2730 + }, + { + "epoch": 2.7738336713995944, + "grad_norm": 1.822513477317258, + "learning_rate": 5.0038178474160234e-08, + "loss": 2.2612, + "step": 2735 + }, + { + "epoch": 2.7789046653144016, + "grad_norm": 1.7108014551704207, + "learning_rate": 5.003648808114121e-08, + "loss": 2.2464, + "step": 2740 + }, + { + "epoch": 2.783975659229209, + "grad_norm": 1.7880353168056893, + "learning_rate": 5.0034866004184443e-08, + "loss": 2.2571, + "step": 2745 + }, + { + "epoch": 2.789046653144016, + "grad_norm": 1.738078469289302, + "learning_rate": 5.003330974831406e-08, + "loss": 2.2712, + "step": 2750 + }, + { + "epoch": 2.7941176470588234, + "grad_norm": 1.851997917147577, + "learning_rate": 5.0031816899880413e-08, + "loss": 2.266, + "step": 2755 + }, + { + "epoch": 2.7991886409736306, + "grad_norm": 1.7297614602052127, + "learning_rate": 5.0030385124230966e-08, + "loss": 2.2423, + "step": 2760 + }, + { + "epoch": 2.804259634888438, + "grad_norm": 1.8006816107770167, + "learning_rate": 5.002901216343864e-08, + "loss": 2.2506, + "step": 2765 + }, + { + "epoch": 2.8093306288032456, + "grad_norm": 1.8037373860257597, + "learning_rate": 5.002769583408638e-08, + "loss": 2.2504, + "step": 2770 + }, + { + "epoch": 2.814401622718053, + "grad_norm": 1.7406557827783702, + "learning_rate": 5.002643402510677e-08, + "loss": 2.2676, + "step": 2775 + }, + { + "epoch": 2.81947261663286, + "grad_norm": 1.7784795193672072, + "learning_rate": 5.0025224695675576e-08, + "loss": 2.2052, + "step": 2780 + }, + { + "epoch": 2.8245436105476673, + "grad_norm": 1.7627831810019972, + "learning_rate": 5.002406587315805e-08, + "loss": 2.2315, + "step": 2785 + }, + { + "epoch": 2.8296146044624746, + "grad_norm": 1.798869752268086, + "learning_rate": 5.0022955651106973e-08, + "loss": 2.2436, + "step": 2790 + }, + { + "epoch": 2.834685598377282, + "grad_norm": 1.712097491290732, + "learning_rate": 5.00218921873112e-08, + "loss": 2.274, + "step": 2795 + }, + { + "epoch": 2.839756592292089, + "grad_norm": 1.8197661388888422, + "learning_rate": 5.002087370189384e-08, + "loss": 2.2696, + "step": 2800 + }, + { + "epoch": 2.839756592292089, + "eval_loss": 2.4746689796447754, + "eval_runtime": 80.933, + "eval_samples_per_second": 86.565, + "eval_steps_per_second": 0.68, + "step": 2800 + }, + { + "epoch": 2.844827586206897, + "grad_norm": 1.7693694208988924, + "learning_rate": 5.001989847545882e-08, + "loss": 2.2054, + "step": 2805 + }, + { + "epoch": 2.849898580121704, + "grad_norm": 1.8223549799119019, + "learning_rate": 5.001896484728491e-08, + "loss": 2.2656, + "step": 2810 + }, + { + "epoch": 2.8549695740365113, + "grad_norm": 1.805868445642325, + "learning_rate": 5.00180712135662e-08, + "loss": 2.26, + "step": 2815 + }, + { + "epoch": 2.8600405679513186, + "grad_norm": 1.7505054153674502, + "learning_rate": 5.001721602569797e-08, + "loss": 2.2465, + "step": 2820 + }, + { + "epoch": 2.865111561866126, + "grad_norm": 1.8486977309170785, + "learning_rate": 5.0016397788606984e-08, + "loss": 2.2764, + "step": 2825 + }, + { + "epoch": 2.870182555780933, + "grad_norm": 1.7740829866432102, + "learning_rate": 5.0015615059125324e-08, + "loss": 2.2303, + "step": 2830 + }, + { + "epoch": 2.8752535496957403, + "grad_norm": 1.7656514305652502, + "learning_rate": 5.00148664444067e-08, + "loss": 2.238, + "step": 2835 + }, + { + "epoch": 2.8803245436105476, + "grad_norm": 1.7634420973902674, + "learning_rate": 5.001415060038435e-08, + "loss": 2.2489, + "step": 2840 + }, + { + "epoch": 2.885395537525355, + "grad_norm": 1.8143454888420456, + "learning_rate": 5.0013466230269694e-08, + "loss": 2.2607, + "step": 2845 + }, + { + "epoch": 2.890466531440162, + "grad_norm": 1.7405623983796592, + "learning_rate": 5.001281208309067e-08, + "loss": 2.2677, + "step": 2850 + }, + { + "epoch": 2.8955375253549693, + "grad_norm": 1.7692613071607504, + "learning_rate": 5.0012186952269086e-08, + "loss": 2.2499, + "step": 2855 + }, + { + "epoch": 2.900608519269777, + "grad_norm": 1.8007487263191868, + "learning_rate": 5.0011589674235926e-08, + "loss": 2.277, + "step": 2860 + }, + { + "epoch": 2.9056795131845843, + "grad_norm": 1.7487914626739638, + "learning_rate": 5.001101912708386e-08, + "loss": 2.2377, + "step": 2865 + }, + { + "epoch": 2.9107505070993915, + "grad_norm": 1.7555747509644022, + "learning_rate": 5.0010474229256126e-08, + "loss": 2.2532, + "step": 2870 + }, + { + "epoch": 2.915821501014199, + "grad_norm": 1.791874000591728, + "learning_rate": 5.0009953938270927e-08, + "loss": 2.234, + "step": 2875 + }, + { + "epoch": 2.920892494929006, + "grad_norm": 1.8071787232301668, + "learning_rate": 5.0009457249480536e-08, + "loss": 2.2316, + "step": 2880 + }, + { + "epoch": 2.9259634888438133, + "grad_norm": 1.7814343272445903, + "learning_rate": 5.000898319486436e-08, + "loss": 2.2427, + "step": 2885 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 1.8248593697919109, + "learning_rate": 5.000853084185513e-08, + "loss": 2.2027, + "step": 2890 + }, + { + "epoch": 2.9361054766734282, + "grad_norm": 1.7986268547334479, + "learning_rate": 5.00080992921975e-08, + "loss": 2.244, + "step": 2895 + }, + { + "epoch": 2.9411764705882355, + "grad_norm": 1.8701642658692874, + "learning_rate": 5.0007687680838296e-08, + "loss": 2.2341, + "step": 2900 + }, + { + "epoch": 2.9462474645030428, + "grad_norm": 1.7265239787323012, + "learning_rate": 5.000729517484766e-08, + "loss": 2.2781, + "step": 2905 + }, + { + "epoch": 2.95131845841785, + "grad_norm": 1.7596094154490194, + "learning_rate": 5.0006920972370384e-08, + "loss": 2.2184, + "step": 2910 + }, + { + "epoch": 2.9563894523326573, + "grad_norm": 1.775542548895703, + "learning_rate": 5.000656430160671e-08, + "loss": 2.2404, + "step": 2915 + }, + { + "epoch": 2.9614604462474645, + "grad_norm": 1.7859302210997496, + "learning_rate": 5.0006224419821984e-08, + "loss": 2.2567, + "step": 2920 + }, + { + "epoch": 2.9665314401622718, + "grad_norm": 1.8410867262560875, + "learning_rate": 5.000590061238431e-08, + "loss": 2.2288, + "step": 2925 + }, + { + "epoch": 2.971602434077079, + "grad_norm": 1.79261063919542, + "learning_rate": 5.0005592191829755e-08, + "loss": 2.2421, + "step": 2930 + }, + { + "epoch": 2.9766734279918863, + "grad_norm": 1.787266539908181, + "learning_rate": 5.0005298496954236e-08, + "loss": 2.2713, + "step": 2935 + }, + { + "epoch": 2.9817444219066935, + "grad_norm": 1.8046073077938924, + "learning_rate": 5.000501889193161e-08, + "loss": 2.2292, + "step": 2940 + }, + { + "epoch": 2.986815415821501, + "grad_norm": 1.785150585134779, + "learning_rate": 5.0004752765457286e-08, + "loss": 2.2557, + "step": 2945 + }, + { + "epoch": 2.991886409736308, + "grad_norm": 1.7007836630596234, + "learning_rate": 5.000449952991666e-08, + "loss": 2.2913, + "step": 2950 + }, + { + "epoch": 2.9969574036511157, + "grad_norm": 1.7834634363941848, + "learning_rate": 5.000425862057791e-08, + "loss": 2.2178, + "step": 2955 + }, + { + "epoch": 3.002028397565923, + "grad_norm": 1.7711499203458665, + "learning_rate": 5.000402949480845e-08, + "loss": 2.2302, + "step": 2960 + }, + { + "epoch": 3.0070993914807302, + "grad_norm": 1.757400702100505, + "learning_rate": 5.000381163131448e-08, + "loss": 2.228, + "step": 2965 + }, + { + "epoch": 3.0121703853955375, + "grad_norm": 1.7587243027978727, + "learning_rate": 5.0003604529403105e-08, + "loss": 2.2532, + "step": 2970 + }, + { + "epoch": 3.0172413793103448, + "grad_norm": 1.8076763012567914, + "learning_rate": 5.000340770826644e-08, + "loss": 2.2812, + "step": 2975 + }, + { + "epoch": 3.022312373225152, + "grad_norm": 1.7710168575859588, + "learning_rate": 5.000322070628711e-08, + "loss": 2.2227, + "step": 2980 + }, + { + "epoch": 3.0273833671399593, + "grad_norm": 1.7518567665908418, + "learning_rate": 5.0003043080364665e-08, + "loss": 2.267, + "step": 2985 + }, + { + "epoch": 3.032454361054767, + "grad_norm": 1.75371879782544, + "learning_rate": 5.0002874405262365e-08, + "loss": 2.2748, + "step": 2990 + }, + { + "epoch": 3.037525354969574, + "grad_norm": 1.7604102341237111, + "learning_rate": 5.000271427297382e-08, + "loss": 2.244, + "step": 2995 + }, + { + "epoch": 3.0425963488843815, + "grad_norm": 1.7473066315528492, + "learning_rate": 5.0002562292108974e-08, + "loss": 2.2455, + "step": 3000 + }, + { + "epoch": 3.0425963488843815, + "eval_loss": 2.475208282470703, + "eval_runtime": 81.0816, + "eval_samples_per_second": 86.407, + "eval_steps_per_second": 0.678, + "step": 3000 + }, + { + "epoch": 3.0476673427991887, + "grad_norm": 1.8183626105425974, + "learning_rate": 5.000241808729891e-08, + "loss": 2.2598, + "step": 3005 + }, + { + "epoch": 3.052738336713996, + "grad_norm": 1.776003383845723, + "learning_rate": 5.00022812986191e-08, + "loss": 2.2749, + "step": 3010 + }, + { + "epoch": 3.0578093306288032, + "grad_norm": 1.8405505191800016, + "learning_rate": 5.0002151581030434e-08, + "loss": 2.2201, + "step": 3015 + }, + { + "epoch": 3.0628803245436105, + "grad_norm": 1.7687042107524293, + "learning_rate": 5.00020286038378e-08, + "loss": 2.2398, + "step": 3020 + }, + { + "epoch": 3.0679513184584177, + "grad_norm": 1.7504153888466234, + "learning_rate": 5.000191205016553e-08, + "loss": 2.2221, + "step": 3025 + }, + { + "epoch": 3.073022312373225, + "grad_norm": 1.7642074409964643, + "learning_rate": 5.000180161644944e-08, + "loss": 2.2223, + "step": 3030 + }, + { + "epoch": 3.0780933062880322, + "grad_norm": 1.7392036544850287, + "learning_rate": 5.000169701194494e-08, + "loss": 2.2192, + "step": 3035 + }, + { + "epoch": 3.08316430020284, + "grad_norm": 1.720350344708903, + "learning_rate": 5.0001597958250776e-08, + "loss": 2.2315, + "step": 3040 + }, + { + "epoch": 3.088235294117647, + "grad_norm": 1.7724706443214726, + "learning_rate": 5.000150418884808e-08, + "loss": 2.2501, + "step": 3045 + }, + { + "epoch": 3.0933062880324544, + "grad_norm": 1.7924639073969963, + "learning_rate": 5.000141544865421e-08, + "loss": 2.2446, + "step": 3050 + }, + { + "epoch": 3.0983772819472617, + "grad_norm": 1.736852243176053, + "learning_rate": 5.000133149359102e-08, + "loss": 2.2457, + "step": 3055 + }, + { + "epoch": 3.103448275862069, + "grad_norm": 1.784090807966895, + "learning_rate": 5.000125209016723e-08, + "loss": 2.2521, + "step": 3060 + }, + { + "epoch": 3.108519269776876, + "grad_norm": 1.7552195819841987, + "learning_rate": 5.000117701507439e-08, + "loss": 2.2331, + "step": 3065 + }, + { + "epoch": 3.1135902636916835, + "grad_norm": 1.7588419707647238, + "learning_rate": 5.0001106054796176e-08, + "loss": 2.2465, + "step": 3070 + }, + { + "epoch": 3.1186612576064907, + "grad_norm": 1.731249391051153, + "learning_rate": 5.000103900523059e-08, + "loss": 2.2154, + "step": 3075 + }, + { + "epoch": 3.123732251521298, + "grad_norm": 1.86107961069035, + "learning_rate": 5.0000975671324725e-08, + "loss": 2.2498, + "step": 3080 + }, + { + "epoch": 3.1288032454361057, + "grad_norm": 1.7453958505335196, + "learning_rate": 5.000091586672176e-08, + "loss": 2.213, + "step": 3085 + }, + { + "epoch": 3.133874239350913, + "grad_norm": 1.739107722358469, + "learning_rate": 5.000085941341981e-08, + "loss": 2.2703, + "step": 3090 + }, + { + "epoch": 3.13894523326572, + "grad_norm": 1.723031377500322, + "learning_rate": 5.000080614144228e-08, + "loss": 2.256, + "step": 3095 + }, + { + "epoch": 3.1440162271805274, + "grad_norm": 1.7859618571141844, + "learning_rate": 5.0000755888519526e-08, + "loss": 2.2446, + "step": 3100 + }, + { + "epoch": 3.1490872210953347, + "grad_norm": 1.7642645902841112, + "learning_rate": 5.0000708499781274e-08, + "loss": 2.2365, + "step": 3105 + }, + { + "epoch": 3.154158215010142, + "grad_norm": 1.8188951223969028, + "learning_rate": 5.000066382745973e-08, + "loss": 2.2743, + "step": 3110 + }, + { + "epoch": 3.159229208924949, + "grad_norm": 1.8017937041457348, + "learning_rate": 5.000062173060291e-08, + "loss": 2.2501, + "step": 3115 + }, + { + "epoch": 3.1643002028397564, + "grad_norm": 1.7816544045204796, + "learning_rate": 5.0000582074797944e-08, + "loss": 2.2025, + "step": 3120 + }, + { + "epoch": 3.1693711967545637, + "grad_norm": 1.7911385432695703, + "learning_rate": 5.0000544731904076e-08, + "loss": 2.2284, + "step": 3125 + }, + { + "epoch": 3.1744421906693714, + "grad_norm": 1.9232399576032946, + "learning_rate": 5.000050957979507e-08, + "loss": 2.2407, + "step": 3130 + }, + { + "epoch": 3.1795131845841786, + "grad_norm": 1.7293397524348884, + "learning_rate": 5.000047650211071e-08, + "loss": 2.2468, + "step": 3135 + }, + { + "epoch": 3.184584178498986, + "grad_norm": 1.7870474846773756, + "learning_rate": 5.000044538801721e-08, + "loss": 2.2432, + "step": 3140 + }, + { + "epoch": 3.189655172413793, + "grad_norm": 1.7179456705770244, + "learning_rate": 5.000041613197611e-08, + "loss": 2.2478, + "step": 3145 + }, + { + "epoch": 3.1947261663286004, + "grad_norm": 1.782930312662543, + "learning_rate": 5.0000388633521626e-08, + "loss": 2.219, + "step": 3150 + }, + { + "epoch": 3.1997971602434077, + "grad_norm": 1.8396726211182168, + "learning_rate": 5.000036279704598e-08, + "loss": 2.2131, + "step": 3155 + }, + { + "epoch": 3.204868154158215, + "grad_norm": 1.7441223394696925, + "learning_rate": 5.000033853159261e-08, + "loss": 2.216, + "step": 3160 + }, + { + "epoch": 3.209939148073022, + "grad_norm": 1.79701015495686, + "learning_rate": 5.000031575065695e-08, + "loss": 2.2423, + "step": 3165 + }, + { + "epoch": 3.2150101419878294, + "grad_norm": 1.7824241551117812, + "learning_rate": 5.000029437199458e-08, + "loss": 2.245, + "step": 3170 + }, + { + "epoch": 3.220081135902637, + "grad_norm": 1.7859671284571614, + "learning_rate": 5.000027431743653e-08, + "loss": 2.2466, + "step": 3175 + }, + { + "epoch": 3.2251521298174444, + "grad_norm": 1.7508641392805016, + "learning_rate": 5.000025551271141e-08, + "loss": 2.2123, + "step": 3180 + }, + { + "epoch": 3.2302231237322516, + "grad_norm": 1.790375251718636, + "learning_rate": 5.000023788727435e-08, + "loss": 2.2387, + "step": 3185 + }, + { + "epoch": 3.235294117647059, + "grad_norm": 1.8347285573544698, + "learning_rate": 5.0000221374142326e-08, + "loss": 2.2024, + "step": 3190 + }, + { + "epoch": 3.240365111561866, + "grad_norm": 1.766020664546832, + "learning_rate": 5.0000205909735805e-08, + "loss": 2.25, + "step": 3195 + }, + { + "epoch": 3.2454361054766734, + "grad_norm": 1.7685652184853669, + "learning_rate": 5.000019143372644e-08, + "loss": 2.216, + "step": 3200 + }, + { + "epoch": 3.2454361054766734, + "eval_loss": 2.475315809249878, + "eval_runtime": 81.0728, + "eval_samples_per_second": 86.416, + "eval_steps_per_second": 0.678, + "step": 3200 + }, + { + "epoch": 3.2505070993914806, + "grad_norm": 1.8114020440458831, + "learning_rate": 5.000017788889067e-08, + "loss": 2.2909, + "step": 3205 + }, + { + "epoch": 3.255578093306288, + "grad_norm": 1.8044780174846506, + "learning_rate": 5.0000165220969006e-08, + "loss": 2.2682, + "step": 3210 + }, + { + "epoch": 3.260649087221095, + "grad_norm": 1.8227060747974817, + "learning_rate": 5.0000153378530776e-08, + "loss": 2.2551, + "step": 3215 + }, + { + "epoch": 3.2657200811359024, + "grad_norm": 1.712746112733307, + "learning_rate": 5.000014231284425e-08, + "loss": 2.2085, + "step": 3220 + }, + { + "epoch": 3.27079107505071, + "grad_norm": 1.7693643379563115, + "learning_rate": 5.000013197775189e-08, + "loss": 2.2089, + "step": 3225 + }, + { + "epoch": 3.2758620689655173, + "grad_norm": 1.742416891486272, + "learning_rate": 5.000012232955056e-08, + "loss": 2.2256, + "step": 3230 + }, + { + "epoch": 3.2809330628803246, + "grad_norm": 1.7588332712006007, + "learning_rate": 5.000011332687656e-08, + "loss": 2.2411, + "step": 3235 + }, + { + "epoch": 3.286004056795132, + "grad_norm": 1.748987632844159, + "learning_rate": 5.000010493059533e-08, + "loss": 2.2161, + "step": 3240 + }, + { + "epoch": 3.291075050709939, + "grad_norm": 1.7730209178260556, + "learning_rate": 5.000009710369558e-08, + "loss": 2.2454, + "step": 3245 + }, + { + "epoch": 3.2961460446247464, + "grad_norm": 1.7638994477476329, + "learning_rate": 5.000008981118782e-08, + "loss": 2.2762, + "step": 3250 + }, + { + "epoch": 3.3012170385395536, + "grad_norm": 1.8306906774843352, + "learning_rate": 5.000008302000705e-08, + "loss": 2.2484, + "step": 3255 + }, + { + "epoch": 3.306288032454361, + "grad_norm": 1.8155910247025784, + "learning_rate": 5.0000076698919504e-08, + "loss": 2.2172, + "step": 3260 + }, + { + "epoch": 3.3113590263691686, + "grad_norm": 1.9000838772092157, + "learning_rate": 5.0000070818433264e-08, + "loss": 2.2639, + "step": 3265 + }, + { + "epoch": 3.316430020283976, + "grad_norm": 1.8182257588876376, + "learning_rate": 5.000006535071267e-08, + "loss": 2.2302, + "step": 3270 + }, + { + "epoch": 3.321501014198783, + "grad_norm": 1.7421030430480422, + "learning_rate": 5.0000060269496374e-08, + "loss": 2.2618, + "step": 3275 + }, + { + "epoch": 3.3265720081135903, + "grad_norm": 1.7545361773998456, + "learning_rate": 5.0000055550018825e-08, + "loss": 2.2174, + "step": 3280 + }, + { + "epoch": 3.3316430020283976, + "grad_norm": 1.7382589137313635, + "learning_rate": 5.000005116893524e-08, + "loss": 2.2497, + "step": 3285 + }, + { + "epoch": 3.336713995943205, + "grad_norm": 1.7544110577796528, + "learning_rate": 5.000004710424972e-08, + "loss": 2.2386, + "step": 3290 + }, + { + "epoch": 3.341784989858012, + "grad_norm": 1.7756370830140873, + "learning_rate": 5.0000043335246576e-08, + "loss": 2.2124, + "step": 3295 + }, + { + "epoch": 3.3468559837728193, + "grad_norm": 1.7647740914276824, + "learning_rate": 5.0000039842424645e-08, + "loss": 2.2357, + "step": 3300 + }, + { + "epoch": 3.3519269776876266, + "grad_norm": 1.7614092517536837, + "learning_rate": 5.000003660743452e-08, + "loss": 2.2823, + "step": 3305 + }, + { + "epoch": 3.356997971602434, + "grad_norm": 1.7889494130903192, + "learning_rate": 5.000003361301858e-08, + "loss": 2.1835, + "step": 3310 + }, + { + "epoch": 3.3620689655172415, + "grad_norm": 1.7154434994558871, + "learning_rate": 5.000003084295374e-08, + "loss": 2.2724, + "step": 3315 + }, + { + "epoch": 3.367139959432049, + "grad_norm": 1.8155130093382392, + "learning_rate": 5.0000028281996743e-08, + "loss": 2.2823, + "step": 3320 + }, + { + "epoch": 3.372210953346856, + "grad_norm": 1.880078020122213, + "learning_rate": 5.0000025915832e-08, + "loss": 2.2421, + "step": 3325 + }, + { + "epoch": 3.3772819472616633, + "grad_norm": 1.7913171122885942, + "learning_rate": 5.000002373102181e-08, + "loss": 2.1806, + "step": 3330 + }, + { + "epoch": 3.3823529411764706, + "grad_norm": 1.8110141267464457, + "learning_rate": 5.000002171495887e-08, + "loss": 2.2315, + "step": 3335 + }, + { + "epoch": 3.387423935091278, + "grad_norm": 1.8187945379716748, + "learning_rate": 5.000001985582107e-08, + "loss": 2.2207, + "step": 3340 + }, + { + "epoch": 3.392494929006085, + "grad_norm": 1.7822827152937282, + "learning_rate": 5.000001814252828e-08, + "loss": 2.2411, + "step": 3345 + }, + { + "epoch": 3.3975659229208923, + "grad_norm": 1.7281310183638643, + "learning_rate": 5.0000016564701364e-08, + "loss": 2.2415, + "step": 3350 + }, + { + "epoch": 3.4026369168357, + "grad_norm": 1.7550793470914747, + "learning_rate": 5.000001511262302e-08, + "loss": 2.2464, + "step": 3355 + }, + { + "epoch": 3.4077079107505073, + "grad_norm": 1.7459578038518018, + "learning_rate": 5.0000013777200565e-08, + "loss": 2.2504, + "step": 3360 + }, + { + "epoch": 3.4127789046653145, + "grad_norm": 1.740338062654503, + "learning_rate": 5.000001254993049e-08, + "loss": 2.2292, + "step": 3365 + }, + { + "epoch": 3.417849898580122, + "grad_norm": 1.8005446847395141, + "learning_rate": 5.000001142286484e-08, + "loss": 2.2646, + "step": 3370 + }, + { + "epoch": 3.422920892494929, + "grad_norm": 1.8075984781615184, + "learning_rate": 5.000001038857911e-08, + "loss": 2.2549, + "step": 3375 + }, + { + "epoch": 3.4279918864097363, + "grad_norm": 1.7944612854944237, + "learning_rate": 5.000000944014192e-08, + "loss": 2.2607, + "step": 3380 + }, + { + "epoch": 3.4330628803245435, + "grad_norm": 1.8042996177778357, + "learning_rate": 5.000000857108604e-08, + "loss": 2.2129, + "step": 3385 + }, + { + "epoch": 3.438133874239351, + "grad_norm": 1.812331539187214, + "learning_rate": 5.0000007775380984e-08, + "loss": 2.247, + "step": 3390 + }, + { + "epoch": 3.443204868154158, + "grad_norm": 1.7634101221518121, + "learning_rate": 5.0000007047407e-08, + "loss": 2.2454, + "step": 3395 + }, + { + "epoch": 3.4482758620689653, + "grad_norm": 1.8137979752467785, + "learning_rate": 5.000000638193037e-08, + "loss": 2.2348, + "step": 3400 + }, + { + "epoch": 3.4482758620689653, + "eval_loss": 2.475677013397217, + "eval_runtime": 81.0429, + "eval_samples_per_second": 86.448, + "eval_steps_per_second": 0.679, + "step": 3400 + }, + { + "epoch": 3.453346855983773, + "grad_norm": 1.7639358988388496, + "learning_rate": 5.0000005774079994e-08, + "loss": 2.2434, + "step": 3405 + }, + { + "epoch": 3.4584178498985803, + "grad_norm": 1.8860372894717414, + "learning_rate": 5.0000005219325215e-08, + "loss": 2.2184, + "step": 3410 + }, + { + "epoch": 3.4634888438133875, + "grad_norm": 1.792302245525526, + "learning_rate": 5.000000471345483e-08, + "loss": 2.2405, + "step": 3415 + }, + { + "epoch": 3.4685598377281948, + "grad_norm": 1.7326646638681342, + "learning_rate": 5.000000425255718e-08, + "loss": 2.2582, + "step": 3420 + }, + { + "epoch": 3.473630831643002, + "grad_norm": 1.7944771245301363, + "learning_rate": 5.0000003833001365e-08, + "loss": 2.202, + "step": 3425 + }, + { + "epoch": 3.4787018255578093, + "grad_norm": 1.8158606522431084, + "learning_rate": 5.000000345141943e-08, + "loss": 2.2533, + "step": 3430 + }, + { + "epoch": 3.4837728194726165, + "grad_norm": 1.8541024781685664, + "learning_rate": 5.0000003104689555e-08, + "loss": 2.2387, + "step": 3435 + }, + { + "epoch": 3.4888438133874238, + "grad_norm": 1.7999921658917655, + "learning_rate": 5.0000002789920174e-08, + "loss": 2.2441, + "step": 3440 + }, + { + "epoch": 3.4939148073022315, + "grad_norm": 1.7685774604287066, + "learning_rate": 5.000000250443497e-08, + "loss": 2.3018, + "step": 3445 + }, + { + "epoch": 3.4989858012170387, + "grad_norm": 1.7777470112493552, + "learning_rate": 5.000000224575872e-08, + "loss": 2.2433, + "step": 3450 + }, + { + "epoch": 3.504056795131846, + "grad_norm": 1.7748253950125374, + "learning_rate": 5.000000201160396e-08, + "loss": 2.2782, + "step": 3455 + }, + { + "epoch": 3.5091277890466532, + "grad_norm": 1.7842700957790634, + "learning_rate": 5.000000179985839e-08, + "loss": 2.2659, + "step": 3460 + }, + { + "epoch": 3.5141987829614605, + "grad_norm": 1.798939281745875, + "learning_rate": 5.000000160857302e-08, + "loss": 2.2396, + "step": 3465 + }, + { + "epoch": 3.5192697768762677, + "grad_norm": 1.8045276757468276, + "learning_rate": 5.000000143595102e-08, + "loss": 2.2325, + "step": 3470 + }, + { + "epoch": 3.524340770791075, + "grad_norm": 1.7262031285233723, + "learning_rate": 5.0000001280337235e-08, + "loss": 2.243, + "step": 3475 + }, + { + "epoch": 3.5294117647058822, + "grad_norm": 1.8375261220518257, + "learning_rate": 5.000000114020828e-08, + "loss": 2.2075, + "step": 3480 + }, + { + "epoch": 3.5344827586206895, + "grad_norm": 1.8163152606406519, + "learning_rate": 5.0000001014163305e-08, + "loss": 2.2494, + "step": 3485 + }, + { + "epoch": 3.5395537525354968, + "grad_norm": 1.8525927462335219, + "learning_rate": 5.0000000900915245e-08, + "loss": 2.2163, + "step": 3490 + }, + { + "epoch": 3.544624746450304, + "grad_norm": 1.7805165281974848, + "learning_rate": 5.000000079928269e-08, + "loss": 2.2525, + "step": 3495 + }, + { + "epoch": 3.5496957403651117, + "grad_norm": 1.7990737454408499, + "learning_rate": 5.000000070818217e-08, + "loss": 2.2874, + "step": 3500 + }, + { + "epoch": 3.554766734279919, + "grad_norm": 1.8247781997920414, + "learning_rate": 5.000000062662102e-08, + "loss": 2.2215, + "step": 3505 + }, + { + "epoch": 3.559837728194726, + "grad_norm": 1.9826615858248522, + "learning_rate": 5.000000055369062e-08, + "loss": 2.2443, + "step": 3510 + }, + { + "epoch": 3.5649087221095335, + "grad_norm": 1.799487216606698, + "learning_rate": 5.000000048856012e-08, + "loss": 2.2266, + "step": 3515 + }, + { + "epoch": 3.5699797160243407, + "grad_norm": 1.8091696515518445, + "learning_rate": 5.0000000430470526e-08, + "loss": 2.2517, + "step": 3520 + }, + { + "epoch": 3.575050709939148, + "grad_norm": 1.7814535925288772, + "learning_rate": 5.0000000378729234e-08, + "loss": 2.2321, + "step": 3525 + }, + { + "epoch": 3.5801217038539552, + "grad_norm": 1.850742214981416, + "learning_rate": 5.000000033270488e-08, + "loss": 2.2597, + "step": 3530 + }, + { + "epoch": 3.585192697768763, + "grad_norm": 1.7822355084719033, + "learning_rate": 5.000000029182252e-08, + "loss": 2.2963, + "step": 3535 + }, + { + "epoch": 3.59026369168357, + "grad_norm": 1.7548584963433536, + "learning_rate": 5.0000000255559235e-08, + "loss": 2.2669, + "step": 3540 + }, + { + "epoch": 3.5953346855983774, + "grad_norm": 1.8526633444752874, + "learning_rate": 5.0000000223439884e-08, + "loss": 2.2367, + "step": 3545 + }, + { + "epoch": 3.6004056795131847, + "grad_norm": 1.8813498033155052, + "learning_rate": 5.0000000195033304e-08, + "loss": 2.2373, + "step": 3550 + }, + { + "epoch": 3.605476673427992, + "grad_norm": 1.7670822667081592, + "learning_rate": 5.0000000169948675e-08, + "loss": 2.2705, + "step": 3555 + }, + { + "epoch": 3.610547667342799, + "grad_norm": 1.7756286276528583, + "learning_rate": 5.000000014783217e-08, + "loss": 2.2979, + "step": 3560 + }, + { + "epoch": 3.6156186612576064, + "grad_norm": 1.7467172856710016, + "learning_rate": 5.000000012836387e-08, + "loss": 2.2538, + "step": 3565 + }, + { + "epoch": 3.6206896551724137, + "grad_norm": 1.7107623358426811, + "learning_rate": 5.000000011125491e-08, + "loss": 2.2807, + "step": 3570 + }, + { + "epoch": 3.625760649087221, + "grad_norm": 1.8431542462448438, + "learning_rate": 5.000000009624475e-08, + "loss": 2.252, + "step": 3575 + }, + { + "epoch": 3.630831643002028, + "grad_norm": 1.7683303237840782, + "learning_rate": 5.000000008309876e-08, + "loss": 2.2722, + "step": 3580 + }, + { + "epoch": 3.6359026369168355, + "grad_norm": 1.7463535795755278, + "learning_rate": 5.000000007160591e-08, + "loss": 2.2712, + "step": 3585 + }, + { + "epoch": 3.640973630831643, + "grad_norm": 1.8412435208194315, + "learning_rate": 5.0000000061576706e-08, + "loss": 2.2438, + "step": 3590 + }, + { + "epoch": 3.6460446247464504, + "grad_norm": 1.7731354966851007, + "learning_rate": 5.000000005284119e-08, + "loss": 2.2305, + "step": 3595 + }, + { + "epoch": 3.6511156186612577, + "grad_norm": 1.7263977118619886, + "learning_rate": 5.0000000045247174e-08, + "loss": 2.238, + "step": 3600 + }, + { + "epoch": 3.6511156186612577, + "eval_loss": 2.475299596786499, + "eval_runtime": 81.0503, + "eval_samples_per_second": 86.44, + "eval_steps_per_second": 0.679, + "step": 3600 + }, + { + "epoch": 3.656186612576065, + "grad_norm": 1.725184319305705, + "learning_rate": 5.000000003865863e-08, + "loss": 2.2283, + "step": 3605 + }, + { + "epoch": 3.661257606490872, + "grad_norm": 1.9023050674895976, + "learning_rate": 5.000000003295409e-08, + "loss": 2.21, + "step": 3610 + }, + { + "epoch": 3.6663286004056794, + "grad_norm": 1.8044353617499143, + "learning_rate": 5.0000000028025353e-08, + "loss": 2.2658, + "step": 3615 + }, + { + "epoch": 3.6713995943204867, + "grad_norm": 1.7560239895320502, + "learning_rate": 5.0000000023776127e-08, + "loss": 2.2558, + "step": 3620 + }, + { + "epoch": 3.6764705882352944, + "grad_norm": 1.9019670084185585, + "learning_rate": 5.00000000201209e-08, + "loss": 2.2154, + "step": 3625 + }, + { + "epoch": 3.6815415821501016, + "grad_norm": 1.835689830804529, + "learning_rate": 5.0000000016983875e-08, + "loss": 2.2586, + "step": 3630 + }, + { + "epoch": 3.686612576064909, + "grad_norm": 1.8589538257906977, + "learning_rate": 5.000000001429796e-08, + "loss": 2.2388, + "step": 3635 + }, + { + "epoch": 3.691683569979716, + "grad_norm": 1.8068715773945243, + "learning_rate": 5.000000001200391e-08, + "loss": 2.2571, + "step": 3640 + }, + { + "epoch": 3.6967545638945234, + "grad_norm": 1.7775448603509494, + "learning_rate": 5.0000000010049494e-08, + "loss": 2.2751, + "step": 3645 + }, + { + "epoch": 3.7018255578093306, + "grad_norm": 1.748064680879759, + "learning_rate": 5.0000000008388774e-08, + "loss": 2.2183, + "step": 3650 + }, + { + "epoch": 3.706896551724138, + "grad_norm": 1.752057568304335, + "learning_rate": 5.000000000698141e-08, + "loss": 2.2532, + "step": 3655 + }, + { + "epoch": 3.711967545638945, + "grad_norm": 1.7976874660325244, + "learning_rate": 5.000000000579206e-08, + "loss": 2.2447, + "step": 3660 + }, + { + "epoch": 3.7170385395537524, + "grad_norm": 1.8361658170177098, + "learning_rate": 5.000000000478986e-08, + "loss": 2.2274, + "step": 3665 + }, + { + "epoch": 3.7221095334685597, + "grad_norm": 1.7595086838837224, + "learning_rate": 5.0000000003947866e-08, + "loss": 2.2704, + "step": 3670 + }, + { + "epoch": 3.727180527383367, + "grad_norm": 1.7772692374868122, + "learning_rate": 5.0000000003242645e-08, + "loss": 2.2394, + "step": 3675 + }, + { + "epoch": 3.732251521298174, + "grad_norm": 1.7860835232171102, + "learning_rate": 5.000000000265387e-08, + "loss": 2.238, + "step": 3680 + }, + { + "epoch": 3.737322515212982, + "grad_norm": 1.7590689183822192, + "learning_rate": 5.000000000216394e-08, + "loss": 2.2764, + "step": 3685 + }, + { + "epoch": 3.742393509127789, + "grad_norm": 1.7659065260707336, + "learning_rate": 5.0000000001757664e-08, + "loss": 2.2459, + "step": 3690 + }, + { + "epoch": 3.7474645030425964, + "grad_norm": 1.8153822365083379, + "learning_rate": 5.0000000001421954e-08, + "loss": 2.2299, + "step": 3695 + }, + { + "epoch": 3.7525354969574036, + "grad_norm": 1.7704864144251407, + "learning_rate": 5.0000000001145583e-08, + "loss": 2.247, + "step": 3700 + }, + { + "epoch": 3.757606490872211, + "grad_norm": 1.7268180675977047, + "learning_rate": 5.000000000091894e-08, + "loss": 2.2483, + "step": 3705 + }, + { + "epoch": 3.762677484787018, + "grad_norm": 1.7808473093189052, + "learning_rate": 5.000000000073382e-08, + "loss": 2.2774, + "step": 3710 + }, + { + "epoch": 3.767748478701826, + "grad_norm": 1.7999930755140212, + "learning_rate": 5.0000000000583246e-08, + "loss": 2.2209, + "step": 3715 + }, + { + "epoch": 3.772819472616633, + "grad_norm": 1.7741565202241085, + "learning_rate": 5.0000000000461306e-08, + "loss": 2.2353, + "step": 3720 + }, + { + "epoch": 3.7778904665314403, + "grad_norm": 1.8046657930760326, + "learning_rate": 5.0000000000363e-08, + "loss": 2.2255, + "step": 3725 + }, + { + "epoch": 3.7829614604462476, + "grad_norm": 1.8038566902418574, + "learning_rate": 5.000000000028412e-08, + "loss": 2.2781, + "step": 3730 + }, + { + "epoch": 3.788032454361055, + "grad_norm": 1.7944584001789026, + "learning_rate": 5.0000000000221146e-08, + "loss": 2.272, + "step": 3735 + }, + { + "epoch": 3.793103448275862, + "grad_norm": 1.7491739462315397, + "learning_rate": 5.0000000000171125e-08, + "loss": 2.2293, + "step": 3740 + }, + { + "epoch": 3.7981744421906694, + "grad_norm": 1.7505007397811716, + "learning_rate": 5.000000000013161e-08, + "loss": 2.2373, + "step": 3745 + }, + { + "epoch": 3.8032454361054766, + "grad_norm": 1.8014769703402196, + "learning_rate": 5.000000000010057e-08, + "loss": 2.2552, + "step": 3750 + }, + { + "epoch": 3.808316430020284, + "grad_norm": 1.7608287864741985, + "learning_rate": 5.0000000000076337e-08, + "loss": 2.2277, + "step": 3755 + }, + { + "epoch": 3.813387423935091, + "grad_norm": 1.8323757058256038, + "learning_rate": 5.0000000000057536e-08, + "loss": 2.2341, + "step": 3760 + }, + { + "epoch": 3.8184584178498984, + "grad_norm": 1.7574657806555387, + "learning_rate": 5.000000000004304e-08, + "loss": 2.2223, + "step": 3765 + }, + { + "epoch": 3.8235294117647056, + "grad_norm": 1.7900689784727426, + "learning_rate": 5.000000000003194e-08, + "loss": 2.2445, + "step": 3770 + }, + { + "epoch": 3.8286004056795133, + "grad_norm": 1.7873969080046235, + "learning_rate": 5.000000000002351e-08, + "loss": 2.2692, + "step": 3775 + }, + { + "epoch": 3.8336713995943206, + "grad_norm": 1.7693343584107923, + "learning_rate": 5.000000000001716e-08, + "loss": 2.1982, + "step": 3780 + }, + { + "epoch": 3.838742393509128, + "grad_norm": 1.782049247288072, + "learning_rate": 5.00000000000124e-08, + "loss": 2.2417, + "step": 3785 + }, + { + "epoch": 3.843813387423935, + "grad_norm": 1.8357614780582354, + "learning_rate": 5.000000000000888e-08, + "loss": 2.2414, + "step": 3790 + }, + { + "epoch": 3.8488843813387423, + "grad_norm": 1.7593131764821546, + "learning_rate": 5.0000000000006284e-08, + "loss": 2.2721, + "step": 3795 + }, + { + "epoch": 3.8539553752535496, + "grad_norm": 1.8355045282767246, + "learning_rate": 5.0000000000004405e-08, + "loss": 2.2349, + "step": 3800 + }, + { + "epoch": 3.8539553752535496, + "eval_loss": 2.475205421447754, + "eval_runtime": 81.089, + "eval_samples_per_second": 86.399, + "eval_steps_per_second": 0.678, + "step": 3800 + }, + { + "epoch": 3.859026369168357, + "grad_norm": 1.7617334472370734, + "learning_rate": 5.000000000000305e-08, + "loss": 2.2796, + "step": 3805 + }, + { + "epoch": 3.8640973630831645, + "grad_norm": 1.7655616354078496, + "learning_rate": 5.000000000000208e-08, + "loss": 2.2904, + "step": 3810 + }, + { + "epoch": 3.869168356997972, + "grad_norm": 1.7499887502905194, + "learning_rate": 5.00000000000014e-08, + "loss": 2.2289, + "step": 3815 + }, + { + "epoch": 3.874239350912779, + "grad_norm": 1.7552158736441676, + "learning_rate": 5.000000000000093e-08, + "loss": 2.2524, + "step": 3820 + }, + { + "epoch": 3.8793103448275863, + "grad_norm": 1.779864718453615, + "learning_rate": 5.0000000000000607e-08, + "loss": 2.2557, + "step": 3825 + }, + { + "epoch": 3.8843813387423936, + "grad_norm": 1.8326086874257492, + "learning_rate": 5.000000000000039e-08, + "loss": 2.2642, + "step": 3830 + }, + { + "epoch": 3.889452332657201, + "grad_norm": 1.7709614684441606, + "learning_rate": 5.000000000000024e-08, + "loss": 2.2316, + "step": 3835 + }, + { + "epoch": 3.894523326572008, + "grad_norm": 1.8053802580849208, + "learning_rate": 5.000000000000015e-08, + "loss": 2.2568, + "step": 3840 + }, + { + "epoch": 3.8995943204868153, + "grad_norm": 1.7935470548184194, + "learning_rate": 5.0000000000000104e-08, + "loss": 2.2993, + "step": 3845 + }, + { + "epoch": 3.9046653144016226, + "grad_norm": 1.7497664491493299, + "learning_rate": 5.000000000000006e-08, + "loss": 2.1989, + "step": 3850 + }, + { + "epoch": 3.90973630831643, + "grad_norm": 1.754972418650299, + "learning_rate": 5.000000000000003e-08, + "loss": 2.2424, + "step": 3855 + }, + { + "epoch": 3.914807302231237, + "grad_norm": 1.7589479994346042, + "learning_rate": 5.000000000000002e-08, + "loss": 2.2632, + "step": 3860 + }, + { + "epoch": 3.9198782961460448, + "grad_norm": 1.7971848831669277, + "learning_rate": 5.000000000000001e-08, + "loss": 2.2336, + "step": 3865 + }, + { + "epoch": 3.924949290060852, + "grad_norm": 1.7639968737695348, + "learning_rate": 5.0000000000000004e-08, + "loss": 2.2296, + "step": 3870 + }, + { + "epoch": 3.9300202839756593, + "grad_norm": 1.72827012743299, + "learning_rate": 5e-08, + "loss": 2.2451, + "step": 3875 + }, + { + "epoch": 3.9350912778904665, + "grad_norm": 1.749153588059136, + "learning_rate": 5e-08, + "loss": 2.258, + "step": 3880 + }, + { + "epoch": 3.940162271805274, + "grad_norm": 1.753206456867822, + "learning_rate": 5e-08, + "loss": 2.2587, + "step": 3885 + }, + { + "epoch": 3.945233265720081, + "grad_norm": 1.7816747777928572, + "learning_rate": 5e-08, + "loss": 2.2532, + "step": 3890 + }, + { + "epoch": 3.9503042596348883, + "grad_norm": 1.7762615930524053, + "learning_rate": 5e-08, + "loss": 2.2331, + "step": 3895 + }, + { + "epoch": 3.955375253549696, + "grad_norm": 1.8039115341801395, + "learning_rate": 5e-08, + "loss": 2.2271, + "step": 3900 + }, + { + "epoch": 3.9604462474645032, + "grad_norm": 1.7530354888252304, + "learning_rate": 5e-08, + "loss": 2.2191, + "step": 3905 + }, + { + "epoch": 3.9655172413793105, + "grad_norm": 1.883699780217342, + "learning_rate": 5e-08, + "loss": 2.2059, + "step": 3910 + }, + { + "epoch": 3.9705882352941178, + "grad_norm": 1.7246634345468168, + "learning_rate": 5e-08, + "loss": 2.2482, + "step": 3915 + }, + { + "epoch": 3.975659229208925, + "grad_norm": 1.762677648630269, + "learning_rate": 5e-08, + "loss": 2.2521, + "step": 3920 + }, + { + "epoch": 3.9807302231237323, + "grad_norm": 1.786354894638501, + "learning_rate": 5e-08, + "loss": 2.2763, + "step": 3925 + }, + { + "epoch": 3.9858012170385395, + "grad_norm": 1.81100838850099, + "learning_rate": 5e-08, + "loss": 2.2326, + "step": 3930 + }, + { + "epoch": 3.9908722109533468, + "grad_norm": 1.8115971845880692, + "learning_rate": 5e-08, + "loss": 2.2409, + "step": 3935 + }, + { + "epoch": 3.995943204868154, + "grad_norm": 1.901268059775357, + "learning_rate": 5e-08, + "loss": 2.2217, + "step": 3940 + }, + { + "epoch": 4.0, + "step": 3944, + "total_flos": 411954472550400.0, + "train_loss": 2.318100369605283, + "train_runtime": 14372.236, + "train_samples_per_second": 17.546, + "train_steps_per_second": 0.274 + } + ], + "logging_steps": 5, + "max_steps": 3944, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 200, + "total_flos": 411954472550400.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}