diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4540 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 3188, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012547051442910915, + "grad_norm": 1.3249917794841757, + "learning_rate": 6.269592476489028e-08, + "loss": 0.5553, + "step": 1 + }, + { + "epoch": 0.006273525721455458, + "grad_norm": 1.3024175063683066, + "learning_rate": 3.134796238244514e-07, + "loss": 0.5542, + "step": 5 + }, + { + "epoch": 0.012547051442910916, + "grad_norm": 1.1325683997600668, + "learning_rate": 6.269592476489028e-07, + "loss": 0.5626, + "step": 10 + }, + { + "epoch": 0.018820577164366373, + "grad_norm": 1.1174840712777703, + "learning_rate": 9.404388714733543e-07, + "loss": 0.5642, + "step": 15 + }, + { + "epoch": 0.025094102885821833, + "grad_norm": 0.8002431481734196, + "learning_rate": 1.2539184952978056e-06, + "loss": 0.5294, + "step": 20 + }, + { + "epoch": 0.03136762860727729, + "grad_norm": 0.6333668642117685, + "learning_rate": 1.5673981191222572e-06, + "loss": 0.5079, + "step": 25 + }, + { + "epoch": 0.037641154328732745, + "grad_norm": 0.5771411644874956, + "learning_rate": 1.8808777429467086e-06, + "loss": 0.4655, + "step": 30 + }, + { + "epoch": 0.043914680050188205, + "grad_norm": 0.49532156556877105, + "learning_rate": 2.1943573667711602e-06, + "loss": 0.4592, + "step": 35 + }, + { + "epoch": 0.050188205771643665, + "grad_norm": 0.4975545815570396, + "learning_rate": 2.507836990595611e-06, + "loss": 0.4329, + "step": 40 + }, + { + "epoch": 0.056461731493099125, + "grad_norm": 0.4338284541758203, + "learning_rate": 2.8213166144200626e-06, + "loss": 0.4322, + "step": 45 + }, + { + "epoch": 0.06273525721455459, + "grad_norm": 0.40887016256028313, + "learning_rate": 3.1347962382445144e-06, + "loss": 0.426, + "step": 50 + }, + { + "epoch": 0.06900878293601004, + "grad_norm": 0.4045559102283436, + "learning_rate": 3.448275862068966e-06, + "loss": 0.4227, + "step": 55 + }, + { + "epoch": 0.07528230865746549, + "grad_norm": 0.3889912721227527, + "learning_rate": 3.7617554858934172e-06, + "loss": 0.4189, + "step": 60 + }, + { + "epoch": 0.08155583437892096, + "grad_norm": 0.3618436855347915, + "learning_rate": 4.075235109717869e-06, + "loss": 0.4177, + "step": 65 + }, + { + "epoch": 0.08782936010037641, + "grad_norm": 0.36459156314439295, + "learning_rate": 4.3887147335423205e-06, + "loss": 0.4049, + "step": 70 + }, + { + "epoch": 0.09410288582183186, + "grad_norm": 0.3488164206674813, + "learning_rate": 4.7021943573667714e-06, + "loss": 0.3961, + "step": 75 + }, + { + "epoch": 0.10037641154328733, + "grad_norm": 0.37238296726059605, + "learning_rate": 5.015673981191222e-06, + "loss": 0.3955, + "step": 80 + }, + { + "epoch": 0.10664993726474278, + "grad_norm": 0.3605666937163523, + "learning_rate": 5.329153605015674e-06, + "loss": 0.377, + "step": 85 + }, + { + "epoch": 0.11292346298619825, + "grad_norm": 0.35760877488985304, + "learning_rate": 5.642633228840125e-06, + "loss": 0.3915, + "step": 90 + }, + { + "epoch": 0.1191969887076537, + "grad_norm": 0.356976698911797, + "learning_rate": 5.956112852664577e-06, + "loss": 0.3938, + "step": 95 + }, + { + "epoch": 0.12547051442910917, + "grad_norm": 0.3620265985263758, + "learning_rate": 6.269592476489029e-06, + "loss": 0.404, + "step": 100 + }, + { + "epoch": 0.13174404015056462, + "grad_norm": 0.3710088599948379, + "learning_rate": 6.58307210031348e-06, + "loss": 0.3941, + "step": 105 + }, + { + "epoch": 0.13801756587202008, + "grad_norm": 0.3831687285367315, + "learning_rate": 6.896551724137932e-06, + "loss": 0.3859, + "step": 110 + }, + { + "epoch": 0.14429109159347553, + "grad_norm": 0.36915661135239697, + "learning_rate": 7.210031347962383e-06, + "loss": 0.3895, + "step": 115 + }, + { + "epoch": 0.15056461731493098, + "grad_norm": 0.3573608106603279, + "learning_rate": 7.5235109717868345e-06, + "loss": 0.3857, + "step": 120 + }, + { + "epoch": 0.15683814303638646, + "grad_norm": 0.3508976740749952, + "learning_rate": 7.836990595611285e-06, + "loss": 0.3915, + "step": 125 + }, + { + "epoch": 0.16311166875784192, + "grad_norm": 0.36876078420022057, + "learning_rate": 8.150470219435737e-06, + "loss": 0.3989, + "step": 130 + }, + { + "epoch": 0.16938519447929737, + "grad_norm": 0.38691192833572297, + "learning_rate": 8.463949843260189e-06, + "loss": 0.3874, + "step": 135 + }, + { + "epoch": 0.17565872020075282, + "grad_norm": 0.3735894147297392, + "learning_rate": 8.777429467084641e-06, + "loss": 0.385, + "step": 140 + }, + { + "epoch": 0.18193224592220827, + "grad_norm": 0.3710457112042887, + "learning_rate": 9.090909090909091e-06, + "loss": 0.3764, + "step": 145 + }, + { + "epoch": 0.18820577164366373, + "grad_norm": 0.3708916534975576, + "learning_rate": 9.404388714733543e-06, + "loss": 0.3808, + "step": 150 + }, + { + "epoch": 0.1944792973651192, + "grad_norm": 0.3552539505765215, + "learning_rate": 9.717868338557995e-06, + "loss": 0.4018, + "step": 155 + }, + { + "epoch": 0.20075282308657466, + "grad_norm": 0.3666385022952003, + "learning_rate": 1.0031347962382445e-05, + "loss": 0.3839, + "step": 160 + }, + { + "epoch": 0.20702634880803011, + "grad_norm": 0.3631078993279255, + "learning_rate": 1.0344827586206898e-05, + "loss": 0.3858, + "step": 165 + }, + { + "epoch": 0.21329987452948557, + "grad_norm": 0.39722641298153644, + "learning_rate": 1.0658307210031348e-05, + "loss": 0.3847, + "step": 170 + }, + { + "epoch": 0.21957340025094102, + "grad_norm": 0.3691443252830364, + "learning_rate": 1.09717868338558e-05, + "loss": 0.3826, + "step": 175 + }, + { + "epoch": 0.2258469259723965, + "grad_norm": 0.39755512136650856, + "learning_rate": 1.128526645768025e-05, + "loss": 0.3766, + "step": 180 + }, + { + "epoch": 0.23212045169385195, + "grad_norm": 0.3627344076525724, + "learning_rate": 1.1598746081504704e-05, + "loss": 0.409, + "step": 185 + }, + { + "epoch": 0.2383939774153074, + "grad_norm": 0.36755006735418844, + "learning_rate": 1.1912225705329154e-05, + "loss": 0.385, + "step": 190 + }, + { + "epoch": 0.24466750313676286, + "grad_norm": 0.3491529760844153, + "learning_rate": 1.2225705329153606e-05, + "loss": 0.3944, + "step": 195 + }, + { + "epoch": 0.25094102885821834, + "grad_norm": 0.38344322242421625, + "learning_rate": 1.2539184952978058e-05, + "loss": 0.3814, + "step": 200 + }, + { + "epoch": 0.2572145545796738, + "grad_norm": 0.3668326981157007, + "learning_rate": 1.285266457680251e-05, + "loss": 0.3769, + "step": 205 + }, + { + "epoch": 0.26348808030112925, + "grad_norm": 0.3606059535955065, + "learning_rate": 1.316614420062696e-05, + "loss": 0.3882, + "step": 210 + }, + { + "epoch": 0.2697616060225847, + "grad_norm": 0.3743059326798353, + "learning_rate": 1.3479623824451411e-05, + "loss": 0.3792, + "step": 215 + }, + { + "epoch": 0.27603513174404015, + "grad_norm": 0.3987264784021991, + "learning_rate": 1.3793103448275863e-05, + "loss": 0.3742, + "step": 220 + }, + { + "epoch": 0.2823086574654956, + "grad_norm": 0.41749115581528207, + "learning_rate": 1.4106583072100315e-05, + "loss": 0.3879, + "step": 225 + }, + { + "epoch": 0.28858218318695106, + "grad_norm": 0.36990941061813976, + "learning_rate": 1.4420062695924765e-05, + "loss": 0.3763, + "step": 230 + }, + { + "epoch": 0.2948557089084065, + "grad_norm": 0.350857226534173, + "learning_rate": 1.4733542319749217e-05, + "loss": 0.3936, + "step": 235 + }, + { + "epoch": 0.30112923462986196, + "grad_norm": 0.3699010810723354, + "learning_rate": 1.5047021943573669e-05, + "loss": 0.3827, + "step": 240 + }, + { + "epoch": 0.3074027603513174, + "grad_norm": 0.383417773018869, + "learning_rate": 1.536050156739812e-05, + "loss": 0.3753, + "step": 245 + }, + { + "epoch": 0.3136762860727729, + "grad_norm": 0.37418028131825143, + "learning_rate": 1.567398119122257e-05, + "loss": 0.393, + "step": 250 + }, + { + "epoch": 0.3199498117942284, + "grad_norm": 0.375158938790808, + "learning_rate": 1.598746081504702e-05, + "loss": 0.3803, + "step": 255 + }, + { + "epoch": 0.32622333751568383, + "grad_norm": 0.35545430825067814, + "learning_rate": 1.6300940438871475e-05, + "loss": 0.3858, + "step": 260 + }, + { + "epoch": 0.3324968632371393, + "grad_norm": 0.39047195961342007, + "learning_rate": 1.6614420062695925e-05, + "loss": 0.3956, + "step": 265 + }, + { + "epoch": 0.33877038895859474, + "grad_norm": 0.3763861614570858, + "learning_rate": 1.6927899686520378e-05, + "loss": 0.3863, + "step": 270 + }, + { + "epoch": 0.3450439146800502, + "grad_norm": 0.33810866516589266, + "learning_rate": 1.7241379310344828e-05, + "loss": 0.3861, + "step": 275 + }, + { + "epoch": 0.35131744040150564, + "grad_norm": 0.3577598772376036, + "learning_rate": 1.7554858934169282e-05, + "loss": 0.3847, + "step": 280 + }, + { + "epoch": 0.3575909661229611, + "grad_norm": 0.39952196485063435, + "learning_rate": 1.7868338557993732e-05, + "loss": 0.3803, + "step": 285 + }, + { + "epoch": 0.36386449184441655, + "grad_norm": 0.3560924326294842, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.3823, + "step": 290 + }, + { + "epoch": 0.370138017565872, + "grad_norm": 0.36592070219456535, + "learning_rate": 1.8495297805642636e-05, + "loss": 0.3877, + "step": 295 + }, + { + "epoch": 0.37641154328732745, + "grad_norm": 0.36996538529023604, + "learning_rate": 1.8808777429467086e-05, + "loss": 0.3816, + "step": 300 + }, + { + "epoch": 0.38268506900878296, + "grad_norm": 0.3944353625018826, + "learning_rate": 1.9122257053291536e-05, + "loss": 0.3885, + "step": 305 + }, + { + "epoch": 0.3889585947302384, + "grad_norm": 0.3605090389245764, + "learning_rate": 1.943573667711599e-05, + "loss": 0.3746, + "step": 310 + }, + { + "epoch": 0.39523212045169387, + "grad_norm": 0.3489216159243111, + "learning_rate": 1.9749216300940443e-05, + "loss": 0.3807, + "step": 315 + }, + { + "epoch": 0.4015056461731493, + "grad_norm": 0.3902867659960602, + "learning_rate": 1.9999994004731887e-05, + "loss": 0.394, + "step": 320 + }, + { + "epoch": 0.4077791718946048, + "grad_norm": 0.3833754079678544, + "learning_rate": 1.999978417110275e-05, + "loss": 0.3657, + "step": 325 + }, + { + "epoch": 0.41405269761606023, + "grad_norm": 0.37800625764548235, + "learning_rate": 1.9999274581256576e-05, + "loss": 0.3822, + "step": 330 + }, + { + "epoch": 0.4203262233375157, + "grad_norm": 0.3548880184290815, + "learning_rate": 1.999846525046898e-05, + "loss": 0.3816, + "step": 335 + }, + { + "epoch": 0.42659974905897113, + "grad_norm": 0.37094441921423654, + "learning_rate": 1.9997356203000667e-05, + "loss": 0.3853, + "step": 340 + }, + { + "epoch": 0.4328732747804266, + "grad_norm": 0.3594535908155001, + "learning_rate": 1.9995947472096752e-05, + "loss": 0.3703, + "step": 345 + }, + { + "epoch": 0.43914680050188204, + "grad_norm": 0.3492383976696482, + "learning_rate": 1.9994239099985727e-05, + "loss": 0.3858, + "step": 350 + }, + { + "epoch": 0.4454203262233375, + "grad_norm": 0.3545945901168298, + "learning_rate": 1.9992231137878213e-05, + "loss": 0.3723, + "step": 355 + }, + { + "epoch": 0.451693851944793, + "grad_norm": 0.36080708632510694, + "learning_rate": 1.9989923645965418e-05, + "loss": 0.3952, + "step": 360 + }, + { + "epoch": 0.45796737766624845, + "grad_norm": 0.5493517294545232, + "learning_rate": 1.998731669341735e-05, + "loss": 0.3723, + "step": 365 + }, + { + "epoch": 0.4642409033877039, + "grad_norm": 0.34501637664109114, + "learning_rate": 1.998441035838071e-05, + "loss": 0.3787, + "step": 370 + }, + { + "epoch": 0.47051442910915936, + "grad_norm": 0.36834831423716424, + "learning_rate": 1.9981204727976577e-05, + "loss": 0.3871, + "step": 375 + }, + { + "epoch": 0.4767879548306148, + "grad_norm": 0.359064185739471, + "learning_rate": 1.9977699898297794e-05, + "loss": 0.4078, + "step": 380 + }, + { + "epoch": 0.48306148055207027, + "grad_norm": 0.37676963093518556, + "learning_rate": 1.997389597440608e-05, + "loss": 0.3997, + "step": 385 + }, + { + "epoch": 0.4893350062735257, + "grad_norm": 0.35164360932787275, + "learning_rate": 1.9969793070328872e-05, + "loss": 0.3706, + "step": 390 + }, + { + "epoch": 0.49560853199498117, + "grad_norm": 0.3499090374786232, + "learning_rate": 1.996539130905593e-05, + "loss": 0.3931, + "step": 395 + }, + { + "epoch": 0.5018820577164367, + "grad_norm": 0.35516374008901963, + "learning_rate": 1.9960690822535632e-05, + "loss": 0.3917, + "step": 400 + }, + { + "epoch": 0.5081555834378921, + "grad_norm": 0.3859915521260531, + "learning_rate": 1.995569175167102e-05, + "loss": 0.3862, + "step": 405 + }, + { + "epoch": 0.5144291091593476, + "grad_norm": 0.3700726542619639, + "learning_rate": 1.9950394246315594e-05, + "loss": 0.3977, + "step": 410 + }, + { + "epoch": 0.520702634880803, + "grad_norm": 0.36807219365873434, + "learning_rate": 1.994479846526879e-05, + "loss": 0.391, + "step": 415 + }, + { + "epoch": 0.5269761606022585, + "grad_norm": 0.34834015226308745, + "learning_rate": 1.9938904576271247e-05, + "loss": 0.3947, + "step": 420 + }, + { + "epoch": 0.533249686323714, + "grad_norm": 0.3475144346489988, + "learning_rate": 1.9932712755999768e-05, + "loss": 0.3797, + "step": 425 + }, + { + "epoch": 0.5395232120451694, + "grad_norm": 0.3471973955561176, + "learning_rate": 1.9926223190062015e-05, + "loss": 0.3777, + "step": 430 + }, + { + "epoch": 0.5457967377666249, + "grad_norm": 0.3702513132489804, + "learning_rate": 1.9919436072990967e-05, + "loss": 0.4113, + "step": 435 + }, + { + "epoch": 0.5520702634880803, + "grad_norm": 0.3480545734033953, + "learning_rate": 1.9912351608239064e-05, + "loss": 0.3717, + "step": 440 + }, + { + "epoch": 0.5583437892095358, + "grad_norm": 0.3705010339965962, + "learning_rate": 1.9904970008172128e-05, + "loss": 0.3903, + "step": 445 + }, + { + "epoch": 0.5646173149309912, + "grad_norm": 0.3545524157937066, + "learning_rate": 1.989729149406298e-05, + "loss": 0.3971, + "step": 450 + }, + { + "epoch": 0.5708908406524467, + "grad_norm": 0.34489507859345525, + "learning_rate": 1.988931629608483e-05, + "loss": 0.3715, + "step": 455 + }, + { + "epoch": 0.5771643663739021, + "grad_norm": 0.3419133499625096, + "learning_rate": 1.9881044653304347e-05, + "loss": 0.3722, + "step": 460 + }, + { + "epoch": 0.5834378920953576, + "grad_norm": 0.3544487799327636, + "learning_rate": 1.9872476813674527e-05, + "loss": 0.3854, + "step": 465 + }, + { + "epoch": 0.589711417816813, + "grad_norm": 0.3526030038053533, + "learning_rate": 1.9863613034027224e-05, + "loss": 0.3854, + "step": 470 + }, + { + "epoch": 0.5959849435382685, + "grad_norm": 0.35801287965649553, + "learning_rate": 1.9854453580065485e-05, + "loss": 0.3773, + "step": 475 + }, + { + "epoch": 0.6022584692597239, + "grad_norm": 0.33687873507306343, + "learning_rate": 1.984499872635556e-05, + "loss": 0.3839, + "step": 480 + }, + { + "epoch": 0.6085319949811794, + "grad_norm": 0.34289605388996236, + "learning_rate": 1.983524875631868e-05, + "loss": 0.3899, + "step": 485 + }, + { + "epoch": 0.6148055207026348, + "grad_norm": 0.35193056323984084, + "learning_rate": 1.9825203962222573e-05, + "loss": 0.3736, + "step": 490 + }, + { + "epoch": 0.6210790464240903, + "grad_norm": 0.3603036239481184, + "learning_rate": 1.9814864645172684e-05, + "loss": 0.3927, + "step": 495 + }, + { + "epoch": 0.6273525721455459, + "grad_norm": 0.3610916261894253, + "learning_rate": 1.9804231115103155e-05, + "loss": 0.3729, + "step": 500 + }, + { + "epoch": 0.6336260978670013, + "grad_norm": 0.33695606360007346, + "learning_rate": 1.9793303690767543e-05, + "loss": 0.3773, + "step": 505 + }, + { + "epoch": 0.6398996235884568, + "grad_norm": 0.37708359246844253, + "learning_rate": 1.9782082699729255e-05, + "loss": 0.3915, + "step": 510 + }, + { + "epoch": 0.6461731493099122, + "grad_norm": 0.37063002060189804, + "learning_rate": 1.9770568478351736e-05, + "loss": 0.384, + "step": 515 + }, + { + "epoch": 0.6524466750313677, + "grad_norm": 0.347781428498976, + "learning_rate": 1.9758761371788376e-05, + "loss": 0.3912, + "step": 520 + }, + { + "epoch": 0.6587202007528231, + "grad_norm": 0.37037259025304425, + "learning_rate": 1.974666173397218e-05, + "loss": 0.3795, + "step": 525 + }, + { + "epoch": 0.6649937264742786, + "grad_norm": 0.35088852677024873, + "learning_rate": 1.9734269927605134e-05, + "loss": 0.3803, + "step": 530 + }, + { + "epoch": 0.671267252195734, + "grad_norm": 0.3569377164202312, + "learning_rate": 1.972158632414736e-05, + "loss": 0.3775, + "step": 535 + }, + { + "epoch": 0.6775407779171895, + "grad_norm": 0.3394929535259636, + "learning_rate": 1.970861130380596e-05, + "loss": 0.3741, + "step": 540 + }, + { + "epoch": 0.6838143036386449, + "grad_norm": 0.3420677685134694, + "learning_rate": 1.9695345255523634e-05, + "loss": 0.3878, + "step": 545 + }, + { + "epoch": 0.6900878293601004, + "grad_norm": 0.3531313507208391, + "learning_rate": 1.9681788576967004e-05, + "loss": 0.3787, + "step": 550 + }, + { + "epoch": 0.6963613550815558, + "grad_norm": 0.338822289408876, + "learning_rate": 1.9667941674514712e-05, + "loss": 0.3829, + "step": 555 + }, + { + "epoch": 0.7026348808030113, + "grad_norm": 0.36931664322135854, + "learning_rate": 1.9653804963245226e-05, + "loss": 0.3893, + "step": 560 + }, + { + "epoch": 0.7089084065244667, + "grad_norm": 0.35793185710348313, + "learning_rate": 1.9639378866924405e-05, + "loss": 0.3816, + "step": 565 + }, + { + "epoch": 0.7151819322459222, + "grad_norm": 0.36642001426471366, + "learning_rate": 1.9624663817992783e-05, + "loss": 0.3825, + "step": 570 + }, + { + "epoch": 0.7214554579673776, + "grad_norm": 0.36184277756969885, + "learning_rate": 1.960966025755262e-05, + "loss": 0.3918, + "step": 575 + }, + { + "epoch": 0.7277289836888331, + "grad_norm": 0.34191036136824954, + "learning_rate": 1.9594368635354676e-05, + "loss": 0.3812, + "step": 580 + }, + { + "epoch": 0.7340025094102886, + "grad_norm": 0.4053922705018229, + "learning_rate": 1.9578789409784727e-05, + "loss": 0.3796, + "step": 585 + }, + { + "epoch": 0.740276035131744, + "grad_norm": 0.3254681140813354, + "learning_rate": 1.9562923047849828e-05, + "loss": 0.381, + "step": 590 + }, + { + "epoch": 0.7465495608531995, + "grad_norm": 0.3655079090417533, + "learning_rate": 1.9546770025164304e-05, + "loss": 0.3808, + "step": 595 + }, + { + "epoch": 0.7528230865746549, + "grad_norm": 0.3309145310004598, + "learning_rate": 1.95303308259355e-05, + "loss": 0.3924, + "step": 600 + }, + { + "epoch": 0.7590966122961104, + "grad_norm": 0.34826624551002194, + "learning_rate": 1.9513605942949277e-05, + "loss": 0.3752, + "step": 605 + }, + { + "epoch": 0.7653701380175659, + "grad_norm": 0.3463792668326179, + "learning_rate": 1.9496595877555212e-05, + "loss": 0.3922, + "step": 610 + }, + { + "epoch": 0.7716436637390214, + "grad_norm": 0.3409526655488519, + "learning_rate": 1.94793011396516e-05, + "loss": 0.3868, + "step": 615 + }, + { + "epoch": 0.7779171894604768, + "grad_norm": 0.34614309257239506, + "learning_rate": 1.946172224767015e-05, + "loss": 0.3816, + "step": 620 + }, + { + "epoch": 0.7841907151819323, + "grad_norm": 0.36738342533490437, + "learning_rate": 1.9443859728560458e-05, + "loss": 0.3809, + "step": 625 + }, + { + "epoch": 0.7904642409033877, + "grad_norm": 0.5642673777402893, + "learning_rate": 1.9425714117774183e-05, + "loss": 0.3678, + "step": 630 + }, + { + "epoch": 0.7967377666248432, + "grad_norm": 0.3734297840619315, + "learning_rate": 1.940728595924904e-05, + "loss": 0.3856, + "step": 635 + }, + { + "epoch": 0.8030112923462986, + "grad_norm": 0.34992964702526325, + "learning_rate": 1.9388575805392453e-05, + "loss": 0.3937, + "step": 640 + }, + { + "epoch": 0.8092848180677541, + "grad_norm": 0.34532043284186725, + "learning_rate": 1.9369584217065025e-05, + "loss": 0.3718, + "step": 645 + }, + { + "epoch": 0.8155583437892095, + "grad_norm": 0.3390467838397306, + "learning_rate": 1.935031176356371e-05, + "loss": 0.3829, + "step": 650 + }, + { + "epoch": 0.821831869510665, + "grad_norm": 0.3522889151490647, + "learning_rate": 1.933075902260475e-05, + "loss": 0.3876, + "step": 655 + }, + { + "epoch": 0.8281053952321205, + "grad_norm": 0.38123310023732215, + "learning_rate": 1.9310926580306365e-05, + "loss": 0.3707, + "step": 660 + }, + { + "epoch": 0.8343789209535759, + "grad_norm": 0.37101425687357464, + "learning_rate": 1.929081503117117e-05, + "loss": 0.3777, + "step": 665 + }, + { + "epoch": 0.8406524466750314, + "grad_norm": 0.35327596368808917, + "learning_rate": 1.9270424978068368e-05, + "loss": 0.372, + "step": 670 + }, + { + "epoch": 0.8469259723964868, + "grad_norm": 0.48025305553666425, + "learning_rate": 1.9249757032215674e-05, + "loss": 0.3719, + "step": 675 + }, + { + "epoch": 0.8531994981179423, + "grad_norm": 1.3954653921865106, + "learning_rate": 1.9228811813160972e-05, + "loss": 0.3774, + "step": 680 + }, + { + "epoch": 0.8594730238393977, + "grad_norm": 0.3729066926513616, + "learning_rate": 1.920758994876379e-05, + "loss": 0.3845, + "step": 685 + }, + { + "epoch": 0.8657465495608532, + "grad_norm": 0.3368507345057043, + "learning_rate": 1.918609207517643e-05, + "loss": 0.3674, + "step": 690 + }, + { + "epoch": 0.8720200752823086, + "grad_norm": 0.3381829661628343, + "learning_rate": 1.9164318836824928e-05, + "loss": 0.3895, + "step": 695 + }, + { + "epoch": 0.8782936010037641, + "grad_norm": 0.35406742056379137, + "learning_rate": 1.9142270886389726e-05, + "loss": 0.3888, + "step": 700 + }, + { + "epoch": 0.8845671267252195, + "grad_norm": 0.35267605322252205, + "learning_rate": 1.911994888478611e-05, + "loss": 0.416, + "step": 705 + }, + { + "epoch": 0.890840652446675, + "grad_norm": 0.36994927588198245, + "learning_rate": 1.9097353501144403e-05, + "loss": 0.3881, + "step": 710 + }, + { + "epoch": 0.8971141781681304, + "grad_norm": 0.344878616221491, + "learning_rate": 1.9074485412789886e-05, + "loss": 0.3916, + "step": 715 + }, + { + "epoch": 0.903387703889586, + "grad_norm": 0.3618364967642679, + "learning_rate": 1.9051345305222527e-05, + "loss": 0.3877, + "step": 720 + }, + { + "epoch": 0.9096612296110415, + "grad_norm": 0.3348128268620291, + "learning_rate": 1.9027933872096403e-05, + "loss": 0.3758, + "step": 725 + }, + { + "epoch": 0.9159347553324969, + "grad_norm": 0.34960639615133043, + "learning_rate": 1.900425181519893e-05, + "loss": 0.3842, + "step": 730 + }, + { + "epoch": 0.9222082810539524, + "grad_norm": 0.36222991570969465, + "learning_rate": 1.8980299844429804e-05, + "loss": 0.379, + "step": 735 + }, + { + "epoch": 0.9284818067754078, + "grad_norm": 0.3499002387438987, + "learning_rate": 1.8956078677779738e-05, + "loss": 0.3715, + "step": 740 + }, + { + "epoch": 0.9347553324968633, + "grad_norm": 0.33211819623666405, + "learning_rate": 1.8931589041308926e-05, + "loss": 0.3736, + "step": 745 + }, + { + "epoch": 0.9410288582183187, + "grad_norm": 0.3495894811325005, + "learning_rate": 1.8906831669125293e-05, + "loss": 0.3778, + "step": 750 + }, + { + "epoch": 0.9473023839397742, + "grad_norm": 0.3315684966056081, + "learning_rate": 1.8881807303362484e-05, + "loss": 0.3776, + "step": 755 + }, + { + "epoch": 0.9535759096612296, + "grad_norm": 0.3409622621170755, + "learning_rate": 1.885651669415761e-05, + "loss": 0.3873, + "step": 760 + }, + { + "epoch": 0.9598494353826851, + "grad_norm": 0.34785709596472625, + "learning_rate": 1.883096059962876e-05, + "loss": 0.3861, + "step": 765 + }, + { + "epoch": 0.9661229611041405, + "grad_norm": 0.3477615933807258, + "learning_rate": 1.8805139785852297e-05, + "loss": 0.3783, + "step": 770 + }, + { + "epoch": 0.972396486825596, + "grad_norm": 0.343324271874741, + "learning_rate": 1.877905502683987e-05, + "loss": 0.3746, + "step": 775 + }, + { + "epoch": 0.9786700125470514, + "grad_norm": 0.47223667640674677, + "learning_rate": 1.8752707104515223e-05, + "loss": 0.3793, + "step": 780 + }, + { + "epoch": 0.9849435382685069, + "grad_norm": 0.3234411234996811, + "learning_rate": 1.8726096808690757e-05, + "loss": 0.3676, + "step": 785 + }, + { + "epoch": 0.9912170639899623, + "grad_norm": 0.3574211042049705, + "learning_rate": 1.8699224937043846e-05, + "loss": 0.3735, + "step": 790 + }, + { + "epoch": 0.9974905897114178, + "grad_norm": 0.3811194221638418, + "learning_rate": 1.8672092295092935e-05, + "loss": 0.3939, + "step": 795 + }, + { + "epoch": 1.0, + "eval_loss": 0.3792824149131775, + "eval_runtime": 2.8922, + "eval_samples_per_second": 12.101, + "eval_steps_per_second": 0.692, + "step": 797 + }, + { + "epoch": 1.0037641154328734, + "grad_norm": 0.3353538294927158, + "learning_rate": 1.8644699696173393e-05, + "loss": 0.3241, + "step": 800 + }, + { + "epoch": 1.0100376411543288, + "grad_norm": 0.3828496951284579, + "learning_rate": 1.8617047961413122e-05, + "loss": 0.2601, + "step": 805 + }, + { + "epoch": 1.0163111668757843, + "grad_norm": 0.33996880676819174, + "learning_rate": 1.858913791970795e-05, + "loss": 0.2689, + "step": 810 + }, + { + "epoch": 1.0225846925972397, + "grad_norm": 0.3787602779680391, + "learning_rate": 1.8560970407696787e-05, + "loss": 0.2686, + "step": 815 + }, + { + "epoch": 1.0288582183186952, + "grad_norm": 0.5196175862012924, + "learning_rate": 1.8532546269736546e-05, + "loss": 0.2747, + "step": 820 + }, + { + "epoch": 1.0351317440401506, + "grad_norm": 0.37332309268508856, + "learning_rate": 1.850386635787682e-05, + "loss": 0.2627, + "step": 825 + }, + { + "epoch": 1.041405269761606, + "grad_norm": 0.3471559800133524, + "learning_rate": 1.847493153183435e-05, + "loss": 0.2787, + "step": 830 + }, + { + "epoch": 1.0476787954830615, + "grad_norm": 0.3558613761958236, + "learning_rate": 1.844574265896726e-05, + "loss": 0.268, + "step": 835 + }, + { + "epoch": 1.053952321204517, + "grad_norm": 0.3816351874674652, + "learning_rate": 1.8416300614249044e-05, + "loss": 0.2668, + "step": 840 + }, + { + "epoch": 1.0602258469259724, + "grad_norm": 0.34373084787867114, + "learning_rate": 1.8386606280242342e-05, + "loss": 0.2648, + "step": 845 + }, + { + "epoch": 1.066499372647428, + "grad_norm": 0.34772061503647006, + "learning_rate": 1.8356660547072493e-05, + "loss": 0.2664, + "step": 850 + }, + { + "epoch": 1.0727728983688833, + "grad_norm": 0.3575051030279196, + "learning_rate": 1.8326464312400835e-05, + "loss": 0.2741, + "step": 855 + }, + { + "epoch": 1.0790464240903388, + "grad_norm": 0.3496509974703566, + "learning_rate": 1.8296018481397818e-05, + "loss": 0.2583, + "step": 860 + }, + { + "epoch": 1.0853199498117942, + "grad_norm": 0.3724719074552806, + "learning_rate": 1.826532396671585e-05, + "loss": 0.2694, + "step": 865 + }, + { + "epoch": 1.0915934755332497, + "grad_norm": 0.3727323786558387, + "learning_rate": 1.8234381688461943e-05, + "loss": 0.2772, + "step": 870 + }, + { + "epoch": 1.0978670012547052, + "grad_norm": 0.3335956714818362, + "learning_rate": 1.8203192574170154e-05, + "loss": 0.2563, + "step": 875 + }, + { + "epoch": 1.1041405269761606, + "grad_norm": 0.34610791981119976, + "learning_rate": 1.8171757558773747e-05, + "loss": 0.254, + "step": 880 + }, + { + "epoch": 1.110414052697616, + "grad_norm": 0.3647153604437883, + "learning_rate": 1.8140077584577193e-05, + "loss": 0.269, + "step": 885 + }, + { + "epoch": 1.1166875784190715, + "grad_norm": 0.3620894127576232, + "learning_rate": 1.81081536012279e-05, + "loss": 0.2642, + "step": 890 + }, + { + "epoch": 1.122961104140527, + "grad_norm": 0.3368502269678017, + "learning_rate": 1.8075986565687785e-05, + "loss": 0.2621, + "step": 895 + }, + { + "epoch": 1.1292346298619824, + "grad_norm": 0.3487017808626127, + "learning_rate": 1.804357744220454e-05, + "loss": 0.2741, + "step": 900 + }, + { + "epoch": 1.1355081555834379, + "grad_norm": 0.346569504128483, + "learning_rate": 1.8010927202282758e-05, + "loss": 0.2522, + "step": 905 + }, + { + "epoch": 1.1417816813048933, + "grad_norm": 0.3448349298442695, + "learning_rate": 1.7978036824654806e-05, + "loss": 0.2539, + "step": 910 + }, + { + "epoch": 1.1480552070263488, + "grad_norm": 0.35055993507582917, + "learning_rate": 1.7944907295251478e-05, + "loss": 0.2716, + "step": 915 + }, + { + "epoch": 1.1543287327478042, + "grad_norm": 0.3483298654120021, + "learning_rate": 1.7911539607172447e-05, + "loss": 0.2585, + "step": 920 + }, + { + "epoch": 1.1606022584692597, + "grad_norm": 0.3452159026039351, + "learning_rate": 1.78779347606565e-05, + "loss": 0.2598, + "step": 925 + }, + { + "epoch": 1.1668757841907151, + "grad_norm": 0.3434243523852655, + "learning_rate": 1.7844093763051543e-05, + "loss": 0.2681, + "step": 930 + }, + { + "epoch": 1.1731493099121706, + "grad_norm": 0.3459951917334934, + "learning_rate": 1.7810017628784416e-05, + "loss": 0.2567, + "step": 935 + }, + { + "epoch": 1.179422835633626, + "grad_norm": 0.31955125292674175, + "learning_rate": 1.777570737933047e-05, + "loss": 0.2673, + "step": 940 + }, + { + "epoch": 1.1856963613550815, + "grad_norm": 0.33191052346485606, + "learning_rate": 1.7741164043182967e-05, + "loss": 0.258, + "step": 945 + }, + { + "epoch": 1.191969887076537, + "grad_norm": 0.3745301206766351, + "learning_rate": 1.7706388655822223e-05, + "loss": 0.2671, + "step": 950 + }, + { + "epoch": 1.1982434127979924, + "grad_norm": 0.3435343557827377, + "learning_rate": 1.7671382259684603e-05, + "loss": 0.2711, + "step": 955 + }, + { + "epoch": 1.2045169385194479, + "grad_norm": 0.3522241286327592, + "learning_rate": 1.7636145904131233e-05, + "loss": 0.2715, + "step": 960 + }, + { + "epoch": 1.2107904642409033, + "grad_norm": 0.34428731270476376, + "learning_rate": 1.7600680645416583e-05, + "loss": 0.2655, + "step": 965 + }, + { + "epoch": 1.2170639899623588, + "grad_norm": 0.3534701552438621, + "learning_rate": 1.7564987546656778e-05, + "loss": 0.2601, + "step": 970 + }, + { + "epoch": 1.2233375156838142, + "grad_norm": 0.34480580113424486, + "learning_rate": 1.7529067677797727e-05, + "loss": 0.2581, + "step": 975 + }, + { + "epoch": 1.2296110414052697, + "grad_norm": 0.3741756431765012, + "learning_rate": 1.7492922115583077e-05, + "loss": 0.2701, + "step": 980 + }, + { + "epoch": 1.2358845671267251, + "grad_norm": 0.3555549239846533, + "learning_rate": 1.745655194352191e-05, + "loss": 0.2716, + "step": 985 + }, + { + "epoch": 1.2421580928481806, + "grad_norm": 0.3327711036535926, + "learning_rate": 1.7419958251856276e-05, + "loss": 0.2577, + "step": 990 + }, + { + "epoch": 1.248431618569636, + "grad_norm": 0.3661107928778811, + "learning_rate": 1.738314213752851e-05, + "loss": 0.2649, + "step": 995 + }, + { + "epoch": 1.2547051442910915, + "grad_norm": 0.3407939311803759, + "learning_rate": 1.7346104704148343e-05, + "loss": 0.2506, + "step": 1000 + }, + { + "epoch": 1.260978670012547, + "grad_norm": 0.33612826697533044, + "learning_rate": 1.730884706195983e-05, + "loss": 0.2645, + "step": 1005 + }, + { + "epoch": 1.2672521957340024, + "grad_norm": 0.34109027432250294, + "learning_rate": 1.727137032780807e-05, + "loss": 0.2687, + "step": 1010 + }, + { + "epoch": 1.2735257214554578, + "grad_norm": 0.3424345987544216, + "learning_rate": 1.7233675625105703e-05, + "loss": 0.2659, + "step": 1015 + }, + { + "epoch": 1.2797992471769133, + "grad_norm": 0.3670304260632612, + "learning_rate": 1.7195764083799277e-05, + "loss": 0.2785, + "step": 1020 + }, + { + "epoch": 1.286072772898369, + "grad_norm": 0.35224545856472056, + "learning_rate": 1.7157636840335334e-05, + "loss": 0.2736, + "step": 1025 + }, + { + "epoch": 1.2923462986198244, + "grad_norm": 0.8557471051222927, + "learning_rate": 1.7119295037626366e-05, + "loss": 0.2598, + "step": 1030 + }, + { + "epoch": 1.2986198243412799, + "grad_norm": 0.3367737302829996, + "learning_rate": 1.708073982501656e-05, + "loss": 0.2612, + "step": 1035 + }, + { + "epoch": 1.3048933500627353, + "grad_norm": 0.34877985553107826, + "learning_rate": 1.704197235824732e-05, + "loss": 0.2726, + "step": 1040 + }, + { + "epoch": 1.3111668757841908, + "grad_norm": 0.34440291584591926, + "learning_rate": 1.7002993799422652e-05, + "loss": 0.2618, + "step": 1045 + }, + { + "epoch": 1.3174404015056462, + "grad_norm": 0.33534085525712676, + "learning_rate": 1.6963805316974303e-05, + "loss": 0.2609, + "step": 1050 + }, + { + "epoch": 1.3237139272271017, + "grad_norm": 0.3662577678235449, + "learning_rate": 1.6924408085626756e-05, + "loss": 0.2571, + "step": 1055 + }, + { + "epoch": 1.3299874529485571, + "grad_norm": 0.3516887590807691, + "learning_rate": 1.6884803286362e-05, + "loss": 0.2549, + "step": 1060 + }, + { + "epoch": 1.3362609786700126, + "grad_norm": 0.35333946604034366, + "learning_rate": 1.684499210638414e-05, + "loss": 0.264, + "step": 1065 + }, + { + "epoch": 1.342534504391468, + "grad_norm": 0.34307627027241056, + "learning_rate": 1.6804975739083803e-05, + "loss": 0.2503, + "step": 1070 + }, + { + "epoch": 1.3488080301129235, + "grad_norm": 0.35493913250542247, + "learning_rate": 1.6764755384002372e-05, + "loss": 0.2759, + "step": 1075 + }, + { + "epoch": 1.355081555834379, + "grad_norm": 0.3540415537021871, + "learning_rate": 1.6724332246796008e-05, + "loss": 0.2697, + "step": 1080 + }, + { + "epoch": 1.3613550815558344, + "grad_norm": 0.3318252959324338, + "learning_rate": 1.6683707539199538e-05, + "loss": 0.2669, + "step": 1085 + }, + { + "epoch": 1.3676286072772899, + "grad_norm": 0.33137640825273385, + "learning_rate": 1.6642882478990112e-05, + "loss": 0.2485, + "step": 1090 + }, + { + "epoch": 1.3739021329987453, + "grad_norm": 0.34017928191383223, + "learning_rate": 1.66018582899507e-05, + "loss": 0.2784, + "step": 1095 + }, + { + "epoch": 1.3801756587202008, + "grad_norm": 0.34200366455572445, + "learning_rate": 1.6560636201833423e-05, + "loss": 0.2673, + "step": 1100 + }, + { + "epoch": 1.3864491844416562, + "grad_norm": 0.35384418379159516, + "learning_rate": 1.6519217450322657e-05, + "loss": 0.2713, + "step": 1105 + }, + { + "epoch": 1.3927227101631117, + "grad_norm": 0.33307624664228463, + "learning_rate": 1.6477603276998037e-05, + "loss": 0.2742, + "step": 1110 + }, + { + "epoch": 1.3989962358845671, + "grad_norm": 0.356765357000532, + "learning_rate": 1.64357949292972e-05, + "loss": 0.2689, + "step": 1115 + }, + { + "epoch": 1.4052697616060226, + "grad_norm": 0.3252207580977864, + "learning_rate": 1.6393793660478406e-05, + "loss": 0.2506, + "step": 1120 + }, + { + "epoch": 1.411543287327478, + "grad_norm": 0.36204092721369197, + "learning_rate": 1.6351600729582977e-05, + "loss": 0.2636, + "step": 1125 + }, + { + "epoch": 1.4178168130489335, + "grad_norm": 0.3335102584738542, + "learning_rate": 1.630921740139755e-05, + "loss": 0.2616, + "step": 1130 + }, + { + "epoch": 1.424090338770389, + "grad_norm": 0.34898006017841243, + "learning_rate": 1.6266644946416148e-05, + "loss": 0.2781, + "step": 1135 + }, + { + "epoch": 1.4303638644918444, + "grad_norm": 0.3487474238464629, + "learning_rate": 1.622388464080213e-05, + "loss": 0.2773, + "step": 1140 + }, + { + "epoch": 1.4366373902132998, + "grad_norm": 0.35273437538491903, + "learning_rate": 1.61809377663499e-05, + "loss": 0.2682, + "step": 1145 + }, + { + "epoch": 1.4429109159347553, + "grad_norm": 0.36466350484277693, + "learning_rate": 1.6137805610446508e-05, + "loss": 0.2685, + "step": 1150 + }, + { + "epoch": 1.4491844416562107, + "grad_norm": 0.35090011471222154, + "learning_rate": 1.609448946603304e-05, + "loss": 0.2657, + "step": 1155 + }, + { + "epoch": 1.4554579673776662, + "grad_norm": 0.36713922458350784, + "learning_rate": 1.6050990631565894e-05, + "loss": 0.276, + "step": 1160 + }, + { + "epoch": 1.4617314930991216, + "grad_norm": 0.3552048435134842, + "learning_rate": 1.6007310410977807e-05, + "loss": 0.2796, + "step": 1165 + }, + { + "epoch": 1.468005018820577, + "grad_norm": 0.3469345931045424, + "learning_rate": 1.5963450113638815e-05, + "loss": 0.2592, + "step": 1170 + }, + { + "epoch": 1.4742785445420326, + "grad_norm": 2.7973818683619016, + "learning_rate": 1.5919411054316966e-05, + "loss": 0.2727, + "step": 1175 + }, + { + "epoch": 1.480552070263488, + "grad_norm": 0.3477691166092876, + "learning_rate": 1.5875194553138942e-05, + "loss": 0.2708, + "step": 1180 + }, + { + "epoch": 1.4868255959849435, + "grad_norm": 0.42047181326852134, + "learning_rate": 1.5830801935550462e-05, + "loss": 0.2583, + "step": 1185 + }, + { + "epoch": 1.4930991217063991, + "grad_norm": 0.38351145606864295, + "learning_rate": 1.5786234532276555e-05, + "loss": 0.2665, + "step": 1190 + }, + { + "epoch": 1.4993726474278546, + "grad_norm": 0.33888116221517656, + "learning_rate": 1.574149367928168e-05, + "loss": 0.2788, + "step": 1195 + }, + { + "epoch": 1.50564617314931, + "grad_norm": 0.3533659596857954, + "learning_rate": 1.5696580717729665e-05, + "loss": 0.2709, + "step": 1200 + }, + { + "epoch": 1.5119196988707655, + "grad_norm": 0.33797242592368726, + "learning_rate": 1.5651496993943507e-05, + "loss": 0.2552, + "step": 1205 + }, + { + "epoch": 1.518193224592221, + "grad_norm": 0.3380805624100092, + "learning_rate": 1.5606243859365033e-05, + "loss": 0.2696, + "step": 1210 + }, + { + "epoch": 1.5244667503136764, + "grad_norm": 0.3589011909382504, + "learning_rate": 1.5560822670514356e-05, + "loss": 0.2729, + "step": 1215 + }, + { + "epoch": 1.5307402760351319, + "grad_norm": 0.3410908318740735, + "learning_rate": 1.5515234788949238e-05, + "loss": 0.2658, + "step": 1220 + }, + { + "epoch": 1.5370138017565873, + "grad_norm": 0.4071357427518539, + "learning_rate": 1.5469481581224274e-05, + "loss": 0.2618, + "step": 1225 + }, + { + "epoch": 1.5432873274780428, + "grad_norm": 0.4050327080992723, + "learning_rate": 1.5423564418849895e-05, + "loss": 0.2707, + "step": 1230 + }, + { + "epoch": 1.5495608531994982, + "grad_norm": 0.3512739607697075, + "learning_rate": 1.537748467825131e-05, + "loss": 0.2762, + "step": 1235 + }, + { + "epoch": 1.5558343789209537, + "grad_norm": 0.3548946907435895, + "learning_rate": 1.5331243740727203e-05, + "loss": 0.285, + "step": 1240 + }, + { + "epoch": 1.5621079046424091, + "grad_norm": 0.3241833124220892, + "learning_rate": 1.5284842992408336e-05, + "loss": 0.2675, + "step": 1245 + }, + { + "epoch": 1.5683814303638646, + "grad_norm": 0.35085102130243395, + "learning_rate": 1.5238283824216015e-05, + "loss": 0.2681, + "step": 1250 + }, + { + "epoch": 1.57465495608532, + "grad_norm": 0.37148590291911643, + "learning_rate": 1.5191567631820364e-05, + "loss": 0.27, + "step": 1255 + }, + { + "epoch": 1.5809284818067755, + "grad_norm": 0.3535604634586656, + "learning_rate": 1.5144695815598529e-05, + "loss": 0.2717, + "step": 1260 + }, + { + "epoch": 1.587202007528231, + "grad_norm": 0.34794195830922975, + "learning_rate": 1.5097669780592658e-05, + "loss": 0.2633, + "step": 1265 + }, + { + "epoch": 1.5934755332496864, + "grad_norm": 0.35489225458342305, + "learning_rate": 1.5050490936467814e-05, + "loss": 0.2735, + "step": 1270 + }, + { + "epoch": 1.5997490589711418, + "grad_norm": 0.3350637434620735, + "learning_rate": 1.5003160697469707e-05, + "loss": 0.2544, + "step": 1275 + }, + { + "epoch": 1.6060225846925973, + "grad_norm": 0.3381089856973841, + "learning_rate": 1.4955680482382296e-05, + "loss": 0.2564, + "step": 1280 + }, + { + "epoch": 1.6122961104140527, + "grad_norm": 0.32684634964616277, + "learning_rate": 1.4908051714485266e-05, + "loss": 0.2741, + "step": 1285 + }, + { + "epoch": 1.6185696361355082, + "grad_norm": 0.34645698891886967, + "learning_rate": 1.4860275821511359e-05, + "loss": 0.2748, + "step": 1290 + }, + { + "epoch": 1.6248431618569636, + "grad_norm": 0.36961906780329834, + "learning_rate": 1.481235423560358e-05, + "loss": 0.2721, + "step": 1295 + }, + { + "epoch": 1.631116687578419, + "grad_norm": 0.350579115602874, + "learning_rate": 1.4764288393272258e-05, + "loss": 0.2628, + "step": 1300 + }, + { + "epoch": 1.6373902132998746, + "grad_norm": 0.3306477114399784, + "learning_rate": 1.4716079735352006e-05, + "loss": 0.2729, + "step": 1305 + }, + { + "epoch": 1.64366373902133, + "grad_norm": 0.35455700175873195, + "learning_rate": 1.46677297069585e-05, + "loss": 0.2667, + "step": 1310 + }, + { + "epoch": 1.6499372647427855, + "grad_norm": 0.33847253281006606, + "learning_rate": 1.4619239757445187e-05, + "loss": 0.2706, + "step": 1315 + }, + { + "epoch": 1.656210790464241, + "grad_norm": 0.34327567130216446, + "learning_rate": 1.4570611340359821e-05, + "loss": 0.266, + "step": 1320 + }, + { + "epoch": 1.6624843161856964, + "grad_norm": 0.3557992340297897, + "learning_rate": 1.4521845913400891e-05, + "loss": 0.2746, + "step": 1325 + }, + { + "epoch": 1.6687578419071518, + "grad_norm": 0.35121604621554686, + "learning_rate": 1.4472944938373945e-05, + "loss": 0.2704, + "step": 1330 + }, + { + "epoch": 1.6750313676286073, + "grad_norm": 0.34165359487510566, + "learning_rate": 1.4423909881147747e-05, + "loss": 0.2692, + "step": 1335 + }, + { + "epoch": 1.6813048933500627, + "grad_norm": 0.36505578592627197, + "learning_rate": 1.4374742211610345e-05, + "loss": 0.2662, + "step": 1340 + }, + { + "epoch": 1.6875784190715182, + "grad_norm": 0.34422086687232467, + "learning_rate": 1.4325443403625012e-05, + "loss": 0.275, + "step": 1345 + }, + { + "epoch": 1.6938519447929736, + "grad_norm": 0.3675735392039838, + "learning_rate": 1.4276014934986064e-05, + "loss": 0.272, + "step": 1350 + }, + { + "epoch": 1.700125470514429, + "grad_norm": 0.35389447351847136, + "learning_rate": 1.4226458287374555e-05, + "loss": 0.2713, + "step": 1355 + }, + { + "epoch": 1.7063989962358845, + "grad_norm": 0.3379229270723559, + "learning_rate": 1.4176774946313872e-05, + "loss": 0.2625, + "step": 1360 + }, + { + "epoch": 1.71267252195734, + "grad_norm": 0.34402002879314064, + "learning_rate": 1.4126966401125189e-05, + "loss": 0.268, + "step": 1365 + }, + { + "epoch": 1.7189460476787954, + "grad_norm": 0.3532344899080162, + "learning_rate": 1.4077034144882843e-05, + "loss": 0.2632, + "step": 1370 + }, + { + "epoch": 1.725219573400251, + "grad_norm": 0.3401295622140909, + "learning_rate": 1.4026979674369566e-05, + "loss": 0.2613, + "step": 1375 + }, + { + "epoch": 1.7314930991217063, + "grad_norm": 0.3391840532185442, + "learning_rate": 1.3976804490031608e-05, + "loss": 0.2719, + "step": 1380 + }, + { + "epoch": 1.7377666248431618, + "grad_norm": 0.3427490028776178, + "learning_rate": 1.3926510095933781e-05, + "loss": 0.2692, + "step": 1385 + }, + { + "epoch": 1.7440401505646173, + "grad_norm": 0.3433287489294571, + "learning_rate": 1.387609799971435e-05, + "loss": 0.2649, + "step": 1390 + }, + { + "epoch": 1.7503136762860727, + "grad_norm": 0.33114909580993174, + "learning_rate": 1.3825569712539864e-05, + "loss": 0.2527, + "step": 1395 + }, + { + "epoch": 1.7565872020075282, + "grad_norm": 0.337515925683474, + "learning_rate": 1.3774926749059826e-05, + "loss": 0.2556, + "step": 1400 + }, + { + "epoch": 1.7628607277289836, + "grad_norm": 0.3419996547297016, + "learning_rate": 1.3724170627361323e-05, + "loss": 0.2638, + "step": 1405 + }, + { + "epoch": 1.769134253450439, + "grad_norm": 0.35073376743994084, + "learning_rate": 1.3673302868923491e-05, + "loss": 0.2704, + "step": 1410 + }, + { + "epoch": 1.7754077791718945, + "grad_norm": 0.32861646593191174, + "learning_rate": 1.3622324998571928e-05, + "loss": 0.2519, + "step": 1415 + }, + { + "epoch": 1.78168130489335, + "grad_norm": 0.33312714617584277, + "learning_rate": 1.3571238544432968e-05, + "loss": 0.2664, + "step": 1420 + }, + { + "epoch": 1.7879548306148054, + "grad_norm": 0.3374614354621205, + "learning_rate": 1.352004503788789e-05, + "loss": 0.2585, + "step": 1425 + }, + { + "epoch": 1.7942283563362609, + "grad_norm": 0.3554543246581463, + "learning_rate": 1.3468746013527e-05, + "loss": 0.2762, + "step": 1430 + }, + { + "epoch": 1.8005018820577163, + "grad_norm": 0.3367124035830617, + "learning_rate": 1.3417343009103634e-05, + "loss": 0.261, + "step": 1435 + }, + { + "epoch": 1.8067754077791718, + "grad_norm": 0.3181211071389625, + "learning_rate": 1.3365837565488065e-05, + "loss": 0.2715, + "step": 1440 + }, + { + "epoch": 1.8130489335006272, + "grad_norm": 0.3480040032313692, + "learning_rate": 1.3314231226621305e-05, + "loss": 0.2624, + "step": 1445 + }, + { + "epoch": 1.8193224592220827, + "grad_norm": 0.3416171971995866, + "learning_rate": 1.3262525539468839e-05, + "loss": 0.2642, + "step": 1450 + }, + { + "epoch": 1.8255959849435381, + "grad_norm": 0.3635819899278629, + "learning_rate": 1.3210722053974233e-05, + "loss": 0.2632, + "step": 1455 + }, + { + "epoch": 1.8318695106649936, + "grad_norm": 0.3265652400172599, + "learning_rate": 1.315882232301269e-05, + "loss": 0.2612, + "step": 1460 + }, + { + "epoch": 1.838143036386449, + "grad_norm": 0.3342794882432052, + "learning_rate": 1.3106827902344485e-05, + "loss": 0.2623, + "step": 1465 + }, + { + "epoch": 1.8444165621079045, + "grad_norm": 0.3347520583742969, + "learning_rate": 1.3054740350568346e-05, + "loss": 0.2741, + "step": 1470 + }, + { + "epoch": 1.85069008782936, + "grad_norm": 0.34328068285452285, + "learning_rate": 1.3002561229074719e-05, + "loss": 0.2561, + "step": 1475 + }, + { + "epoch": 1.8569636135508154, + "grad_norm": 0.35131941587904497, + "learning_rate": 1.2950292101998967e-05, + "loss": 0.2747, + "step": 1480 + }, + { + "epoch": 1.8632371392722709, + "grad_norm": 0.34360731854818805, + "learning_rate": 1.289793453617449e-05, + "loss": 0.2627, + "step": 1485 + }, + { + "epoch": 1.8695106649937263, + "grad_norm": 0.3498923646707763, + "learning_rate": 1.2845490101085744e-05, + "loss": 0.2562, + "step": 1490 + }, + { + "epoch": 1.875784190715182, + "grad_norm": 0.34701974294822086, + "learning_rate": 1.2792960368821212e-05, + "loss": 0.265, + "step": 1495 + }, + { + "epoch": 1.8820577164366374, + "grad_norm": 0.33841922800891855, + "learning_rate": 1.2740346914026258e-05, + "loss": 0.2638, + "step": 1500 + }, + { + "epoch": 1.888331242158093, + "grad_norm": 0.3375511420369947, + "learning_rate": 1.2687651313855937e-05, + "loss": 0.2589, + "step": 1505 + }, + { + "epoch": 1.8946047678795483, + "grad_norm": 0.34124342881268466, + "learning_rate": 1.2634875147927726e-05, + "loss": 0.2689, + "step": 1510 + }, + { + "epoch": 1.9008782936010038, + "grad_norm": 0.3492720225961315, + "learning_rate": 1.2582019998274142e-05, + "loss": 0.2619, + "step": 1515 + }, + { + "epoch": 1.9071518193224593, + "grad_norm": 0.3424592345393382, + "learning_rate": 1.252908744929536e-05, + "loss": 0.2673, + "step": 1520 + }, + { + "epoch": 1.9134253450439147, + "grad_norm": 0.3541786605023589, + "learning_rate": 1.2476079087711695e-05, + "loss": 0.2741, + "step": 1525 + }, + { + "epoch": 1.9196988707653702, + "grad_norm": 0.36100779817450435, + "learning_rate": 1.2422996502516023e-05, + "loss": 0.2708, + "step": 1530 + }, + { + "epoch": 1.9259723964868256, + "grad_norm": 0.3637900870051361, + "learning_rate": 1.236984128492619e-05, + "loss": 0.2679, + "step": 1535 + }, + { + "epoch": 1.932245922208281, + "grad_norm": 0.34654925833715405, + "learning_rate": 1.231661502833728e-05, + "loss": 0.2705, + "step": 1540 + }, + { + "epoch": 1.9385194479297365, + "grad_norm": 0.3555711761052598, + "learning_rate": 1.2263319328273853e-05, + "loss": 0.2732, + "step": 1545 + }, + { + "epoch": 1.944792973651192, + "grad_norm": 0.4361336719233523, + "learning_rate": 1.220995578234214e-05, + "loss": 0.2818, + "step": 1550 + }, + { + "epoch": 1.9510664993726474, + "grad_norm": 0.35818262065748885, + "learning_rate": 1.2156525990182132e-05, + "loss": 0.2714, + "step": 1555 + }, + { + "epoch": 1.9573400250941029, + "grad_norm": 0.34020072227504516, + "learning_rate": 1.2103031553419629e-05, + "loss": 0.2561, + "step": 1560 + }, + { + "epoch": 1.9636135508155583, + "grad_norm": 0.35424378525712236, + "learning_rate": 1.2049474075618244e-05, + "loss": 0.2817, + "step": 1565 + }, + { + "epoch": 1.9698870765370138, + "grad_norm": 0.3436811391936569, + "learning_rate": 1.1995855162231323e-05, + "loss": 0.2727, + "step": 1570 + }, + { + "epoch": 1.9761606022584692, + "grad_norm": 0.35547373104319596, + "learning_rate": 1.1942176420553817e-05, + "loss": 0.279, + "step": 1575 + }, + { + "epoch": 1.9824341279799247, + "grad_norm": 0.33151574599317196, + "learning_rate": 1.1888439459674107e-05, + "loss": 0.2736, + "step": 1580 + }, + { + "epoch": 1.9887076537013801, + "grad_norm": 0.3489287650284772, + "learning_rate": 1.1834645890425773e-05, + "loss": 0.2674, + "step": 1585 + }, + { + "epoch": 1.9949811794228356, + "grad_norm": 0.4674786217571983, + "learning_rate": 1.1780797325339301e-05, + "loss": 0.2618, + "step": 1590 + }, + { + "epoch": 2.0, + "eval_loss": 0.3876406252384186, + "eval_runtime": 2.3724, + "eval_samples_per_second": 14.753, + "eval_steps_per_second": 0.843, + "step": 1594 + }, + { + "epoch": 2.0012547051442913, + "grad_norm": 0.41826837808172157, + "learning_rate": 1.1726895378593745e-05, + "loss": 0.2453, + "step": 1595 + }, + { + "epoch": 2.0075282308657467, + "grad_norm": 0.3896113439351613, + "learning_rate": 1.167294166596834e-05, + "loss": 0.1307, + "step": 1600 + }, + { + "epoch": 2.013801756587202, + "grad_norm": 0.40723627183822325, + "learning_rate": 1.1618937804794077e-05, + "loss": 0.1253, + "step": 1605 + }, + { + "epoch": 2.0200752823086576, + "grad_norm": 0.3393249627107008, + "learning_rate": 1.1564885413905205e-05, + "loss": 0.1212, + "step": 1610 + }, + { + "epoch": 2.026348808030113, + "grad_norm": 0.3969867667512431, + "learning_rate": 1.1510786113590715e-05, + "loss": 0.1213, + "step": 1615 + }, + { + "epoch": 2.0326223337515685, + "grad_norm": 0.3807098036418188, + "learning_rate": 1.1456641525545768e-05, + "loss": 0.115, + "step": 1620 + }, + { + "epoch": 2.038895859473024, + "grad_norm": 0.3600283222530161, + "learning_rate": 1.1402453272823086e-05, + "loss": 0.1178, + "step": 1625 + }, + { + "epoch": 2.0451693851944794, + "grad_norm": 0.37082981681871713, + "learning_rate": 1.1348222979784289e-05, + "loss": 0.1186, + "step": 1630 + }, + { + "epoch": 2.051442910915935, + "grad_norm": 0.3701722986939684, + "learning_rate": 1.1293952272051217e-05, + "loss": 0.1161, + "step": 1635 + }, + { + "epoch": 2.0577164366373903, + "grad_norm": 0.39437640184582917, + "learning_rate": 1.1239642776457176e-05, + "loss": 0.112, + "step": 1640 + }, + { + "epoch": 2.063989962358846, + "grad_norm": 0.35512823472089206, + "learning_rate": 1.1185296120998208e-05, + "loss": 0.1227, + "step": 1645 + }, + { + "epoch": 2.0702634880803013, + "grad_norm": 0.39699134768151145, + "learning_rate": 1.1130913934784255e-05, + "loss": 0.118, + "step": 1650 + }, + { + "epoch": 2.0765370138017567, + "grad_norm": 0.361766879756225, + "learning_rate": 1.107649784799034e-05, + "loss": 0.1148, + "step": 1655 + }, + { + "epoch": 2.082810539523212, + "grad_norm": 0.37983521046428353, + "learning_rate": 1.1022049491807703e-05, + "loss": 0.1105, + "step": 1660 + }, + { + "epoch": 2.0890840652446676, + "grad_norm": 0.36791365726333974, + "learning_rate": 1.0967570498394895e-05, + "loss": 0.1197, + "step": 1665 + }, + { + "epoch": 2.095357590966123, + "grad_norm": 0.38778258880907535, + "learning_rate": 1.0913062500828865e-05, + "loss": 0.119, + "step": 1670 + }, + { + "epoch": 2.1016311166875785, + "grad_norm": 0.3686039497467697, + "learning_rate": 1.0858527133055994e-05, + "loss": 0.1197, + "step": 1675 + }, + { + "epoch": 2.107904642409034, + "grad_norm": 0.39330229406582323, + "learning_rate": 1.0803966029843114e-05, + "loss": 0.1166, + "step": 1680 + }, + { + "epoch": 2.1141781681304894, + "grad_norm": 0.3636181831711105, + "learning_rate": 1.0749380826728513e-05, + "loss": 0.1133, + "step": 1685 + }, + { + "epoch": 2.120451693851945, + "grad_norm": 0.4335397800674325, + "learning_rate": 1.0694773159972912e-05, + "loss": 0.1246, + "step": 1690 + }, + { + "epoch": 2.1267252195734003, + "grad_norm": 0.37508400928061725, + "learning_rate": 1.0640144666510392e-05, + "loss": 0.1196, + "step": 1695 + }, + { + "epoch": 2.132998745294856, + "grad_norm": 0.3987416537308343, + "learning_rate": 1.0585496983899361e-05, + "loss": 0.1226, + "step": 1700 + }, + { + "epoch": 2.1392722710163112, + "grad_norm": 0.38791702283560353, + "learning_rate": 1.0530831750273428e-05, + "loss": 0.1117, + "step": 1705 + }, + { + "epoch": 2.1455457967377667, + "grad_norm": 0.39592521931999036, + "learning_rate": 1.0476150604292329e-05, + "loss": 0.1198, + "step": 1710 + }, + { + "epoch": 2.151819322459222, + "grad_norm": 0.6134906074452066, + "learning_rate": 1.0421455185092784e-05, + "loss": 0.1168, + "step": 1715 + }, + { + "epoch": 2.1580928481806776, + "grad_norm": 0.3936394784460519, + "learning_rate": 1.0366747132239374e-05, + "loss": 0.1137, + "step": 1720 + }, + { + "epoch": 2.164366373902133, + "grad_norm": 0.38023062505112215, + "learning_rate": 1.0312028085675393e-05, + "loss": 0.1216, + "step": 1725 + }, + { + "epoch": 2.1706398996235885, + "grad_norm": 0.3879080250933175, + "learning_rate": 1.025729968567368e-05, + "loss": 0.1163, + "step": 1730 + }, + { + "epoch": 2.176913425345044, + "grad_norm": 0.37494689918032786, + "learning_rate": 1.0202563572787457e-05, + "loss": 0.1155, + "step": 1735 + }, + { + "epoch": 2.1831869510664994, + "grad_norm": 0.4170219240353852, + "learning_rate": 1.0147821387801154e-05, + "loss": 0.1231, + "step": 1740 + }, + { + "epoch": 2.189460476787955, + "grad_norm": 0.36447068742427746, + "learning_rate": 1.0093074771681214e-05, + "loss": 0.1173, + "step": 1745 + }, + { + "epoch": 2.1957340025094103, + "grad_norm": 0.37383691981995226, + "learning_rate": 1.003832536552691e-05, + "loss": 0.1181, + "step": 1750 + }, + { + "epoch": 2.2020075282308658, + "grad_norm": 0.35361007854482546, + "learning_rate": 9.983574810521151e-06, + "loss": 0.1141, + "step": 1755 + }, + { + "epoch": 2.208281053952321, + "grad_norm": 0.37593622722746173, + "learning_rate": 9.928824747881286e-06, + "loss": 0.117, + "step": 1760 + }, + { + "epoch": 2.2145545796737767, + "grad_norm": 0.38527811994324745, + "learning_rate": 9.874076818809903e-06, + "loss": 0.1222, + "step": 1765 + }, + { + "epoch": 2.220828105395232, + "grad_norm": 0.37756703878021675, + "learning_rate": 9.81933266444563e-06, + "loss": 0.117, + "step": 1770 + }, + { + "epoch": 2.2271016311166876, + "grad_norm": 0.3981056432095895, + "learning_rate": 9.76459392581395e-06, + "loss": 0.1187, + "step": 1775 + }, + { + "epoch": 2.233375156838143, + "grad_norm": 0.35779965724307555, + "learning_rate": 9.709862243777998e-06, + "loss": 0.1201, + "step": 1780 + }, + { + "epoch": 2.2396486825595985, + "grad_norm": 0.39287403460106407, + "learning_rate": 9.655139258989379e-06, + "loss": 0.1173, + "step": 1785 + }, + { + "epoch": 2.245922208281054, + "grad_norm": 0.39081524580807464, + "learning_rate": 9.60042661183899e-06, + "loss": 0.114, + "step": 1790 + }, + { + "epoch": 2.2521957340025094, + "grad_norm": 0.5348108468458116, + "learning_rate": 9.54572594240784e-06, + "loss": 0.1145, + "step": 1795 + }, + { + "epoch": 2.258469259723965, + "grad_norm": 0.35615572604956347, + "learning_rate": 9.491038890417894e-06, + "loss": 0.1128, + "step": 1800 + }, + { + "epoch": 2.2647427854454203, + "grad_norm": 0.37107476709616843, + "learning_rate": 9.436367095182916e-06, + "loss": 0.1228, + "step": 1805 + }, + { + "epoch": 2.2710163111668757, + "grad_norm": 0.38323250415960275, + "learning_rate": 9.381712195559324e-06, + "loss": 0.118, + "step": 1810 + }, + { + "epoch": 2.277289836888331, + "grad_norm": 0.359171122780413, + "learning_rate": 9.327075829897082e-06, + "loss": 0.1191, + "step": 1815 + }, + { + "epoch": 2.2835633626097867, + "grad_norm": 0.4114654516418914, + "learning_rate": 9.272459635990563e-06, + "loss": 0.1235, + "step": 1820 + }, + { + "epoch": 2.289836888331242, + "grad_norm": 0.39460304180293915, + "learning_rate": 9.217865251029469e-06, + "loss": 0.1187, + "step": 1825 + }, + { + "epoch": 2.2961104140526976, + "grad_norm": 0.35839055639361983, + "learning_rate": 9.163294311549753e-06, + "loss": 0.1156, + "step": 1830 + }, + { + "epoch": 2.302383939774153, + "grad_norm": 0.3482054803738314, + "learning_rate": 9.108748453384559e-06, + "loss": 0.1198, + "step": 1835 + }, + { + "epoch": 2.3086574654956085, + "grad_norm": 0.3820940219983755, + "learning_rate": 9.054229311615178e-06, + "loss": 0.117, + "step": 1840 + }, + { + "epoch": 2.314930991217064, + "grad_norm": 0.39037798204086893, + "learning_rate": 8.999738520522065e-06, + "loss": 0.1197, + "step": 1845 + }, + { + "epoch": 2.3212045169385194, + "grad_norm": 0.3822304512858218, + "learning_rate": 8.945277713535809e-06, + "loss": 0.1152, + "step": 1850 + }, + { + "epoch": 2.327478042659975, + "grad_norm": 0.3965998438998078, + "learning_rate": 8.890848523188192e-06, + "loss": 0.1243, + "step": 1855 + }, + { + "epoch": 2.3337515683814303, + "grad_norm": 0.3939416904284715, + "learning_rate": 8.836452581063248e-06, + "loss": 0.1195, + "step": 1860 + }, + { + "epoch": 2.3400250941028857, + "grad_norm": 0.4276348064874773, + "learning_rate": 8.78209151774835e-06, + "loss": 0.1211, + "step": 1865 + }, + { + "epoch": 2.346298619824341, + "grad_norm": 0.39140182898284753, + "learning_rate": 8.727766962785344e-06, + "loss": 0.1157, + "step": 1870 + }, + { + "epoch": 2.3525721455457966, + "grad_norm": 0.3634468198734603, + "learning_rate": 8.673480544621681e-06, + "loss": 0.1129, + "step": 1875 + }, + { + "epoch": 2.358845671267252, + "grad_norm": 0.39837989743789176, + "learning_rate": 8.61923389056162e-06, + "loss": 0.1198, + "step": 1880 + }, + { + "epoch": 2.3651191969887075, + "grad_norm": 0.3935841490044898, + "learning_rate": 8.565028626717435e-06, + "loss": 0.1203, + "step": 1885 + }, + { + "epoch": 2.371392722710163, + "grad_norm": 0.35305962598333074, + "learning_rate": 8.51086637796068e-06, + "loss": 0.1128, + "step": 1890 + }, + { + "epoch": 2.3776662484316184, + "grad_norm": 0.394308865525823, + "learning_rate": 8.456748767873474e-06, + "loss": 0.1124, + "step": 1895 + }, + { + "epoch": 2.383939774153074, + "grad_norm": 0.3841761354621664, + "learning_rate": 8.402677418699842e-06, + "loss": 0.1145, + "step": 1900 + }, + { + "epoch": 2.3902132998745294, + "grad_norm": 0.3893473191716482, + "learning_rate": 8.34865395129707e-06, + "loss": 0.1197, + "step": 1905 + }, + { + "epoch": 2.396486825595985, + "grad_norm": 0.42534674714643167, + "learning_rate": 8.294679985087137e-06, + "loss": 0.1179, + "step": 1910 + }, + { + "epoch": 2.4027603513174403, + "grad_norm": 0.40229677875453496, + "learning_rate": 8.240757138008149e-06, + "loss": 0.1236, + "step": 1915 + }, + { + "epoch": 2.4090338770388957, + "grad_norm": 0.37978448174191587, + "learning_rate": 8.186887026465857e-06, + "loss": 0.1125, + "step": 1920 + }, + { + "epoch": 2.415307402760351, + "grad_norm": 0.38090023973889275, + "learning_rate": 8.133071265285209e-06, + "loss": 0.1175, + "step": 1925 + }, + { + "epoch": 2.4215809284818066, + "grad_norm": 0.3709724709852035, + "learning_rate": 8.079311467661912e-06, + "loss": 0.1189, + "step": 1930 + }, + { + "epoch": 2.427854454203262, + "grad_norm": 0.3968859415543936, + "learning_rate": 8.025609245114107e-06, + "loss": 0.1208, + "step": 1935 + }, + { + "epoch": 2.4341279799247175, + "grad_norm": 0.38140929315858313, + "learning_rate": 7.971966207434045e-06, + "loss": 0.1167, + "step": 1940 + }, + { + "epoch": 2.440401505646173, + "grad_norm": 0.3813844728323988, + "learning_rate": 7.918383962639835e-06, + "loss": 0.1186, + "step": 1945 + }, + { + "epoch": 2.4466750313676284, + "grad_norm": 0.36925458542907064, + "learning_rate": 7.864864116927245e-06, + "loss": 0.1167, + "step": 1950 + }, + { + "epoch": 2.452948557089084, + "grad_norm": 0.3813928431538188, + "learning_rate": 7.811408274621549e-06, + "loss": 0.1217, + "step": 1955 + }, + { + "epoch": 2.4592220828105393, + "grad_norm": 0.38045397571366496, + "learning_rate": 7.75801803812944e-06, + "loss": 0.1176, + "step": 1960 + }, + { + "epoch": 2.4654956085319952, + "grad_norm": 0.409917244408148, + "learning_rate": 7.704695007890988e-06, + "loss": 0.1214, + "step": 1965 + }, + { + "epoch": 2.4717691342534502, + "grad_norm": 0.39881220216006136, + "learning_rate": 7.651440782331679e-06, + "loss": 0.1176, + "step": 1970 + }, + { + "epoch": 2.478042659974906, + "grad_norm": 0.3551138970811604, + "learning_rate": 7.598256957814479e-06, + "loss": 0.1156, + "step": 1975 + }, + { + "epoch": 2.484316185696361, + "grad_norm": 0.38209748278035194, + "learning_rate": 7.545145128592009e-06, + "loss": 0.1128, + "step": 1980 + }, + { + "epoch": 2.490589711417817, + "grad_norm": 0.38676641594007305, + "learning_rate": 7.49210688675873e-06, + "loss": 0.1175, + "step": 1985 + }, + { + "epoch": 2.496863237139272, + "grad_norm": 0.36950074837730973, + "learning_rate": 7.4391438222032265e-06, + "loss": 0.1139, + "step": 1990 + }, + { + "epoch": 2.503136762860728, + "grad_norm": 0.3915583584835345, + "learning_rate": 7.3862575225605535e-06, + "loss": 0.1179, + "step": 1995 + }, + { + "epoch": 2.509410288582183, + "grad_norm": 0.3866601171887957, + "learning_rate": 7.333449573164634e-06, + "loss": 0.1207, + "step": 2000 + }, + { + "epoch": 2.515683814303639, + "grad_norm": 0.4110664967201194, + "learning_rate": 7.280721557000759e-06, + "loss": 0.1166, + "step": 2005 + }, + { + "epoch": 2.521957340025094, + "grad_norm": 0.37778473055073203, + "learning_rate": 7.228075054658096e-06, + "loss": 0.1157, + "step": 2010 + }, + { + "epoch": 2.5282308657465498, + "grad_norm": 0.37225323631681123, + "learning_rate": 7.175511644282349e-06, + "loss": 0.1156, + "step": 2015 + }, + { + "epoch": 2.5345043914680048, + "grad_norm": 0.3824521057716352, + "learning_rate": 7.123032901528431e-06, + "loss": 0.1182, + "step": 2020 + }, + { + "epoch": 2.5407779171894607, + "grad_norm": 0.38115325746292966, + "learning_rate": 7.070640399513232e-06, + "loss": 0.1158, + "step": 2025 + }, + { + "epoch": 2.5470514429109157, + "grad_norm": 0.3822411406686136, + "learning_rate": 7.018335708768467e-06, + "loss": 0.1177, + "step": 2030 + }, + { + "epoch": 2.5533249686323716, + "grad_norm": 0.407300076312883, + "learning_rate": 6.966120397193605e-06, + "loss": 0.1152, + "step": 2035 + }, + { + "epoch": 2.5595984943538266, + "grad_norm": 0.3666367383521563, + "learning_rate": 6.913996030008853e-06, + "loss": 0.1153, + "step": 2040 + }, + { + "epoch": 2.5658720200752825, + "grad_norm": 0.3632368895081262, + "learning_rate": 6.861964169708245e-06, + "loss": 0.1107, + "step": 2045 + }, + { + "epoch": 2.572145545796738, + "grad_norm": 0.3649524530723953, + "learning_rate": 6.810026376012808e-06, + "loss": 0.1145, + "step": 2050 + }, + { + "epoch": 2.5784190715181934, + "grad_norm": 0.39293153907868017, + "learning_rate": 6.758184205823791e-06, + "loss": 0.1162, + "step": 2055 + }, + { + "epoch": 2.584692597239649, + "grad_norm": 0.3931749546166204, + "learning_rate": 6.706439213176028e-06, + "loss": 0.1132, + "step": 2060 + }, + { + "epoch": 2.5909661229611043, + "grad_norm": 0.3825377326068989, + "learning_rate": 6.654792949191317e-06, + "loss": 0.1222, + "step": 2065 + }, + { + "epoch": 2.5972396486825597, + "grad_norm": 0.35192545209688325, + "learning_rate": 6.603246962031942e-06, + "loss": 0.112, + "step": 2070 + }, + { + "epoch": 2.603513174404015, + "grad_norm": 0.3638182336052127, + "learning_rate": 6.551802796854265e-06, + "loss": 0.1144, + "step": 2075 + }, + { + "epoch": 2.6097867001254706, + "grad_norm": 0.38711833706267534, + "learning_rate": 6.500461995762402e-06, + "loss": 0.1133, + "step": 2080 + }, + { + "epoch": 2.616060225846926, + "grad_norm": 0.3897188834137444, + "learning_rate": 6.449226097762e-06, + "loss": 0.123, + "step": 2085 + }, + { + "epoch": 2.6223337515683816, + "grad_norm": 0.37823736524426615, + "learning_rate": 6.398096638714106e-06, + "loss": 0.1184, + "step": 2090 + }, + { + "epoch": 2.628607277289837, + "grad_norm": 0.3932949556705336, + "learning_rate": 6.34707515128912e-06, + "loss": 0.1143, + "step": 2095 + }, + { + "epoch": 2.6348808030112925, + "grad_norm": 0.3884831444296721, + "learning_rate": 6.296163164920858e-06, + "loss": 0.1113, + "step": 2100 + }, + { + "epoch": 2.641154328732748, + "grad_norm": 0.36382200051340396, + "learning_rate": 6.245362205760703e-06, + "loss": 0.1159, + "step": 2105 + }, + { + "epoch": 2.6474278544542034, + "grad_norm": 0.3924419492733963, + "learning_rate": 6.194673796631852e-06, + "loss": 0.109, + "step": 2110 + }, + { + "epoch": 2.653701380175659, + "grad_norm": 0.3947013863716826, + "learning_rate": 6.144099456983681e-06, + "loss": 0.1115, + "step": 2115 + }, + { + "epoch": 2.6599749058971143, + "grad_norm": 0.3943261219862062, + "learning_rate": 6.093640702846182e-06, + "loss": 0.1122, + "step": 2120 + }, + { + "epoch": 2.6662484316185697, + "grad_norm": 0.39005774295522977, + "learning_rate": 6.043299046784526e-06, + "loss": 0.1187, + "step": 2125 + }, + { + "epoch": 2.672521957340025, + "grad_norm": 0.37092235530502005, + "learning_rate": 5.993075997853719e-06, + "loss": 0.1149, + "step": 2130 + }, + { + "epoch": 2.6787954830614806, + "grad_norm": 0.3807422171419007, + "learning_rate": 5.94297306155337e-06, + "loss": 0.1135, + "step": 2135 + }, + { + "epoch": 2.685069008782936, + "grad_norm": 0.3792169573613968, + "learning_rate": 5.892991739782557e-06, + "loss": 0.1199, + "step": 2140 + }, + { + "epoch": 2.6913425345043915, + "grad_norm": 0.3831811436833811, + "learning_rate": 5.843133530794817e-06, + "loss": 0.1096, + "step": 2145 + }, + { + "epoch": 2.697616060225847, + "grad_norm": 0.37933754782226464, + "learning_rate": 5.793399929153216e-06, + "loss": 0.1106, + "step": 2150 + }, + { + "epoch": 2.7038895859473024, + "grad_norm": 0.38366361518402914, + "learning_rate": 5.743792425685554e-06, + "loss": 0.1154, + "step": 2155 + }, + { + "epoch": 2.710163111668758, + "grad_norm": 0.3631788769456335, + "learning_rate": 5.694312507439691e-06, + "loss": 0.1141, + "step": 2160 + }, + { + "epoch": 2.7164366373902133, + "grad_norm": 0.39467438603923816, + "learning_rate": 5.644961657638942e-06, + "loss": 0.1148, + "step": 2165 + }, + { + "epoch": 2.722710163111669, + "grad_norm": 0.3643220647785576, + "learning_rate": 5.595741355637645e-06, + "loss": 0.1098, + "step": 2170 + }, + { + "epoch": 2.7289836888331243, + "grad_norm": 0.39291715101415214, + "learning_rate": 5.5466530768768005e-06, + "loss": 0.1159, + "step": 2175 + }, + { + "epoch": 2.7352572145545797, + "grad_norm": 0.40345238378734466, + "learning_rate": 5.497698292839835e-06, + "loss": 0.1117, + "step": 2180 + }, + { + "epoch": 2.741530740276035, + "grad_norm": 0.41110337094460886, + "learning_rate": 5.448878471008513e-06, + "loss": 0.1134, + "step": 2185 + }, + { + "epoch": 2.7478042659974906, + "grad_norm": 0.4006579187318595, + "learning_rate": 5.400195074818924e-06, + "loss": 0.1228, + "step": 2190 + }, + { + "epoch": 2.754077791718946, + "grad_norm": 0.4006920463713775, + "learning_rate": 5.351649563617638e-06, + "loss": 0.1157, + "step": 2195 + }, + { + "epoch": 2.7603513174404015, + "grad_norm": 0.40311348122015783, + "learning_rate": 5.3032433926179395e-06, + "loss": 0.1211, + "step": 2200 + }, + { + "epoch": 2.766624843161857, + "grad_norm": 0.3833185052390898, + "learning_rate": 5.25497801285622e-06, + "loss": 0.1085, + "step": 2205 + }, + { + "epoch": 2.7728983688833124, + "grad_norm": 0.40698791566770903, + "learning_rate": 5.206854871148466e-06, + "loss": 0.1145, + "step": 2210 + }, + { + "epoch": 2.779171894604768, + "grad_norm": 0.37018390800544043, + "learning_rate": 5.158875410046906e-06, + "loss": 0.1148, + "step": 2215 + }, + { + "epoch": 2.7854454203262233, + "grad_norm": 0.4097654897566281, + "learning_rate": 5.111041067796754e-06, + "loss": 0.1112, + "step": 2220 + }, + { + "epoch": 2.791718946047679, + "grad_norm": 0.38185615087945834, + "learning_rate": 5.063353278293106e-06, + "loss": 0.1129, + "step": 2225 + }, + { + "epoch": 2.7979924717691342, + "grad_norm": 0.38860623269143496, + "learning_rate": 5.0158134710379595e-06, + "loss": 0.1157, + "step": 2230 + }, + { + "epoch": 2.8042659974905897, + "grad_norm": 0.41677106161054384, + "learning_rate": 4.9684230710973394e-06, + "loss": 0.1179, + "step": 2235 + }, + { + "epoch": 2.810539523212045, + "grad_norm": 0.3659456249492505, + "learning_rate": 4.921183499058615e-06, + "loss": 0.1179, + "step": 2240 + }, + { + "epoch": 2.8168130489335006, + "grad_norm": 0.390728899484992, + "learning_rate": 4.8740961709878834e-06, + "loss": 0.1118, + "step": 2245 + }, + { + "epoch": 2.823086574654956, + "grad_norm": 0.41326006522454833, + "learning_rate": 4.827162498387544e-06, + "loss": 0.1123, + "step": 2250 + }, + { + "epoch": 2.8293601003764115, + "grad_norm": 0.37982873419881247, + "learning_rate": 4.780383888153983e-06, + "loss": 0.1099, + "step": 2255 + }, + { + "epoch": 2.835633626097867, + "grad_norm": 0.3826347732319729, + "learning_rate": 4.733761742535381e-06, + "loss": 0.1119, + "step": 2260 + }, + { + "epoch": 2.8419071518193224, + "grad_norm": 0.4070056837379538, + "learning_rate": 4.687297459089708e-06, + "loss": 0.1169, + "step": 2265 + }, + { + "epoch": 2.848180677540778, + "grad_norm": 0.3907501846052804, + "learning_rate": 4.640992430642801e-06, + "loss": 0.1184, + "step": 2270 + }, + { + "epoch": 2.8544542032622333, + "grad_norm": 0.3980879608103747, + "learning_rate": 4.594848045246638e-06, + "loss": 0.1143, + "step": 2275 + }, + { + "epoch": 2.8607277289836888, + "grad_norm": 0.3831724940346972, + "learning_rate": 4.548865686137718e-06, + "loss": 0.1165, + "step": 2280 + }, + { + "epoch": 2.867001254705144, + "grad_norm": 0.3554795717929537, + "learning_rate": 4.503046731695584e-06, + "loss": 0.1103, + "step": 2285 + }, + { + "epoch": 2.8732747804265997, + "grad_norm": 0.37945026941432614, + "learning_rate": 4.457392555401531e-06, + "loss": 0.1165, + "step": 2290 + }, + { + "epoch": 2.879548306148055, + "grad_norm": 0.4040384960083639, + "learning_rate": 4.411904525797408e-06, + "loss": 0.112, + "step": 2295 + }, + { + "epoch": 2.8858218318695106, + "grad_norm": 0.37374435050142246, + "learning_rate": 4.3665840064446165e-06, + "loss": 0.1152, + "step": 2300 + }, + { + "epoch": 2.892095357590966, + "grad_norm": 0.38395743236739355, + "learning_rate": 4.321432355883219e-06, + "loss": 0.1158, + "step": 2305 + }, + { + "epoch": 2.8983688833124215, + "grad_norm": 0.38107246526906685, + "learning_rate": 4.276450927591229e-06, + "loss": 0.1099, + "step": 2310 + }, + { + "epoch": 2.904642409033877, + "grad_norm": 0.40288534078286153, + "learning_rate": 4.231641069944019e-06, + "loss": 0.1135, + "step": 2315 + }, + { + "epoch": 2.9109159347553324, + "grad_norm": 0.39407625180559624, + "learning_rate": 4.187004126173928e-06, + "loss": 0.1153, + "step": 2320 + }, + { + "epoch": 2.917189460476788, + "grad_norm": 0.390694853717115, + "learning_rate": 4.1425414343299734e-06, + "loss": 0.1113, + "step": 2325 + }, + { + "epoch": 2.9234629861982433, + "grad_norm": 0.37505537186817833, + "learning_rate": 4.098254327237742e-06, + "loss": 0.1107, + "step": 2330 + }, + { + "epoch": 2.9297365119196987, + "grad_norm": 0.3797569810287486, + "learning_rate": 4.054144132459471e-06, + "loss": 0.1125, + "step": 2335 + }, + { + "epoch": 2.936010037641154, + "grad_norm": 0.39039171406836526, + "learning_rate": 4.010212172254201e-06, + "loss": 0.1139, + "step": 2340 + }, + { + "epoch": 2.9422835633626097, + "grad_norm": 0.3735230020383592, + "learning_rate": 3.966459763538179e-06, + "loss": 0.1162, + "step": 2345 + }, + { + "epoch": 2.948557089084065, + "grad_norm": 0.3799822252359316, + "learning_rate": 3.92288821784536e-06, + "loss": 0.1157, + "step": 2350 + }, + { + "epoch": 2.9548306148055206, + "grad_norm": 0.37435313221883065, + "learning_rate": 3.879498841288105e-06, + "loss": 0.1109, + "step": 2355 + }, + { + "epoch": 2.961104140526976, + "grad_norm": 0.37834252836846144, + "learning_rate": 3.836292934518029e-06, + "loss": 0.1124, + "step": 2360 + }, + { + "epoch": 2.9673776662484315, + "grad_norm": 0.3755100276116184, + "learning_rate": 3.793271792686993e-06, + "loss": 0.1122, + "step": 2365 + }, + { + "epoch": 2.973651191969887, + "grad_norm": 0.38213190237560674, + "learning_rate": 3.750436705408311e-06, + "loss": 0.1131, + "step": 2370 + }, + { + "epoch": 2.9799247176913424, + "grad_norm": 0.39502326617269706, + "learning_rate": 3.7077889567180625e-06, + "loss": 0.113, + "step": 2375 + }, + { + "epoch": 2.9861982434127983, + "grad_norm": 0.3907005868892978, + "learning_rate": 3.6653298250366265e-06, + "loss": 0.1121, + "step": 2380 + }, + { + "epoch": 2.9924717691342533, + "grad_norm": 0.3757187294889032, + "learning_rate": 3.6230605831303354e-06, + "loss": 0.1138, + "step": 2385 + }, + { + "epoch": 2.998745294855709, + "grad_norm": 0.36519480053180337, + "learning_rate": 3.5809824980733445e-06, + "loss": 0.1141, + "step": 2390 + }, + { + "epoch": 3.0, + "eval_loss": 0.43099531531333923, + "eval_runtime": 2.3647, + "eval_samples_per_second": 14.801, + "eval_steps_per_second": 0.846, + "step": 2391 + }, + { + "epoch": 3.005018820577164, + "grad_norm": 0.2442197751144262, + "learning_rate": 3.5390968312096396e-06, + "loss": 0.0533, + "step": 2395 + }, + { + "epoch": 3.0112923462986196, + "grad_norm": 0.21528572471833773, + "learning_rate": 3.497404838115219e-06, + "loss": 0.0415, + "step": 2400 + }, + { + "epoch": 3.017565872020075, + "grad_norm": 0.36503732282533347, + "learning_rate": 3.455907768560477e-06, + "loss": 0.0419, + "step": 2405 + }, + { + "epoch": 3.0238393977415305, + "grad_norm": 0.34377984007445206, + "learning_rate": 3.414606866472707e-06, + "loss": 0.0402, + "step": 2410 + }, + { + "epoch": 3.030112923462986, + "grad_norm": 0.3334896061335003, + "learning_rate": 3.373503369898862e-06, + "loss": 0.04, + "step": 2415 + }, + { + "epoch": 3.0363864491844414, + "grad_norm": 0.2723536621745364, + "learning_rate": 3.3325985109683877e-06, + "loss": 0.0396, + "step": 2420 + }, + { + "epoch": 3.042659974905897, + "grad_norm": 0.2691974596202031, + "learning_rate": 3.291893515856334e-06, + "loss": 0.0389, + "step": 2425 + }, + { + "epoch": 3.0489335006273524, + "grad_norm": 0.28187769151055436, + "learning_rate": 3.2513896047465654e-06, + "loss": 0.0379, + "step": 2430 + }, + { + "epoch": 3.055207026348808, + "grad_norm": 0.26156534288635025, + "learning_rate": 3.211087991795201e-06, + "loss": 0.0385, + "step": 2435 + }, + { + "epoch": 3.0614805520702637, + "grad_norm": 0.2632608582642457, + "learning_rate": 3.1709898850942234e-06, + "loss": 0.0381, + "step": 2440 + }, + { + "epoch": 3.067754077791719, + "grad_norm": 0.28572880260972616, + "learning_rate": 3.1310964866352524e-06, + "loss": 0.0389, + "step": 2445 + }, + { + "epoch": 3.0740276035131746, + "grad_norm": 0.28301745878682383, + "learning_rate": 3.0914089922735215e-06, + "loss": 0.0375, + "step": 2450 + }, + { + "epoch": 3.08030112923463, + "grad_norm": 0.29208432776276283, + "learning_rate": 3.051928591692017e-06, + "loss": 0.0381, + "step": 2455 + }, + { + "epoch": 3.0865746549560855, + "grad_norm": 0.2847903910965424, + "learning_rate": 3.012656468365842e-06, + "loss": 0.0368, + "step": 2460 + }, + { + "epoch": 3.092848180677541, + "grad_norm": 0.2966192251884188, + "learning_rate": 2.9735937995267108e-06, + "loss": 0.0371, + "step": 2465 + }, + { + "epoch": 3.0991217063989964, + "grad_norm": 0.2796882316242176, + "learning_rate": 2.9347417561276812e-06, + "loss": 0.0376, + "step": 2470 + }, + { + "epoch": 3.105395232120452, + "grad_norm": 0.3038556064545983, + "learning_rate": 2.8961015028080506e-06, + "loss": 0.0385, + "step": 2475 + }, + { + "epoch": 3.1116687578419073, + "grad_norm": 0.2800625738008763, + "learning_rate": 2.8576741978584265e-06, + "loss": 0.0386, + "step": 2480 + }, + { + "epoch": 3.117942283563363, + "grad_norm": 0.29844382477038445, + "learning_rate": 2.819460993186032e-06, + "loss": 0.0388, + "step": 2485 + }, + { + "epoch": 3.1242158092848182, + "grad_norm": 0.3177106222509896, + "learning_rate": 2.781463034280153e-06, + "loss": 0.0379, + "step": 2490 + }, + { + "epoch": 3.1304893350062737, + "grad_norm": 0.2995753268233054, + "learning_rate": 2.7436814601778174e-06, + "loss": 0.0387, + "step": 2495 + }, + { + "epoch": 3.136762860727729, + "grad_norm": 0.30624087534086814, + "learning_rate": 2.7061174034296434e-06, + "loss": 0.0398, + "step": 2500 + }, + { + "epoch": 3.1430363864491846, + "grad_norm": 0.3087285778258557, + "learning_rate": 2.668771990065884e-06, + "loss": 0.0394, + "step": 2505 + }, + { + "epoch": 3.14930991217064, + "grad_norm": 0.2941075887579034, + "learning_rate": 2.631646339562689e-06, + "loss": 0.0386, + "step": 2510 + }, + { + "epoch": 3.1555834378920955, + "grad_norm": 0.3118597830883225, + "learning_rate": 2.594741564808527e-06, + "loss": 0.0397, + "step": 2515 + }, + { + "epoch": 3.161856963613551, + "grad_norm": 0.3208254308061256, + "learning_rate": 2.558058772070846e-06, + "loss": 0.0385, + "step": 2520 + }, + { + "epoch": 3.1681304893350064, + "grad_norm": 0.26274648640285136, + "learning_rate": 2.521599060962895e-06, + "loss": 0.039, + "step": 2525 + }, + { + "epoch": 3.174404015056462, + "grad_norm": 0.3244783368883984, + "learning_rate": 2.4853635244107743e-06, + "loss": 0.0373, + "step": 2530 + }, + { + "epoch": 3.1806775407779173, + "grad_norm": 0.3321834864243195, + "learning_rate": 2.449353248620657e-06, + "loss": 0.0369, + "step": 2535 + }, + { + "epoch": 3.1869510664993728, + "grad_norm": 0.298380472148583, + "learning_rate": 2.41356931304625e-06, + "loss": 0.0383, + "step": 2540 + }, + { + "epoch": 3.193224592220828, + "grad_norm": 0.2887212580399006, + "learning_rate": 2.37801279035642e-06, + "loss": 0.0383, + "step": 2545 + }, + { + "epoch": 3.1994981179422837, + "grad_norm": 0.2685957992866305, + "learning_rate": 2.342684746403037e-06, + "loss": 0.0382, + "step": 2550 + }, + { + "epoch": 3.205771643663739, + "grad_norm": 0.3137213818837783, + "learning_rate": 2.307586240189049e-06, + "loss": 0.04, + "step": 2555 + }, + { + "epoch": 3.2120451693851946, + "grad_norm": 0.29781867047646216, + "learning_rate": 2.272718323836701e-06, + "loss": 0.0379, + "step": 2560 + }, + { + "epoch": 3.21831869510665, + "grad_norm": 0.28001089505480686, + "learning_rate": 2.238082042556029e-06, + "loss": 0.0382, + "step": 2565 + }, + { + "epoch": 3.2245922208281055, + "grad_norm": 0.31914940913340023, + "learning_rate": 2.2036784346134976e-06, + "loss": 0.0376, + "step": 2570 + }, + { + "epoch": 3.230865746549561, + "grad_norm": 0.2728867395710102, + "learning_rate": 2.169508531300908e-06, + "loss": 0.0382, + "step": 2575 + }, + { + "epoch": 3.2371392722710164, + "grad_norm": 0.30042453420426346, + "learning_rate": 2.1355733569044633e-06, + "loss": 0.0389, + "step": 2580 + }, + { + "epoch": 3.243412797992472, + "grad_norm": 0.2752139872422629, + "learning_rate": 2.101873928674064e-06, + "loss": 0.0374, + "step": 2585 + }, + { + "epoch": 3.2496863237139273, + "grad_norm": 0.3662305601638148, + "learning_rate": 2.0684112567928314e-06, + "loss": 0.0369, + "step": 2590 + }, + { + "epoch": 3.2559598494353827, + "grad_norm": 0.2853508636394266, + "learning_rate": 2.035186344346801e-06, + "loss": 0.0379, + "step": 2595 + }, + { + "epoch": 3.262233375156838, + "grad_norm": 0.2909774250341541, + "learning_rate": 2.0022001872948814e-06, + "loss": 0.0374, + "step": 2600 + }, + { + "epoch": 3.2685069008782937, + "grad_norm": 0.303083557347797, + "learning_rate": 1.9694537744389754e-06, + "loss": 0.0372, + "step": 2605 + }, + { + "epoch": 3.274780426599749, + "grad_norm": 0.2837072081387949, + "learning_rate": 1.9369480873943524e-06, + "loss": 0.037, + "step": 2610 + }, + { + "epoch": 3.2810539523212046, + "grad_norm": 0.293463671873792, + "learning_rate": 1.9046841005602268e-06, + "loss": 0.0368, + "step": 2615 + }, + { + "epoch": 3.28732747804266, + "grad_norm": 0.29070250069814846, + "learning_rate": 1.8726627810905284e-06, + "loss": 0.037, + "step": 2620 + }, + { + "epoch": 3.2936010037641155, + "grad_norm": 0.28990000456159315, + "learning_rate": 1.8408850888649398e-06, + "loss": 0.0368, + "step": 2625 + }, + { + "epoch": 3.299874529485571, + "grad_norm": 0.2768135817440434, + "learning_rate": 1.8093519764600931e-06, + "loss": 0.0381, + "step": 2630 + }, + { + "epoch": 3.3061480552070264, + "grad_norm": 0.3106851238797327, + "learning_rate": 1.778064389121048e-06, + "loss": 0.0373, + "step": 2635 + }, + { + "epoch": 3.312421580928482, + "grad_norm": 0.26724960926250796, + "learning_rate": 1.7470232647329222e-06, + "loss": 0.0382, + "step": 2640 + }, + { + "epoch": 3.3186951066499373, + "grad_norm": 0.3030133716572175, + "learning_rate": 1.7162295337928036e-06, + "loss": 0.0384, + "step": 2645 + }, + { + "epoch": 3.3249686323713927, + "grad_norm": 0.31346010178385275, + "learning_rate": 1.685684119381844e-06, + "loss": 0.038, + "step": 2650 + }, + { + "epoch": 3.331242158092848, + "grad_norm": 0.28444292176954256, + "learning_rate": 1.655387937137589e-06, + "loss": 0.0363, + "step": 2655 + }, + { + "epoch": 3.3375156838143036, + "grad_norm": 0.29308964877533195, + "learning_rate": 1.6253418952265398e-06, + "loss": 0.0372, + "step": 2660 + }, + { + "epoch": 3.343789209535759, + "grad_norm": 0.29189420850538267, + "learning_rate": 1.5955468943169217e-06, + "loss": 0.0377, + "step": 2665 + }, + { + "epoch": 3.3500627352572145, + "grad_norm": 0.3102531079304282, + "learning_rate": 1.5660038275516898e-06, + "loss": 0.0355, + "step": 2670 + }, + { + "epoch": 3.35633626097867, + "grad_norm": 0.27790750204144, + "learning_rate": 1.536713580521746e-06, + "loss": 0.0372, + "step": 2675 + }, + { + "epoch": 3.3626097867001254, + "grad_norm": 0.3226354962908228, + "learning_rate": 1.5076770312394096e-06, + "loss": 0.0369, + "step": 2680 + }, + { + "epoch": 3.368883312421581, + "grad_norm": 0.26898149132297955, + "learning_rate": 1.4788950501120781e-06, + "loss": 0.0382, + "step": 2685 + }, + { + "epoch": 3.3751568381430364, + "grad_norm": 0.3188502485680553, + "learning_rate": 1.450368499916155e-06, + "loss": 0.0381, + "step": 2690 + }, + { + "epoch": 3.381430363864492, + "grad_norm": 0.3007960797574723, + "learning_rate": 1.4220982357711743e-06, + "loss": 0.0372, + "step": 2695 + }, + { + "epoch": 3.3877038895859473, + "grad_norm": 0.29650117453341984, + "learning_rate": 1.3940851051141646e-06, + "loss": 0.0377, + "step": 2700 + }, + { + "epoch": 3.3939774153074027, + "grad_norm": 0.29709762351304697, + "learning_rate": 1.366329947674263e-06, + "loss": 0.0372, + "step": 2705 + }, + { + "epoch": 3.400250941028858, + "grad_norm": 0.2951278070367851, + "learning_rate": 1.3388335954475207e-06, + "loss": 0.0375, + "step": 2710 + }, + { + "epoch": 3.4065244667503136, + "grad_norm": 0.27968586803836637, + "learning_rate": 1.3115968726719819e-06, + "loss": 0.0359, + "step": 2715 + }, + { + "epoch": 3.412797992471769, + "grad_norm": 0.30665692772003755, + "learning_rate": 1.284620595802969e-06, + "loss": 0.0376, + "step": 2720 + }, + { + "epoch": 3.4190715181932245, + "grad_norm": 0.3166229935564368, + "learning_rate": 1.2579055734886004e-06, + "loss": 0.0361, + "step": 2725 + }, + { + "epoch": 3.42534504391468, + "grad_norm": 0.28812682524392, + "learning_rate": 1.2314526065455678e-06, + "loss": 0.0371, + "step": 2730 + }, + { + "epoch": 3.4316185696361354, + "grad_norm": 0.2962918652302945, + "learning_rate": 1.2052624879351105e-06, + "loss": 0.0375, + "step": 2735 + }, + { + "epoch": 3.437892095357591, + "grad_norm": 0.2959207573782531, + "learning_rate": 1.179336002739263e-06, + "loss": 0.0368, + "step": 2740 + }, + { + "epoch": 3.4441656210790463, + "grad_norm": 0.3222288463448185, + "learning_rate": 1.1536739281373122e-06, + "loss": 0.0366, + "step": 2745 + }, + { + "epoch": 3.450439146800502, + "grad_norm": 0.2924594477361343, + "learning_rate": 1.1282770333825022e-06, + "loss": 0.0375, + "step": 2750 + }, + { + "epoch": 3.4567126725219572, + "grad_norm": 0.32299277919132047, + "learning_rate": 1.1031460797789718e-06, + "loss": 0.0374, + "step": 2755 + }, + { + "epoch": 3.4629861982434127, + "grad_norm": 0.3109436060749281, + "learning_rate": 1.0782818206589375e-06, + "loss": 0.0367, + "step": 2760 + }, + { + "epoch": 3.469259723964868, + "grad_norm": 0.27272406454567366, + "learning_rate": 1.053685001360112e-06, + "loss": 0.0356, + "step": 2765 + }, + { + "epoch": 3.4755332496863236, + "grad_norm": 0.30006540518061847, + "learning_rate": 1.0293563592033595e-06, + "loss": 0.0366, + "step": 2770 + }, + { + "epoch": 3.481806775407779, + "grad_norm": 0.28002730472344367, + "learning_rate": 1.0052966234705953e-06, + "loss": 0.0383, + "step": 2775 + }, + { + "epoch": 3.4880803011292345, + "grad_norm": 0.32173748821041304, + "learning_rate": 9.815065153829195e-07, + "loss": 0.0366, + "step": 2780 + }, + { + "epoch": 3.49435382685069, + "grad_norm": 0.3042093630820431, + "learning_rate": 9.579867480790061e-07, + "loss": 0.0369, + "step": 2785 + }, + { + "epoch": 3.5006273525721454, + "grad_norm": 0.2975724581779801, + "learning_rate": 9.347380265937167e-07, + "loss": 0.0361, + "step": 2790 + }, + { + "epoch": 3.506900878293601, + "grad_norm": 0.2893317769414031, + "learning_rate": 9.117610478369743e-07, + "loss": 0.0383, + "step": 2795 + }, + { + "epoch": 3.5131744040150563, + "grad_norm": 0.30137847743662044, + "learning_rate": 8.890565005728691e-07, + "loss": 0.0384, + "step": 2800 + }, + { + "epoch": 3.5194479297365118, + "grad_norm": 0.2713586307285477, + "learning_rate": 8.666250653990071e-07, + "loss": 0.0364, + "step": 2805 + }, + { + "epoch": 3.5257214554579672, + "grad_norm": 0.2749487801394873, + "learning_rate": 8.44467414726119e-07, + "loss": 0.0353, + "step": 2810 + }, + { + "epoch": 3.5319949811794227, + "grad_norm": 0.3243183464549784, + "learning_rate": 8.225842127578909e-07, + "loss": 0.0369, + "step": 2815 + }, + { + "epoch": 3.538268506900878, + "grad_norm": 0.2890309082875547, + "learning_rate": 8.009761154710671e-07, + "loss": 0.0369, + "step": 2820 + }, + { + "epoch": 3.544542032622334, + "grad_norm": 0.2879952226873715, + "learning_rate": 7.796437705957782e-07, + "loss": 0.0368, + "step": 2825 + }, + { + "epoch": 3.550815558343789, + "grad_norm": 0.2696320382813176, + "learning_rate": 7.585878175961237e-07, + "loss": 0.0371, + "step": 2830 + }, + { + "epoch": 3.557089084065245, + "grad_norm": 0.2936735805318314, + "learning_rate": 7.378088876510092e-07, + "loss": 0.0376, + "step": 2835 + }, + { + "epoch": 3.5633626097867, + "grad_norm": 0.25416101821322773, + "learning_rate": 7.1730760363522e-07, + "loss": 0.0352, + "step": 2840 + }, + { + "epoch": 3.569636135508156, + "grad_norm": 0.30300674321039756, + "learning_rate": 6.970845801007564e-07, + "loss": 0.0353, + "step": 2845 + }, + { + "epoch": 3.575909661229611, + "grad_norm": 0.2732998588656073, + "learning_rate": 6.771404232584011e-07, + "loss": 0.0362, + "step": 2850 + }, + { + "epoch": 3.5821831869510667, + "grad_norm": 0.3795076600221709, + "learning_rate": 6.574757309595636e-07, + "loss": 0.0361, + "step": 2855 + }, + { + "epoch": 3.5884567126725218, + "grad_norm": 0.28624108454045416, + "learning_rate": 6.380910926783402e-07, + "loss": 0.0363, + "step": 2860 + }, + { + "epoch": 3.5947302383939777, + "grad_norm": 0.2674910858708607, + "learning_rate": 6.189870894938587e-07, + "loss": 0.0353, + "step": 2865 + }, + { + "epoch": 3.6010037641154327, + "grad_norm": 0.2789232394059912, + "learning_rate": 6.001642940728503e-07, + "loss": 0.0354, + "step": 2870 + }, + { + "epoch": 3.6072772898368886, + "grad_norm": 0.30882594178964384, + "learning_rate": 5.816232706524838e-07, + "loss": 0.0366, + "step": 2875 + }, + { + "epoch": 3.6135508155583436, + "grad_norm": 0.2806952241750321, + "learning_rate": 5.63364575023465e-07, + "loss": 0.0361, + "step": 2880 + }, + { + "epoch": 3.6198243412797995, + "grad_norm": 0.30164828030350377, + "learning_rate": 5.453887545133563e-07, + "loss": 0.0378, + "step": 2885 + }, + { + "epoch": 3.6260978670012545, + "grad_norm": 0.2679619622153946, + "learning_rate": 5.276963479701857e-07, + "loss": 0.0358, + "step": 2890 + }, + { + "epoch": 3.6323713927227104, + "grad_norm": 0.3231146432041487, + "learning_rate": 5.102878857462811e-07, + "loss": 0.0389, + "step": 2895 + }, + { + "epoch": 3.6386449184441654, + "grad_norm": 0.2779119069062509, + "learning_rate": 4.931638896823876e-07, + "loss": 0.0384, + "step": 2900 + }, + { + "epoch": 3.6449184441656213, + "grad_norm": 0.2722147852694776, + "learning_rate": 4.763248730920089e-07, + "loss": 0.0358, + "step": 2905 + }, + { + "epoch": 3.6511919698870763, + "grad_norm": 0.293829432405976, + "learning_rate": 4.5977134074603246e-07, + "loss": 0.037, + "step": 2910 + }, + { + "epoch": 3.657465495608532, + "grad_norm": 0.3065193645356902, + "learning_rate": 4.4350378885759105e-07, + "loss": 0.0371, + "step": 2915 + }, + { + "epoch": 3.663739021329987, + "grad_norm": 0.28257044296271877, + "learning_rate": 4.275227050671904e-07, + "loss": 0.0364, + "step": 2920 + }, + { + "epoch": 3.670012547051443, + "grad_norm": 0.2782812011499114, + "learning_rate": 4.1182856842809204e-07, + "loss": 0.0348, + "step": 2925 + }, + { + "epoch": 3.676286072772898, + "grad_norm": 0.28872652510766195, + "learning_rate": 3.964218493919525e-07, + "loss": 0.0361, + "step": 2930 + }, + { + "epoch": 3.682559598494354, + "grad_norm": 0.3053021191496778, + "learning_rate": 3.813030097947212e-07, + "loss": 0.0357, + "step": 2935 + }, + { + "epoch": 3.6888331242158094, + "grad_norm": 0.3648251833459472, + "learning_rate": 3.6647250284279735e-07, + "loss": 0.0404, + "step": 2940 + }, + { + "epoch": 3.695106649937265, + "grad_norm": 0.3319781395595929, + "learning_rate": 3.5193077309943923e-07, + "loss": 0.0403, + "step": 2945 + }, + { + "epoch": 3.7013801756587204, + "grad_norm": 0.3085438905662011, + "learning_rate": 3.376782564714476e-07, + "loss": 0.0369, + "step": 2950 + }, + { + "epoch": 3.707653701380176, + "grad_norm": 0.2849277889599658, + "learning_rate": 3.237153801960868e-07, + "loss": 0.0362, + "step": 2955 + }, + { + "epoch": 3.7139272271016313, + "grad_norm": 0.31923000028121545, + "learning_rate": 3.100425628282899e-07, + "loss": 0.0369, + "step": 2960 + }, + { + "epoch": 3.7202007528230867, + "grad_norm": 0.2929761574906958, + "learning_rate": 2.9666021422810274e-07, + "loss": 0.0369, + "step": 2965 + }, + { + "epoch": 3.726474278544542, + "grad_norm": 0.2992636928106286, + "learning_rate": 2.8356873554840514e-07, + "loss": 0.0364, + "step": 2970 + }, + { + "epoch": 3.7327478042659976, + "grad_norm": 0.301134189273727, + "learning_rate": 2.7076851922287704e-07, + "loss": 0.0354, + "step": 2975 + }, + { + "epoch": 3.739021329987453, + "grad_norm": 0.2943853551390973, + "learning_rate": 2.5825994895424255e-07, + "loss": 0.0346, + "step": 2980 + }, + { + "epoch": 3.7452948557089085, + "grad_norm": 0.2920091794037564, + "learning_rate": 2.460433997027634e-07, + "loss": 0.0377, + "step": 2985 + }, + { + "epoch": 3.751568381430364, + "grad_norm": 0.27495401738635517, + "learning_rate": 2.3411923767500455e-07, + "loss": 0.0378, + "step": 2990 + }, + { + "epoch": 3.7578419071518194, + "grad_norm": 0.39264097272429527, + "learning_rate": 2.224878203128511e-07, + "loss": 0.037, + "step": 2995 + }, + { + "epoch": 3.764115432873275, + "grad_norm": 0.30250413845451674, + "learning_rate": 2.1114949628279201e-07, + "loss": 0.0366, + "step": 3000 + }, + { + "epoch": 3.7703889585947303, + "grad_norm": 0.35611796959076564, + "learning_rate": 2.001046054654776e-07, + "loss": 0.0371, + "step": 3005 + }, + { + "epoch": 3.776662484316186, + "grad_norm": 0.2561175523575295, + "learning_rate": 1.893534789455209e-07, + "loss": 0.0351, + "step": 3010 + }, + { + "epoch": 3.7829360100376412, + "grad_norm": 0.28295745535344086, + "learning_rate": 1.7889643900158016e-07, + "loss": 0.0361, + "step": 3015 + }, + { + "epoch": 3.7892095357590967, + "grad_norm": 0.28512419406398154, + "learning_rate": 1.6873379909669307e-07, + "loss": 0.0376, + "step": 3020 + }, + { + "epoch": 3.795483061480552, + "grad_norm": 0.30485457369447977, + "learning_rate": 1.5886586386888449e-07, + "loss": 0.0377, + "step": 3025 + }, + { + "epoch": 3.8017565872020076, + "grad_norm": 0.2901818797318041, + "learning_rate": 1.4929292912203354e-07, + "loss": 0.0357, + "step": 3030 + }, + { + "epoch": 3.808030112923463, + "grad_norm": 0.2667781630207582, + "learning_rate": 1.4001528181700196e-07, + "loss": 0.0382, + "step": 3035 + }, + { + "epoch": 3.8143036386449185, + "grad_norm": 0.3034166947926825, + "learning_rate": 1.3103320006303766e-07, + "loss": 0.0382, + "step": 3040 + }, + { + "epoch": 3.820577164366374, + "grad_norm": 0.3119770203364585, + "learning_rate": 1.2234695310944012e-07, + "loss": 0.0367, + "step": 3045 + }, + { + "epoch": 3.8268506900878294, + "grad_norm": 0.31429783494460234, + "learning_rate": 1.1395680133747811e-07, + "loss": 0.036, + "step": 3050 + }, + { + "epoch": 3.833124215809285, + "grad_norm": 0.2655885847696681, + "learning_rate": 1.0586299625259699e-07, + "loss": 0.0372, + "step": 3055 + }, + { + "epoch": 3.8393977415307403, + "grad_norm": 0.37692736629023765, + "learning_rate": 9.806578047687254e-08, + "loss": 0.0401, + "step": 3060 + }, + { + "epoch": 3.8456712672521958, + "grad_norm": 0.28037362911434216, + "learning_rate": 9.056538774174117e-08, + "loss": 0.0383, + "step": 3065 + }, + { + "epoch": 3.851944792973651, + "grad_norm": 0.2813649801748803, + "learning_rate": 8.336204288098671e-08, + "loss": 0.0409, + "step": 3070 + }, + { + "epoch": 3.8582183186951067, + "grad_norm": 0.28158812132522376, + "learning_rate": 7.64559618240146e-08, + "loss": 0.0371, + "step": 3075 + }, + { + "epoch": 3.864491844416562, + "grad_norm": 0.29334208788094884, + "learning_rate": 6.984735158936384e-08, + "loss": 0.0377, + "step": 3080 + }, + { + "epoch": 3.8707653701380176, + "grad_norm": 0.2776501474412646, + "learning_rate": 6.353641027850965e-08, + "loss": 0.0368, + "step": 3085 + }, + { + "epoch": 3.877038895859473, + "grad_norm": 0.27164861879214747, + "learning_rate": 5.7523327069926024e-08, + "loss": 0.0371, + "step": 3090 + }, + { + "epoch": 3.8833124215809285, + "grad_norm": 0.26782362958737654, + "learning_rate": 5.1808282213410276e-08, + "loss": 0.0366, + "step": 3095 + }, + { + "epoch": 3.889585947302384, + "grad_norm": 0.31086719768707505, + "learning_rate": 4.63914470246829e-08, + "loss": 0.0361, + "step": 3100 + }, + { + "epoch": 3.8958594730238394, + "grad_norm": 0.30662847914316993, + "learning_rate": 4.1272983880249476e-08, + "loss": 0.0374, + "step": 3105 + }, + { + "epoch": 3.902132998745295, + "grad_norm": 0.2915681277032509, + "learning_rate": 3.645304621253787e-08, + "loss": 0.0365, + "step": 3110 + }, + { + "epoch": 3.9084065244667503, + "grad_norm": 0.31776081537688317, + "learning_rate": 3.193177850529416e-08, + "loss": 0.0352, + "step": 3115 + }, + { + "epoch": 3.9146800501882058, + "grad_norm": 0.26314408385807814, + "learning_rate": 2.7709316289253885e-08, + "loss": 0.0362, + "step": 3120 + }, + { + "epoch": 3.920953575909661, + "grad_norm": 0.30353550353186937, + "learning_rate": 2.378578613807969e-08, + "loss": 0.0367, + "step": 3125 + }, + { + "epoch": 3.9272271016311167, + "grad_norm": 0.3008469541365023, + "learning_rate": 2.0161305664563312e-08, + "loss": 0.0377, + "step": 3130 + }, + { + "epoch": 3.933500627352572, + "grad_norm": 0.29989474797608723, + "learning_rate": 1.6835983517108357e-08, + "loss": 0.0364, + "step": 3135 + }, + { + "epoch": 3.9397741530740276, + "grad_norm": 0.3007906674460069, + "learning_rate": 1.3809919376461811e-08, + "loss": 0.0367, + "step": 3140 + }, + { + "epoch": 3.946047678795483, + "grad_norm": 0.2822359703350314, + "learning_rate": 1.1083203952737543e-08, + "loss": 0.0371, + "step": 3145 + }, + { + "epoch": 3.9523212045169385, + "grad_norm": 0.32240083725231283, + "learning_rate": 8.655918982689582e-09, + "loss": 0.0367, + "step": 3150 + }, + { + "epoch": 3.958594730238394, + "grad_norm": 0.27144908135189433, + "learning_rate": 6.528137227262976e-09, + "loss": 0.0368, + "step": 3155 + }, + { + "epoch": 3.9648682559598494, + "grad_norm": 0.29276670822521234, + "learning_rate": 4.6999224694166405e-09, + "loss": 0.0363, + "step": 3160 + }, + { + "epoch": 3.971141781681305, + "grad_norm": 0.2920658294802816, + "learning_rate": 3.1713295122071107e-09, + "loss": 0.0352, + "step": 3165 + }, + { + "epoch": 3.9774153074027603, + "grad_norm": 0.29999904692777374, + "learning_rate": 1.9424041771465286e-09, + "loss": 0.0354, + "step": 3170 + }, + { + "epoch": 3.9836888331242157, + "grad_norm": 0.28940456234051576, + "learning_rate": 1.013183302832621e-09, + "loss": 0.0379, + "step": 3175 + }, + { + "epoch": 3.989962358845671, + "grad_norm": 0.28743205163263064, + "learning_rate": 3.8369474383848083e-10, + "loss": 0.0349, + "step": 3180 + }, + { + "epoch": 3.9962358845671266, + "grad_norm": 0.3004497086648703, + "learning_rate": 5.395736988322853e-11, + "loss": 0.0363, + "step": 3185 + }, + { + "epoch": 4.0, + "eval_loss": 0.47916728258132935, + "eval_runtime": 2.4435, + "eval_samples_per_second": 14.324, + "eval_steps_per_second": 0.818, + "step": 3188 + }, + { + "epoch": 4.0, + "step": 3188, + "total_flos": 2680278636036096.0, + "train_loss": 0.20304497943459596, + "train_runtime": 21214.9362, + "train_samples_per_second": 4.806, + "train_steps_per_second": 0.15 + } + ], + "logging_steps": 5, + "max_steps": 3188, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 319, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2680278636036096.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}