diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14625 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 9123, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005480653293872629, + "grad_norm": 9.609877586364746, + "learning_rate": 4.999996294265421e-05, + "loss": 5.868, + "num_input_tokens_seen": 3944, + "step": 5 + }, + { + "epoch": 0.0010961306587745259, + "grad_norm": 8.435359001159668, + "learning_rate": 4.999985177072669e-05, + "loss": 5.1519, + "num_input_tokens_seen": 7552, + "step": 10 + }, + { + "epoch": 0.001644195988161789, + "grad_norm": 4.555312156677246, + "learning_rate": 4.999966648454702e-05, + "loss": 4.5297, + "num_input_tokens_seen": 10552, + "step": 15 + }, + { + "epoch": 0.0021922613175490518, + "grad_norm": 5.34758186340332, + "learning_rate": 4.9999407084664514e-05, + "loss": 4.1016, + "num_input_tokens_seen": 14720, + "step": 20 + }, + { + "epoch": 0.002740326646936315, + "grad_norm": 4.284458160400391, + "learning_rate": 4.999907357184816e-05, + "loss": 4.0075, + "num_input_tokens_seen": 17648, + "step": 25 + }, + { + "epoch": 0.003288391976323578, + "grad_norm": 6.062355041503906, + "learning_rate": 4.99986659470867e-05, + "loss": 3.9682, + "num_input_tokens_seen": 21192, + "step": 30 + }, + { + "epoch": 0.003836457305710841, + "grad_norm": 3.1782262325286865, + "learning_rate": 4.9998184211588574e-05, + "loss": 3.6158, + "num_input_tokens_seen": 24680, + "step": 35 + }, + { + "epoch": 0.0043845226350981035, + "grad_norm": 4.492194652557373, + "learning_rate": 4.999762836678192e-05, + "loss": 4.4312, + "num_input_tokens_seen": 27304, + "step": 40 + }, + { + "epoch": 0.004932587964485367, + "grad_norm": 4.35511589050293, + "learning_rate": 4.99969984143146e-05, + "loss": 4.0391, + "num_input_tokens_seen": 29824, + "step": 45 + }, + { + "epoch": 0.00548065329387263, + "grad_norm": 4.070927619934082, + "learning_rate": 4.999629435605416e-05, + "loss": 3.9559, + "num_input_tokens_seen": 32496, + "step": 50 + }, + { + "epoch": 0.006028718623259892, + "grad_norm": 3.5581634044647217, + "learning_rate": 4.9995516194087845e-05, + "loss": 3.6342, + "num_input_tokens_seen": 35624, + "step": 55 + }, + { + "epoch": 0.006576783952647156, + "grad_norm": 3.646406888961792, + "learning_rate": 4.999466393072258e-05, + "loss": 3.8581, + "num_input_tokens_seen": 38896, + "step": 60 + }, + { + "epoch": 0.007124849282034418, + "grad_norm": 3.964329719543457, + "learning_rate": 4.9993737568484967e-05, + "loss": 4.0054, + "num_input_tokens_seen": 42736, + "step": 65 + }, + { + "epoch": 0.007672914611421682, + "grad_norm": 4.500335693359375, + "learning_rate": 4.99927371101213e-05, + "loss": 3.3325, + "num_input_tokens_seen": 45256, + "step": 70 + }, + { + "epoch": 0.008220979940808944, + "grad_norm": 4.3628315925598145, + "learning_rate": 4.999166255859752e-05, + "loss": 3.5725, + "num_input_tokens_seen": 48576, + "step": 75 + }, + { + "epoch": 0.008769045270196207, + "grad_norm": 3.4167840480804443, + "learning_rate": 4.9990513917099225e-05, + "loss": 3.7729, + "num_input_tokens_seen": 52736, + "step": 80 + }, + { + "epoch": 0.00931711059958347, + "grad_norm": 4.027678489685059, + "learning_rate": 4.998929118903167e-05, + "loss": 3.7879, + "num_input_tokens_seen": 56256, + "step": 85 + }, + { + "epoch": 0.009865175928970734, + "grad_norm": 4.3075056076049805, + "learning_rate": 4.9987994378019746e-05, + "loss": 3.5822, + "num_input_tokens_seen": 59448, + "step": 90 + }, + { + "epoch": 0.010413241258357997, + "grad_norm": 3.550978899002075, + "learning_rate": 4.9986623487907955e-05, + "loss": 3.8015, + "num_input_tokens_seen": 63424, + "step": 95 + }, + { + "epoch": 0.01096130658774526, + "grad_norm": 3.6582727432250977, + "learning_rate": 4.998517852276042e-05, + "loss": 3.7712, + "num_input_tokens_seen": 66720, + "step": 100 + }, + { + "epoch": 0.011509371917132522, + "grad_norm": 5.284353733062744, + "learning_rate": 4.9983659486860865e-05, + "loss": 3.5192, + "num_input_tokens_seen": 69280, + "step": 105 + }, + { + "epoch": 0.012057437246519784, + "grad_norm": 3.712407350540161, + "learning_rate": 4.998206638471261e-05, + "loss": 3.9006, + "num_input_tokens_seen": 72488, + "step": 110 + }, + { + "epoch": 0.012605502575907049, + "grad_norm": 5.380141258239746, + "learning_rate": 4.9980399221038544e-05, + "loss": 3.7691, + "num_input_tokens_seen": 75728, + "step": 115 + }, + { + "epoch": 0.013153567905294311, + "grad_norm": 6.7210693359375, + "learning_rate": 4.997865800078112e-05, + "loss": 3.4306, + "num_input_tokens_seen": 78456, + "step": 120 + }, + { + "epoch": 0.013701633234681574, + "grad_norm": 3.6822457313537598, + "learning_rate": 4.997684272910233e-05, + "loss": 3.7098, + "num_input_tokens_seen": 81912, + "step": 125 + }, + { + "epoch": 0.014249698564068837, + "grad_norm": 4.587904453277588, + "learning_rate": 4.997495341138373e-05, + "loss": 3.7503, + "num_input_tokens_seen": 85768, + "step": 130 + }, + { + "epoch": 0.0147977638934561, + "grad_norm": 4.4221510887146, + "learning_rate": 4.997299005322634e-05, + "loss": 3.6916, + "num_input_tokens_seen": 89744, + "step": 135 + }, + { + "epoch": 0.015345829222843364, + "grad_norm": 4.955567359924316, + "learning_rate": 4.9970952660450734e-05, + "loss": 3.8345, + "num_input_tokens_seen": 93584, + "step": 140 + }, + { + "epoch": 0.015893894552230625, + "grad_norm": 3.8360307216644287, + "learning_rate": 4.996884123909692e-05, + "loss": 3.8622, + "num_input_tokens_seen": 96880, + "step": 145 + }, + { + "epoch": 0.01644195988161789, + "grad_norm": 4.293831825256348, + "learning_rate": 4.996665579542439e-05, + "loss": 3.6978, + "num_input_tokens_seen": 99736, + "step": 150 + }, + { + "epoch": 0.016990025211005153, + "grad_norm": 3.8615922927856445, + "learning_rate": 4.99643963359121e-05, + "loss": 3.7886, + "num_input_tokens_seen": 102768, + "step": 155 + }, + { + "epoch": 0.017538090540392414, + "grad_norm": 4.592337608337402, + "learning_rate": 4.996206286725841e-05, + "loss": 3.4776, + "num_input_tokens_seen": 107960, + "step": 160 + }, + { + "epoch": 0.01808615586977968, + "grad_norm": 5.695650577545166, + "learning_rate": 4.995965539638108e-05, + "loss": 3.9904, + "num_input_tokens_seen": 110712, + "step": 165 + }, + { + "epoch": 0.01863422119916694, + "grad_norm": 6.341024398803711, + "learning_rate": 4.995717393041729e-05, + "loss": 3.727, + "num_input_tokens_seen": 114496, + "step": 170 + }, + { + "epoch": 0.019182286528554204, + "grad_norm": 5.523504734039307, + "learning_rate": 4.995461847672354e-05, + "loss": 3.5366, + "num_input_tokens_seen": 118408, + "step": 175 + }, + { + "epoch": 0.019730351857941468, + "grad_norm": 4.576908111572266, + "learning_rate": 4.995198904287572e-05, + "loss": 3.4552, + "num_input_tokens_seen": 122024, + "step": 180 + }, + { + "epoch": 0.02027841718732873, + "grad_norm": 4.912643909454346, + "learning_rate": 4.9949285636669e-05, + "loss": 3.878, + "num_input_tokens_seen": 125680, + "step": 185 + }, + { + "epoch": 0.020826482516715993, + "grad_norm": 3.790379047393799, + "learning_rate": 4.994650826611787e-05, + "loss": 3.7852, + "num_input_tokens_seen": 129056, + "step": 190 + }, + { + "epoch": 0.021374547846103254, + "grad_norm": 4.877086162567139, + "learning_rate": 4.9943656939456094e-05, + "loss": 3.7977, + "num_input_tokens_seen": 132072, + "step": 195 + }, + { + "epoch": 0.02192261317549052, + "grad_norm": 4.675802230834961, + "learning_rate": 4.994073166513667e-05, + "loss": 3.6024, + "num_input_tokens_seen": 134448, + "step": 200 + }, + { + "epoch": 0.022470678504877783, + "grad_norm": 9.45524787902832, + "learning_rate": 4.9937732451831845e-05, + "loss": 3.9247, + "num_input_tokens_seen": 137808, + "step": 205 + }, + { + "epoch": 0.023018743834265044, + "grad_norm": 4.349103927612305, + "learning_rate": 4.9934659308433024e-05, + "loss": 3.5971, + "num_input_tokens_seen": 140752, + "step": 210 + }, + { + "epoch": 0.023566809163652308, + "grad_norm": 3.90029239654541, + "learning_rate": 4.993151224405084e-05, + "loss": 3.656, + "num_input_tokens_seen": 143328, + "step": 215 + }, + { + "epoch": 0.02411487449303957, + "grad_norm": 3.4128267765045166, + "learning_rate": 4.992829126801502e-05, + "loss": 3.7457, + "num_input_tokens_seen": 146792, + "step": 220 + }, + { + "epoch": 0.024662939822426833, + "grad_norm": 5.266091346740723, + "learning_rate": 4.9924996389874435e-05, + "loss": 3.3972, + "num_input_tokens_seen": 150352, + "step": 225 + }, + { + "epoch": 0.025211005151814098, + "grad_norm": 3.7570605278015137, + "learning_rate": 4.992162761939704e-05, + "loss": 2.8386, + "num_input_tokens_seen": 153688, + "step": 230 + }, + { + "epoch": 0.02575907048120136, + "grad_norm": 3.587785243988037, + "learning_rate": 4.991818496656986e-05, + "loss": 3.909, + "num_input_tokens_seen": 156824, + "step": 235 + }, + { + "epoch": 0.026307135810588623, + "grad_norm": 4.7243757247924805, + "learning_rate": 4.991466844159893e-05, + "loss": 3.7806, + "num_input_tokens_seen": 159728, + "step": 240 + }, + { + "epoch": 0.026855201139975884, + "grad_norm": 4.537757396697998, + "learning_rate": 4.99110780549093e-05, + "loss": 3.7949, + "num_input_tokens_seen": 162456, + "step": 245 + }, + { + "epoch": 0.027403266469363148, + "grad_norm": 5.187793731689453, + "learning_rate": 4.990741381714498e-05, + "loss": 3.7304, + "num_input_tokens_seen": 165176, + "step": 250 + }, + { + "epoch": 0.027951331798750412, + "grad_norm": 5.144887447357178, + "learning_rate": 4.990367573916894e-05, + "loss": 3.7232, + "num_input_tokens_seen": 168824, + "step": 255 + }, + { + "epoch": 0.028499397128137673, + "grad_norm": 5.238748550415039, + "learning_rate": 4.989986383206302e-05, + "loss": 3.5484, + "num_input_tokens_seen": 172512, + "step": 260 + }, + { + "epoch": 0.029047462457524938, + "grad_norm": 4.251674652099609, + "learning_rate": 4.9895978107127975e-05, + "loss": 3.3929, + "num_input_tokens_seen": 175544, + "step": 265 + }, + { + "epoch": 0.0295955277869122, + "grad_norm": 7.541206359863281, + "learning_rate": 4.9892018575883354e-05, + "loss": 3.5038, + "num_input_tokens_seen": 178784, + "step": 270 + }, + { + "epoch": 0.030143593116299463, + "grad_norm": 3.8806400299072266, + "learning_rate": 4.988798525006755e-05, + "loss": 3.9488, + "num_input_tokens_seen": 181112, + "step": 275 + }, + { + "epoch": 0.030691658445686727, + "grad_norm": 3.7918715476989746, + "learning_rate": 4.988387814163771e-05, + "loss": 3.4375, + "num_input_tokens_seen": 185416, + "step": 280 + }, + { + "epoch": 0.031239723775073988, + "grad_norm": 4.9953813552856445, + "learning_rate": 4.9879697262769706e-05, + "loss": 3.7866, + "num_input_tokens_seen": 188528, + "step": 285 + }, + { + "epoch": 0.03178778910446125, + "grad_norm": 4.683384418487549, + "learning_rate": 4.9875442625858125e-05, + "loss": 3.4738, + "num_input_tokens_seen": 191472, + "step": 290 + }, + { + "epoch": 0.03233585443384852, + "grad_norm": 3.5414726734161377, + "learning_rate": 4.987111424351622e-05, + "loss": 3.6306, + "num_input_tokens_seen": 195416, + "step": 295 + }, + { + "epoch": 0.03288391976323578, + "grad_norm": 6.5463547706604, + "learning_rate": 4.9866712128575855e-05, + "loss": 3.6409, + "num_input_tokens_seen": 198576, + "step": 300 + }, + { + "epoch": 0.03343198509262304, + "grad_norm": 4.8504180908203125, + "learning_rate": 4.9862236294087485e-05, + "loss": 3.9698, + "num_input_tokens_seen": 201432, + "step": 305 + }, + { + "epoch": 0.033980050422010306, + "grad_norm": 4.2637739181518555, + "learning_rate": 4.98576867533201e-05, + "loss": 3.4978, + "num_input_tokens_seen": 204776, + "step": 310 + }, + { + "epoch": 0.03452811575139757, + "grad_norm": 6.201929569244385, + "learning_rate": 4.9853063519761234e-05, + "loss": 3.5306, + "num_input_tokens_seen": 207984, + "step": 315 + }, + { + "epoch": 0.03507618108078483, + "grad_norm": 5.745614528656006, + "learning_rate": 4.984836660711686e-05, + "loss": 3.4114, + "num_input_tokens_seen": 211304, + "step": 320 + }, + { + "epoch": 0.035624246410172096, + "grad_norm": 7.258711338043213, + "learning_rate": 4.9843596029311386e-05, + "loss": 3.5909, + "num_input_tokens_seen": 214680, + "step": 325 + }, + { + "epoch": 0.03617231173955936, + "grad_norm": 5.421024799346924, + "learning_rate": 4.9838751800487606e-05, + "loss": 3.9625, + "num_input_tokens_seen": 217472, + "step": 330 + }, + { + "epoch": 0.03672037706894662, + "grad_norm": 4.33311653137207, + "learning_rate": 4.983383393500667e-05, + "loss": 3.1581, + "num_input_tokens_seen": 220824, + "step": 335 + }, + { + "epoch": 0.03726844239833388, + "grad_norm": 3.667479991912842, + "learning_rate": 4.982884244744801e-05, + "loss": 3.6578, + "num_input_tokens_seen": 224464, + "step": 340 + }, + { + "epoch": 0.037816507727721146, + "grad_norm": 4.797352313995361, + "learning_rate": 4.982377735260933e-05, + "loss": 3.4615, + "num_input_tokens_seen": 228120, + "step": 345 + }, + { + "epoch": 0.03836457305710841, + "grad_norm": 6.432485103607178, + "learning_rate": 4.981863866550656e-05, + "loss": 3.7862, + "num_input_tokens_seen": 231112, + "step": 350 + }, + { + "epoch": 0.03891263838649567, + "grad_norm": 5.501232624053955, + "learning_rate": 4.981342640137377e-05, + "loss": 3.5962, + "num_input_tokens_seen": 234456, + "step": 355 + }, + { + "epoch": 0.039460703715882936, + "grad_norm": 4.993545055389404, + "learning_rate": 4.9808140575663186e-05, + "loss": 3.4178, + "num_input_tokens_seen": 237744, + "step": 360 + }, + { + "epoch": 0.0400087690452702, + "grad_norm": 4.6652421951293945, + "learning_rate": 4.98027812040451e-05, + "loss": 3.3215, + "num_input_tokens_seen": 240240, + "step": 365 + }, + { + "epoch": 0.04055683437465746, + "grad_norm": 7.660661220550537, + "learning_rate": 4.979734830240784e-05, + "loss": 3.4482, + "num_input_tokens_seen": 243344, + "step": 370 + }, + { + "epoch": 0.041104899704044726, + "grad_norm": 5.362435340881348, + "learning_rate": 4.979184188685772e-05, + "loss": 3.6152, + "num_input_tokens_seen": 246928, + "step": 375 + }, + { + "epoch": 0.041652965033431986, + "grad_norm": 4.019466876983643, + "learning_rate": 4.9786261973718984e-05, + "loss": 3.4659, + "num_input_tokens_seen": 250592, + "step": 380 + }, + { + "epoch": 0.04220103036281925, + "grad_norm": 3.5128304958343506, + "learning_rate": 4.9780608579533774e-05, + "loss": 3.369, + "num_input_tokens_seen": 254136, + "step": 385 + }, + { + "epoch": 0.04274909569220651, + "grad_norm": 5.328804969787598, + "learning_rate": 4.9774881721062083e-05, + "loss": 3.396, + "num_input_tokens_seen": 257000, + "step": 390 + }, + { + "epoch": 0.043297161021593776, + "grad_norm": 3.9344732761383057, + "learning_rate": 4.976908141528168e-05, + "loss": 3.5748, + "num_input_tokens_seen": 259544, + "step": 395 + }, + { + "epoch": 0.04384522635098104, + "grad_norm": 6.34092903137207, + "learning_rate": 4.976320767938808e-05, + "loss": 3.2784, + "num_input_tokens_seen": 262648, + "step": 400 + }, + { + "epoch": 0.0443932916803683, + "grad_norm": 6.228747367858887, + "learning_rate": 4.975726053079448e-05, + "loss": 3.7733, + "num_input_tokens_seen": 265800, + "step": 405 + }, + { + "epoch": 0.044941357009755566, + "grad_norm": 6.360103130340576, + "learning_rate": 4.9751239987131735e-05, + "loss": 3.3795, + "num_input_tokens_seen": 268352, + "step": 410 + }, + { + "epoch": 0.045489422339142827, + "grad_norm": 5.080907821655273, + "learning_rate": 4.9745146066248275e-05, + "loss": 3.4467, + "num_input_tokens_seen": 271416, + "step": 415 + }, + { + "epoch": 0.04603748766853009, + "grad_norm": 4.075165271759033, + "learning_rate": 4.973897878621005e-05, + "loss": 3.4581, + "num_input_tokens_seen": 274912, + "step": 420 + }, + { + "epoch": 0.046585552997917355, + "grad_norm": 4.517000675201416, + "learning_rate": 4.973273816530051e-05, + "loss": 3.3681, + "num_input_tokens_seen": 279184, + "step": 425 + }, + { + "epoch": 0.047133618327304616, + "grad_norm": 5.66272497177124, + "learning_rate": 4.9726424222020527e-05, + "loss": 3.8983, + "num_input_tokens_seen": 283008, + "step": 430 + }, + { + "epoch": 0.04768168365669188, + "grad_norm": 5.277008056640625, + "learning_rate": 4.9720036975088334e-05, + "loss": 3.8482, + "num_input_tokens_seen": 285408, + "step": 435 + }, + { + "epoch": 0.04822974898607914, + "grad_norm": 5.911515235900879, + "learning_rate": 4.971357644343948e-05, + "loss": 3.7086, + "num_input_tokens_seen": 287672, + "step": 440 + }, + { + "epoch": 0.048777814315466406, + "grad_norm": 5.71356725692749, + "learning_rate": 4.9707042646226784e-05, + "loss": 3.7235, + "num_input_tokens_seen": 290608, + "step": 445 + }, + { + "epoch": 0.04932587964485367, + "grad_norm": 4.606592178344727, + "learning_rate": 4.9700435602820276e-05, + "loss": 3.5481, + "num_input_tokens_seen": 293688, + "step": 450 + }, + { + "epoch": 0.04987394497424093, + "grad_norm": 5.814152240753174, + "learning_rate": 4.969375533280708e-05, + "loss": 3.38, + "num_input_tokens_seen": 297160, + "step": 455 + }, + { + "epoch": 0.050422010303628195, + "grad_norm": 5.669627666473389, + "learning_rate": 4.968700185599147e-05, + "loss": 3.5052, + "num_input_tokens_seen": 300608, + "step": 460 + }, + { + "epoch": 0.050970075633015456, + "grad_norm": 4.943079471588135, + "learning_rate": 4.96801751923947e-05, + "loss": 3.5689, + "num_input_tokens_seen": 303680, + "step": 465 + }, + { + "epoch": 0.05151814096240272, + "grad_norm": 5.5774664878845215, + "learning_rate": 4.9673275362255035e-05, + "loss": 3.1872, + "num_input_tokens_seen": 306664, + "step": 470 + }, + { + "epoch": 0.052066206291789985, + "grad_norm": 5.742215633392334, + "learning_rate": 4.966630238602761e-05, + "loss": 3.873, + "num_input_tokens_seen": 310024, + "step": 475 + }, + { + "epoch": 0.052614271621177246, + "grad_norm": 5.4475507736206055, + "learning_rate": 4.9659256284384434e-05, + "loss": 3.5306, + "num_input_tokens_seen": 313296, + "step": 480 + }, + { + "epoch": 0.05316233695056451, + "grad_norm": 5.270495414733887, + "learning_rate": 4.965213707821428e-05, + "loss": 3.3911, + "num_input_tokens_seen": 317528, + "step": 485 + }, + { + "epoch": 0.05371040227995177, + "grad_norm": 4.345836639404297, + "learning_rate": 4.964494478862267e-05, + "loss": 3.338, + "num_input_tokens_seen": 320224, + "step": 490 + }, + { + "epoch": 0.054258467609339035, + "grad_norm": 8.715791702270508, + "learning_rate": 4.963767943693178e-05, + "loss": 3.6676, + "num_input_tokens_seen": 323576, + "step": 495 + }, + { + "epoch": 0.054806532938726296, + "grad_norm": 6.43541955947876, + "learning_rate": 4.9630341044680375e-05, + "loss": 3.4779, + "num_input_tokens_seen": 326840, + "step": 500 + }, + { + "epoch": 0.05535459826811356, + "grad_norm": 5.299740314483643, + "learning_rate": 4.962292963362376e-05, + "loss": 3.0794, + "num_input_tokens_seen": 330400, + "step": 505 + }, + { + "epoch": 0.055902663597500825, + "grad_norm": 5.377191543579102, + "learning_rate": 4.9615445225733714e-05, + "loss": 3.3778, + "num_input_tokens_seen": 334264, + "step": 510 + }, + { + "epoch": 0.056450728926888086, + "grad_norm": 4.671337127685547, + "learning_rate": 4.9607887843198417e-05, + "loss": 3.2423, + "num_input_tokens_seen": 338632, + "step": 515 + }, + { + "epoch": 0.05699879425627535, + "grad_norm": 4.917747497558594, + "learning_rate": 4.960025750842241e-05, + "loss": 3.2912, + "num_input_tokens_seen": 341576, + "step": 520 + }, + { + "epoch": 0.057546859585662614, + "grad_norm": 5.633148670196533, + "learning_rate": 4.959255424402647e-05, + "loss": 3.9649, + "num_input_tokens_seen": 343752, + "step": 525 + }, + { + "epoch": 0.058094924915049875, + "grad_norm": 5.843842506408691, + "learning_rate": 4.9584778072847605e-05, + "loss": 3.5301, + "num_input_tokens_seen": 346768, + "step": 530 + }, + { + "epoch": 0.058642990244437136, + "grad_norm": 6.019566059112549, + "learning_rate": 4.957692901793896e-05, + "loss": 3.7123, + "num_input_tokens_seen": 349488, + "step": 535 + }, + { + "epoch": 0.0591910555738244, + "grad_norm": 5.83019495010376, + "learning_rate": 4.9569007102569746e-05, + "loss": 4.0987, + "num_input_tokens_seen": 353448, + "step": 540 + }, + { + "epoch": 0.059739120903211665, + "grad_norm": 7.744917392730713, + "learning_rate": 4.9561012350225174e-05, + "loss": 3.4271, + "num_input_tokens_seen": 357336, + "step": 545 + }, + { + "epoch": 0.060287186232598926, + "grad_norm": 6.845799922943115, + "learning_rate": 4.955294478460638e-05, + "loss": 3.7176, + "num_input_tokens_seen": 361272, + "step": 550 + }, + { + "epoch": 0.06083525156198619, + "grad_norm": 7.8909592628479, + "learning_rate": 4.954480442963038e-05, + "loss": 3.3092, + "num_input_tokens_seen": 364048, + "step": 555 + }, + { + "epoch": 0.061383316891373454, + "grad_norm": 6.57379674911499, + "learning_rate": 4.953659130942997e-05, + "loss": 4.0073, + "num_input_tokens_seen": 368336, + "step": 560 + }, + { + "epoch": 0.061931382220760715, + "grad_norm": 5.875579833984375, + "learning_rate": 4.952830544835366e-05, + "loss": 3.4651, + "num_input_tokens_seen": 370824, + "step": 565 + }, + { + "epoch": 0.062479447550147976, + "grad_norm": 5.310330867767334, + "learning_rate": 4.951994687096562e-05, + "loss": 3.8036, + "num_input_tokens_seen": 374104, + "step": 570 + }, + { + "epoch": 0.06302751287953524, + "grad_norm": 6.611202239990234, + "learning_rate": 4.9511515602045563e-05, + "loss": 3.2939, + "num_input_tokens_seen": 376176, + "step": 575 + }, + { + "epoch": 0.0635755782089225, + "grad_norm": 4.5933451652526855, + "learning_rate": 4.950301166658875e-05, + "loss": 3.529, + "num_input_tokens_seen": 378600, + "step": 580 + }, + { + "epoch": 0.06412364353830977, + "grad_norm": 5.080543518066406, + "learning_rate": 4.9494435089805835e-05, + "loss": 4.0958, + "num_input_tokens_seen": 382584, + "step": 585 + }, + { + "epoch": 0.06467170886769703, + "grad_norm": 4.658755779266357, + "learning_rate": 4.948578589712283e-05, + "loss": 3.3213, + "num_input_tokens_seen": 386376, + "step": 590 + }, + { + "epoch": 0.06521977419708429, + "grad_norm": 5.556814670562744, + "learning_rate": 4.9477064114181026e-05, + "loss": 3.5986, + "num_input_tokens_seen": 390784, + "step": 595 + }, + { + "epoch": 0.06576783952647156, + "grad_norm": 6.1433491706848145, + "learning_rate": 4.946826976683691e-05, + "loss": 3.4305, + "num_input_tokens_seen": 395104, + "step": 600 + }, + { + "epoch": 0.06631590485585882, + "grad_norm": 4.176370143890381, + "learning_rate": 4.9459402881162095e-05, + "loss": 3.6053, + "num_input_tokens_seen": 398072, + "step": 605 + }, + { + "epoch": 0.06686397018524608, + "grad_norm": 4.746314525604248, + "learning_rate": 4.945046348344325e-05, + "loss": 3.4613, + "num_input_tokens_seen": 401112, + "step": 610 + }, + { + "epoch": 0.06741203551463334, + "grad_norm": 6.04541015625, + "learning_rate": 4.9441451600182e-05, + "loss": 3.3843, + "num_input_tokens_seen": 404728, + "step": 615 + }, + { + "epoch": 0.06796010084402061, + "grad_norm": 4.687957763671875, + "learning_rate": 4.943236725809485e-05, + "loss": 3.6494, + "num_input_tokens_seen": 407824, + "step": 620 + }, + { + "epoch": 0.06850816617340787, + "grad_norm": 5.392053604125977, + "learning_rate": 4.942321048411314e-05, + "loss": 3.7716, + "num_input_tokens_seen": 410064, + "step": 625 + }, + { + "epoch": 0.06905623150279513, + "grad_norm": 5.196096420288086, + "learning_rate": 4.9413981305382936e-05, + "loss": 3.7037, + "num_input_tokens_seen": 413664, + "step": 630 + }, + { + "epoch": 0.0696042968321824, + "grad_norm": 4.464987754821777, + "learning_rate": 4.940467974926493e-05, + "loss": 3.0886, + "num_input_tokens_seen": 416752, + "step": 635 + }, + { + "epoch": 0.07015236216156966, + "grad_norm": 4.81376838684082, + "learning_rate": 4.939530584333441e-05, + "loss": 3.11, + "num_input_tokens_seen": 420552, + "step": 640 + }, + { + "epoch": 0.07070042749095692, + "grad_norm": 5.184936046600342, + "learning_rate": 4.938585961538115e-05, + "loss": 3.1776, + "num_input_tokens_seen": 423200, + "step": 645 + }, + { + "epoch": 0.07124849282034419, + "grad_norm": 7.05800724029541, + "learning_rate": 4.9376341093409305e-05, + "loss": 3.2882, + "num_input_tokens_seen": 426840, + "step": 650 + }, + { + "epoch": 0.07179655814973145, + "grad_norm": 7.437703609466553, + "learning_rate": 4.9366750305637385e-05, + "loss": 3.3796, + "num_input_tokens_seen": 430168, + "step": 655 + }, + { + "epoch": 0.07234462347911871, + "grad_norm": 7.665436744689941, + "learning_rate": 4.9357087280498105e-05, + "loss": 3.6646, + "num_input_tokens_seen": 433080, + "step": 660 + }, + { + "epoch": 0.07289268880850597, + "grad_norm": 7.2700324058532715, + "learning_rate": 4.934735204663835e-05, + "loss": 3.4558, + "num_input_tokens_seen": 436600, + "step": 665 + }, + { + "epoch": 0.07344075413789324, + "grad_norm": 4.932444095611572, + "learning_rate": 4.9337544632919085e-05, + "loss": 3.1135, + "num_input_tokens_seen": 439552, + "step": 670 + }, + { + "epoch": 0.0739888194672805, + "grad_norm": 6.515824794769287, + "learning_rate": 4.9327665068415254e-05, + "loss": 3.3952, + "num_input_tokens_seen": 442776, + "step": 675 + }, + { + "epoch": 0.07453688479666776, + "grad_norm": 6.392978668212891, + "learning_rate": 4.931771338241566e-05, + "loss": 3.5728, + "num_input_tokens_seen": 445344, + "step": 680 + }, + { + "epoch": 0.07508495012605503, + "grad_norm": 5.692570209503174, + "learning_rate": 4.930768960442299e-05, + "loss": 3.3921, + "num_input_tokens_seen": 449360, + "step": 685 + }, + { + "epoch": 0.07563301545544229, + "grad_norm": 10.294317245483398, + "learning_rate": 4.929759376415358e-05, + "loss": 3.6814, + "num_input_tokens_seen": 452736, + "step": 690 + }, + { + "epoch": 0.07618108078482955, + "grad_norm": 7.613968849182129, + "learning_rate": 4.9287425891537454e-05, + "loss": 3.5298, + "num_input_tokens_seen": 455648, + "step": 695 + }, + { + "epoch": 0.07672914611421681, + "grad_norm": 5.538883209228516, + "learning_rate": 4.927718601671816e-05, + "loss": 3.4538, + "num_input_tokens_seen": 458256, + "step": 700 + }, + { + "epoch": 0.07727721144360408, + "grad_norm": 5.105963706970215, + "learning_rate": 4.926687417005268e-05, + "loss": 3.3759, + "num_input_tokens_seen": 461984, + "step": 705 + }, + { + "epoch": 0.07782527677299134, + "grad_norm": 5.424991130828857, + "learning_rate": 4.925649038211142e-05, + "loss": 3.4941, + "num_input_tokens_seen": 465216, + "step": 710 + }, + { + "epoch": 0.0783733421023786, + "grad_norm": 6.287330627441406, + "learning_rate": 4.924603468367801e-05, + "loss": 3.3536, + "num_input_tokens_seen": 468496, + "step": 715 + }, + { + "epoch": 0.07892140743176587, + "grad_norm": 7.270327568054199, + "learning_rate": 4.923550710574929e-05, + "loss": 3.1898, + "num_input_tokens_seen": 471784, + "step": 720 + }, + { + "epoch": 0.07946947276115313, + "grad_norm": 5.402751922607422, + "learning_rate": 4.922490767953519e-05, + "loss": 3.7645, + "num_input_tokens_seen": 474928, + "step": 725 + }, + { + "epoch": 0.0800175380905404, + "grad_norm": 5.472609996795654, + "learning_rate": 4.921423643645863e-05, + "loss": 3.5023, + "num_input_tokens_seen": 479376, + "step": 730 + }, + { + "epoch": 0.08056560341992766, + "grad_norm": 4.318566799163818, + "learning_rate": 4.9203493408155455e-05, + "loss": 3.1444, + "num_input_tokens_seen": 482328, + "step": 735 + }, + { + "epoch": 0.08111366874931492, + "grad_norm": 6.903258800506592, + "learning_rate": 4.919267862647431e-05, + "loss": 3.8837, + "num_input_tokens_seen": 486248, + "step": 740 + }, + { + "epoch": 0.08166173407870218, + "grad_norm": 4.821303844451904, + "learning_rate": 4.918179212347657e-05, + "loss": 3.7363, + "num_input_tokens_seen": 489736, + "step": 745 + }, + { + "epoch": 0.08220979940808945, + "grad_norm": 4.108252048492432, + "learning_rate": 4.917083393143621e-05, + "loss": 3.0709, + "num_input_tokens_seen": 492784, + "step": 750 + }, + { + "epoch": 0.0827578647374767, + "grad_norm": 6.259218215942383, + "learning_rate": 4.915980408283977e-05, + "loss": 3.4733, + "num_input_tokens_seen": 496528, + "step": 755 + }, + { + "epoch": 0.08330593006686397, + "grad_norm": 5.9338531494140625, + "learning_rate": 4.91487026103862e-05, + "loss": 3.8987, + "num_input_tokens_seen": 500832, + "step": 760 + }, + { + "epoch": 0.08385399539625123, + "grad_norm": 5.397777557373047, + "learning_rate": 4.913752954698677e-05, + "loss": 3.3764, + "num_input_tokens_seen": 503744, + "step": 765 + }, + { + "epoch": 0.0844020607256385, + "grad_norm": 5.536934852600098, + "learning_rate": 4.912628492576503e-05, + "loss": 3.7953, + "num_input_tokens_seen": 507656, + "step": 770 + }, + { + "epoch": 0.08495012605502576, + "grad_norm": 5.932541847229004, + "learning_rate": 4.9114968780056635e-05, + "loss": 3.4254, + "num_input_tokens_seen": 511216, + "step": 775 + }, + { + "epoch": 0.08549819138441302, + "grad_norm": 5.971353530883789, + "learning_rate": 4.910358114340929e-05, + "loss": 3.6466, + "num_input_tokens_seen": 514328, + "step": 780 + }, + { + "epoch": 0.08604625671380028, + "grad_norm": 8.010024070739746, + "learning_rate": 4.9092122049582636e-05, + "loss": 3.9475, + "num_input_tokens_seen": 518200, + "step": 785 + }, + { + "epoch": 0.08659432204318755, + "grad_norm": 6.520806312561035, + "learning_rate": 4.9080591532548175e-05, + "loss": 3.4056, + "num_input_tokens_seen": 521704, + "step": 790 + }, + { + "epoch": 0.0871423873725748, + "grad_norm": 5.646440029144287, + "learning_rate": 4.9068989626489126e-05, + "loss": 3.5912, + "num_input_tokens_seen": 524456, + "step": 795 + }, + { + "epoch": 0.08769045270196207, + "grad_norm": 4.937885284423828, + "learning_rate": 4.9057316365800366e-05, + "loss": 3.4854, + "num_input_tokens_seen": 526920, + "step": 800 + }, + { + "epoch": 0.08823851803134934, + "grad_norm": 6.204067230224609, + "learning_rate": 4.904557178508829e-05, + "loss": 3.3649, + "num_input_tokens_seen": 530544, + "step": 805 + }, + { + "epoch": 0.0887865833607366, + "grad_norm": 6.427296161651611, + "learning_rate": 4.9033755919170733e-05, + "loss": 3.8582, + "num_input_tokens_seen": 532832, + "step": 810 + }, + { + "epoch": 0.08933464869012386, + "grad_norm": 7.1010589599609375, + "learning_rate": 4.9021868803076875e-05, + "loss": 3.5353, + "num_input_tokens_seen": 536056, + "step": 815 + }, + { + "epoch": 0.08988271401951113, + "grad_norm": 4.813199043273926, + "learning_rate": 4.900991047204712e-05, + "loss": 3.2529, + "num_input_tokens_seen": 539248, + "step": 820 + }, + { + "epoch": 0.09043077934889839, + "grad_norm": 7.545267581939697, + "learning_rate": 4.899788096153297e-05, + "loss": 3.0758, + "num_input_tokens_seen": 543584, + "step": 825 + }, + { + "epoch": 0.09097884467828565, + "grad_norm": 5.574884414672852, + "learning_rate": 4.898578030719698e-05, + "loss": 3.0291, + "num_input_tokens_seen": 546792, + "step": 830 + }, + { + "epoch": 0.09152691000767292, + "grad_norm": 5.587398529052734, + "learning_rate": 4.897360854491259e-05, + "loss": 3.2747, + "num_input_tokens_seen": 549296, + "step": 835 + }, + { + "epoch": 0.09207497533706017, + "grad_norm": 6.558215618133545, + "learning_rate": 4.896136571076406e-05, + "loss": 3.4765, + "num_input_tokens_seen": 551784, + "step": 840 + }, + { + "epoch": 0.09262304066644744, + "grad_norm": 5.221803188323975, + "learning_rate": 4.894905184104634e-05, + "loss": 3.3299, + "num_input_tokens_seen": 555608, + "step": 845 + }, + { + "epoch": 0.09317110599583471, + "grad_norm": NaN, + "learning_rate": 4.8939149624187016e-05, + "loss": 3.5208, + "num_input_tokens_seen": 558848, + "step": 850 + }, + { + "epoch": 0.09371917132522196, + "grad_norm": 5.915983200073242, + "learning_rate": 4.8926707982580194e-05, + "loss": 3.5031, + "num_input_tokens_seen": 562384, + "step": 855 + }, + { + "epoch": 0.09426723665460923, + "grad_norm": 6.868443965911865, + "learning_rate": 4.891419540815006e-05, + "loss": 3.5194, + "num_input_tokens_seen": 565648, + "step": 860 + }, + { + "epoch": 0.09481530198399649, + "grad_norm": 6.696837902069092, + "learning_rate": 4.8901611937991244e-05, + "loss": 3.4405, + "num_input_tokens_seen": 568384, + "step": 865 + }, + { + "epoch": 0.09536336731338375, + "grad_norm": 6.879650592803955, + "learning_rate": 4.8888957609408535e-05, + "loss": 3.2062, + "num_input_tokens_seen": 571184, + "step": 870 + }, + { + "epoch": 0.09591143264277102, + "grad_norm": 5.235931396484375, + "learning_rate": 4.8876232459916805e-05, + "loss": 3.351, + "num_input_tokens_seen": 575328, + "step": 875 + }, + { + "epoch": 0.09645949797215828, + "grad_norm": 6.496284008026123, + "learning_rate": 4.886343652724088e-05, + "loss": 3.3753, + "num_input_tokens_seen": 578520, + "step": 880 + }, + { + "epoch": 0.09700756330154554, + "grad_norm": 8.708456039428711, + "learning_rate": 4.8850569849315414e-05, + "loss": 3.4456, + "num_input_tokens_seen": 581688, + "step": 885 + }, + { + "epoch": 0.09755562863093281, + "grad_norm": 5.558722496032715, + "learning_rate": 4.883763246428481e-05, + "loss": 3.3753, + "num_input_tokens_seen": 584736, + "step": 890 + }, + { + "epoch": 0.09810369396032007, + "grad_norm": 6.443663597106934, + "learning_rate": 4.882462441050308e-05, + "loss": 3.5381, + "num_input_tokens_seen": 587952, + "step": 895 + }, + { + "epoch": 0.09865175928970733, + "grad_norm": 6.3144073486328125, + "learning_rate": 4.881154572653373e-05, + "loss": 3.5416, + "num_input_tokens_seen": 590704, + "step": 900 + }, + { + "epoch": 0.0991998246190946, + "grad_norm": 5.615172386169434, + "learning_rate": 4.8798396451149676e-05, + "loss": 3.5944, + "num_input_tokens_seen": 593056, + "step": 905 + }, + { + "epoch": 0.09974788994848185, + "grad_norm": 6.011329174041748, + "learning_rate": 4.8785176623333094e-05, + "loss": 3.2378, + "num_input_tokens_seen": 596584, + "step": 910 + }, + { + "epoch": 0.10029595527786912, + "grad_norm": 5.445102214813232, + "learning_rate": 4.8771886282275324e-05, + "loss": 3.6375, + "num_input_tokens_seen": 600080, + "step": 915 + }, + { + "epoch": 0.10084402060725639, + "grad_norm": 6.635453701019287, + "learning_rate": 4.875852546737675e-05, + "loss": 3.5498, + "num_input_tokens_seen": 602696, + "step": 920 + }, + { + "epoch": 0.10139208593664364, + "grad_norm": 5.236489772796631, + "learning_rate": 4.874509421824667e-05, + "loss": 3.4216, + "num_input_tokens_seen": 606200, + "step": 925 + }, + { + "epoch": 0.10194015126603091, + "grad_norm": 6.734245300292969, + "learning_rate": 4.87315925747032e-05, + "loss": 3.3747, + "num_input_tokens_seen": 609848, + "step": 930 + }, + { + "epoch": 0.10248821659541818, + "grad_norm": 6.802552223205566, + "learning_rate": 4.871802057677315e-05, + "loss": 3.2441, + "num_input_tokens_seen": 613440, + "step": 935 + }, + { + "epoch": 0.10303628192480543, + "grad_norm": 6.780172824859619, + "learning_rate": 4.8704378264691894e-05, + "loss": 3.4606, + "num_input_tokens_seen": 617088, + "step": 940 + }, + { + "epoch": 0.1035843472541927, + "grad_norm": 6.527922630310059, + "learning_rate": 4.869066567890327e-05, + "loss": 3.4019, + "num_input_tokens_seen": 619952, + "step": 945 + }, + { + "epoch": 0.10413241258357997, + "grad_norm": 6.2412214279174805, + "learning_rate": 4.867688286005944e-05, + "loss": 3.2408, + "num_input_tokens_seen": 623088, + "step": 950 + }, + { + "epoch": 0.10468047791296722, + "grad_norm": 6.477228164672852, + "learning_rate": 4.8663029849020775e-05, + "loss": 3.2491, + "num_input_tokens_seen": 626376, + "step": 955 + }, + { + "epoch": 0.10522854324235449, + "grad_norm": 5.359529495239258, + "learning_rate": 4.864910668685574e-05, + "loss": 3.1534, + "num_input_tokens_seen": 628800, + "step": 960 + }, + { + "epoch": 0.10577660857174175, + "grad_norm": 5.2979960441589355, + "learning_rate": 4.863511341484077e-05, + "loss": 3.4653, + "num_input_tokens_seen": 631312, + "step": 965 + }, + { + "epoch": 0.10632467390112901, + "grad_norm": 12.67263126373291, + "learning_rate": 4.8621050074460136e-05, + "loss": 3.8407, + "num_input_tokens_seen": 634144, + "step": 970 + }, + { + "epoch": 0.10687273923051628, + "grad_norm": 4.020299434661865, + "learning_rate": 4.860691670740587e-05, + "loss": 3.6273, + "num_input_tokens_seen": 637568, + "step": 975 + }, + { + "epoch": 0.10742080455990353, + "grad_norm": 5.12907075881958, + "learning_rate": 4.8592713355577555e-05, + "loss": 2.9803, + "num_input_tokens_seen": 640368, + "step": 980 + }, + { + "epoch": 0.1079688698892908, + "grad_norm": 5.088891983032227, + "learning_rate": 4.8578440061082275e-05, + "loss": 3.0532, + "num_input_tokens_seen": 643928, + "step": 985 + }, + { + "epoch": 0.10851693521867807, + "grad_norm": 6.150454521179199, + "learning_rate": 4.856409686623447e-05, + "loss": 3.5733, + "num_input_tokens_seen": 648192, + "step": 990 + }, + { + "epoch": 0.10906500054806532, + "grad_norm": 6.601188659667969, + "learning_rate": 4.85496838135558e-05, + "loss": 3.4824, + "num_input_tokens_seen": 652272, + "step": 995 + }, + { + "epoch": 0.10961306587745259, + "grad_norm": 6.9974141120910645, + "learning_rate": 4.8535200945775016e-05, + "loss": 3.516, + "num_input_tokens_seen": 655696, + "step": 1000 + }, + { + "epoch": 0.11016113120683986, + "grad_norm": 7.116706371307373, + "learning_rate": 4.8520648305827855e-05, + "loss": 3.4208, + "num_input_tokens_seen": 658560, + "step": 1005 + }, + { + "epoch": 0.11070919653622711, + "grad_norm": 5.209189414978027, + "learning_rate": 4.850602593685689e-05, + "loss": 3.353, + "num_input_tokens_seen": 662152, + "step": 1010 + }, + { + "epoch": 0.11125726186561438, + "grad_norm": 5.9092278480529785, + "learning_rate": 4.8491333882211416e-05, + "loss": 3.2833, + "num_input_tokens_seen": 665968, + "step": 1015 + }, + { + "epoch": 0.11180532719500165, + "grad_norm": 7.026948928833008, + "learning_rate": 4.847657218544732e-05, + "loss": 3.291, + "num_input_tokens_seen": 668808, + "step": 1020 + }, + { + "epoch": 0.1123533925243889, + "grad_norm": 6.154213905334473, + "learning_rate": 4.8461740890326936e-05, + "loss": 3.3035, + "num_input_tokens_seen": 672280, + "step": 1025 + }, + { + "epoch": 0.11290145785377617, + "grad_norm": 6.6929521560668945, + "learning_rate": 4.844684004081895e-05, + "loss": 3.6387, + "num_input_tokens_seen": 675184, + "step": 1030 + }, + { + "epoch": 0.11344952318316344, + "grad_norm": 5.449969291687012, + "learning_rate": 4.843186968109823e-05, + "loss": 3.1393, + "num_input_tokens_seen": 677824, + "step": 1035 + }, + { + "epoch": 0.1139975885125507, + "grad_norm": 3.6720149517059326, + "learning_rate": 4.841682985554573e-05, + "loss": 3.2646, + "num_input_tokens_seen": 682856, + "step": 1040 + }, + { + "epoch": 0.11454565384193796, + "grad_norm": 5.606584072113037, + "learning_rate": 4.8401720608748324e-05, + "loss": 3.3697, + "num_input_tokens_seen": 687680, + "step": 1045 + }, + { + "epoch": 0.11509371917132523, + "grad_norm": 5.044498920440674, + "learning_rate": 4.83865419854987e-05, + "loss": 3.3275, + "num_input_tokens_seen": 690616, + "step": 1050 + }, + { + "epoch": 0.11564178450071248, + "grad_norm": 5.938497543334961, + "learning_rate": 4.83712940307952e-05, + "loss": 3.1055, + "num_input_tokens_seen": 693808, + "step": 1055 + }, + { + "epoch": 0.11618984983009975, + "grad_norm": 7.216318607330322, + "learning_rate": 4.8355976789841754e-05, + "loss": 3.5388, + "num_input_tokens_seen": 696992, + "step": 1060 + }, + { + "epoch": 0.116737915159487, + "grad_norm": 5.2063164710998535, + "learning_rate": 4.834059030804764e-05, + "loss": 3.3436, + "num_input_tokens_seen": 700448, + "step": 1065 + }, + { + "epoch": 0.11728598048887427, + "grad_norm": 6.457626819610596, + "learning_rate": 4.832513463102745e-05, + "loss": 3.281, + "num_input_tokens_seen": 702928, + "step": 1070 + }, + { + "epoch": 0.11783404581826154, + "grad_norm": 5.837212562561035, + "learning_rate": 4.8309609804600886e-05, + "loss": 3.3414, + "num_input_tokens_seen": 707064, + "step": 1075 + }, + { + "epoch": 0.1183821111476488, + "grad_norm": 5.227325439453125, + "learning_rate": 4.829401587479265e-05, + "loss": 3.0907, + "num_input_tokens_seen": 711056, + "step": 1080 + }, + { + "epoch": 0.11893017647703606, + "grad_norm": 7.185408115386963, + "learning_rate": 4.8278352887832326e-05, + "loss": 3.159, + "num_input_tokens_seen": 714472, + "step": 1085 + }, + { + "epoch": 0.11947824180642333, + "grad_norm": 7.311601638793945, + "learning_rate": 4.82626208901542e-05, + "loss": 3.5405, + "num_input_tokens_seen": 717400, + "step": 1090 + }, + { + "epoch": 0.12002630713581058, + "grad_norm": 4.9710693359375, + "learning_rate": 4.824681992839717e-05, + "loss": 3.3058, + "num_input_tokens_seen": 720472, + "step": 1095 + }, + { + "epoch": 0.12057437246519785, + "grad_norm": 4.5781779289245605, + "learning_rate": 4.823095004940456e-05, + "loss": 3.1374, + "num_input_tokens_seen": 723808, + "step": 1100 + }, + { + "epoch": 0.12112243779458512, + "grad_norm": 6.077118396759033, + "learning_rate": 4.8215011300224027e-05, + "loss": 3.1628, + "num_input_tokens_seen": 727576, + "step": 1105 + }, + { + "epoch": 0.12167050312397237, + "grad_norm": 6.6747870445251465, + "learning_rate": 4.819900372810739e-05, + "loss": 3.5095, + "num_input_tokens_seen": 730536, + "step": 1110 + }, + { + "epoch": 0.12221856845335964, + "grad_norm": 5.468014240264893, + "learning_rate": 4.818292738051049e-05, + "loss": 3.521, + "num_input_tokens_seen": 733024, + "step": 1115 + }, + { + "epoch": 0.12276663378274691, + "grad_norm": 6.263638019561768, + "learning_rate": 4.816678230509308e-05, + "loss": 3.2318, + "num_input_tokens_seen": 736048, + "step": 1120 + }, + { + "epoch": 0.12331469911213416, + "grad_norm": 5.998656272888184, + "learning_rate": 4.8150568549718655e-05, + "loss": 3.0286, + "num_input_tokens_seen": 739264, + "step": 1125 + }, + { + "epoch": 0.12386276444152143, + "grad_norm": 6.395206928253174, + "learning_rate": 4.81342861624543e-05, + "loss": 3.4223, + "num_input_tokens_seen": 742008, + "step": 1130 + }, + { + "epoch": 0.1244108297709087, + "grad_norm": 6.199779510498047, + "learning_rate": 4.811793519157059e-05, + "loss": 3.5237, + "num_input_tokens_seen": 745064, + "step": 1135 + }, + { + "epoch": 0.12495889510029595, + "grad_norm": 6.504228115081787, + "learning_rate": 4.81015156855414e-05, + "loss": 3.4249, + "num_input_tokens_seen": 748104, + "step": 1140 + }, + { + "epoch": 0.1255069604296832, + "grad_norm": 6.280592441558838, + "learning_rate": 4.80850276930438e-05, + "loss": 3.0411, + "num_input_tokens_seen": 752032, + "step": 1145 + }, + { + "epoch": 0.1260550257590705, + "grad_norm": 8.529096603393555, + "learning_rate": 4.806847126295789e-05, + "loss": 3.1457, + "num_input_tokens_seen": 755400, + "step": 1150 + }, + { + "epoch": 0.12660309108845774, + "grad_norm": 6.454196453094482, + "learning_rate": 4.8051846444366676e-05, + "loss": 3.0008, + "num_input_tokens_seen": 758392, + "step": 1155 + }, + { + "epoch": 0.127151156417845, + "grad_norm": 6.862017631530762, + "learning_rate": 4.803515328655586e-05, + "loss": 3.3972, + "num_input_tokens_seen": 760824, + "step": 1160 + }, + { + "epoch": 0.12769922174723228, + "grad_norm": 6.56373929977417, + "learning_rate": 4.8018391839013784e-05, + "loss": 3.4338, + "num_input_tokens_seen": 763680, + "step": 1165 + }, + { + "epoch": 0.12824728707661953, + "grad_norm": 5.431229114532471, + "learning_rate": 4.800156215143124e-05, + "loss": 3.2619, + "num_input_tokens_seen": 767352, + "step": 1170 + }, + { + "epoch": 0.12879535240600679, + "grad_norm": 5.761483192443848, + "learning_rate": 4.7984664273701305e-05, + "loss": 3.3616, + "num_input_tokens_seen": 771096, + "step": 1175 + }, + { + "epoch": 0.12934341773539407, + "grad_norm": 7.804869651794434, + "learning_rate": 4.796769825591921e-05, + "loss": 3.2658, + "num_input_tokens_seen": 774192, + "step": 1180 + }, + { + "epoch": 0.12989148306478132, + "grad_norm": 5.688300609588623, + "learning_rate": 4.7950664148382205e-05, + "loss": 3.7069, + "num_input_tokens_seen": 777712, + "step": 1185 + }, + { + "epoch": 0.13043954839416858, + "grad_norm": 4.980658054351807, + "learning_rate": 4.793356200158941e-05, + "loss": 3.0386, + "num_input_tokens_seen": 780680, + "step": 1190 + }, + { + "epoch": 0.13098761372355586, + "grad_norm": 6.9450249671936035, + "learning_rate": 4.791639186624162e-05, + "loss": 3.4293, + "num_input_tokens_seen": 783664, + "step": 1195 + }, + { + "epoch": 0.1315356790529431, + "grad_norm": 6.7938408851623535, + "learning_rate": 4.789915379324121e-05, + "loss": 3.2908, + "num_input_tokens_seen": 787480, + "step": 1200 + }, + { + "epoch": 0.13208374438233036, + "grad_norm": 5.833454608917236, + "learning_rate": 4.788184783369196e-05, + "loss": 3.3431, + "num_input_tokens_seen": 791560, + "step": 1205 + }, + { + "epoch": 0.13263180971171765, + "grad_norm": 6.020946502685547, + "learning_rate": 4.786447403889891e-05, + "loss": 3.1235, + "num_input_tokens_seen": 794600, + "step": 1210 + }, + { + "epoch": 0.1331798750411049, + "grad_norm": 9.639689445495605, + "learning_rate": 4.78470324603682e-05, + "loss": 3.357, + "num_input_tokens_seen": 796976, + "step": 1215 + }, + { + "epoch": 0.13372794037049215, + "grad_norm": 5.102296829223633, + "learning_rate": 4.782952314980691e-05, + "loss": 3.4762, + "num_input_tokens_seen": 801208, + "step": 1220 + }, + { + "epoch": 0.13427600569987944, + "grad_norm": 6.015713214874268, + "learning_rate": 4.781194615912292e-05, + "loss": 3.2738, + "num_input_tokens_seen": 804472, + "step": 1225 + }, + { + "epoch": 0.1348240710292667, + "grad_norm": 7.88398551940918, + "learning_rate": 4.7794301540424774e-05, + "loss": 3.3333, + "num_input_tokens_seen": 807568, + "step": 1230 + }, + { + "epoch": 0.13537213635865394, + "grad_norm": 6.841670989990234, + "learning_rate": 4.7776589346021486e-05, + "loss": 3.5167, + "num_input_tokens_seen": 811016, + "step": 1235 + }, + { + "epoch": 0.13592020168804123, + "grad_norm": 6.089728355407715, + "learning_rate": 4.775880962842241e-05, + "loss": 3.703, + "num_input_tokens_seen": 814536, + "step": 1240 + }, + { + "epoch": 0.13646826701742848, + "grad_norm": 6.35260009765625, + "learning_rate": 4.774096244033707e-05, + "loss": 3.1131, + "num_input_tokens_seen": 817496, + "step": 1245 + }, + { + "epoch": 0.13701633234681573, + "grad_norm": 5.8579254150390625, + "learning_rate": 4.772304783467503e-05, + "loss": 3.2992, + "num_input_tokens_seen": 821712, + "step": 1250 + }, + { + "epoch": 0.13756439767620302, + "grad_norm": 5.486454963684082, + "learning_rate": 4.7705065864545695e-05, + "loss": 3.1721, + "num_input_tokens_seen": 824688, + "step": 1255 + }, + { + "epoch": 0.13811246300559027, + "grad_norm": 6.544208526611328, + "learning_rate": 4.7687016583258203e-05, + "loss": 3.4493, + "num_input_tokens_seen": 828400, + "step": 1260 + }, + { + "epoch": 0.13866052833497752, + "grad_norm": 4.948637008666992, + "learning_rate": 4.7668900044321236e-05, + "loss": 3.0927, + "num_input_tokens_seen": 831936, + "step": 1265 + }, + { + "epoch": 0.1392085936643648, + "grad_norm": 6.64813756942749, + "learning_rate": 4.7650716301442856e-05, + "loss": 3.6065, + "num_input_tokens_seen": 834912, + "step": 1270 + }, + { + "epoch": 0.13975665899375206, + "grad_norm": 7.289310455322266, + "learning_rate": 4.763246540853035e-05, + "loss": 3.3871, + "num_input_tokens_seen": 839072, + "step": 1275 + }, + { + "epoch": 0.1403047243231393, + "grad_norm": 5.887922763824463, + "learning_rate": 4.761414741969011e-05, + "loss": 3.1424, + "num_input_tokens_seen": 842568, + "step": 1280 + }, + { + "epoch": 0.1408527896525266, + "grad_norm": 6.820570468902588, + "learning_rate": 4.7595762389227406e-05, + "loss": 3.0197, + "num_input_tokens_seen": 845808, + "step": 1285 + }, + { + "epoch": 0.14140085498191385, + "grad_norm": 6.593437671661377, + "learning_rate": 4.757731037164628e-05, + "loss": 3.2013, + "num_input_tokens_seen": 849184, + "step": 1290 + }, + { + "epoch": 0.1419489203113011, + "grad_norm": 8.89852523803711, + "learning_rate": 4.7558791421649354e-05, + "loss": 3.5085, + "num_input_tokens_seen": 852392, + "step": 1295 + }, + { + "epoch": 0.14249698564068838, + "grad_norm": 7.368271827697754, + "learning_rate": 4.754020559413768e-05, + "loss": 3.3167, + "num_input_tokens_seen": 855376, + "step": 1300 + }, + { + "epoch": 0.14304505097007564, + "grad_norm": 5.54932975769043, + "learning_rate": 4.752155294421056e-05, + "loss": 3.0516, + "num_input_tokens_seen": 858720, + "step": 1305 + }, + { + "epoch": 0.1435931162994629, + "grad_norm": 8.180092811584473, + "learning_rate": 4.750283352716543e-05, + "loss": 3.4647, + "num_input_tokens_seen": 861312, + "step": 1310 + }, + { + "epoch": 0.14414118162885015, + "grad_norm": 6.608414173126221, + "learning_rate": 4.748404739849763e-05, + "loss": 3.3686, + "num_input_tokens_seen": 864368, + "step": 1315 + }, + { + "epoch": 0.14468924695823743, + "grad_norm": 6.880706787109375, + "learning_rate": 4.746519461390029e-05, + "loss": 3.0061, + "num_input_tokens_seen": 868000, + "step": 1320 + }, + { + "epoch": 0.14523731228762468, + "grad_norm": 4.034643650054932, + "learning_rate": 4.744627522926414e-05, + "loss": 3.3709, + "num_input_tokens_seen": 871648, + "step": 1325 + }, + { + "epoch": 0.14578537761701194, + "grad_norm": 5.335696220397949, + "learning_rate": 4.742728930067736e-05, + "loss": 3.0955, + "num_input_tokens_seen": 875440, + "step": 1330 + }, + { + "epoch": 0.14633344294639922, + "grad_norm": 8.005532264709473, + "learning_rate": 4.7408236884425396e-05, + "loss": 3.6277, + "num_input_tokens_seen": 879208, + "step": 1335 + }, + { + "epoch": 0.14688150827578647, + "grad_norm": 7.770083904266357, + "learning_rate": 4.7389118036990795e-05, + "loss": 3.5794, + "num_input_tokens_seen": 882040, + "step": 1340 + }, + { + "epoch": 0.14742957360517372, + "grad_norm": 6.539053916931152, + "learning_rate": 4.736993281505307e-05, + "loss": 3.2326, + "num_input_tokens_seen": 884984, + "step": 1345 + }, + { + "epoch": 0.147977638934561, + "grad_norm": 7.831300258636475, + "learning_rate": 4.73506812754885e-05, + "loss": 3.2767, + "num_input_tokens_seen": 888128, + "step": 1350 + }, + { + "epoch": 0.14852570426394826, + "grad_norm": 5.242404937744141, + "learning_rate": 4.733136347536995e-05, + "loss": 3.4698, + "num_input_tokens_seen": 890520, + "step": 1355 + }, + { + "epoch": 0.14907376959333551, + "grad_norm": 5.803912162780762, + "learning_rate": 4.731197947196673e-05, + "loss": 3.4711, + "num_input_tokens_seen": 893464, + "step": 1360 + }, + { + "epoch": 0.1496218349227228, + "grad_norm": 8.300127983093262, + "learning_rate": 4.7292529322744416e-05, + "loss": 3.2302, + "num_input_tokens_seen": 897520, + "step": 1365 + }, + { + "epoch": 0.15016990025211005, + "grad_norm": 5.02566385269165, + "learning_rate": 4.7273013085364694e-05, + "loss": 3.2959, + "num_input_tokens_seen": 901416, + "step": 1370 + }, + { + "epoch": 0.1507179655814973, + "grad_norm": 4.600845813751221, + "learning_rate": 4.725343081768514e-05, + "loss": 3.3303, + "num_input_tokens_seen": 906432, + "step": 1375 + }, + { + "epoch": 0.15126603091088459, + "grad_norm": 6.849578380584717, + "learning_rate": 4.723378257775912e-05, + "loss": 3.1125, + "num_input_tokens_seen": 909264, + "step": 1380 + }, + { + "epoch": 0.15181409624027184, + "grad_norm": 7.15298318862915, + "learning_rate": 4.7214068423835566e-05, + "loss": 3.2795, + "num_input_tokens_seen": 912464, + "step": 1385 + }, + { + "epoch": 0.1523621615696591, + "grad_norm": 5.415898323059082, + "learning_rate": 4.7194288414358804e-05, + "loss": 3.1385, + "num_input_tokens_seen": 915960, + "step": 1390 + }, + { + "epoch": 0.15291022689904638, + "grad_norm": 6.559721946716309, + "learning_rate": 4.717444260796841e-05, + "loss": 3.4027, + "num_input_tokens_seen": 918984, + "step": 1395 + }, + { + "epoch": 0.15345829222843363, + "grad_norm": 5.312758922576904, + "learning_rate": 4.715453106349902e-05, + "loss": 3.4349, + "num_input_tokens_seen": 921912, + "step": 1400 + }, + { + "epoch": 0.15400635755782088, + "grad_norm": 6.985774040222168, + "learning_rate": 4.7134553839980143e-05, + "loss": 3.7019, + "num_input_tokens_seen": 925848, + "step": 1405 + }, + { + "epoch": 0.15455442288720816, + "grad_norm": 6.191575527191162, + "learning_rate": 4.711451099663603e-05, + "loss": 3.4276, + "num_input_tokens_seen": 929792, + "step": 1410 + }, + { + "epoch": 0.15510248821659542, + "grad_norm": 6.040350437164307, + "learning_rate": 4.709440259288542e-05, + "loss": 2.9173, + "num_input_tokens_seen": 932400, + "step": 1415 + }, + { + "epoch": 0.15565055354598267, + "grad_norm": 6.164414405822754, + "learning_rate": 4.707422868834146e-05, + "loss": 3.1684, + "num_input_tokens_seen": 935408, + "step": 1420 + }, + { + "epoch": 0.15619861887536995, + "grad_norm": 7.248453140258789, + "learning_rate": 4.705398934281145e-05, + "loss": 3.6365, + "num_input_tokens_seen": 938184, + "step": 1425 + }, + { + "epoch": 0.1567466842047572, + "grad_norm": 5.813863754272461, + "learning_rate": 4.70336846162967e-05, + "loss": 3.405, + "num_input_tokens_seen": 941272, + "step": 1430 + }, + { + "epoch": 0.15729474953414446, + "grad_norm": 6.239504337310791, + "learning_rate": 4.701331456899236e-05, + "loss": 3.0722, + "num_input_tokens_seen": 944728, + "step": 1435 + }, + { + "epoch": 0.15784281486353174, + "grad_norm": 9.224727630615234, + "learning_rate": 4.6992879261287226e-05, + "loss": 3.2262, + "num_input_tokens_seen": 947528, + "step": 1440 + }, + { + "epoch": 0.158390880192919, + "grad_norm": 7.570671558380127, + "learning_rate": 4.6972378753763545e-05, + "loss": 3.2116, + "num_input_tokens_seen": 950128, + "step": 1445 + }, + { + "epoch": 0.15893894552230625, + "grad_norm": 4.781320095062256, + "learning_rate": 4.6951813107196874e-05, + "loss": 3.2953, + "num_input_tokens_seen": 954336, + "step": 1450 + }, + { + "epoch": 0.15948701085169353, + "grad_norm": 7.117349147796631, + "learning_rate": 4.693118238255587e-05, + "loss": 3.2755, + "num_input_tokens_seen": 957704, + "step": 1455 + }, + { + "epoch": 0.1600350761810808, + "grad_norm": 6.41115665435791, + "learning_rate": 4.6910486641002136e-05, + "loss": 3.2523, + "num_input_tokens_seen": 960184, + "step": 1460 + }, + { + "epoch": 0.16058314151046804, + "grad_norm": 8.865285873413086, + "learning_rate": 4.688972594389001e-05, + "loss": 3.3998, + "num_input_tokens_seen": 963264, + "step": 1465 + }, + { + "epoch": 0.16113120683985532, + "grad_norm": 4.722679615020752, + "learning_rate": 4.6868900352766394e-05, + "loss": 3.0958, + "num_input_tokens_seen": 966536, + "step": 1470 + }, + { + "epoch": 0.16167927216924258, + "grad_norm": 8.334817886352539, + "learning_rate": 4.6848009929370575e-05, + "loss": 3.2969, + "num_input_tokens_seen": 969008, + "step": 1475 + }, + { + "epoch": 0.16222733749862983, + "grad_norm": 6.063559055328369, + "learning_rate": 4.682705473563406e-05, + "loss": 3.0186, + "num_input_tokens_seen": 972168, + "step": 1480 + }, + { + "epoch": 0.1627754028280171, + "grad_norm": 6.434414386749268, + "learning_rate": 4.680603483368033e-05, + "loss": 3.4689, + "num_input_tokens_seen": 976096, + "step": 1485 + }, + { + "epoch": 0.16332346815740437, + "grad_norm": 8.82730770111084, + "learning_rate": 4.678495028582476e-05, + "loss": 3.2562, + "num_input_tokens_seen": 979080, + "step": 1490 + }, + { + "epoch": 0.16387153348679162, + "grad_norm": 6.3244171142578125, + "learning_rate": 4.676380115457431e-05, + "loss": 3.0127, + "num_input_tokens_seen": 981896, + "step": 1495 + }, + { + "epoch": 0.1644195988161789, + "grad_norm": 6.033606052398682, + "learning_rate": 4.674258750262745e-05, + "loss": 3.1823, + "num_input_tokens_seen": 985072, + "step": 1500 + }, + { + "epoch": 0.16496766414556616, + "grad_norm": 4.211119174957275, + "learning_rate": 4.6721309392873926e-05, + "loss": 3.1351, + "num_input_tokens_seen": 987448, + "step": 1505 + }, + { + "epoch": 0.1655157294749534, + "grad_norm": 6.105933666229248, + "learning_rate": 4.669996688839453e-05, + "loss": 3.2884, + "num_input_tokens_seen": 990840, + "step": 1510 + }, + { + "epoch": 0.16606379480434066, + "grad_norm": 8.247055053710938, + "learning_rate": 4.6678560052460994e-05, + "loss": 3.1378, + "num_input_tokens_seen": 994768, + "step": 1515 + }, + { + "epoch": 0.16661186013372795, + "grad_norm": 5.653783798217773, + "learning_rate": 4.6657088948535776e-05, + "loss": 3.7376, + "num_input_tokens_seen": 997840, + "step": 1520 + }, + { + "epoch": 0.1671599254631152, + "grad_norm": 5.42575216293335, + "learning_rate": 4.6635553640271835e-05, + "loss": 3.4831, + "num_input_tokens_seen": 1000536, + "step": 1525 + }, + { + "epoch": 0.16770799079250245, + "grad_norm": 7.640921115875244, + "learning_rate": 4.6613954191512474e-05, + "loss": 3.5714, + "num_input_tokens_seen": 1003952, + "step": 1530 + }, + { + "epoch": 0.16825605612188974, + "grad_norm": 5.931758880615234, + "learning_rate": 4.6592290666291163e-05, + "loss": 3.4493, + "num_input_tokens_seen": 1006544, + "step": 1535 + }, + { + "epoch": 0.168804121451277, + "grad_norm": 4.96866512298584, + "learning_rate": 4.657056312883132e-05, + "loss": 3.0963, + "num_input_tokens_seen": 1009920, + "step": 1540 + }, + { + "epoch": 0.16935218678066424, + "grad_norm": 7.009856224060059, + "learning_rate": 4.6548771643546134e-05, + "loss": 3.0819, + "num_input_tokens_seen": 1012544, + "step": 1545 + }, + { + "epoch": 0.16990025211005153, + "grad_norm": 6.719354629516602, + "learning_rate": 4.652691627503837e-05, + "loss": 3.3187, + "num_input_tokens_seen": 1015248, + "step": 1550 + }, + { + "epoch": 0.17044831743943878, + "grad_norm": 7.1751837730407715, + "learning_rate": 4.650499708810018e-05, + "loss": 3.6579, + "num_input_tokens_seen": 1018720, + "step": 1555 + }, + { + "epoch": 0.17099638276882603, + "grad_norm": 11.277824401855469, + "learning_rate": 4.648301414771293e-05, + "loss": 3.5192, + "num_input_tokens_seen": 1021424, + "step": 1560 + }, + { + "epoch": 0.17154444809821331, + "grad_norm": 9.307093620300293, + "learning_rate": 4.646096751904696e-05, + "loss": 3.2431, + "num_input_tokens_seen": 1024192, + "step": 1565 + }, + { + "epoch": 0.17209251342760057, + "grad_norm": 6.657312393188477, + "learning_rate": 4.643885726746143e-05, + "loss": 3.1878, + "num_input_tokens_seen": 1027600, + "step": 1570 + }, + { + "epoch": 0.17264057875698782, + "grad_norm": 5.908510208129883, + "learning_rate": 4.641668345850414e-05, + "loss": 3.67, + "num_input_tokens_seen": 1030168, + "step": 1575 + }, + { + "epoch": 0.1731886440863751, + "grad_norm": 6.540554046630859, + "learning_rate": 4.639444615791128e-05, + "loss": 2.9285, + "num_input_tokens_seen": 1034472, + "step": 1580 + }, + { + "epoch": 0.17373670941576236, + "grad_norm": 6.857239723205566, + "learning_rate": 4.6372145431607264e-05, + "loss": 3.3879, + "num_input_tokens_seen": 1038520, + "step": 1585 + }, + { + "epoch": 0.1742847747451496, + "grad_norm": 5.343799591064453, + "learning_rate": 4.634978134570456e-05, + "loss": 3.3824, + "num_input_tokens_seen": 1041864, + "step": 1590 + }, + { + "epoch": 0.1748328400745369, + "grad_norm": 5.971281051635742, + "learning_rate": 4.632735396650346e-05, + "loss": 3.5344, + "num_input_tokens_seen": 1045192, + "step": 1595 + }, + { + "epoch": 0.17538090540392415, + "grad_norm": 5.474274158477783, + "learning_rate": 4.6304863360491906e-05, + "loss": 3.0682, + "num_input_tokens_seen": 1048680, + "step": 1600 + }, + { + "epoch": 0.1759289707333114, + "grad_norm": 6.720623970031738, + "learning_rate": 4.6282309594345266e-05, + "loss": 3.0808, + "num_input_tokens_seen": 1051776, + "step": 1605 + }, + { + "epoch": 0.17647703606269868, + "grad_norm": 6.88260555267334, + "learning_rate": 4.625969273492614e-05, + "loss": 3.5346, + "num_input_tokens_seen": 1054256, + "step": 1610 + }, + { + "epoch": 0.17702510139208594, + "grad_norm": 6.154021263122559, + "learning_rate": 4.623701284928421e-05, + "loss": 3.2947, + "num_input_tokens_seen": 1057536, + "step": 1615 + }, + { + "epoch": 0.1775731667214732, + "grad_norm": 6.108212471008301, + "learning_rate": 4.6214270004655985e-05, + "loss": 3.3287, + "num_input_tokens_seen": 1060872, + "step": 1620 + }, + { + "epoch": 0.17812123205086047, + "grad_norm": 4.82647705078125, + "learning_rate": 4.6191464268464614e-05, + "loss": 3.3231, + "num_input_tokens_seen": 1063536, + "step": 1625 + }, + { + "epoch": 0.17866929738024773, + "grad_norm": 6.965377330780029, + "learning_rate": 4.61685957083197e-05, + "loss": 3.5096, + "num_input_tokens_seen": 1066392, + "step": 1630 + }, + { + "epoch": 0.17921736270963498, + "grad_norm": 7.133657455444336, + "learning_rate": 4.6145664392017096e-05, + "loss": 3.2534, + "num_input_tokens_seen": 1068920, + "step": 1635 + }, + { + "epoch": 0.17976542803902226, + "grad_norm": 8.859077453613281, + "learning_rate": 4.6122670387538704e-05, + "loss": 3.2012, + "num_input_tokens_seen": 1071696, + "step": 1640 + }, + { + "epoch": 0.18031349336840952, + "grad_norm": 6.119090557098389, + "learning_rate": 4.6099613763052264e-05, + "loss": 3.6088, + "num_input_tokens_seen": 1074720, + "step": 1645 + }, + { + "epoch": 0.18086155869779677, + "grad_norm": 6.804201126098633, + "learning_rate": 4.607649458691115e-05, + "loss": 3.2794, + "num_input_tokens_seen": 1077944, + "step": 1650 + }, + { + "epoch": 0.18140962402718405, + "grad_norm": 7.389477729797363, + "learning_rate": 4.60533129276542e-05, + "loss": 3.4432, + "num_input_tokens_seen": 1080792, + "step": 1655 + }, + { + "epoch": 0.1819576893565713, + "grad_norm": 5.930356502532959, + "learning_rate": 4.6030068854005476e-05, + "loss": 3.2158, + "num_input_tokens_seen": 1083520, + "step": 1660 + }, + { + "epoch": 0.18250575468595856, + "grad_norm": 6.847218036651611, + "learning_rate": 4.6006762434874065e-05, + "loss": 3.4395, + "num_input_tokens_seen": 1086128, + "step": 1665 + }, + { + "epoch": 0.18305382001534584, + "grad_norm": 9.511390686035156, + "learning_rate": 4.598339373935389e-05, + "loss": 3.2795, + "num_input_tokens_seen": 1088560, + "step": 1670 + }, + { + "epoch": 0.1836018853447331, + "grad_norm": 4.90114688873291, + "learning_rate": 4.595996283672349e-05, + "loss": 3.2474, + "num_input_tokens_seen": 1091832, + "step": 1675 + }, + { + "epoch": 0.18414995067412035, + "grad_norm": 9.29576301574707, + "learning_rate": 4.5936469796445854e-05, + "loss": 3.3011, + "num_input_tokens_seen": 1095048, + "step": 1680 + }, + { + "epoch": 0.18469801600350763, + "grad_norm": 6.643434524536133, + "learning_rate": 4.5912914688168134e-05, + "loss": 3.4029, + "num_input_tokens_seen": 1097704, + "step": 1685 + }, + { + "epoch": 0.18524608133289489, + "grad_norm": 4.961350440979004, + "learning_rate": 4.5889297581721526e-05, + "loss": 3.0958, + "num_input_tokens_seen": 1100736, + "step": 1690 + }, + { + "epoch": 0.18579414666228214, + "grad_norm": 7.057353496551514, + "learning_rate": 4.5865618547121016e-05, + "loss": 3.1003, + "num_input_tokens_seen": 1104184, + "step": 1695 + }, + { + "epoch": 0.18634221199166942, + "grad_norm": 3.688004970550537, + "learning_rate": 4.584187765456516e-05, + "loss": 3.5992, + "num_input_tokens_seen": 1107880, + "step": 1700 + }, + { + "epoch": 0.18689027732105667, + "grad_norm": 6.79044246673584, + "learning_rate": 4.5818074974435935e-05, + "loss": 3.5112, + "num_input_tokens_seen": 1110728, + "step": 1705 + }, + { + "epoch": 0.18743834265044393, + "grad_norm": 5.125957489013672, + "learning_rate": 4.579421057729846e-05, + "loss": 3.4606, + "num_input_tokens_seen": 1113632, + "step": 1710 + }, + { + "epoch": 0.18798640797983118, + "grad_norm": 6.708007335662842, + "learning_rate": 4.577028453390084e-05, + "loss": 3.4139, + "num_input_tokens_seen": 1117248, + "step": 1715 + }, + { + "epoch": 0.18853447330921846, + "grad_norm": 4.76835298538208, + "learning_rate": 4.5746296915173924e-05, + "loss": 3.4408, + "num_input_tokens_seen": 1120600, + "step": 1720 + }, + { + "epoch": 0.18908253863860572, + "grad_norm": 6.29659366607666, + "learning_rate": 4.572224779223111e-05, + "loss": 3.4817, + "num_input_tokens_seen": 1123856, + "step": 1725 + }, + { + "epoch": 0.18963060396799297, + "grad_norm": 9.75003433227539, + "learning_rate": 4.569813723636813e-05, + "loss": 3.5152, + "num_input_tokens_seen": 1127872, + "step": 1730 + }, + { + "epoch": 0.19017866929738025, + "grad_norm": 6.846242427825928, + "learning_rate": 4.567396531906285e-05, + "loss": 3.4197, + "num_input_tokens_seen": 1131656, + "step": 1735 + }, + { + "epoch": 0.1907267346267675, + "grad_norm": 6.956099033355713, + "learning_rate": 4.564973211197503e-05, + "loss": 3.5098, + "num_input_tokens_seen": 1135160, + "step": 1740 + }, + { + "epoch": 0.19127479995615476, + "grad_norm": 5.187982559204102, + "learning_rate": 4.562543768694614e-05, + "loss": 3.2708, + "num_input_tokens_seen": 1137640, + "step": 1745 + }, + { + "epoch": 0.19182286528554204, + "grad_norm": 6.0655035972595215, + "learning_rate": 4.5601082115999126e-05, + "loss": 3.1415, + "num_input_tokens_seen": 1140624, + "step": 1750 + }, + { + "epoch": 0.1923709306149293, + "grad_norm": 7.111659049987793, + "learning_rate": 4.557666547133822e-05, + "loss": 3.419, + "num_input_tokens_seen": 1143352, + "step": 1755 + }, + { + "epoch": 0.19291899594431655, + "grad_norm": 5.601785659790039, + "learning_rate": 4.55521878253487e-05, + "loss": 3.1537, + "num_input_tokens_seen": 1146552, + "step": 1760 + }, + { + "epoch": 0.19346706127370383, + "grad_norm": 5.885753154754639, + "learning_rate": 4.5527649250596705e-05, + "loss": 3.1606, + "num_input_tokens_seen": 1150064, + "step": 1765 + }, + { + "epoch": 0.1940151266030911, + "grad_norm": 7.787903785705566, + "learning_rate": 4.5503049819828975e-05, + "loss": 3.5314, + "num_input_tokens_seen": 1152720, + "step": 1770 + }, + { + "epoch": 0.19456319193247834, + "grad_norm": 6.6935133934021, + "learning_rate": 4.5478389605972695e-05, + "loss": 3.2798, + "num_input_tokens_seen": 1155704, + "step": 1775 + }, + { + "epoch": 0.19511125726186562, + "grad_norm": 5.613322734832764, + "learning_rate": 4.545366868213521e-05, + "loss": 2.9432, + "num_input_tokens_seen": 1159064, + "step": 1780 + }, + { + "epoch": 0.19565932259125288, + "grad_norm": 5.332114219665527, + "learning_rate": 4.542888712160389e-05, + "loss": 3.417, + "num_input_tokens_seen": 1162384, + "step": 1785 + }, + { + "epoch": 0.19620738792064013, + "grad_norm": 5.810116291046143, + "learning_rate": 4.540404499784582e-05, + "loss": 3.4744, + "num_input_tokens_seen": 1165168, + "step": 1790 + }, + { + "epoch": 0.1967554532500274, + "grad_norm": 6.959201335906982, + "learning_rate": 4.537914238450768e-05, + "loss": 3.6205, + "num_input_tokens_seen": 1168288, + "step": 1795 + }, + { + "epoch": 0.19730351857941467, + "grad_norm": 7.266166687011719, + "learning_rate": 4.535417935541543e-05, + "loss": 3.5834, + "num_input_tokens_seen": 1170536, + "step": 1800 + }, + { + "epoch": 0.19785158390880192, + "grad_norm": 6.565328598022461, + "learning_rate": 4.5329155984574154e-05, + "loss": 3.094, + "num_input_tokens_seen": 1174016, + "step": 1805 + }, + { + "epoch": 0.1983996492381892, + "grad_norm": 6.1436944007873535, + "learning_rate": 4.5304072346167846e-05, + "loss": 3.6874, + "num_input_tokens_seen": 1177584, + "step": 1810 + }, + { + "epoch": 0.19894771456757646, + "grad_norm": 6.344284534454346, + "learning_rate": 4.527892851455915e-05, + "loss": 3.5916, + "num_input_tokens_seen": 1180544, + "step": 1815 + }, + { + "epoch": 0.1994957798969637, + "grad_norm": 6.047328472137451, + "learning_rate": 4.5253724564289144e-05, + "loss": 3.1019, + "num_input_tokens_seen": 1184376, + "step": 1820 + }, + { + "epoch": 0.200043845226351, + "grad_norm": 5.976099491119385, + "learning_rate": 4.522846057007716e-05, + "loss": 3.0793, + "num_input_tokens_seen": 1187280, + "step": 1825 + }, + { + "epoch": 0.20059191055573825, + "grad_norm": 6.050201892852783, + "learning_rate": 4.5203136606820515e-05, + "loss": 3.1914, + "num_input_tokens_seen": 1190952, + "step": 1830 + }, + { + "epoch": 0.2011399758851255, + "grad_norm": 5.573675632476807, + "learning_rate": 4.517775274959434e-05, + "loss": 3.3849, + "num_input_tokens_seen": 1194568, + "step": 1835 + }, + { + "epoch": 0.20168804121451278, + "grad_norm": 10.978282928466797, + "learning_rate": 4.5152309073651266e-05, + "loss": 3.3821, + "num_input_tokens_seen": 1197992, + "step": 1840 + }, + { + "epoch": 0.20223610654390003, + "grad_norm": 6.215994358062744, + "learning_rate": 4.512680565442133e-05, + "loss": 2.9822, + "num_input_tokens_seen": 1201456, + "step": 1845 + }, + { + "epoch": 0.2027841718732873, + "grad_norm": 5.15269660949707, + "learning_rate": 4.510124256751166e-05, + "loss": 3.0034, + "num_input_tokens_seen": 1205552, + "step": 1850 + }, + { + "epoch": 0.20333223720267457, + "grad_norm": 8.590337753295898, + "learning_rate": 4.507561988870624e-05, + "loss": 3.3385, + "num_input_tokens_seen": 1208496, + "step": 1855 + }, + { + "epoch": 0.20388030253206182, + "grad_norm": 6.038626194000244, + "learning_rate": 4.5049937693965764e-05, + "loss": 3.3063, + "num_input_tokens_seen": 1211856, + "step": 1860 + }, + { + "epoch": 0.20442836786144908, + "grad_norm": 6.621918678283691, + "learning_rate": 4.502419605942735e-05, + "loss": 3.2243, + "num_input_tokens_seen": 1216152, + "step": 1865 + }, + { + "epoch": 0.20497643319083636, + "grad_norm": 6.029962062835693, + "learning_rate": 4.499839506140433e-05, + "loss": 3.4138, + "num_input_tokens_seen": 1219840, + "step": 1870 + }, + { + "epoch": 0.20552449852022361, + "grad_norm": 7.1330952644348145, + "learning_rate": 4.497253477638602e-05, + "loss": 3.3366, + "num_input_tokens_seen": 1222888, + "step": 1875 + }, + { + "epoch": 0.20607256384961087, + "grad_norm": 7.775686264038086, + "learning_rate": 4.494661528103751e-05, + "loss": 3.1706, + "num_input_tokens_seen": 1227096, + "step": 1880 + }, + { + "epoch": 0.20662062917899815, + "grad_norm": 8.789952278137207, + "learning_rate": 4.492063665219941e-05, + "loss": 3.4648, + "num_input_tokens_seen": 1230856, + "step": 1885 + }, + { + "epoch": 0.2071686945083854, + "grad_norm": 7.492274284362793, + "learning_rate": 4.489459896688764e-05, + "loss": 3.6099, + "num_input_tokens_seen": 1234160, + "step": 1890 + }, + { + "epoch": 0.20771675983777266, + "grad_norm": 6.971865177154541, + "learning_rate": 4.48685023022932e-05, + "loss": 3.037, + "num_input_tokens_seen": 1236904, + "step": 1895 + }, + { + "epoch": 0.20826482516715994, + "grad_norm": 9.107683181762695, + "learning_rate": 4.484234673578196e-05, + "loss": 3.435, + "num_input_tokens_seen": 1239936, + "step": 1900 + }, + { + "epoch": 0.2088128904965472, + "grad_norm": 6.467232704162598, + "learning_rate": 4.4816132344894354e-05, + "loss": 3.6629, + "num_input_tokens_seen": 1242952, + "step": 1905 + }, + { + "epoch": 0.20936095582593445, + "grad_norm": 6.295756816864014, + "learning_rate": 4.4789859207345274e-05, + "loss": 3.1083, + "num_input_tokens_seen": 1246560, + "step": 1910 + }, + { + "epoch": 0.2099090211553217, + "grad_norm": 5.817240238189697, + "learning_rate": 4.4763527401023724e-05, + "loss": 3.2389, + "num_input_tokens_seen": 1249904, + "step": 1915 + }, + { + "epoch": 0.21045708648470898, + "grad_norm": 7.3531317710876465, + "learning_rate": 4.473713700399266e-05, + "loss": 3.1022, + "num_input_tokens_seen": 1252272, + "step": 1920 + }, + { + "epoch": 0.21100515181409624, + "grad_norm": 7.078802108764648, + "learning_rate": 4.471068809448872e-05, + "loss": 3.2372, + "num_input_tokens_seen": 1255904, + "step": 1925 + }, + { + "epoch": 0.2115532171434835, + "grad_norm": 5.776179313659668, + "learning_rate": 4.468418075092201e-05, + "loss": 3.2817, + "num_input_tokens_seen": 1259024, + "step": 1930 + }, + { + "epoch": 0.21210128247287077, + "grad_norm": 9.986640930175781, + "learning_rate": 4.465761505187589e-05, + "loss": 3.349, + "num_input_tokens_seen": 1262584, + "step": 1935 + }, + { + "epoch": 0.21264934780225803, + "grad_norm": 8.421146392822266, + "learning_rate": 4.463099107610669e-05, + "loss": 3.2711, + "num_input_tokens_seen": 1266072, + "step": 1940 + }, + { + "epoch": 0.21319741313164528, + "grad_norm": 8.646468162536621, + "learning_rate": 4.460430890254353e-05, + "loss": 3.264, + "num_input_tokens_seen": 1269528, + "step": 1945 + }, + { + "epoch": 0.21374547846103256, + "grad_norm": 6.439562797546387, + "learning_rate": 4.457756861028804e-05, + "loss": 3.2899, + "num_input_tokens_seen": 1272200, + "step": 1950 + }, + { + "epoch": 0.21429354379041982, + "grad_norm": 8.170503616333008, + "learning_rate": 4.455077027861417e-05, + "loss": 3.3649, + "num_input_tokens_seen": 1275360, + "step": 1955 + }, + { + "epoch": 0.21484160911980707, + "grad_norm": 6.329521179199219, + "learning_rate": 4.452391398696794e-05, + "loss": 3.4714, + "num_input_tokens_seen": 1278480, + "step": 1960 + }, + { + "epoch": 0.21538967444919435, + "grad_norm": 7.618672847747803, + "learning_rate": 4.449699981496714e-05, + "loss": 3.1889, + "num_input_tokens_seen": 1281312, + "step": 1965 + }, + { + "epoch": 0.2159377397785816, + "grad_norm": 5.937787055969238, + "learning_rate": 4.447002784240122e-05, + "loss": 3.2998, + "num_input_tokens_seen": 1284456, + "step": 1970 + }, + { + "epoch": 0.21648580510796886, + "grad_norm": 6.004344463348389, + "learning_rate": 4.444299814923096e-05, + "loss": 3.5535, + "num_input_tokens_seen": 1287512, + "step": 1975 + }, + { + "epoch": 0.21703387043735614, + "grad_norm": 6.512199878692627, + "learning_rate": 4.4415910815588235e-05, + "loss": 3.4036, + "num_input_tokens_seen": 1290336, + "step": 1980 + }, + { + "epoch": 0.2175819357667434, + "grad_norm": 6.4987616539001465, + "learning_rate": 4.438876592177584e-05, + "loss": 3.6318, + "num_input_tokens_seen": 1292832, + "step": 1985 + }, + { + "epoch": 0.21813000109613065, + "grad_norm": 5.955297946929932, + "learning_rate": 4.4361563548267186e-05, + "loss": 3.4087, + "num_input_tokens_seen": 1296336, + "step": 1990 + }, + { + "epoch": 0.21867806642551793, + "grad_norm": 9.001585960388184, + "learning_rate": 4.4334303775706087e-05, + "loss": 3.0256, + "num_input_tokens_seen": 1299928, + "step": 1995 + }, + { + "epoch": 0.21922613175490518, + "grad_norm": 8.543002128601074, + "learning_rate": 4.4306986684906534e-05, + "loss": 3.0983, + "num_input_tokens_seen": 1303344, + "step": 2000 + }, + { + "epoch": 0.21977419708429244, + "grad_norm": 5.445712089538574, + "learning_rate": 4.427961235685245e-05, + "loss": 3.5193, + "num_input_tokens_seen": 1306536, + "step": 2005 + }, + { + "epoch": 0.22032226241367972, + "grad_norm": 4.273796558380127, + "learning_rate": 4.4252180872697403e-05, + "loss": 3.036, + "num_input_tokens_seen": 1311056, + "step": 2010 + }, + { + "epoch": 0.22087032774306697, + "grad_norm": 5.357060432434082, + "learning_rate": 4.422469231376445e-05, + "loss": 3.2927, + "num_input_tokens_seen": 1314432, + "step": 2015 + }, + { + "epoch": 0.22141839307245423, + "grad_norm": 6.554574012756348, + "learning_rate": 4.4197146761545825e-05, + "loss": 3.4088, + "num_input_tokens_seen": 1317568, + "step": 2020 + }, + { + "epoch": 0.2219664584018415, + "grad_norm": 5.920197486877441, + "learning_rate": 4.4169544297702745e-05, + "loss": 3.1075, + "num_input_tokens_seen": 1321288, + "step": 2025 + }, + { + "epoch": 0.22251452373122876, + "grad_norm": 5.399965763092041, + "learning_rate": 4.414188500406513e-05, + "loss": 3.023, + "num_input_tokens_seen": 1324832, + "step": 2030 + }, + { + "epoch": 0.22306258906061602, + "grad_norm": 4.449610710144043, + "learning_rate": 4.411416896263137e-05, + "loss": 3.2649, + "num_input_tokens_seen": 1327992, + "step": 2035 + }, + { + "epoch": 0.2236106543900033, + "grad_norm": 5.2429304122924805, + "learning_rate": 4.408639625556812e-05, + "loss": 3.2027, + "num_input_tokens_seen": 1331448, + "step": 2040 + }, + { + "epoch": 0.22415871971939055, + "grad_norm": 5.563135623931885, + "learning_rate": 4.405856696520998e-05, + "loss": 3.0106, + "num_input_tokens_seen": 1334672, + "step": 2045 + }, + { + "epoch": 0.2247067850487778, + "grad_norm": 9.401083946228027, + "learning_rate": 4.403068117405933e-05, + "loss": 3.5604, + "num_input_tokens_seen": 1338664, + "step": 2050 + }, + { + "epoch": 0.2252548503781651, + "grad_norm": 6.381105899810791, + "learning_rate": 4.4002738964786047e-05, + "loss": 3.1456, + "num_input_tokens_seen": 1341320, + "step": 2055 + }, + { + "epoch": 0.22580291570755234, + "grad_norm": 8.379097938537598, + "learning_rate": 4.397474042022727e-05, + "loss": 3.7295, + "num_input_tokens_seen": 1344712, + "step": 2060 + }, + { + "epoch": 0.2263509810369396, + "grad_norm": 5.414994239807129, + "learning_rate": 4.394668562338711e-05, + "loss": 3.2339, + "num_input_tokens_seen": 1348704, + "step": 2065 + }, + { + "epoch": 0.22689904636632688, + "grad_norm": 6.6783447265625, + "learning_rate": 4.391857465743649e-05, + "loss": 3.1633, + "num_input_tokens_seen": 1352136, + "step": 2070 + }, + { + "epoch": 0.22744711169571413, + "grad_norm": 6.781215667724609, + "learning_rate": 4.389040760571284e-05, + "loss": 3.2454, + "num_input_tokens_seen": 1355704, + "step": 2075 + }, + { + "epoch": 0.2279951770251014, + "grad_norm": 8.376158714294434, + "learning_rate": 4.386218455171984e-05, + "loss": 3.2688, + "num_input_tokens_seen": 1358224, + "step": 2080 + }, + { + "epoch": 0.22854324235448867, + "grad_norm": 6.815377712249756, + "learning_rate": 4.383390557912722e-05, + "loss": 3.2047, + "num_input_tokens_seen": 1361624, + "step": 2085 + }, + { + "epoch": 0.22909130768387592, + "grad_norm": 9.893330574035645, + "learning_rate": 4.380557077177046e-05, + "loss": 3.3861, + "num_input_tokens_seen": 1365672, + "step": 2090 + }, + { + "epoch": 0.22963937301326318, + "grad_norm": 5.984465599060059, + "learning_rate": 4.3777180213650587e-05, + "loss": 3.2901, + "num_input_tokens_seen": 1368440, + "step": 2095 + }, + { + "epoch": 0.23018743834265046, + "grad_norm": 8.21902847290039, + "learning_rate": 4.37487339889339e-05, + "loss": 3.135, + "num_input_tokens_seen": 1370736, + "step": 2100 + }, + { + "epoch": 0.2307355036720377, + "grad_norm": 7.617781639099121, + "learning_rate": 4.3720232181951726e-05, + "loss": 3.2967, + "num_input_tokens_seen": 1373632, + "step": 2105 + }, + { + "epoch": 0.23128356900142497, + "grad_norm": 5.901704788208008, + "learning_rate": 4.3691674877200164e-05, + "loss": 3.0304, + "num_input_tokens_seen": 1376840, + "step": 2110 + }, + { + "epoch": 0.23183163433081222, + "grad_norm": 7.1147074699401855, + "learning_rate": 4.3663062159339855e-05, + "loss": 3.2797, + "num_input_tokens_seen": 1380024, + "step": 2115 + }, + { + "epoch": 0.2323796996601995, + "grad_norm": 6.9793243408203125, + "learning_rate": 4.363439411319571e-05, + "loss": 3.6079, + "num_input_tokens_seen": 1382992, + "step": 2120 + }, + { + "epoch": 0.23292776498958676, + "grad_norm": 5.454427242279053, + "learning_rate": 4.360567082375666e-05, + "loss": 3.1035, + "num_input_tokens_seen": 1385936, + "step": 2125 + }, + { + "epoch": 0.233475830318974, + "grad_norm": 9.776113510131836, + "learning_rate": 4.3576892376175414e-05, + "loss": 3.1049, + "num_input_tokens_seen": 1389176, + "step": 2130 + }, + { + "epoch": 0.2340238956483613, + "grad_norm": 5.588262557983398, + "learning_rate": 4.3553829961575053e-05, + "loss": 3.0589, + "num_input_tokens_seen": 1392080, + "step": 2135 + }, + { + "epoch": 0.23457196097774854, + "grad_norm": 7.208589553833008, + "learning_rate": 4.352495244444449e-05, + "loss": 3.3501, + "num_input_tokens_seen": 1395360, + "step": 2140 + }, + { + "epoch": 0.2351200263071358, + "grad_norm": 5.150116920471191, + "learning_rate": 4.349602000846844e-05, + "loss": 3.4204, + "num_input_tokens_seen": 1398760, + "step": 2145 + }, + { + "epoch": 0.23566809163652308, + "grad_norm": 7.456035137176514, + "learning_rate": 4.346703273941965e-05, + "loss": 2.9937, + "num_input_tokens_seen": 1402384, + "step": 2150 + }, + { + "epoch": 0.23621615696591033, + "grad_norm": 5.8624067306518555, + "learning_rate": 4.3437990723233416e-05, + "loss": 3.233, + "num_input_tokens_seen": 1406152, + "step": 2155 + }, + { + "epoch": 0.2367642222952976, + "grad_norm": 5.129085063934326, + "learning_rate": 4.3408894046007354e-05, + "loss": 3.3833, + "num_input_tokens_seen": 1409704, + "step": 2160 + }, + { + "epoch": 0.23731228762468487, + "grad_norm": 7.074642658233643, + "learning_rate": 4.337974279400111e-05, + "loss": 3.2288, + "num_input_tokens_seen": 1412984, + "step": 2165 + }, + { + "epoch": 0.23786035295407212, + "grad_norm": 7.073869228363037, + "learning_rate": 4.335053705363611e-05, + "loss": 3.1338, + "num_input_tokens_seen": 1416232, + "step": 2170 + }, + { + "epoch": 0.23840841828345938, + "grad_norm": 6.7071990966796875, + "learning_rate": 4.332127691149535e-05, + "loss": 3.1272, + "num_input_tokens_seen": 1419904, + "step": 2175 + }, + { + "epoch": 0.23895648361284666, + "grad_norm": 8.463297843933105, + "learning_rate": 4.3291962454323076e-05, + "loss": 3.3227, + "num_input_tokens_seen": 1423048, + "step": 2180 + }, + { + "epoch": 0.2395045489422339, + "grad_norm": 7.098794460296631, + "learning_rate": 4.3262593769024576e-05, + "loss": 3.1422, + "num_input_tokens_seen": 1425568, + "step": 2185 + }, + { + "epoch": 0.24005261427162117, + "grad_norm": 5.919711589813232, + "learning_rate": 4.323317094266589e-05, + "loss": 3.0584, + "num_input_tokens_seen": 1429464, + "step": 2190 + }, + { + "epoch": 0.24060067960100845, + "grad_norm": 5.311784267425537, + "learning_rate": 4.320369406247356e-05, + "loss": 2.8391, + "num_input_tokens_seen": 1432832, + "step": 2195 + }, + { + "epoch": 0.2411487449303957, + "grad_norm": 6.239211559295654, + "learning_rate": 4.317416321583437e-05, + "loss": 3.1701, + "num_input_tokens_seen": 1435960, + "step": 2200 + }, + { + "epoch": 0.24169681025978296, + "grad_norm": 9.268356323242188, + "learning_rate": 4.314457849029513e-05, + "loss": 3.3796, + "num_input_tokens_seen": 1439752, + "step": 2205 + }, + { + "epoch": 0.24224487558917024, + "grad_norm": 7.6005449295043945, + "learning_rate": 4.311493997356234e-05, + "loss": 3.189, + "num_input_tokens_seen": 1442488, + "step": 2210 + }, + { + "epoch": 0.2427929409185575, + "grad_norm": 6.128123760223389, + "learning_rate": 4.308524775350198e-05, + "loss": 3.2867, + "num_input_tokens_seen": 1445800, + "step": 2215 + }, + { + "epoch": 0.24334100624794475, + "grad_norm": 6.555956840515137, + "learning_rate": 4.305550191813923e-05, + "loss": 3.1985, + "num_input_tokens_seen": 1448992, + "step": 2220 + }, + { + "epoch": 0.24388907157733203, + "grad_norm": 6.0009446144104, + "learning_rate": 4.302570255565825e-05, + "loss": 3.1752, + "num_input_tokens_seen": 1452104, + "step": 2225 + }, + { + "epoch": 0.24443713690671928, + "grad_norm": 5.329344749450684, + "learning_rate": 4.299584975440184e-05, + "loss": 2.9533, + "num_input_tokens_seen": 1457016, + "step": 2230 + }, + { + "epoch": 0.24498520223610654, + "grad_norm": 4.869180202484131, + "learning_rate": 4.296594360287126e-05, + "loss": 2.9869, + "num_input_tokens_seen": 1459624, + "step": 2235 + }, + { + "epoch": 0.24553326756549382, + "grad_norm": 6.4714202880859375, + "learning_rate": 4.293598418972592e-05, + "loss": 3.2594, + "num_input_tokens_seen": 1462696, + "step": 2240 + }, + { + "epoch": 0.24608133289488107, + "grad_norm": 10.35406494140625, + "learning_rate": 4.2905971603783116e-05, + "loss": 3.164, + "num_input_tokens_seen": 1466832, + "step": 2245 + }, + { + "epoch": 0.24662939822426833, + "grad_norm": 5.773983001708984, + "learning_rate": 4.287590593401778e-05, + "loss": 3.2342, + "num_input_tokens_seen": 1470288, + "step": 2250 + }, + { + "epoch": 0.2471774635536556, + "grad_norm": 5.758610248565674, + "learning_rate": 4.284578726956225e-05, + "loss": 3.38, + "num_input_tokens_seen": 1473032, + "step": 2255 + }, + { + "epoch": 0.24772552888304286, + "grad_norm": 7.092349529266357, + "learning_rate": 4.2815615699705943e-05, + "loss": 3.1884, + "num_input_tokens_seen": 1476104, + "step": 2260 + }, + { + "epoch": 0.24827359421243012, + "grad_norm": 8.047478675842285, + "learning_rate": 4.2785391313895103e-05, + "loss": 3.3215, + "num_input_tokens_seen": 1479376, + "step": 2265 + }, + { + "epoch": 0.2488216595418174, + "grad_norm": 7.5882439613342285, + "learning_rate": 4.27551142017326e-05, + "loss": 3.0476, + "num_input_tokens_seen": 1482248, + "step": 2270 + }, + { + "epoch": 0.24936972487120465, + "grad_norm": 5.922421932220459, + "learning_rate": 4.2724784452977565e-05, + "loss": 3.3373, + "num_input_tokens_seen": 1485232, + "step": 2275 + }, + { + "epoch": 0.2499177902005919, + "grad_norm": 6.161900520324707, + "learning_rate": 4.26944021575452e-05, + "loss": 3.0011, + "num_input_tokens_seen": 1488896, + "step": 2280 + }, + { + "epoch": 0.2504658555299792, + "grad_norm": 7.3562397956848145, + "learning_rate": 4.2663967405506486e-05, + "loss": 2.9991, + "num_input_tokens_seen": 1492072, + "step": 2285 + }, + { + "epoch": 0.2510139208593664, + "grad_norm": 6.788776397705078, + "learning_rate": 4.263348028708792e-05, + "loss": 2.9735, + "num_input_tokens_seen": 1495224, + "step": 2290 + }, + { + "epoch": 0.2515619861887537, + "grad_norm": 8.632386207580566, + "learning_rate": 4.260294089267123e-05, + "loss": 3.2221, + "num_input_tokens_seen": 1498256, + "step": 2295 + }, + { + "epoch": 0.252110051518141, + "grad_norm": 6.462652683258057, + "learning_rate": 4.257234931279313e-05, + "loss": 2.8929, + "num_input_tokens_seen": 1501824, + "step": 2300 + }, + { + "epoch": 0.2526581168475282, + "grad_norm": 7.380079746246338, + "learning_rate": 4.254170563814505e-05, + "loss": 3.2545, + "num_input_tokens_seen": 1504768, + "step": 2305 + }, + { + "epoch": 0.2532061821769155, + "grad_norm": 5.370420455932617, + "learning_rate": 4.2511009959572826e-05, + "loss": 3.4558, + "num_input_tokens_seen": 1508056, + "step": 2310 + }, + { + "epoch": 0.25375424750630277, + "grad_norm": 5.953249454498291, + "learning_rate": 4.2480262368076504e-05, + "loss": 3.2177, + "num_input_tokens_seen": 1511920, + "step": 2315 + }, + { + "epoch": 0.25430231283569, + "grad_norm": 5.694786548614502, + "learning_rate": 4.244946295481001e-05, + "loss": 3.2378, + "num_input_tokens_seen": 1514936, + "step": 2320 + }, + { + "epoch": 0.2548503781650773, + "grad_norm": 7.257277965545654, + "learning_rate": 4.241861181108092e-05, + "loss": 3.616, + "num_input_tokens_seen": 1518416, + "step": 2325 + }, + { + "epoch": 0.25539844349446456, + "grad_norm": 6.388315200805664, + "learning_rate": 4.238770902835013e-05, + "loss": 3.2898, + "num_input_tokens_seen": 1521960, + "step": 2330 + }, + { + "epoch": 0.2559465088238518, + "grad_norm": 8.813338279724121, + "learning_rate": 4.235675469823166e-05, + "loss": 3.4491, + "num_input_tokens_seen": 1525312, + "step": 2335 + }, + { + "epoch": 0.25649457415323906, + "grad_norm": 6.0403947830200195, + "learning_rate": 4.232574891249234e-05, + "loss": 3.0747, + "num_input_tokens_seen": 1528632, + "step": 2340 + }, + { + "epoch": 0.25704263948262634, + "grad_norm": 6.77452278137207, + "learning_rate": 4.229469176305153e-05, + "loss": 3.2356, + "num_input_tokens_seen": 1532200, + "step": 2345 + }, + { + "epoch": 0.25759070481201357, + "grad_norm": 6.781161785125732, + "learning_rate": 4.2263583341980885e-05, + "loss": 3.1273, + "num_input_tokens_seen": 1535624, + "step": 2350 + }, + { + "epoch": 0.25813877014140085, + "grad_norm": 6.070975303649902, + "learning_rate": 4.223242374150402e-05, + "loss": 3.0905, + "num_input_tokens_seen": 1538504, + "step": 2355 + }, + { + "epoch": 0.25868683547078813, + "grad_norm": 6.770239353179932, + "learning_rate": 4.220121305399634e-05, + "loss": 3.2115, + "num_input_tokens_seen": 1541520, + "step": 2360 + }, + { + "epoch": 0.25923490080017536, + "grad_norm": 6.523434638977051, + "learning_rate": 4.216995137198463e-05, + "loss": 3.2605, + "num_input_tokens_seen": 1545656, + "step": 2365 + }, + { + "epoch": 0.25978296612956264, + "grad_norm": 6.475868225097656, + "learning_rate": 4.213863878814691e-05, + "loss": 3.2498, + "num_input_tokens_seen": 1549464, + "step": 2370 + }, + { + "epoch": 0.2603310314589499, + "grad_norm": 7.743395805358887, + "learning_rate": 4.210727539531206e-05, + "loss": 3.0166, + "num_input_tokens_seen": 1553408, + "step": 2375 + }, + { + "epoch": 0.26087909678833715, + "grad_norm": 6.206083297729492, + "learning_rate": 4.207586128645963e-05, + "loss": 3.2151, + "num_input_tokens_seen": 1557112, + "step": 2380 + }, + { + "epoch": 0.26142716211772443, + "grad_norm": 7.58196496963501, + "learning_rate": 4.204439655471949e-05, + "loss": 3.5573, + "num_input_tokens_seen": 1560984, + "step": 2385 + }, + { + "epoch": 0.2619752274471117, + "grad_norm": 8.101637840270996, + "learning_rate": 4.201288129337158e-05, + "loss": 3.4451, + "num_input_tokens_seen": 1563808, + "step": 2390 + }, + { + "epoch": 0.26252329277649894, + "grad_norm": 9.19637680053711, + "learning_rate": 4.1981315595845684e-05, + "loss": 3.191, + "num_input_tokens_seen": 1567344, + "step": 2395 + }, + { + "epoch": 0.2630713581058862, + "grad_norm": 7.602110862731934, + "learning_rate": 4.194969955572105e-05, + "loss": 3.7303, + "num_input_tokens_seen": 1570104, + "step": 2400 + }, + { + "epoch": 0.2636194234352735, + "grad_norm": 10.502030372619629, + "learning_rate": 4.191803326672622e-05, + "loss": 3.2205, + "num_input_tokens_seen": 1572864, + "step": 2405 + }, + { + "epoch": 0.26416748876466073, + "grad_norm": 5.903884410858154, + "learning_rate": 4.188631682273868e-05, + "loss": 3.5156, + "num_input_tokens_seen": 1575720, + "step": 2410 + }, + { + "epoch": 0.264715554094048, + "grad_norm": 5.067075729370117, + "learning_rate": 4.1854550317784604e-05, + "loss": 3.1053, + "num_input_tokens_seen": 1579008, + "step": 2415 + }, + { + "epoch": 0.2652636194234353, + "grad_norm": 6.393657207489014, + "learning_rate": 4.1822733846038584e-05, + "loss": 3.1813, + "num_input_tokens_seen": 1582216, + "step": 2420 + }, + { + "epoch": 0.2658116847528225, + "grad_norm": 10.575018882751465, + "learning_rate": 4.1790867501823345e-05, + "loss": 3.7197, + "num_input_tokens_seen": 1585440, + "step": 2425 + }, + { + "epoch": 0.2663597500822098, + "grad_norm": 7.280240535736084, + "learning_rate": 4.175895137960945e-05, + "loss": 3.0196, + "num_input_tokens_seen": 1588248, + "step": 2430 + }, + { + "epoch": 0.2669078154115971, + "grad_norm": 6.695456504821777, + "learning_rate": 4.172698557401503e-05, + "loss": 2.9587, + "num_input_tokens_seen": 1591288, + "step": 2435 + }, + { + "epoch": 0.2674558807409843, + "grad_norm": 6.2725653648376465, + "learning_rate": 4.169497017980555e-05, + "loss": 3.3583, + "num_input_tokens_seen": 1595056, + "step": 2440 + }, + { + "epoch": 0.2680039460703716, + "grad_norm": 6.505600929260254, + "learning_rate": 4.166290529189342e-05, + "loss": 3.474, + "num_input_tokens_seen": 1598096, + "step": 2445 + }, + { + "epoch": 0.26855201139975887, + "grad_norm": 7.131421089172363, + "learning_rate": 4.163079100533783e-05, + "loss": 3.2172, + "num_input_tokens_seen": 1602648, + "step": 2450 + }, + { + "epoch": 0.2691000767291461, + "grad_norm": 5.818497657775879, + "learning_rate": 4.1598627415344394e-05, + "loss": 3.2497, + "num_input_tokens_seen": 1605776, + "step": 2455 + }, + { + "epoch": 0.2696481420585334, + "grad_norm": 8.350225448608398, + "learning_rate": 4.156641461726489e-05, + "loss": 3.2372, + "num_input_tokens_seen": 1609960, + "step": 2460 + }, + { + "epoch": 0.27019620738792066, + "grad_norm": 10.619945526123047, + "learning_rate": 4.153415270659699e-05, + "loss": 3.0958, + "num_input_tokens_seen": 1612808, + "step": 2465 + }, + { + "epoch": 0.2707442727173079, + "grad_norm": 6.475553035736084, + "learning_rate": 4.150184177898394e-05, + "loss": 3.4121, + "num_input_tokens_seen": 1616104, + "step": 2470 + }, + { + "epoch": 0.27129233804669517, + "grad_norm": 9.670978546142578, + "learning_rate": 4.1469481930214335e-05, + "loss": 3.1002, + "num_input_tokens_seen": 1618920, + "step": 2475 + }, + { + "epoch": 0.27184040337608245, + "grad_norm": 5.271237850189209, + "learning_rate": 4.1437073256221784e-05, + "loss": 3.1366, + "num_input_tokens_seen": 1622272, + "step": 2480 + }, + { + "epoch": 0.2723884687054697, + "grad_norm": 6.107699394226074, + "learning_rate": 4.1404615853084626e-05, + "loss": 3.5266, + "num_input_tokens_seen": 1624928, + "step": 2485 + }, + { + "epoch": 0.27293653403485696, + "grad_norm": 8.945226669311523, + "learning_rate": 4.137210981702568e-05, + "loss": 3.627, + "num_input_tokens_seen": 1628632, + "step": 2490 + }, + { + "epoch": 0.27348459936424424, + "grad_norm": 5.393161296844482, + "learning_rate": 4.133955524441196e-05, + "loss": 3.6371, + "num_input_tokens_seen": 1631272, + "step": 2495 + }, + { + "epoch": 0.27403266469363147, + "grad_norm": 7.735115051269531, + "learning_rate": 4.130695223175434e-05, + "loss": 3.4529, + "num_input_tokens_seen": 1634272, + "step": 2500 + }, + { + "epoch": 0.27458073002301875, + "grad_norm": 9.375452041625977, + "learning_rate": 4.1274300875707295e-05, + "loss": 3.2474, + "num_input_tokens_seen": 1638000, + "step": 2505 + }, + { + "epoch": 0.27512879535240603, + "grad_norm": 6.957891464233398, + "learning_rate": 4.124160127306864e-05, + "loss": 3.0279, + "num_input_tokens_seen": 1641896, + "step": 2510 + }, + { + "epoch": 0.27567686068179326, + "grad_norm": 6.637111663818359, + "learning_rate": 4.120885352077922e-05, + "loss": 3.5516, + "num_input_tokens_seen": 1645288, + "step": 2515 + }, + { + "epoch": 0.27622492601118054, + "grad_norm": 6.921294212341309, + "learning_rate": 4.1176057715922624e-05, + "loss": 3.2415, + "num_input_tokens_seen": 1648800, + "step": 2520 + }, + { + "epoch": 0.2767729913405678, + "grad_norm": 6.21347713470459, + "learning_rate": 4.114321395572488e-05, + "loss": 3.3217, + "num_input_tokens_seen": 1652416, + "step": 2525 + }, + { + "epoch": 0.27732105666995505, + "grad_norm": 7.985599040985107, + "learning_rate": 4.111032233755418e-05, + "loss": 3.0362, + "num_input_tokens_seen": 1655720, + "step": 2530 + }, + { + "epoch": 0.27786912199934233, + "grad_norm": 6.855371952056885, + "learning_rate": 4.107738295892063e-05, + "loss": 3.0962, + "num_input_tokens_seen": 1659440, + "step": 2535 + }, + { + "epoch": 0.2784171873287296, + "grad_norm": 7.123937129974365, + "learning_rate": 4.104439591747591e-05, + "loss": 3.102, + "num_input_tokens_seen": 1662400, + "step": 2540 + }, + { + "epoch": 0.27896525265811684, + "grad_norm": 6.53096866607666, + "learning_rate": 4.101136131101297e-05, + "loss": 2.9064, + "num_input_tokens_seen": 1665336, + "step": 2545 + }, + { + "epoch": 0.2795133179875041, + "grad_norm": 8.0481538772583, + "learning_rate": 4.0978279237465825e-05, + "loss": 3.103, + "num_input_tokens_seen": 1668288, + "step": 2550 + }, + { + "epoch": 0.2800613833168914, + "grad_norm": 4.704191207885742, + "learning_rate": 4.094514979490917e-05, + "loss": 2.9912, + "num_input_tokens_seen": 1671840, + "step": 2555 + }, + { + "epoch": 0.2806094486462786, + "grad_norm": 6.396568775177002, + "learning_rate": 4.091197308155814e-05, + "loss": 3.0125, + "num_input_tokens_seen": 1675512, + "step": 2560 + }, + { + "epoch": 0.2811575139756659, + "grad_norm": 6.377243518829346, + "learning_rate": 4.087874919576801e-05, + "loss": 2.9588, + "num_input_tokens_seen": 1679232, + "step": 2565 + }, + { + "epoch": 0.2817055793050532, + "grad_norm": 7.850512981414795, + "learning_rate": 4.084547823603391e-05, + "loss": 3.1181, + "num_input_tokens_seen": 1682432, + "step": 2570 + }, + { + "epoch": 0.2822536446344404, + "grad_norm": 7.351206302642822, + "learning_rate": 4.08121603009905e-05, + "loss": 3.2493, + "num_input_tokens_seen": 1686064, + "step": 2575 + }, + { + "epoch": 0.2828017099638277, + "grad_norm": 6.765766620635986, + "learning_rate": 4.077879548941172e-05, + "loss": 2.9447, + "num_input_tokens_seen": 1689312, + "step": 2580 + }, + { + "epoch": 0.283349775293215, + "grad_norm": 6.162474155426025, + "learning_rate": 4.0745383900210514e-05, + "loss": 3.0923, + "num_input_tokens_seen": 1692976, + "step": 2585 + }, + { + "epoch": 0.2838978406226022, + "grad_norm": 6.094540119171143, + "learning_rate": 4.071192563243843e-05, + "loss": 3.4034, + "num_input_tokens_seen": 1695344, + "step": 2590 + }, + { + "epoch": 0.2844459059519895, + "grad_norm": 9.006319999694824, + "learning_rate": 4.0678420785285446e-05, + "loss": 3.3876, + "num_input_tokens_seen": 1698336, + "step": 2595 + }, + { + "epoch": 0.28499397128137677, + "grad_norm": 7.306302070617676, + "learning_rate": 4.064486945807963e-05, + "loss": 2.9591, + "num_input_tokens_seen": 1703912, + "step": 2600 + }, + { + "epoch": 0.285542036610764, + "grad_norm": 5.706150054931641, + "learning_rate": 4.0611271750286805e-05, + "loss": 3.0137, + "num_input_tokens_seen": 1707664, + "step": 2605 + }, + { + "epoch": 0.2860901019401513, + "grad_norm": 7.290525436401367, + "learning_rate": 4.057762776151035e-05, + "loss": 3.4755, + "num_input_tokens_seen": 1710832, + "step": 2610 + }, + { + "epoch": 0.2866381672695385, + "grad_norm": 7.548462867736816, + "learning_rate": 4.054393759149081e-05, + "loss": 3.1482, + "num_input_tokens_seen": 1713616, + "step": 2615 + }, + { + "epoch": 0.2871862325989258, + "grad_norm": 7.191598415374756, + "learning_rate": 4.051020134010564e-05, + "loss": 3.5189, + "num_input_tokens_seen": 1717328, + "step": 2620 + }, + { + "epoch": 0.28773429792831307, + "grad_norm": 5.576016426086426, + "learning_rate": 4.0476419107368924e-05, + "loss": 3.1058, + "num_input_tokens_seen": 1720976, + "step": 2625 + }, + { + "epoch": 0.2882823632577003, + "grad_norm": 5.512149333953857, + "learning_rate": 4.044259099343104e-05, + "loss": 3.3606, + "num_input_tokens_seen": 1723840, + "step": 2630 + }, + { + "epoch": 0.2888304285870876, + "grad_norm": 6.475109100341797, + "learning_rate": 4.040871709857842e-05, + "loss": 3.2876, + "num_input_tokens_seen": 1726944, + "step": 2635 + }, + { + "epoch": 0.28937849391647485, + "grad_norm": 6.24223518371582, + "learning_rate": 4.037479752323317e-05, + "loss": 3.2583, + "num_input_tokens_seen": 1730056, + "step": 2640 + }, + { + "epoch": 0.2899265592458621, + "grad_norm": 7.499751091003418, + "learning_rate": 4.034083236795286e-05, + "loss": 3.6548, + "num_input_tokens_seen": 1733800, + "step": 2645 + }, + { + "epoch": 0.29047462457524936, + "grad_norm": 5.272352695465088, + "learning_rate": 4.030682173343016e-05, + "loss": 3.345, + "num_input_tokens_seen": 1738176, + "step": 2650 + }, + { + "epoch": 0.29102268990463664, + "grad_norm": 4.747354030609131, + "learning_rate": 4.027276572049259e-05, + "loss": 2.8691, + "num_input_tokens_seen": 1742088, + "step": 2655 + }, + { + "epoch": 0.29157075523402387, + "grad_norm": 4.695064544677734, + "learning_rate": 4.0238664430102175e-05, + "loss": 3.3259, + "num_input_tokens_seen": 1746032, + "step": 2660 + }, + { + "epoch": 0.29211882056341115, + "grad_norm": 5.169468402862549, + "learning_rate": 4.020451796335518e-05, + "loss": 3.193, + "num_input_tokens_seen": 1749336, + "step": 2665 + }, + { + "epoch": 0.29266688589279843, + "grad_norm": 6.7505340576171875, + "learning_rate": 4.017032642148181e-05, + "loss": 3.1603, + "num_input_tokens_seen": 1752808, + "step": 2670 + }, + { + "epoch": 0.29321495122218566, + "grad_norm": 8.776106834411621, + "learning_rate": 4.0136089905845874e-05, + "loss": 3.065, + "num_input_tokens_seen": 1756768, + "step": 2675 + }, + { + "epoch": 0.29376301655157294, + "grad_norm": 5.4388203620910645, + "learning_rate": 4.010180851794453e-05, + "loss": 3.3523, + "num_input_tokens_seen": 1759960, + "step": 2680 + }, + { + "epoch": 0.2943110818809602, + "grad_norm": 7.309511661529541, + "learning_rate": 4.006748235940796e-05, + "loss": 3.1897, + "num_input_tokens_seen": 1763848, + "step": 2685 + }, + { + "epoch": 0.29485914721034745, + "grad_norm": 7.108086109161377, + "learning_rate": 4.003311153199908e-05, + "loss": 3.2525, + "num_input_tokens_seen": 1767224, + "step": 2690 + }, + { + "epoch": 0.29540721253973473, + "grad_norm": 6.940639495849609, + "learning_rate": 3.99986961376132e-05, + "loss": 3.0928, + "num_input_tokens_seen": 1770816, + "step": 2695 + }, + { + "epoch": 0.295955277869122, + "grad_norm": 8.109939575195312, + "learning_rate": 3.996423627827778e-05, + "loss": 3.2992, + "num_input_tokens_seen": 1775144, + "step": 2700 + }, + { + "epoch": 0.29650334319850924, + "grad_norm": 8.848753929138184, + "learning_rate": 3.9929732056152104e-05, + "loss": 3.1256, + "num_input_tokens_seen": 1777888, + "step": 2705 + }, + { + "epoch": 0.2970514085278965, + "grad_norm": 6.489472389221191, + "learning_rate": 3.989518357352695e-05, + "loss": 3.0047, + "num_input_tokens_seen": 1782160, + "step": 2710 + }, + { + "epoch": 0.2975994738572838, + "grad_norm": 7.247778415679932, + "learning_rate": 3.986059093282433e-05, + "loss": 3.075, + "num_input_tokens_seen": 1784824, + "step": 2715 + }, + { + "epoch": 0.29814753918667103, + "grad_norm": 7.691065788269043, + "learning_rate": 3.982595423659716e-05, + "loss": 3.4486, + "num_input_tokens_seen": 1788072, + "step": 2720 + }, + { + "epoch": 0.2986956045160583, + "grad_norm": 7.700766086578369, + "learning_rate": 3.979127358752897e-05, + "loss": 3.4979, + "num_input_tokens_seen": 1790944, + "step": 2725 + }, + { + "epoch": 0.2992436698454456, + "grad_norm": 5.059070110321045, + "learning_rate": 3.975654908843356e-05, + "loss": 3.305, + "num_input_tokens_seen": 1794368, + "step": 2730 + }, + { + "epoch": 0.2997917351748328, + "grad_norm": 6.1541595458984375, + "learning_rate": 3.972178084225478e-05, + "loss": 3.2146, + "num_input_tokens_seen": 1798760, + "step": 2735 + }, + { + "epoch": 0.3003398005042201, + "grad_norm": 8.040989875793457, + "learning_rate": 3.968696895206613e-05, + "loss": 3.482, + "num_input_tokens_seen": 1801512, + "step": 2740 + }, + { + "epoch": 0.3008878658336074, + "grad_norm": 5.050278186798096, + "learning_rate": 3.9652113521070513e-05, + "loss": 3.3143, + "num_input_tokens_seen": 1805240, + "step": 2745 + }, + { + "epoch": 0.3014359311629946, + "grad_norm": 5.1891279220581055, + "learning_rate": 3.9617214652599904e-05, + "loss": 2.8368, + "num_input_tokens_seen": 1809040, + "step": 2750 + }, + { + "epoch": 0.3019839964923819, + "grad_norm": 6.89003849029541, + "learning_rate": 3.958227245011506e-05, + "loss": 3.3205, + "num_input_tokens_seen": 1812536, + "step": 2755 + }, + { + "epoch": 0.30253206182176917, + "grad_norm": 6.001296043395996, + "learning_rate": 3.954728701720521e-05, + "loss": 3.4753, + "num_input_tokens_seen": 1816296, + "step": 2760 + }, + { + "epoch": 0.3030801271511564, + "grad_norm": 4.202249050140381, + "learning_rate": 3.951225845758773e-05, + "loss": 3.3659, + "num_input_tokens_seen": 1819896, + "step": 2765 + }, + { + "epoch": 0.3036281924805437, + "grad_norm": 6.209683418273926, + "learning_rate": 3.9477186875107865e-05, + "loss": 3.5706, + "num_input_tokens_seen": 1823960, + "step": 2770 + }, + { + "epoch": 0.30417625780993096, + "grad_norm": 5.219339847564697, + "learning_rate": 3.944207237373838e-05, + "loss": 3.121, + "num_input_tokens_seen": 1827176, + "step": 2775 + }, + { + "epoch": 0.3047243231393182, + "grad_norm": 6.556133270263672, + "learning_rate": 3.940691505757931e-05, + "loss": 3.1289, + "num_input_tokens_seen": 1830016, + "step": 2780 + }, + { + "epoch": 0.30527238846870547, + "grad_norm": 5.480815887451172, + "learning_rate": 3.9371715030857595e-05, + "loss": 2.8851, + "num_input_tokens_seen": 1833280, + "step": 2785 + }, + { + "epoch": 0.30582045379809275, + "grad_norm": 4.781624794006348, + "learning_rate": 3.933647239792679e-05, + "loss": 3.066, + "num_input_tokens_seen": 1836784, + "step": 2790 + }, + { + "epoch": 0.30636851912748, + "grad_norm": 5.901027202606201, + "learning_rate": 3.930118726326678e-05, + "loss": 3.0618, + "num_input_tokens_seen": 1840600, + "step": 2795 + }, + { + "epoch": 0.30691658445686726, + "grad_norm": 4.3098649978637695, + "learning_rate": 3.926585973148344e-05, + "loss": 3.0273, + "num_input_tokens_seen": 1844456, + "step": 2800 + }, + { + "epoch": 0.30746464978625454, + "grad_norm": 7.2452521324157715, + "learning_rate": 3.923048990730832e-05, + "loss": 3.3328, + "num_input_tokens_seen": 1847648, + "step": 2805 + }, + { + "epoch": 0.30801271511564177, + "grad_norm": 9.102137565612793, + "learning_rate": 3.9195077895598385e-05, + "loss": 3.4577, + "num_input_tokens_seen": 1851080, + "step": 2810 + }, + { + "epoch": 0.30856078044502905, + "grad_norm": 7.165421009063721, + "learning_rate": 3.9159623801335635e-05, + "loss": 3.2345, + "num_input_tokens_seen": 1854544, + "step": 2815 + }, + { + "epoch": 0.30910884577441633, + "grad_norm": 6.918674468994141, + "learning_rate": 3.912412772962685e-05, + "loss": 3.3151, + "num_input_tokens_seen": 1857488, + "step": 2820 + }, + { + "epoch": 0.30965691110380356, + "grad_norm": 7.7270660400390625, + "learning_rate": 3.908858978570324e-05, + "loss": 3.0722, + "num_input_tokens_seen": 1859744, + "step": 2825 + }, + { + "epoch": 0.31020497643319084, + "grad_norm": 5.471165657043457, + "learning_rate": 3.905301007492016e-05, + "loss": 3.3752, + "num_input_tokens_seen": 1862520, + "step": 2830 + }, + { + "epoch": 0.3107530417625781, + "grad_norm": 8.547778129577637, + "learning_rate": 3.9017388702756766e-05, + "loss": 3.4572, + "num_input_tokens_seen": 1865688, + "step": 2835 + }, + { + "epoch": 0.31130110709196535, + "grad_norm": 5.8289289474487305, + "learning_rate": 3.898172577481577e-05, + "loss": 3.0442, + "num_input_tokens_seen": 1869008, + "step": 2840 + }, + { + "epoch": 0.3118491724213526, + "grad_norm": 5.646442413330078, + "learning_rate": 3.894602139682301e-05, + "loss": 3.3365, + "num_input_tokens_seen": 1872200, + "step": 2845 + }, + { + "epoch": 0.3123972377507399, + "grad_norm": 5.7611565589904785, + "learning_rate": 3.891027567462727e-05, + "loss": 3.0501, + "num_input_tokens_seen": 1874936, + "step": 2850 + }, + { + "epoch": 0.31294530308012714, + "grad_norm": 6.07964563369751, + "learning_rate": 3.8874488714199874e-05, + "loss": 3.1584, + "num_input_tokens_seen": 1877880, + "step": 2855 + }, + { + "epoch": 0.3134933684095144, + "grad_norm": 6.76899528503418, + "learning_rate": 3.883866062163439e-05, + "loss": 3.2215, + "num_input_tokens_seen": 1880632, + "step": 2860 + }, + { + "epoch": 0.3140414337389017, + "grad_norm": 9.11755657196045, + "learning_rate": 3.880279150314636e-05, + "loss": 3.4992, + "num_input_tokens_seen": 1883792, + "step": 2865 + }, + { + "epoch": 0.3145894990682889, + "grad_norm": 4.672335147857666, + "learning_rate": 3.876688146507291e-05, + "loss": 3.2378, + "num_input_tokens_seen": 1887984, + "step": 2870 + }, + { + "epoch": 0.3151375643976762, + "grad_norm": 8.21897029876709, + "learning_rate": 3.873093061387251e-05, + "loss": 3.4215, + "num_input_tokens_seen": 1890952, + "step": 2875 + }, + { + "epoch": 0.3156856297270635, + "grad_norm": 6.4296674728393555, + "learning_rate": 3.869493905612461e-05, + "loss": 3.1436, + "num_input_tokens_seen": 1894376, + "step": 2880 + }, + { + "epoch": 0.3162336950564507, + "grad_norm": 6.088110446929932, + "learning_rate": 3.8658906898529325e-05, + "loss": 3.1597, + "num_input_tokens_seen": 1897632, + "step": 2885 + }, + { + "epoch": 0.316781760385838, + "grad_norm": 7.144382953643799, + "learning_rate": 3.8622834247907155e-05, + "loss": 3.3071, + "num_input_tokens_seen": 1899992, + "step": 2890 + }, + { + "epoch": 0.3173298257152253, + "grad_norm": 5.95371675491333, + "learning_rate": 3.858672121119863e-05, + "loss": 3.1272, + "num_input_tokens_seen": 1902928, + "step": 2895 + }, + { + "epoch": 0.3178778910446125, + "grad_norm": 5.033254623413086, + "learning_rate": 3.855056789546402e-05, + "loss": 3.5104, + "num_input_tokens_seen": 1905872, + "step": 2900 + }, + { + "epoch": 0.3184259563739998, + "grad_norm": 9.2310209274292, + "learning_rate": 3.8514374407883e-05, + "loss": 3.22, + "num_input_tokens_seen": 1910456, + "step": 2905 + }, + { + "epoch": 0.31897402170338707, + "grad_norm": 13.305641174316406, + "learning_rate": 3.847814085575432e-05, + "loss": 3.5537, + "num_input_tokens_seen": 1914432, + "step": 2910 + }, + { + "epoch": 0.3195220870327743, + "grad_norm": 4.90524959564209, + "learning_rate": 3.844186734649554e-05, + "loss": 3.1428, + "num_input_tokens_seen": 1917176, + "step": 2915 + }, + { + "epoch": 0.3200701523621616, + "grad_norm": 7.605042457580566, + "learning_rate": 3.840555398764265e-05, + "loss": 2.6933, + "num_input_tokens_seen": 1919488, + "step": 2920 + }, + { + "epoch": 0.32061821769154886, + "grad_norm": 6.435617923736572, + "learning_rate": 3.836920088684979e-05, + "loss": 3.1942, + "num_input_tokens_seen": 1922184, + "step": 2925 + }, + { + "epoch": 0.3211662830209361, + "grad_norm": 5.5276288986206055, + "learning_rate": 3.8332808151888906e-05, + "loss": 3.3987, + "num_input_tokens_seen": 1925760, + "step": 2930 + }, + { + "epoch": 0.32171434835032336, + "grad_norm": 7.981554985046387, + "learning_rate": 3.829637589064946e-05, + "loss": 3.107, + "num_input_tokens_seen": 1928024, + "step": 2935 + }, + { + "epoch": 0.32226241367971065, + "grad_norm": 6.667475700378418, + "learning_rate": 3.8259904211138074e-05, + "loss": 2.8259, + "num_input_tokens_seen": 1931992, + "step": 2940 + }, + { + "epoch": 0.3228104790090979, + "grad_norm": 6.904677867889404, + "learning_rate": 3.8223393221478257e-05, + "loss": 3.3099, + "num_input_tokens_seen": 1934432, + "step": 2945 + }, + { + "epoch": 0.32335854433848515, + "grad_norm": 6.4357008934021, + "learning_rate": 3.818684302991001e-05, + "loss": 3.5156, + "num_input_tokens_seen": 1938288, + "step": 2950 + }, + { + "epoch": 0.32390660966787244, + "grad_norm": 6.910282611846924, + "learning_rate": 3.8150253744789624e-05, + "loss": 3.7432, + "num_input_tokens_seen": 1941552, + "step": 2955 + }, + { + "epoch": 0.32445467499725966, + "grad_norm": 6.355223178863525, + "learning_rate": 3.811362547458919e-05, + "loss": 3.3951, + "num_input_tokens_seen": 1944848, + "step": 2960 + }, + { + "epoch": 0.32500274032664694, + "grad_norm": 5.630364418029785, + "learning_rate": 3.807695832789646e-05, + "loss": 3.1733, + "num_input_tokens_seen": 1947576, + "step": 2965 + }, + { + "epoch": 0.3255508056560342, + "grad_norm": 7.782848358154297, + "learning_rate": 3.80402524134144e-05, + "loss": 2.9549, + "num_input_tokens_seen": 1950920, + "step": 2970 + }, + { + "epoch": 0.32609887098542145, + "grad_norm": 6.886142730712891, + "learning_rate": 3.8003507839960895e-05, + "loss": 3.1884, + "num_input_tokens_seen": 1954424, + "step": 2975 + }, + { + "epoch": 0.32664693631480873, + "grad_norm": 6.035950660705566, + "learning_rate": 3.796672471646848e-05, + "loss": 2.9874, + "num_input_tokens_seen": 1957928, + "step": 2980 + }, + { + "epoch": 0.327195001644196, + "grad_norm": 8.303248405456543, + "learning_rate": 3.7929903151983934e-05, + "loss": 3.4268, + "num_input_tokens_seen": 1961240, + "step": 2985 + }, + { + "epoch": 0.32774306697358324, + "grad_norm": 6.161063194274902, + "learning_rate": 3.789304325566801e-05, + "loss": 2.8965, + "num_input_tokens_seen": 1963864, + "step": 2990 + }, + { + "epoch": 0.3282911323029705, + "grad_norm": 5.629215717315674, + "learning_rate": 3.7856145136795104e-05, + "loss": 3.0241, + "num_input_tokens_seen": 1967656, + "step": 2995 + }, + { + "epoch": 0.3288391976323578, + "grad_norm": 9.494491577148438, + "learning_rate": 3.781920890475294e-05, + "loss": 3.2297, + "num_input_tokens_seen": 1970608, + "step": 3000 + }, + { + "epoch": 0.32938726296174503, + "grad_norm": 4.975097179412842, + "learning_rate": 3.7782234669042186e-05, + "loss": 3.1757, + "num_input_tokens_seen": 1973664, + "step": 3005 + }, + { + "epoch": 0.3299353282911323, + "grad_norm": 7.1082258224487305, + "learning_rate": 3.7745222539276224e-05, + "loss": 3.1921, + "num_input_tokens_seen": 1976944, + "step": 3010 + }, + { + "epoch": 0.33048339362051954, + "grad_norm": 11.492435455322266, + "learning_rate": 3.770817262518076e-05, + "loss": 3.1751, + "num_input_tokens_seen": 1980160, + "step": 3015 + }, + { + "epoch": 0.3310314589499068, + "grad_norm": 6.560080051422119, + "learning_rate": 3.76710850365935e-05, + "loss": 3.0906, + "num_input_tokens_seen": 1983576, + "step": 3020 + }, + { + "epoch": 0.3315795242792941, + "grad_norm": 7.438432216644287, + "learning_rate": 3.763395988346386e-05, + "loss": 3.1074, + "num_input_tokens_seen": 1985784, + "step": 3025 + }, + { + "epoch": 0.33212758960868133, + "grad_norm": 7.6575164794921875, + "learning_rate": 3.759679727585262e-05, + "loss": 3.1625, + "num_input_tokens_seen": 1989344, + "step": 3030 + }, + { + "epoch": 0.3326756549380686, + "grad_norm": 6.756874084472656, + "learning_rate": 3.7559597323931566e-05, + "loss": 3.2758, + "num_input_tokens_seen": 1992304, + "step": 3035 + }, + { + "epoch": 0.3332237202674559, + "grad_norm": 5.427942276000977, + "learning_rate": 3.7522360137983235e-05, + "loss": 3.1905, + "num_input_tokens_seen": 1996120, + "step": 3040 + }, + { + "epoch": 0.3337717855968431, + "grad_norm": 5.814554691314697, + "learning_rate": 3.748508582840052e-05, + "loss": 2.8693, + "num_input_tokens_seen": 1999176, + "step": 3045 + }, + { + "epoch": 0.3343198509262304, + "grad_norm": 7.720613956451416, + "learning_rate": 3.744777450568638e-05, + "loss": 3.3644, + "num_input_tokens_seen": 2002112, + "step": 3050 + }, + { + "epoch": 0.3348679162556177, + "grad_norm": 5.780377388000488, + "learning_rate": 3.7410426280453505e-05, + "loss": 2.8918, + "num_input_tokens_seen": 2005800, + "step": 3055 + }, + { + "epoch": 0.3354159815850049, + "grad_norm": 5.939544677734375, + "learning_rate": 3.737304126342398e-05, + "loss": 3.0217, + "num_input_tokens_seen": 2009192, + "step": 3060 + }, + { + "epoch": 0.3359640469143922, + "grad_norm": 6.661081314086914, + "learning_rate": 3.7335619565428964e-05, + "loss": 3.2056, + "num_input_tokens_seen": 2012280, + "step": 3065 + }, + { + "epoch": 0.33651211224377947, + "grad_norm": 4.9228620529174805, + "learning_rate": 3.729816129740836e-05, + "loss": 3.106, + "num_input_tokens_seen": 2014984, + "step": 3070 + }, + { + "epoch": 0.3370601775731667, + "grad_norm": 6.285070896148682, + "learning_rate": 3.726066657041051e-05, + "loss": 3.1639, + "num_input_tokens_seen": 2019048, + "step": 3075 + }, + { + "epoch": 0.337608242902554, + "grad_norm": 6.625104904174805, + "learning_rate": 3.7223135495591776e-05, + "loss": 3.2258, + "num_input_tokens_seen": 2022776, + "step": 3080 + }, + { + "epoch": 0.33815630823194126, + "grad_norm": 8.347160339355469, + "learning_rate": 3.718556818421636e-05, + "loss": 3.4006, + "num_input_tokens_seen": 2026304, + "step": 3085 + }, + { + "epoch": 0.3387043735613285, + "grad_norm": 9.37065601348877, + "learning_rate": 3.7147964747655836e-05, + "loss": 3.2778, + "num_input_tokens_seen": 2030200, + "step": 3090 + }, + { + "epoch": 0.33925243889071577, + "grad_norm": 6.341724872589111, + "learning_rate": 3.711032529738887e-05, + "loss": 3.5654, + "num_input_tokens_seen": 2033656, + "step": 3095 + }, + { + "epoch": 0.33980050422010305, + "grad_norm": 6.54714298248291, + "learning_rate": 3.7072649945000936e-05, + "loss": 3.0664, + "num_input_tokens_seen": 2037328, + "step": 3100 + }, + { + "epoch": 0.3403485695494903, + "grad_norm": 6.289731979370117, + "learning_rate": 3.703493880218391e-05, + "loss": 2.8214, + "num_input_tokens_seen": 2040488, + "step": 3105 + }, + { + "epoch": 0.34089663487887756, + "grad_norm": 8.150530815124512, + "learning_rate": 3.699719198073578e-05, + "loss": 3.2654, + "num_input_tokens_seen": 2043256, + "step": 3110 + }, + { + "epoch": 0.34144470020826484, + "grad_norm": 7.053910255432129, + "learning_rate": 3.6959409592560304e-05, + "loss": 3.3008, + "num_input_tokens_seen": 2046064, + "step": 3115 + }, + { + "epoch": 0.34199276553765207, + "grad_norm": 5.083940505981445, + "learning_rate": 3.69215917496667e-05, + "loss": 3.0999, + "num_input_tokens_seen": 2049568, + "step": 3120 + }, + { + "epoch": 0.34254083086703935, + "grad_norm": 5.558229446411133, + "learning_rate": 3.6883738564169254e-05, + "loss": 3.4491, + "num_input_tokens_seen": 2052400, + "step": 3125 + }, + { + "epoch": 0.34308889619642663, + "grad_norm": 7.365407466888428, + "learning_rate": 3.684585014828708e-05, + "loss": 3.1569, + "num_input_tokens_seen": 2055864, + "step": 3130 + }, + { + "epoch": 0.34363696152581386, + "grad_norm": 7.316169738769531, + "learning_rate": 3.680792661434368e-05, + "loss": 3.1274, + "num_input_tokens_seen": 2058856, + "step": 3135 + }, + { + "epoch": 0.34418502685520114, + "grad_norm": 8.32957935333252, + "learning_rate": 3.676996807476671e-05, + "loss": 2.9842, + "num_input_tokens_seen": 2062056, + "step": 3140 + }, + { + "epoch": 0.3447330921845884, + "grad_norm": 7.238974094390869, + "learning_rate": 3.673197464208759e-05, + "loss": 3.1055, + "num_input_tokens_seen": 2064760, + "step": 3145 + }, + { + "epoch": 0.34528115751397565, + "grad_norm": 8.2353515625, + "learning_rate": 3.669394642894118e-05, + "loss": 2.7765, + "num_input_tokens_seen": 2068440, + "step": 3150 + }, + { + "epoch": 0.3458292228433629, + "grad_norm": 7.214339256286621, + "learning_rate": 3.665588354806545e-05, + "loss": 3.0102, + "num_input_tokens_seen": 2072136, + "step": 3155 + }, + { + "epoch": 0.3463772881727502, + "grad_norm": 6.484249114990234, + "learning_rate": 3.661778611230114e-05, + "loss": 3.2456, + "num_input_tokens_seen": 2074560, + "step": 3160 + }, + { + "epoch": 0.34692535350213743, + "grad_norm": 6.298303604125977, + "learning_rate": 3.657965423459145e-05, + "loss": 3.3588, + "num_input_tokens_seen": 2077248, + "step": 3165 + }, + { + "epoch": 0.3474734188315247, + "grad_norm": 8.595486640930176, + "learning_rate": 3.6541488027981675e-05, + "loss": 2.9303, + "num_input_tokens_seen": 2080160, + "step": 3170 + }, + { + "epoch": 0.348021484160912, + "grad_norm": 7.8414740562438965, + "learning_rate": 3.650328760561887e-05, + "loss": 3.5767, + "num_input_tokens_seen": 2082320, + "step": 3175 + }, + { + "epoch": 0.3485695494902992, + "grad_norm": 5.1522908210754395, + "learning_rate": 3.646505308075154e-05, + "loss": 3.1739, + "num_input_tokens_seen": 2085104, + "step": 3180 + }, + { + "epoch": 0.3491176148196865, + "grad_norm": 9.065922737121582, + "learning_rate": 3.642678456672929e-05, + "loss": 3.3567, + "num_input_tokens_seen": 2087800, + "step": 3185 + }, + { + "epoch": 0.3496656801490738, + "grad_norm": 11.175498962402344, + "learning_rate": 3.638848217700248e-05, + "loss": 3.3376, + "num_input_tokens_seen": 2090776, + "step": 3190 + }, + { + "epoch": 0.350213745478461, + "grad_norm": 7.90383768081665, + "learning_rate": 3.63501460251219e-05, + "loss": 2.9388, + "num_input_tokens_seen": 2093152, + "step": 3195 + }, + { + "epoch": 0.3507618108078483, + "grad_norm": 7.013014316558838, + "learning_rate": 3.6311776224738435e-05, + "loss": 3.0298, + "num_input_tokens_seen": 2096192, + "step": 3200 + }, + { + "epoch": 0.3513098761372356, + "grad_norm": 4.87260103225708, + "learning_rate": 3.627337288960272e-05, + "loss": 3.3596, + "num_input_tokens_seen": 2100256, + "step": 3205 + }, + { + "epoch": 0.3518579414666228, + "grad_norm": 7.644909858703613, + "learning_rate": 3.6234936133564823e-05, + "loss": 3.1154, + "num_input_tokens_seen": 2102928, + "step": 3210 + }, + { + "epoch": 0.3524060067960101, + "grad_norm": 5.678354263305664, + "learning_rate": 3.619646607057386e-05, + "loss": 2.8941, + "num_input_tokens_seen": 2106944, + "step": 3215 + }, + { + "epoch": 0.35295407212539737, + "grad_norm": 5.123593330383301, + "learning_rate": 3.61579628146777e-05, + "loss": 3.1417, + "num_input_tokens_seen": 2111496, + "step": 3220 + }, + { + "epoch": 0.3535021374547846, + "grad_norm": 5.542695999145508, + "learning_rate": 3.611942648002265e-05, + "loss": 3.1733, + "num_input_tokens_seen": 2114960, + "step": 3225 + }, + { + "epoch": 0.3540502027841719, + "grad_norm": 8.204092025756836, + "learning_rate": 3.6080857180853025e-05, + "loss": 3.4422, + "num_input_tokens_seen": 2117528, + "step": 3230 + }, + { + "epoch": 0.35459826811355916, + "grad_norm": 6.3048014640808105, + "learning_rate": 3.6042255031510895e-05, + "loss": 3.3049, + "num_input_tokens_seen": 2121312, + "step": 3235 + }, + { + "epoch": 0.3551463334429464, + "grad_norm": 8.287495613098145, + "learning_rate": 3.600362014643573e-05, + "loss": 3.2349, + "num_input_tokens_seen": 2125296, + "step": 3240 + }, + { + "epoch": 0.35569439877233366, + "grad_norm": 7.690340995788574, + "learning_rate": 3.5964952640164016e-05, + "loss": 3.4982, + "num_input_tokens_seen": 2127944, + "step": 3245 + }, + { + "epoch": 0.35624246410172095, + "grad_norm": 5.382369518280029, + "learning_rate": 3.592625262732898e-05, + "loss": 3.3248, + "num_input_tokens_seen": 2131200, + "step": 3250 + }, + { + "epoch": 0.35679052943110817, + "grad_norm": 7.964527606964111, + "learning_rate": 3.58875202226602e-05, + "loss": 3.2188, + "num_input_tokens_seen": 2133648, + "step": 3255 + }, + { + "epoch": 0.35733859476049545, + "grad_norm": 5.458812236785889, + "learning_rate": 3.5848755540983286e-05, + "loss": 3.3385, + "num_input_tokens_seen": 2136960, + "step": 3260 + }, + { + "epoch": 0.35788666008988274, + "grad_norm": 7.087930679321289, + "learning_rate": 3.580995869721953e-05, + "loss": 3.0703, + "num_input_tokens_seen": 2140656, + "step": 3265 + }, + { + "epoch": 0.35843472541926996, + "grad_norm": 6.762202262878418, + "learning_rate": 3.577112980638557e-05, + "loss": 2.9214, + "num_input_tokens_seen": 2143360, + "step": 3270 + }, + { + "epoch": 0.35898279074865724, + "grad_norm": 6.3621649742126465, + "learning_rate": 3.573226898359308e-05, + "loss": 3.4276, + "num_input_tokens_seen": 2146456, + "step": 3275 + }, + { + "epoch": 0.3595308560780445, + "grad_norm": 8.797203063964844, + "learning_rate": 3.5693376344048344e-05, + "loss": 3.0474, + "num_input_tokens_seen": 2149336, + "step": 3280 + }, + { + "epoch": 0.36007892140743175, + "grad_norm": 7.268299579620361, + "learning_rate": 3.5654452003052033e-05, + "loss": 2.8497, + "num_input_tokens_seen": 2152960, + "step": 3285 + }, + { + "epoch": 0.36062698673681903, + "grad_norm": 8.053544044494629, + "learning_rate": 3.5615496075998744e-05, + "loss": 3.6495, + "num_input_tokens_seen": 2157104, + "step": 3290 + }, + { + "epoch": 0.3611750520662063, + "grad_norm": 6.6186604499816895, + "learning_rate": 3.5576508678376743e-05, + "loss": 2.9909, + "num_input_tokens_seen": 2159576, + "step": 3295 + }, + { + "epoch": 0.36172311739559354, + "grad_norm": 6.244167327880859, + "learning_rate": 3.55374899257676e-05, + "loss": 3.064, + "num_input_tokens_seen": 2163112, + "step": 3300 + }, + { + "epoch": 0.3622711827249808, + "grad_norm": 7.658557891845703, + "learning_rate": 3.549843993384582e-05, + "loss": 3.1039, + "num_input_tokens_seen": 2166048, + "step": 3305 + }, + { + "epoch": 0.3628192480543681, + "grad_norm": 5.7698140144348145, + "learning_rate": 3.545935881837852e-05, + "loss": 2.9442, + "num_input_tokens_seen": 2169192, + "step": 3310 + }, + { + "epoch": 0.36336731338375533, + "grad_norm": 6.534774303436279, + "learning_rate": 3.542024669522511e-05, + "loss": 2.9845, + "num_input_tokens_seen": 2172544, + "step": 3315 + }, + { + "epoch": 0.3639153787131426, + "grad_norm": 5.373234748840332, + "learning_rate": 3.538110368033689e-05, + "loss": 3.0865, + "num_input_tokens_seen": 2176280, + "step": 3320 + }, + { + "epoch": 0.3644634440425299, + "grad_norm": 6.9778547286987305, + "learning_rate": 3.5341929889756775e-05, + "loss": 3.1341, + "num_input_tokens_seen": 2179792, + "step": 3325 + }, + { + "epoch": 0.3650115093719171, + "grad_norm": 10.10000991821289, + "learning_rate": 3.530272543961888e-05, + "loss": 3.3558, + "num_input_tokens_seen": 2182776, + "step": 3330 + }, + { + "epoch": 0.3655595747013044, + "grad_norm": 6.022150993347168, + "learning_rate": 3.526349044614826e-05, + "loss": 3.1005, + "num_input_tokens_seen": 2186112, + "step": 3335 + }, + { + "epoch": 0.3661076400306917, + "grad_norm": 6.781782150268555, + "learning_rate": 3.522422502566047e-05, + "loss": 3.3438, + "num_input_tokens_seen": 2188600, + "step": 3340 + }, + { + "epoch": 0.3666557053600789, + "grad_norm": 4.399787425994873, + "learning_rate": 3.51849292945613e-05, + "loss": 3.0477, + "num_input_tokens_seen": 2191600, + "step": 3345 + }, + { + "epoch": 0.3672037706894662, + "grad_norm": 6.852601528167725, + "learning_rate": 3.51456033693464e-05, + "loss": 2.8756, + "num_input_tokens_seen": 2194544, + "step": 3350 + }, + { + "epoch": 0.3677518360188535, + "grad_norm": 7.015017509460449, + "learning_rate": 3.510624736660091e-05, + "loss": 3.6253, + "num_input_tokens_seen": 2198296, + "step": 3355 + }, + { + "epoch": 0.3682999013482407, + "grad_norm": 4.540085792541504, + "learning_rate": 3.506686140299915e-05, + "loss": 2.9568, + "num_input_tokens_seen": 2201384, + "step": 3360 + }, + { + "epoch": 0.368847966677628, + "grad_norm": 9.393879890441895, + "learning_rate": 3.502744559530426e-05, + "loss": 3.1794, + "num_input_tokens_seen": 2205720, + "step": 3365 + }, + { + "epoch": 0.36939603200701526, + "grad_norm": 7.7508344650268555, + "learning_rate": 3.498800006036788e-05, + "loss": 3.0188, + "num_input_tokens_seen": 2210344, + "step": 3370 + }, + { + "epoch": 0.3699440973364025, + "grad_norm": 5.801796913146973, + "learning_rate": 3.4948524915129726e-05, + "loss": 3.1028, + "num_input_tokens_seen": 2213264, + "step": 3375 + }, + { + "epoch": 0.37049216266578977, + "grad_norm": 6.9859938621521, + "learning_rate": 3.490902027661734e-05, + "loss": 3.5774, + "num_input_tokens_seen": 2216560, + "step": 3380 + }, + { + "epoch": 0.37104022799517705, + "grad_norm": 5.871939659118652, + "learning_rate": 3.4869486261945695e-05, + "loss": 3.3648, + "num_input_tokens_seen": 2219376, + "step": 3385 + }, + { + "epoch": 0.3715882933245643, + "grad_norm": 6.051314830780029, + "learning_rate": 3.482992298831682e-05, + "loss": 3.2641, + "num_input_tokens_seen": 2222568, + "step": 3390 + }, + { + "epoch": 0.37213635865395156, + "grad_norm": 7.149409294128418, + "learning_rate": 3.4790330573019524e-05, + "loss": 3.0127, + "num_input_tokens_seen": 2225232, + "step": 3395 + }, + { + "epoch": 0.37268442398333884, + "grad_norm": 5.8362650871276855, + "learning_rate": 3.4750709133429e-05, + "loss": 3.2417, + "num_input_tokens_seen": 2228360, + "step": 3400 + }, + { + "epoch": 0.37323248931272607, + "grad_norm": 6.061380386352539, + "learning_rate": 3.471105878700646e-05, + "loss": 3.4256, + "num_input_tokens_seen": 2231864, + "step": 3405 + }, + { + "epoch": 0.37378055464211335, + "grad_norm": 7.543921947479248, + "learning_rate": 3.467137965129884e-05, + "loss": 3.1154, + "num_input_tokens_seen": 2234400, + "step": 3410 + }, + { + "epoch": 0.3743286199715006, + "grad_norm": 4.8110151290893555, + "learning_rate": 3.463167184393843e-05, + "loss": 3.1221, + "num_input_tokens_seen": 2238056, + "step": 3415 + }, + { + "epoch": 0.37487668530088786, + "grad_norm": 7.194852352142334, + "learning_rate": 3.459193548264248e-05, + "loss": 3.4609, + "num_input_tokens_seen": 2240472, + "step": 3420 + }, + { + "epoch": 0.37542475063027514, + "grad_norm": 7.457151889801025, + "learning_rate": 3.4552170685212936e-05, + "loss": 3.1907, + "num_input_tokens_seen": 2243944, + "step": 3425 + }, + { + "epoch": 0.37597281595966237, + "grad_norm": 8.671926498413086, + "learning_rate": 3.4512377569536025e-05, + "loss": 3.0142, + "num_input_tokens_seen": 2246376, + "step": 3430 + }, + { + "epoch": 0.37652088128904965, + "grad_norm": 6.243984222412109, + "learning_rate": 3.447255625358191e-05, + "loss": 3.094, + "num_input_tokens_seen": 2249288, + "step": 3435 + }, + { + "epoch": 0.37706894661843693, + "grad_norm": 7.37971830368042, + "learning_rate": 3.443270685540439e-05, + "loss": 3.4606, + "num_input_tokens_seen": 2252536, + "step": 3440 + }, + { + "epoch": 0.37761701194782415, + "grad_norm": 6.270237445831299, + "learning_rate": 3.43928294931405e-05, + "loss": 3.1928, + "num_input_tokens_seen": 2255576, + "step": 3445 + }, + { + "epoch": 0.37816507727721144, + "grad_norm": 5.272236347198486, + "learning_rate": 3.435292428501016e-05, + "loss": 3.4196, + "num_input_tokens_seen": 2258456, + "step": 3450 + }, + { + "epoch": 0.3787131426065987, + "grad_norm": 6.378783226013184, + "learning_rate": 3.431299134931587e-05, + "loss": 3.3069, + "num_input_tokens_seen": 2261160, + "step": 3455 + }, + { + "epoch": 0.37926120793598594, + "grad_norm": 7.296474456787109, + "learning_rate": 3.427303080444232e-05, + "loss": 3.3306, + "num_input_tokens_seen": 2263808, + "step": 3460 + }, + { + "epoch": 0.3798092732653732, + "grad_norm": 6.654740333557129, + "learning_rate": 3.423304276885605e-05, + "loss": 2.871, + "num_input_tokens_seen": 2267280, + "step": 3465 + }, + { + "epoch": 0.3803573385947605, + "grad_norm": 7.27192497253418, + "learning_rate": 3.419302736110508e-05, + "loss": 3.3171, + "num_input_tokens_seen": 2270632, + "step": 3470 + }, + { + "epoch": 0.38090540392414773, + "grad_norm": 5.948354721069336, + "learning_rate": 3.4152984699818614e-05, + "loss": 3.4794, + "num_input_tokens_seen": 2273960, + "step": 3475 + }, + { + "epoch": 0.381453469253535, + "grad_norm": 6.537465572357178, + "learning_rate": 3.4112914903706616e-05, + "loss": 3.1609, + "num_input_tokens_seen": 2277568, + "step": 3480 + }, + { + "epoch": 0.3820015345829223, + "grad_norm": 13.15424919128418, + "learning_rate": 3.4072818091559524e-05, + "loss": 3.0777, + "num_input_tokens_seen": 2279976, + "step": 3485 + }, + { + "epoch": 0.3825495999123095, + "grad_norm": 5.581765174865723, + "learning_rate": 3.403269438224784e-05, + "loss": 3.1242, + "num_input_tokens_seen": 2282912, + "step": 3490 + }, + { + "epoch": 0.3830976652416968, + "grad_norm": 5.730728626251221, + "learning_rate": 3.3992543894721825e-05, + "loss": 3.2418, + "num_input_tokens_seen": 2286272, + "step": 3495 + }, + { + "epoch": 0.3836457305710841, + "grad_norm": 9.713155746459961, + "learning_rate": 3.3952366748011114e-05, + "loss": 3.17, + "num_input_tokens_seen": 2289944, + "step": 3500 + }, + { + "epoch": 0.3841937959004713, + "grad_norm": 6.645389556884766, + "learning_rate": 3.391216306122439e-05, + "loss": 3.3796, + "num_input_tokens_seen": 2292688, + "step": 3505 + }, + { + "epoch": 0.3847418612298586, + "grad_norm": 7.148984432220459, + "learning_rate": 3.3871932953549005e-05, + "loss": 3.282, + "num_input_tokens_seen": 2295584, + "step": 3510 + }, + { + "epoch": 0.3852899265592459, + "grad_norm": 5.25370979309082, + "learning_rate": 3.3831676544250616e-05, + "loss": 2.9293, + "num_input_tokens_seen": 2298440, + "step": 3515 + }, + { + "epoch": 0.3858379918886331, + "grad_norm": 5.668978214263916, + "learning_rate": 3.3791393952672915e-05, + "loss": 3.0635, + "num_input_tokens_seen": 2301024, + "step": 3520 + }, + { + "epoch": 0.3863860572180204, + "grad_norm": 4.52470064163208, + "learning_rate": 3.375108529823715e-05, + "loss": 3.0398, + "num_input_tokens_seen": 2304392, + "step": 3525 + }, + { + "epoch": 0.38693412254740767, + "grad_norm": 5.700072288513184, + "learning_rate": 3.371075070044186e-05, + "loss": 3.0855, + "num_input_tokens_seen": 2307688, + "step": 3530 + }, + { + "epoch": 0.3874821878767949, + "grad_norm": 5.35679292678833, + "learning_rate": 3.367039027886252e-05, + "loss": 3.2953, + "num_input_tokens_seen": 2312384, + "step": 3535 + }, + { + "epoch": 0.3880302532061822, + "grad_norm": 6.735170841217041, + "learning_rate": 3.363000415315111e-05, + "loss": 3.1434, + "num_input_tokens_seen": 2315864, + "step": 3540 + }, + { + "epoch": 0.38857831853556946, + "grad_norm": 6.647335052490234, + "learning_rate": 3.358959244303585e-05, + "loss": 3.2033, + "num_input_tokens_seen": 2319744, + "step": 3545 + }, + { + "epoch": 0.3891263838649567, + "grad_norm": 6.841831684112549, + "learning_rate": 3.354915526832082e-05, + "loss": 3.3414, + "num_input_tokens_seen": 2322856, + "step": 3550 + }, + { + "epoch": 0.38967444919434396, + "grad_norm": 7.023780822753906, + "learning_rate": 3.350869274888554e-05, + "loss": 3.1525, + "num_input_tokens_seen": 2326016, + "step": 3555 + }, + { + "epoch": 0.39022251452373125, + "grad_norm": 8.96906852722168, + "learning_rate": 3.3468205004684695e-05, + "loss": 3.2852, + "num_input_tokens_seen": 2330120, + "step": 3560 + }, + { + "epoch": 0.39077057985311847, + "grad_norm": 7.874572277069092, + "learning_rate": 3.3427692155747766e-05, + "loss": 2.9457, + "num_input_tokens_seen": 2332776, + "step": 3565 + }, + { + "epoch": 0.39131864518250575, + "grad_norm": 6.962822914123535, + "learning_rate": 3.338715432217865e-05, + "loss": 3.0687, + "num_input_tokens_seen": 2336856, + "step": 3570 + }, + { + "epoch": 0.39186671051189303, + "grad_norm": 6.802676200866699, + "learning_rate": 3.334659162415529e-05, + "loss": 3.6562, + "num_input_tokens_seen": 2339768, + "step": 3575 + }, + { + "epoch": 0.39241477584128026, + "grad_norm": 7.828624725341797, + "learning_rate": 3.3306004181929375e-05, + "loss": 3.2111, + "num_input_tokens_seen": 2342920, + "step": 3580 + }, + { + "epoch": 0.39296284117066754, + "grad_norm": 7.1746320724487305, + "learning_rate": 3.326539211582592e-05, + "loss": 3.2333, + "num_input_tokens_seen": 2346656, + "step": 3585 + }, + { + "epoch": 0.3935109065000548, + "grad_norm": 7.000988006591797, + "learning_rate": 3.3224755546242967e-05, + "loss": 3.3291, + "num_input_tokens_seen": 2351008, + "step": 3590 + }, + { + "epoch": 0.39405897182944205, + "grad_norm": 6.557620048522949, + "learning_rate": 3.3184094593651196e-05, + "loss": 2.7686, + "num_input_tokens_seen": 2354160, + "step": 3595 + }, + { + "epoch": 0.39460703715882933, + "grad_norm": 7.011937618255615, + "learning_rate": 3.314340937859356e-05, + "loss": 3.4913, + "num_input_tokens_seen": 2357464, + "step": 3600 + }, + { + "epoch": 0.3951551024882166, + "grad_norm": 6.284838676452637, + "learning_rate": 3.310270002168493e-05, + "loss": 2.835, + "num_input_tokens_seen": 2360488, + "step": 3605 + }, + { + "epoch": 0.39570316781760384, + "grad_norm": 7.415198802947998, + "learning_rate": 3.306196664361178e-05, + "loss": 2.9347, + "num_input_tokens_seen": 2363448, + "step": 3610 + }, + { + "epoch": 0.3962512331469911, + "grad_norm": 7.382150650024414, + "learning_rate": 3.302120936513177e-05, + "loss": 3.3669, + "num_input_tokens_seen": 2365800, + "step": 3615 + }, + { + "epoch": 0.3967992984763784, + "grad_norm": 5.894745349884033, + "learning_rate": 3.2980428307073435e-05, + "loss": 2.8094, + "num_input_tokens_seen": 2369016, + "step": 3620 + }, + { + "epoch": 0.39734736380576563, + "grad_norm": 6.539662837982178, + "learning_rate": 3.29396235903358e-05, + "loss": 3.1544, + "num_input_tokens_seen": 2372144, + "step": 3625 + }, + { + "epoch": 0.3978954291351529, + "grad_norm": 6.1463799476623535, + "learning_rate": 3.2898795335888005e-05, + "loss": 3.2679, + "num_input_tokens_seen": 2374656, + "step": 3630 + }, + { + "epoch": 0.3984434944645402, + "grad_norm": 8.810948371887207, + "learning_rate": 3.2857943664769e-05, + "loss": 3.394, + "num_input_tokens_seen": 2378056, + "step": 3635 + }, + { + "epoch": 0.3989915597939274, + "grad_norm": 10.048519134521484, + "learning_rate": 3.2817068698087164e-05, + "loss": 3.4094, + "num_input_tokens_seen": 2380792, + "step": 3640 + }, + { + "epoch": 0.3995396251233147, + "grad_norm": 8.441570281982422, + "learning_rate": 3.277617055701989e-05, + "loss": 2.9142, + "num_input_tokens_seen": 2383912, + "step": 3645 + }, + { + "epoch": 0.400087690452702, + "grad_norm": 5.723228931427002, + "learning_rate": 3.273524936281331e-05, + "loss": 3.2162, + "num_input_tokens_seen": 2386592, + "step": 3650 + }, + { + "epoch": 0.4006357557820892, + "grad_norm": 5.869374752044678, + "learning_rate": 3.2694305236781904e-05, + "loss": 3.301, + "num_input_tokens_seen": 2390144, + "step": 3655 + }, + { + "epoch": 0.4011838211114765, + "grad_norm": 6.342257499694824, + "learning_rate": 3.26533383003081e-05, + "loss": 3.2055, + "num_input_tokens_seen": 2393872, + "step": 3660 + }, + { + "epoch": 0.4017318864408638, + "grad_norm": 6.534188270568848, + "learning_rate": 3.2612348674841995e-05, + "loss": 3.0935, + "num_input_tokens_seen": 2396648, + "step": 3665 + }, + { + "epoch": 0.402279951770251, + "grad_norm": 7.0050272941589355, + "learning_rate": 3.2571336481900926e-05, + "loss": 3.2582, + "num_input_tokens_seen": 2400328, + "step": 3670 + }, + { + "epoch": 0.4028280170996383, + "grad_norm": 8.4814453125, + "learning_rate": 3.253030184306912e-05, + "loss": 3.3026, + "num_input_tokens_seen": 2403080, + "step": 3675 + }, + { + "epoch": 0.40337608242902556, + "grad_norm": 7.716960906982422, + "learning_rate": 3.248924487999737e-05, + "loss": 3.052, + "num_input_tokens_seen": 2406352, + "step": 3680 + }, + { + "epoch": 0.4039241477584128, + "grad_norm": 6.716127395629883, + "learning_rate": 3.244816571440265e-05, + "loss": 3.2428, + "num_input_tokens_seen": 2409496, + "step": 3685 + }, + { + "epoch": 0.40447221308780007, + "grad_norm": 8.213761329650879, + "learning_rate": 3.240706446806773e-05, + "loss": 2.9107, + "num_input_tokens_seen": 2414032, + "step": 3690 + }, + { + "epoch": 0.40502027841718735, + "grad_norm": 6.492610931396484, + "learning_rate": 3.236594126284086e-05, + "loss": 3.293, + "num_input_tokens_seen": 2417472, + "step": 3695 + }, + { + "epoch": 0.4055683437465746, + "grad_norm": 6.562194347381592, + "learning_rate": 3.23247962206354e-05, + "loss": 3.4693, + "num_input_tokens_seen": 2420224, + "step": 3700 + }, + { + "epoch": 0.40611640907596186, + "grad_norm": 6.379699230194092, + "learning_rate": 3.228362946342942e-05, + "loss": 3.2036, + "num_input_tokens_seen": 2425376, + "step": 3705 + }, + { + "epoch": 0.40666447440534914, + "grad_norm": 8.669161796569824, + "learning_rate": 3.2242441113265395e-05, + "loss": 3.3417, + "num_input_tokens_seen": 2429616, + "step": 3710 + }, + { + "epoch": 0.40721253973473637, + "grad_norm": 4.813148021697998, + "learning_rate": 3.220123129224979e-05, + "loss": 2.9484, + "num_input_tokens_seen": 2433168, + "step": 3715 + }, + { + "epoch": 0.40776060506412365, + "grad_norm": 6.526965141296387, + "learning_rate": 3.216000012255273e-05, + "loss": 3.5202, + "num_input_tokens_seen": 2435880, + "step": 3720 + }, + { + "epoch": 0.40830867039351093, + "grad_norm": 7.899510860443115, + "learning_rate": 3.211874772640765e-05, + "loss": 3.2844, + "num_input_tokens_seen": 2439232, + "step": 3725 + }, + { + "epoch": 0.40885673572289816, + "grad_norm": 6.932427406311035, + "learning_rate": 3.2077474226110866e-05, + "loss": 3.5213, + "num_input_tokens_seen": 2443400, + "step": 3730 + }, + { + "epoch": 0.40940480105228544, + "grad_norm": 6.4443793296813965, + "learning_rate": 3.203617974402131e-05, + "loss": 3.4504, + "num_input_tokens_seen": 2446448, + "step": 3735 + }, + { + "epoch": 0.4099528663816727, + "grad_norm": 6.693415641784668, + "learning_rate": 3.199486440256009e-05, + "loss": 3.6388, + "num_input_tokens_seen": 2450016, + "step": 3740 + }, + { + "epoch": 0.41050093171105995, + "grad_norm": 6.27035665512085, + "learning_rate": 3.195352832421015e-05, + "loss": 3.4589, + "num_input_tokens_seen": 2452584, + "step": 3745 + }, + { + "epoch": 0.41104899704044723, + "grad_norm": 6.987046241760254, + "learning_rate": 3.191217163151593e-05, + "loss": 3.484, + "num_input_tokens_seen": 2455440, + "step": 3750 + }, + { + "epoch": 0.4115970623698345, + "grad_norm": 5.9024200439453125, + "learning_rate": 3.187079444708296e-05, + "loss": 2.9859, + "num_input_tokens_seen": 2459048, + "step": 3755 + }, + { + "epoch": 0.41214512769922174, + "grad_norm": 5.624914646148682, + "learning_rate": 3.182939689357753e-05, + "loss": 3.317, + "num_input_tokens_seen": 2463488, + "step": 3760 + }, + { + "epoch": 0.412693193028609, + "grad_norm": 5.933727264404297, + "learning_rate": 3.1787979093726314e-05, + "loss": 3.1318, + "num_input_tokens_seen": 2466560, + "step": 3765 + }, + { + "epoch": 0.4132412583579963, + "grad_norm": 8.507558822631836, + "learning_rate": 3.1746541170316036e-05, + "loss": 3.5896, + "num_input_tokens_seen": 2469072, + "step": 3770 + }, + { + "epoch": 0.4137893236873835, + "grad_norm": 6.940069198608398, + "learning_rate": 3.1705083246193015e-05, + "loss": 3.5636, + "num_input_tokens_seen": 2471528, + "step": 3775 + }, + { + "epoch": 0.4143373890167708, + "grad_norm": 7.710633277893066, + "learning_rate": 3.166360544426293e-05, + "loss": 3.373, + "num_input_tokens_seen": 2474672, + "step": 3780 + }, + { + "epoch": 0.4148854543461581, + "grad_norm": 6.710258960723877, + "learning_rate": 3.1622107887490354e-05, + "loss": 2.9773, + "num_input_tokens_seen": 2478184, + "step": 3785 + }, + { + "epoch": 0.4154335196755453, + "grad_norm": 6.593062400817871, + "learning_rate": 3.158059069889843e-05, + "loss": 3.1045, + "num_input_tokens_seen": 2481016, + "step": 3790 + }, + { + "epoch": 0.4159815850049326, + "grad_norm": 8.369247436523438, + "learning_rate": 3.1539054001568493e-05, + "loss": 2.7624, + "num_input_tokens_seen": 2483976, + "step": 3795 + }, + { + "epoch": 0.4165296503343199, + "grad_norm": 5.184842586517334, + "learning_rate": 3.149749791863974e-05, + "loss": 3.2427, + "num_input_tokens_seen": 2486960, + "step": 3800 + }, + { + "epoch": 0.4170777156637071, + "grad_norm": 5.449498653411865, + "learning_rate": 3.145592257330881e-05, + "loss": 3.3931, + "num_input_tokens_seen": 2490928, + "step": 3805 + }, + { + "epoch": 0.4176257809930944, + "grad_norm": 7.610599994659424, + "learning_rate": 3.141432808882946e-05, + "loss": 3.3562, + "num_input_tokens_seen": 2494760, + "step": 3810 + }, + { + "epoch": 0.4181738463224816, + "grad_norm": 6.789968490600586, + "learning_rate": 3.13727145885122e-05, + "loss": 2.823, + "num_input_tokens_seen": 2498352, + "step": 3815 + }, + { + "epoch": 0.4187219116518689, + "grad_norm": 6.654449462890625, + "learning_rate": 3.133108219572388e-05, + "loss": 3.2867, + "num_input_tokens_seen": 2501440, + "step": 3820 + }, + { + "epoch": 0.4192699769812562, + "grad_norm": 6.487675189971924, + "learning_rate": 3.1289431033887386e-05, + "loss": 3.3113, + "num_input_tokens_seen": 2504560, + "step": 3825 + }, + { + "epoch": 0.4198180423106434, + "grad_norm": 7.911233901977539, + "learning_rate": 3.1247761226481244e-05, + "loss": 2.8476, + "num_input_tokens_seen": 2507984, + "step": 3830 + }, + { + "epoch": 0.4203661076400307, + "grad_norm": 7.292878150939941, + "learning_rate": 3.120607289703925e-05, + "loss": 2.9229, + "num_input_tokens_seen": 2511632, + "step": 3835 + }, + { + "epoch": 0.42091417296941797, + "grad_norm": 7.699312686920166, + "learning_rate": 3.11643661691501e-05, + "loss": 3.2728, + "num_input_tokens_seen": 2514512, + "step": 3840 + }, + { + "epoch": 0.4214622382988052, + "grad_norm": 7.424167156219482, + "learning_rate": 3.112264116645705e-05, + "loss": 3.0013, + "num_input_tokens_seen": 2517840, + "step": 3845 + }, + { + "epoch": 0.4220103036281925, + "grad_norm": 6.991738796234131, + "learning_rate": 3.1080898012657536e-05, + "loss": 2.9434, + "num_input_tokens_seen": 2521296, + "step": 3850 + }, + { + "epoch": 0.42255836895757976, + "grad_norm": 6.644684314727783, + "learning_rate": 3.103913683150278e-05, + "loss": 3.4346, + "num_input_tokens_seen": 2523800, + "step": 3855 + }, + { + "epoch": 0.423106434286967, + "grad_norm": 6.666325092315674, + "learning_rate": 3.099735774679749e-05, + "loss": 3.2123, + "num_input_tokens_seen": 2526096, + "step": 3860 + }, + { + "epoch": 0.42365449961635426, + "grad_norm": 9.987031936645508, + "learning_rate": 3.09555608823994e-05, + "loss": 3.2205, + "num_input_tokens_seen": 2528464, + "step": 3865 + }, + { + "epoch": 0.42420256494574154, + "grad_norm": 8.114043235778809, + "learning_rate": 3.091374636221899e-05, + "loss": 3.1648, + "num_input_tokens_seen": 2530808, + "step": 3870 + }, + { + "epoch": 0.42475063027512877, + "grad_norm": 7.4291229248046875, + "learning_rate": 3.087191431021908e-05, + "loss": 2.874, + "num_input_tokens_seen": 2534400, + "step": 3875 + }, + { + "epoch": 0.42529869560451605, + "grad_norm": 6.414401054382324, + "learning_rate": 3.083006485041444e-05, + "loss": 3.0927, + "num_input_tokens_seen": 2538584, + "step": 3880 + }, + { + "epoch": 0.42584676093390333, + "grad_norm": 12.14594554901123, + "learning_rate": 3.078819810687147e-05, + "loss": 3.1133, + "num_input_tokens_seen": 2542184, + "step": 3885 + }, + { + "epoch": 0.42639482626329056, + "grad_norm": 6.391221046447754, + "learning_rate": 3.074631420370779e-05, + "loss": 3.0244, + "num_input_tokens_seen": 2545592, + "step": 3890 + }, + { + "epoch": 0.42694289159267784, + "grad_norm": 6.802542686462402, + "learning_rate": 3.0704413265091916e-05, + "loss": 3.2812, + "num_input_tokens_seen": 2548816, + "step": 3895 + }, + { + "epoch": 0.4274909569220651, + "grad_norm": 7.281493186950684, + "learning_rate": 3.066249541524285e-05, + "loss": 3.3321, + "num_input_tokens_seen": 2552352, + "step": 3900 + }, + { + "epoch": 0.42803902225145235, + "grad_norm": 6.2967047691345215, + "learning_rate": 3.0620560778429736e-05, + "loss": 3.1571, + "num_input_tokens_seen": 2556072, + "step": 3905 + }, + { + "epoch": 0.42858708758083963, + "grad_norm": 5.46196174621582, + "learning_rate": 3.0578609478971474e-05, + "loss": 2.9312, + "num_input_tokens_seen": 2559680, + "step": 3910 + }, + { + "epoch": 0.4291351529102269, + "grad_norm": 6.703193664550781, + "learning_rate": 3.0536641641236366e-05, + "loss": 3.1173, + "num_input_tokens_seen": 2564072, + "step": 3915 + }, + { + "epoch": 0.42968321823961414, + "grad_norm": 6.250140190124512, + "learning_rate": 3.0494657389641763e-05, + "loss": 2.8173, + "num_input_tokens_seen": 2567848, + "step": 3920 + }, + { + "epoch": 0.4302312835690014, + "grad_norm": 8.19283676147461, + "learning_rate": 3.0452656848653643e-05, + "loss": 3.1555, + "num_input_tokens_seen": 2570760, + "step": 3925 + }, + { + "epoch": 0.4307793488983887, + "grad_norm": 4.393120288848877, + "learning_rate": 3.041064014278629e-05, + "loss": 3.3082, + "num_input_tokens_seen": 2574112, + "step": 3930 + }, + { + "epoch": 0.43132741422777593, + "grad_norm": 7.910434246063232, + "learning_rate": 3.036860739660193e-05, + "loss": 3.0528, + "num_input_tokens_seen": 2578144, + "step": 3935 + }, + { + "epoch": 0.4318754795571632, + "grad_norm": 8.536887168884277, + "learning_rate": 3.0326558734710304e-05, + "loss": 3.224, + "num_input_tokens_seen": 2581008, + "step": 3940 + }, + { + "epoch": 0.4324235448865505, + "grad_norm": 5.810432434082031, + "learning_rate": 3.028449428176836e-05, + "loss": 3.2157, + "num_input_tokens_seen": 2583616, + "step": 3945 + }, + { + "epoch": 0.4329716102159377, + "grad_norm": 7.819321632385254, + "learning_rate": 3.024241416247987e-05, + "loss": 3.3845, + "num_input_tokens_seen": 2587680, + "step": 3950 + }, + { + "epoch": 0.433519675545325, + "grad_norm": 7.583765506744385, + "learning_rate": 3.0200318501595028e-05, + "loss": 3.4347, + "num_input_tokens_seen": 2590536, + "step": 3955 + }, + { + "epoch": 0.4340677408747123, + "grad_norm": 6.201939105987549, + "learning_rate": 3.01582074239101e-05, + "loss": 3.0368, + "num_input_tokens_seen": 2593560, + "step": 3960 + }, + { + "epoch": 0.4346158062040995, + "grad_norm": 6.4165425300598145, + "learning_rate": 3.0116081054267086e-05, + "loss": 3.1866, + "num_input_tokens_seen": 2597464, + "step": 3965 + }, + { + "epoch": 0.4351638715334868, + "grad_norm": 5.670197486877441, + "learning_rate": 3.007393951755329e-05, + "loss": 3.1721, + "num_input_tokens_seen": 2600616, + "step": 3970 + }, + { + "epoch": 0.43571193686287407, + "grad_norm": 6.542341709136963, + "learning_rate": 3.0031782938701004e-05, + "loss": 3.1902, + "num_input_tokens_seen": 2603832, + "step": 3975 + }, + { + "epoch": 0.4362600021922613, + "grad_norm": 11.36231803894043, + "learning_rate": 2.9989611442687087e-05, + "loss": 3.1505, + "num_input_tokens_seen": 2607032, + "step": 3980 + }, + { + "epoch": 0.4368080675216486, + "grad_norm": 8.223766326904297, + "learning_rate": 2.994742515453264e-05, + "loss": 3.2596, + "num_input_tokens_seen": 2609848, + "step": 3985 + }, + { + "epoch": 0.43735613285103586, + "grad_norm": 6.220792770385742, + "learning_rate": 2.9905224199302612e-05, + "loss": 3.105, + "num_input_tokens_seen": 2613072, + "step": 3990 + }, + { + "epoch": 0.4379041981804231, + "grad_norm": 9.295598983764648, + "learning_rate": 2.9863008702105444e-05, + "loss": 3.5309, + "num_input_tokens_seen": 2617216, + "step": 3995 + }, + { + "epoch": 0.43845226350981037, + "grad_norm": 7.482667446136475, + "learning_rate": 2.9820778788092662e-05, + "loss": 3.0894, + "num_input_tokens_seen": 2620440, + "step": 4000 + }, + { + "epoch": 0.43900032883919765, + "grad_norm": 8.263635635375977, + "learning_rate": 2.9778534582458563e-05, + "loss": 3.2592, + "num_input_tokens_seen": 2624136, + "step": 4005 + }, + { + "epoch": 0.4395483941685849, + "grad_norm": 6.1141180992126465, + "learning_rate": 2.973627621043979e-05, + "loss": 2.9611, + "num_input_tokens_seen": 2628416, + "step": 4010 + }, + { + "epoch": 0.44009645949797216, + "grad_norm": 5.068775653839111, + "learning_rate": 2.969400379731499e-05, + "loss": 3.2408, + "num_input_tokens_seen": 2632360, + "step": 4015 + }, + { + "epoch": 0.44064452482735944, + "grad_norm": 4.8074049949646, + "learning_rate": 2.965171746840445e-05, + "loss": 3.3503, + "num_input_tokens_seen": 2635144, + "step": 4020 + }, + { + "epoch": 0.44119259015674667, + "grad_norm": 5.924848556518555, + "learning_rate": 2.9609417349069685e-05, + "loss": 2.8347, + "num_input_tokens_seen": 2638880, + "step": 4025 + }, + { + "epoch": 0.44174065548613395, + "grad_norm": 6.371955871582031, + "learning_rate": 2.9567103564713107e-05, + "loss": 3.0076, + "num_input_tokens_seen": 2642200, + "step": 4030 + }, + { + "epoch": 0.44228872081552123, + "grad_norm": 6.616983890533447, + "learning_rate": 2.952477624077764e-05, + "loss": 3.1063, + "num_input_tokens_seen": 2647008, + "step": 4035 + }, + { + "epoch": 0.44283678614490846, + "grad_norm": 6.057950973510742, + "learning_rate": 2.9482435502746363e-05, + "loss": 2.9816, + "num_input_tokens_seen": 2649824, + "step": 4040 + }, + { + "epoch": 0.44338485147429574, + "grad_norm": 5.292036533355713, + "learning_rate": 2.944008147614208e-05, + "loss": 2.9774, + "num_input_tokens_seen": 2652424, + "step": 4045 + }, + { + "epoch": 0.443932916803683, + "grad_norm": 6.374473571777344, + "learning_rate": 2.9397714286527034e-05, + "loss": 2.9106, + "num_input_tokens_seen": 2655792, + "step": 4050 + }, + { + "epoch": 0.44448098213307025, + "grad_norm": 5.729962348937988, + "learning_rate": 2.9355334059502472e-05, + "loss": 3.1529, + "num_input_tokens_seen": 2658608, + "step": 4055 + }, + { + "epoch": 0.4450290474624575, + "grad_norm": 8.748932838439941, + "learning_rate": 2.9312940920708277e-05, + "loss": 3.236, + "num_input_tokens_seen": 2661312, + "step": 4060 + }, + { + "epoch": 0.4455771127918448, + "grad_norm": 8.778289794921875, + "learning_rate": 2.927053499582264e-05, + "loss": 3.1197, + "num_input_tokens_seen": 2665256, + "step": 4065 + }, + { + "epoch": 0.44612517812123204, + "grad_norm": 8.748550415039062, + "learning_rate": 2.922811641056164e-05, + "loss": 3.2486, + "num_input_tokens_seen": 2669288, + "step": 4070 + }, + { + "epoch": 0.4466732434506193, + "grad_norm": 5.559131145477295, + "learning_rate": 2.9185685290678888e-05, + "loss": 2.9932, + "num_input_tokens_seen": 2672312, + "step": 4075 + }, + { + "epoch": 0.4472213087800066, + "grad_norm": 5.6860575675964355, + "learning_rate": 2.9143241761965155e-05, + "loss": 3.1337, + "num_input_tokens_seen": 2676312, + "step": 4080 + }, + { + "epoch": 0.4477693741093938, + "grad_norm": 7.295080184936523, + "learning_rate": 2.9100785950248015e-05, + "loss": 2.9724, + "num_input_tokens_seen": 2679592, + "step": 4085 + }, + { + "epoch": 0.4483174394387811, + "grad_norm": 9.514237403869629, + "learning_rate": 2.9058317981391437e-05, + "loss": 3.1765, + "num_input_tokens_seen": 2682472, + "step": 4090 + }, + { + "epoch": 0.4488655047681684, + "grad_norm": 7.216882705688477, + "learning_rate": 2.901583798129543e-05, + "loss": 3.3707, + "num_input_tokens_seen": 2685328, + "step": 4095 + }, + { + "epoch": 0.4494135700975556, + "grad_norm": 7.9535298347473145, + "learning_rate": 2.8973346075895695e-05, + "loss": 3.4585, + "num_input_tokens_seen": 2688080, + "step": 4100 + }, + { + "epoch": 0.4499616354269429, + "grad_norm": 7.782059669494629, + "learning_rate": 2.8930842391163192e-05, + "loss": 2.9516, + "num_input_tokens_seen": 2691112, + "step": 4105 + }, + { + "epoch": 0.4505097007563302, + "grad_norm": 6.065903186798096, + "learning_rate": 2.8888327053103836e-05, + "loss": 3.0919, + "num_input_tokens_seen": 2694328, + "step": 4110 + }, + { + "epoch": 0.4510577660857174, + "grad_norm": 6.912715435028076, + "learning_rate": 2.884580018775807e-05, + "loss": 2.9052, + "num_input_tokens_seen": 2696856, + "step": 4115 + }, + { + "epoch": 0.4516058314151047, + "grad_norm": 8.30929946899414, + "learning_rate": 2.8803261921200503e-05, + "loss": 3.3268, + "num_input_tokens_seen": 2699968, + "step": 4120 + }, + { + "epoch": 0.45215389674449197, + "grad_norm": 8.51347541809082, + "learning_rate": 2.8760712379539567e-05, + "loss": 3.3617, + "num_input_tokens_seen": 2702416, + "step": 4125 + }, + { + "epoch": 0.4527019620738792, + "grad_norm": 6.167294979095459, + "learning_rate": 2.8718151688917105e-05, + "loss": 3.1805, + "num_input_tokens_seen": 2705440, + "step": 4130 + }, + { + "epoch": 0.4532500274032665, + "grad_norm": 8.299149513244629, + "learning_rate": 2.867557997550801e-05, + "loss": 3.2122, + "num_input_tokens_seen": 2708248, + "step": 4135 + }, + { + "epoch": 0.45379809273265376, + "grad_norm": 8.19796085357666, + "learning_rate": 2.8632997365519877e-05, + "loss": 3.0817, + "num_input_tokens_seen": 2712464, + "step": 4140 + }, + { + "epoch": 0.454346158062041, + "grad_norm": 6.964700698852539, + "learning_rate": 2.859040398519256e-05, + "loss": 3.4051, + "num_input_tokens_seen": 2715048, + "step": 4145 + }, + { + "epoch": 0.45489422339142827, + "grad_norm": 6.310876846313477, + "learning_rate": 2.8547799960797883e-05, + "loss": 2.7846, + "num_input_tokens_seen": 2718192, + "step": 4150 + }, + { + "epoch": 0.45544228872081555, + "grad_norm": 6.786360263824463, + "learning_rate": 2.8505185418639212e-05, + "loss": 2.829, + "num_input_tokens_seen": 2722064, + "step": 4155 + }, + { + "epoch": 0.4559903540502028, + "grad_norm": 7.1503520011901855, + "learning_rate": 2.8462560485051098e-05, + "loss": 2.9883, + "num_input_tokens_seen": 2725640, + "step": 4160 + }, + { + "epoch": 0.45653841937959005, + "grad_norm": 5.350907802581787, + "learning_rate": 2.841992528639888e-05, + "loss": 3.0743, + "num_input_tokens_seen": 2729992, + "step": 4165 + }, + { + "epoch": 0.45708648470897734, + "grad_norm": 5.482122421264648, + "learning_rate": 2.837727994907835e-05, + "loss": 3.2459, + "num_input_tokens_seen": 2733424, + "step": 4170 + }, + { + "epoch": 0.45763455003836456, + "grad_norm": 4.941489219665527, + "learning_rate": 2.833462459951534e-05, + "loss": 3.2963, + "num_input_tokens_seen": 2736656, + "step": 4175 + }, + { + "epoch": 0.45818261536775184, + "grad_norm": 10.229253768920898, + "learning_rate": 2.8291959364165387e-05, + "loss": 3.2607, + "num_input_tokens_seen": 2739808, + "step": 4180 + }, + { + "epoch": 0.4587306806971391, + "grad_norm": 5.911849498748779, + "learning_rate": 2.824928436951332e-05, + "loss": 3.3887, + "num_input_tokens_seen": 2742752, + "step": 4185 + }, + { + "epoch": 0.45927874602652635, + "grad_norm": 6.14879846572876, + "learning_rate": 2.8206599742072883e-05, + "loss": 3.0095, + "num_input_tokens_seen": 2746256, + "step": 4190 + }, + { + "epoch": 0.45982681135591363, + "grad_norm": 6.8150529861450195, + "learning_rate": 2.8163905608386415e-05, + "loss": 3.0599, + "num_input_tokens_seen": 2750736, + "step": 4195 + }, + { + "epoch": 0.4603748766853009, + "grad_norm": 5.578204154968262, + "learning_rate": 2.812120209502441e-05, + "loss": 3.4177, + "num_input_tokens_seen": 2753832, + "step": 4200 + }, + { + "epoch": 0.46092294201468814, + "grad_norm": 7.075170040130615, + "learning_rate": 2.8078489328585184e-05, + "loss": 3.2787, + "num_input_tokens_seen": 2757176, + "step": 4205 + }, + { + "epoch": 0.4614710073440754, + "grad_norm": 7.633877754211426, + "learning_rate": 2.803576743569447e-05, + "loss": 3.2838, + "num_input_tokens_seen": 2760632, + "step": 4210 + }, + { + "epoch": 0.46201907267346265, + "grad_norm": 7.296063423156738, + "learning_rate": 2.7993036543005073e-05, + "loss": 3.2533, + "num_input_tokens_seen": 2763160, + "step": 4215 + }, + { + "epoch": 0.46256713800284993, + "grad_norm": 9.778048515319824, + "learning_rate": 2.7950296777196454e-05, + "loss": 3.2876, + "num_input_tokens_seen": 2766304, + "step": 4220 + }, + { + "epoch": 0.4631152033322372, + "grad_norm": 6.1279826164245605, + "learning_rate": 2.7907548264974408e-05, + "loss": 3.3613, + "num_input_tokens_seen": 2769112, + "step": 4225 + }, + { + "epoch": 0.46366326866162444, + "grad_norm": 7.0411458015441895, + "learning_rate": 2.7864791133070655e-05, + "loss": 2.9218, + "num_input_tokens_seen": 2773120, + "step": 4230 + }, + { + "epoch": 0.4642113339910117, + "grad_norm": 7.575366497039795, + "learning_rate": 2.782202550824244e-05, + "loss": 2.7816, + "num_input_tokens_seen": 2775712, + "step": 4235 + }, + { + "epoch": 0.464759399320399, + "grad_norm": 4.21223258972168, + "learning_rate": 2.777925151727222e-05, + "loss": 2.913, + "num_input_tokens_seen": 2778872, + "step": 4240 + }, + { + "epoch": 0.46530746464978623, + "grad_norm": 7.198635101318359, + "learning_rate": 2.7736469286967244e-05, + "loss": 3.3944, + "num_input_tokens_seen": 2783424, + "step": 4245 + }, + { + "epoch": 0.4658555299791735, + "grad_norm": 6.785750389099121, + "learning_rate": 2.7693678944159168e-05, + "loss": 3.0493, + "num_input_tokens_seen": 2787720, + "step": 4250 + }, + { + "epoch": 0.4664035953085608, + "grad_norm": 5.799097061157227, + "learning_rate": 2.7650880615703735e-05, + "loss": 3.043, + "num_input_tokens_seen": 2790528, + "step": 4255 + }, + { + "epoch": 0.466951660637948, + "grad_norm": 5.558688163757324, + "learning_rate": 2.760807442848033e-05, + "loss": 3.0476, + "num_input_tokens_seen": 2794088, + "step": 4260 + }, + { + "epoch": 0.4674997259673353, + "grad_norm": 7.959995269775391, + "learning_rate": 2.7565260509391644e-05, + "loss": 3.3705, + "num_input_tokens_seen": 2797168, + "step": 4265 + }, + { + "epoch": 0.4680477912967226, + "grad_norm": 5.836214542388916, + "learning_rate": 2.7522438985363297e-05, + "loss": 3.1173, + "num_input_tokens_seen": 2799752, + "step": 4270 + }, + { + "epoch": 0.4685958566261098, + "grad_norm": 5.6099348068237305, + "learning_rate": 2.7479609983343457e-05, + "loss": 3.4298, + "num_input_tokens_seen": 2803560, + "step": 4275 + }, + { + "epoch": 0.4691439219554971, + "grad_norm": 6.971024513244629, + "learning_rate": 2.7436773630302448e-05, + "loss": 3.4299, + "num_input_tokens_seen": 2806360, + "step": 4280 + }, + { + "epoch": 0.46969198728488437, + "grad_norm": 5.738091945648193, + "learning_rate": 2.7393930053232393e-05, + "loss": 3.0872, + "num_input_tokens_seen": 2809408, + "step": 4285 + }, + { + "epoch": 0.4702400526142716, + "grad_norm": 10.746182441711426, + "learning_rate": 2.7351079379146844e-05, + "loss": 3.5487, + "num_input_tokens_seen": 2812752, + "step": 4290 + }, + { + "epoch": 0.4707881179436589, + "grad_norm": 6.557742595672607, + "learning_rate": 2.7308221735080363e-05, + "loss": 3.1006, + "num_input_tokens_seen": 2816432, + "step": 4295 + }, + { + "epoch": 0.47133618327304616, + "grad_norm": 7.124549865722656, + "learning_rate": 2.726535724808821e-05, + "loss": 3.2491, + "num_input_tokens_seen": 2819608, + "step": 4300 + }, + { + "epoch": 0.4718842486024334, + "grad_norm": 8.328391075134277, + "learning_rate": 2.7222486045245905e-05, + "loss": 2.9571, + "num_input_tokens_seen": 2822304, + "step": 4305 + }, + { + "epoch": 0.47243231393182067, + "grad_norm": 8.121037483215332, + "learning_rate": 2.717960825364888e-05, + "loss": 3.0946, + "num_input_tokens_seen": 2826112, + "step": 4310 + }, + { + "epoch": 0.47298037926120795, + "grad_norm": 7.5214715003967285, + "learning_rate": 2.7136724000412122e-05, + "loss": 3.2682, + "num_input_tokens_seen": 2829640, + "step": 4315 + }, + { + "epoch": 0.4735284445905952, + "grad_norm": 5.765413761138916, + "learning_rate": 2.709383341266975e-05, + "loss": 3.3871, + "num_input_tokens_seen": 2832536, + "step": 4320 + }, + { + "epoch": 0.47407650991998246, + "grad_norm": 7.573315143585205, + "learning_rate": 2.7050936617574674e-05, + "loss": 3.0505, + "num_input_tokens_seen": 2835312, + "step": 4325 + }, + { + "epoch": 0.47462457524936974, + "grad_norm": 5.444807052612305, + "learning_rate": 2.70080337422982e-05, + "loss": 3.1385, + "num_input_tokens_seen": 2839520, + "step": 4330 + }, + { + "epoch": 0.47517264057875697, + "grad_norm": 5.842774868011475, + "learning_rate": 2.696512491402967e-05, + "loss": 3.0295, + "num_input_tokens_seen": 2842096, + "step": 4335 + }, + { + "epoch": 0.47572070590814425, + "grad_norm": 6.1106157302856445, + "learning_rate": 2.692221025997606e-05, + "loss": 3.0393, + "num_input_tokens_seen": 2845424, + "step": 4340 + }, + { + "epoch": 0.47626877123753153, + "grad_norm": 7.988515377044678, + "learning_rate": 2.687928990736163e-05, + "loss": 3.3657, + "num_input_tokens_seen": 2847648, + "step": 4345 + }, + { + "epoch": 0.47681683656691876, + "grad_norm": 7.0514655113220215, + "learning_rate": 2.683636398342753e-05, + "loss": 3.4438, + "num_input_tokens_seen": 2850432, + "step": 4350 + }, + { + "epoch": 0.47736490189630604, + "grad_norm": 5.54784631729126, + "learning_rate": 2.6793432615431406e-05, + "loss": 2.9583, + "num_input_tokens_seen": 2854176, + "step": 4355 + }, + { + "epoch": 0.4779129672256933, + "grad_norm": 6.001830577850342, + "learning_rate": 2.6750495930647083e-05, + "loss": 3.4694, + "num_input_tokens_seen": 2857368, + "step": 4360 + }, + { + "epoch": 0.47846103255508055, + "grad_norm": 7.455556392669678, + "learning_rate": 2.670755405636412e-05, + "loss": 3.0839, + "num_input_tokens_seen": 2860064, + "step": 4365 + }, + { + "epoch": 0.4790090978844678, + "grad_norm": 6.409590721130371, + "learning_rate": 2.6664607119887462e-05, + "loss": 3.0962, + "num_input_tokens_seen": 2863128, + "step": 4370 + }, + { + "epoch": 0.4795571632138551, + "grad_norm": 5.903439044952393, + "learning_rate": 2.6621655248537075e-05, + "loss": 3.0613, + "num_input_tokens_seen": 2866720, + "step": 4375 + }, + { + "epoch": 0.48010522854324233, + "grad_norm": 7.286397457122803, + "learning_rate": 2.657869856964754e-05, + "loss": 2.9673, + "num_input_tokens_seen": 2869568, + "step": 4380 + }, + { + "epoch": 0.4806532938726296, + "grad_norm": 7.941439151763916, + "learning_rate": 2.6535737210567707e-05, + "loss": 3.3656, + "num_input_tokens_seen": 2874584, + "step": 4385 + }, + { + "epoch": 0.4812013592020169, + "grad_norm": 3.8733413219451904, + "learning_rate": 2.6492771298660286e-05, + "loss": 2.8012, + "num_input_tokens_seen": 2879248, + "step": 4390 + }, + { + "epoch": 0.4817494245314041, + "grad_norm": 4.492478370666504, + "learning_rate": 2.6449800961301485e-05, + "loss": 2.9495, + "num_input_tokens_seen": 2882824, + "step": 4395 + }, + { + "epoch": 0.4822974898607914, + "grad_norm": 7.726132392883301, + "learning_rate": 2.640682632588064e-05, + "loss": 3.1087, + "num_input_tokens_seen": 2886440, + "step": 4400 + }, + { + "epoch": 0.4828455551901787, + "grad_norm": 6.549642562866211, + "learning_rate": 2.6363847519799822e-05, + "loss": 2.985, + "num_input_tokens_seen": 2889808, + "step": 4405 + }, + { + "epoch": 0.4833936205195659, + "grad_norm": 8.789740562438965, + "learning_rate": 2.632086467047348e-05, + "loss": 3.1352, + "num_input_tokens_seen": 2893680, + "step": 4410 + }, + { + "epoch": 0.4839416858489532, + "grad_norm": 8.024590492248535, + "learning_rate": 2.6277877905328023e-05, + "loss": 3.3008, + "num_input_tokens_seen": 2895872, + "step": 4415 + }, + { + "epoch": 0.4844897511783405, + "grad_norm": 6.235259532928467, + "learning_rate": 2.623488735180149e-05, + "loss": 3.1758, + "num_input_tokens_seen": 2898680, + "step": 4420 + }, + { + "epoch": 0.4850378165077277, + "grad_norm": 7.674651145935059, + "learning_rate": 2.619189313734316e-05, + "loss": 2.9519, + "num_input_tokens_seen": 2903496, + "step": 4425 + }, + { + "epoch": 0.485585881837115, + "grad_norm": 5.884274959564209, + "learning_rate": 2.614889538941313e-05, + "loss": 3.3259, + "num_input_tokens_seen": 2906248, + "step": 4430 + }, + { + "epoch": 0.48613394716650227, + "grad_norm": 5.681421279907227, + "learning_rate": 2.610589423548201e-05, + "loss": 3.4432, + "num_input_tokens_seen": 2909352, + "step": 4435 + }, + { + "epoch": 0.4866820124958895, + "grad_norm": 8.08205795288086, + "learning_rate": 2.6062889803030477e-05, + "loss": 3.6165, + "num_input_tokens_seen": 2911960, + "step": 4440 + }, + { + "epoch": 0.4872300778252768, + "grad_norm": 7.7329277992248535, + "learning_rate": 2.601988221954894e-05, + "loss": 3.2172, + "num_input_tokens_seen": 2915256, + "step": 4445 + }, + { + "epoch": 0.48777814315466406, + "grad_norm": 6.208625793457031, + "learning_rate": 2.5976871612537164e-05, + "loss": 3.2373, + "num_input_tokens_seen": 2919040, + "step": 4450 + }, + { + "epoch": 0.4883262084840513, + "grad_norm": 8.127032279968262, + "learning_rate": 2.593385810950386e-05, + "loss": 2.9402, + "num_input_tokens_seen": 2922272, + "step": 4455 + }, + { + "epoch": 0.48887427381343856, + "grad_norm": 6.481329441070557, + "learning_rate": 2.589084183796632e-05, + "loss": 3.0208, + "num_input_tokens_seen": 2926072, + "step": 4460 + }, + { + "epoch": 0.48942233914282585, + "grad_norm": 6.350535869598389, + "learning_rate": 2.5847822925450055e-05, + "loss": 3.1026, + "num_input_tokens_seen": 2928760, + "step": 4465 + }, + { + "epoch": 0.4899704044722131, + "grad_norm": 7.3511457443237305, + "learning_rate": 2.5804801499488407e-05, + "loss": 2.9358, + "num_input_tokens_seen": 2932088, + "step": 4470 + }, + { + "epoch": 0.49051846980160035, + "grad_norm": 5.9759521484375, + "learning_rate": 2.576177768762216e-05, + "loss": 3.1564, + "num_input_tokens_seen": 2935272, + "step": 4475 + }, + { + "epoch": 0.49106653513098764, + "grad_norm": 7.138418674468994, + "learning_rate": 2.5718751617399182e-05, + "loss": 3.0998, + "num_input_tokens_seen": 2938280, + "step": 4480 + }, + { + "epoch": 0.49161460046037486, + "grad_norm": 10.551050186157227, + "learning_rate": 2.5675723416374026e-05, + "loss": 3.1874, + "num_input_tokens_seen": 2941648, + "step": 4485 + }, + { + "epoch": 0.49216266578976214, + "grad_norm": 6.085887432098389, + "learning_rate": 2.5632693212107567e-05, + "loss": 2.8506, + "num_input_tokens_seen": 2944680, + "step": 4490 + }, + { + "epoch": 0.4927107311191494, + "grad_norm": 6.314172267913818, + "learning_rate": 2.5589661132166613e-05, + "loss": 2.8206, + "num_input_tokens_seen": 2948744, + "step": 4495 + }, + { + "epoch": 0.49325879644853665, + "grad_norm": 6.3680853843688965, + "learning_rate": 2.5546627304123545e-05, + "loss": 2.85, + "num_input_tokens_seen": 2951256, + "step": 4500 + }, + { + "epoch": 0.49380686177792393, + "grad_norm": 6.314942359924316, + "learning_rate": 2.5503591855555908e-05, + "loss": 3.2021, + "num_input_tokens_seen": 2954536, + "step": 4505 + }, + { + "epoch": 0.4943549271073112, + "grad_norm": 6.349035739898682, + "learning_rate": 2.546055491404607e-05, + "loss": 2.9022, + "num_input_tokens_seen": 2958112, + "step": 4510 + }, + { + "epoch": 0.49490299243669844, + "grad_norm": 6.812668800354004, + "learning_rate": 2.5417516607180825e-05, + "loss": 3.2304, + "num_input_tokens_seen": 2961024, + "step": 4515 + }, + { + "epoch": 0.4954510577660857, + "grad_norm": 4.483590126037598, + "learning_rate": 2.5374477062550984e-05, + "loss": 2.8489, + "num_input_tokens_seen": 2964344, + "step": 4520 + }, + { + "epoch": 0.495999123095473, + "grad_norm": 6.769683837890625, + "learning_rate": 2.5331436407751074e-05, + "loss": 3.1946, + "num_input_tokens_seen": 2967608, + "step": 4525 + }, + { + "epoch": 0.49654718842486023, + "grad_norm": 9.059048652648926, + "learning_rate": 2.528839477037887e-05, + "loss": 3.2895, + "num_input_tokens_seen": 2970488, + "step": 4530 + }, + { + "epoch": 0.4970952537542475, + "grad_norm": 9.555692672729492, + "learning_rate": 2.5245352278035095e-05, + "loss": 3.0595, + "num_input_tokens_seen": 2973200, + "step": 4535 + }, + { + "epoch": 0.4976433190836348, + "grad_norm": 8.808011054992676, + "learning_rate": 2.520230905832298e-05, + "loss": 3.1939, + "num_input_tokens_seen": 2976576, + "step": 4540 + }, + { + "epoch": 0.498191384413022, + "grad_norm": 7.059693336486816, + "learning_rate": 2.515926523884792e-05, + "loss": 3.3154, + "num_input_tokens_seen": 2980624, + "step": 4545 + }, + { + "epoch": 0.4987394497424093, + "grad_norm": 5.0204973220825195, + "learning_rate": 2.5116220947217107e-05, + "loss": 3.2012, + "num_input_tokens_seen": 2983328, + "step": 4550 + }, + { + "epoch": 0.4992875150717966, + "grad_norm": 8.473772048950195, + "learning_rate": 2.507317631103911e-05, + "loss": 3.3448, + "num_input_tokens_seen": 2986664, + "step": 4555 + }, + { + "epoch": 0.4998355804011838, + "grad_norm": 5.891829490661621, + "learning_rate": 2.5030131457923512e-05, + "loss": 3.0624, + "num_input_tokens_seen": 2990088, + "step": 4560 + }, + { + "epoch": 0.500383645730571, + "grad_norm": 8.812019348144531, + "learning_rate": 2.498708651548057e-05, + "loss": 3.1606, + "num_input_tokens_seen": 2993152, + "step": 4565 + }, + { + "epoch": 0.5009317110599584, + "grad_norm": 6.772736549377441, + "learning_rate": 2.494404161132079e-05, + "loss": 2.6401, + "num_input_tokens_seen": 2996104, + "step": 4570 + }, + { + "epoch": 0.5014797763893456, + "grad_norm": 6.640130996704102, + "learning_rate": 2.490099687305455e-05, + "loss": 2.8047, + "num_input_tokens_seen": 3000664, + "step": 4575 + }, + { + "epoch": 0.5020278417187328, + "grad_norm": 8.050363540649414, + "learning_rate": 2.485795242829177e-05, + "loss": 2.9757, + "num_input_tokens_seen": 3004312, + "step": 4580 + }, + { + "epoch": 0.5025759070481202, + "grad_norm": 7.689075469970703, + "learning_rate": 2.481490840464147e-05, + "loss": 3.6823, + "num_input_tokens_seen": 3008056, + "step": 4585 + }, + { + "epoch": 0.5031239723775074, + "grad_norm": 7.890453815460205, + "learning_rate": 2.4771864929711414e-05, + "loss": 3.5555, + "num_input_tokens_seen": 3010640, + "step": 4590 + }, + { + "epoch": 0.5036720377068946, + "grad_norm": 8.07981014251709, + "learning_rate": 2.4728822131107784e-05, + "loss": 2.9504, + "num_input_tokens_seen": 3013752, + "step": 4595 + }, + { + "epoch": 0.504220103036282, + "grad_norm": 5.753955364227295, + "learning_rate": 2.468578013643472e-05, + "loss": 3.1703, + "num_input_tokens_seen": 3016248, + "step": 4600 + }, + { + "epoch": 0.5047681683656692, + "grad_norm": 5.296700954437256, + "learning_rate": 2.4642739073293978e-05, + "loss": 2.8482, + "num_input_tokens_seen": 3019256, + "step": 4605 + }, + { + "epoch": 0.5053162336950564, + "grad_norm": 11.357376098632812, + "learning_rate": 2.459969906928458e-05, + "loss": 2.8125, + "num_input_tokens_seen": 3021936, + "step": 4610 + }, + { + "epoch": 0.5058642990244437, + "grad_norm": 9.2806396484375, + "learning_rate": 2.4556660252002384e-05, + "loss": 3.1294, + "num_input_tokens_seen": 3025888, + "step": 4615 + }, + { + "epoch": 0.506412364353831, + "grad_norm": 7.156399250030518, + "learning_rate": 2.451362274903973e-05, + "loss": 3.202, + "num_input_tokens_seen": 3029752, + "step": 4620 + }, + { + "epoch": 0.5069604296832182, + "grad_norm": 7.298778533935547, + "learning_rate": 2.4470586687985077e-05, + "loss": 3.2958, + "num_input_tokens_seen": 3033576, + "step": 4625 + }, + { + "epoch": 0.5075084950126055, + "grad_norm": 7.478179454803467, + "learning_rate": 2.4427552196422602e-05, + "loss": 3.1416, + "num_input_tokens_seen": 3037016, + "step": 4630 + }, + { + "epoch": 0.5080565603419928, + "grad_norm": 8.109244346618652, + "learning_rate": 2.438451940193181e-05, + "loss": 2.7633, + "num_input_tokens_seen": 3040640, + "step": 4635 + }, + { + "epoch": 0.50860462567138, + "grad_norm": 6.991682052612305, + "learning_rate": 2.434148843208722e-05, + "loss": 2.9995, + "num_input_tokens_seen": 3043424, + "step": 4640 + }, + { + "epoch": 0.5091526910007673, + "grad_norm": 5.315702438354492, + "learning_rate": 2.4298459414457896e-05, + "loss": 2.9122, + "num_input_tokens_seen": 3046672, + "step": 4645 + }, + { + "epoch": 0.5097007563301545, + "grad_norm": 8.090765953063965, + "learning_rate": 2.425543247660713e-05, + "loss": 3.3741, + "num_input_tokens_seen": 3049736, + "step": 4650 + }, + { + "epoch": 0.5102488216595418, + "grad_norm": 9.288080215454102, + "learning_rate": 2.4212407746092066e-05, + "loss": 3.4609, + "num_input_tokens_seen": 3053656, + "step": 4655 + }, + { + "epoch": 0.5107968869889291, + "grad_norm": 5.754721164703369, + "learning_rate": 2.4169385350463282e-05, + "loss": 2.9946, + "num_input_tokens_seen": 3056144, + "step": 4660 + }, + { + "epoch": 0.5113449523183163, + "grad_norm": 6.588372230529785, + "learning_rate": 2.412636541726444e-05, + "loss": 3.0074, + "num_input_tokens_seen": 3059712, + "step": 4665 + }, + { + "epoch": 0.5118930176477036, + "grad_norm": 7.401770114898682, + "learning_rate": 2.4083348074031904e-05, + "loss": 3.4029, + "num_input_tokens_seen": 3062288, + "step": 4670 + }, + { + "epoch": 0.5124410829770909, + "grad_norm": 5.612600803375244, + "learning_rate": 2.4040333448294364e-05, + "loss": 3.2012, + "num_input_tokens_seen": 3065728, + "step": 4675 + }, + { + "epoch": 0.5129891483064781, + "grad_norm": 5.925127983093262, + "learning_rate": 2.399732166757243e-05, + "loss": 3.0461, + "num_input_tokens_seen": 3068632, + "step": 4680 + }, + { + "epoch": 0.5135372136358654, + "grad_norm": 8.738677978515625, + "learning_rate": 2.3954312859378325e-05, + "loss": 3.4782, + "num_input_tokens_seen": 3070968, + "step": 4685 + }, + { + "epoch": 0.5140852789652527, + "grad_norm": 9.27092170715332, + "learning_rate": 2.3911307151215413e-05, + "loss": 3.2625, + "num_input_tokens_seen": 3074696, + "step": 4690 + }, + { + "epoch": 0.5146333442946399, + "grad_norm": 5.855086326599121, + "learning_rate": 2.3868304670577886e-05, + "loss": 3.045, + "num_input_tokens_seen": 3078584, + "step": 4695 + }, + { + "epoch": 0.5151814096240271, + "grad_norm": 8.794078826904297, + "learning_rate": 2.3825305544950374e-05, + "loss": 2.7209, + "num_input_tokens_seen": 3081624, + "step": 4700 + }, + { + "epoch": 0.5157294749534145, + "grad_norm": 7.675835132598877, + "learning_rate": 2.3782309901807555e-05, + "loss": 3.3431, + "num_input_tokens_seen": 3084152, + "step": 4705 + }, + { + "epoch": 0.5162775402828017, + "grad_norm": 7.583930969238281, + "learning_rate": 2.3739317868613776e-05, + "loss": 3.1141, + "num_input_tokens_seen": 3087040, + "step": 4710 + }, + { + "epoch": 0.5168256056121889, + "grad_norm": 7.561563968658447, + "learning_rate": 2.369632957282269e-05, + "loss": 3.4023, + "num_input_tokens_seen": 3090352, + "step": 4715 + }, + { + "epoch": 0.5173736709415763, + "grad_norm": 6.868551254272461, + "learning_rate": 2.365334514187687e-05, + "loss": 3.0766, + "num_input_tokens_seen": 3093552, + "step": 4720 + }, + { + "epoch": 0.5179217362709635, + "grad_norm": 5.663219928741455, + "learning_rate": 2.3610364703207432e-05, + "loss": 3.1136, + "num_input_tokens_seen": 3097168, + "step": 4725 + }, + { + "epoch": 0.5184698016003507, + "grad_norm": 7.611098766326904, + "learning_rate": 2.3567388384233648e-05, + "loss": 3.0911, + "num_input_tokens_seen": 3101648, + "step": 4730 + }, + { + "epoch": 0.5190178669297381, + "grad_norm": 6.850576877593994, + "learning_rate": 2.352441631236259e-05, + "loss": 2.9311, + "num_input_tokens_seen": 3105888, + "step": 4735 + }, + { + "epoch": 0.5195659322591253, + "grad_norm": 5.57901668548584, + "learning_rate": 2.348144861498873e-05, + "loss": 3.0239, + "num_input_tokens_seen": 3110648, + "step": 4740 + }, + { + "epoch": 0.5201139975885125, + "grad_norm": 6.950675010681152, + "learning_rate": 2.343848541949356e-05, + "loss": 3.053, + "num_input_tokens_seen": 3113400, + "step": 4745 + }, + { + "epoch": 0.5206620629178998, + "grad_norm": 5.661995887756348, + "learning_rate": 2.3395526853245264e-05, + "loss": 3.2619, + "num_input_tokens_seen": 3117000, + "step": 4750 + }, + { + "epoch": 0.5212101282472871, + "grad_norm": 6.956995010375977, + "learning_rate": 2.3352573043598267e-05, + "loss": 3.6572, + "num_input_tokens_seen": 3121664, + "step": 4755 + }, + { + "epoch": 0.5217581935766743, + "grad_norm": 4.707006454467773, + "learning_rate": 2.3309624117892885e-05, + "loss": 2.9066, + "num_input_tokens_seen": 3124872, + "step": 4760 + }, + { + "epoch": 0.5223062589060616, + "grad_norm": 5.503338813781738, + "learning_rate": 2.3266680203455004e-05, + "loss": 3.2066, + "num_input_tokens_seen": 3128760, + "step": 4765 + }, + { + "epoch": 0.5228543242354489, + "grad_norm": 7.054602146148682, + "learning_rate": 2.322374142759561e-05, + "loss": 2.8683, + "num_input_tokens_seen": 3131480, + "step": 4770 + }, + { + "epoch": 0.5234023895648361, + "grad_norm": 8.06494140625, + "learning_rate": 2.318080791761046e-05, + "loss": 3.2634, + "num_input_tokens_seen": 3135040, + "step": 4775 + }, + { + "epoch": 0.5239504548942234, + "grad_norm": 8.718894958496094, + "learning_rate": 2.313787980077972e-05, + "loss": 3.3735, + "num_input_tokens_seen": 3137816, + "step": 4780 + }, + { + "epoch": 0.5244985202236107, + "grad_norm": 6.601426124572754, + "learning_rate": 2.309495720436755e-05, + "loss": 3.0622, + "num_input_tokens_seen": 3141752, + "step": 4785 + }, + { + "epoch": 0.5250465855529979, + "grad_norm": 7.08184814453125, + "learning_rate": 2.305204025562174e-05, + "loss": 2.6361, + "num_input_tokens_seen": 3144792, + "step": 4790 + }, + { + "epoch": 0.5255946508823852, + "grad_norm": 8.298012733459473, + "learning_rate": 2.3009129081773366e-05, + "loss": 2.8071, + "num_input_tokens_seen": 3147904, + "step": 4795 + }, + { + "epoch": 0.5261427162117724, + "grad_norm": 7.070413589477539, + "learning_rate": 2.2966223810036357e-05, + "loss": 3.2667, + "num_input_tokens_seen": 3150344, + "step": 4800 + }, + { + "epoch": 0.5266907815411597, + "grad_norm": 8.037806510925293, + "learning_rate": 2.292332456760714e-05, + "loss": 3.3148, + "num_input_tokens_seen": 3154328, + "step": 4805 + }, + { + "epoch": 0.527238846870547, + "grad_norm": 5.284430980682373, + "learning_rate": 2.2880431481664306e-05, + "loss": 2.6196, + "num_input_tokens_seen": 3157392, + "step": 4810 + }, + { + "epoch": 0.5277869121999342, + "grad_norm": 7.804793357849121, + "learning_rate": 2.283754467936815e-05, + "loss": 2.9899, + "num_input_tokens_seen": 3160304, + "step": 4815 + }, + { + "epoch": 0.5283349775293215, + "grad_norm": 8.394335746765137, + "learning_rate": 2.279466428786035e-05, + "loss": 3.2071, + "num_input_tokens_seen": 3163736, + "step": 4820 + }, + { + "epoch": 0.5288830428587088, + "grad_norm": 6.269372463226318, + "learning_rate": 2.2751790434263608e-05, + "loss": 3.1003, + "num_input_tokens_seen": 3166368, + "step": 4825 + }, + { + "epoch": 0.529431108188096, + "grad_norm": 7.112332820892334, + "learning_rate": 2.2708923245681203e-05, + "loss": 3.2725, + "num_input_tokens_seen": 3169960, + "step": 4830 + }, + { + "epoch": 0.5299791735174832, + "grad_norm": 8.58667278289795, + "learning_rate": 2.266606284919667e-05, + "loss": 2.7479, + "num_input_tokens_seen": 3172744, + "step": 4835 + }, + { + "epoch": 0.5305272388468706, + "grad_norm": 7.745898723602295, + "learning_rate": 2.262320937187344e-05, + "loss": 3.4911, + "num_input_tokens_seen": 3175984, + "step": 4840 + }, + { + "epoch": 0.5310753041762578, + "grad_norm": 6.885601997375488, + "learning_rate": 2.258036294075438e-05, + "loss": 2.8831, + "num_input_tokens_seen": 3178800, + "step": 4845 + }, + { + "epoch": 0.531623369505645, + "grad_norm": 6.387146472930908, + "learning_rate": 2.2537523682861484e-05, + "loss": 3.0745, + "num_input_tokens_seen": 3182328, + "step": 4850 + }, + { + "epoch": 0.5321714348350324, + "grad_norm": 4.868107795715332, + "learning_rate": 2.249469172519551e-05, + "loss": 3.0048, + "num_input_tokens_seen": 3185912, + "step": 4855 + }, + { + "epoch": 0.5327195001644196, + "grad_norm": 8.075777053833008, + "learning_rate": 2.2451867194735542e-05, + "loss": 3.3234, + "num_input_tokens_seen": 3189352, + "step": 4860 + }, + { + "epoch": 0.5332675654938068, + "grad_norm": 5.830811500549316, + "learning_rate": 2.2409050218438645e-05, + "loss": 3.0588, + "num_input_tokens_seen": 3193072, + "step": 4865 + }, + { + "epoch": 0.5338156308231942, + "grad_norm": 5.349551200866699, + "learning_rate": 2.2366240923239514e-05, + "loss": 2.7223, + "num_input_tokens_seen": 3196104, + "step": 4870 + }, + { + "epoch": 0.5343636961525814, + "grad_norm": 8.454142570495605, + "learning_rate": 2.2323439436050054e-05, + "loss": 3.1157, + "num_input_tokens_seen": 3198648, + "step": 4875 + }, + { + "epoch": 0.5349117614819686, + "grad_norm": 7.110290050506592, + "learning_rate": 2.2280645883759006e-05, + "loss": 3.0379, + "num_input_tokens_seen": 3201056, + "step": 4880 + }, + { + "epoch": 0.535459826811356, + "grad_norm": 5.1915154457092285, + "learning_rate": 2.2237860393231634e-05, + "loss": 3.575, + "num_input_tokens_seen": 3203712, + "step": 4885 + }, + { + "epoch": 0.5360078921407432, + "grad_norm": 8.497429847717285, + "learning_rate": 2.219508309130927e-05, + "loss": 2.9379, + "num_input_tokens_seen": 3206288, + "step": 4890 + }, + { + "epoch": 0.5365559574701304, + "grad_norm": 8.26462173461914, + "learning_rate": 2.2152314104808956e-05, + "loss": 3.1587, + "num_input_tokens_seen": 3209928, + "step": 4895 + }, + { + "epoch": 0.5371040227995177, + "grad_norm": 6.499933242797852, + "learning_rate": 2.210955356052313e-05, + "loss": 2.9181, + "num_input_tokens_seen": 3213336, + "step": 4900 + }, + { + "epoch": 0.537652088128905, + "grad_norm": 5.8398590087890625, + "learning_rate": 2.2066801585219156e-05, + "loss": 2.8303, + "num_input_tokens_seen": 3216464, + "step": 4905 + }, + { + "epoch": 0.5382001534582922, + "grad_norm": 6.813495635986328, + "learning_rate": 2.2024058305639015e-05, + "loss": 2.9079, + "num_input_tokens_seen": 3221256, + "step": 4910 + }, + { + "epoch": 0.5387482187876795, + "grad_norm": 8.064513206481934, + "learning_rate": 2.198132384849891e-05, + "loss": 3.2373, + "num_input_tokens_seen": 3224320, + "step": 4915 + }, + { + "epoch": 0.5392962841170668, + "grad_norm": 7.14154577255249, + "learning_rate": 2.1938598340488886e-05, + "loss": 3.0737, + "num_input_tokens_seen": 3227128, + "step": 4920 + }, + { + "epoch": 0.539844349446454, + "grad_norm": 6.514719009399414, + "learning_rate": 2.1895881908272446e-05, + "loss": 2.8825, + "num_input_tokens_seen": 3230352, + "step": 4925 + }, + { + "epoch": 0.5403924147758413, + "grad_norm": 7.076175212860107, + "learning_rate": 2.1853174678486213e-05, + "loss": 2.8721, + "num_input_tokens_seen": 3234440, + "step": 4930 + }, + { + "epoch": 0.5409404801052285, + "grad_norm": 5.526149749755859, + "learning_rate": 2.1810476777739508e-05, + "loss": 3.1112, + "num_input_tokens_seen": 3238176, + "step": 4935 + }, + { + "epoch": 0.5414885454346158, + "grad_norm": 8.458449363708496, + "learning_rate": 2.176778833261399e-05, + "loss": 3.2798, + "num_input_tokens_seen": 3241728, + "step": 4940 + }, + { + "epoch": 0.5420366107640031, + "grad_norm": 7.216832160949707, + "learning_rate": 2.1725109469663318e-05, + "loss": 3.1847, + "num_input_tokens_seen": 3244416, + "step": 4945 + }, + { + "epoch": 0.5425846760933903, + "grad_norm": 5.6720147132873535, + "learning_rate": 2.168244031541271e-05, + "loss": 3.4552, + "num_input_tokens_seen": 3247816, + "step": 4950 + }, + { + "epoch": 0.5431327414227776, + "grad_norm": 7.452066898345947, + "learning_rate": 2.163978099635861e-05, + "loss": 2.958, + "num_input_tokens_seen": 3250432, + "step": 4955 + }, + { + "epoch": 0.5436808067521649, + "grad_norm": 6.589701175689697, + "learning_rate": 2.159713163896832e-05, + "loss": 3.4633, + "num_input_tokens_seen": 3253376, + "step": 4960 + }, + { + "epoch": 0.5442288720815521, + "grad_norm": 4.926830768585205, + "learning_rate": 2.1554492369679598e-05, + "loss": 3.0458, + "num_input_tokens_seen": 3257640, + "step": 4965 + }, + { + "epoch": 0.5447769374109394, + "grad_norm": 8.084177017211914, + "learning_rate": 2.1511863314900275e-05, + "loss": 2.992, + "num_input_tokens_seen": 3261952, + "step": 4970 + }, + { + "epoch": 0.5453250027403267, + "grad_norm": 5.291374683380127, + "learning_rate": 2.146924460100795e-05, + "loss": 2.5116, + "num_input_tokens_seen": 3265912, + "step": 4975 + }, + { + "epoch": 0.5458730680697139, + "grad_norm": 9.101826667785645, + "learning_rate": 2.1426636354349523e-05, + "loss": 3.0809, + "num_input_tokens_seen": 3269624, + "step": 4980 + }, + { + "epoch": 0.5464211333991011, + "grad_norm": 9.933355331420898, + "learning_rate": 2.1384038701240865e-05, + "loss": 2.6956, + "num_input_tokens_seen": 3273112, + "step": 4985 + }, + { + "epoch": 0.5469691987284885, + "grad_norm": 8.288704872131348, + "learning_rate": 2.1341451767966475e-05, + "loss": 3.319, + "num_input_tokens_seen": 3275624, + "step": 4990 + }, + { + "epoch": 0.5475172640578757, + "grad_norm": 6.39847469329834, + "learning_rate": 2.129887568077904e-05, + "loss": 3.0552, + "num_input_tokens_seen": 3279792, + "step": 4995 + }, + { + "epoch": 0.5480653293872629, + "grad_norm": 6.739533424377441, + "learning_rate": 2.12563105658991e-05, + "loss": 3.1218, + "num_input_tokens_seen": 3283560, + "step": 5000 + }, + { + "epoch": 0.5486133947166503, + "grad_norm": 7.888918399810791, + "learning_rate": 2.1213756549514674e-05, + "loss": 3.0369, + "num_input_tokens_seen": 3286504, + "step": 5005 + }, + { + "epoch": 0.5491614600460375, + "grad_norm": 6.957367897033691, + "learning_rate": 2.1171213757780873e-05, + "loss": 2.9968, + "num_input_tokens_seen": 3289512, + "step": 5010 + }, + { + "epoch": 0.5497095253754247, + "grad_norm": 6.351596355438232, + "learning_rate": 2.1128682316819522e-05, + "loss": 3.0657, + "num_input_tokens_seen": 3293512, + "step": 5015 + }, + { + "epoch": 0.5502575907048121, + "grad_norm": 7.056116104125977, + "learning_rate": 2.1086162352718825e-05, + "loss": 3.029, + "num_input_tokens_seen": 3298024, + "step": 5020 + }, + { + "epoch": 0.5508056560341993, + "grad_norm": 6.343071937561035, + "learning_rate": 2.1043653991532934e-05, + "loss": 2.8398, + "num_input_tokens_seen": 3301000, + "step": 5025 + }, + { + "epoch": 0.5513537213635865, + "grad_norm": 8.5012788772583, + "learning_rate": 2.1001157359281605e-05, + "loss": 3.1406, + "num_input_tokens_seen": 3304064, + "step": 5030 + }, + { + "epoch": 0.5519017866929739, + "grad_norm": 5.8669819831848145, + "learning_rate": 2.095867258194984e-05, + "loss": 2.7844, + "num_input_tokens_seen": 3308616, + "step": 5035 + }, + { + "epoch": 0.5524498520223611, + "grad_norm": 6.373290061950684, + "learning_rate": 2.0916199785487488e-05, + "loss": 3.346, + "num_input_tokens_seen": 3312128, + "step": 5040 + }, + { + "epoch": 0.5529979173517483, + "grad_norm": 7.038343906402588, + "learning_rate": 2.0873739095808865e-05, + "loss": 3.1385, + "num_input_tokens_seen": 3315040, + "step": 5045 + }, + { + "epoch": 0.5535459826811356, + "grad_norm": 7.340169429779053, + "learning_rate": 2.083129063879242e-05, + "loss": 2.9194, + "num_input_tokens_seen": 3319432, + "step": 5050 + }, + { + "epoch": 0.5540940480105229, + "grad_norm": 5.199733734130859, + "learning_rate": 2.0788854540280315e-05, + "loss": 3.5487, + "num_input_tokens_seen": 3322568, + "step": 5055 + }, + { + "epoch": 0.5546421133399101, + "grad_norm": 7.935201168060303, + "learning_rate": 2.0746430926078086e-05, + "loss": 2.8886, + "num_input_tokens_seen": 3325536, + "step": 5060 + }, + { + "epoch": 0.5551901786692974, + "grad_norm": 7.43034029006958, + "learning_rate": 2.0704019921954264e-05, + "loss": 3.0405, + "num_input_tokens_seen": 3329312, + "step": 5065 + }, + { + "epoch": 0.5557382439986847, + "grad_norm": 5.411002159118652, + "learning_rate": 2.0661621653639987e-05, + "loss": 3.1599, + "num_input_tokens_seen": 3333232, + "step": 5070 + }, + { + "epoch": 0.5562863093280719, + "grad_norm": 8.897222518920898, + "learning_rate": 2.0619236246828622e-05, + "loss": 2.8413, + "num_input_tokens_seen": 3336312, + "step": 5075 + }, + { + "epoch": 0.5568343746574592, + "grad_norm": 8.512425422668457, + "learning_rate": 2.0576863827175447e-05, + "loss": 2.9528, + "num_input_tokens_seen": 3339344, + "step": 5080 + }, + { + "epoch": 0.5573824399868464, + "grad_norm": 7.003962516784668, + "learning_rate": 2.0534504520297203e-05, + "loss": 3.3579, + "num_input_tokens_seen": 3342520, + "step": 5085 + }, + { + "epoch": 0.5579305053162337, + "grad_norm": 6.14302396774292, + "learning_rate": 2.0492158451771767e-05, + "loss": 3.3721, + "num_input_tokens_seen": 3346272, + "step": 5090 + }, + { + "epoch": 0.558478570645621, + "grad_norm": 8.199108123779297, + "learning_rate": 2.0449825747137778e-05, + "loss": 2.9852, + "num_input_tokens_seen": 3350232, + "step": 5095 + }, + { + "epoch": 0.5590266359750082, + "grad_norm": 7.849426746368408, + "learning_rate": 2.0407506531894245e-05, + "loss": 3.1338, + "num_input_tokens_seen": 3353144, + "step": 5100 + }, + { + "epoch": 0.5595747013043955, + "grad_norm": 6.752470016479492, + "learning_rate": 2.0365200931500177e-05, + "loss": 2.9589, + "num_input_tokens_seen": 3356952, + "step": 5105 + }, + { + "epoch": 0.5601227666337828, + "grad_norm": 7.846312046051025, + "learning_rate": 2.0322909071374265e-05, + "loss": 3.2629, + "num_input_tokens_seen": 3360424, + "step": 5110 + }, + { + "epoch": 0.56067083196317, + "grad_norm": 6.629732131958008, + "learning_rate": 2.028063107689442e-05, + "loss": 3.2232, + "num_input_tokens_seen": 3363544, + "step": 5115 + }, + { + "epoch": 0.5612188972925573, + "grad_norm": 7.26005220413208, + "learning_rate": 2.023836707339745e-05, + "loss": 3.2771, + "num_input_tokens_seen": 3366664, + "step": 5120 + }, + { + "epoch": 0.5617669626219446, + "grad_norm": 7.383485317230225, + "learning_rate": 2.0196117186178727e-05, + "loss": 2.8273, + "num_input_tokens_seen": 3369848, + "step": 5125 + }, + { + "epoch": 0.5623150279513318, + "grad_norm": 7.374210357666016, + "learning_rate": 2.015388154049173e-05, + "loss": 3.2708, + "num_input_tokens_seen": 3373208, + "step": 5130 + }, + { + "epoch": 0.562863093280719, + "grad_norm": 6.803157329559326, + "learning_rate": 2.0111660261547728e-05, + "loss": 3.1036, + "num_input_tokens_seen": 3376872, + "step": 5135 + }, + { + "epoch": 0.5634111586101064, + "grad_norm": 6.192258358001709, + "learning_rate": 2.006945347451541e-05, + "loss": 3.0572, + "num_input_tokens_seen": 3382136, + "step": 5140 + }, + { + "epoch": 0.5639592239394936, + "grad_norm": 9.468875885009766, + "learning_rate": 2.00272613045205e-05, + "loss": 3.2346, + "num_input_tokens_seen": 3385456, + "step": 5145 + }, + { + "epoch": 0.5645072892688808, + "grad_norm": 6.274002552032471, + "learning_rate": 1.9985083876645368e-05, + "loss": 3.1731, + "num_input_tokens_seen": 3388976, + "step": 5150 + }, + { + "epoch": 0.5650553545982682, + "grad_norm": 5.550570487976074, + "learning_rate": 1.994292131592872e-05, + "loss": 3.2257, + "num_input_tokens_seen": 3392736, + "step": 5155 + }, + { + "epoch": 0.5656034199276554, + "grad_norm": 8.218210220336914, + "learning_rate": 1.990077374736515e-05, + "loss": 3.0855, + "num_input_tokens_seen": 3396128, + "step": 5160 + }, + { + "epoch": 0.5661514852570426, + "grad_norm": 7.721156597137451, + "learning_rate": 1.9858641295904813e-05, + "loss": 2.9721, + "num_input_tokens_seen": 3399376, + "step": 5165 + }, + { + "epoch": 0.56669955058643, + "grad_norm": 6.2414231300354, + "learning_rate": 1.981652408645307e-05, + "loss": 3.3822, + "num_input_tokens_seen": 3401928, + "step": 5170 + }, + { + "epoch": 0.5672476159158172, + "grad_norm": 8.496658325195312, + "learning_rate": 1.9774422243870078e-05, + "loss": 3.0474, + "num_input_tokens_seen": 3404744, + "step": 5175 + }, + { + "epoch": 0.5677956812452044, + "grad_norm": 7.224369049072266, + "learning_rate": 1.9732335892970427e-05, + "loss": 3.259, + "num_input_tokens_seen": 3407824, + "step": 5180 + }, + { + "epoch": 0.5683437465745917, + "grad_norm": 9.386946678161621, + "learning_rate": 1.969026515852281e-05, + "loss": 3.0473, + "num_input_tokens_seen": 3410608, + "step": 5185 + }, + { + "epoch": 0.568891811903979, + "grad_norm": 8.189655303955078, + "learning_rate": 1.96482101652496e-05, + "loss": 3.3926, + "num_input_tokens_seen": 3413592, + "step": 5190 + }, + { + "epoch": 0.5694398772333662, + "grad_norm": 6.405150890350342, + "learning_rate": 1.9606171037826502e-05, + "loss": 2.9921, + "num_input_tokens_seen": 3417320, + "step": 5195 + }, + { + "epoch": 0.5699879425627535, + "grad_norm": 6.89292573928833, + "learning_rate": 1.9564147900882213e-05, + "loss": 2.9261, + "num_input_tokens_seen": 3420888, + "step": 5200 + }, + { + "epoch": 0.5705360078921408, + "grad_norm": 6.517080307006836, + "learning_rate": 1.9522140878997995e-05, + "loss": 3.3255, + "num_input_tokens_seen": 3424336, + "step": 5205 + }, + { + "epoch": 0.571084073221528, + "grad_norm": 8.910572052001953, + "learning_rate": 1.9480150096707344e-05, + "loss": 2.9723, + "num_input_tokens_seen": 3428120, + "step": 5210 + }, + { + "epoch": 0.5716321385509152, + "grad_norm": 8.455070495605469, + "learning_rate": 1.943817567849563e-05, + "loss": 3.0703, + "num_input_tokens_seen": 3430880, + "step": 5215 + }, + { + "epoch": 0.5721802038803026, + "grad_norm": 6.948888778686523, + "learning_rate": 1.9396217748799682e-05, + "loss": 2.9862, + "num_input_tokens_seen": 3435560, + "step": 5220 + }, + { + "epoch": 0.5727282692096898, + "grad_norm": 6.147201061248779, + "learning_rate": 1.935427643200746e-05, + "loss": 3.0719, + "num_input_tokens_seen": 3438352, + "step": 5225 + }, + { + "epoch": 0.573276334539077, + "grad_norm": 7.213772773742676, + "learning_rate": 1.9312351852457686e-05, + "loss": 2.9474, + "num_input_tokens_seen": 3441216, + "step": 5230 + }, + { + "epoch": 0.5738243998684643, + "grad_norm": 6.16003942489624, + "learning_rate": 1.9270444134439434e-05, + "loss": 3.0849, + "num_input_tokens_seen": 3444944, + "step": 5235 + }, + { + "epoch": 0.5743724651978516, + "grad_norm": 7.64081335067749, + "learning_rate": 1.9228553402191822e-05, + "loss": 3.0799, + "num_input_tokens_seen": 3449568, + "step": 5240 + }, + { + "epoch": 0.5749205305272388, + "grad_norm": 7.353094577789307, + "learning_rate": 1.91866797799036e-05, + "loss": 3.3501, + "num_input_tokens_seen": 3452544, + "step": 5245 + }, + { + "epoch": 0.5754685958566261, + "grad_norm": 7.696213722229004, + "learning_rate": 1.9144823391712785e-05, + "loss": 3.2286, + "num_input_tokens_seen": 3455600, + "step": 5250 + }, + { + "epoch": 0.5760166611860134, + "grad_norm": 9.90982723236084, + "learning_rate": 1.91029843617063e-05, + "loss": 3.3799, + "num_input_tokens_seen": 3458728, + "step": 5255 + }, + { + "epoch": 0.5765647265154006, + "grad_norm": 6.676484107971191, + "learning_rate": 1.9061162813919637e-05, + "loss": 3.2611, + "num_input_tokens_seen": 3461888, + "step": 5260 + }, + { + "epoch": 0.5771127918447879, + "grad_norm": 6.546321868896484, + "learning_rate": 1.9019358872336428e-05, + "loss": 2.9518, + "num_input_tokens_seen": 3464880, + "step": 5265 + }, + { + "epoch": 0.5776608571741751, + "grad_norm": 5.9848151206970215, + "learning_rate": 1.8977572660888122e-05, + "loss": 3.1144, + "num_input_tokens_seen": 3467712, + "step": 5270 + }, + { + "epoch": 0.5782089225035624, + "grad_norm": 6.030148506164551, + "learning_rate": 1.8935804303453612e-05, + "loss": 3.0001, + "num_input_tokens_seen": 3471760, + "step": 5275 + }, + { + "epoch": 0.5787569878329497, + "grad_norm": 9.319378852844238, + "learning_rate": 1.8894053923858857e-05, + "loss": 2.7935, + "num_input_tokens_seen": 3475928, + "step": 5280 + }, + { + "epoch": 0.5793050531623369, + "grad_norm": 7.607476711273193, + "learning_rate": 1.8852321645876507e-05, + "loss": 2.9319, + "num_input_tokens_seen": 3478968, + "step": 5285 + }, + { + "epoch": 0.5798531184917242, + "grad_norm": 7.065295219421387, + "learning_rate": 1.8810607593225567e-05, + "loss": 2.9655, + "num_input_tokens_seen": 3482160, + "step": 5290 + }, + { + "epoch": 0.5804011838211115, + "grad_norm": 6.528260707855225, + "learning_rate": 1.8768911889571002e-05, + "loss": 3.0625, + "num_input_tokens_seen": 3486016, + "step": 5295 + }, + { + "epoch": 0.5809492491504987, + "grad_norm": 8.56631851196289, + "learning_rate": 1.8727234658523368e-05, + "loss": 3.1642, + "num_input_tokens_seen": 3488552, + "step": 5300 + }, + { + "epoch": 0.581497314479886, + "grad_norm": 6.70935583114624, + "learning_rate": 1.8685576023638495e-05, + "loss": 2.908, + "num_input_tokens_seen": 3492192, + "step": 5305 + }, + { + "epoch": 0.5820453798092733, + "grad_norm": 9.139800071716309, + "learning_rate": 1.864393610841704e-05, + "loss": 3.0694, + "num_input_tokens_seen": 3495032, + "step": 5310 + }, + { + "epoch": 0.5825934451386605, + "grad_norm": 6.343008041381836, + "learning_rate": 1.8602315036304175e-05, + "loss": 2.939, + "num_input_tokens_seen": 3498288, + "step": 5315 + }, + { + "epoch": 0.5831415104680477, + "grad_norm": 6.961386203765869, + "learning_rate": 1.8560712930689238e-05, + "loss": 2.7722, + "num_input_tokens_seen": 3501112, + "step": 5320 + }, + { + "epoch": 0.5836895757974351, + "grad_norm": 8.582582473754883, + "learning_rate": 1.851912991490531e-05, + "loss": 3.0957, + "num_input_tokens_seen": 3504384, + "step": 5325 + }, + { + "epoch": 0.5842376411268223, + "grad_norm": 6.227029800415039, + "learning_rate": 1.8477566112228878e-05, + "loss": 3.2204, + "num_input_tokens_seen": 3508024, + "step": 5330 + }, + { + "epoch": 0.5847857064562095, + "grad_norm": 6.587297439575195, + "learning_rate": 1.8436021645879494e-05, + "loss": 3.1471, + "num_input_tokens_seen": 3511392, + "step": 5335 + }, + { + "epoch": 0.5853337717855969, + "grad_norm": 5.520746231079102, + "learning_rate": 1.839449663901936e-05, + "loss": 2.9406, + "num_input_tokens_seen": 3514568, + "step": 5340 + }, + { + "epoch": 0.5858818371149841, + "grad_norm": 5.80632209777832, + "learning_rate": 1.8352991214752983e-05, + "loss": 2.9652, + "num_input_tokens_seen": 3517672, + "step": 5345 + }, + { + "epoch": 0.5864299024443713, + "grad_norm": 4.704535484313965, + "learning_rate": 1.8311505496126868e-05, + "loss": 2.7212, + "num_input_tokens_seen": 3522392, + "step": 5350 + }, + { + "epoch": 0.5869779677737587, + "grad_norm": 12.650748252868652, + "learning_rate": 1.8270039606129045e-05, + "loss": 3.7118, + "num_input_tokens_seen": 3526336, + "step": 5355 + }, + { + "epoch": 0.5875260331031459, + "grad_norm": 9.578808784484863, + "learning_rate": 1.8228593667688772e-05, + "loss": 3.2441, + "num_input_tokens_seen": 3530656, + "step": 5360 + }, + { + "epoch": 0.5880740984325331, + "grad_norm": 6.2789812088012695, + "learning_rate": 1.818716780367618e-05, + "loss": 2.7651, + "num_input_tokens_seen": 3533184, + "step": 5365 + }, + { + "epoch": 0.5886221637619204, + "grad_norm": 8.422161102294922, + "learning_rate": 1.8145762136901874e-05, + "loss": 3.3134, + "num_input_tokens_seen": 3536976, + "step": 5370 + }, + { + "epoch": 0.5891702290913077, + "grad_norm": 7.674281597137451, + "learning_rate": 1.8104376790116572e-05, + "loss": 3.1223, + "num_input_tokens_seen": 3540496, + "step": 5375 + }, + { + "epoch": 0.5897182944206949, + "grad_norm": 7.617640495300293, + "learning_rate": 1.8063011886010777e-05, + "loss": 3.4106, + "num_input_tokens_seen": 3542952, + "step": 5380 + }, + { + "epoch": 0.5902663597500822, + "grad_norm": 6.847158908843994, + "learning_rate": 1.8021667547214367e-05, + "loss": 3.4031, + "num_input_tokens_seen": 3545952, + "step": 5385 + }, + { + "epoch": 0.5908144250794695, + "grad_norm": 7.656712532043457, + "learning_rate": 1.7980343896296243e-05, + "loss": 3.1261, + "num_input_tokens_seen": 3548960, + "step": 5390 + }, + { + "epoch": 0.5913624904088567, + "grad_norm": 6.854838848114014, + "learning_rate": 1.7939041055764015e-05, + "loss": 2.8715, + "num_input_tokens_seen": 3552888, + "step": 5395 + }, + { + "epoch": 0.591910555738244, + "grad_norm": 7.809703350067139, + "learning_rate": 1.789775914806357e-05, + "loss": 3.0002, + "num_input_tokens_seen": 3556448, + "step": 5400 + }, + { + "epoch": 0.5924586210676313, + "grad_norm": 9.405502319335938, + "learning_rate": 1.785649829557873e-05, + "loss": 3.4519, + "num_input_tokens_seen": 3560392, + "step": 5405 + }, + { + "epoch": 0.5930066863970185, + "grad_norm": 9.429394721984863, + "learning_rate": 1.781525862063092e-05, + "loss": 3.2288, + "num_input_tokens_seen": 3563680, + "step": 5410 + }, + { + "epoch": 0.5935547517264058, + "grad_norm": 6.114898204803467, + "learning_rate": 1.7774040245478767e-05, + "loss": 3.3265, + "num_input_tokens_seen": 3567200, + "step": 5415 + }, + { + "epoch": 0.594102817055793, + "grad_norm": 6.565958499908447, + "learning_rate": 1.7732843292317757e-05, + "loss": 3.0318, + "num_input_tokens_seen": 3570120, + "step": 5420 + }, + { + "epoch": 0.5946508823851803, + "grad_norm": 7.470787048339844, + "learning_rate": 1.7691667883279877e-05, + "loss": 2.9758, + "num_input_tokens_seen": 3573704, + "step": 5425 + }, + { + "epoch": 0.5951989477145676, + "grad_norm": 6.305603504180908, + "learning_rate": 1.7650514140433226e-05, + "loss": 2.8946, + "num_input_tokens_seen": 3577472, + "step": 5430 + }, + { + "epoch": 0.5957470130439548, + "grad_norm": 7.486173629760742, + "learning_rate": 1.760938218578168e-05, + "loss": 3.0453, + "num_input_tokens_seen": 3579928, + "step": 5435 + }, + { + "epoch": 0.5962950783733421, + "grad_norm": 5.27332067489624, + "learning_rate": 1.7568272141264542e-05, + "loss": 3.0027, + "num_input_tokens_seen": 3582744, + "step": 5440 + }, + { + "epoch": 0.5968431437027294, + "grad_norm": 5.261857986450195, + "learning_rate": 1.752718412875613e-05, + "loss": 3.373, + "num_input_tokens_seen": 3586344, + "step": 5445 + }, + { + "epoch": 0.5973912090321166, + "grad_norm": 7.151644706726074, + "learning_rate": 1.748611827006545e-05, + "loss": 3.0059, + "num_input_tokens_seen": 3590696, + "step": 5450 + }, + { + "epoch": 0.5979392743615038, + "grad_norm": 6.867771148681641, + "learning_rate": 1.7445074686935865e-05, + "loss": 2.9594, + "num_input_tokens_seen": 3593960, + "step": 5455 + }, + { + "epoch": 0.5984873396908912, + "grad_norm": 10.243605613708496, + "learning_rate": 1.740405350104466e-05, + "loss": 3.1614, + "num_input_tokens_seen": 3597248, + "step": 5460 + }, + { + "epoch": 0.5990354050202784, + "grad_norm": 7.2442827224731445, + "learning_rate": 1.736305483400273e-05, + "loss": 3.444, + "num_input_tokens_seen": 3600048, + "step": 5465 + }, + { + "epoch": 0.5995834703496656, + "grad_norm": 8.634395599365234, + "learning_rate": 1.7322078807354232e-05, + "loss": 3.6502, + "num_input_tokens_seen": 3603160, + "step": 5470 + }, + { + "epoch": 0.600131535679053, + "grad_norm": 7.339416027069092, + "learning_rate": 1.728112554257618e-05, + "loss": 2.9444, + "num_input_tokens_seen": 3606976, + "step": 5475 + }, + { + "epoch": 0.6006796010084402, + "grad_norm": 6.438117027282715, + "learning_rate": 1.7240195161078112e-05, + "loss": 2.7825, + "num_input_tokens_seen": 3610368, + "step": 5480 + }, + { + "epoch": 0.6012276663378274, + "grad_norm": 8.13581657409668, + "learning_rate": 1.7199287784201752e-05, + "loss": 3.1469, + "num_input_tokens_seen": 3613240, + "step": 5485 + }, + { + "epoch": 0.6017757316672148, + "grad_norm": 9.25243854522705, + "learning_rate": 1.715840353322059e-05, + "loss": 3.1494, + "num_input_tokens_seen": 3616384, + "step": 5490 + }, + { + "epoch": 0.602323796996602, + "grad_norm": 6.846777439117432, + "learning_rate": 1.7117542529339564e-05, + "loss": 3.0651, + "num_input_tokens_seen": 3620600, + "step": 5495 + }, + { + "epoch": 0.6028718623259892, + "grad_norm": 9.576505661010742, + "learning_rate": 1.7076704893694725e-05, + "loss": 3.2062, + "num_input_tokens_seen": 3624184, + "step": 5500 + }, + { + "epoch": 0.6034199276553766, + "grad_norm": 5.831842422485352, + "learning_rate": 1.7035890747352812e-05, + "loss": 2.9302, + "num_input_tokens_seen": 3628160, + "step": 5505 + }, + { + "epoch": 0.6039679929847638, + "grad_norm": 6.526121139526367, + "learning_rate": 1.699510021131093e-05, + "loss": 3.0619, + "num_input_tokens_seen": 3632144, + "step": 5510 + }, + { + "epoch": 0.604516058314151, + "grad_norm": 8.087743759155273, + "learning_rate": 1.695433340649622e-05, + "loss": 3.1402, + "num_input_tokens_seen": 3635512, + "step": 5515 + }, + { + "epoch": 0.6050641236435383, + "grad_norm": 4.840604305267334, + "learning_rate": 1.6913590453765436e-05, + "loss": 3.0223, + "num_input_tokens_seen": 3638824, + "step": 5520 + }, + { + "epoch": 0.6056121889729256, + "grad_norm": 7.919428825378418, + "learning_rate": 1.687287147390463e-05, + "loss": 2.7976, + "num_input_tokens_seen": 3642704, + "step": 5525 + }, + { + "epoch": 0.6061602543023128, + "grad_norm": 5.97782039642334, + "learning_rate": 1.6832176587628784e-05, + "loss": 2.9795, + "num_input_tokens_seen": 3645432, + "step": 5530 + }, + { + "epoch": 0.6067083196317001, + "grad_norm": 7.9558539390563965, + "learning_rate": 1.6791505915581474e-05, + "loss": 3.0965, + "num_input_tokens_seen": 3647912, + "step": 5535 + }, + { + "epoch": 0.6072563849610874, + "grad_norm": 7.399658203125, + "learning_rate": 1.675085957833446e-05, + "loss": 3.0064, + "num_input_tokens_seen": 3651176, + "step": 5540 + }, + { + "epoch": 0.6078044502904746, + "grad_norm": 5.475082874298096, + "learning_rate": 1.6710237696387364e-05, + "loss": 3.0204, + "num_input_tokens_seen": 3653864, + "step": 5545 + }, + { + "epoch": 0.6083525156198619, + "grad_norm": 7.328055381774902, + "learning_rate": 1.666964039016734e-05, + "loss": 3.4209, + "num_input_tokens_seen": 3656896, + "step": 5550 + }, + { + "epoch": 0.6089005809492491, + "grad_norm": 6.844607353210449, + "learning_rate": 1.6629067780028643e-05, + "loss": 2.8587, + "num_input_tokens_seen": 3660032, + "step": 5555 + }, + { + "epoch": 0.6094486462786364, + "grad_norm": 8.957280158996582, + "learning_rate": 1.6588519986252334e-05, + "loss": 3.3932, + "num_input_tokens_seen": 3662592, + "step": 5560 + }, + { + "epoch": 0.6099967116080237, + "grad_norm": 6.236993789672852, + "learning_rate": 1.6547997129045907e-05, + "loss": 2.8217, + "num_input_tokens_seen": 3665480, + "step": 5565 + }, + { + "epoch": 0.6105447769374109, + "grad_norm": 6.7575201988220215, + "learning_rate": 1.6507499328542926e-05, + "loss": 3.1285, + "num_input_tokens_seen": 3668296, + "step": 5570 + }, + { + "epoch": 0.6110928422667982, + "grad_norm": 6.297115802764893, + "learning_rate": 1.6467026704802652e-05, + "loss": 3.0519, + "num_input_tokens_seen": 3671088, + "step": 5575 + }, + { + "epoch": 0.6116409075961855, + "grad_norm": 5.6386003494262695, + "learning_rate": 1.6426579377809755e-05, + "loss": 3.0005, + "num_input_tokens_seen": 3674856, + "step": 5580 + }, + { + "epoch": 0.6121889729255727, + "grad_norm": 5.507198333740234, + "learning_rate": 1.6386157467473867e-05, + "loss": 3.0995, + "num_input_tokens_seen": 3677256, + "step": 5585 + }, + { + "epoch": 0.61273703825496, + "grad_norm": 6.467530250549316, + "learning_rate": 1.6345761093629276e-05, + "loss": 3.1279, + "num_input_tokens_seen": 3680248, + "step": 5590 + }, + { + "epoch": 0.6132851035843473, + "grad_norm": 6.12019681930542, + "learning_rate": 1.630539037603459e-05, + "loss": 3.0768, + "num_input_tokens_seen": 3683464, + "step": 5595 + }, + { + "epoch": 0.6138331689137345, + "grad_norm": 6.198227882385254, + "learning_rate": 1.626504543437234e-05, + "loss": 3.1144, + "num_input_tokens_seen": 3686448, + "step": 5600 + }, + { + "epoch": 0.6143812342431217, + "grad_norm": 8.729185104370117, + "learning_rate": 1.6224726388248622e-05, + "loss": 3.2992, + "num_input_tokens_seen": 3690360, + "step": 5605 + }, + { + "epoch": 0.6149292995725091, + "grad_norm": 8.366303443908691, + "learning_rate": 1.618443335719281e-05, + "loss": 3.1796, + "num_input_tokens_seen": 3693344, + "step": 5610 + }, + { + "epoch": 0.6154773649018963, + "grad_norm": 5.997150897979736, + "learning_rate": 1.614416646065711e-05, + "loss": 3.0782, + "num_input_tokens_seen": 3696488, + "step": 5615 + }, + { + "epoch": 0.6160254302312835, + "grad_norm": 6.210281848907471, + "learning_rate": 1.6103925818016257e-05, + "loss": 3.0592, + "num_input_tokens_seen": 3700080, + "step": 5620 + }, + { + "epoch": 0.6165734955606709, + "grad_norm": 10.414953231811523, + "learning_rate": 1.606371154856719e-05, + "loss": 2.9467, + "num_input_tokens_seen": 3703264, + "step": 5625 + }, + { + "epoch": 0.6171215608900581, + "grad_norm": 6.666655540466309, + "learning_rate": 1.6023523771528623e-05, + "loss": 3.3406, + "num_input_tokens_seen": 3706232, + "step": 5630 + }, + { + "epoch": 0.6176696262194453, + "grad_norm": 6.776188373565674, + "learning_rate": 1.5983362606040733e-05, + "loss": 2.9584, + "num_input_tokens_seen": 3709728, + "step": 5635 + }, + { + "epoch": 0.6182176915488327, + "grad_norm": 6.977499008178711, + "learning_rate": 1.5943228171164837e-05, + "loss": 3.607, + "num_input_tokens_seen": 3713824, + "step": 5640 + }, + { + "epoch": 0.6187657568782199, + "grad_norm": 6.040121555328369, + "learning_rate": 1.5903120585882974e-05, + "loss": 3.4444, + "num_input_tokens_seen": 3718048, + "step": 5645 + }, + { + "epoch": 0.6193138222076071, + "grad_norm": 7.120656967163086, + "learning_rate": 1.5863039969097592e-05, + "loss": 3.3153, + "num_input_tokens_seen": 3720360, + "step": 5650 + }, + { + "epoch": 0.6198618875369944, + "grad_norm": 10.212481498718262, + "learning_rate": 1.5822986439631207e-05, + "loss": 3.0222, + "num_input_tokens_seen": 3723136, + "step": 5655 + }, + { + "epoch": 0.6204099528663817, + "grad_norm": 6.770248889923096, + "learning_rate": 1.5782960116226007e-05, + "loss": 2.9785, + "num_input_tokens_seen": 3726064, + "step": 5660 + }, + { + "epoch": 0.6209580181957689, + "grad_norm": 5.595423221588135, + "learning_rate": 1.574296111754353e-05, + "loss": 3.03, + "num_input_tokens_seen": 3729800, + "step": 5665 + }, + { + "epoch": 0.6215060835251562, + "grad_norm": 6.7276225090026855, + "learning_rate": 1.5702989562164337e-05, + "loss": 3.2465, + "num_input_tokens_seen": 3733608, + "step": 5670 + }, + { + "epoch": 0.6220541488545435, + "grad_norm": 7.501856327056885, + "learning_rate": 1.5663045568587592e-05, + "loss": 2.8702, + "num_input_tokens_seen": 3736928, + "step": 5675 + }, + { + "epoch": 0.6226022141839307, + "grad_norm": 4.790249824523926, + "learning_rate": 1.562312925523076e-05, + "loss": 3.0023, + "num_input_tokens_seen": 3740256, + "step": 5680 + }, + { + "epoch": 0.623150279513318, + "grad_norm": 6.182326316833496, + "learning_rate": 1.5583240740429266e-05, + "loss": 2.9844, + "num_input_tokens_seen": 3743504, + "step": 5685 + }, + { + "epoch": 0.6236983448427053, + "grad_norm": 8.316134452819824, + "learning_rate": 1.5543380142436108e-05, + "loss": 3.1194, + "num_input_tokens_seen": 3746976, + "step": 5690 + }, + { + "epoch": 0.6242464101720925, + "grad_norm": 4.825036525726318, + "learning_rate": 1.5503547579421507e-05, + "loss": 2.9029, + "num_input_tokens_seen": 3749736, + "step": 5695 + }, + { + "epoch": 0.6247944755014798, + "grad_norm": 5.379034996032715, + "learning_rate": 1.5463743169472604e-05, + "loss": 2.813, + "num_input_tokens_seen": 3754312, + "step": 5700 + }, + { + "epoch": 0.625342540830867, + "grad_norm": 7.649238586425781, + "learning_rate": 1.5423967030593054e-05, + "loss": 2.9726, + "num_input_tokens_seen": 3757320, + "step": 5705 + }, + { + "epoch": 0.6258906061602543, + "grad_norm": 8.456625938415527, + "learning_rate": 1.5384219280702707e-05, + "loss": 2.9852, + "num_input_tokens_seen": 3761320, + "step": 5710 + }, + { + "epoch": 0.6264386714896416, + "grad_norm": 5.238711833953857, + "learning_rate": 1.534450003763726e-05, + "loss": 2.8722, + "num_input_tokens_seen": 3764536, + "step": 5715 + }, + { + "epoch": 0.6269867368190288, + "grad_norm": 7.77496337890625, + "learning_rate": 1.5304809419147885e-05, + "loss": 3.0119, + "num_input_tokens_seen": 3766832, + "step": 5720 + }, + { + "epoch": 0.6275348021484161, + "grad_norm": 6.092039108276367, + "learning_rate": 1.526514754290089e-05, + "loss": 3.1644, + "num_input_tokens_seen": 3770960, + "step": 5725 + }, + { + "epoch": 0.6280828674778034, + "grad_norm": 8.289813995361328, + "learning_rate": 1.5225514526477408e-05, + "loss": 3.0392, + "num_input_tokens_seen": 3774184, + "step": 5730 + }, + { + "epoch": 0.6286309328071906, + "grad_norm": 7.361676216125488, + "learning_rate": 1.5185910487372973e-05, + "loss": 2.9171, + "num_input_tokens_seen": 3778784, + "step": 5735 + }, + { + "epoch": 0.6291789981365778, + "grad_norm": 6.253126621246338, + "learning_rate": 1.514633554299723e-05, + "loss": 2.9294, + "num_input_tokens_seen": 3781568, + "step": 5740 + }, + { + "epoch": 0.6297270634659652, + "grad_norm": 10.453216552734375, + "learning_rate": 1.5106789810673578e-05, + "loss": 3.2064, + "num_input_tokens_seen": 3784152, + "step": 5745 + }, + { + "epoch": 0.6302751287953524, + "grad_norm": 7.798788547515869, + "learning_rate": 1.506727340763881e-05, + "loss": 2.9679, + "num_input_tokens_seen": 3786864, + "step": 5750 + }, + { + "epoch": 0.6308231941247396, + "grad_norm": 7.438601493835449, + "learning_rate": 1.5027786451042758e-05, + "loss": 2.9835, + "num_input_tokens_seen": 3790360, + "step": 5755 + }, + { + "epoch": 0.631371259454127, + "grad_norm": 8.202717781066895, + "learning_rate": 1.498832905794797e-05, + "loss": 3.1209, + "num_input_tokens_seen": 3793160, + "step": 5760 + }, + { + "epoch": 0.6319193247835142, + "grad_norm": 7.448530673980713, + "learning_rate": 1.4948901345329352e-05, + "loss": 3.1779, + "num_input_tokens_seen": 3797568, + "step": 5765 + }, + { + "epoch": 0.6324673901129014, + "grad_norm": 5.029766082763672, + "learning_rate": 1.4909503430073796e-05, + "loss": 2.8519, + "num_input_tokens_seen": 3801096, + "step": 5770 + }, + { + "epoch": 0.6330154554422888, + "grad_norm": 5.234902858734131, + "learning_rate": 1.48701354289799e-05, + "loss": 3.1461, + "num_input_tokens_seen": 3806256, + "step": 5775 + }, + { + "epoch": 0.633563520771676, + "grad_norm": 8.089512825012207, + "learning_rate": 1.4830797458757544e-05, + "loss": 3.12, + "num_input_tokens_seen": 3808880, + "step": 5780 + }, + { + "epoch": 0.6341115861010632, + "grad_norm": 5.7707839012146, + "learning_rate": 1.4791489636027583e-05, + "loss": 2.7087, + "num_input_tokens_seen": 3813584, + "step": 5785 + }, + { + "epoch": 0.6346596514304506, + "grad_norm": 6.020088195800781, + "learning_rate": 1.475221207732151e-05, + "loss": 2.9224, + "num_input_tokens_seen": 3816848, + "step": 5790 + }, + { + "epoch": 0.6352077167598378, + "grad_norm": 6.976149082183838, + "learning_rate": 1.4712964899081093e-05, + "loss": 3.0359, + "num_input_tokens_seen": 3820368, + "step": 5795 + }, + { + "epoch": 0.635755782089225, + "grad_norm": 7.066904544830322, + "learning_rate": 1.4673748217658026e-05, + "loss": 3.0753, + "num_input_tokens_seen": 3823064, + "step": 5800 + }, + { + "epoch": 0.6363038474186123, + "grad_norm": 5.929400444030762, + "learning_rate": 1.4634562149313607e-05, + "loss": 3.1222, + "num_input_tokens_seen": 3826048, + "step": 5805 + }, + { + "epoch": 0.6368519127479996, + "grad_norm": 6.900379657745361, + "learning_rate": 1.459540681021836e-05, + "loss": 3.4275, + "num_input_tokens_seen": 3829584, + "step": 5810 + }, + { + "epoch": 0.6373999780773868, + "grad_norm": 6.451569080352783, + "learning_rate": 1.4556282316451733e-05, + "loss": 3.0381, + "num_input_tokens_seen": 3832848, + "step": 5815 + }, + { + "epoch": 0.6379480434067741, + "grad_norm": 6.459670066833496, + "learning_rate": 1.4517188784001712e-05, + "loss": 2.9231, + "num_input_tokens_seen": 3835392, + "step": 5820 + }, + { + "epoch": 0.6384961087361614, + "grad_norm": 9.6491117477417, + "learning_rate": 1.4478126328764496e-05, + "loss": 3.1121, + "num_input_tokens_seen": 3839016, + "step": 5825 + }, + { + "epoch": 0.6390441740655486, + "grad_norm": 6.9248552322387695, + "learning_rate": 1.4439095066544154e-05, + "loss": 3.0439, + "num_input_tokens_seen": 3841424, + "step": 5830 + }, + { + "epoch": 0.6395922393949359, + "grad_norm": 8.927162170410156, + "learning_rate": 1.44000951130523e-05, + "loss": 2.9511, + "num_input_tokens_seen": 3843624, + "step": 5835 + }, + { + "epoch": 0.6401403047243232, + "grad_norm": 7.547786712646484, + "learning_rate": 1.4361126583907708e-05, + "loss": 3.2556, + "num_input_tokens_seen": 3846024, + "step": 5840 + }, + { + "epoch": 0.6406883700537104, + "grad_norm": 9.325125694274902, + "learning_rate": 1.432218959463599e-05, + "loss": 3.2518, + "num_input_tokens_seen": 3849176, + "step": 5845 + }, + { + "epoch": 0.6412364353830977, + "grad_norm": 7.831711292266846, + "learning_rate": 1.4283284260669282e-05, + "loss": 3.3252, + "num_input_tokens_seen": 3851496, + "step": 5850 + }, + { + "epoch": 0.6417845007124849, + "grad_norm": 5.674088001251221, + "learning_rate": 1.4244410697345845e-05, + "loss": 3.1402, + "num_input_tokens_seen": 3854384, + "step": 5855 + }, + { + "epoch": 0.6423325660418722, + "grad_norm": 5.759450912475586, + "learning_rate": 1.4205569019909759e-05, + "loss": 3.2573, + "num_input_tokens_seen": 3857336, + "step": 5860 + }, + { + "epoch": 0.6428806313712595, + "grad_norm": 6.425468921661377, + "learning_rate": 1.4166759343510599e-05, + "loss": 2.994, + "num_input_tokens_seen": 3860008, + "step": 5865 + }, + { + "epoch": 0.6434286967006467, + "grad_norm": 8.979571342468262, + "learning_rate": 1.4127981783203049e-05, + "loss": 2.8518, + "num_input_tokens_seen": 3863232, + "step": 5870 + }, + { + "epoch": 0.643976762030034, + "grad_norm": 7.848270416259766, + "learning_rate": 1.4089236453946563e-05, + "loss": 3.312, + "num_input_tokens_seen": 3867768, + "step": 5875 + }, + { + "epoch": 0.6445248273594213, + "grad_norm": 6.893942832946777, + "learning_rate": 1.4050523470605099e-05, + "loss": 3.0278, + "num_input_tokens_seen": 3870384, + "step": 5880 + }, + { + "epoch": 0.6450728926888085, + "grad_norm": 6.547880172729492, + "learning_rate": 1.4011842947946674e-05, + "loss": 2.7762, + "num_input_tokens_seen": 3873064, + "step": 5885 + }, + { + "epoch": 0.6456209580181957, + "grad_norm": 8.624503135681152, + "learning_rate": 1.397319500064308e-05, + "loss": 2.8362, + "num_input_tokens_seen": 3876656, + "step": 5890 + }, + { + "epoch": 0.6461690233475831, + "grad_norm": 7.134870529174805, + "learning_rate": 1.3934579743269561e-05, + "loss": 2.6202, + "num_input_tokens_seen": 3880296, + "step": 5895 + }, + { + "epoch": 0.6467170886769703, + "grad_norm": 7.61886739730835, + "learning_rate": 1.389599729030443e-05, + "loss": 2.9104, + "num_input_tokens_seen": 3883280, + "step": 5900 + }, + { + "epoch": 0.6472651540063575, + "grad_norm": 6.761881351470947, + "learning_rate": 1.3857447756128744e-05, + "loss": 2.9658, + "num_input_tokens_seen": 3885848, + "step": 5905 + }, + { + "epoch": 0.6478132193357449, + "grad_norm": 9.020877838134766, + "learning_rate": 1.381893125502598e-05, + "loss": 3.1887, + "num_input_tokens_seen": 3889168, + "step": 5910 + }, + { + "epoch": 0.6483612846651321, + "grad_norm": 7.6226091384887695, + "learning_rate": 1.3780447901181681e-05, + "loss": 3.2913, + "num_input_tokens_seen": 3892368, + "step": 5915 + }, + { + "epoch": 0.6489093499945193, + "grad_norm": 6.327563285827637, + "learning_rate": 1.374199780868311e-05, + "loss": 2.868, + "num_input_tokens_seen": 3895192, + "step": 5920 + }, + { + "epoch": 0.6494574153239067, + "grad_norm": 7.200982093811035, + "learning_rate": 1.3703581091518964e-05, + "loss": 2.9841, + "num_input_tokens_seen": 3899104, + "step": 5925 + }, + { + "epoch": 0.6500054806532939, + "grad_norm": 7.297597885131836, + "learning_rate": 1.3665197863578954e-05, + "loss": 3.1225, + "num_input_tokens_seen": 3901696, + "step": 5930 + }, + { + "epoch": 0.6505535459826811, + "grad_norm": 6.203746318817139, + "learning_rate": 1.3626848238653516e-05, + "loss": 3.082, + "num_input_tokens_seen": 3905192, + "step": 5935 + }, + { + "epoch": 0.6511016113120685, + "grad_norm": 7.677253246307373, + "learning_rate": 1.358853233043349e-05, + "loss": 3.2795, + "num_input_tokens_seen": 3908456, + "step": 5940 + }, + { + "epoch": 0.6516496766414557, + "grad_norm": 6.703474044799805, + "learning_rate": 1.3550250252509744e-05, + "loss": 3.123, + "num_input_tokens_seen": 3910504, + "step": 5945 + }, + { + "epoch": 0.6521977419708429, + "grad_norm": 7.855628967285156, + "learning_rate": 1.3512002118372835e-05, + "loss": 2.8393, + "num_input_tokens_seen": 3913032, + "step": 5950 + }, + { + "epoch": 0.6527458073002302, + "grad_norm": 7.922531604766846, + "learning_rate": 1.3473788041412732e-05, + "loss": 2.7007, + "num_input_tokens_seen": 3916392, + "step": 5955 + }, + { + "epoch": 0.6532938726296175, + "grad_norm": 10.957340240478516, + "learning_rate": 1.3435608134918412e-05, + "loss": 2.9213, + "num_input_tokens_seen": 3919248, + "step": 5960 + }, + { + "epoch": 0.6538419379590047, + "grad_norm": 5.184296607971191, + "learning_rate": 1.3397462512077535e-05, + "loss": 3.203, + "num_input_tokens_seen": 3922528, + "step": 5965 + }, + { + "epoch": 0.654390003288392, + "grad_norm": 8.037724494934082, + "learning_rate": 1.3359351285976174e-05, + "loss": 3.1737, + "num_input_tokens_seen": 3925200, + "step": 5970 + }, + { + "epoch": 0.6549380686177793, + "grad_norm": 7.275876045227051, + "learning_rate": 1.3321274569598382e-05, + "loss": 2.848, + "num_input_tokens_seen": 3928128, + "step": 5975 + }, + { + "epoch": 0.6554861339471665, + "grad_norm": 5.043073654174805, + "learning_rate": 1.3283232475825916e-05, + "loss": 2.8843, + "num_input_tokens_seen": 3931696, + "step": 5980 + }, + { + "epoch": 0.6560341992765538, + "grad_norm": 8.235861778259277, + "learning_rate": 1.3245225117437918e-05, + "loss": 3.3592, + "num_input_tokens_seen": 3934656, + "step": 5985 + }, + { + "epoch": 0.656582264605941, + "grad_norm": 7.135794162750244, + "learning_rate": 1.3207252607110521e-05, + "loss": 3.263, + "num_input_tokens_seen": 3937536, + "step": 5990 + }, + { + "epoch": 0.6571303299353283, + "grad_norm": 8.360773086547852, + "learning_rate": 1.3169315057416564e-05, + "loss": 3.1673, + "num_input_tokens_seen": 3940200, + "step": 5995 + }, + { + "epoch": 0.6576783952647156, + "grad_norm": 9.115818977355957, + "learning_rate": 1.3131412580825236e-05, + "loss": 3.1802, + "num_input_tokens_seen": 3942688, + "step": 6000 + }, + { + "epoch": 0.6582264605941028, + "grad_norm": 8.476052284240723, + "learning_rate": 1.3093545289701747e-05, + "loss": 3.1919, + "num_input_tokens_seen": 3945760, + "step": 6005 + }, + { + "epoch": 0.6587745259234901, + "grad_norm": 6.621984004974365, + "learning_rate": 1.3055713296307016e-05, + "loss": 2.8701, + "num_input_tokens_seen": 3948512, + "step": 6010 + }, + { + "epoch": 0.6593225912528773, + "grad_norm": 8.03313159942627, + "learning_rate": 1.3017916712797293e-05, + "loss": 3.3227, + "num_input_tokens_seen": 3951520, + "step": 6015 + }, + { + "epoch": 0.6598706565822646, + "grad_norm": 7.0439677238464355, + "learning_rate": 1.2980155651223867e-05, + "loss": 2.8738, + "num_input_tokens_seen": 3955392, + "step": 6020 + }, + { + "epoch": 0.6604187219116519, + "grad_norm": 7.3785529136657715, + "learning_rate": 1.2942430223532703e-05, + "loss": 3.3427, + "num_input_tokens_seen": 3959592, + "step": 6025 + }, + { + "epoch": 0.6609667872410391, + "grad_norm": 5.641672134399414, + "learning_rate": 1.2904740541564159e-05, + "loss": 3.0156, + "num_input_tokens_seen": 3963064, + "step": 6030 + }, + { + "epoch": 0.6615148525704264, + "grad_norm": 6.209802150726318, + "learning_rate": 1.286708671705259e-05, + "loss": 3.0553, + "num_input_tokens_seen": 3965552, + "step": 6035 + }, + { + "epoch": 0.6620629178998136, + "grad_norm": 6.092316627502441, + "learning_rate": 1.2829468861626052e-05, + "loss": 2.9092, + "num_input_tokens_seen": 3968480, + "step": 6040 + }, + { + "epoch": 0.6626109832292009, + "grad_norm": 10.323710441589355, + "learning_rate": 1.2791887086805993e-05, + "loss": 3.4687, + "num_input_tokens_seen": 3971464, + "step": 6045 + }, + { + "epoch": 0.6631590485585882, + "grad_norm": 6.506869792938232, + "learning_rate": 1.2754341504006872e-05, + "loss": 3.0349, + "num_input_tokens_seen": 3975640, + "step": 6050 + }, + { + "epoch": 0.6637071138879754, + "grad_norm": 6.929319381713867, + "learning_rate": 1.2716832224535847e-05, + "loss": 3.1761, + "num_input_tokens_seen": 3978928, + "step": 6055 + }, + { + "epoch": 0.6642551792173627, + "grad_norm": 6.731025218963623, + "learning_rate": 1.2679359359592488e-05, + "loss": 2.7582, + "num_input_tokens_seen": 3984016, + "step": 6060 + }, + { + "epoch": 0.66480324454675, + "grad_norm": 7.775283336639404, + "learning_rate": 1.2641923020268377e-05, + "loss": 3.222, + "num_input_tokens_seen": 3986544, + "step": 6065 + }, + { + "epoch": 0.6653513098761372, + "grad_norm": 9.189234733581543, + "learning_rate": 1.2604523317546813e-05, + "loss": 2.7329, + "num_input_tokens_seen": 3989440, + "step": 6070 + }, + { + "epoch": 0.6658993752055244, + "grad_norm": 6.482409954071045, + "learning_rate": 1.2567160362302515e-05, + "loss": 3.0355, + "num_input_tokens_seen": 3993928, + "step": 6075 + }, + { + "epoch": 0.6664474405349118, + "grad_norm": 6.9843878746032715, + "learning_rate": 1.2529834265301227e-05, + "loss": 3.1331, + "num_input_tokens_seen": 3997312, + "step": 6080 + }, + { + "epoch": 0.666995505864299, + "grad_norm": 7.9999308586120605, + "learning_rate": 1.2492545137199426e-05, + "loss": 3.2756, + "num_input_tokens_seen": 4000160, + "step": 6085 + }, + { + "epoch": 0.6675435711936862, + "grad_norm": 5.13596773147583, + "learning_rate": 1.2455293088544023e-05, + "loss": 3.382, + "num_input_tokens_seen": 4003720, + "step": 6090 + }, + { + "epoch": 0.6680916365230736, + "grad_norm": 6.42021369934082, + "learning_rate": 1.2418078229771973e-05, + "loss": 2.9692, + "num_input_tokens_seen": 4006680, + "step": 6095 + }, + { + "epoch": 0.6686397018524608, + "grad_norm": 9.268325805664062, + "learning_rate": 1.2380900671209984e-05, + "loss": 2.9399, + "num_input_tokens_seen": 4009632, + "step": 6100 + }, + { + "epoch": 0.669187767181848, + "grad_norm": 5.049006938934326, + "learning_rate": 1.2343760523074186e-05, + "loss": 3.0858, + "num_input_tokens_seen": 4012552, + "step": 6105 + }, + { + "epoch": 0.6697358325112354, + "grad_norm": 6.255411148071289, + "learning_rate": 1.2306657895469809e-05, + "loss": 3.16, + "num_input_tokens_seen": 4016240, + "step": 6110 + }, + { + "epoch": 0.6702838978406226, + "grad_norm": 10.016054153442383, + "learning_rate": 1.2269592898390833e-05, + "loss": 3.0065, + "num_input_tokens_seen": 4019680, + "step": 6115 + }, + { + "epoch": 0.6708319631700098, + "grad_norm": 7.499462604522705, + "learning_rate": 1.223256564171971e-05, + "loss": 3.3602, + "num_input_tokens_seen": 4022288, + "step": 6120 + }, + { + "epoch": 0.6713800284993972, + "grad_norm": 7.838258266448975, + "learning_rate": 1.2195576235226977e-05, + "loss": 2.7866, + "num_input_tokens_seen": 4025216, + "step": 6125 + }, + { + "epoch": 0.6719280938287844, + "grad_norm": 7.931380271911621, + "learning_rate": 1.2158624788570965e-05, + "loss": 3.4889, + "num_input_tokens_seen": 4029376, + "step": 6130 + }, + { + "epoch": 0.6724761591581716, + "grad_norm": 5.675364971160889, + "learning_rate": 1.2121711411297498e-05, + "loss": 3.3344, + "num_input_tokens_seen": 4031616, + "step": 6135 + }, + { + "epoch": 0.6730242244875589, + "grad_norm": 5.3835577964782715, + "learning_rate": 1.2084836212839507e-05, + "loss": 3.1429, + "num_input_tokens_seen": 4034840, + "step": 6140 + }, + { + "epoch": 0.6735722898169462, + "grad_norm": 7.542428016662598, + "learning_rate": 1.2047999302516737e-05, + "loss": 2.9853, + "num_input_tokens_seen": 4037792, + "step": 6145 + }, + { + "epoch": 0.6741203551463334, + "grad_norm": 7.841860771179199, + "learning_rate": 1.2011200789535464e-05, + "loss": 3.011, + "num_input_tokens_seen": 4041272, + "step": 6150 + }, + { + "epoch": 0.6746684204757207, + "grad_norm": 10.116206169128418, + "learning_rate": 1.1974440782988094e-05, + "loss": 3.1755, + "num_input_tokens_seen": 4044360, + "step": 6155 + }, + { + "epoch": 0.675216485805108, + "grad_norm": 6.566442489624023, + "learning_rate": 1.1937719391852877e-05, + "loss": 3.0532, + "num_input_tokens_seen": 4047544, + "step": 6160 + }, + { + "epoch": 0.6757645511344952, + "grad_norm": 6.767369747161865, + "learning_rate": 1.1901036724993616e-05, + "loss": 2.9114, + "num_input_tokens_seen": 4050584, + "step": 6165 + }, + { + "epoch": 0.6763126164638825, + "grad_norm": 5.782663822174072, + "learning_rate": 1.1864392891159284e-05, + "loss": 3.4902, + "num_input_tokens_seen": 4053392, + "step": 6170 + }, + { + "epoch": 0.6768606817932697, + "grad_norm": 7.807350158691406, + "learning_rate": 1.1827787998983731e-05, + "loss": 3.1896, + "num_input_tokens_seen": 4056184, + "step": 6175 + }, + { + "epoch": 0.677408747122657, + "grad_norm": 8.840995788574219, + "learning_rate": 1.1791222156985382e-05, + "loss": 3.4261, + "num_input_tokens_seen": 4060616, + "step": 6180 + }, + { + "epoch": 0.6779568124520443, + "grad_norm": 5.441840171813965, + "learning_rate": 1.1754695473566877e-05, + "loss": 2.8645, + "num_input_tokens_seen": 4065008, + "step": 6185 + }, + { + "epoch": 0.6785048777814315, + "grad_norm": 7.820642471313477, + "learning_rate": 1.1718208057014768e-05, + "loss": 3.1664, + "num_input_tokens_seen": 4068872, + "step": 6190 + }, + { + "epoch": 0.6790529431108188, + "grad_norm": 7.290872573852539, + "learning_rate": 1.1681760015499201e-05, + "loss": 3.4087, + "num_input_tokens_seen": 4071376, + "step": 6195 + }, + { + "epoch": 0.6796010084402061, + "grad_norm": 5.5174360275268555, + "learning_rate": 1.1645351457073594e-05, + "loss": 3.3074, + "num_input_tokens_seen": 4074528, + "step": 6200 + }, + { + "epoch": 0.6801490737695933, + "grad_norm": 6.114542484283447, + "learning_rate": 1.1608982489674295e-05, + "loss": 3.0535, + "num_input_tokens_seen": 4077600, + "step": 6205 + }, + { + "epoch": 0.6806971390989806, + "grad_norm": 8.515054702758789, + "learning_rate": 1.1572653221120316e-05, + "loss": 3.2291, + "num_input_tokens_seen": 4080664, + "step": 6210 + }, + { + "epoch": 0.6812452044283679, + "grad_norm": 8.11023235321045, + "learning_rate": 1.1536363759112952e-05, + "loss": 3.1448, + "num_input_tokens_seen": 4083256, + "step": 6215 + }, + { + "epoch": 0.6817932697577551, + "grad_norm": 7.834672927856445, + "learning_rate": 1.1500114211235482e-05, + "loss": 3.1213, + "num_input_tokens_seen": 4085568, + "step": 6220 + }, + { + "epoch": 0.6823413350871423, + "grad_norm": 6.758762836456299, + "learning_rate": 1.146390468495289e-05, + "loss": 3.0515, + "num_input_tokens_seen": 4088248, + "step": 6225 + }, + { + "epoch": 0.6828894004165297, + "grad_norm": 6.3487372398376465, + "learning_rate": 1.1427735287611477e-05, + "loss": 2.5775, + "num_input_tokens_seen": 4090848, + "step": 6230 + }, + { + "epoch": 0.6834374657459169, + "grad_norm": 5.81227445602417, + "learning_rate": 1.1391606126438586e-05, + "loss": 3.0297, + "num_input_tokens_seen": 4094232, + "step": 6235 + }, + { + "epoch": 0.6839855310753041, + "grad_norm": 7.857996463775635, + "learning_rate": 1.1355517308542301e-05, + "loss": 3.0582, + "num_input_tokens_seen": 4097096, + "step": 6240 + }, + { + "epoch": 0.6845335964046915, + "grad_norm": 5.819544792175293, + "learning_rate": 1.1319468940911079e-05, + "loss": 2.8814, + "num_input_tokens_seen": 4099912, + "step": 6245 + }, + { + "epoch": 0.6850816617340787, + "grad_norm": 9.14799976348877, + "learning_rate": 1.1283461130413453e-05, + "loss": 3.3229, + "num_input_tokens_seen": 4102320, + "step": 6250 + }, + { + "epoch": 0.6856297270634659, + "grad_norm": 7.087406158447266, + "learning_rate": 1.1247493983797754e-05, + "loss": 2.8581, + "num_input_tokens_seen": 4106480, + "step": 6255 + }, + { + "epoch": 0.6861777923928533, + "grad_norm": 7.298010349273682, + "learning_rate": 1.1218749616158092e-05, + "loss": 3.1186, + "num_input_tokens_seen": 4110064, + "step": 6260 + }, + { + "epoch": 0.6867258577222405, + "grad_norm": 6.6678290367126465, + "learning_rate": 1.1182855933150582e-05, + "loss": 2.971, + "num_input_tokens_seen": 4113304, + "step": 6265 + }, + { + "epoch": 0.6872739230516277, + "grad_norm": 8.044167518615723, + "learning_rate": 1.1147003212277912e-05, + "loss": 3.3036, + "num_input_tokens_seen": 4115752, + "step": 6270 + }, + { + "epoch": 0.687821988381015, + "grad_norm": 6.803138256072998, + "learning_rate": 1.1111191559828627e-05, + "loss": 2.7812, + "num_input_tokens_seen": 4119488, + "step": 6275 + }, + { + "epoch": 0.6883700537104023, + "grad_norm": 5.070322513580322, + "learning_rate": 1.1075421081969502e-05, + "loss": 3.152, + "num_input_tokens_seen": 4122168, + "step": 6280 + }, + { + "epoch": 0.6889181190397895, + "grad_norm": 6.463720321655273, + "learning_rate": 1.1039691884745252e-05, + "loss": 2.9657, + "num_input_tokens_seen": 4125704, + "step": 6285 + }, + { + "epoch": 0.6894661843691768, + "grad_norm": 9.405960083007812, + "learning_rate": 1.1004004074078223e-05, + "loss": 3.5484, + "num_input_tokens_seen": 4128608, + "step": 6290 + }, + { + "epoch": 0.6900142496985641, + "grad_norm": 6.504082679748535, + "learning_rate": 1.0968357755768051e-05, + "loss": 2.7744, + "num_input_tokens_seen": 4131416, + "step": 6295 + }, + { + "epoch": 0.6905623150279513, + "grad_norm": 7.679104804992676, + "learning_rate": 1.093275303549137e-05, + "loss": 3.1396, + "num_input_tokens_seen": 4135168, + "step": 6300 + }, + { + "epoch": 0.6911103803573386, + "grad_norm": 10.499975204467773, + "learning_rate": 1.0897190018801503e-05, + "loss": 3.4244, + "num_input_tokens_seen": 4138320, + "step": 6305 + }, + { + "epoch": 0.6916584456867259, + "grad_norm": 5.967805862426758, + "learning_rate": 1.0861668811128129e-05, + "loss": 3.0676, + "num_input_tokens_seen": 4140880, + "step": 6310 + }, + { + "epoch": 0.6922065110161131, + "grad_norm": 6.552985668182373, + "learning_rate": 1.0826189517776975e-05, + "loss": 3.0805, + "num_input_tokens_seen": 4143912, + "step": 6315 + }, + { + "epoch": 0.6927545763455004, + "grad_norm": 8.34593677520752, + "learning_rate": 1.0790752243929523e-05, + "loss": 3.2587, + "num_input_tokens_seen": 4147320, + "step": 6320 + }, + { + "epoch": 0.6933026416748876, + "grad_norm": 6.536946773529053, + "learning_rate": 1.0755357094642674e-05, + "loss": 3.0053, + "num_input_tokens_seen": 4150928, + "step": 6325 + }, + { + "epoch": 0.6938507070042749, + "grad_norm": 7.138943672180176, + "learning_rate": 1.0720004174848444e-05, + "loss": 2.9898, + "num_input_tokens_seen": 4154120, + "step": 6330 + }, + { + "epoch": 0.6943987723336622, + "grad_norm": 9.60561466217041, + "learning_rate": 1.0684693589353678e-05, + "loss": 3.4849, + "num_input_tokens_seen": 4156832, + "step": 6335 + }, + { + "epoch": 0.6949468376630494, + "grad_norm": 8.691582679748535, + "learning_rate": 1.0649425442839697e-05, + "loss": 3.1178, + "num_input_tokens_seen": 4159704, + "step": 6340 + }, + { + "epoch": 0.6954949029924367, + "grad_norm": 8.004415512084961, + "learning_rate": 1.0614199839862002e-05, + "loss": 3.0848, + "num_input_tokens_seen": 4162168, + "step": 6345 + }, + { + "epoch": 0.696042968321824, + "grad_norm": 12.674962043762207, + "learning_rate": 1.0579016884849999e-05, + "loss": 3.4026, + "num_input_tokens_seen": 4165384, + "step": 6350 + }, + { + "epoch": 0.6965910336512112, + "grad_norm": 7.9511284828186035, + "learning_rate": 1.0543876682106632e-05, + "loss": 3.0329, + "num_input_tokens_seen": 4168128, + "step": 6355 + }, + { + "epoch": 0.6971390989805984, + "grad_norm": 9.268970489501953, + "learning_rate": 1.0508779335808105e-05, + "loss": 3.1994, + "num_input_tokens_seen": 4171888, + "step": 6360 + }, + { + "epoch": 0.6976871643099858, + "grad_norm": 6.21211051940918, + "learning_rate": 1.04737249500036e-05, + "loss": 3.1242, + "num_input_tokens_seen": 4174896, + "step": 6365 + }, + { + "epoch": 0.698235229639373, + "grad_norm": 7.668500900268555, + "learning_rate": 1.04387136286149e-05, + "loss": 3.0467, + "num_input_tokens_seen": 4178504, + "step": 6370 + }, + { + "epoch": 0.6987832949687602, + "grad_norm": 5.02815580368042, + "learning_rate": 1.040374547543613e-05, + "loss": 2.9279, + "num_input_tokens_seen": 4182040, + "step": 6375 + }, + { + "epoch": 0.6993313602981476, + "grad_norm": 5.940211772918701, + "learning_rate": 1.0368820594133466e-05, + "loss": 2.968, + "num_input_tokens_seen": 4185880, + "step": 6380 + }, + { + "epoch": 0.6998794256275348, + "grad_norm": 6.044907093048096, + "learning_rate": 1.0333939088244771e-05, + "loss": 3.3093, + "num_input_tokens_seen": 4189000, + "step": 6385 + }, + { + "epoch": 0.700427490956922, + "grad_norm": 6.427306652069092, + "learning_rate": 1.0299101061179317e-05, + "loss": 3.2814, + "num_input_tokens_seen": 4191736, + "step": 6390 + }, + { + "epoch": 0.7009755562863094, + "grad_norm": 7.336453914642334, + "learning_rate": 1.0264306616217507e-05, + "loss": 2.8437, + "num_input_tokens_seen": 4194360, + "step": 6395 + }, + { + "epoch": 0.7015236216156966, + "grad_norm": 7.562320709228516, + "learning_rate": 1.0229555856510512e-05, + "loss": 2.828, + "num_input_tokens_seen": 4197920, + "step": 6400 + }, + { + "epoch": 0.7020716869450838, + "grad_norm": 7.142042636871338, + "learning_rate": 1.0194848885080011e-05, + "loss": 3.1228, + "num_input_tokens_seen": 4201984, + "step": 6405 + }, + { + "epoch": 0.7026197522744712, + "grad_norm": 6.18742036819458, + "learning_rate": 1.0160185804817859e-05, + "loss": 2.8393, + "num_input_tokens_seen": 4205328, + "step": 6410 + }, + { + "epoch": 0.7031678176038584, + "grad_norm": 7.195977687835693, + "learning_rate": 1.0125566718485788e-05, + "loss": 2.9868, + "num_input_tokens_seen": 4208312, + "step": 6415 + }, + { + "epoch": 0.7037158829332456, + "grad_norm": 10.329099655151367, + "learning_rate": 1.0090991728715132e-05, + "loss": 2.829, + "num_input_tokens_seen": 4211312, + "step": 6420 + }, + { + "epoch": 0.7042639482626329, + "grad_norm": 6.6712236404418945, + "learning_rate": 1.0056460938006473e-05, + "loss": 2.9549, + "num_input_tokens_seen": 4213800, + "step": 6425 + }, + { + "epoch": 0.7048120135920202, + "grad_norm": 4.803092002868652, + "learning_rate": 1.0021974448729365e-05, + "loss": 3.3355, + "num_input_tokens_seen": 4217200, + "step": 6430 + }, + { + "epoch": 0.7053600789214074, + "grad_norm": 6.527164459228516, + "learning_rate": 9.987532363122018e-06, + "loss": 2.9652, + "num_input_tokens_seen": 4220768, + "step": 6435 + }, + { + "epoch": 0.7059081442507947, + "grad_norm": 7.362782955169678, + "learning_rate": 9.953134783291036e-06, + "loss": 2.8684, + "num_input_tokens_seen": 4224224, + "step": 6440 + }, + { + "epoch": 0.706456209580182, + "grad_norm": 9.984780311584473, + "learning_rate": 9.918781811211045e-06, + "loss": 2.8968, + "num_input_tokens_seen": 4229272, + "step": 6445 + }, + { + "epoch": 0.7070042749095692, + "grad_norm": 6.219121932983398, + "learning_rate": 9.884473548724441e-06, + "loss": 3.1832, + "num_input_tokens_seen": 4232096, + "step": 6450 + }, + { + "epoch": 0.7075523402389565, + "grad_norm": 6.208556652069092, + "learning_rate": 9.850210097541085e-06, + "loss": 3.108, + "num_input_tokens_seen": 4235496, + "step": 6455 + }, + { + "epoch": 0.7081004055683437, + "grad_norm": 7.7808003425598145, + "learning_rate": 9.81599155923798e-06, + "loss": 3.0694, + "num_input_tokens_seen": 4238320, + "step": 6460 + }, + { + "epoch": 0.708648470897731, + "grad_norm": 8.587124824523926, + "learning_rate": 9.781818035258972e-06, + "loss": 3.1773, + "num_input_tokens_seen": 4240792, + "step": 6465 + }, + { + "epoch": 0.7091965362271183, + "grad_norm": 11.057994842529297, + "learning_rate": 9.747689626914483e-06, + "loss": 3.4154, + "num_input_tokens_seen": 4244904, + "step": 6470 + }, + { + "epoch": 0.7097446015565055, + "grad_norm": 6.430279731750488, + "learning_rate": 9.713606435381165e-06, + "loss": 3.1772, + "num_input_tokens_seen": 4247632, + "step": 6475 + }, + { + "epoch": 0.7102926668858928, + "grad_norm": 7.846237659454346, + "learning_rate": 9.679568561701615e-06, + "loss": 2.9962, + "num_input_tokens_seen": 4250768, + "step": 6480 + }, + { + "epoch": 0.7108407322152801, + "grad_norm": 8.467151641845703, + "learning_rate": 9.645576106784118e-06, + "loss": 2.8687, + "num_input_tokens_seen": 4253904, + "step": 6485 + }, + { + "epoch": 0.7113887975446673, + "grad_norm": 16.991235733032227, + "learning_rate": 9.611629171402273e-06, + "loss": 3.1696, + "num_input_tokens_seen": 4256768, + "step": 6490 + }, + { + "epoch": 0.7119368628740546, + "grad_norm": 7.091182231903076, + "learning_rate": 9.577727856194746e-06, + "loss": 2.7567, + "num_input_tokens_seen": 4260192, + "step": 6495 + }, + { + "epoch": 0.7124849282034419, + "grad_norm": 7.963916778564453, + "learning_rate": 9.543872261664952e-06, + "loss": 2.9586, + "num_input_tokens_seen": 4263560, + "step": 6500 + }, + { + "epoch": 0.7130329935328291, + "grad_norm": 6.632905006408691, + "learning_rate": 9.510062488180781e-06, + "loss": 2.8122, + "num_input_tokens_seen": 4266624, + "step": 6505 + }, + { + "epoch": 0.7135810588622163, + "grad_norm": 8.157563209533691, + "learning_rate": 9.476298635974265e-06, + "loss": 2.9458, + "num_input_tokens_seen": 4269488, + "step": 6510 + }, + { + "epoch": 0.7141291241916037, + "grad_norm": 7.982326507568359, + "learning_rate": 9.442580805141305e-06, + "loss": 3.172, + "num_input_tokens_seen": 4272592, + "step": 6515 + }, + { + "epoch": 0.7146771895209909, + "grad_norm": 5.6351423263549805, + "learning_rate": 9.408909095641363e-06, + "loss": 3.139, + "num_input_tokens_seen": 4275552, + "step": 6520 + }, + { + "epoch": 0.7152252548503781, + "grad_norm": 7.883710861206055, + "learning_rate": 9.375283607297175e-06, + "loss": 3.3458, + "num_input_tokens_seen": 4277912, + "step": 6525 + }, + { + "epoch": 0.7157733201797655, + "grad_norm": 5.036897659301758, + "learning_rate": 9.341704439794441e-06, + "loss": 2.9759, + "num_input_tokens_seen": 4280520, + "step": 6530 + }, + { + "epoch": 0.7163213855091527, + "grad_norm": 6.539727687835693, + "learning_rate": 9.308171692681565e-06, + "loss": 2.7201, + "num_input_tokens_seen": 4284248, + "step": 6535 + }, + { + "epoch": 0.7168694508385399, + "grad_norm": 7.108365058898926, + "learning_rate": 9.274685465369303e-06, + "loss": 3.1882, + "num_input_tokens_seen": 4288664, + "step": 6540 + }, + { + "epoch": 0.7174175161679273, + "grad_norm": 5.567689418792725, + "learning_rate": 9.241245857130507e-06, + "loss": 3.3889, + "num_input_tokens_seen": 4292104, + "step": 6545 + }, + { + "epoch": 0.7179655814973145, + "grad_norm": 7.539772033691406, + "learning_rate": 9.207852967099841e-06, + "loss": 3.2677, + "num_input_tokens_seen": 4296664, + "step": 6550 + }, + { + "epoch": 0.7185136468267017, + "grad_norm": 11.019807815551758, + "learning_rate": 9.174506894273448e-06, + "loss": 3.2587, + "num_input_tokens_seen": 4298936, + "step": 6555 + }, + { + "epoch": 0.719061712156089, + "grad_norm": 4.87662935256958, + "learning_rate": 9.141207737508677e-06, + "loss": 3.4056, + "num_input_tokens_seen": 4301872, + "step": 6560 + }, + { + "epoch": 0.7196097774854763, + "grad_norm": 7.396250247955322, + "learning_rate": 9.107955595523812e-06, + "loss": 3.0741, + "num_input_tokens_seen": 4305096, + "step": 6565 + }, + { + "epoch": 0.7201578428148635, + "grad_norm": 9.769874572753906, + "learning_rate": 9.074750566897733e-06, + "loss": 2.8083, + "num_input_tokens_seen": 4309576, + "step": 6570 + }, + { + "epoch": 0.7207059081442508, + "grad_norm": 7.023451805114746, + "learning_rate": 9.041592750069652e-06, + "loss": 3.162, + "num_input_tokens_seen": 4313728, + "step": 6575 + }, + { + "epoch": 0.7212539734736381, + "grad_norm": 7.67805814743042, + "learning_rate": 9.008482243338841e-06, + "loss": 3.1487, + "num_input_tokens_seen": 4316864, + "step": 6580 + }, + { + "epoch": 0.7218020388030253, + "grad_norm": 5.812924385070801, + "learning_rate": 8.975419144864292e-06, + "loss": 2.6071, + "num_input_tokens_seen": 4320688, + "step": 6585 + }, + { + "epoch": 0.7223501041324126, + "grad_norm": 9.005423545837402, + "learning_rate": 8.94240355266445e-06, + "loss": 3.2333, + "num_input_tokens_seen": 4323184, + "step": 6590 + }, + { + "epoch": 0.7228981694617999, + "grad_norm": 5.683709144592285, + "learning_rate": 8.909435564616944e-06, + "loss": 2.9484, + "num_input_tokens_seen": 4326304, + "step": 6595 + }, + { + "epoch": 0.7234462347911871, + "grad_norm": 9.263490676879883, + "learning_rate": 8.876515278458265e-06, + "loss": 3.2337, + "num_input_tokens_seen": 4329120, + "step": 6600 + }, + { + "epoch": 0.7239943001205744, + "grad_norm": 6.478157997131348, + "learning_rate": 8.84364279178348e-06, + "loss": 3.0925, + "num_input_tokens_seen": 4332440, + "step": 6605 + }, + { + "epoch": 0.7245423654499616, + "grad_norm": 8.741613388061523, + "learning_rate": 8.810818202045962e-06, + "loss": 3.3093, + "num_input_tokens_seen": 4335440, + "step": 6610 + }, + { + "epoch": 0.7250904307793489, + "grad_norm": 7.031724452972412, + "learning_rate": 8.77804160655708e-06, + "loss": 3.3767, + "num_input_tokens_seen": 4337912, + "step": 6615 + }, + { + "epoch": 0.7256384961087362, + "grad_norm": 8.763786315917969, + "learning_rate": 8.745313102485923e-06, + "loss": 3.201, + "num_input_tokens_seen": 4341472, + "step": 6620 + }, + { + "epoch": 0.7261865614381234, + "grad_norm": 5.877601623535156, + "learning_rate": 8.712632786859021e-06, + "loss": 2.7422, + "num_input_tokens_seen": 4345304, + "step": 6625 + }, + { + "epoch": 0.7267346267675107, + "grad_norm": 7.608758926391602, + "learning_rate": 8.68000075656003e-06, + "loss": 3.2688, + "num_input_tokens_seen": 4348264, + "step": 6630 + }, + { + "epoch": 0.727282692096898, + "grad_norm": 6.207149982452393, + "learning_rate": 8.647417108329454e-06, + "loss": 3.1522, + "num_input_tokens_seen": 4352144, + "step": 6635 + }, + { + "epoch": 0.7278307574262852, + "grad_norm": 6.543735504150391, + "learning_rate": 8.61488193876439e-06, + "loss": 2.968, + "num_input_tokens_seen": 4355840, + "step": 6640 + }, + { + "epoch": 0.7283788227556725, + "grad_norm": 7.882357597351074, + "learning_rate": 8.582395344318197e-06, + "loss": 2.8674, + "num_input_tokens_seen": 4358640, + "step": 6645 + }, + { + "epoch": 0.7289268880850598, + "grad_norm": 10.999910354614258, + "learning_rate": 8.54995742130022e-06, + "loss": 3.2327, + "num_input_tokens_seen": 4361656, + "step": 6650 + }, + { + "epoch": 0.729474953414447, + "grad_norm": 8.629473686218262, + "learning_rate": 8.517568265875541e-06, + "loss": 3.1042, + "num_input_tokens_seen": 4363968, + "step": 6655 + }, + { + "epoch": 0.7300230187438342, + "grad_norm": 8.353252410888672, + "learning_rate": 8.485227974064647e-06, + "loss": 2.7692, + "num_input_tokens_seen": 4367200, + "step": 6660 + }, + { + "epoch": 0.7305710840732216, + "grad_norm": 7.927604675292969, + "learning_rate": 8.452936641743156e-06, + "loss": 3.2321, + "num_input_tokens_seen": 4370096, + "step": 6665 + }, + { + "epoch": 0.7311191494026088, + "grad_norm": 5.507778644561768, + "learning_rate": 8.42069436464157e-06, + "loss": 3.1024, + "num_input_tokens_seen": 4374264, + "step": 6670 + }, + { + "epoch": 0.731667214731996, + "grad_norm": 6.3533172607421875, + "learning_rate": 8.38850123834494e-06, + "loss": 2.7559, + "num_input_tokens_seen": 4378824, + "step": 6675 + }, + { + "epoch": 0.7322152800613834, + "grad_norm": 6.395352840423584, + "learning_rate": 8.356357358292601e-06, + "loss": 3.243, + "num_input_tokens_seen": 4382616, + "step": 6680 + }, + { + "epoch": 0.7327633453907706, + "grad_norm": 8.324797630310059, + "learning_rate": 8.32426281977792e-06, + "loss": 3.6588, + "num_input_tokens_seen": 4385488, + "step": 6685 + }, + { + "epoch": 0.7333114107201578, + "grad_norm": 6.711746692657471, + "learning_rate": 8.292217717947962e-06, + "loss": 3.1062, + "num_input_tokens_seen": 4388592, + "step": 6690 + }, + { + "epoch": 0.7338594760495452, + "grad_norm": 11.369217872619629, + "learning_rate": 8.26022214780324e-06, + "loss": 3.0253, + "num_input_tokens_seen": 4391640, + "step": 6695 + }, + { + "epoch": 0.7344075413789324, + "grad_norm": 7.522586822509766, + "learning_rate": 8.228276204197427e-06, + "loss": 3.3273, + "num_input_tokens_seen": 4394456, + "step": 6700 + }, + { + "epoch": 0.7349556067083196, + "grad_norm": 7.1993207931518555, + "learning_rate": 8.196379981837071e-06, + "loss": 2.9679, + "num_input_tokens_seen": 4397352, + "step": 6705 + }, + { + "epoch": 0.735503672037707, + "grad_norm": 9.711231231689453, + "learning_rate": 8.164533575281316e-06, + "loss": 3.5035, + "num_input_tokens_seen": 4400744, + "step": 6710 + }, + { + "epoch": 0.7360517373670942, + "grad_norm": 8.696206092834473, + "learning_rate": 8.132737078941642e-06, + "loss": 2.8264, + "num_input_tokens_seen": 4404712, + "step": 6715 + }, + { + "epoch": 0.7365998026964814, + "grad_norm": 8.558262825012207, + "learning_rate": 8.100990587081536e-06, + "loss": 3.0127, + "num_input_tokens_seen": 4407448, + "step": 6720 + }, + { + "epoch": 0.7371478680258687, + "grad_norm": 7.874935626983643, + "learning_rate": 8.069294193816252e-06, + "loss": 2.9852, + "num_input_tokens_seen": 4410096, + "step": 6725 + }, + { + "epoch": 0.737695933355256, + "grad_norm": 10.938785552978516, + "learning_rate": 8.037647993112543e-06, + "loss": 2.8523, + "num_input_tokens_seen": 4413248, + "step": 6730 + }, + { + "epoch": 0.7382439986846432, + "grad_norm": 6.2363786697387695, + "learning_rate": 8.006052078788335e-06, + "loss": 3.5423, + "num_input_tokens_seen": 4417016, + "step": 6735 + }, + { + "epoch": 0.7387920640140305, + "grad_norm": 7.439382553100586, + "learning_rate": 7.974506544512478e-06, + "loss": 3.0829, + "num_input_tokens_seen": 4420144, + "step": 6740 + }, + { + "epoch": 0.7393401293434178, + "grad_norm": 8.05595588684082, + "learning_rate": 7.943011483804494e-06, + "loss": 2.8291, + "num_input_tokens_seen": 4422672, + "step": 6745 + }, + { + "epoch": 0.739888194672805, + "grad_norm": 7.396727561950684, + "learning_rate": 7.91156699003424e-06, + "loss": 3.1015, + "num_input_tokens_seen": 4425368, + "step": 6750 + }, + { + "epoch": 0.7404362600021923, + "grad_norm": 5.773197650909424, + "learning_rate": 7.880173156421661e-06, + "loss": 3.0124, + "num_input_tokens_seen": 4427720, + "step": 6755 + }, + { + "epoch": 0.7409843253315795, + "grad_norm": 7.078009128570557, + "learning_rate": 7.848830076036556e-06, + "loss": 3.007, + "num_input_tokens_seen": 4430872, + "step": 6760 + }, + { + "epoch": 0.7415323906609668, + "grad_norm": 6.219594478607178, + "learning_rate": 7.817537841798216e-06, + "loss": 3.0966, + "num_input_tokens_seen": 4434816, + "step": 6765 + }, + { + "epoch": 0.7420804559903541, + "grad_norm": 7.2829365730285645, + "learning_rate": 7.786296546475213e-06, + "loss": 3.4504, + "num_input_tokens_seen": 4437960, + "step": 6770 + }, + { + "epoch": 0.7426285213197413, + "grad_norm": 7.280004978179932, + "learning_rate": 7.755106282685118e-06, + "loss": 3.0042, + "num_input_tokens_seen": 4440624, + "step": 6775 + }, + { + "epoch": 0.7431765866491286, + "grad_norm": 6.213809490203857, + "learning_rate": 7.723967142894195e-06, + "loss": 3.0603, + "num_input_tokens_seen": 4444120, + "step": 6780 + }, + { + "epoch": 0.7437246519785159, + "grad_norm": 6.277675628662109, + "learning_rate": 7.69287921941715e-06, + "loss": 2.9716, + "num_input_tokens_seen": 4447152, + "step": 6785 + }, + { + "epoch": 0.7442727173079031, + "grad_norm": 8.690731048583984, + "learning_rate": 7.661842604416863e-06, + "loss": 3.2242, + "num_input_tokens_seen": 4450720, + "step": 6790 + }, + { + "epoch": 0.7448207826372903, + "grad_norm": 6.518171787261963, + "learning_rate": 7.630857389904095e-06, + "loss": 2.8793, + "num_input_tokens_seen": 4454448, + "step": 6795 + }, + { + "epoch": 0.7453688479666777, + "grad_norm": 10.606318473815918, + "learning_rate": 7.599923667737227e-06, + "loss": 2.9673, + "num_input_tokens_seen": 4457816, + "step": 6800 + }, + { + "epoch": 0.7459169132960649, + "grad_norm": 10.472159385681152, + "learning_rate": 7.5690415296220035e-06, + "loss": 3.0352, + "num_input_tokens_seen": 4460936, + "step": 6805 + }, + { + "epoch": 0.7464649786254521, + "grad_norm": 7.0004496574401855, + "learning_rate": 7.538211067111223e-06, + "loss": 3.165, + "num_input_tokens_seen": 4463688, + "step": 6810 + }, + { + "epoch": 0.7470130439548394, + "grad_norm": 7.692315101623535, + "learning_rate": 7.5074323716044835e-06, + "loss": 3.3064, + "num_input_tokens_seen": 4466616, + "step": 6815 + }, + { + "epoch": 0.7475611092842267, + "grad_norm": 5.7364702224731445, + "learning_rate": 7.476705534347947e-06, + "loss": 3.2443, + "num_input_tokens_seen": 4470464, + "step": 6820 + }, + { + "epoch": 0.7481091746136139, + "grad_norm": 6.589802265167236, + "learning_rate": 7.446030646434008e-06, + "loss": 2.9859, + "num_input_tokens_seen": 4472944, + "step": 6825 + }, + { + "epoch": 0.7486572399430012, + "grad_norm": 8.241453170776367, + "learning_rate": 7.4154077988010466e-06, + "loss": 3.1194, + "num_input_tokens_seen": 4475896, + "step": 6830 + }, + { + "epoch": 0.7492053052723885, + "grad_norm": 7.177932262420654, + "learning_rate": 7.3848370822332005e-06, + "loss": 2.9095, + "num_input_tokens_seen": 4478424, + "step": 6835 + }, + { + "epoch": 0.7497533706017757, + "grad_norm": 6.683755397796631, + "learning_rate": 7.354318587360029e-06, + "loss": 2.8105, + "num_input_tokens_seen": 4481120, + "step": 6840 + }, + { + "epoch": 0.7503014359311629, + "grad_norm": 7.998584747314453, + "learning_rate": 7.323852404656279e-06, + "loss": 2.5817, + "num_input_tokens_seen": 4484912, + "step": 6845 + }, + { + "epoch": 0.7508495012605503, + "grad_norm": 5.244688034057617, + "learning_rate": 7.293438624441637e-06, + "loss": 3.1018, + "num_input_tokens_seen": 4488416, + "step": 6850 + }, + { + "epoch": 0.7513975665899375, + "grad_norm": 7.417481422424316, + "learning_rate": 7.263077336880406e-06, + "loss": 3.2385, + "num_input_tokens_seen": 4491392, + "step": 6855 + }, + { + "epoch": 0.7519456319193247, + "grad_norm": 5.952940464019775, + "learning_rate": 7.232768631981285e-06, + "loss": 2.5967, + "num_input_tokens_seen": 4494608, + "step": 6860 + }, + { + "epoch": 0.7524936972487121, + "grad_norm": 7.974299907684326, + "learning_rate": 7.202512599597097e-06, + "loss": 3.3131, + "num_input_tokens_seen": 4497952, + "step": 6865 + }, + { + "epoch": 0.7530417625780993, + "grad_norm": 10.40588092803955, + "learning_rate": 7.172309329424495e-06, + "loss": 2.8735, + "num_input_tokens_seen": 4500792, + "step": 6870 + }, + { + "epoch": 0.7535898279074865, + "grad_norm": 7.208824634552002, + "learning_rate": 7.142158911003724e-06, + "loss": 3.3135, + "num_input_tokens_seen": 4504032, + "step": 6875 + }, + { + "epoch": 0.7541378932368739, + "grad_norm": 7.409761428833008, + "learning_rate": 7.112061433718339e-06, + "loss": 2.955, + "num_input_tokens_seen": 4506784, + "step": 6880 + }, + { + "epoch": 0.7546859585662611, + "grad_norm": 6.84408712387085, + "learning_rate": 7.082016986794951e-06, + "loss": 3.3193, + "num_input_tokens_seen": 4510016, + "step": 6885 + }, + { + "epoch": 0.7552340238956483, + "grad_norm": 5.721726417541504, + "learning_rate": 7.052025659302952e-06, + "loss": 3.1054, + "num_input_tokens_seen": 4512496, + "step": 6890 + }, + { + "epoch": 0.7557820892250356, + "grad_norm": 7.73302698135376, + "learning_rate": 7.022087540154274e-06, + "loss": 3.0514, + "num_input_tokens_seen": 4515040, + "step": 6895 + }, + { + "epoch": 0.7563301545544229, + "grad_norm": 8.347733497619629, + "learning_rate": 6.992202718103086e-06, + "loss": 2.9805, + "num_input_tokens_seen": 4517944, + "step": 6900 + }, + { + "epoch": 0.7568782198838101, + "grad_norm": 7.3970255851745605, + "learning_rate": 6.962371281745561e-06, + "loss": 3.3263, + "num_input_tokens_seen": 4520568, + "step": 6905 + }, + { + "epoch": 0.7574262852131974, + "grad_norm": 7.3923797607421875, + "learning_rate": 6.932593319519618e-06, + "loss": 3.2219, + "num_input_tokens_seen": 4524592, + "step": 6910 + }, + { + "epoch": 0.7579743505425847, + "grad_norm": 7.414371490478516, + "learning_rate": 6.902868919704627e-06, + "loss": 2.5203, + "num_input_tokens_seen": 4528528, + "step": 6915 + }, + { + "epoch": 0.7585224158719719, + "grad_norm": 7.776823043823242, + "learning_rate": 6.873198170421175e-06, + "loss": 3.1746, + "num_input_tokens_seen": 4532008, + "step": 6920 + }, + { + "epoch": 0.7590704812013592, + "grad_norm": 7.0230889320373535, + "learning_rate": 6.84358115963081e-06, + "loss": 3.0865, + "num_input_tokens_seen": 4536232, + "step": 6925 + }, + { + "epoch": 0.7596185465307465, + "grad_norm": 4.996485233306885, + "learning_rate": 6.814017975135753e-06, + "loss": 3.2363, + "num_input_tokens_seen": 4539680, + "step": 6930 + }, + { + "epoch": 0.7601666118601337, + "grad_norm": 9.683207511901855, + "learning_rate": 6.784508704578646e-06, + "loss": 3.2016, + "num_input_tokens_seen": 4542848, + "step": 6935 + }, + { + "epoch": 0.760714677189521, + "grad_norm": 5.796095848083496, + "learning_rate": 6.755053435442324e-06, + "loss": 2.9563, + "num_input_tokens_seen": 4547104, + "step": 6940 + }, + { + "epoch": 0.7612627425189082, + "grad_norm": 7.686697959899902, + "learning_rate": 6.725652255049508e-06, + "loss": 2.7968, + "num_input_tokens_seen": 4550392, + "step": 6945 + }, + { + "epoch": 0.7618108078482955, + "grad_norm": 7.243149280548096, + "learning_rate": 6.696305250562562e-06, + "loss": 2.9016, + "num_input_tokens_seen": 4553760, + "step": 6950 + }, + { + "epoch": 0.7623588731776828, + "grad_norm": 5.771494388580322, + "learning_rate": 6.667012508983278e-06, + "loss": 3.1646, + "num_input_tokens_seen": 4558080, + "step": 6955 + }, + { + "epoch": 0.76290693850707, + "grad_norm": 7.9829816818237305, + "learning_rate": 6.63777411715254e-06, + "loss": 2.946, + "num_input_tokens_seen": 4560904, + "step": 6960 + }, + { + "epoch": 0.7634550038364573, + "grad_norm": 6.072175979614258, + "learning_rate": 6.608590161750131e-06, + "loss": 3.2183, + "num_input_tokens_seen": 4563864, + "step": 6965 + }, + { + "epoch": 0.7640030691658446, + "grad_norm": 6.895592212677002, + "learning_rate": 6.579460729294429e-06, + "loss": 3.2887, + "num_input_tokens_seen": 4566800, + "step": 6970 + }, + { + "epoch": 0.7645511344952318, + "grad_norm": 7.528575897216797, + "learning_rate": 6.550385906142212e-06, + "loss": 3.0147, + "num_input_tokens_seen": 4569680, + "step": 6975 + }, + { + "epoch": 0.765099199824619, + "grad_norm": 5.899028301239014, + "learning_rate": 6.521365778488331e-06, + "loss": 2.9008, + "num_input_tokens_seen": 4573704, + "step": 6980 + }, + { + "epoch": 0.7656472651540064, + "grad_norm": 7.313390254974365, + "learning_rate": 6.492400432365503e-06, + "loss": 3.1414, + "num_input_tokens_seen": 4576368, + "step": 6985 + }, + { + "epoch": 0.7661953304833936, + "grad_norm": 7.083227634429932, + "learning_rate": 6.463489953644031e-06, + "loss": 2.7539, + "num_input_tokens_seen": 4578936, + "step": 6990 + }, + { + "epoch": 0.7667433958127808, + "grad_norm": 7.272182941436768, + "learning_rate": 6.434634428031558e-06, + "loss": 3.1749, + "num_input_tokens_seen": 4582096, + "step": 6995 + }, + { + "epoch": 0.7672914611421682, + "grad_norm": 9.697888374328613, + "learning_rate": 6.405833941072834e-06, + "loss": 3.1397, + "num_input_tokens_seen": 4584400, + "step": 7000 + }, + { + "epoch": 0.7678395264715554, + "grad_norm": 7.066343307495117, + "learning_rate": 6.377088578149418e-06, + "loss": 2.8686, + "num_input_tokens_seen": 4587688, + "step": 7005 + }, + { + "epoch": 0.7683875918009426, + "grad_norm": 5.80040979385376, + "learning_rate": 6.348398424479454e-06, + "loss": 2.7322, + "num_input_tokens_seen": 4591120, + "step": 7010 + }, + { + "epoch": 0.76893565713033, + "grad_norm": 8.803409576416016, + "learning_rate": 6.319763565117432e-06, + "loss": 3.2123, + "num_input_tokens_seen": 4594456, + "step": 7015 + }, + { + "epoch": 0.7694837224597172, + "grad_norm": 6.382712364196777, + "learning_rate": 6.291184084953894e-06, + "loss": 3.3465, + "num_input_tokens_seen": 4597120, + "step": 7020 + }, + { + "epoch": 0.7700317877891044, + "grad_norm": 6.3958740234375, + "learning_rate": 6.2626600687152064e-06, + "loss": 2.9045, + "num_input_tokens_seen": 4599416, + "step": 7025 + }, + { + "epoch": 0.7705798531184918, + "grad_norm": 5.454673767089844, + "learning_rate": 6.234191600963335e-06, + "loss": 3.1258, + "num_input_tokens_seen": 4602760, + "step": 7030 + }, + { + "epoch": 0.771127918447879, + "grad_norm": 4.992536544799805, + "learning_rate": 6.205778766095533e-06, + "loss": 3.0881, + "num_input_tokens_seen": 4605312, + "step": 7035 + }, + { + "epoch": 0.7716759837772662, + "grad_norm": 7.264188766479492, + "learning_rate": 6.1774216483441394e-06, + "loss": 3.117, + "num_input_tokens_seen": 4608784, + "step": 7040 + }, + { + "epoch": 0.7722240491066535, + "grad_norm": 7.106401443481445, + "learning_rate": 6.149120331776329e-06, + "loss": 2.8674, + "num_input_tokens_seen": 4612728, + "step": 7045 + }, + { + "epoch": 0.7727721144360408, + "grad_norm": 8.04111385345459, + "learning_rate": 6.120874900293827e-06, + "loss": 3.0187, + "num_input_tokens_seen": 4616096, + "step": 7050 + }, + { + "epoch": 0.773320179765428, + "grad_norm": 7.114358901977539, + "learning_rate": 6.092685437632683e-06, + "loss": 2.9277, + "num_input_tokens_seen": 4619312, + "step": 7055 + }, + { + "epoch": 0.7738682450948153, + "grad_norm": 6.135927200317383, + "learning_rate": 6.064552027363049e-06, + "loss": 2.8, + "num_input_tokens_seen": 4623080, + "step": 7060 + }, + { + "epoch": 0.7744163104242026, + "grad_norm": 9.407398223876953, + "learning_rate": 6.0364747528888734e-06, + "loss": 2.8471, + "num_input_tokens_seen": 4625720, + "step": 7065 + }, + { + "epoch": 0.7749643757535898, + "grad_norm": 8.590024948120117, + "learning_rate": 6.0084536974476995e-06, + "loss": 3.1369, + "num_input_tokens_seen": 4628368, + "step": 7070 + }, + { + "epoch": 0.7755124410829771, + "grad_norm": 6.1918721199035645, + "learning_rate": 5.980488944110408e-06, + "loss": 2.9941, + "num_input_tokens_seen": 4631128, + "step": 7075 + }, + { + "epoch": 0.7760605064123643, + "grad_norm": 6.956912994384766, + "learning_rate": 5.9525805757809524e-06, + "loss": 3.3899, + "num_input_tokens_seen": 4634672, + "step": 7080 + }, + { + "epoch": 0.7766085717417516, + "grad_norm": 6.198210716247559, + "learning_rate": 5.9247286751961366e-06, + "loss": 3.165, + "num_input_tokens_seen": 4638184, + "step": 7085 + }, + { + "epoch": 0.7771566370711389, + "grad_norm": 6.877211570739746, + "learning_rate": 5.896933324925372e-06, + "loss": 3.1694, + "num_input_tokens_seen": 4641976, + "step": 7090 + }, + { + "epoch": 0.7777047024005261, + "grad_norm": 6.007309436798096, + "learning_rate": 5.869194607370409e-06, + "loss": 3.1036, + "num_input_tokens_seen": 4645280, + "step": 7095 + }, + { + "epoch": 0.7782527677299134, + "grad_norm": 7.9656572341918945, + "learning_rate": 5.8415126047650955e-06, + "loss": 3.2545, + "num_input_tokens_seen": 4648904, + "step": 7100 + }, + { + "epoch": 0.7788008330593007, + "grad_norm": 7.05634069442749, + "learning_rate": 5.813887399175169e-06, + "loss": 2.912, + "num_input_tokens_seen": 4651232, + "step": 7105 + }, + { + "epoch": 0.7793488983886879, + "grad_norm": 8.77833080291748, + "learning_rate": 5.7863190724979695e-06, + "loss": 3.0476, + "num_input_tokens_seen": 4654288, + "step": 7110 + }, + { + "epoch": 0.7798969637180752, + "grad_norm": 6.191843032836914, + "learning_rate": 5.75880770646221e-06, + "loss": 3.1158, + "num_input_tokens_seen": 4657808, + "step": 7115 + }, + { + "epoch": 0.7804450290474625, + "grad_norm": 5.634969234466553, + "learning_rate": 5.73135338262776e-06, + "loss": 2.8591, + "num_input_tokens_seen": 4661440, + "step": 7120 + }, + { + "epoch": 0.7809930943768497, + "grad_norm": 6.004340648651123, + "learning_rate": 5.7039561823853615e-06, + "loss": 2.8518, + "num_input_tokens_seen": 4665104, + "step": 7125 + }, + { + "epoch": 0.7815411597062369, + "grad_norm": 7.3791680335998535, + "learning_rate": 5.676616186956413e-06, + "loss": 3.1628, + "num_input_tokens_seen": 4668432, + "step": 7130 + }, + { + "epoch": 0.7820892250356243, + "grad_norm": 9.166860580444336, + "learning_rate": 5.649333477392735e-06, + "loss": 3.3455, + "num_input_tokens_seen": 4671688, + "step": 7135 + }, + { + "epoch": 0.7826372903650115, + "grad_norm": 6.651597023010254, + "learning_rate": 5.622108134576312e-06, + "loss": 3.4196, + "num_input_tokens_seen": 4675408, + "step": 7140 + }, + { + "epoch": 0.7831853556943987, + "grad_norm": 7.5387797355651855, + "learning_rate": 5.594940239219049e-06, + "loss": 3.2571, + "num_input_tokens_seen": 4678440, + "step": 7145 + }, + { + "epoch": 0.7837334210237861, + "grad_norm": 9.256987571716309, + "learning_rate": 5.5678298718625674e-06, + "loss": 3.1553, + "num_input_tokens_seen": 4681320, + "step": 7150 + }, + { + "epoch": 0.7842814863531733, + "grad_norm": 8.727250099182129, + "learning_rate": 5.54077711287792e-06, + "loss": 3.2874, + "num_input_tokens_seen": 4685024, + "step": 7155 + }, + { + "epoch": 0.7848295516825605, + "grad_norm": 8.900041580200195, + "learning_rate": 5.513782042465385e-06, + "loss": 2.8368, + "num_input_tokens_seen": 4687568, + "step": 7160 + }, + { + "epoch": 0.7853776170119479, + "grad_norm": 10.776511192321777, + "learning_rate": 5.4868447406542125e-06, + "loss": 2.9062, + "num_input_tokens_seen": 4690632, + "step": 7165 + }, + { + "epoch": 0.7859256823413351, + "grad_norm": 6.669962406158447, + "learning_rate": 5.459965287302396e-06, + "loss": 3.3375, + "num_input_tokens_seen": 4694528, + "step": 7170 + }, + { + "epoch": 0.7864737476707223, + "grad_norm": 8.748539924621582, + "learning_rate": 5.4331437620964235e-06, + "loss": 3.1538, + "num_input_tokens_seen": 4697304, + "step": 7175 + }, + { + "epoch": 0.7870218130001096, + "grad_norm": 6.20130729675293, + "learning_rate": 5.406380244551077e-06, + "loss": 3.3296, + "num_input_tokens_seen": 4701400, + "step": 7180 + }, + { + "epoch": 0.7875698783294969, + "grad_norm": 6.8918304443359375, + "learning_rate": 5.379674814009133e-06, + "loss": 2.9058, + "num_input_tokens_seen": 4704688, + "step": 7185 + }, + { + "epoch": 0.7881179436588841, + "grad_norm": 8.053811073303223, + "learning_rate": 5.353027549641185e-06, + "loss": 3.19, + "num_input_tokens_seen": 4707832, + "step": 7190 + }, + { + "epoch": 0.7886660089882714, + "grad_norm": 8.722176551818848, + "learning_rate": 5.326438530445394e-06, + "loss": 3.1039, + "num_input_tokens_seen": 4711272, + "step": 7195 + }, + { + "epoch": 0.7892140743176587, + "grad_norm": 8.22156810760498, + "learning_rate": 5.299907835247228e-06, + "loss": 2.9179, + "num_input_tokens_seen": 4714584, + "step": 7200 + }, + { + "epoch": 0.7897621396470459, + "grad_norm": 8.812997817993164, + "learning_rate": 5.273435542699259e-06, + "loss": 2.9421, + "num_input_tokens_seen": 4717960, + "step": 7205 + }, + { + "epoch": 0.7903102049764332, + "grad_norm": 7.295377731323242, + "learning_rate": 5.247021731280927e-06, + "loss": 3.1538, + "num_input_tokens_seen": 4721208, + "step": 7210 + }, + { + "epoch": 0.7908582703058205, + "grad_norm": 6.8964762687683105, + "learning_rate": 5.220666479298283e-06, + "loss": 2.9399, + "num_input_tokens_seen": 4723760, + "step": 7215 + }, + { + "epoch": 0.7914063356352077, + "grad_norm": 8.851302146911621, + "learning_rate": 5.194369864883783e-06, + "loss": 3.0368, + "num_input_tokens_seen": 4727808, + "step": 7220 + }, + { + "epoch": 0.791954400964595, + "grad_norm": 6.765636444091797, + "learning_rate": 5.168131965996051e-06, + "loss": 2.5498, + "num_input_tokens_seen": 4730984, + "step": 7225 + }, + { + "epoch": 0.7925024662939822, + "grad_norm": 6.0574750900268555, + "learning_rate": 5.1419528604196385e-06, + "loss": 2.9546, + "num_input_tokens_seen": 4734472, + "step": 7230 + }, + { + "epoch": 0.7930505316233695, + "grad_norm": 6.703484535217285, + "learning_rate": 5.1158326257647855e-06, + "loss": 3.0816, + "num_input_tokens_seen": 4736976, + "step": 7235 + }, + { + "epoch": 0.7935985969527568, + "grad_norm": 5.429347038269043, + "learning_rate": 5.089771339467236e-06, + "loss": 2.8567, + "num_input_tokens_seen": 4740592, + "step": 7240 + }, + { + "epoch": 0.794146662282144, + "grad_norm": 6.805422306060791, + "learning_rate": 5.06376907878795e-06, + "loss": 3.0524, + "num_input_tokens_seen": 4744232, + "step": 7245 + }, + { + "epoch": 0.7946947276115313, + "grad_norm": 7.566915512084961, + "learning_rate": 5.0378259208129054e-06, + "loss": 2.7767, + "num_input_tokens_seen": 4748392, + "step": 7250 + }, + { + "epoch": 0.7952427929409186, + "grad_norm": 8.171722412109375, + "learning_rate": 5.011941942452872e-06, + "loss": 2.9925, + "num_input_tokens_seen": 4751496, + "step": 7255 + }, + { + "epoch": 0.7957908582703058, + "grad_norm": 9.192333221435547, + "learning_rate": 4.986117220443173e-06, + "loss": 3.3195, + "num_input_tokens_seen": 4754624, + "step": 7260 + }, + { + "epoch": 0.796338923599693, + "grad_norm": 6.089689254760742, + "learning_rate": 4.960351831343452e-06, + "loss": 3.3298, + "num_input_tokens_seen": 4758304, + "step": 7265 + }, + { + "epoch": 0.7968869889290804, + "grad_norm": 7.405531883239746, + "learning_rate": 4.9346458515374785e-06, + "loss": 3.3122, + "num_input_tokens_seen": 4760592, + "step": 7270 + }, + { + "epoch": 0.7974350542584676, + "grad_norm": 7.917971611022949, + "learning_rate": 4.908999357232874e-06, + "loss": 3.0276, + "num_input_tokens_seen": 4763392, + "step": 7275 + }, + { + "epoch": 0.7979831195878548, + "grad_norm": 8.550086975097656, + "learning_rate": 4.8834124244609145e-06, + "loss": 3.2591, + "num_input_tokens_seen": 4766544, + "step": 7280 + }, + { + "epoch": 0.7985311849172422, + "grad_norm": 7.939424514770508, + "learning_rate": 4.857885129076317e-06, + "loss": 2.8357, + "num_input_tokens_seen": 4769408, + "step": 7285 + }, + { + "epoch": 0.7990792502466294, + "grad_norm": 6.404162406921387, + "learning_rate": 4.8324175467569845e-06, + "loss": 3.0799, + "num_input_tokens_seen": 4773344, + "step": 7290 + }, + { + "epoch": 0.7996273155760166, + "grad_norm": 7.251323699951172, + "learning_rate": 4.807009753003791e-06, + "loss": 3.1363, + "num_input_tokens_seen": 4776640, + "step": 7295 + }, + { + "epoch": 0.800175380905404, + "grad_norm": 8.667237281799316, + "learning_rate": 4.781661823140366e-06, + "loss": 3.2124, + "num_input_tokens_seen": 4779376, + "step": 7300 + }, + { + "epoch": 0.8007234462347912, + "grad_norm": 8.147212028503418, + "learning_rate": 4.756373832312879e-06, + "loss": 2.874, + "num_input_tokens_seen": 4781952, + "step": 7305 + }, + { + "epoch": 0.8012715115641784, + "grad_norm": 8.90487003326416, + "learning_rate": 4.731145855489794e-06, + "loss": 3.2025, + "num_input_tokens_seen": 4784816, + "step": 7310 + }, + { + "epoch": 0.8018195768935658, + "grad_norm": 7.192740440368652, + "learning_rate": 4.70597796746165e-06, + "loss": 2.9843, + "num_input_tokens_seen": 4787592, + "step": 7315 + }, + { + "epoch": 0.802367642222953, + "grad_norm": 6.346043586730957, + "learning_rate": 4.6808702428408706e-06, + "loss": 3.1331, + "num_input_tokens_seen": 4790256, + "step": 7320 + }, + { + "epoch": 0.8029157075523402, + "grad_norm": 8.076735496520996, + "learning_rate": 4.655822756061503e-06, + "loss": 3.1571, + "num_input_tokens_seen": 4792768, + "step": 7325 + }, + { + "epoch": 0.8034637728817275, + "grad_norm": 7.521450519561768, + "learning_rate": 4.630835581379006e-06, + "loss": 2.929, + "num_input_tokens_seen": 4796152, + "step": 7330 + }, + { + "epoch": 0.8040118382111148, + "grad_norm": 12.113771438598633, + "learning_rate": 4.605908792870067e-06, + "loss": 3.1268, + "num_input_tokens_seen": 4798376, + "step": 7335 + }, + { + "epoch": 0.804559903540502, + "grad_norm": 5.997092247009277, + "learning_rate": 4.581042464432328e-06, + "loss": 2.8665, + "num_input_tokens_seen": 4802104, + "step": 7340 + }, + { + "epoch": 0.8051079688698893, + "grad_norm": 6.922906875610352, + "learning_rate": 4.556236669784197e-06, + "loss": 3.3316, + "num_input_tokens_seen": 4805648, + "step": 7345 + }, + { + "epoch": 0.8056560341992766, + "grad_norm": 9.63893985748291, + "learning_rate": 4.531491482464628e-06, + "loss": 3.2614, + "num_input_tokens_seen": 4810112, + "step": 7350 + }, + { + "epoch": 0.8062040995286638, + "grad_norm": 8.894881248474121, + "learning_rate": 4.5068069758329e-06, + "loss": 3.2695, + "num_input_tokens_seen": 4813192, + "step": 7355 + }, + { + "epoch": 0.8067521648580511, + "grad_norm": 6.436181545257568, + "learning_rate": 4.482183223068387e-06, + "loss": 2.8622, + "num_input_tokens_seen": 4815768, + "step": 7360 + }, + { + "epoch": 0.8073002301874384, + "grad_norm": 7.975905895233154, + "learning_rate": 4.457620297170381e-06, + "loss": 3.3166, + "num_input_tokens_seen": 4819144, + "step": 7365 + }, + { + "epoch": 0.8078482955168256, + "grad_norm": 7.515452861785889, + "learning_rate": 4.433118270957818e-06, + "loss": 2.5207, + "num_input_tokens_seen": 4822152, + "step": 7370 + }, + { + "epoch": 0.8083963608462129, + "grad_norm": 6.722434997558594, + "learning_rate": 4.408677217069096e-06, + "loss": 3.1815, + "num_input_tokens_seen": 4825920, + "step": 7375 + }, + { + "epoch": 0.8089444261756001, + "grad_norm": 6.1937031745910645, + "learning_rate": 4.3842972079618765e-06, + "loss": 3.0536, + "num_input_tokens_seen": 4829224, + "step": 7380 + }, + { + "epoch": 0.8094924915049874, + "grad_norm": 7.4900898933410645, + "learning_rate": 4.359978315912827e-06, + "loss": 2.9555, + "num_input_tokens_seen": 4832576, + "step": 7385 + }, + { + "epoch": 0.8100405568343747, + "grad_norm": 7.267132759094238, + "learning_rate": 4.33572061301743e-06, + "loss": 3.376, + "num_input_tokens_seen": 4834896, + "step": 7390 + }, + { + "epoch": 0.8105886221637619, + "grad_norm": 6.553824424743652, + "learning_rate": 4.311524171189782e-06, + "loss": 3.1203, + "num_input_tokens_seen": 4838536, + "step": 7395 + }, + { + "epoch": 0.8111366874931492, + "grad_norm": 6.04332971572876, + "learning_rate": 4.28738906216235e-06, + "loss": 2.898, + "num_input_tokens_seen": 4842312, + "step": 7400 + }, + { + "epoch": 0.8116847528225365, + "grad_norm": 6.300970077514648, + "learning_rate": 4.263315357485775e-06, + "loss": 3.2478, + "num_input_tokens_seen": 4845640, + "step": 7405 + }, + { + "epoch": 0.8122328181519237, + "grad_norm": 8.834260940551758, + "learning_rate": 4.2393031285286796e-06, + "loss": 3.1214, + "num_input_tokens_seen": 4848880, + "step": 7410 + }, + { + "epoch": 0.812780883481311, + "grad_norm": 7.611583709716797, + "learning_rate": 4.215352446477413e-06, + "loss": 2.8593, + "num_input_tokens_seen": 4852904, + "step": 7415 + }, + { + "epoch": 0.8133289488106983, + "grad_norm": 5.708853244781494, + "learning_rate": 4.191463382335867e-06, + "loss": 3.1984, + "num_input_tokens_seen": 4855720, + "step": 7420 + }, + { + "epoch": 0.8138770141400855, + "grad_norm": 5.545560836791992, + "learning_rate": 4.167636006925274e-06, + "loss": 3.1826, + "num_input_tokens_seen": 4859488, + "step": 7425 + }, + { + "epoch": 0.8144250794694727, + "grad_norm": 9.735588073730469, + "learning_rate": 4.143870390883978e-06, + "loss": 2.8356, + "num_input_tokens_seen": 4862808, + "step": 7430 + }, + { + "epoch": 0.8149731447988601, + "grad_norm": 10.298928260803223, + "learning_rate": 4.120166604667225e-06, + "loss": 2.9738, + "num_input_tokens_seen": 4866608, + "step": 7435 + }, + { + "epoch": 0.8155212101282473, + "grad_norm": 8.623414039611816, + "learning_rate": 4.096524718546974e-06, + "loss": 3.0776, + "num_input_tokens_seen": 4868832, + "step": 7440 + }, + { + "epoch": 0.8160692754576345, + "grad_norm": 10.033533096313477, + "learning_rate": 4.072944802611655e-06, + "loss": 3.1786, + "num_input_tokens_seen": 4872536, + "step": 7445 + }, + { + "epoch": 0.8166173407870219, + "grad_norm": 8.511270523071289, + "learning_rate": 4.0494269267660144e-06, + "loss": 3.4183, + "num_input_tokens_seen": 4876032, + "step": 7450 + }, + { + "epoch": 0.8171654061164091, + "grad_norm": 6.882598876953125, + "learning_rate": 4.025971160730846e-06, + "loss": 3.0995, + "num_input_tokens_seen": 4878536, + "step": 7455 + }, + { + "epoch": 0.8177134714457963, + "grad_norm": 6.228262901306152, + "learning_rate": 4.002577574042829e-06, + "loss": 2.8603, + "num_input_tokens_seen": 4880976, + "step": 7460 + }, + { + "epoch": 0.8182615367751837, + "grad_norm": 9.165740013122559, + "learning_rate": 3.9792462360542935e-06, + "loss": 2.8565, + "num_input_tokens_seen": 4884688, + "step": 7465 + }, + { + "epoch": 0.8188096021045709, + "grad_norm": 7.1637701988220215, + "learning_rate": 3.955977215933046e-06, + "loss": 2.9947, + "num_input_tokens_seen": 4888200, + "step": 7470 + }, + { + "epoch": 0.8193576674339581, + "grad_norm": 7.321343421936035, + "learning_rate": 3.932770582662135e-06, + "loss": 3.1105, + "num_input_tokens_seen": 4890856, + "step": 7475 + }, + { + "epoch": 0.8199057327633454, + "grad_norm": 7.804381847381592, + "learning_rate": 3.9096264050396485e-06, + "loss": 2.9519, + "num_input_tokens_seen": 4893712, + "step": 7480 + }, + { + "epoch": 0.8204537980927327, + "grad_norm": 6.569583415985107, + "learning_rate": 3.886544751678547e-06, + "loss": 3.0457, + "num_input_tokens_seen": 4897104, + "step": 7485 + }, + { + "epoch": 0.8210018634221199, + "grad_norm": 10.908699035644531, + "learning_rate": 3.863525691006406e-06, + "loss": 3.5541, + "num_input_tokens_seen": 4900616, + "step": 7490 + }, + { + "epoch": 0.8215499287515072, + "grad_norm": 8.427760124206543, + "learning_rate": 3.840569291265242e-06, + "loss": 2.9541, + "num_input_tokens_seen": 4902848, + "step": 7495 + }, + { + "epoch": 0.8220979940808945, + "grad_norm": 10.59475040435791, + "learning_rate": 3.817675620511329e-06, + "loss": 2.932, + "num_input_tokens_seen": 4905424, + "step": 7500 + }, + { + "epoch": 0.8226460594102817, + "grad_norm": 8.56042194366455, + "learning_rate": 3.794844746614956e-06, + "loss": 3.3314, + "num_input_tokens_seen": 4908016, + "step": 7505 + }, + { + "epoch": 0.823194124739669, + "grad_norm": 8.957588195800781, + "learning_rate": 3.772076737260241e-06, + "loss": 3.4287, + "num_input_tokens_seen": 4912944, + "step": 7510 + }, + { + "epoch": 0.8237421900690562, + "grad_norm": 8.641453742980957, + "learning_rate": 3.7493716599449557e-06, + "loss": 2.7836, + "num_input_tokens_seen": 4915344, + "step": 7515 + }, + { + "epoch": 0.8242902553984435, + "grad_norm": 9.905373573303223, + "learning_rate": 3.726729581980287e-06, + "loss": 3.3792, + "num_input_tokens_seen": 4918280, + "step": 7520 + }, + { + "epoch": 0.8248383207278308, + "grad_norm": 6.359044075012207, + "learning_rate": 3.7041505704906554e-06, + "loss": 2.6283, + "num_input_tokens_seen": 4923056, + "step": 7525 + }, + { + "epoch": 0.825386386057218, + "grad_norm": 8.611063957214355, + "learning_rate": 3.681634692413527e-06, + "loss": 3.0805, + "num_input_tokens_seen": 4925992, + "step": 7530 + }, + { + "epoch": 0.8259344513866053, + "grad_norm": 6.022265911102295, + "learning_rate": 3.659182014499199e-06, + "loss": 2.9173, + "num_input_tokens_seen": 4928312, + "step": 7535 + }, + { + "epoch": 0.8264825167159926, + "grad_norm": 7.828344821929932, + "learning_rate": 3.636792603310593e-06, + "loss": 3.3786, + "num_input_tokens_seen": 4931816, + "step": 7540 + }, + { + "epoch": 0.8270305820453798, + "grad_norm": 9.197246551513672, + "learning_rate": 3.6144665252230897e-06, + "loss": 3.1869, + "num_input_tokens_seen": 4934904, + "step": 7545 + }, + { + "epoch": 0.827578647374767, + "grad_norm": 6.626698017120361, + "learning_rate": 3.5922038464243e-06, + "loss": 2.864, + "num_input_tokens_seen": 4937320, + "step": 7550 + }, + { + "epoch": 0.8281267127041544, + "grad_norm": 6.149302959442139, + "learning_rate": 3.570004632913884e-06, + "loss": 2.9841, + "num_input_tokens_seen": 4940472, + "step": 7555 + }, + { + "epoch": 0.8286747780335416, + "grad_norm": 5.897488117218018, + "learning_rate": 3.5478689505033635e-06, + "loss": 3.0083, + "num_input_tokens_seen": 4943240, + "step": 7560 + }, + { + "epoch": 0.8292228433629288, + "grad_norm": 5.379867076873779, + "learning_rate": 3.5257968648159085e-06, + "loss": 3.2044, + "num_input_tokens_seen": 4947448, + "step": 7565 + }, + { + "epoch": 0.8297709086923162, + "grad_norm": 8.127168655395508, + "learning_rate": 3.503788441286143e-06, + "loss": 3.0341, + "num_input_tokens_seen": 4950720, + "step": 7570 + }, + { + "epoch": 0.8303189740217034, + "grad_norm": 7.3780364990234375, + "learning_rate": 3.4818437451599796e-06, + "loss": 3.2321, + "num_input_tokens_seen": 4954728, + "step": 7575 + }, + { + "epoch": 0.8308670393510906, + "grad_norm": 6.4768757820129395, + "learning_rate": 3.459962841494391e-06, + "loss": 3.1017, + "num_input_tokens_seen": 4957936, + "step": 7580 + }, + { + "epoch": 0.831415104680478, + "grad_norm": 7.365682125091553, + "learning_rate": 3.4381457951572245e-06, + "loss": 2.8212, + "num_input_tokens_seen": 4961240, + "step": 7585 + }, + { + "epoch": 0.8319631700098652, + "grad_norm": 7.922868251800537, + "learning_rate": 3.41639267082704e-06, + "loss": 2.8681, + "num_input_tokens_seen": 4964016, + "step": 7590 + }, + { + "epoch": 0.8325112353392524, + "grad_norm": 4.56962251663208, + "learning_rate": 3.3947035329928768e-06, + "loss": 3.0944, + "num_input_tokens_seen": 4966208, + "step": 7595 + }, + { + "epoch": 0.8330593006686398, + "grad_norm": 8.027546882629395, + "learning_rate": 3.3730784459540755e-06, + "loss": 2.62, + "num_input_tokens_seen": 4969656, + "step": 7600 + }, + { + "epoch": 0.833607365998027, + "grad_norm": 9.634477615356445, + "learning_rate": 3.3515174738201204e-06, + "loss": 3.0848, + "num_input_tokens_seen": 4972656, + "step": 7605 + }, + { + "epoch": 0.8341554313274142, + "grad_norm": 6.137497901916504, + "learning_rate": 3.3300206805103902e-06, + "loss": 2.8019, + "num_input_tokens_seen": 4976816, + "step": 7610 + }, + { + "epoch": 0.8347034966568014, + "grad_norm": 6.958483695983887, + "learning_rate": 3.3085881297540143e-06, + "loss": 3.1585, + "num_input_tokens_seen": 4979448, + "step": 7615 + }, + { + "epoch": 0.8352515619861888, + "grad_norm": 6.135876178741455, + "learning_rate": 3.2872198850896763e-06, + "loss": 3.4485, + "num_input_tokens_seen": 4982096, + "step": 7620 + }, + { + "epoch": 0.835799627315576, + "grad_norm": 5.784817218780518, + "learning_rate": 3.265916009865405e-06, + "loss": 2.5781, + "num_input_tokens_seen": 4987624, + "step": 7625 + }, + { + "epoch": 0.8363476926449632, + "grad_norm": 7.2112603187561035, + "learning_rate": 3.2446765672384083e-06, + "loss": 3.1842, + "num_input_tokens_seen": 4991016, + "step": 7630 + }, + { + "epoch": 0.8368957579743506, + "grad_norm": 8.30711555480957, + "learning_rate": 3.223501620174871e-06, + "loss": 2.8567, + "num_input_tokens_seen": 4994496, + "step": 7635 + }, + { + "epoch": 0.8374438233037378, + "grad_norm": 5.6931915283203125, + "learning_rate": 3.2023912314497835e-06, + "loss": 3.109, + "num_input_tokens_seen": 4997176, + "step": 7640 + }, + { + "epoch": 0.837991888633125, + "grad_norm": 7.178470611572266, + "learning_rate": 3.18134546364674e-06, + "loss": 3.1472, + "num_input_tokens_seen": 5001168, + "step": 7645 + }, + { + "epoch": 0.8385399539625124, + "grad_norm": 6.247611045837402, + "learning_rate": 3.160364379157771e-06, + "loss": 3.0272, + "num_input_tokens_seen": 5004928, + "step": 7650 + }, + { + "epoch": 0.8390880192918996, + "grad_norm": 8.314835548400879, + "learning_rate": 3.1394480401831376e-06, + "loss": 3.1062, + "num_input_tokens_seen": 5007976, + "step": 7655 + }, + { + "epoch": 0.8396360846212868, + "grad_norm": 8.253650665283203, + "learning_rate": 3.118596508731153e-06, + "loss": 3.1373, + "num_input_tokens_seen": 5010840, + "step": 7660 + }, + { + "epoch": 0.8401841499506741, + "grad_norm": 8.37070083618164, + "learning_rate": 3.0978098466180246e-06, + "loss": 3.1474, + "num_input_tokens_seen": 5013264, + "step": 7665 + }, + { + "epoch": 0.8407322152800614, + "grad_norm": 7.3890700340271, + "learning_rate": 3.0770881154676244e-06, + "loss": 2.9336, + "num_input_tokens_seen": 5016288, + "step": 7670 + }, + { + "epoch": 0.8412802806094486, + "grad_norm": 9.55408000946045, + "learning_rate": 3.056431376711341e-06, + "loss": 3.1662, + "num_input_tokens_seen": 5019184, + "step": 7675 + }, + { + "epoch": 0.8418283459388359, + "grad_norm": 9.764185905456543, + "learning_rate": 3.035839691587891e-06, + "loss": 3.3416, + "num_input_tokens_seen": 5022032, + "step": 7680 + }, + { + "epoch": 0.8423764112682232, + "grad_norm": 6.572988510131836, + "learning_rate": 3.015313121143132e-06, + "loss": 3.44, + "num_input_tokens_seen": 5025704, + "step": 7685 + }, + { + "epoch": 0.8429244765976104, + "grad_norm": 6.35365629196167, + "learning_rate": 2.994851726229872e-06, + "loss": 2.8245, + "num_input_tokens_seen": 5029360, + "step": 7690 + }, + { + "epoch": 0.8434725419269977, + "grad_norm": 5.579585552215576, + "learning_rate": 2.9744555675077195e-06, + "loss": 2.9123, + "num_input_tokens_seen": 5032232, + "step": 7695 + }, + { + "epoch": 0.844020607256385, + "grad_norm": 9.263272285461426, + "learning_rate": 2.9541247054428732e-06, + "loss": 3.1231, + "num_input_tokens_seen": 5034616, + "step": 7700 + }, + { + "epoch": 0.8445686725857722, + "grad_norm": 6.095417022705078, + "learning_rate": 2.933859200307948e-06, + "loss": 2.822, + "num_input_tokens_seen": 5037736, + "step": 7705 + }, + { + "epoch": 0.8451167379151595, + "grad_norm": 7.388354778289795, + "learning_rate": 2.913659112181824e-06, + "loss": 2.8813, + "num_input_tokens_seen": 5040224, + "step": 7710 + }, + { + "epoch": 0.8456648032445467, + "grad_norm": 5.476953983306885, + "learning_rate": 2.893524500949424e-06, + "loss": 2.9058, + "num_input_tokens_seen": 5042920, + "step": 7715 + }, + { + "epoch": 0.846212868573934, + "grad_norm": 8.243193626403809, + "learning_rate": 2.8734554263015717e-06, + "loss": 3.0815, + "num_input_tokens_seen": 5046384, + "step": 7720 + }, + { + "epoch": 0.8467609339033213, + "grad_norm": 5.285266399383545, + "learning_rate": 2.853451947734795e-06, + "loss": 2.8613, + "num_input_tokens_seen": 5050096, + "step": 7725 + }, + { + "epoch": 0.8473089992327085, + "grad_norm": 7.07433557510376, + "learning_rate": 2.833514124551162e-06, + "loss": 3.2751, + "num_input_tokens_seen": 5053016, + "step": 7730 + }, + { + "epoch": 0.8478570645620958, + "grad_norm": 7.447408676147461, + "learning_rate": 2.8136420158580923e-06, + "loss": 3.199, + "num_input_tokens_seen": 5055816, + "step": 7735 + }, + { + "epoch": 0.8484051298914831, + "grad_norm": 6.6446757316589355, + "learning_rate": 2.793835680568202e-06, + "loss": 2.9382, + "num_input_tokens_seen": 5059872, + "step": 7740 + }, + { + "epoch": 0.8489531952208703, + "grad_norm": 6.634135723114014, + "learning_rate": 2.774095177399108e-06, + "loss": 2.7486, + "num_input_tokens_seen": 5063104, + "step": 7745 + }, + { + "epoch": 0.8495012605502575, + "grad_norm": 6.349103927612305, + "learning_rate": 2.75442056487325e-06, + "loss": 2.8114, + "num_input_tokens_seen": 5067312, + "step": 7750 + }, + { + "epoch": 0.8500493258796449, + "grad_norm": 9.979939460754395, + "learning_rate": 2.7348119013177605e-06, + "loss": 3.0652, + "num_input_tokens_seen": 5070232, + "step": 7755 + }, + { + "epoch": 0.8505973912090321, + "grad_norm": 9.005098342895508, + "learning_rate": 2.7152692448642297e-06, + "loss": 2.7476, + "num_input_tokens_seen": 5073736, + "step": 7760 + }, + { + "epoch": 0.8511454565384193, + "grad_norm": 7.502773761749268, + "learning_rate": 2.695792653448573e-06, + "loss": 2.6705, + "num_input_tokens_seen": 5076032, + "step": 7765 + }, + { + "epoch": 0.8516935218678067, + "grad_norm": 6.317687511444092, + "learning_rate": 2.6763821848108634e-06, + "loss": 2.7642, + "num_input_tokens_seen": 5078736, + "step": 7770 + }, + { + "epoch": 0.8522415871971939, + "grad_norm": 6.520786762237549, + "learning_rate": 2.6570378964951322e-06, + "loss": 2.9362, + "num_input_tokens_seen": 5081560, + "step": 7775 + }, + { + "epoch": 0.8527896525265811, + "grad_norm": 7.41638708114624, + "learning_rate": 2.637759845849211e-06, + "loss": 2.9981, + "num_input_tokens_seen": 5084504, + "step": 7780 + }, + { + "epoch": 0.8533377178559685, + "grad_norm": 7.572868824005127, + "learning_rate": 2.6185480900245836e-06, + "loss": 2.7595, + "num_input_tokens_seen": 5088232, + "step": 7785 + }, + { + "epoch": 0.8538857831853557, + "grad_norm": 6.104272842407227, + "learning_rate": 2.5994026859761766e-06, + "loss": 2.9084, + "num_input_tokens_seen": 5090552, + "step": 7790 + }, + { + "epoch": 0.8544338485147429, + "grad_norm": 8.887699127197266, + "learning_rate": 2.5803236904622134e-06, + "loss": 3.3633, + "num_input_tokens_seen": 5093720, + "step": 7795 + }, + { + "epoch": 0.8549819138441302, + "grad_norm": 7.048088550567627, + "learning_rate": 2.5613111600440637e-06, + "loss": 2.94, + "num_input_tokens_seen": 5096984, + "step": 7800 + }, + { + "epoch": 0.8555299791735175, + "grad_norm": 7.457699775695801, + "learning_rate": 2.5423651510860292e-06, + "loss": 2.9086, + "num_input_tokens_seen": 5100088, + "step": 7805 + }, + { + "epoch": 0.8560780445029047, + "grad_norm": 7.127599239349365, + "learning_rate": 2.5234857197552197e-06, + "loss": 3.2513, + "num_input_tokens_seen": 5102776, + "step": 7810 + }, + { + "epoch": 0.856626109832292, + "grad_norm": 6.716034412384033, + "learning_rate": 2.5046729220213615e-06, + "loss": 3.1929, + "num_input_tokens_seen": 5106680, + "step": 7815 + }, + { + "epoch": 0.8571741751616793, + "grad_norm": 8.033172607421875, + "learning_rate": 2.4859268136566415e-06, + "loss": 3.2828, + "num_input_tokens_seen": 5110400, + "step": 7820 + }, + { + "epoch": 0.8577222404910665, + "grad_norm": 7.232936859130859, + "learning_rate": 2.4672474502355406e-06, + "loss": 2.9178, + "num_input_tokens_seen": 5113896, + "step": 7825 + }, + { + "epoch": 0.8582703058204538, + "grad_norm": 7.433042526245117, + "learning_rate": 2.4486348871346738e-06, + "loss": 3.2398, + "num_input_tokens_seen": 5116440, + "step": 7830 + }, + { + "epoch": 0.858818371149841, + "grad_norm": 6.7432756423950195, + "learning_rate": 2.4300891795326157e-06, + "loss": 2.8448, + "num_input_tokens_seen": 5119296, + "step": 7835 + }, + { + "epoch": 0.8593664364792283, + "grad_norm": 6.955072402954102, + "learning_rate": 2.4116103824097345e-06, + "loss": 3.0554, + "num_input_tokens_seen": 5122136, + "step": 7840 + }, + { + "epoch": 0.8599145018086156, + "grad_norm": 7.900850296020508, + "learning_rate": 2.3931985505480564e-06, + "loss": 2.9951, + "num_input_tokens_seen": 5125056, + "step": 7845 + }, + { + "epoch": 0.8604625671380028, + "grad_norm": 5.292073726654053, + "learning_rate": 2.374853738531063e-06, + "loss": 3.1992, + "num_input_tokens_seen": 5128688, + "step": 7850 + }, + { + "epoch": 0.8610106324673901, + "grad_norm": 6.894753932952881, + "learning_rate": 2.356576000743557e-06, + "loss": 3.2569, + "num_input_tokens_seen": 5132184, + "step": 7855 + }, + { + "epoch": 0.8615586977967774, + "grad_norm": 6.101509094238281, + "learning_rate": 2.3383653913714996e-06, + "loss": 2.8422, + "num_input_tokens_seen": 5136352, + "step": 7860 + }, + { + "epoch": 0.8621067631261646, + "grad_norm": 6.467989444732666, + "learning_rate": 2.3202219644018365e-06, + "loss": 3.0615, + "num_input_tokens_seen": 5139152, + "step": 7865 + }, + { + "epoch": 0.8626548284555519, + "grad_norm": 6.982528209686279, + "learning_rate": 2.3021457736223412e-06, + "loss": 3.0371, + "num_input_tokens_seen": 5142336, + "step": 7870 + }, + { + "epoch": 0.8632028937849392, + "grad_norm": 5.719668388366699, + "learning_rate": 2.2841368726214755e-06, + "loss": 3.1793, + "num_input_tokens_seen": 5145504, + "step": 7875 + }, + { + "epoch": 0.8637509591143264, + "grad_norm": 6.815168380737305, + "learning_rate": 2.2661953147882024e-06, + "loss": 3.2501, + "num_input_tokens_seen": 5148672, + "step": 7880 + }, + { + "epoch": 0.8642990244437136, + "grad_norm": 6.836389541625977, + "learning_rate": 2.2483211533118357e-06, + "loss": 3.2825, + "num_input_tokens_seen": 5152104, + "step": 7885 + }, + { + "epoch": 0.864847089773101, + "grad_norm": 9.11992359161377, + "learning_rate": 2.2305144411819052e-06, + "loss": 3.1458, + "num_input_tokens_seen": 5154840, + "step": 7890 + }, + { + "epoch": 0.8653951551024882, + "grad_norm": 7.1421308517456055, + "learning_rate": 2.212775231187966e-06, + "loss": 3.2977, + "num_input_tokens_seen": 5157496, + "step": 7895 + }, + { + "epoch": 0.8659432204318754, + "grad_norm": 6.900385856628418, + "learning_rate": 2.1951035759194605e-06, + "loss": 2.9658, + "num_input_tokens_seen": 5161824, + "step": 7900 + }, + { + "epoch": 0.8664912857612628, + "grad_norm": 8.681853294372559, + "learning_rate": 2.1774995277655556e-06, + "loss": 2.9868, + "num_input_tokens_seen": 5164840, + "step": 7905 + }, + { + "epoch": 0.86703935109065, + "grad_norm": 6.421346187591553, + "learning_rate": 2.1599631389150027e-06, + "loss": 3.3, + "num_input_tokens_seen": 5169320, + "step": 7910 + }, + { + "epoch": 0.8675874164200372, + "grad_norm": 6.86265754699707, + "learning_rate": 2.1424944613559537e-06, + "loss": 3.1633, + "num_input_tokens_seen": 5172784, + "step": 7915 + }, + { + "epoch": 0.8681354817494246, + "grad_norm": 4.766587257385254, + "learning_rate": 2.1250935468758446e-06, + "loss": 3.2877, + "num_input_tokens_seen": 5175600, + "step": 7920 + }, + { + "epoch": 0.8686835470788118, + "grad_norm": 6.533714771270752, + "learning_rate": 2.1077604470612106e-06, + "loss": 2.9995, + "num_input_tokens_seen": 5178624, + "step": 7925 + }, + { + "epoch": 0.869231612408199, + "grad_norm": 7.438570022583008, + "learning_rate": 2.0904952132975386e-06, + "loss": 2.7973, + "num_input_tokens_seen": 5181688, + "step": 7930 + }, + { + "epoch": 0.8697796777375864, + "grad_norm": 7.600935459136963, + "learning_rate": 2.0732978967691357e-06, + "loss": 3.4927, + "num_input_tokens_seen": 5184008, + "step": 7935 + }, + { + "epoch": 0.8703277430669736, + "grad_norm": 10.930978775024414, + "learning_rate": 2.0561685484589506e-06, + "loss": 3.0121, + "num_input_tokens_seen": 5187600, + "step": 7940 + }, + { + "epoch": 0.8708758083963608, + "grad_norm": 8.671449661254883, + "learning_rate": 2.0391072191484338e-06, + "loss": 3.1692, + "num_input_tokens_seen": 5190976, + "step": 7945 + }, + { + "epoch": 0.8714238737257481, + "grad_norm": 9.432777404785156, + "learning_rate": 2.0221139594174018e-06, + "loss": 3.0802, + "num_input_tokens_seen": 5193664, + "step": 7950 + }, + { + "epoch": 0.8719719390551354, + "grad_norm": 8.096484184265137, + "learning_rate": 2.0051888196438552e-06, + "loss": 2.8438, + "num_input_tokens_seen": 5196696, + "step": 7955 + }, + { + "epoch": 0.8725200043845226, + "grad_norm": 8.458807945251465, + "learning_rate": 1.988331850003855e-06, + "loss": 3.4075, + "num_input_tokens_seen": 5200640, + "step": 7960 + }, + { + "epoch": 0.8730680697139099, + "grad_norm": 9.191377639770508, + "learning_rate": 1.971543100471368e-06, + "loss": 3.276, + "num_input_tokens_seen": 5204240, + "step": 7965 + }, + { + "epoch": 0.8736161350432972, + "grad_norm": 6.790607929229736, + "learning_rate": 1.954822620818114e-06, + "loss": 2.9706, + "num_input_tokens_seen": 5208024, + "step": 7970 + }, + { + "epoch": 0.8741642003726844, + "grad_norm": 7.511916637420654, + "learning_rate": 1.938170460613417e-06, + "loss": 2.8037, + "num_input_tokens_seen": 5211272, + "step": 7975 + }, + { + "epoch": 0.8747122657020717, + "grad_norm": 6.600817680358887, + "learning_rate": 1.921586669224071e-06, + "loss": 3.3576, + "num_input_tokens_seen": 5215392, + "step": 7980 + }, + { + "epoch": 0.875260331031459, + "grad_norm": 5.347980976104736, + "learning_rate": 1.9050712958141758e-06, + "loss": 3.3071, + "num_input_tokens_seen": 5217928, + "step": 7985 + }, + { + "epoch": 0.8758083963608462, + "grad_norm": 6.689899921417236, + "learning_rate": 1.8886243893450061e-06, + "loss": 3.2119, + "num_input_tokens_seen": 5220984, + "step": 7990 + }, + { + "epoch": 0.8763564616902335, + "grad_norm": 6.363076210021973, + "learning_rate": 1.8722459985748563e-06, + "loss": 2.9524, + "num_input_tokens_seen": 5224504, + "step": 7995 + }, + { + "epoch": 0.8769045270196207, + "grad_norm": 7.521759986877441, + "learning_rate": 1.8559361720588974e-06, + "loss": 3.1379, + "num_input_tokens_seen": 5227336, + "step": 8000 + }, + { + "epoch": 0.877452592349008, + "grad_norm": 8.488334655761719, + "learning_rate": 1.8396949581490463e-06, + "loss": 3.2758, + "num_input_tokens_seen": 5229968, + "step": 8005 + }, + { + "epoch": 0.8780006576783953, + "grad_norm": 7.164643287658691, + "learning_rate": 1.8235224049938049e-06, + "loss": 3.0142, + "num_input_tokens_seen": 5233280, + "step": 8010 + }, + { + "epoch": 0.8785487230077825, + "grad_norm": 8.150335311889648, + "learning_rate": 1.8074185605381239e-06, + "loss": 3.2278, + "num_input_tokens_seen": 5236408, + "step": 8015 + }, + { + "epoch": 0.8790967883371698, + "grad_norm": 9.74315357208252, + "learning_rate": 1.791383472523256e-06, + "loss": 3.3009, + "num_input_tokens_seen": 5240040, + "step": 8020 + }, + { + "epoch": 0.8796448536665571, + "grad_norm": 6.548309326171875, + "learning_rate": 1.7754171884866362e-06, + "loss": 3.0949, + "num_input_tokens_seen": 5243480, + "step": 8025 + }, + { + "epoch": 0.8801929189959443, + "grad_norm": 6.918182373046875, + "learning_rate": 1.7595197557617044e-06, + "loss": 3.1496, + "num_input_tokens_seen": 5246664, + "step": 8030 + }, + { + "epoch": 0.8807409843253315, + "grad_norm": 6.263129711151123, + "learning_rate": 1.7436912214777945e-06, + "loss": 2.9099, + "num_input_tokens_seen": 5249392, + "step": 8035 + }, + { + "epoch": 0.8812890496547189, + "grad_norm": 8.55476188659668, + "learning_rate": 1.7279316325599898e-06, + "loss": 2.8569, + "num_input_tokens_seen": 5252584, + "step": 8040 + }, + { + "epoch": 0.8818371149841061, + "grad_norm": 7.661272048950195, + "learning_rate": 1.7122410357289703e-06, + "loss": 2.9037, + "num_input_tokens_seen": 5256184, + "step": 8045 + }, + { + "epoch": 0.8823851803134933, + "grad_norm": 5.52952766418457, + "learning_rate": 1.6966194775008798e-06, + "loss": 3.0452, + "num_input_tokens_seen": 5260048, + "step": 8050 + }, + { + "epoch": 0.8829332456428807, + "grad_norm": 8.354534149169922, + "learning_rate": 1.6810670041872062e-06, + "loss": 3.005, + "num_input_tokens_seen": 5264288, + "step": 8055 + }, + { + "epoch": 0.8834813109722679, + "grad_norm": 7.364735126495361, + "learning_rate": 1.6655836618946151e-06, + "loss": 3.1181, + "num_input_tokens_seen": 5268000, + "step": 8060 + }, + { + "epoch": 0.8840293763016551, + "grad_norm": 7.844119071960449, + "learning_rate": 1.650169496524831e-06, + "loss": 2.9376, + "num_input_tokens_seen": 5270984, + "step": 8065 + }, + { + "epoch": 0.8845774416310425, + "grad_norm": 5.87100076675415, + "learning_rate": 1.6348245537745028e-06, + "loss": 3.1916, + "num_input_tokens_seen": 5274448, + "step": 8070 + }, + { + "epoch": 0.8851255069604297, + "grad_norm": 7.44371223449707, + "learning_rate": 1.6195488791350548e-06, + "loss": 2.9924, + "num_input_tokens_seen": 5277432, + "step": 8075 + }, + { + "epoch": 0.8856735722898169, + "grad_norm": 6.34487771987915, + "learning_rate": 1.6043425178925652e-06, + "loss": 3.0224, + "num_input_tokens_seen": 5279944, + "step": 8080 + }, + { + "epoch": 0.8862216376192042, + "grad_norm": 5.726871490478516, + "learning_rate": 1.5892055151276258e-06, + "loss": 2.7579, + "num_input_tokens_seen": 5283720, + "step": 8085 + }, + { + "epoch": 0.8867697029485915, + "grad_norm": 9.92805004119873, + "learning_rate": 1.574137915715207e-06, + "loss": 3.0515, + "num_input_tokens_seen": 5286392, + "step": 8090 + }, + { + "epoch": 0.8873177682779787, + "grad_norm": 9.383995056152344, + "learning_rate": 1.559139764324527e-06, + "loss": 3.3639, + "num_input_tokens_seen": 5289440, + "step": 8095 + }, + { + "epoch": 0.887865833607366, + "grad_norm": 6.371479034423828, + "learning_rate": 1.5442111054189246e-06, + "loss": 3.0694, + "num_input_tokens_seen": 5293168, + "step": 8100 + }, + { + "epoch": 0.8884138989367533, + "grad_norm": 7.600619316101074, + "learning_rate": 1.5293519832557113e-06, + "loss": 3.1645, + "num_input_tokens_seen": 5296272, + "step": 8105 + }, + { + "epoch": 0.8889619642661405, + "grad_norm": 10.624588966369629, + "learning_rate": 1.5145624418860637e-06, + "loss": 2.9331, + "num_input_tokens_seen": 5299248, + "step": 8110 + }, + { + "epoch": 0.8895100295955278, + "grad_norm": 6.536969184875488, + "learning_rate": 1.4998425251548654e-06, + "loss": 2.962, + "num_input_tokens_seen": 5302376, + "step": 8115 + }, + { + "epoch": 0.890058094924915, + "grad_norm": 5.556844234466553, + "learning_rate": 1.4851922767006088e-06, + "loss": 2.9318, + "num_input_tokens_seen": 5305704, + "step": 8120 + }, + { + "epoch": 0.8906061602543023, + "grad_norm": 7.522222995758057, + "learning_rate": 1.4706117399552383e-06, + "loss": 3.0438, + "num_input_tokens_seen": 5308112, + "step": 8125 + }, + { + "epoch": 0.8911542255836896, + "grad_norm": 9.176352500915527, + "learning_rate": 1.4561009581440272e-06, + "loss": 3.0732, + "num_input_tokens_seen": 5310768, + "step": 8130 + }, + { + "epoch": 0.8917022909130768, + "grad_norm": 6.739439010620117, + "learning_rate": 1.441659974285467e-06, + "loss": 3.0154, + "num_input_tokens_seen": 5313544, + "step": 8135 + }, + { + "epoch": 0.8922503562424641, + "grad_norm": 6.810214042663574, + "learning_rate": 1.4272888311911176e-06, + "loss": 3.0619, + "num_input_tokens_seen": 5316352, + "step": 8140 + }, + { + "epoch": 0.8927984215718514, + "grad_norm": 5.931697368621826, + "learning_rate": 1.4129875714654905e-06, + "loss": 3.3196, + "num_input_tokens_seen": 5320160, + "step": 8145 + }, + { + "epoch": 0.8933464869012386, + "grad_norm": 7.526365280151367, + "learning_rate": 1.398756237505927e-06, + "loss": 2.9404, + "num_input_tokens_seen": 5323560, + "step": 8150 + }, + { + "epoch": 0.8938945522306259, + "grad_norm": 6.762884616851807, + "learning_rate": 1.3845948715024648e-06, + "loss": 3.2493, + "num_input_tokens_seen": 5326504, + "step": 8155 + }, + { + "epoch": 0.8944426175600132, + "grad_norm": 4.969104290008545, + "learning_rate": 1.37050351543771e-06, + "loss": 3.3379, + "num_input_tokens_seen": 5329424, + "step": 8160 + }, + { + "epoch": 0.8949906828894004, + "grad_norm": 6.4593586921691895, + "learning_rate": 1.3564822110867264e-06, + "loss": 3.2228, + "num_input_tokens_seen": 5332600, + "step": 8165 + }, + { + "epoch": 0.8955387482187877, + "grad_norm": 7.721135139465332, + "learning_rate": 1.3425310000169028e-06, + "loss": 3.2133, + "num_input_tokens_seen": 5335792, + "step": 8170 + }, + { + "epoch": 0.896086813548175, + "grad_norm": 8.572230339050293, + "learning_rate": 1.3286499235878214e-06, + "loss": 3.1945, + "num_input_tokens_seen": 5339616, + "step": 8175 + }, + { + "epoch": 0.8966348788775622, + "grad_norm": 7.773857593536377, + "learning_rate": 1.3148390229511532e-06, + "loss": 2.9125, + "num_input_tokens_seen": 5342320, + "step": 8180 + }, + { + "epoch": 0.8971829442069494, + "grad_norm": 7.451086521148682, + "learning_rate": 1.3010983390505244e-06, + "loss": 3.1514, + "num_input_tokens_seen": 5345336, + "step": 8185 + }, + { + "epoch": 0.8977310095363368, + "grad_norm": 7.28810453414917, + "learning_rate": 1.2874279126213973e-06, + "loss": 3.1191, + "num_input_tokens_seen": 5348880, + "step": 8190 + }, + { + "epoch": 0.898279074865724, + "grad_norm": 4.2049078941345215, + "learning_rate": 1.2738277841909479e-06, + "loss": 2.9685, + "num_input_tokens_seen": 5352936, + "step": 8195 + }, + { + "epoch": 0.8988271401951112, + "grad_norm": 7.404577732086182, + "learning_rate": 1.2602979940779524e-06, + "loss": 3.107, + "num_input_tokens_seen": 5355952, + "step": 8200 + }, + { + "epoch": 0.8993752055244986, + "grad_norm": 11.230597496032715, + "learning_rate": 1.2468385823926481e-06, + "loss": 2.9561, + "num_input_tokens_seen": 5359608, + "step": 8205 + }, + { + "epoch": 0.8999232708538858, + "grad_norm": 8.928146362304688, + "learning_rate": 1.233449589036656e-06, + "loss": 3.172, + "num_input_tokens_seen": 5363024, + "step": 8210 + }, + { + "epoch": 0.900471336183273, + "grad_norm": 5.939243316650391, + "learning_rate": 1.2201310537028138e-06, + "loss": 3.0996, + "num_input_tokens_seen": 5366928, + "step": 8215 + }, + { + "epoch": 0.9010194015126604, + "grad_norm": 7.374519348144531, + "learning_rate": 1.206883015875085e-06, + "loss": 3.0966, + "num_input_tokens_seen": 5369984, + "step": 8220 + }, + { + "epoch": 0.9015674668420476, + "grad_norm": 8.059386253356934, + "learning_rate": 1.1937055148284444e-06, + "loss": 3.0717, + "num_input_tokens_seen": 5372632, + "step": 8225 + }, + { + "epoch": 0.9021155321714348, + "grad_norm": 8.80373764038086, + "learning_rate": 1.1805985896287452e-06, + "loss": 3.1543, + "num_input_tokens_seen": 5375544, + "step": 8230 + }, + { + "epoch": 0.9026635975008221, + "grad_norm": 6.8497443199157715, + "learning_rate": 1.1675622791326169e-06, + "loss": 2.9531, + "num_input_tokens_seen": 5378856, + "step": 8235 + }, + { + "epoch": 0.9032116628302094, + "grad_norm": 7.791383266448975, + "learning_rate": 1.1545966219873444e-06, + "loss": 2.9187, + "num_input_tokens_seen": 5382752, + "step": 8240 + }, + { + "epoch": 0.9037597281595966, + "grad_norm": 6.825507640838623, + "learning_rate": 1.1417016566307586e-06, + "loss": 2.8782, + "num_input_tokens_seen": 5386080, + "step": 8245 + }, + { + "epoch": 0.9043077934889839, + "grad_norm": 6.135127544403076, + "learning_rate": 1.1288774212911052e-06, + "loss": 2.8879, + "num_input_tokens_seen": 5389680, + "step": 8250 + }, + { + "epoch": 0.9048558588183712, + "grad_norm": 8.292460441589355, + "learning_rate": 1.1161239539869668e-06, + "loss": 2.9108, + "num_input_tokens_seen": 5393112, + "step": 8255 + }, + { + "epoch": 0.9054039241477584, + "grad_norm": 6.192307949066162, + "learning_rate": 1.1034412925271075e-06, + "loss": 2.72, + "num_input_tokens_seen": 5397056, + "step": 8260 + }, + { + "epoch": 0.9059519894771457, + "grad_norm": 6.773381233215332, + "learning_rate": 1.0908294745103882e-06, + "loss": 2.7747, + "num_input_tokens_seen": 5400928, + "step": 8265 + }, + { + "epoch": 0.906500054806533, + "grad_norm": 9.411810874938965, + "learning_rate": 1.078288537325653e-06, + "loss": 3.1762, + "num_input_tokens_seen": 5403744, + "step": 8270 + }, + { + "epoch": 0.9070481201359202, + "grad_norm": 5.909646511077881, + "learning_rate": 1.0658185181516094e-06, + "loss": 2.9356, + "num_input_tokens_seen": 5406888, + "step": 8275 + }, + { + "epoch": 0.9075961854653075, + "grad_norm": 8.18594741821289, + "learning_rate": 1.0534194539567194e-06, + "loss": 3.0487, + "num_input_tokens_seen": 5409856, + "step": 8280 + }, + { + "epoch": 0.9081442507946947, + "grad_norm": 10.775045394897461, + "learning_rate": 1.0410913814990985e-06, + "loss": 2.8025, + "num_input_tokens_seen": 5412416, + "step": 8285 + }, + { + "epoch": 0.908692316124082, + "grad_norm": 8.237727165222168, + "learning_rate": 1.0288343373263954e-06, + "loss": 3.0227, + "num_input_tokens_seen": 5415176, + "step": 8290 + }, + { + "epoch": 0.9092403814534693, + "grad_norm": 7.0511884689331055, + "learning_rate": 1.016648357775693e-06, + "loss": 2.8189, + "num_input_tokens_seen": 5418552, + "step": 8295 + }, + { + "epoch": 0.9097884467828565, + "grad_norm": 6.959300518035889, + "learning_rate": 1.004533478973399e-06, + "loss": 3.3864, + "num_input_tokens_seen": 5421712, + "step": 8300 + }, + { + "epoch": 0.9103365121122438, + "grad_norm": 7.333334922790527, + "learning_rate": 9.924897368351282e-07, + "loss": 3.1543, + "num_input_tokens_seen": 5425312, + "step": 8305 + }, + { + "epoch": 0.9108845774416311, + "grad_norm": 7.005816459655762, + "learning_rate": 9.805171670656117e-07, + "loss": 3.1113, + "num_input_tokens_seen": 5428680, + "step": 8310 + }, + { + "epoch": 0.9114326427710183, + "grad_norm": 5.512388229370117, + "learning_rate": 9.686158051585874e-07, + "loss": 3.0001, + "num_input_tokens_seen": 5431848, + "step": 8315 + }, + { + "epoch": 0.9119807081004055, + "grad_norm": 6.378774642944336, + "learning_rate": 9.56785686396683e-07, + "loss": 3.1063, + "num_input_tokens_seen": 5434648, + "step": 8320 + }, + { + "epoch": 0.9125287734297929, + "grad_norm": 6.719765663146973, + "learning_rate": 9.450268458513156e-07, + "loss": 2.7967, + "num_input_tokens_seen": 5438728, + "step": 8325 + }, + { + "epoch": 0.9130768387591801, + "grad_norm": 8.518233299255371, + "learning_rate": 9.333393183826089e-07, + "loss": 2.7597, + "num_input_tokens_seen": 5442232, + "step": 8330 + }, + { + "epoch": 0.9136249040885673, + "grad_norm": 7.718142986297607, + "learning_rate": 9.217231386392577e-07, + "loss": 3.5149, + "num_input_tokens_seen": 5445320, + "step": 8335 + }, + { + "epoch": 0.9141729694179547, + "grad_norm": 7.286013603210449, + "learning_rate": 9.101783410584458e-07, + "loss": 3.2542, + "num_input_tokens_seen": 5448280, + "step": 8340 + }, + { + "epoch": 0.9147210347473419, + "grad_norm": 6.524003028869629, + "learning_rate": 8.987049598657398e-07, + "loss": 3.0042, + "num_input_tokens_seen": 5452360, + "step": 8345 + }, + { + "epoch": 0.9152691000767291, + "grad_norm": 6.262417316436768, + "learning_rate": 8.87303029074979e-07, + "loss": 2.6819, + "num_input_tokens_seen": 5455872, + "step": 8350 + }, + { + "epoch": 0.9158171654061165, + "grad_norm": 6.51323127746582, + "learning_rate": 8.75972582488191e-07, + "loss": 3.1662, + "num_input_tokens_seen": 5458616, + "step": 8355 + }, + { + "epoch": 0.9163652307355037, + "grad_norm": 7.502628803253174, + "learning_rate": 8.647136536954787e-07, + "loss": 2.4922, + "num_input_tokens_seen": 5461408, + "step": 8360 + }, + { + "epoch": 0.9169132960648909, + "grad_norm": 6.768873691558838, + "learning_rate": 8.535262760749202e-07, + "loss": 2.7696, + "num_input_tokens_seen": 5466664, + "step": 8365 + }, + { + "epoch": 0.9174613613942783, + "grad_norm": 9.054154396057129, + "learning_rate": 8.4241048279248e-07, + "loss": 3.3125, + "num_input_tokens_seen": 5469400, + "step": 8370 + }, + { + "epoch": 0.9180094267236655, + "grad_norm": 7.729340076446533, + "learning_rate": 8.313663068019007e-07, + "loss": 3.383, + "num_input_tokens_seen": 5472936, + "step": 8375 + }, + { + "epoch": 0.9185574920530527, + "grad_norm": 8.844609260559082, + "learning_rate": 8.203937808446083e-07, + "loss": 2.7089, + "num_input_tokens_seen": 5476176, + "step": 8380 + }, + { + "epoch": 0.91910555738244, + "grad_norm": 7.043740272521973, + "learning_rate": 8.094929374496185e-07, + "loss": 3.2024, + "num_input_tokens_seen": 5479576, + "step": 8385 + }, + { + "epoch": 0.9196536227118273, + "grad_norm": 8.144498825073242, + "learning_rate": 7.986638089334392e-07, + "loss": 3.4681, + "num_input_tokens_seen": 5483592, + "step": 8390 + }, + { + "epoch": 0.9202016880412145, + "grad_norm": 7.295477867126465, + "learning_rate": 7.879064273999731e-07, + "loss": 3.3592, + "num_input_tokens_seen": 5486736, + "step": 8395 + }, + { + "epoch": 0.9207497533706018, + "grad_norm": 6.9401960372924805, + "learning_rate": 7.772208247404128e-07, + "loss": 2.8916, + "num_input_tokens_seen": 5489720, + "step": 8400 + }, + { + "epoch": 0.9212978186999891, + "grad_norm": 5.044391632080078, + "learning_rate": 7.666070326331709e-07, + "loss": 2.9984, + "num_input_tokens_seen": 5494312, + "step": 8405 + }, + { + "epoch": 0.9218458840293763, + "grad_norm": 7.426214218139648, + "learning_rate": 7.560650825437637e-07, + "loss": 2.6398, + "num_input_tokens_seen": 5498536, + "step": 8410 + }, + { + "epoch": 0.9223939493587635, + "grad_norm": 6.066382884979248, + "learning_rate": 7.455950057247252e-07, + "loss": 3.0293, + "num_input_tokens_seen": 5501256, + "step": 8415 + }, + { + "epoch": 0.9229420146881508, + "grad_norm": 6.4779181480407715, + "learning_rate": 7.351968332155152e-07, + "loss": 3.0215, + "num_input_tokens_seen": 5504440, + "step": 8420 + }, + { + "epoch": 0.9234900800175381, + "grad_norm": 5.473248481750488, + "learning_rate": 7.248705958424307e-07, + "loss": 2.9114, + "num_input_tokens_seen": 5507752, + "step": 8425 + }, + { + "epoch": 0.9240381453469253, + "grad_norm": 7.87445592880249, + "learning_rate": 7.146163242185033e-07, + "loss": 3.0642, + "num_input_tokens_seen": 5511168, + "step": 8430 + }, + { + "epoch": 0.9245862106763126, + "grad_norm": 7.2715959548950195, + "learning_rate": 7.044340487434242e-07, + "loss": 3.0391, + "num_input_tokens_seen": 5513984, + "step": 8435 + }, + { + "epoch": 0.9251342760056999, + "grad_norm": 7.839521408081055, + "learning_rate": 6.943237996034386e-07, + "loss": 3.2316, + "num_input_tokens_seen": 5516632, + "step": 8440 + }, + { + "epoch": 0.9256823413350871, + "grad_norm": 7.8146820068359375, + "learning_rate": 6.842856067712677e-07, + "loss": 3.0688, + "num_input_tokens_seen": 5520488, + "step": 8445 + }, + { + "epoch": 0.9262304066644744, + "grad_norm": 7.480862140655518, + "learning_rate": 6.743195000060154e-07, + "loss": 2.8072, + "num_input_tokens_seen": 5524136, + "step": 8450 + }, + { + "epoch": 0.9267784719938617, + "grad_norm": 6.187289237976074, + "learning_rate": 6.644255088530782e-07, + "loss": 3.1597, + "num_input_tokens_seen": 5528256, + "step": 8455 + }, + { + "epoch": 0.9273265373232489, + "grad_norm": 7.108201026916504, + "learning_rate": 6.546036626440599e-07, + "loss": 2.8195, + "num_input_tokens_seen": 5531368, + "step": 8460 + }, + { + "epoch": 0.9278746026526362, + "grad_norm": 9.429540634155273, + "learning_rate": 6.448539904966827e-07, + "loss": 3.1321, + "num_input_tokens_seen": 5534144, + "step": 8465 + }, + { + "epoch": 0.9284226679820234, + "grad_norm": 6.745710849761963, + "learning_rate": 6.351765213147037e-07, + "loss": 2.8217, + "num_input_tokens_seen": 5536848, + "step": 8470 + }, + { + "epoch": 0.9289707333114107, + "grad_norm": 6.650664806365967, + "learning_rate": 6.255712837878347e-07, + "loss": 3.1658, + "num_input_tokens_seen": 5540136, + "step": 8475 + }, + { + "epoch": 0.929518798640798, + "grad_norm": 7.63946008682251, + "learning_rate": 6.160383063916419e-07, + "loss": 3.1177, + "num_input_tokens_seen": 5543192, + "step": 8480 + }, + { + "epoch": 0.9300668639701852, + "grad_norm": 7.223082542419434, + "learning_rate": 6.065776173874687e-07, + "loss": 3.6049, + "num_input_tokens_seen": 5547392, + "step": 8485 + }, + { + "epoch": 0.9306149292995725, + "grad_norm": 7.673356533050537, + "learning_rate": 5.971892448223576e-07, + "loss": 2.8851, + "num_input_tokens_seen": 5550056, + "step": 8490 + }, + { + "epoch": 0.9311629946289598, + "grad_norm": 7.799294471740723, + "learning_rate": 5.878732165289668e-07, + "loss": 3.2135, + "num_input_tokens_seen": 5552728, + "step": 8495 + }, + { + "epoch": 0.931711059958347, + "grad_norm": 5.8991312980651855, + "learning_rate": 5.786295601254765e-07, + "loss": 3.5495, + "num_input_tokens_seen": 5556008, + "step": 8500 + }, + { + "epoch": 0.9322591252877342, + "grad_norm": 8.919817924499512, + "learning_rate": 5.694583030155131e-07, + "loss": 3.2696, + "num_input_tokens_seen": 5558680, + "step": 8505 + }, + { + "epoch": 0.9328071906171216, + "grad_norm": 6.0595293045043945, + "learning_rate": 5.60359472388075e-07, + "loss": 3.1983, + "num_input_tokens_seen": 5561976, + "step": 8510 + }, + { + "epoch": 0.9333552559465088, + "grad_norm": 7.8532185554504395, + "learning_rate": 5.513330952174462e-07, + "loss": 2.8831, + "num_input_tokens_seen": 5565032, + "step": 8515 + }, + { + "epoch": 0.933903321275896, + "grad_norm": 6.592312335968018, + "learning_rate": 5.423791982631071e-07, + "loss": 3.2783, + "num_input_tokens_seen": 5567976, + "step": 8520 + }, + { + "epoch": 0.9344513866052834, + "grad_norm": 5.455694198608398, + "learning_rate": 5.334978080696773e-07, + "loss": 2.3299, + "num_input_tokens_seen": 5572544, + "step": 8525 + }, + { + "epoch": 0.9349994519346706, + "grad_norm": 6.956151008605957, + "learning_rate": 5.246889509668118e-07, + "loss": 3.0221, + "num_input_tokens_seen": 5575256, + "step": 8530 + }, + { + "epoch": 0.9355475172640578, + "grad_norm": 7.278057098388672, + "learning_rate": 5.159526530691378e-07, + "loss": 3.2783, + "num_input_tokens_seen": 5577928, + "step": 8535 + }, + { + "epoch": 0.9360955825934452, + "grad_norm": 5.909106731414795, + "learning_rate": 5.072889402761821e-07, + "loss": 3.2452, + "num_input_tokens_seen": 5580632, + "step": 8540 + }, + { + "epoch": 0.9366436479228324, + "grad_norm": 6.952794075012207, + "learning_rate": 4.986978382722773e-07, + "loss": 3.0232, + "num_input_tokens_seen": 5584824, + "step": 8545 + }, + { + "epoch": 0.9371917132522196, + "grad_norm": 8.14654541015625, + "learning_rate": 4.901793725264975e-07, + "loss": 3.0803, + "num_input_tokens_seen": 5589208, + "step": 8550 + }, + { + "epoch": 0.937739778581607, + "grad_norm": 6.610713958740234, + "learning_rate": 4.817335682925805e-07, + "loss": 2.8802, + "num_input_tokens_seen": 5592056, + "step": 8555 + }, + { + "epoch": 0.9382878439109942, + "grad_norm": 10.567109107971191, + "learning_rate": 4.73360450608859e-07, + "loss": 3.3952, + "num_input_tokens_seen": 5595120, + "step": 8560 + }, + { + "epoch": 0.9388359092403814, + "grad_norm": 7.1954545974731445, + "learning_rate": 4.6506004429817117e-07, + "loss": 3.2835, + "num_input_tokens_seen": 5598408, + "step": 8565 + }, + { + "epoch": 0.9393839745697687, + "grad_norm": 7.200895309448242, + "learning_rate": 4.568323739677971e-07, + "loss": 3.2721, + "num_input_tokens_seen": 5602328, + "step": 8570 + }, + { + "epoch": 0.939932039899156, + "grad_norm": 7.637218952178955, + "learning_rate": 4.486774640093894e-07, + "loss": 3.0411, + "num_input_tokens_seen": 5606096, + "step": 8575 + }, + { + "epoch": 0.9404801052285432, + "grad_norm": 8.214374542236328, + "learning_rate": 4.405953385988898e-07, + "loss": 3.1399, + "num_input_tokens_seen": 5608544, + "step": 8580 + }, + { + "epoch": 0.9410281705579305, + "grad_norm": 7.163279056549072, + "learning_rate": 4.325860216964711e-07, + "loss": 2.7451, + "num_input_tokens_seen": 5611872, + "step": 8585 + }, + { + "epoch": 0.9415762358873178, + "grad_norm": 7.930347919464111, + "learning_rate": 4.2464953704645647e-07, + "loss": 2.9838, + "num_input_tokens_seen": 5614440, + "step": 8590 + }, + { + "epoch": 0.942124301216705, + "grad_norm": 4.849373817443848, + "learning_rate": 4.167859081772446e-07, + "loss": 2.9805, + "num_input_tokens_seen": 5617856, + "step": 8595 + }, + { + "epoch": 0.9426723665460923, + "grad_norm": 8.461563110351562, + "learning_rate": 4.0899515840125966e-07, + "loss": 3.2951, + "num_input_tokens_seen": 5620824, + "step": 8600 + }, + { + "epoch": 0.9432204318754795, + "grad_norm": 8.734384536743164, + "learning_rate": 4.0127731081485987e-07, + "loss": 3.3802, + "num_input_tokens_seen": 5624696, + "step": 8605 + }, + { + "epoch": 0.9437684972048668, + "grad_norm": 9.480766296386719, + "learning_rate": 3.936323882982762e-07, + "loss": 2.8742, + "num_input_tokens_seen": 5628648, + "step": 8610 + }, + { + "epoch": 0.9443165625342541, + "grad_norm": 8.393555641174316, + "learning_rate": 3.8606041351555986e-07, + "loss": 3.3445, + "num_input_tokens_seen": 5631048, + "step": 8615 + }, + { + "epoch": 0.9448646278636413, + "grad_norm": 5.754420757293701, + "learning_rate": 3.785614089144879e-07, + "loss": 3.2994, + "num_input_tokens_seen": 5634840, + "step": 8620 + }, + { + "epoch": 0.9454126931930286, + "grad_norm": 7.406842231750488, + "learning_rate": 3.7113539672651853e-07, + "loss": 3.2169, + "num_input_tokens_seen": 5639056, + "step": 8625 + }, + { + "epoch": 0.9459607585224159, + "grad_norm": 8.346644401550293, + "learning_rate": 3.637823989667166e-07, + "loss": 3.5016, + "num_input_tokens_seen": 5642368, + "step": 8630 + }, + { + "epoch": 0.9465088238518031, + "grad_norm": 6.256731033325195, + "learning_rate": 3.565024374336895e-07, + "loss": 2.9251, + "num_input_tokens_seen": 5645288, + "step": 8635 + }, + { + "epoch": 0.9470568891811904, + "grad_norm": 8.30922794342041, + "learning_rate": 3.4929553370951496e-07, + "loss": 2.897, + "num_input_tokens_seen": 5648256, + "step": 8640 + }, + { + "epoch": 0.9476049545105777, + "grad_norm": 5.839921951293945, + "learning_rate": 3.421617091596996e-07, + "loss": 3.0709, + "num_input_tokens_seen": 5651456, + "step": 8645 + }, + { + "epoch": 0.9481530198399649, + "grad_norm": 8.873268127441406, + "learning_rate": 3.3510098493308715e-07, + "loss": 2.8349, + "num_input_tokens_seen": 5654936, + "step": 8650 + }, + { + "epoch": 0.9487010851693521, + "grad_norm": 7.447127342224121, + "learning_rate": 3.2811338196181706e-07, + "loss": 3.1457, + "num_input_tokens_seen": 5658344, + "step": 8655 + }, + { + "epoch": 0.9492491504987395, + "grad_norm": 7.901216506958008, + "learning_rate": 3.211989209612437e-07, + "loss": 3.0331, + "num_input_tokens_seen": 5661088, + "step": 8660 + }, + { + "epoch": 0.9497972158281267, + "grad_norm": 6.363575458526611, + "learning_rate": 3.1435762242990053e-07, + "loss": 3.0904, + "num_input_tokens_seen": 5664544, + "step": 8665 + }, + { + "epoch": 0.9503452811575139, + "grad_norm": 8.245457649230957, + "learning_rate": 3.0758950664940833e-07, + "loss": 2.9634, + "num_input_tokens_seen": 5667704, + "step": 8670 + }, + { + "epoch": 0.9508933464869013, + "grad_norm": 6.969222068786621, + "learning_rate": 3.008945936844504e-07, + "loss": 2.9006, + "num_input_tokens_seen": 5671088, + "step": 8675 + }, + { + "epoch": 0.9514414118162885, + "grad_norm": 9.956710815429688, + "learning_rate": 2.942729033826752e-07, + "loss": 3.3092, + "num_input_tokens_seen": 5673784, + "step": 8680 + }, + { + "epoch": 0.9519894771456757, + "grad_norm": 6.730470657348633, + "learning_rate": 2.877244553746633e-07, + "loss": 2.8794, + "num_input_tokens_seen": 5677024, + "step": 8685 + }, + { + "epoch": 0.9525375424750631, + "grad_norm": 7.628656387329102, + "learning_rate": 2.8124926907386885e-07, + "loss": 2.9683, + "num_input_tokens_seen": 5680552, + "step": 8690 + }, + { + "epoch": 0.9530856078044503, + "grad_norm": 8.587575912475586, + "learning_rate": 2.748473636765475e-07, + "loss": 3.0311, + "num_input_tokens_seen": 5684128, + "step": 8695 + }, + { + "epoch": 0.9536336731338375, + "grad_norm": 8.781567573547363, + "learning_rate": 2.6851875816170655e-07, + "loss": 2.9722, + "num_input_tokens_seen": 5687784, + "step": 8700 + }, + { + "epoch": 0.9541817384632248, + "grad_norm": 6.88287353515625, + "learning_rate": 2.622634712910521e-07, + "loss": 3.3128, + "num_input_tokens_seen": 5690464, + "step": 8705 + }, + { + "epoch": 0.9547298037926121, + "grad_norm": 7.1090874671936035, + "learning_rate": 2.560815216089335e-07, + "loss": 3.0189, + "num_input_tokens_seen": 5693312, + "step": 8710 + }, + { + "epoch": 0.9552778691219993, + "grad_norm": 7.3000168800354, + "learning_rate": 2.499729274422796e-07, + "loss": 3.5534, + "num_input_tokens_seen": 5697232, + "step": 8715 + }, + { + "epoch": 0.9558259344513866, + "grad_norm": 8.97269344329834, + "learning_rate": 2.439377069005544e-07, + "loss": 3.5597, + "num_input_tokens_seen": 5699808, + "step": 8720 + }, + { + "epoch": 0.9563739997807739, + "grad_norm": 8.973227500915527, + "learning_rate": 2.3797587787569852e-07, + "loss": 3.0848, + "num_input_tokens_seen": 5703784, + "step": 8725 + }, + { + "epoch": 0.9569220651101611, + "grad_norm": 7.142612934112549, + "learning_rate": 2.3208745804207398e-07, + "loss": 2.8029, + "num_input_tokens_seen": 5706344, + "step": 8730 + }, + { + "epoch": 0.9574701304395484, + "grad_norm": 8.567402839660645, + "learning_rate": 2.262724648564224e-07, + "loss": 3.3482, + "num_input_tokens_seen": 5710600, + "step": 8735 + }, + { + "epoch": 0.9580181957689357, + "grad_norm": 11.277481079101562, + "learning_rate": 2.2053091555779837e-07, + "loss": 3.0415, + "num_input_tokens_seen": 5714152, + "step": 8740 + }, + { + "epoch": 0.9585662610983229, + "grad_norm": 7.343226432800293, + "learning_rate": 2.1486282716752791e-07, + "loss": 3.0087, + "num_input_tokens_seen": 5716376, + "step": 8745 + }, + { + "epoch": 0.9591143264277102, + "grad_norm": 6.354895114898682, + "learning_rate": 2.0926821648915574e-07, + "loss": 3.0672, + "num_input_tokens_seen": 5719152, + "step": 8750 + }, + { + "epoch": 0.9596623917570974, + "grad_norm": 7.212831497192383, + "learning_rate": 2.0374710010839793e-07, + "loss": 3.3, + "num_input_tokens_seen": 5723064, + "step": 8755 + }, + { + "epoch": 0.9602104570864847, + "grad_norm": 6.967692852020264, + "learning_rate": 1.982994943930838e-07, + "loss": 3.1401, + "num_input_tokens_seen": 5725768, + "step": 8760 + }, + { + "epoch": 0.960758522415872, + "grad_norm": 8.500665664672852, + "learning_rate": 1.9292541549311983e-07, + "loss": 3.2358, + "num_input_tokens_seen": 5728104, + "step": 8765 + }, + { + "epoch": 0.9613065877452592, + "grad_norm": 7.204361915588379, + "learning_rate": 1.876248793404367e-07, + "loss": 2.9241, + "num_input_tokens_seen": 5730688, + "step": 8770 + }, + { + "epoch": 0.9618546530746465, + "grad_norm": 7.031684398651123, + "learning_rate": 1.8239790164893412e-07, + "loss": 3.2293, + "num_input_tokens_seen": 5733936, + "step": 8775 + }, + { + "epoch": 0.9624027184040338, + "grad_norm": 8.101325035095215, + "learning_rate": 1.7724449791444997e-07, + "loss": 2.7716, + "num_input_tokens_seen": 5737880, + "step": 8780 + }, + { + "epoch": 0.962950783733421, + "grad_norm": 6.74721622467041, + "learning_rate": 1.721646834146967e-07, + "loss": 2.715, + "num_input_tokens_seen": 5741936, + "step": 8785 + }, + { + "epoch": 0.9634988490628082, + "grad_norm": 9.26173210144043, + "learning_rate": 1.671584732092335e-07, + "loss": 2.8224, + "num_input_tokens_seen": 5746160, + "step": 8790 + }, + { + "epoch": 0.9640469143921956, + "grad_norm": 5.797330856323242, + "learning_rate": 1.6222588213940792e-07, + "loss": 3.3261, + "num_input_tokens_seen": 5750696, + "step": 8795 + }, + { + "epoch": 0.9645949797215828, + "grad_norm": 9.205500602722168, + "learning_rate": 1.5736692482831995e-07, + "loss": 2.9268, + "num_input_tokens_seen": 5753384, + "step": 8800 + }, + { + "epoch": 0.96514304505097, + "grad_norm": 6.270941257476807, + "learning_rate": 1.5258161568077188e-07, + "loss": 2.8041, + "num_input_tokens_seen": 5756640, + "step": 8805 + }, + { + "epoch": 0.9656911103803574, + "grad_norm": 7.947140693664551, + "learning_rate": 1.4786996888323524e-07, + "loss": 3.1006, + "num_input_tokens_seen": 5759848, + "step": 8810 + }, + { + "epoch": 0.9662391757097446, + "grad_norm": 8.765256881713867, + "learning_rate": 1.4323199840380053e-07, + "loss": 3.2065, + "num_input_tokens_seen": 5763416, + "step": 8815 + }, + { + "epoch": 0.9667872410391318, + "grad_norm": 5.335040092468262, + "learning_rate": 1.3866771799213307e-07, + "loss": 2.9768, + "num_input_tokens_seen": 5766160, + "step": 8820 + }, + { + "epoch": 0.9673353063685192, + "grad_norm": 5.483620643615723, + "learning_rate": 1.3417714117944513e-07, + "loss": 2.8682, + "num_input_tokens_seen": 5771024, + "step": 8825 + }, + { + "epoch": 0.9678833716979064, + "grad_norm": 8.511704444885254, + "learning_rate": 1.2976028127844597e-07, + "loss": 3.1851, + "num_input_tokens_seen": 5774632, + "step": 8830 + }, + { + "epoch": 0.9684314370272936, + "grad_norm": 6.916325569152832, + "learning_rate": 1.25417151383303e-07, + "loss": 3.2018, + "num_input_tokens_seen": 5778048, + "step": 8835 + }, + { + "epoch": 0.968979502356681, + "grad_norm": 6.791527271270752, + "learning_rate": 1.2114776436960294e-07, + "loss": 3.1153, + "num_input_tokens_seen": 5781288, + "step": 8840 + }, + { + "epoch": 0.9695275676860682, + "grad_norm": 7.304278373718262, + "learning_rate": 1.1695213289432406e-07, + "loss": 2.7359, + "num_input_tokens_seen": 5783776, + "step": 8845 + }, + { + "epoch": 0.9700756330154554, + "grad_norm": 7.467769145965576, + "learning_rate": 1.128302693957778e-07, + "loss": 3.1941, + "num_input_tokens_seen": 5786120, + "step": 8850 + }, + { + "epoch": 0.9706236983448427, + "grad_norm": 8.969725608825684, + "learning_rate": 1.0878218609359502e-07, + "loss": 3.0654, + "num_input_tokens_seen": 5789672, + "step": 8855 + }, + { + "epoch": 0.97117176367423, + "grad_norm": 8.292722702026367, + "learning_rate": 1.0480789498866772e-07, + "loss": 2.9517, + "num_input_tokens_seen": 5792480, + "step": 8860 + }, + { + "epoch": 0.9717198290036172, + "grad_norm": 5.788974285125732, + "learning_rate": 1.0090740786313502e-07, + "loss": 2.9964, + "num_input_tokens_seen": 5796848, + "step": 8865 + }, + { + "epoch": 0.9722678943330045, + "grad_norm": 8.003725051879883, + "learning_rate": 9.708073628033055e-08, + "loss": 2.8592, + "num_input_tokens_seen": 5801376, + "step": 8870 + }, + { + "epoch": 0.9728159596623918, + "grad_norm": 6.711467742919922, + "learning_rate": 9.332789158476018e-08, + "loss": 2.9653, + "num_input_tokens_seen": 5804480, + "step": 8875 + }, + { + "epoch": 0.973364024991779, + "grad_norm": 5.3671417236328125, + "learning_rate": 8.964888490205769e-08, + "loss": 3.1577, + "num_input_tokens_seen": 5807632, + "step": 8880 + }, + { + "epoch": 0.9739120903211663, + "grad_norm": 6.408278942108154, + "learning_rate": 8.604372713896247e-08, + "loss": 2.7764, + "num_input_tokens_seen": 5810096, + "step": 8885 + }, + { + "epoch": 0.9744601556505536, + "grad_norm": 8.041277885437012, + "learning_rate": 8.251242898328071e-08, + "loss": 3.2175, + "num_input_tokens_seen": 5813808, + "step": 8890 + }, + { + "epoch": 0.9750082209799408, + "grad_norm": 6.138535499572754, + "learning_rate": 7.905500090385487e-08, + "loss": 2.9364, + "num_input_tokens_seen": 5816552, + "step": 8895 + }, + { + "epoch": 0.9755562863093281, + "grad_norm": 8.328486442565918, + "learning_rate": 7.567145315053314e-08, + "loss": 3.163, + "num_input_tokens_seen": 5820568, + "step": 8900 + }, + { + "epoch": 0.9761043516387153, + "grad_norm": 9.473198890686035, + "learning_rate": 7.236179575414448e-08, + "loss": 3.2253, + "num_input_tokens_seen": 5823808, + "step": 8905 + }, + { + "epoch": 0.9766524169681026, + "grad_norm": 5.804590225219727, + "learning_rate": 6.912603852645138e-08, + "loss": 3.0782, + "num_input_tokens_seen": 5826744, + "step": 8910 + }, + { + "epoch": 0.9772004822974899, + "grad_norm": 5.613870620727539, + "learning_rate": 6.596419106014163e-08, + "loss": 2.9843, + "num_input_tokens_seen": 5831144, + "step": 8915 + }, + { + "epoch": 0.9777485476268771, + "grad_norm": 8.519886016845703, + "learning_rate": 6.28762627287921e-08, + "loss": 3.0685, + "num_input_tokens_seen": 5834792, + "step": 8920 + }, + { + "epoch": 0.9782966129562644, + "grad_norm": 7.168541431427002, + "learning_rate": 5.986226268683282e-08, + "loss": 3.2515, + "num_input_tokens_seen": 5838368, + "step": 8925 + }, + { + "epoch": 0.9788446782856517, + "grad_norm": 10.949654579162598, + "learning_rate": 5.692219986953573e-08, + "loss": 2.9654, + "num_input_tokens_seen": 5842120, + "step": 8930 + }, + { + "epoch": 0.9793927436150389, + "grad_norm": 6.906786918640137, + "learning_rate": 5.4056082992973155e-08, + "loss": 3.0675, + "num_input_tokens_seen": 5845248, + "step": 8935 + }, + { + "epoch": 0.9799408089444261, + "grad_norm": 5.457529067993164, + "learning_rate": 5.1263920553998315e-08, + "loss": 2.9989, + "num_input_tokens_seen": 5848536, + "step": 8940 + }, + { + "epoch": 0.9804888742738135, + "grad_norm": 9.393891334533691, + "learning_rate": 4.854572083022313e-08, + "loss": 3.1355, + "num_input_tokens_seen": 5851824, + "step": 8945 + }, + { + "epoch": 0.9810369396032007, + "grad_norm": 8.42390251159668, + "learning_rate": 4.5901491879984934e-08, + "loss": 3.0677, + "num_input_tokens_seen": 5855152, + "step": 8950 + }, + { + "epoch": 0.9815850049325879, + "grad_norm": 7.749826908111572, + "learning_rate": 4.3331241542340916e-08, + "loss": 3.1391, + "num_input_tokens_seen": 5858576, + "step": 8955 + }, + { + "epoch": 0.9821330702619753, + "grad_norm": 8.214120864868164, + "learning_rate": 4.083497743701259e-08, + "loss": 2.8317, + "num_input_tokens_seen": 5861528, + "step": 8960 + }, + { + "epoch": 0.9826811355913625, + "grad_norm": 6.369811058044434, + "learning_rate": 3.8412706964402465e-08, + "loss": 2.9487, + "num_input_tokens_seen": 5865128, + "step": 8965 + }, + { + "epoch": 0.9832292009207497, + "grad_norm": 8.29269027709961, + "learning_rate": 3.606443730554132e-08, + "loss": 3.0666, + "num_input_tokens_seen": 5867928, + "step": 8970 + }, + { + "epoch": 0.9837772662501371, + "grad_norm": 7.444830417633057, + "learning_rate": 3.379017542207707e-08, + "loss": 3.0067, + "num_input_tokens_seen": 5870968, + "step": 8975 + }, + { + "epoch": 0.9843253315795243, + "grad_norm": 7.021453380584717, + "learning_rate": 3.1589928056263704e-08, + "loss": 3.1972, + "num_input_tokens_seen": 5874496, + "step": 8980 + }, + { + "epoch": 0.9848733969089115, + "grad_norm": 7.41176176071167, + "learning_rate": 2.9463701730922388e-08, + "loss": 2.826, + "num_input_tokens_seen": 5878088, + "step": 8985 + }, + { + "epoch": 0.9854214622382989, + "grad_norm": 9.515088081359863, + "learning_rate": 2.7411502749441488e-08, + "loss": 3.1693, + "num_input_tokens_seen": 5881752, + "step": 8990 + }, + { + "epoch": 0.9859695275676861, + "grad_norm": 8.658610343933105, + "learning_rate": 2.5433337195743258e-08, + "loss": 2.8453, + "num_input_tokens_seen": 5884816, + "step": 8995 + }, + { + "epoch": 0.9865175928970733, + "grad_norm": 7.5331830978393555, + "learning_rate": 2.3529210934272738e-08, + "loss": 2.8423, + "num_input_tokens_seen": 5887864, + "step": 9000 + }, + { + "epoch": 0.9870656582264606, + "grad_norm": 8.601006507873535, + "learning_rate": 2.2059222016279636e-08, + "loss": 3.5074, + "num_input_tokens_seen": 5892776, + "step": 9005 + }, + { + "epoch": 0.9876137235558479, + "grad_norm": 9.700572967529297, + "learning_rate": 2.0288380558580732e-08, + "loss": 2.9729, + "num_input_tokens_seen": 5895976, + "step": 9010 + }, + { + "epoch": 0.9881617888852351, + "grad_norm": 7.793155193328857, + "learning_rate": 1.859159364578089e-08, + "loss": 3.1164, + "num_input_tokens_seen": 5897952, + "step": 9015 + }, + { + "epoch": 0.9887098542146224, + "grad_norm": 6.612551212310791, + "learning_rate": 1.696886630815908e-08, + "loss": 2.9729, + "num_input_tokens_seen": 5901264, + "step": 9020 + }, + { + "epoch": 0.9892579195440097, + "grad_norm": 7.382999897003174, + "learning_rate": 1.5420203356431018e-08, + "loss": 3.2611, + "num_input_tokens_seen": 5904096, + "step": 9025 + }, + { + "epoch": 0.9898059848733969, + "grad_norm": 6.810866832733154, + "learning_rate": 1.3945609381743607e-08, + "loss": 2.8127, + "num_input_tokens_seen": 5907072, + "step": 9030 + }, + { + "epoch": 0.9903540502027842, + "grad_norm": 7.927409648895264, + "learning_rate": 1.2545088755658296e-08, + "loss": 3.2365, + "num_input_tokens_seen": 5910056, + "step": 9035 + }, + { + "epoch": 0.9909021155321714, + "grad_norm": 7.214841842651367, + "learning_rate": 1.121864563014552e-08, + "loss": 3.0081, + "num_input_tokens_seen": 5913112, + "step": 9040 + }, + { + "epoch": 0.9914501808615587, + "grad_norm": 8.652878761291504, + "learning_rate": 9.966283937559716e-09, + "loss": 3.0332, + "num_input_tokens_seen": 5916360, + "step": 9045 + }, + { + "epoch": 0.991998246190946, + "grad_norm": 8.960352897644043, + "learning_rate": 8.78800739063379e-09, + "loss": 2.6109, + "num_input_tokens_seen": 5918704, + "step": 9050 + }, + { + "epoch": 0.9925463115203332, + "grad_norm": 7.337709903717041, + "learning_rate": 7.683819482479094e-09, + "loss": 2.7987, + "num_input_tokens_seen": 5921928, + "step": 9055 + }, + { + "epoch": 0.9930943768497205, + "grad_norm": 7.972464561462402, + "learning_rate": 6.653723486549357e-09, + "loss": 3.1164, + "num_input_tokens_seen": 5924176, + "step": 9060 + }, + { + "epoch": 0.9936424421791078, + "grad_norm": 5.17326021194458, + "learning_rate": 5.69772245666289e-09, + "loss": 2.8857, + "num_input_tokens_seen": 5927832, + "step": 9065 + }, + { + "epoch": 0.994190507508495, + "grad_norm": 9.227761268615723, + "learning_rate": 4.815819226960949e-09, + "loss": 3.0089, + "num_input_tokens_seen": 5931264, + "step": 9070 + }, + { + "epoch": 0.9947385728378823, + "grad_norm": 8.926158905029297, + "learning_rate": 4.008016411927162e-09, + "loss": 3.3191, + "num_input_tokens_seen": 5933904, + "step": 9075 + }, + { + "epoch": 0.9952866381672696, + "grad_norm": 10.433160781860352, + "learning_rate": 3.274316406362554e-09, + "loss": 3.447, + "num_input_tokens_seen": 5936464, + "step": 9080 + }, + { + "epoch": 0.9958347034966568, + "grad_norm": 7.052779197692871, + "learning_rate": 2.6147213853855436e-09, + "loss": 3.0385, + "num_input_tokens_seen": 5939544, + "step": 9085 + }, + { + "epoch": 0.996382768826044, + "grad_norm": 5.819647789001465, + "learning_rate": 2.0292333044236166e-09, + "loss": 3.3745, + "num_input_tokens_seen": 5943312, + "step": 9090 + }, + { + "epoch": 0.9969308341554314, + "grad_norm": 7.4259748458862305, + "learning_rate": 1.5178538992050018e-09, + "loss": 2.8346, + "num_input_tokens_seen": 5946248, + "step": 9095 + }, + { + "epoch": 0.9974788994848186, + "grad_norm": 9.022146224975586, + "learning_rate": 1.0805846857642188e-09, + "loss": 2.969, + "num_input_tokens_seen": 5949520, + "step": 9100 + }, + { + "epoch": 0.9980269648142058, + "grad_norm": 7.631455898284912, + "learning_rate": 7.174269604171002e-10, + "loss": 3.0908, + "num_input_tokens_seen": 5953392, + "step": 9105 + }, + { + "epoch": 0.9985750301435932, + "grad_norm": 8.837788581848145, + "learning_rate": 4.283817997829953e-10, + "loss": 2.8613, + "num_input_tokens_seen": 5957048, + "step": 9110 + }, + { + "epoch": 0.9991230954729804, + "grad_norm": 6.420173645019531, + "learning_rate": 2.1345006075979e-10, + "loss": 2.8579, + "num_input_tokens_seen": 5959744, + "step": 9115 + }, + { + "epoch": 0.9996711608023676, + "grad_norm": 8.133180618286133, + "learning_rate": 7.263238052668264e-11, + "loss": 3.1424, + "num_input_tokens_seen": 5962752, + "step": 9120 + } + ], + "logging_steps": 5, + "max_steps": 9123, + "num_input_tokens_seen": 5964208, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.722124677282202e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}