diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,27312 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999754305791012, + "eval_steps": 1000, + "global_step": 10175, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000294833050784993, + "grad_norm": 351.9773864746094, + "learning_rate": 4.715127701375246e-07, + "loss": 174.741, + "num_input_tokens_seen": 147204, + "step": 3 + }, + { + "epoch": 0.000589666101569986, + "grad_norm": 335.4129333496094, + "learning_rate": 9.430255402750492e-07, + "loss": 172.9366, + "num_input_tokens_seen": 298172, + "step": 6 + }, + { + "epoch": 0.000884499152354979, + "grad_norm": 221.5324249267578, + "learning_rate": 1.4145383104125737e-06, + "loss": 169.1322, + "num_input_tokens_seen": 438208, + "step": 9 + }, + { + "epoch": 0.001179332203139972, + "grad_norm": 166.86729431152344, + "learning_rate": 1.8860510805500984e-06, + "loss": 165.9725, + "num_input_tokens_seen": 598448, + "step": 12 + }, + { + "epoch": 0.001474165253924965, + "grad_norm": 115.85004425048828, + "learning_rate": 2.357563850687623e-06, + "loss": 164.239, + "num_input_tokens_seen": 761840, + "step": 15 + }, + { + "epoch": 0.001768998304709958, + "grad_norm": 92.04082489013672, + "learning_rate": 2.8290766208251474e-06, + "loss": 162.3242, + "num_input_tokens_seen": 913844, + "step": 18 + }, + { + "epoch": 0.002063831355494951, + "grad_norm": 81.15711212158203, + "learning_rate": 3.3005893909626725e-06, + "loss": 161.3536, + "num_input_tokens_seen": 1079924, + "step": 21 + }, + { + "epoch": 0.002358664406279944, + "grad_norm": 80.38558959960938, + "learning_rate": 3.7721021611001968e-06, + "loss": 160.6213, + "num_input_tokens_seen": 1217660, + "step": 24 + }, + { + "epoch": 0.002653497457064937, + "grad_norm": 73.85279083251953, + "learning_rate": 4.243614931237721e-06, + "loss": 159.9705, + "num_input_tokens_seen": 1371872, + "step": 27 + }, + { + "epoch": 0.00294833050784993, + "grad_norm": 64.86689758300781, + "learning_rate": 4.715127701375246e-06, + "loss": 159.3978, + "num_input_tokens_seen": 1552480, + "step": 30 + }, + { + "epoch": 0.003243163558634923, + "grad_norm": 64.5094223022461, + "learning_rate": 5.1866404715127704e-06, + "loss": 159.1339, + "num_input_tokens_seen": 1705784, + "step": 33 + }, + { + "epoch": 0.003537996609419916, + "grad_norm": 58.84899139404297, + "learning_rate": 5.658153241650295e-06, + "loss": 158.6619, + "num_input_tokens_seen": 1866676, + "step": 36 + }, + { + "epoch": 0.003832829660204909, + "grad_norm": 68.10957336425781, + "learning_rate": 6.129666011787819e-06, + "loss": 157.7221, + "num_input_tokens_seen": 2025592, + "step": 39 + }, + { + "epoch": 0.004127662710989902, + "grad_norm": 72.53784942626953, + "learning_rate": 6.601178781925345e-06, + "loss": 157.304, + "num_input_tokens_seen": 2184160, + "step": 42 + }, + { + "epoch": 0.004422495761774895, + "grad_norm": 110.91462707519531, + "learning_rate": 7.072691552062869e-06, + "loss": 155.9637, + "num_input_tokens_seen": 2315356, + "step": 45 + }, + { + "epoch": 0.004717328812559888, + "grad_norm": 89.97759246826172, + "learning_rate": 7.5442043222003935e-06, + "loss": 155.4095, + "num_input_tokens_seen": 2466780, + "step": 48 + }, + { + "epoch": 0.005012161863344881, + "grad_norm": 78.38220977783203, + "learning_rate": 8.015717092337918e-06, + "loss": 154.4634, + "num_input_tokens_seen": 2629552, + "step": 51 + }, + { + "epoch": 0.005306994914129874, + "grad_norm": 106.3893814086914, + "learning_rate": 8.487229862475442e-06, + "loss": 154.4811, + "num_input_tokens_seen": 2762900, + "step": 54 + }, + { + "epoch": 0.005601827964914867, + "grad_norm": 60.77946472167969, + "learning_rate": 8.958742632612968e-06, + "loss": 153.6777, + "num_input_tokens_seen": 2910676, + "step": 57 + }, + { + "epoch": 0.00589666101569986, + "grad_norm": 74.85725402832031, + "learning_rate": 9.430255402750492e-06, + "loss": 153.2267, + "num_input_tokens_seen": 3068036, + "step": 60 + }, + { + "epoch": 0.006191494066484853, + "grad_norm": 108.25921630859375, + "learning_rate": 9.901768172888017e-06, + "loss": 152.5115, + "num_input_tokens_seen": 3225588, + "step": 63 + }, + { + "epoch": 0.006486327117269846, + "grad_norm": 64.9744644165039, + "learning_rate": 1.0373280943025541e-05, + "loss": 151.9292, + "num_input_tokens_seen": 3366520, + "step": 66 + }, + { + "epoch": 0.006781160168054839, + "grad_norm": 70.4972152709961, + "learning_rate": 1.0844793713163067e-05, + "loss": 151.1674, + "num_input_tokens_seen": 3544740, + "step": 69 + }, + { + "epoch": 0.007075993218839832, + "grad_norm": 96.35140991210938, + "learning_rate": 1.131630648330059e-05, + "loss": 151.7663, + "num_input_tokens_seen": 3691996, + "step": 72 + }, + { + "epoch": 0.007370826269624825, + "grad_norm": 76.31684112548828, + "learning_rate": 1.1787819253438115e-05, + "loss": 150.7677, + "num_input_tokens_seen": 3840260, + "step": 75 + }, + { + "epoch": 0.007665659320409818, + "grad_norm": 107.214111328125, + "learning_rate": 1.2259332023575638e-05, + "loss": 149.475, + "num_input_tokens_seen": 3989812, + "step": 78 + }, + { + "epoch": 0.00796049237119481, + "grad_norm": 58.428550720214844, + "learning_rate": 1.2730844793713164e-05, + "loss": 149.771, + "num_input_tokens_seen": 4128548, + "step": 81 + }, + { + "epoch": 0.008255325421979804, + "grad_norm": 107.3379898071289, + "learning_rate": 1.320235756385069e-05, + "loss": 148.0709, + "num_input_tokens_seen": 4291808, + "step": 84 + }, + { + "epoch": 0.008550158472764796, + "grad_norm": 79.58226013183594, + "learning_rate": 1.3673870333988213e-05, + "loss": 147.9338, + "num_input_tokens_seen": 4450936, + "step": 87 + }, + { + "epoch": 0.00884499152354979, + "grad_norm": 90.84200286865234, + "learning_rate": 1.4145383104125738e-05, + "loss": 147.3218, + "num_input_tokens_seen": 4606692, + "step": 90 + }, + { + "epoch": 0.009139824574334782, + "grad_norm": 82.57324981689453, + "learning_rate": 1.4616895874263261e-05, + "loss": 146.2605, + "num_input_tokens_seen": 4752792, + "step": 93 + }, + { + "epoch": 0.009434657625119776, + "grad_norm": 127.30277252197266, + "learning_rate": 1.5088408644400787e-05, + "loss": 146.2332, + "num_input_tokens_seen": 4910364, + "step": 96 + }, + { + "epoch": 0.009729490675904768, + "grad_norm": 79.93926239013672, + "learning_rate": 1.555992141453831e-05, + "loss": 145.666, + "num_input_tokens_seen": 5052164, + "step": 99 + }, + { + "epoch": 0.010024323726689762, + "grad_norm": 82.26026916503906, + "learning_rate": 1.6031434184675836e-05, + "loss": 144.4902, + "num_input_tokens_seen": 5174080, + "step": 102 + }, + { + "epoch": 0.010319156777474754, + "grad_norm": 63.15846252441406, + "learning_rate": 1.650294695481336e-05, + "loss": 143.5527, + "num_input_tokens_seen": 5320224, + "step": 105 + }, + { + "epoch": 0.010613989828259748, + "grad_norm": 57.38979721069336, + "learning_rate": 1.6974459724950884e-05, + "loss": 143.0506, + "num_input_tokens_seen": 5469368, + "step": 108 + }, + { + "epoch": 0.01090882287904474, + "grad_norm": 47.44795227050781, + "learning_rate": 1.7445972495088412e-05, + "loss": 142.9269, + "num_input_tokens_seen": 5613288, + "step": 111 + }, + { + "epoch": 0.011203655929829734, + "grad_norm": 46.905216217041016, + "learning_rate": 1.7917485265225936e-05, + "loss": 142.9721, + "num_input_tokens_seen": 5754360, + "step": 114 + }, + { + "epoch": 0.011498488980614728, + "grad_norm": 79.62482452392578, + "learning_rate": 1.838899803536346e-05, + "loss": 141.9862, + "num_input_tokens_seen": 5928136, + "step": 117 + }, + { + "epoch": 0.01179332203139972, + "grad_norm": 54.01189422607422, + "learning_rate": 1.8860510805500985e-05, + "loss": 140.0655, + "num_input_tokens_seen": 6077916, + "step": 120 + }, + { + "epoch": 0.012088155082184714, + "grad_norm": 68.40033721923828, + "learning_rate": 1.933202357563851e-05, + "loss": 140.2441, + "num_input_tokens_seen": 6223140, + "step": 123 + }, + { + "epoch": 0.012382988132969706, + "grad_norm": 43.50001907348633, + "learning_rate": 1.9803536345776033e-05, + "loss": 139.027, + "num_input_tokens_seen": 6384480, + "step": 126 + }, + { + "epoch": 0.0126778211837547, + "grad_norm": 80.14400482177734, + "learning_rate": 2.0275049115913557e-05, + "loss": 138.4401, + "num_input_tokens_seen": 6547396, + "step": 129 + }, + { + "epoch": 0.012972654234539692, + "grad_norm": 44.20638656616211, + "learning_rate": 2.0746561886051082e-05, + "loss": 138.0098, + "num_input_tokens_seen": 6690760, + "step": 132 + }, + { + "epoch": 0.013267487285324685, + "grad_norm": 55.782615661621094, + "learning_rate": 2.1218074656188606e-05, + "loss": 136.31, + "num_input_tokens_seen": 6834084, + "step": 135 + }, + { + "epoch": 0.013562320336109678, + "grad_norm": 54.36646270751953, + "learning_rate": 2.1689587426326134e-05, + "loss": 134.8045, + "num_input_tokens_seen": 6981456, + "step": 138 + }, + { + "epoch": 0.013857153386894671, + "grad_norm": 48.38446807861328, + "learning_rate": 2.2161100196463658e-05, + "loss": 135.7917, + "num_input_tokens_seen": 7150592, + "step": 141 + }, + { + "epoch": 0.014151986437679663, + "grad_norm": 74.71308898925781, + "learning_rate": 2.263261296660118e-05, + "loss": 133.7626, + "num_input_tokens_seen": 7305448, + "step": 144 + }, + { + "epoch": 0.014446819488464657, + "grad_norm": 60.53500747680664, + "learning_rate": 2.3104125736738707e-05, + "loss": 134.0327, + "num_input_tokens_seen": 7470308, + "step": 147 + }, + { + "epoch": 0.01474165253924965, + "grad_norm": 59.86284255981445, + "learning_rate": 2.357563850687623e-05, + "loss": 132.9785, + "num_input_tokens_seen": 7624272, + "step": 150 + }, + { + "epoch": 0.015036485590034643, + "grad_norm": 43.723575592041016, + "learning_rate": 2.4047151277013755e-05, + "loss": 132.5055, + "num_input_tokens_seen": 7765928, + "step": 153 + }, + { + "epoch": 0.015331318640819635, + "grad_norm": 48.423545837402344, + "learning_rate": 2.4518664047151276e-05, + "loss": 131.7431, + "num_input_tokens_seen": 7930544, + "step": 156 + }, + { + "epoch": 0.01562615169160463, + "grad_norm": 50.08180236816406, + "learning_rate": 2.4990176817288804e-05, + "loss": 130.7851, + "num_input_tokens_seen": 8090808, + "step": 159 + }, + { + "epoch": 0.01592098474238962, + "grad_norm": 36.15312576293945, + "learning_rate": 2.5461689587426328e-05, + "loss": 129.9209, + "num_input_tokens_seen": 8243468, + "step": 162 + }, + { + "epoch": 0.016215817793174613, + "grad_norm": 51.09037780761719, + "learning_rate": 2.5933202357563852e-05, + "loss": 128.3312, + "num_input_tokens_seen": 8391768, + "step": 165 + }, + { + "epoch": 0.01651065084395961, + "grad_norm": 42.25214767456055, + "learning_rate": 2.640471512770138e-05, + "loss": 127.7597, + "num_input_tokens_seen": 8561392, + "step": 168 + }, + { + "epoch": 0.0168054838947446, + "grad_norm": 33.23691177368164, + "learning_rate": 2.68762278978389e-05, + "loss": 127.3331, + "num_input_tokens_seen": 8715268, + "step": 171 + }, + { + "epoch": 0.017100316945529593, + "grad_norm": 38.06812286376953, + "learning_rate": 2.7347740667976425e-05, + "loss": 126.6195, + "num_input_tokens_seen": 8863028, + "step": 174 + }, + { + "epoch": 0.01739514999631459, + "grad_norm": 32.519142150878906, + "learning_rate": 2.7819253438113953e-05, + "loss": 126.7838, + "num_input_tokens_seen": 9009124, + "step": 177 + }, + { + "epoch": 0.01768998304709958, + "grad_norm": 49.16916275024414, + "learning_rate": 2.8290766208251477e-05, + "loss": 126.2718, + "num_input_tokens_seen": 9173364, + "step": 180 + }, + { + "epoch": 0.017984816097884573, + "grad_norm": 39.809391021728516, + "learning_rate": 2.8762278978389e-05, + "loss": 124.7026, + "num_input_tokens_seen": 9320500, + "step": 183 + }, + { + "epoch": 0.018279649148669565, + "grad_norm": 37.8610725402832, + "learning_rate": 2.9233791748526522e-05, + "loss": 124.2418, + "num_input_tokens_seen": 9497040, + "step": 186 + }, + { + "epoch": 0.01857448219945456, + "grad_norm": 28.63737678527832, + "learning_rate": 2.970530451866405e-05, + "loss": 125.1306, + "num_input_tokens_seen": 9651548, + "step": 189 + }, + { + "epoch": 0.018869315250239552, + "grad_norm": 29.44608497619629, + "learning_rate": 3.0176817288801574e-05, + "loss": 123.0679, + "num_input_tokens_seen": 9811940, + "step": 192 + }, + { + "epoch": 0.019164148301024544, + "grad_norm": 28.310522079467773, + "learning_rate": 3.06483300589391e-05, + "loss": 122.1647, + "num_input_tokens_seen": 9965988, + "step": 195 + }, + { + "epoch": 0.019458981351809537, + "grad_norm": 36.38974380493164, + "learning_rate": 3.111984282907662e-05, + "loss": 122.0278, + "num_input_tokens_seen": 10116488, + "step": 198 + }, + { + "epoch": 0.019753814402594532, + "grad_norm": 45.70870590209961, + "learning_rate": 3.159135559921415e-05, + "loss": 120.921, + "num_input_tokens_seen": 10252748, + "step": 201 + }, + { + "epoch": 0.020048647453379524, + "grad_norm": 30.969467163085938, + "learning_rate": 3.206286836935167e-05, + "loss": 122.1355, + "num_input_tokens_seen": 10421036, + "step": 204 + }, + { + "epoch": 0.020343480504164516, + "grad_norm": 29.877321243286133, + "learning_rate": 3.2534381139489195e-05, + "loss": 121.7966, + "num_input_tokens_seen": 10580420, + "step": 207 + }, + { + "epoch": 0.02063831355494951, + "grad_norm": 29.29607582092285, + "learning_rate": 3.300589390962672e-05, + "loss": 120.6734, + "num_input_tokens_seen": 10721712, + "step": 210 + }, + { + "epoch": 0.020933146605734504, + "grad_norm": 31.24442481994629, + "learning_rate": 3.3477406679764244e-05, + "loss": 120.3414, + "num_input_tokens_seen": 10887044, + "step": 213 + }, + { + "epoch": 0.021227979656519496, + "grad_norm": 28.847200393676758, + "learning_rate": 3.394891944990177e-05, + "loss": 121.1287, + "num_input_tokens_seen": 11071896, + "step": 216 + }, + { + "epoch": 0.021522812707304488, + "grad_norm": 27.561906814575195, + "learning_rate": 3.44204322200393e-05, + "loss": 121.9215, + "num_input_tokens_seen": 11225176, + "step": 219 + }, + { + "epoch": 0.02181764575808948, + "grad_norm": 25.985300064086914, + "learning_rate": 3.4891944990176824e-05, + "loss": 118.9775, + "num_input_tokens_seen": 11372232, + "step": 222 + }, + { + "epoch": 0.022112478808874476, + "grad_norm": 34.018558502197266, + "learning_rate": 3.536345776031434e-05, + "loss": 121.1285, + "num_input_tokens_seen": 11494888, + "step": 225 + }, + { + "epoch": 0.022407311859659468, + "grad_norm": 33.21733856201172, + "learning_rate": 3.583497053045187e-05, + "loss": 118.8301, + "num_input_tokens_seen": 11644560, + "step": 228 + }, + { + "epoch": 0.02270214491044446, + "grad_norm": 28.910791397094727, + "learning_rate": 3.6306483300589396e-05, + "loss": 120.0619, + "num_input_tokens_seen": 11796652, + "step": 231 + }, + { + "epoch": 0.022996977961229455, + "grad_norm": 32.809871673583984, + "learning_rate": 3.677799607072692e-05, + "loss": 117.1904, + "num_input_tokens_seen": 11938880, + "step": 234 + }, + { + "epoch": 0.023291811012014448, + "grad_norm": 32.05815887451172, + "learning_rate": 3.724950884086444e-05, + "loss": 118.6215, + "num_input_tokens_seen": 12077780, + "step": 237 + }, + { + "epoch": 0.02358664406279944, + "grad_norm": 38.406951904296875, + "learning_rate": 3.772102161100197e-05, + "loss": 119.2037, + "num_input_tokens_seen": 12230832, + "step": 240 + }, + { + "epoch": 0.02388147711358443, + "grad_norm": 33.12141036987305, + "learning_rate": 3.8192534381139494e-05, + "loss": 120.1299, + "num_input_tokens_seen": 12365416, + "step": 243 + }, + { + "epoch": 0.024176310164369427, + "grad_norm": 35.890018463134766, + "learning_rate": 3.866404715127702e-05, + "loss": 119.1851, + "num_input_tokens_seen": 12527200, + "step": 246 + }, + { + "epoch": 0.02447114321515442, + "grad_norm": 32.43567657470703, + "learning_rate": 3.913555992141454e-05, + "loss": 120.2267, + "num_input_tokens_seen": 12662252, + "step": 249 + }, + { + "epoch": 0.02476597626593941, + "grad_norm": 27.831727981567383, + "learning_rate": 3.9607072691552066e-05, + "loss": 117.902, + "num_input_tokens_seen": 12817284, + "step": 252 + }, + { + "epoch": 0.025060809316724404, + "grad_norm": 35.014984130859375, + "learning_rate": 4.007858546168959e-05, + "loss": 118.5048, + "num_input_tokens_seen": 12981920, + "step": 255 + }, + { + "epoch": 0.0253556423675094, + "grad_norm": 31.94507598876953, + "learning_rate": 4.0550098231827115e-05, + "loss": 116.9797, + "num_input_tokens_seen": 13146228, + "step": 258 + }, + { + "epoch": 0.02565047541829439, + "grad_norm": 34.79327392578125, + "learning_rate": 4.102161100196464e-05, + "loss": 118.0679, + "num_input_tokens_seen": 13299160, + "step": 261 + }, + { + "epoch": 0.025945308469079383, + "grad_norm": 27.8881778717041, + "learning_rate": 4.1493123772102163e-05, + "loss": 118.1522, + "num_input_tokens_seen": 13430784, + "step": 264 + }, + { + "epoch": 0.026240141519864375, + "grad_norm": 35.07295227050781, + "learning_rate": 4.1964636542239695e-05, + "loss": 117.0864, + "num_input_tokens_seen": 13568028, + "step": 267 + }, + { + "epoch": 0.02653497457064937, + "grad_norm": 30.21860694885254, + "learning_rate": 4.243614931237721e-05, + "loss": 115.8842, + "num_input_tokens_seen": 13736356, + "step": 270 + }, + { + "epoch": 0.026829807621434363, + "grad_norm": 30.91063117980957, + "learning_rate": 4.2907662082514736e-05, + "loss": 118.1064, + "num_input_tokens_seen": 13892812, + "step": 273 + }, + { + "epoch": 0.027124640672219355, + "grad_norm": 28.20920753479004, + "learning_rate": 4.337917485265227e-05, + "loss": 116.0458, + "num_input_tokens_seen": 14034716, + "step": 276 + }, + { + "epoch": 0.027419473723004347, + "grad_norm": 27.55144691467285, + "learning_rate": 4.3850687622789785e-05, + "loss": 117.1214, + "num_input_tokens_seen": 14161152, + "step": 279 + }, + { + "epoch": 0.027714306773789343, + "grad_norm": 25.209985733032227, + "learning_rate": 4.4322200392927316e-05, + "loss": 115.2847, + "num_input_tokens_seen": 14311492, + "step": 282 + }, + { + "epoch": 0.028009139824574335, + "grad_norm": 31.782129287719727, + "learning_rate": 4.479371316306484e-05, + "loss": 116.3986, + "num_input_tokens_seen": 14468124, + "step": 285 + }, + { + "epoch": 0.028303972875359327, + "grad_norm": 33.29216003417969, + "learning_rate": 4.526522593320236e-05, + "loss": 117.0832, + "num_input_tokens_seen": 14621272, + "step": 288 + }, + { + "epoch": 0.028598805926144322, + "grad_norm": 34.79197311401367, + "learning_rate": 4.573673870333989e-05, + "loss": 115.3495, + "num_input_tokens_seen": 14758068, + "step": 291 + }, + { + "epoch": 0.028893638976929314, + "grad_norm": 29.354154586791992, + "learning_rate": 4.620825147347741e-05, + "loss": 115.9104, + "num_input_tokens_seen": 14908960, + "step": 294 + }, + { + "epoch": 0.029188472027714307, + "grad_norm": 27.28232192993164, + "learning_rate": 4.667976424361493e-05, + "loss": 115.7575, + "num_input_tokens_seen": 15053712, + "step": 297 + }, + { + "epoch": 0.0294833050784993, + "grad_norm": 36.298770904541016, + "learning_rate": 4.715127701375246e-05, + "loss": 114.2888, + "num_input_tokens_seen": 15211664, + "step": 300 + }, + { + "epoch": 0.029778138129284294, + "grad_norm": 34.47058868408203, + "learning_rate": 4.762278978388998e-05, + "loss": 113.5061, + "num_input_tokens_seen": 15359696, + "step": 303 + }, + { + "epoch": 0.030072971180069286, + "grad_norm": 36.17288589477539, + "learning_rate": 4.809430255402751e-05, + "loss": 114.0696, + "num_input_tokens_seen": 15519284, + "step": 306 + }, + { + "epoch": 0.03036780423085428, + "grad_norm": 36.91202163696289, + "learning_rate": 4.8565815324165034e-05, + "loss": 115.5098, + "num_input_tokens_seen": 15675800, + "step": 309 + }, + { + "epoch": 0.03066263728163927, + "grad_norm": 47.51340103149414, + "learning_rate": 4.903732809430255e-05, + "loss": 115.061, + "num_input_tokens_seen": 15828156, + "step": 312 + }, + { + "epoch": 0.030957470332424266, + "grad_norm": 32.856422424316406, + "learning_rate": 4.950884086444008e-05, + "loss": 116.2509, + "num_input_tokens_seen": 15994660, + "step": 315 + }, + { + "epoch": 0.03125230338320926, + "grad_norm": 35.165409088134766, + "learning_rate": 4.998035363457761e-05, + "loss": 115.2751, + "num_input_tokens_seen": 16143304, + "step": 318 + }, + { + "epoch": 0.031547136433994254, + "grad_norm": 33.481590270996094, + "learning_rate": 5.045186640471513e-05, + "loss": 116.1703, + "num_input_tokens_seen": 16291300, + "step": 321 + }, + { + "epoch": 0.03184196948477924, + "grad_norm": 30.144989013671875, + "learning_rate": 5.0923379174852656e-05, + "loss": 113.4856, + "num_input_tokens_seen": 16469844, + "step": 324 + }, + { + "epoch": 0.03213680253556424, + "grad_norm": 26.71873664855957, + "learning_rate": 5.139489194499019e-05, + "loss": 115.6015, + "num_input_tokens_seen": 16605332, + "step": 327 + }, + { + "epoch": 0.032431635586349226, + "grad_norm": 28.083377838134766, + "learning_rate": 5.1866404715127704e-05, + "loss": 112.2329, + "num_input_tokens_seen": 16737376, + "step": 330 + }, + { + "epoch": 0.03272646863713422, + "grad_norm": 31.44344711303711, + "learning_rate": 5.233791748526523e-05, + "loss": 113.1789, + "num_input_tokens_seen": 16891036, + "step": 333 + }, + { + "epoch": 0.03302130168791922, + "grad_norm": 31.328773498535156, + "learning_rate": 5.280943025540276e-05, + "loss": 113.8001, + "num_input_tokens_seen": 17041200, + "step": 336 + }, + { + "epoch": 0.033316134738704206, + "grad_norm": 26.61764144897461, + "learning_rate": 5.328094302554028e-05, + "loss": 113.1791, + "num_input_tokens_seen": 17200492, + "step": 339 + }, + { + "epoch": 0.0336109677894892, + "grad_norm": 30.37028694152832, + "learning_rate": 5.37524557956778e-05, + "loss": 114.1511, + "num_input_tokens_seen": 17349404, + "step": 342 + }, + { + "epoch": 0.0339058008402742, + "grad_norm": 31.63671875, + "learning_rate": 5.422396856581533e-05, + "loss": 112.5117, + "num_input_tokens_seen": 17497608, + "step": 345 + }, + { + "epoch": 0.034200633891059186, + "grad_norm": 34.88505935668945, + "learning_rate": 5.469548133595285e-05, + "loss": 113.4745, + "num_input_tokens_seen": 17630548, + "step": 348 + }, + { + "epoch": 0.03449546694184418, + "grad_norm": 34.04216003417969, + "learning_rate": 5.516699410609038e-05, + "loss": 114.2201, + "num_input_tokens_seen": 17770232, + "step": 351 + }, + { + "epoch": 0.03479029999262918, + "grad_norm": 31.125619888305664, + "learning_rate": 5.5638506876227905e-05, + "loss": 113.5508, + "num_input_tokens_seen": 17952832, + "step": 354 + }, + { + "epoch": 0.035085133043414166, + "grad_norm": 33.099971771240234, + "learning_rate": 5.611001964636542e-05, + "loss": 112.3604, + "num_input_tokens_seen": 18087932, + "step": 357 + }, + { + "epoch": 0.03537996609419916, + "grad_norm": 27.962017059326172, + "learning_rate": 5.6581532416502954e-05, + "loss": 111.5293, + "num_input_tokens_seen": 18249968, + "step": 360 + }, + { + "epoch": 0.03567479914498415, + "grad_norm": 34.95933532714844, + "learning_rate": 5.705304518664047e-05, + "loss": 112.3723, + "num_input_tokens_seen": 18420280, + "step": 363 + }, + { + "epoch": 0.035969632195769145, + "grad_norm": 30.143390655517578, + "learning_rate": 5.7524557956778e-05, + "loss": 112.1157, + "num_input_tokens_seen": 18578408, + "step": 366 + }, + { + "epoch": 0.03626446524655414, + "grad_norm": 33.200233459472656, + "learning_rate": 5.799607072691553e-05, + "loss": 111.7586, + "num_input_tokens_seen": 18710576, + "step": 369 + }, + { + "epoch": 0.03655929829733913, + "grad_norm": 36.428489685058594, + "learning_rate": 5.8467583497053044e-05, + "loss": 112.8608, + "num_input_tokens_seen": 18881144, + "step": 372 + }, + { + "epoch": 0.036854131348124125, + "grad_norm": 202.75164794921875, + "learning_rate": 5.8939096267190575e-05, + "loss": 111.489, + "num_input_tokens_seen": 19037724, + "step": 375 + }, + { + "epoch": 0.03714896439890912, + "grad_norm": 39.6142578125, + "learning_rate": 5.94106090373281e-05, + "loss": 113.2192, + "num_input_tokens_seen": 19184996, + "step": 378 + }, + { + "epoch": 0.03744379744969411, + "grad_norm": 37.47132110595703, + "learning_rate": 5.9882121807465624e-05, + "loss": 111.0813, + "num_input_tokens_seen": 19342988, + "step": 381 + }, + { + "epoch": 0.037738630500479105, + "grad_norm": 46.92488098144531, + "learning_rate": 6.035363457760315e-05, + "loss": 111.7378, + "num_input_tokens_seen": 19500860, + "step": 384 + }, + { + "epoch": 0.03803346355126409, + "grad_norm": 34.19045639038086, + "learning_rate": 6.082514734774068e-05, + "loss": 111.4771, + "num_input_tokens_seen": 19655396, + "step": 387 + }, + { + "epoch": 0.03832829660204909, + "grad_norm": 33.17388916015625, + "learning_rate": 6.12966601178782e-05, + "loss": 111.7661, + "num_input_tokens_seen": 19812628, + "step": 390 + }, + { + "epoch": 0.038623129652834085, + "grad_norm": 37.04789733886719, + "learning_rate": 6.176817288801572e-05, + "loss": 111.0436, + "num_input_tokens_seen": 19962968, + "step": 393 + }, + { + "epoch": 0.03891796270361907, + "grad_norm": 34.42665100097656, + "learning_rate": 6.223968565815325e-05, + "loss": 110.9319, + "num_input_tokens_seen": 20124412, + "step": 396 + }, + { + "epoch": 0.03921279575440407, + "grad_norm": 30.080944061279297, + "learning_rate": 6.271119842829077e-05, + "loss": 109.7078, + "num_input_tokens_seen": 20271340, + "step": 399 + }, + { + "epoch": 0.039507628805189064, + "grad_norm": 37.49620056152344, + "learning_rate": 6.31827111984283e-05, + "loss": 109.8044, + "num_input_tokens_seen": 20416520, + "step": 402 + }, + { + "epoch": 0.03980246185597405, + "grad_norm": 40.49543762207031, + "learning_rate": 6.365422396856582e-05, + "loss": 111.5728, + "num_input_tokens_seen": 20562864, + "step": 405 + }, + { + "epoch": 0.04009729490675905, + "grad_norm": 32.68974304199219, + "learning_rate": 6.412573673870334e-05, + "loss": 110.8815, + "num_input_tokens_seen": 20722032, + "step": 408 + }, + { + "epoch": 0.040392127957544044, + "grad_norm": 31.40269660949707, + "learning_rate": 6.459724950884087e-05, + "loss": 110.8921, + "num_input_tokens_seen": 20876444, + "step": 411 + }, + { + "epoch": 0.04068696100832903, + "grad_norm": 35.71750259399414, + "learning_rate": 6.506876227897839e-05, + "loss": 112.1176, + "num_input_tokens_seen": 21048444, + "step": 414 + }, + { + "epoch": 0.04098179405911403, + "grad_norm": 30.60179901123047, + "learning_rate": 6.554027504911592e-05, + "loss": 110.2898, + "num_input_tokens_seen": 21213736, + "step": 417 + }, + { + "epoch": 0.04127662710989902, + "grad_norm": 33.711368560791016, + "learning_rate": 6.601178781925344e-05, + "loss": 109.2687, + "num_input_tokens_seen": 21386808, + "step": 420 + }, + { + "epoch": 0.04157146016068401, + "grad_norm": 28.957597732543945, + "learning_rate": 6.648330058939096e-05, + "loss": 108.5565, + "num_input_tokens_seen": 21564740, + "step": 423 + }, + { + "epoch": 0.04186629321146901, + "grad_norm": 33.985660552978516, + "learning_rate": 6.695481335952849e-05, + "loss": 108.6807, + "num_input_tokens_seen": 21717276, + "step": 426 + }, + { + "epoch": 0.042161126262253996, + "grad_norm": 35.086524963378906, + "learning_rate": 6.742632612966603e-05, + "loss": 111.5809, + "num_input_tokens_seen": 21886308, + "step": 429 + }, + { + "epoch": 0.04245595931303899, + "grad_norm": 29.464298248291016, + "learning_rate": 6.789783889980354e-05, + "loss": 110.4938, + "num_input_tokens_seen": 22035420, + "step": 432 + }, + { + "epoch": 0.04275079236382399, + "grad_norm": 31.72478675842285, + "learning_rate": 6.836935166994106e-05, + "loss": 108.5007, + "num_input_tokens_seen": 22195320, + "step": 435 + }, + { + "epoch": 0.043045625414608976, + "grad_norm": 29.742385864257812, + "learning_rate": 6.88408644400786e-05, + "loss": 109.9771, + "num_input_tokens_seen": 22353492, + "step": 438 + }, + { + "epoch": 0.04334045846539397, + "grad_norm": 27.068031311035156, + "learning_rate": 6.931237721021611e-05, + "loss": 109.8151, + "num_input_tokens_seen": 22499056, + "step": 441 + }, + { + "epoch": 0.04363529151617896, + "grad_norm": 31.5910587310791, + "learning_rate": 6.978388998035365e-05, + "loss": 109.6762, + "num_input_tokens_seen": 22661068, + "step": 444 + }, + { + "epoch": 0.043930124566963956, + "grad_norm": 32.54317855834961, + "learning_rate": 7.025540275049117e-05, + "loss": 109.202, + "num_input_tokens_seen": 22825316, + "step": 447 + }, + { + "epoch": 0.04422495761774895, + "grad_norm": 29.726049423217773, + "learning_rate": 7.072691552062868e-05, + "loss": 109.1553, + "num_input_tokens_seen": 22968708, + "step": 450 + }, + { + "epoch": 0.04451979066853394, + "grad_norm": 29.412227630615234, + "learning_rate": 7.119842829076622e-05, + "loss": 109.7633, + "num_input_tokens_seen": 23115468, + "step": 453 + }, + { + "epoch": 0.044814623719318936, + "grad_norm": 29.294687271118164, + "learning_rate": 7.166994106090374e-05, + "loss": 108.4812, + "num_input_tokens_seen": 23287960, + "step": 456 + }, + { + "epoch": 0.04510945677010393, + "grad_norm": 28.74698257446289, + "learning_rate": 7.214145383104126e-05, + "loss": 108.3726, + "num_input_tokens_seen": 23436948, + "step": 459 + }, + { + "epoch": 0.04540428982088892, + "grad_norm": 47.22918701171875, + "learning_rate": 7.261296660117879e-05, + "loss": 109.4055, + "num_input_tokens_seen": 23600328, + "step": 462 + }, + { + "epoch": 0.045699122871673915, + "grad_norm": 31.876970291137695, + "learning_rate": 7.30844793713163e-05, + "loss": 107.6664, + "num_input_tokens_seen": 23745424, + "step": 465 + }, + { + "epoch": 0.04599395592245891, + "grad_norm": 35.708683013916016, + "learning_rate": 7.355599214145384e-05, + "loss": 108.5909, + "num_input_tokens_seen": 23903584, + "step": 468 + }, + { + "epoch": 0.0462887889732439, + "grad_norm": 28.689655303955078, + "learning_rate": 7.402750491159137e-05, + "loss": 108.6428, + "num_input_tokens_seen": 24052516, + "step": 471 + }, + { + "epoch": 0.046583622024028895, + "grad_norm": 29.488140106201172, + "learning_rate": 7.449901768172888e-05, + "loss": 106.9835, + "num_input_tokens_seen": 24211128, + "step": 474 + }, + { + "epoch": 0.046878455074813884, + "grad_norm": 38.3368034362793, + "learning_rate": 7.497053045186641e-05, + "loss": 108.5497, + "num_input_tokens_seen": 24363412, + "step": 477 + }, + { + "epoch": 0.04717328812559888, + "grad_norm": 30.385234832763672, + "learning_rate": 7.544204322200394e-05, + "loss": 106.6094, + "num_input_tokens_seen": 24524724, + "step": 480 + }, + { + "epoch": 0.047468121176383875, + "grad_norm": 73.74546813964844, + "learning_rate": 7.591355599214146e-05, + "loss": 107.275, + "num_input_tokens_seen": 24672488, + "step": 483 + }, + { + "epoch": 0.04776295422716886, + "grad_norm": 30.18967628479004, + "learning_rate": 7.638506876227899e-05, + "loss": 107.4118, + "num_input_tokens_seen": 24822048, + "step": 486 + }, + { + "epoch": 0.04805778727795386, + "grad_norm": 30.575702667236328, + "learning_rate": 7.685658153241651e-05, + "loss": 108.2546, + "num_input_tokens_seen": 24971528, + "step": 489 + }, + { + "epoch": 0.048352620328738855, + "grad_norm": 34.882266998291016, + "learning_rate": 7.732809430255404e-05, + "loss": 106.8039, + "num_input_tokens_seen": 25107552, + "step": 492 + }, + { + "epoch": 0.04864745337952384, + "grad_norm": 31.217613220214844, + "learning_rate": 7.779960707269156e-05, + "loss": 108.3255, + "num_input_tokens_seen": 25288080, + "step": 495 + }, + { + "epoch": 0.04894228643030884, + "grad_norm": 29.565536499023438, + "learning_rate": 7.827111984282908e-05, + "loss": 106.2473, + "num_input_tokens_seen": 25445284, + "step": 498 + }, + { + "epoch": 0.04923711948109383, + "grad_norm": 36.96455001831055, + "learning_rate": 7.874263261296661e-05, + "loss": 106.5716, + "num_input_tokens_seen": 25627308, + "step": 501 + }, + { + "epoch": 0.04953195253187882, + "grad_norm": 41.639625549316406, + "learning_rate": 7.921414538310413e-05, + "loss": 108.6796, + "num_input_tokens_seen": 25782124, + "step": 504 + }, + { + "epoch": 0.04982678558266382, + "grad_norm": 31.215431213378906, + "learning_rate": 7.968565815324166e-05, + "loss": 106.9381, + "num_input_tokens_seen": 25932328, + "step": 507 + }, + { + "epoch": 0.05012161863344881, + "grad_norm": 32.43290328979492, + "learning_rate": 8e-05, + "loss": 107.4913, + "num_input_tokens_seen": 26084420, + "step": 510 + }, + { + "epoch": 0.0504164516842338, + "grad_norm": 48.26959228515625, + "learning_rate": 8e-05, + "loss": 107.523, + "num_input_tokens_seen": 26251152, + "step": 513 + }, + { + "epoch": 0.0507112847350188, + "grad_norm": 35.35361099243164, + "learning_rate": 8e-05, + "loss": 108.4295, + "num_input_tokens_seen": 26406284, + "step": 516 + }, + { + "epoch": 0.05100611778580379, + "grad_norm": 37.7109260559082, + "learning_rate": 8e-05, + "loss": 106.0299, + "num_input_tokens_seen": 26544532, + "step": 519 + }, + { + "epoch": 0.05130095083658878, + "grad_norm": 31.95452880859375, + "learning_rate": 8e-05, + "loss": 107.3508, + "num_input_tokens_seen": 26697740, + "step": 522 + }, + { + "epoch": 0.05159578388737378, + "grad_norm": 33.43760681152344, + "learning_rate": 8e-05, + "loss": 105.9003, + "num_input_tokens_seen": 26854696, + "step": 525 + }, + { + "epoch": 0.051890616938158766, + "grad_norm": 34.20586013793945, + "learning_rate": 8e-05, + "loss": 107.3876, + "num_input_tokens_seen": 27009780, + "step": 528 + }, + { + "epoch": 0.05218544998894376, + "grad_norm": 31.921728134155273, + "learning_rate": 8e-05, + "loss": 105.3969, + "num_input_tokens_seen": 27150128, + "step": 531 + }, + { + "epoch": 0.05248028303972875, + "grad_norm": 36.0076789855957, + "learning_rate": 8e-05, + "loss": 106.9184, + "num_input_tokens_seen": 27304528, + "step": 534 + }, + { + "epoch": 0.052775116090513746, + "grad_norm": 29.839731216430664, + "learning_rate": 8e-05, + "loss": 105.1159, + "num_input_tokens_seen": 27450804, + "step": 537 + }, + { + "epoch": 0.05306994914129874, + "grad_norm": 33.60909652709961, + "learning_rate": 8e-05, + "loss": 104.5463, + "num_input_tokens_seen": 27611428, + "step": 540 + }, + { + "epoch": 0.05336478219208373, + "grad_norm": 30.84507179260254, + "learning_rate": 8e-05, + "loss": 103.293, + "num_input_tokens_seen": 27759640, + "step": 543 + }, + { + "epoch": 0.053659615242868726, + "grad_norm": 35.389583587646484, + "learning_rate": 8e-05, + "loss": 104.1917, + "num_input_tokens_seen": 27912976, + "step": 546 + }, + { + "epoch": 0.05395444829365372, + "grad_norm": 31.530858993530273, + "learning_rate": 8e-05, + "loss": 105.6748, + "num_input_tokens_seen": 28064152, + "step": 549 + }, + { + "epoch": 0.05424928134443871, + "grad_norm": 41.1005859375, + "learning_rate": 8e-05, + "loss": 107.298, + "num_input_tokens_seen": 28217516, + "step": 552 + }, + { + "epoch": 0.054544114395223706, + "grad_norm": 61.542354583740234, + "learning_rate": 8e-05, + "loss": 106.5471, + "num_input_tokens_seen": 28356428, + "step": 555 + }, + { + "epoch": 0.054838947446008694, + "grad_norm": 46.59098434448242, + "learning_rate": 8e-05, + "loss": 104.5256, + "num_input_tokens_seen": 28529116, + "step": 558 + }, + { + "epoch": 0.05513378049679369, + "grad_norm": 31.784414291381836, + "learning_rate": 8e-05, + "loss": 107.8473, + "num_input_tokens_seen": 28697036, + "step": 561 + }, + { + "epoch": 0.055428613547578685, + "grad_norm": 35.911521911621094, + "learning_rate": 8e-05, + "loss": 104.2456, + "num_input_tokens_seen": 28851120, + "step": 564 + }, + { + "epoch": 0.055723446598363674, + "grad_norm": 28.5731201171875, + "learning_rate": 8e-05, + "loss": 103.9228, + "num_input_tokens_seen": 28991428, + "step": 567 + }, + { + "epoch": 0.05601827964914867, + "grad_norm": 33.67176055908203, + "learning_rate": 8e-05, + "loss": 105.1696, + "num_input_tokens_seen": 29153744, + "step": 570 + }, + { + "epoch": 0.056313112699933665, + "grad_norm": 30.27126121520996, + "learning_rate": 8e-05, + "loss": 104.0718, + "num_input_tokens_seen": 29291672, + "step": 573 + }, + { + "epoch": 0.056607945750718654, + "grad_norm": 32.40224075317383, + "learning_rate": 8e-05, + "loss": 105.3532, + "num_input_tokens_seen": 29444228, + "step": 576 + }, + { + "epoch": 0.05690277880150365, + "grad_norm": 32.94948959350586, + "learning_rate": 8e-05, + "loss": 105.5668, + "num_input_tokens_seen": 29614308, + "step": 579 + }, + { + "epoch": 0.057197611852288645, + "grad_norm": 36.907508850097656, + "learning_rate": 8e-05, + "loss": 104.2329, + "num_input_tokens_seen": 29782636, + "step": 582 + }, + { + "epoch": 0.05749244490307363, + "grad_norm": 32.03764343261719, + "learning_rate": 8e-05, + "loss": 106.6867, + "num_input_tokens_seen": 29928448, + "step": 585 + }, + { + "epoch": 0.05778727795385863, + "grad_norm": 36.27505874633789, + "learning_rate": 8e-05, + "loss": 103.9458, + "num_input_tokens_seen": 30081348, + "step": 588 + }, + { + "epoch": 0.05808211100464362, + "grad_norm": 48.598350524902344, + "learning_rate": 8e-05, + "loss": 104.8759, + "num_input_tokens_seen": 30230504, + "step": 591 + }, + { + "epoch": 0.05837694405542861, + "grad_norm": 32.41371536254883, + "learning_rate": 8e-05, + "loss": 103.2528, + "num_input_tokens_seen": 30384920, + "step": 594 + }, + { + "epoch": 0.05867177710621361, + "grad_norm": 28.43147087097168, + "learning_rate": 8e-05, + "loss": 102.9143, + "num_input_tokens_seen": 30538908, + "step": 597 + }, + { + "epoch": 0.0589666101569986, + "grad_norm": 28.21474838256836, + "learning_rate": 8e-05, + "loss": 104.1305, + "num_input_tokens_seen": 30696424, + "step": 600 + }, + { + "epoch": 0.05926144320778359, + "grad_norm": 33.953983306884766, + "learning_rate": 8e-05, + "loss": 105.6636, + "num_input_tokens_seen": 30848860, + "step": 603 + }, + { + "epoch": 0.05955627625856859, + "grad_norm": 27.20724868774414, + "learning_rate": 8e-05, + "loss": 102.2961, + "num_input_tokens_seen": 30997372, + "step": 606 + }, + { + "epoch": 0.05985110930935358, + "grad_norm": 33.98468780517578, + "learning_rate": 8e-05, + "loss": 103.787, + "num_input_tokens_seen": 31157788, + "step": 609 + }, + { + "epoch": 0.06014594236013857, + "grad_norm": 40.239501953125, + "learning_rate": 8e-05, + "loss": 104.2993, + "num_input_tokens_seen": 31325552, + "step": 612 + }, + { + "epoch": 0.06044077541092356, + "grad_norm": 32.588470458984375, + "learning_rate": 8e-05, + "loss": 104.0729, + "num_input_tokens_seen": 31466752, + "step": 615 + }, + { + "epoch": 0.06073560846170856, + "grad_norm": 37.641700744628906, + "learning_rate": 8e-05, + "loss": 103.5312, + "num_input_tokens_seen": 31636332, + "step": 618 + }, + { + "epoch": 0.06103044151249355, + "grad_norm": 31.127992630004883, + "learning_rate": 8e-05, + "loss": 104.2723, + "num_input_tokens_seen": 31781852, + "step": 621 + }, + { + "epoch": 0.06132527456327854, + "grad_norm": 28.448110580444336, + "learning_rate": 8e-05, + "loss": 104.1347, + "num_input_tokens_seen": 31946376, + "step": 624 + }, + { + "epoch": 0.061620107614063536, + "grad_norm": 29.03575897216797, + "learning_rate": 8e-05, + "loss": 102.5833, + "num_input_tokens_seen": 32108392, + "step": 627 + }, + { + "epoch": 0.06191494066484853, + "grad_norm": 37.53111267089844, + "learning_rate": 8e-05, + "loss": 105.0999, + "num_input_tokens_seen": 32269168, + "step": 630 + }, + { + "epoch": 0.06220977371563352, + "grad_norm": 35.30291748046875, + "learning_rate": 8e-05, + "loss": 102.7633, + "num_input_tokens_seen": 32426728, + "step": 633 + }, + { + "epoch": 0.06250460676641852, + "grad_norm": 63.35116195678711, + "learning_rate": 8e-05, + "loss": 105.0097, + "num_input_tokens_seen": 32592400, + "step": 636 + }, + { + "epoch": 0.06279943981720351, + "grad_norm": 28.85284996032715, + "learning_rate": 8e-05, + "loss": 103.9038, + "num_input_tokens_seen": 32760464, + "step": 639 + }, + { + "epoch": 0.06309427286798851, + "grad_norm": 29.186050415039062, + "learning_rate": 8e-05, + "loss": 102.84, + "num_input_tokens_seen": 32899168, + "step": 642 + }, + { + "epoch": 0.06338910591877349, + "grad_norm": 35.00688552856445, + "learning_rate": 8e-05, + "loss": 102.888, + "num_input_tokens_seen": 33040460, + "step": 645 + }, + { + "epoch": 0.06368393896955848, + "grad_norm": 31.484594345092773, + "learning_rate": 8e-05, + "loss": 101.2444, + "num_input_tokens_seen": 33191032, + "step": 648 + }, + { + "epoch": 0.06397877202034348, + "grad_norm": 28.048425674438477, + "learning_rate": 8e-05, + "loss": 100.0415, + "num_input_tokens_seen": 33344604, + "step": 651 + }, + { + "epoch": 0.06427360507112848, + "grad_norm": 27.901996612548828, + "learning_rate": 8e-05, + "loss": 102.9519, + "num_input_tokens_seen": 33494576, + "step": 654 + }, + { + "epoch": 0.06456843812191347, + "grad_norm": 33.018165588378906, + "learning_rate": 8e-05, + "loss": 102.6747, + "num_input_tokens_seen": 33648388, + "step": 657 + }, + { + "epoch": 0.06486327117269845, + "grad_norm": 29.498411178588867, + "learning_rate": 8e-05, + "loss": 104.4521, + "num_input_tokens_seen": 33772076, + "step": 660 + }, + { + "epoch": 0.06515810422348345, + "grad_norm": 28.262418746948242, + "learning_rate": 8e-05, + "loss": 101.4761, + "num_input_tokens_seen": 33922880, + "step": 663 + }, + { + "epoch": 0.06545293727426844, + "grad_norm": 28.457019805908203, + "learning_rate": 8e-05, + "loss": 102.9336, + "num_input_tokens_seen": 34087612, + "step": 666 + }, + { + "epoch": 0.06574777032505344, + "grad_norm": 31.7344970703125, + "learning_rate": 8e-05, + "loss": 101.4416, + "num_input_tokens_seen": 34246408, + "step": 669 + }, + { + "epoch": 0.06604260337583844, + "grad_norm": 30.76515007019043, + "learning_rate": 8e-05, + "loss": 102.94, + "num_input_tokens_seen": 34396300, + "step": 672 + }, + { + "epoch": 0.06633743642662343, + "grad_norm": 29.8140811920166, + "learning_rate": 8e-05, + "loss": 102.4964, + "num_input_tokens_seen": 34566084, + "step": 675 + }, + { + "epoch": 0.06663226947740841, + "grad_norm": 38.429473876953125, + "learning_rate": 8e-05, + "loss": 100.1495, + "num_input_tokens_seen": 34711580, + "step": 678 + }, + { + "epoch": 0.06692710252819341, + "grad_norm": 35.76615905761719, + "learning_rate": 8e-05, + "loss": 101.2931, + "num_input_tokens_seen": 34862992, + "step": 681 + }, + { + "epoch": 0.0672219355789784, + "grad_norm": 29.05952262878418, + "learning_rate": 8e-05, + "loss": 101.0068, + "num_input_tokens_seen": 35016092, + "step": 684 + }, + { + "epoch": 0.0675167686297634, + "grad_norm": 28.329811096191406, + "learning_rate": 8e-05, + "loss": 105.2395, + "num_input_tokens_seen": 35200604, + "step": 687 + }, + { + "epoch": 0.0678116016805484, + "grad_norm": 27.511215209960938, + "learning_rate": 8e-05, + "loss": 102.3719, + "num_input_tokens_seen": 35339672, + "step": 690 + }, + { + "epoch": 0.06810643473133338, + "grad_norm": 29.413833618164062, + "learning_rate": 8e-05, + "loss": 101.1614, + "num_input_tokens_seen": 35488832, + "step": 693 + }, + { + "epoch": 0.06840126778211837, + "grad_norm": 33.835548400878906, + "learning_rate": 8e-05, + "loss": 101.2409, + "num_input_tokens_seen": 35640116, + "step": 696 + }, + { + "epoch": 0.06869610083290337, + "grad_norm": 37.69698715209961, + "learning_rate": 8e-05, + "loss": 101.9321, + "num_input_tokens_seen": 35830276, + "step": 699 + }, + { + "epoch": 0.06899093388368836, + "grad_norm": 34.55872344970703, + "learning_rate": 8e-05, + "loss": 101.7172, + "num_input_tokens_seen": 35986216, + "step": 702 + }, + { + "epoch": 0.06928576693447336, + "grad_norm": 38.171295166015625, + "learning_rate": 8e-05, + "loss": 100.8534, + "num_input_tokens_seen": 36136840, + "step": 705 + }, + { + "epoch": 0.06958059998525835, + "grad_norm": 30.427322387695312, + "learning_rate": 8e-05, + "loss": 101.2733, + "num_input_tokens_seen": 36307732, + "step": 708 + }, + { + "epoch": 0.06987543303604334, + "grad_norm": 33.20026779174805, + "learning_rate": 8e-05, + "loss": 100.3995, + "num_input_tokens_seen": 36468544, + "step": 711 + }, + { + "epoch": 0.07017026608682833, + "grad_norm": 28.34000015258789, + "learning_rate": 8e-05, + "loss": 100.1351, + "num_input_tokens_seen": 36613340, + "step": 714 + }, + { + "epoch": 0.07046509913761333, + "grad_norm": 31.049711227416992, + "learning_rate": 8e-05, + "loss": 99.2558, + "num_input_tokens_seen": 36772992, + "step": 717 + }, + { + "epoch": 0.07075993218839832, + "grad_norm": 28.40256118774414, + "learning_rate": 8e-05, + "loss": 98.7879, + "num_input_tokens_seen": 36917312, + "step": 720 + }, + { + "epoch": 0.07105476523918332, + "grad_norm": 35.21345901489258, + "learning_rate": 8e-05, + "loss": 99.0775, + "num_input_tokens_seen": 37056508, + "step": 723 + }, + { + "epoch": 0.0713495982899683, + "grad_norm": 28.639978408813477, + "learning_rate": 8e-05, + "loss": 100.1574, + "num_input_tokens_seen": 37209684, + "step": 726 + }, + { + "epoch": 0.0716444313407533, + "grad_norm": 35.251808166503906, + "learning_rate": 8e-05, + "loss": 100.5594, + "num_input_tokens_seen": 37374472, + "step": 729 + }, + { + "epoch": 0.07193926439153829, + "grad_norm": 40.02030563354492, + "learning_rate": 8e-05, + "loss": 97.8263, + "num_input_tokens_seen": 37511060, + "step": 732 + }, + { + "epoch": 0.07223409744232329, + "grad_norm": 31.680225372314453, + "learning_rate": 8e-05, + "loss": 98.7203, + "num_input_tokens_seen": 37642500, + "step": 735 + }, + { + "epoch": 0.07252893049310828, + "grad_norm": 31.110706329345703, + "learning_rate": 8e-05, + "loss": 102.0147, + "num_input_tokens_seen": 37799960, + "step": 738 + }, + { + "epoch": 0.07282376354389328, + "grad_norm": 29.42256736755371, + "learning_rate": 8e-05, + "loss": 98.3472, + "num_input_tokens_seen": 37949436, + "step": 741 + }, + { + "epoch": 0.07311859659467826, + "grad_norm": 35.685115814208984, + "learning_rate": 8e-05, + "loss": 99.096, + "num_input_tokens_seen": 38114716, + "step": 744 + }, + { + "epoch": 0.07341342964546325, + "grad_norm": 31.129684448242188, + "learning_rate": 8e-05, + "loss": 99.8503, + "num_input_tokens_seen": 38266548, + "step": 747 + }, + { + "epoch": 0.07370826269624825, + "grad_norm": 67.06864166259766, + "learning_rate": 8e-05, + "loss": 99.9585, + "num_input_tokens_seen": 38421392, + "step": 750 + }, + { + "epoch": 0.07400309574703325, + "grad_norm": 28.124284744262695, + "learning_rate": 8e-05, + "loss": 98.4267, + "num_input_tokens_seen": 38577988, + "step": 753 + }, + { + "epoch": 0.07429792879781824, + "grad_norm": 31.328781127929688, + "learning_rate": 8e-05, + "loss": 100.2827, + "num_input_tokens_seen": 38753868, + "step": 756 + }, + { + "epoch": 0.07459276184860322, + "grad_norm": 31.482263565063477, + "learning_rate": 8e-05, + "loss": 100.3612, + "num_input_tokens_seen": 38919460, + "step": 759 + }, + { + "epoch": 0.07488759489938822, + "grad_norm": 28.848894119262695, + "learning_rate": 8e-05, + "loss": 98.9799, + "num_input_tokens_seen": 39051280, + "step": 762 + }, + { + "epoch": 0.07518242795017321, + "grad_norm": 29.552709579467773, + "learning_rate": 8e-05, + "loss": 95.8027, + "num_input_tokens_seen": 39223880, + "step": 765 + }, + { + "epoch": 0.07547726100095821, + "grad_norm": 32.752559661865234, + "learning_rate": 8e-05, + "loss": 96.3181, + "num_input_tokens_seen": 39388972, + "step": 768 + }, + { + "epoch": 0.0757720940517432, + "grad_norm": 27.956424713134766, + "learning_rate": 8e-05, + "loss": 98.5633, + "num_input_tokens_seen": 39524604, + "step": 771 + }, + { + "epoch": 0.07606692710252819, + "grad_norm": 28.682424545288086, + "learning_rate": 8e-05, + "loss": 97.9849, + "num_input_tokens_seen": 39667896, + "step": 774 + }, + { + "epoch": 0.07636176015331318, + "grad_norm": 30.985292434692383, + "learning_rate": 8e-05, + "loss": 98.5518, + "num_input_tokens_seen": 39840608, + "step": 777 + }, + { + "epoch": 0.07665659320409818, + "grad_norm": 34.598148345947266, + "learning_rate": 8e-05, + "loss": 98.3472, + "num_input_tokens_seen": 39975948, + "step": 780 + }, + { + "epoch": 0.07695142625488317, + "grad_norm": 29.500808715820312, + "learning_rate": 8e-05, + "loss": 98.7694, + "num_input_tokens_seen": 40135764, + "step": 783 + }, + { + "epoch": 0.07724625930566817, + "grad_norm": 29.20780372619629, + "learning_rate": 8e-05, + "loss": 99.1242, + "num_input_tokens_seen": 40282812, + "step": 786 + }, + { + "epoch": 0.07754109235645316, + "grad_norm": 31.628633499145508, + "learning_rate": 8e-05, + "loss": 97.8333, + "num_input_tokens_seen": 40432408, + "step": 789 + }, + { + "epoch": 0.07783592540723815, + "grad_norm": 38.96535873413086, + "learning_rate": 8e-05, + "loss": 97.465, + "num_input_tokens_seen": 40591640, + "step": 792 + }, + { + "epoch": 0.07813075845802314, + "grad_norm": 30.278688430786133, + "learning_rate": 8e-05, + "loss": 96.5388, + "num_input_tokens_seen": 40745628, + "step": 795 + }, + { + "epoch": 0.07842559150880814, + "grad_norm": 34.10064697265625, + "learning_rate": 8e-05, + "loss": 98.7074, + "num_input_tokens_seen": 40892836, + "step": 798 + }, + { + "epoch": 0.07872042455959313, + "grad_norm": 34.84008026123047, + "learning_rate": 8e-05, + "loss": 98.8112, + "num_input_tokens_seen": 41044032, + "step": 801 + }, + { + "epoch": 0.07901525761037813, + "grad_norm": 29.791622161865234, + "learning_rate": 8e-05, + "loss": 96.3578, + "num_input_tokens_seen": 41189400, + "step": 804 + }, + { + "epoch": 0.07931009066116311, + "grad_norm": 29.386751174926758, + "learning_rate": 8e-05, + "loss": 97.5173, + "num_input_tokens_seen": 41345100, + "step": 807 + }, + { + "epoch": 0.0796049237119481, + "grad_norm": 27.887340545654297, + "learning_rate": 8e-05, + "loss": 97.5752, + "num_input_tokens_seen": 41487676, + "step": 810 + }, + { + "epoch": 0.0798997567627331, + "grad_norm": 29.79248046875, + "learning_rate": 8e-05, + "loss": 96.6194, + "num_input_tokens_seen": 41651120, + "step": 813 + }, + { + "epoch": 0.0801945898135181, + "grad_norm": 30.108230590820312, + "learning_rate": 8e-05, + "loss": 96.5995, + "num_input_tokens_seen": 41806768, + "step": 816 + }, + { + "epoch": 0.08048942286430309, + "grad_norm": 32.913536071777344, + "learning_rate": 8e-05, + "loss": 95.4259, + "num_input_tokens_seen": 41950708, + "step": 819 + }, + { + "epoch": 0.08078425591508809, + "grad_norm": 35.09928512573242, + "learning_rate": 8e-05, + "loss": 97.372, + "num_input_tokens_seen": 42101944, + "step": 822 + }, + { + "epoch": 0.08107908896587307, + "grad_norm": 36.4766845703125, + "learning_rate": 8e-05, + "loss": 96.4494, + "num_input_tokens_seen": 42244600, + "step": 825 + }, + { + "epoch": 0.08137392201665807, + "grad_norm": 43.9494743347168, + "learning_rate": 8e-05, + "loss": 96.8876, + "num_input_tokens_seen": 42402116, + "step": 828 + }, + { + "epoch": 0.08166875506744306, + "grad_norm": 31.989990234375, + "learning_rate": 8e-05, + "loss": 95.472, + "num_input_tokens_seen": 42570484, + "step": 831 + }, + { + "epoch": 0.08196358811822806, + "grad_norm": 27.526823043823242, + "learning_rate": 8e-05, + "loss": 94.3046, + "num_input_tokens_seen": 42715820, + "step": 834 + }, + { + "epoch": 0.08225842116901305, + "grad_norm": 29.665773391723633, + "learning_rate": 8e-05, + "loss": 96.0409, + "num_input_tokens_seen": 42885048, + "step": 837 + }, + { + "epoch": 0.08255325421979803, + "grad_norm": 33.27475357055664, + "learning_rate": 8e-05, + "loss": 94.5064, + "num_input_tokens_seen": 43037472, + "step": 840 + }, + { + "epoch": 0.08284808727058303, + "grad_norm": 27.184389114379883, + "learning_rate": 8e-05, + "loss": 95.9627, + "num_input_tokens_seen": 43207080, + "step": 843 + }, + { + "epoch": 0.08314292032136802, + "grad_norm": 30.488483428955078, + "learning_rate": 8e-05, + "loss": 95.5, + "num_input_tokens_seen": 43342624, + "step": 846 + }, + { + "epoch": 0.08343775337215302, + "grad_norm": 30.834781646728516, + "learning_rate": 8e-05, + "loss": 96.9813, + "num_input_tokens_seen": 43479084, + "step": 849 + }, + { + "epoch": 0.08373258642293802, + "grad_norm": 31.856342315673828, + "learning_rate": 8e-05, + "loss": 98.9376, + "num_input_tokens_seen": 43625832, + "step": 852 + }, + { + "epoch": 0.08402741947372301, + "grad_norm": 35.55719757080078, + "learning_rate": 8e-05, + "loss": 97.2659, + "num_input_tokens_seen": 43778084, + "step": 855 + }, + { + "epoch": 0.08432225252450799, + "grad_norm": 29.947450637817383, + "learning_rate": 8e-05, + "loss": 95.7474, + "num_input_tokens_seen": 43922396, + "step": 858 + }, + { + "epoch": 0.08461708557529299, + "grad_norm": 31.373952865600586, + "learning_rate": 8e-05, + "loss": 94.7743, + "num_input_tokens_seen": 44085324, + "step": 861 + }, + { + "epoch": 0.08491191862607798, + "grad_norm": 30.180021286010742, + "learning_rate": 8e-05, + "loss": 96.9605, + "num_input_tokens_seen": 44230488, + "step": 864 + }, + { + "epoch": 0.08520675167686298, + "grad_norm": 29.01386260986328, + "learning_rate": 8e-05, + "loss": 93.7627, + "num_input_tokens_seen": 44390288, + "step": 867 + }, + { + "epoch": 0.08550158472764798, + "grad_norm": 29.978891372680664, + "learning_rate": 8e-05, + "loss": 95.4733, + "num_input_tokens_seen": 44548192, + "step": 870 + }, + { + "epoch": 0.08579641777843296, + "grad_norm": 30.313705444335938, + "learning_rate": 8e-05, + "loss": 94.0545, + "num_input_tokens_seen": 44690768, + "step": 873 + }, + { + "epoch": 0.08609125082921795, + "grad_norm": 32.91279220581055, + "learning_rate": 8e-05, + "loss": 96.173, + "num_input_tokens_seen": 44834448, + "step": 876 + }, + { + "epoch": 0.08638608388000295, + "grad_norm": 28.27375602722168, + "learning_rate": 8e-05, + "loss": 94.625, + "num_input_tokens_seen": 44980220, + "step": 879 + }, + { + "epoch": 0.08668091693078794, + "grad_norm": 29.834308624267578, + "learning_rate": 8e-05, + "loss": 95.7682, + "num_input_tokens_seen": 45130548, + "step": 882 + }, + { + "epoch": 0.08697574998157294, + "grad_norm": 30.992219924926758, + "learning_rate": 8e-05, + "loss": 95.9339, + "num_input_tokens_seen": 45285992, + "step": 885 + }, + { + "epoch": 0.08727058303235792, + "grad_norm": 29.38202476501465, + "learning_rate": 8e-05, + "loss": 94.346, + "num_input_tokens_seen": 45423456, + "step": 888 + }, + { + "epoch": 0.08756541608314292, + "grad_norm": 30.09346580505371, + "learning_rate": 8e-05, + "loss": 96.734, + "num_input_tokens_seen": 45606976, + "step": 891 + }, + { + "epoch": 0.08786024913392791, + "grad_norm": 30.944683074951172, + "learning_rate": 8e-05, + "loss": 94.5057, + "num_input_tokens_seen": 45742436, + "step": 894 + }, + { + "epoch": 0.08815508218471291, + "grad_norm": 32.21797561645508, + "learning_rate": 8e-05, + "loss": 95.2261, + "num_input_tokens_seen": 45891856, + "step": 897 + }, + { + "epoch": 0.0884499152354979, + "grad_norm": 36.828975677490234, + "learning_rate": 8e-05, + "loss": 93.5124, + "num_input_tokens_seen": 46030736, + "step": 900 + }, + { + "epoch": 0.0887447482862829, + "grad_norm": 32.43253707885742, + "learning_rate": 8e-05, + "loss": 94.0418, + "num_input_tokens_seen": 46180516, + "step": 903 + }, + { + "epoch": 0.08903958133706788, + "grad_norm": 30.175016403198242, + "learning_rate": 8e-05, + "loss": 93.9941, + "num_input_tokens_seen": 46330568, + "step": 906 + }, + { + "epoch": 0.08933441438785288, + "grad_norm": 31.575355529785156, + "learning_rate": 8e-05, + "loss": 94.8749, + "num_input_tokens_seen": 46485964, + "step": 909 + }, + { + "epoch": 0.08962924743863787, + "grad_norm": 30.174617767333984, + "learning_rate": 8e-05, + "loss": 93.3188, + "num_input_tokens_seen": 46641304, + "step": 912 + }, + { + "epoch": 0.08992408048942287, + "grad_norm": 35.43874740600586, + "learning_rate": 8e-05, + "loss": 92.9305, + "num_input_tokens_seen": 46796272, + "step": 915 + }, + { + "epoch": 0.09021891354020786, + "grad_norm": 31.57838249206543, + "learning_rate": 8e-05, + "loss": 96.0303, + "num_input_tokens_seen": 46941040, + "step": 918 + }, + { + "epoch": 0.09051374659099284, + "grad_norm": 30.171436309814453, + "learning_rate": 8e-05, + "loss": 94.1278, + "num_input_tokens_seen": 47111964, + "step": 921 + }, + { + "epoch": 0.09080857964177784, + "grad_norm": 29.509170532226562, + "learning_rate": 8e-05, + "loss": 94.6481, + "num_input_tokens_seen": 47280680, + "step": 924 + }, + { + "epoch": 0.09110341269256284, + "grad_norm": 30.14413070678711, + "learning_rate": 8e-05, + "loss": 93.1313, + "num_input_tokens_seen": 47427784, + "step": 927 + }, + { + "epoch": 0.09139824574334783, + "grad_norm": 30.193668365478516, + "learning_rate": 8e-05, + "loss": 92.5065, + "num_input_tokens_seen": 47585472, + "step": 930 + }, + { + "epoch": 0.09169307879413283, + "grad_norm": 34.98398971557617, + "learning_rate": 8e-05, + "loss": 93.7416, + "num_input_tokens_seen": 47750156, + "step": 933 + }, + { + "epoch": 0.09198791184491782, + "grad_norm": 30.99966812133789, + "learning_rate": 8e-05, + "loss": 91.2385, + "num_input_tokens_seen": 47898952, + "step": 936 + }, + { + "epoch": 0.0922827448957028, + "grad_norm": 30.56644058227539, + "learning_rate": 8e-05, + "loss": 94.2638, + "num_input_tokens_seen": 48053704, + "step": 939 + }, + { + "epoch": 0.0925775779464878, + "grad_norm": 29.665430068969727, + "learning_rate": 8e-05, + "loss": 91.8333, + "num_input_tokens_seen": 48212428, + "step": 942 + }, + { + "epoch": 0.0928724109972728, + "grad_norm": 31.677806854248047, + "learning_rate": 8e-05, + "loss": 93.413, + "num_input_tokens_seen": 48372056, + "step": 945 + }, + { + "epoch": 0.09316724404805779, + "grad_norm": 29.71071434020996, + "learning_rate": 8e-05, + "loss": 92.5726, + "num_input_tokens_seen": 48551512, + "step": 948 + }, + { + "epoch": 0.09346207709884279, + "grad_norm": 30.616531372070312, + "learning_rate": 8e-05, + "loss": 92.5297, + "num_input_tokens_seen": 48708300, + "step": 951 + }, + { + "epoch": 0.09375691014962777, + "grad_norm": 31.245548248291016, + "learning_rate": 8e-05, + "loss": 90.3821, + "num_input_tokens_seen": 48869976, + "step": 954 + }, + { + "epoch": 0.09405174320041276, + "grad_norm": 32.1124382019043, + "learning_rate": 8e-05, + "loss": 92.5386, + "num_input_tokens_seen": 49009364, + "step": 957 + }, + { + "epoch": 0.09434657625119776, + "grad_norm": 194.75343322753906, + "learning_rate": 8e-05, + "loss": 94.1239, + "num_input_tokens_seen": 49169552, + "step": 960 + }, + { + "epoch": 0.09464140930198275, + "grad_norm": 32.542686462402344, + "learning_rate": 8e-05, + "loss": 94.3201, + "num_input_tokens_seen": 49326648, + "step": 963 + }, + { + "epoch": 0.09493624235276775, + "grad_norm": 85.58460235595703, + "learning_rate": 8e-05, + "loss": 92.559, + "num_input_tokens_seen": 49473464, + "step": 966 + }, + { + "epoch": 0.09523107540355275, + "grad_norm": 31.504518508911133, + "learning_rate": 8e-05, + "loss": 94.1152, + "num_input_tokens_seen": 49641688, + "step": 969 + }, + { + "epoch": 0.09552590845433773, + "grad_norm": 31.837738037109375, + "learning_rate": 8e-05, + "loss": 94.4699, + "num_input_tokens_seen": 49807428, + "step": 972 + }, + { + "epoch": 0.09582074150512272, + "grad_norm": 28.046907424926758, + "learning_rate": 8e-05, + "loss": 92.2552, + "num_input_tokens_seen": 49960180, + "step": 975 + }, + { + "epoch": 0.09611557455590772, + "grad_norm": 31.577808380126953, + "learning_rate": 8e-05, + "loss": 92.3609, + "num_input_tokens_seen": 50130040, + "step": 978 + }, + { + "epoch": 0.09641040760669271, + "grad_norm": 30.214200973510742, + "learning_rate": 8e-05, + "loss": 91.6302, + "num_input_tokens_seen": 50293532, + "step": 981 + }, + { + "epoch": 0.09670524065747771, + "grad_norm": 34.3266487121582, + "learning_rate": 8e-05, + "loss": 91.4269, + "num_input_tokens_seen": 50461436, + "step": 984 + }, + { + "epoch": 0.09700007370826269, + "grad_norm": 35.89109802246094, + "learning_rate": 8e-05, + "loss": 91.2092, + "num_input_tokens_seen": 50615044, + "step": 987 + }, + { + "epoch": 0.09729490675904769, + "grad_norm": 35.1023063659668, + "learning_rate": 8e-05, + "loss": 92.4097, + "num_input_tokens_seen": 50776320, + "step": 990 + }, + { + "epoch": 0.09758973980983268, + "grad_norm": 34.04882049560547, + "learning_rate": 8e-05, + "loss": 90.7081, + "num_input_tokens_seen": 50938464, + "step": 993 + }, + { + "epoch": 0.09788457286061768, + "grad_norm": 35.52146911621094, + "learning_rate": 8e-05, + "loss": 90.209, + "num_input_tokens_seen": 51085308, + "step": 996 + }, + { + "epoch": 0.09817940591140267, + "grad_norm": 38.20060729980469, + "learning_rate": 8e-05, + "loss": 90.1752, + "num_input_tokens_seen": 51250996, + "step": 999 + }, + { + "epoch": 0.09827768359499767, + "eval_gen_len": 69.32, + "eval_loss": 5.634235382080078, + "eval_rouge1": 16.5561, + "eval_rouge2": 3.2961, + "eval_rougeL": 14.7126, + "eval_rougeLsum": 14.7712, + "eval_runtime": 223.2645, + "eval_samples_per_second": 0.896, + "eval_steps_per_second": 0.224, + "num_input_tokens_seen": 51291360, + "step": 1000 + }, + { + "epoch": 0.09847423896218765, + "grad_norm": 32.12675857543945, + "learning_rate": 8e-05, + "loss": 91.8453, + "num_input_tokens_seen": 51385280, + "step": 1002 + }, + { + "epoch": 0.09876907201297265, + "grad_norm": 35.18289566040039, + "learning_rate": 8e-05, + "loss": 90.7393, + "num_input_tokens_seen": 51541768, + "step": 1005 + }, + { + "epoch": 0.09906390506375765, + "grad_norm": 33.29011535644531, + "learning_rate": 8e-05, + "loss": 89.8114, + "num_input_tokens_seen": 51695044, + "step": 1008 + }, + { + "epoch": 0.09935873811454264, + "grad_norm": 33.36168670654297, + "learning_rate": 8e-05, + "loss": 91.4891, + "num_input_tokens_seen": 51853280, + "step": 1011 + }, + { + "epoch": 0.09965357116532764, + "grad_norm": 33.3763542175293, + "learning_rate": 8e-05, + "loss": 90.593, + "num_input_tokens_seen": 52000664, + "step": 1014 + }, + { + "epoch": 0.09994840421611263, + "grad_norm": 36.2030029296875, + "learning_rate": 8e-05, + "loss": 90.8745, + "num_input_tokens_seen": 52158944, + "step": 1017 + }, + { + "epoch": 0.10024323726689761, + "grad_norm": 33.890750885009766, + "learning_rate": 8e-05, + "loss": 91.0693, + "num_input_tokens_seen": 52313252, + "step": 1020 + }, + { + "epoch": 0.10053807031768261, + "grad_norm": 30.91986846923828, + "learning_rate": 8e-05, + "loss": 88.7983, + "num_input_tokens_seen": 52486164, + "step": 1023 + }, + { + "epoch": 0.1008329033684676, + "grad_norm": 36.78636932373047, + "learning_rate": 8e-05, + "loss": 88.6159, + "num_input_tokens_seen": 52637680, + "step": 1026 + }, + { + "epoch": 0.1011277364192526, + "grad_norm": 35.55237579345703, + "learning_rate": 8e-05, + "loss": 88.7181, + "num_input_tokens_seen": 52779392, + "step": 1029 + }, + { + "epoch": 0.1014225694700376, + "grad_norm": 31.026241302490234, + "learning_rate": 8e-05, + "loss": 88.8933, + "num_input_tokens_seen": 52942088, + "step": 1032 + }, + { + "epoch": 0.10171740252082258, + "grad_norm": 102.04617309570312, + "learning_rate": 8e-05, + "loss": 88.6628, + "num_input_tokens_seen": 53095664, + "step": 1035 + }, + { + "epoch": 0.10201223557160757, + "grad_norm": 35.890018463134766, + "learning_rate": 8e-05, + "loss": 89.8409, + "num_input_tokens_seen": 53253208, + "step": 1038 + }, + { + "epoch": 0.10230706862239257, + "grad_norm": 32.98229217529297, + "learning_rate": 8e-05, + "loss": 90.546, + "num_input_tokens_seen": 53402820, + "step": 1041 + }, + { + "epoch": 0.10260190167317756, + "grad_norm": 53.527061462402344, + "learning_rate": 8e-05, + "loss": 89.9919, + "num_input_tokens_seen": 53538824, + "step": 1044 + }, + { + "epoch": 0.10289673472396256, + "grad_norm": 34.09297180175781, + "learning_rate": 8e-05, + "loss": 89.4842, + "num_input_tokens_seen": 53684448, + "step": 1047 + }, + { + "epoch": 0.10319156777474756, + "grad_norm": 32.06243133544922, + "learning_rate": 8e-05, + "loss": 89.727, + "num_input_tokens_seen": 53865824, + "step": 1050 + }, + { + "epoch": 0.10348640082553254, + "grad_norm": 37.76670455932617, + "learning_rate": 8e-05, + "loss": 89.0664, + "num_input_tokens_seen": 54009924, + "step": 1053 + }, + { + "epoch": 0.10378123387631753, + "grad_norm": 31.572418212890625, + "learning_rate": 8e-05, + "loss": 90.8275, + "num_input_tokens_seen": 54159500, + "step": 1056 + }, + { + "epoch": 0.10407606692710253, + "grad_norm": 40.10283660888672, + "learning_rate": 8e-05, + "loss": 90.409, + "num_input_tokens_seen": 54303812, + "step": 1059 + }, + { + "epoch": 0.10437089997788752, + "grad_norm": 41.28361129760742, + "learning_rate": 8e-05, + "loss": 90.1141, + "num_input_tokens_seen": 54455676, + "step": 1062 + }, + { + "epoch": 0.10466573302867252, + "grad_norm": 35.48882293701172, + "learning_rate": 8e-05, + "loss": 89.0017, + "num_input_tokens_seen": 54620984, + "step": 1065 + }, + { + "epoch": 0.1049605660794575, + "grad_norm": 44.871646881103516, + "learning_rate": 8e-05, + "loss": 87.272, + "num_input_tokens_seen": 54795340, + "step": 1068 + }, + { + "epoch": 0.1052553991302425, + "grad_norm": 37.673038482666016, + "learning_rate": 8e-05, + "loss": 89.4379, + "num_input_tokens_seen": 54950136, + "step": 1071 + }, + { + "epoch": 0.10555023218102749, + "grad_norm": 35.36658477783203, + "learning_rate": 8e-05, + "loss": 88.0971, + "num_input_tokens_seen": 55101460, + "step": 1074 + }, + { + "epoch": 0.10584506523181249, + "grad_norm": 35.320072174072266, + "learning_rate": 8e-05, + "loss": 89.0408, + "num_input_tokens_seen": 55264840, + "step": 1077 + }, + { + "epoch": 0.10613989828259748, + "grad_norm": 39.10199737548828, + "learning_rate": 8e-05, + "loss": 88.5933, + "num_input_tokens_seen": 55429216, + "step": 1080 + }, + { + "epoch": 0.10643473133338248, + "grad_norm": 57.2830696105957, + "learning_rate": 8e-05, + "loss": 88.9289, + "num_input_tokens_seen": 55561768, + "step": 1083 + }, + { + "epoch": 0.10672956438416746, + "grad_norm": 34.813167572021484, + "learning_rate": 8e-05, + "loss": 89.9692, + "num_input_tokens_seen": 55724544, + "step": 1086 + }, + { + "epoch": 0.10702439743495246, + "grad_norm": 38.25190734863281, + "learning_rate": 8e-05, + "loss": 86.0509, + "num_input_tokens_seen": 55872416, + "step": 1089 + }, + { + "epoch": 0.10731923048573745, + "grad_norm": 35.670265197753906, + "learning_rate": 8e-05, + "loss": 88.2196, + "num_input_tokens_seen": 56021900, + "step": 1092 + }, + { + "epoch": 0.10761406353652245, + "grad_norm": 45.03786087036133, + "learning_rate": 8e-05, + "loss": 90.6019, + "num_input_tokens_seen": 56175616, + "step": 1095 + }, + { + "epoch": 0.10790889658730744, + "grad_norm": 36.3194580078125, + "learning_rate": 8e-05, + "loss": 86.1967, + "num_input_tokens_seen": 56322540, + "step": 1098 + }, + { + "epoch": 0.10820372963809242, + "grad_norm": 61.72140884399414, + "learning_rate": 8e-05, + "loss": 85.0621, + "num_input_tokens_seen": 56462500, + "step": 1101 + }, + { + "epoch": 0.10849856268887742, + "grad_norm": 57.37849044799805, + "learning_rate": 8e-05, + "loss": 86.611, + "num_input_tokens_seen": 56610976, + "step": 1104 + }, + { + "epoch": 0.10879339573966242, + "grad_norm": 36.412654876708984, + "learning_rate": 8e-05, + "loss": 87.3833, + "num_input_tokens_seen": 56758720, + "step": 1107 + }, + { + "epoch": 0.10908822879044741, + "grad_norm": 77.69864654541016, + "learning_rate": 8e-05, + "loss": 87.5024, + "num_input_tokens_seen": 56931780, + "step": 1110 + }, + { + "epoch": 0.1093830618412324, + "grad_norm": 35.17158508300781, + "learning_rate": 8e-05, + "loss": 86.0469, + "num_input_tokens_seen": 57083168, + "step": 1113 + }, + { + "epoch": 0.10967789489201739, + "grad_norm": 39.6878662109375, + "learning_rate": 8e-05, + "loss": 83.8399, + "num_input_tokens_seen": 57233344, + "step": 1116 + }, + { + "epoch": 0.10997272794280238, + "grad_norm": 41.83913040161133, + "learning_rate": 8e-05, + "loss": 85.4779, + "num_input_tokens_seen": 57389876, + "step": 1119 + }, + { + "epoch": 0.11026756099358738, + "grad_norm": 35.37332534790039, + "learning_rate": 8e-05, + "loss": 85.3762, + "num_input_tokens_seen": 57549376, + "step": 1122 + }, + { + "epoch": 0.11056239404437238, + "grad_norm": 34.330841064453125, + "learning_rate": 8e-05, + "loss": 83.6415, + "num_input_tokens_seen": 57718812, + "step": 1125 + }, + { + "epoch": 0.11085722709515737, + "grad_norm": 33.91706085205078, + "learning_rate": 8e-05, + "loss": 85.471, + "num_input_tokens_seen": 57872976, + "step": 1128 + }, + { + "epoch": 0.11115206014594237, + "grad_norm": 34.27631759643555, + "learning_rate": 8e-05, + "loss": 86.9546, + "num_input_tokens_seen": 58032316, + "step": 1131 + }, + { + "epoch": 0.11144689319672735, + "grad_norm": 134.00440979003906, + "learning_rate": 8e-05, + "loss": 84.8564, + "num_input_tokens_seen": 58185864, + "step": 1134 + }, + { + "epoch": 0.11174172624751234, + "grad_norm": 61.93877410888672, + "learning_rate": 8e-05, + "loss": 86.4746, + "num_input_tokens_seen": 58357596, + "step": 1137 + }, + { + "epoch": 0.11203655929829734, + "grad_norm": 39.10196304321289, + "learning_rate": 8e-05, + "loss": 86.2628, + "num_input_tokens_seen": 58517260, + "step": 1140 + }, + { + "epoch": 0.11233139234908233, + "grad_norm": 35.11958694458008, + "learning_rate": 8e-05, + "loss": 84.882, + "num_input_tokens_seen": 58670664, + "step": 1143 + }, + { + "epoch": 0.11262622539986733, + "grad_norm": 156.38101196289062, + "learning_rate": 8e-05, + "loss": 85.4139, + "num_input_tokens_seen": 58826068, + "step": 1146 + }, + { + "epoch": 0.11292105845065231, + "grad_norm": 37.681278228759766, + "learning_rate": 8e-05, + "loss": 85.1363, + "num_input_tokens_seen": 58989632, + "step": 1149 + }, + { + "epoch": 0.11321589150143731, + "grad_norm": 34.654964447021484, + "learning_rate": 8e-05, + "loss": 85.26, + "num_input_tokens_seen": 59149176, + "step": 1152 + }, + { + "epoch": 0.1135107245522223, + "grad_norm": 34.971920013427734, + "learning_rate": 8e-05, + "loss": 83.7622, + "num_input_tokens_seen": 59307864, + "step": 1155 + }, + { + "epoch": 0.1138055576030073, + "grad_norm": 36.55862808227539, + "learning_rate": 8e-05, + "loss": 86.1476, + "num_input_tokens_seen": 59477476, + "step": 1158 + }, + { + "epoch": 0.1141003906537923, + "grad_norm": 37.583221435546875, + "learning_rate": 8e-05, + "loss": 85.1669, + "num_input_tokens_seen": 59619472, + "step": 1161 + }, + { + "epoch": 0.11439522370457729, + "grad_norm": 36.37376022338867, + "learning_rate": 8e-05, + "loss": 84.7555, + "num_input_tokens_seen": 59786124, + "step": 1164 + }, + { + "epoch": 0.11469005675536227, + "grad_norm": 36.52484893798828, + "learning_rate": 8e-05, + "loss": 85.2837, + "num_input_tokens_seen": 59946560, + "step": 1167 + }, + { + "epoch": 0.11498488980614727, + "grad_norm": 33.52820587158203, + "learning_rate": 8e-05, + "loss": 83.8732, + "num_input_tokens_seen": 60107548, + "step": 1170 + }, + { + "epoch": 0.11527972285693226, + "grad_norm": 32.112953186035156, + "learning_rate": 8e-05, + "loss": 83.5283, + "num_input_tokens_seen": 60265444, + "step": 1173 + }, + { + "epoch": 0.11557455590771726, + "grad_norm": 35.48290252685547, + "learning_rate": 8e-05, + "loss": 82.8051, + "num_input_tokens_seen": 60426424, + "step": 1176 + }, + { + "epoch": 0.11586938895850225, + "grad_norm": 37.165374755859375, + "learning_rate": 8e-05, + "loss": 84.2286, + "num_input_tokens_seen": 60559724, + "step": 1179 + }, + { + "epoch": 0.11616422200928724, + "grad_norm": 38.65311050415039, + "learning_rate": 8e-05, + "loss": 84.465, + "num_input_tokens_seen": 60704680, + "step": 1182 + }, + { + "epoch": 0.11645905506007223, + "grad_norm": 33.537418365478516, + "learning_rate": 8e-05, + "loss": 85.1183, + "num_input_tokens_seen": 60866020, + "step": 1185 + }, + { + "epoch": 0.11675388811085723, + "grad_norm": 36.444644927978516, + "learning_rate": 8e-05, + "loss": 85.455, + "num_input_tokens_seen": 61030532, + "step": 1188 + }, + { + "epoch": 0.11704872116164222, + "grad_norm": 39.73960876464844, + "learning_rate": 8e-05, + "loss": 84.1651, + "num_input_tokens_seen": 61180192, + "step": 1191 + }, + { + "epoch": 0.11734355421242722, + "grad_norm": 210.22747802734375, + "learning_rate": 8e-05, + "loss": 82.5961, + "num_input_tokens_seen": 61319688, + "step": 1194 + }, + { + "epoch": 0.11763838726321221, + "grad_norm": 35.85403060913086, + "learning_rate": 8e-05, + "loss": 86.2338, + "num_input_tokens_seen": 61451216, + "step": 1197 + }, + { + "epoch": 0.1179332203139972, + "grad_norm": 38.668426513671875, + "learning_rate": 8e-05, + "loss": 84.9885, + "num_input_tokens_seen": 61614408, + "step": 1200 + }, + { + "epoch": 0.11822805336478219, + "grad_norm": 37.449241638183594, + "learning_rate": 8e-05, + "loss": 84.2673, + "num_input_tokens_seen": 61777484, + "step": 1203 + }, + { + "epoch": 0.11852288641556719, + "grad_norm": 37.551456451416016, + "learning_rate": 8e-05, + "loss": 83.5775, + "num_input_tokens_seen": 61925060, + "step": 1206 + }, + { + "epoch": 0.11881771946635218, + "grad_norm": 37.979461669921875, + "learning_rate": 8e-05, + "loss": 85.261, + "num_input_tokens_seen": 62069836, + "step": 1209 + }, + { + "epoch": 0.11911255251713718, + "grad_norm": 41.7076530456543, + "learning_rate": 8e-05, + "loss": 82.3804, + "num_input_tokens_seen": 62228640, + "step": 1212 + }, + { + "epoch": 0.11940738556792216, + "grad_norm": 46.70987319946289, + "learning_rate": 8e-05, + "loss": 85.4103, + "num_input_tokens_seen": 62379892, + "step": 1215 + }, + { + "epoch": 0.11970221861870715, + "grad_norm": 40.67140579223633, + "learning_rate": 8e-05, + "loss": 84.2097, + "num_input_tokens_seen": 62518936, + "step": 1218 + }, + { + "epoch": 0.11999705166949215, + "grad_norm": 46.623600006103516, + "learning_rate": 8e-05, + "loss": 84.5125, + "num_input_tokens_seen": 62695380, + "step": 1221 + }, + { + "epoch": 0.12029188472027715, + "grad_norm": 36.65542984008789, + "learning_rate": 8e-05, + "loss": 80.6407, + "num_input_tokens_seen": 62862792, + "step": 1224 + }, + { + "epoch": 0.12058671777106214, + "grad_norm": 42.10529327392578, + "learning_rate": 8e-05, + "loss": 83.1043, + "num_input_tokens_seen": 63016620, + "step": 1227 + }, + { + "epoch": 0.12088155082184712, + "grad_norm": 44.04954528808594, + "learning_rate": 8e-05, + "loss": 82.43, + "num_input_tokens_seen": 63155064, + "step": 1230 + }, + { + "epoch": 0.12117638387263212, + "grad_norm": 39.24373245239258, + "learning_rate": 8e-05, + "loss": 83.1794, + "num_input_tokens_seen": 63309192, + "step": 1233 + }, + { + "epoch": 0.12147121692341711, + "grad_norm": 34.62562942504883, + "learning_rate": 8e-05, + "loss": 83.2758, + "num_input_tokens_seen": 63476200, + "step": 1236 + }, + { + "epoch": 0.12176604997420211, + "grad_norm": 40.90768051147461, + "learning_rate": 8e-05, + "loss": 82.4989, + "num_input_tokens_seen": 63631280, + "step": 1239 + }, + { + "epoch": 0.1220608830249871, + "grad_norm": 34.59130096435547, + "learning_rate": 8e-05, + "loss": 81.9263, + "num_input_tokens_seen": 63782788, + "step": 1242 + }, + { + "epoch": 0.1223557160757721, + "grad_norm": 39.34327697753906, + "learning_rate": 8e-05, + "loss": 85.3184, + "num_input_tokens_seen": 63945800, + "step": 1245 + }, + { + "epoch": 0.12265054912655708, + "grad_norm": 49.83769607543945, + "learning_rate": 8e-05, + "loss": 81.4915, + "num_input_tokens_seen": 64094332, + "step": 1248 + }, + { + "epoch": 0.12294538217734208, + "grad_norm": 44.31386947631836, + "learning_rate": 8e-05, + "loss": 81.2687, + "num_input_tokens_seen": 64267812, + "step": 1251 + }, + { + "epoch": 0.12324021522812707, + "grad_norm": 32.71273422241211, + "learning_rate": 8e-05, + "loss": 80.4577, + "num_input_tokens_seen": 64413464, + "step": 1254 + }, + { + "epoch": 0.12353504827891207, + "grad_norm": 34.991573333740234, + "learning_rate": 8e-05, + "loss": 81.0808, + "num_input_tokens_seen": 64564716, + "step": 1257 + }, + { + "epoch": 0.12382988132969706, + "grad_norm": 110.09813690185547, + "learning_rate": 8e-05, + "loss": 82.0606, + "num_input_tokens_seen": 64723184, + "step": 1260 + }, + { + "epoch": 0.12412471438048205, + "grad_norm": 37.52562713623047, + "learning_rate": 8e-05, + "loss": 79.5995, + "num_input_tokens_seen": 64871304, + "step": 1263 + }, + { + "epoch": 0.12441954743126704, + "grad_norm": 33.689971923828125, + "learning_rate": 8e-05, + "loss": 80.8298, + "num_input_tokens_seen": 65027664, + "step": 1266 + }, + { + "epoch": 0.12471438048205204, + "grad_norm": 35.41553497314453, + "learning_rate": 8e-05, + "loss": 79.461, + "num_input_tokens_seen": 65172948, + "step": 1269 + }, + { + "epoch": 0.12500921353283703, + "grad_norm": 36.45944595336914, + "learning_rate": 8e-05, + "loss": 80.9711, + "num_input_tokens_seen": 65330620, + "step": 1272 + }, + { + "epoch": 0.12530404658362201, + "grad_norm": 34.718894958496094, + "learning_rate": 8e-05, + "loss": 82.0436, + "num_input_tokens_seen": 65508384, + "step": 1275 + }, + { + "epoch": 0.12559887963440702, + "grad_norm": 43.59950256347656, + "learning_rate": 8e-05, + "loss": 81.6451, + "num_input_tokens_seen": 65645948, + "step": 1278 + }, + { + "epoch": 0.125893712685192, + "grad_norm": 37.226043701171875, + "learning_rate": 8e-05, + "loss": 79.2421, + "num_input_tokens_seen": 65787648, + "step": 1281 + }, + { + "epoch": 0.12618854573597701, + "grad_norm": 43.468685150146484, + "learning_rate": 8e-05, + "loss": 80.5175, + "num_input_tokens_seen": 65953232, + "step": 1284 + }, + { + "epoch": 0.126483378786762, + "grad_norm": 32.858699798583984, + "learning_rate": 8e-05, + "loss": 79.5494, + "num_input_tokens_seen": 66116840, + "step": 1287 + }, + { + "epoch": 0.12677821183754698, + "grad_norm": 234.20143127441406, + "learning_rate": 8e-05, + "loss": 79.8303, + "num_input_tokens_seen": 66260528, + "step": 1290 + }, + { + "epoch": 0.127073044888332, + "grad_norm": 50.944881439208984, + "learning_rate": 8e-05, + "loss": 82.267, + "num_input_tokens_seen": 66437188, + "step": 1293 + }, + { + "epoch": 0.12736787793911697, + "grad_norm": 37.243194580078125, + "learning_rate": 8e-05, + "loss": 80.2612, + "num_input_tokens_seen": 66609192, + "step": 1296 + }, + { + "epoch": 0.12766271098990198, + "grad_norm": 35.89582824707031, + "learning_rate": 8e-05, + "loss": 81.5992, + "num_input_tokens_seen": 66774100, + "step": 1299 + }, + { + "epoch": 0.12795754404068696, + "grad_norm": 32.087738037109375, + "learning_rate": 8e-05, + "loss": 79.4512, + "num_input_tokens_seen": 66912696, + "step": 1302 + }, + { + "epoch": 0.12825237709147194, + "grad_norm": 36.44207000732422, + "learning_rate": 8e-05, + "loss": 81.6661, + "num_input_tokens_seen": 67073876, + "step": 1305 + }, + { + "epoch": 0.12854721014225695, + "grad_norm": 36.4789924621582, + "learning_rate": 8e-05, + "loss": 81.5013, + "num_input_tokens_seen": 67234592, + "step": 1308 + }, + { + "epoch": 0.12884204319304193, + "grad_norm": 39.537139892578125, + "learning_rate": 8e-05, + "loss": 82.2263, + "num_input_tokens_seen": 67389940, + "step": 1311 + }, + { + "epoch": 0.12913687624382694, + "grad_norm": 36.199737548828125, + "learning_rate": 8e-05, + "loss": 81.6914, + "num_input_tokens_seen": 67542840, + "step": 1314 + }, + { + "epoch": 0.12943170929461192, + "grad_norm": 34.60406494140625, + "learning_rate": 8e-05, + "loss": 80.2577, + "num_input_tokens_seen": 67690332, + "step": 1317 + }, + { + "epoch": 0.1297265423453969, + "grad_norm": 39.24729537963867, + "learning_rate": 8e-05, + "loss": 79.3764, + "num_input_tokens_seen": 67839352, + "step": 1320 + }, + { + "epoch": 0.13002137539618192, + "grad_norm": 35.990760803222656, + "learning_rate": 8e-05, + "loss": 77.4493, + "num_input_tokens_seen": 67981068, + "step": 1323 + }, + { + "epoch": 0.1303162084469669, + "grad_norm": 37.870235443115234, + "learning_rate": 8e-05, + "loss": 81.2442, + "num_input_tokens_seen": 68133412, + "step": 1326 + }, + { + "epoch": 0.1306110414977519, + "grad_norm": 37.46533966064453, + "learning_rate": 8e-05, + "loss": 80.1867, + "num_input_tokens_seen": 68287496, + "step": 1329 + }, + { + "epoch": 0.1309058745485369, + "grad_norm": 39.02689743041992, + "learning_rate": 8e-05, + "loss": 78.4787, + "num_input_tokens_seen": 68451852, + "step": 1332 + }, + { + "epoch": 0.1312007075993219, + "grad_norm": 34.74725341796875, + "learning_rate": 8e-05, + "loss": 76.681, + "num_input_tokens_seen": 68595968, + "step": 1335 + }, + { + "epoch": 0.13149554065010688, + "grad_norm": 35.77785873413086, + "learning_rate": 8e-05, + "loss": 80.0318, + "num_input_tokens_seen": 68755292, + "step": 1338 + }, + { + "epoch": 0.13179037370089186, + "grad_norm": 36.45845413208008, + "learning_rate": 8e-05, + "loss": 77.0289, + "num_input_tokens_seen": 68906272, + "step": 1341 + }, + { + "epoch": 0.13208520675167687, + "grad_norm": 36.09657287597656, + "learning_rate": 8e-05, + "loss": 78.4508, + "num_input_tokens_seen": 69039068, + "step": 1344 + }, + { + "epoch": 0.13238003980246185, + "grad_norm": 35.88303756713867, + "learning_rate": 8e-05, + "loss": 78.0555, + "num_input_tokens_seen": 69180160, + "step": 1347 + }, + { + "epoch": 0.13267487285324686, + "grad_norm": 38.69503402709961, + "learning_rate": 8e-05, + "loss": 80.4413, + "num_input_tokens_seen": 69344756, + "step": 1350 + }, + { + "epoch": 0.13296970590403184, + "grad_norm": 35.21122360229492, + "learning_rate": 8e-05, + "loss": 78.5331, + "num_input_tokens_seen": 69504472, + "step": 1353 + }, + { + "epoch": 0.13326453895481682, + "grad_norm": 46.11822509765625, + "learning_rate": 8e-05, + "loss": 78.9917, + "num_input_tokens_seen": 69656792, + "step": 1356 + }, + { + "epoch": 0.13355937200560183, + "grad_norm": 36.190093994140625, + "learning_rate": 8e-05, + "loss": 79.9646, + "num_input_tokens_seen": 69824388, + "step": 1359 + }, + { + "epoch": 0.13385420505638682, + "grad_norm": 38.50130081176758, + "learning_rate": 8e-05, + "loss": 78.924, + "num_input_tokens_seen": 69978636, + "step": 1362 + }, + { + "epoch": 0.13414903810717183, + "grad_norm": 65.22470092773438, + "learning_rate": 8e-05, + "loss": 75.8164, + "num_input_tokens_seen": 70126680, + "step": 1365 + }, + { + "epoch": 0.1344438711579568, + "grad_norm": 60.97062683105469, + "learning_rate": 8e-05, + "loss": 77.5916, + "num_input_tokens_seen": 70258788, + "step": 1368 + }, + { + "epoch": 0.1347387042087418, + "grad_norm": 38.67707824707031, + "learning_rate": 8e-05, + "loss": 79.541, + "num_input_tokens_seen": 70428536, + "step": 1371 + }, + { + "epoch": 0.1350335372595268, + "grad_norm": 35.03982925415039, + "learning_rate": 8e-05, + "loss": 78.4784, + "num_input_tokens_seen": 70595776, + "step": 1374 + }, + { + "epoch": 0.13532837031031178, + "grad_norm": 38.07881546020508, + "learning_rate": 8e-05, + "loss": 77.7045, + "num_input_tokens_seen": 70746704, + "step": 1377 + }, + { + "epoch": 0.1356232033610968, + "grad_norm": 38.27223587036133, + "learning_rate": 8e-05, + "loss": 78.7978, + "num_input_tokens_seen": 70918160, + "step": 1380 + }, + { + "epoch": 0.13591803641188177, + "grad_norm": 38.4968376159668, + "learning_rate": 8e-05, + "loss": 77.204, + "num_input_tokens_seen": 71067852, + "step": 1383 + }, + { + "epoch": 0.13621286946266675, + "grad_norm": 57.48464584350586, + "learning_rate": 8e-05, + "loss": 74.9822, + "num_input_tokens_seen": 71221176, + "step": 1386 + }, + { + "epoch": 0.13650770251345176, + "grad_norm": 34.94679260253906, + "learning_rate": 8e-05, + "loss": 76.3158, + "num_input_tokens_seen": 71359204, + "step": 1389 + }, + { + "epoch": 0.13680253556423674, + "grad_norm": 36.15276336669922, + "learning_rate": 8e-05, + "loss": 76.0528, + "num_input_tokens_seen": 71532088, + "step": 1392 + }, + { + "epoch": 0.13709736861502175, + "grad_norm": 35.722801208496094, + "learning_rate": 8e-05, + "loss": 79.0203, + "num_input_tokens_seen": 71692128, + "step": 1395 + }, + { + "epoch": 0.13739220166580673, + "grad_norm": 32.96024703979492, + "learning_rate": 8e-05, + "loss": 78.2764, + "num_input_tokens_seen": 71855924, + "step": 1398 + }, + { + "epoch": 0.13768703471659172, + "grad_norm": 36.010257720947266, + "learning_rate": 8e-05, + "loss": 77.3026, + "num_input_tokens_seen": 72011396, + "step": 1401 + }, + { + "epoch": 0.13798186776737673, + "grad_norm": 51.21072769165039, + "learning_rate": 8e-05, + "loss": 77.9927, + "num_input_tokens_seen": 72159264, + "step": 1404 + }, + { + "epoch": 0.1382767008181617, + "grad_norm": 40.23877716064453, + "learning_rate": 8e-05, + "loss": 77.9101, + "num_input_tokens_seen": 72320092, + "step": 1407 + }, + { + "epoch": 0.13857153386894672, + "grad_norm": 41.20027160644531, + "learning_rate": 8e-05, + "loss": 74.7435, + "num_input_tokens_seen": 72464520, + "step": 1410 + }, + { + "epoch": 0.1388663669197317, + "grad_norm": 38.02928924560547, + "learning_rate": 8e-05, + "loss": 75.4835, + "num_input_tokens_seen": 72614312, + "step": 1413 + }, + { + "epoch": 0.1391611999705167, + "grad_norm": 34.233665466308594, + "learning_rate": 8e-05, + "loss": 76.4433, + "num_input_tokens_seen": 72769836, + "step": 1416 + }, + { + "epoch": 0.1394560330213017, + "grad_norm": 31.657432556152344, + "learning_rate": 8e-05, + "loss": 75.8057, + "num_input_tokens_seen": 72931408, + "step": 1419 + }, + { + "epoch": 0.13975086607208667, + "grad_norm": 36.81168746948242, + "learning_rate": 8e-05, + "loss": 75.7319, + "num_input_tokens_seen": 73104460, + "step": 1422 + }, + { + "epoch": 0.14004569912287168, + "grad_norm": 7573.0126953125, + "learning_rate": 8e-05, + "loss": 76.7549, + "num_input_tokens_seen": 73260520, + "step": 1425 + }, + { + "epoch": 0.14034053217365666, + "grad_norm": 38.15385437011719, + "learning_rate": 8e-05, + "loss": 76.9857, + "num_input_tokens_seen": 73405808, + "step": 1428 + }, + { + "epoch": 0.14063536522444167, + "grad_norm": 44.962528228759766, + "learning_rate": 8e-05, + "loss": 75.2264, + "num_input_tokens_seen": 73573148, + "step": 1431 + }, + { + "epoch": 0.14093019827522665, + "grad_norm": 41.70336151123047, + "learning_rate": 8e-05, + "loss": 74.1684, + "num_input_tokens_seen": 73725792, + "step": 1434 + }, + { + "epoch": 0.14122503132601164, + "grad_norm": 33.183799743652344, + "learning_rate": 8e-05, + "loss": 77.1766, + "num_input_tokens_seen": 73892364, + "step": 1437 + }, + { + "epoch": 0.14151986437679664, + "grad_norm": 39.463348388671875, + "learning_rate": 8e-05, + "loss": 76.8123, + "num_input_tokens_seen": 74051972, + "step": 1440 + }, + { + "epoch": 0.14181469742758163, + "grad_norm": 34.82969665527344, + "learning_rate": 8e-05, + "loss": 76.6535, + "num_input_tokens_seen": 74220148, + "step": 1443 + }, + { + "epoch": 0.14210953047836664, + "grad_norm": 35.903076171875, + "learning_rate": 8e-05, + "loss": 75.3105, + "num_input_tokens_seen": 74405764, + "step": 1446 + }, + { + "epoch": 0.14240436352915162, + "grad_norm": 75.70014953613281, + "learning_rate": 8e-05, + "loss": 75.8482, + "num_input_tokens_seen": 74562012, + "step": 1449 + }, + { + "epoch": 0.1426991965799366, + "grad_norm": 36.09914016723633, + "learning_rate": 8e-05, + "loss": 73.2226, + "num_input_tokens_seen": 74705004, + "step": 1452 + }, + { + "epoch": 0.1429940296307216, + "grad_norm": 35.636756896972656, + "learning_rate": 8e-05, + "loss": 78.9578, + "num_input_tokens_seen": 74886308, + "step": 1455 + }, + { + "epoch": 0.1432888626815066, + "grad_norm": 33.12376403808594, + "learning_rate": 8e-05, + "loss": 76.2449, + "num_input_tokens_seen": 75059784, + "step": 1458 + }, + { + "epoch": 0.1435836957322916, + "grad_norm": 34.66209411621094, + "learning_rate": 8e-05, + "loss": 74.7041, + "num_input_tokens_seen": 75210648, + "step": 1461 + }, + { + "epoch": 0.14387852878307658, + "grad_norm": 36.110801696777344, + "learning_rate": 8e-05, + "loss": 74.4723, + "num_input_tokens_seen": 75367344, + "step": 1464 + }, + { + "epoch": 0.14417336183386156, + "grad_norm": 41.02329635620117, + "learning_rate": 8e-05, + "loss": 74.6284, + "num_input_tokens_seen": 75525132, + "step": 1467 + }, + { + "epoch": 0.14446819488464657, + "grad_norm": 51.964595794677734, + "learning_rate": 8e-05, + "loss": 76.0471, + "num_input_tokens_seen": 75672700, + "step": 1470 + }, + { + "epoch": 0.14476302793543155, + "grad_norm": 37.155418395996094, + "learning_rate": 8e-05, + "loss": 77.9106, + "num_input_tokens_seen": 75822896, + "step": 1473 + }, + { + "epoch": 0.14505786098621656, + "grad_norm": 39.485267639160156, + "learning_rate": 8e-05, + "loss": 76.1002, + "num_input_tokens_seen": 75990008, + "step": 1476 + }, + { + "epoch": 0.14535269403700155, + "grad_norm": 34.41350555419922, + "learning_rate": 8e-05, + "loss": 76.0894, + "num_input_tokens_seen": 76149960, + "step": 1479 + }, + { + "epoch": 0.14564752708778655, + "grad_norm": 38.971229553222656, + "learning_rate": 8e-05, + "loss": 75.9462, + "num_input_tokens_seen": 76327592, + "step": 1482 + }, + { + "epoch": 0.14594236013857154, + "grad_norm": 37.98147964477539, + "learning_rate": 8e-05, + "loss": 73.715, + "num_input_tokens_seen": 76479420, + "step": 1485 + }, + { + "epoch": 0.14623719318935652, + "grad_norm": 43.49007034301758, + "learning_rate": 8e-05, + "loss": 74.4515, + "num_input_tokens_seen": 76630452, + "step": 1488 + }, + { + "epoch": 0.14653202624014153, + "grad_norm": 35.00507736206055, + "learning_rate": 8e-05, + "loss": 74.7782, + "num_input_tokens_seen": 76781236, + "step": 1491 + }, + { + "epoch": 0.1468268592909265, + "grad_norm": 34.62977981567383, + "learning_rate": 8e-05, + "loss": 75.692, + "num_input_tokens_seen": 76928432, + "step": 1494 + }, + { + "epoch": 0.14712169234171152, + "grad_norm": 43.507266998291016, + "learning_rate": 8e-05, + "loss": 75.112, + "num_input_tokens_seen": 77074108, + "step": 1497 + }, + { + "epoch": 0.1474165253924965, + "grad_norm": 38.37984085083008, + "learning_rate": 8e-05, + "loss": 73.1151, + "num_input_tokens_seen": 77231832, + "step": 1500 + }, + { + "epoch": 0.14771135844328148, + "grad_norm": 38.74737548828125, + "learning_rate": 8e-05, + "loss": 76.9576, + "num_input_tokens_seen": 77393456, + "step": 1503 + }, + { + "epoch": 0.1480061914940665, + "grad_norm": 36.35774230957031, + "learning_rate": 8e-05, + "loss": 77.1779, + "num_input_tokens_seen": 77536392, + "step": 1506 + }, + { + "epoch": 0.14830102454485147, + "grad_norm": 30.660966873168945, + "learning_rate": 8e-05, + "loss": 72.1792, + "num_input_tokens_seen": 77689848, + "step": 1509 + }, + { + "epoch": 0.14859585759563648, + "grad_norm": 30.7512149810791, + "learning_rate": 8e-05, + "loss": 72.6574, + "num_input_tokens_seen": 77845968, + "step": 1512 + }, + { + "epoch": 0.14889069064642146, + "grad_norm": 35.85249710083008, + "learning_rate": 8e-05, + "loss": 74.7064, + "num_input_tokens_seen": 78000624, + "step": 1515 + }, + { + "epoch": 0.14918552369720645, + "grad_norm": 35.147151947021484, + "learning_rate": 8e-05, + "loss": 72.1662, + "num_input_tokens_seen": 78149056, + "step": 1518 + }, + { + "epoch": 0.14948035674799146, + "grad_norm": 36.17216873168945, + "learning_rate": 8e-05, + "loss": 73.4778, + "num_input_tokens_seen": 78288020, + "step": 1521 + }, + { + "epoch": 0.14977518979877644, + "grad_norm": 36.134586334228516, + "learning_rate": 8e-05, + "loss": 74.2239, + "num_input_tokens_seen": 78444656, + "step": 1524 + }, + { + "epoch": 0.15007002284956145, + "grad_norm": 35.32678985595703, + "learning_rate": 8e-05, + "loss": 75.2825, + "num_input_tokens_seen": 78581236, + "step": 1527 + }, + { + "epoch": 0.15036485590034643, + "grad_norm": 34.928157806396484, + "learning_rate": 8e-05, + "loss": 71.9196, + "num_input_tokens_seen": 78723900, + "step": 1530 + }, + { + "epoch": 0.1506596889511314, + "grad_norm": 32.27571487426758, + "learning_rate": 8e-05, + "loss": 74.4344, + "num_input_tokens_seen": 78874436, + "step": 1533 + }, + { + "epoch": 0.15095452200191642, + "grad_norm": 39.34345245361328, + "learning_rate": 8e-05, + "loss": 72.6868, + "num_input_tokens_seen": 79031824, + "step": 1536 + }, + { + "epoch": 0.1512493550527014, + "grad_norm": 37.49494934082031, + "learning_rate": 8e-05, + "loss": 73.9715, + "num_input_tokens_seen": 79181500, + "step": 1539 + }, + { + "epoch": 0.1515441881034864, + "grad_norm": 41.84995651245117, + "learning_rate": 8e-05, + "loss": 74.3241, + "num_input_tokens_seen": 79335788, + "step": 1542 + }, + { + "epoch": 0.1518390211542714, + "grad_norm": 35.9151725769043, + "learning_rate": 8e-05, + "loss": 74.5754, + "num_input_tokens_seen": 79500936, + "step": 1545 + }, + { + "epoch": 0.15213385420505637, + "grad_norm": 34.1334342956543, + "learning_rate": 8e-05, + "loss": 71.089, + "num_input_tokens_seen": 79672776, + "step": 1548 + }, + { + "epoch": 0.15242868725584138, + "grad_norm": 42.203880310058594, + "learning_rate": 8e-05, + "loss": 71.2149, + "num_input_tokens_seen": 79831716, + "step": 1551 + }, + { + "epoch": 0.15272352030662636, + "grad_norm": 38.425052642822266, + "learning_rate": 8e-05, + "loss": 71.9566, + "num_input_tokens_seen": 79988016, + "step": 1554 + }, + { + "epoch": 0.15301835335741137, + "grad_norm": 39.873870849609375, + "learning_rate": 8e-05, + "loss": 72.8495, + "num_input_tokens_seen": 80144084, + "step": 1557 + }, + { + "epoch": 0.15331318640819636, + "grad_norm": 35.926002502441406, + "learning_rate": 8e-05, + "loss": 72.3613, + "num_input_tokens_seen": 80267444, + "step": 1560 + }, + { + "epoch": 0.15360801945898137, + "grad_norm": 38.23421096801758, + "learning_rate": 8e-05, + "loss": 74.279, + "num_input_tokens_seen": 80446852, + "step": 1563 + }, + { + "epoch": 0.15390285250976635, + "grad_norm": 36.72174072265625, + "learning_rate": 8e-05, + "loss": 72.2801, + "num_input_tokens_seen": 80624428, + "step": 1566 + }, + { + "epoch": 0.15419768556055133, + "grad_norm": 38.56344985961914, + "learning_rate": 8e-05, + "loss": 73.2882, + "num_input_tokens_seen": 80781668, + "step": 1569 + }, + { + "epoch": 0.15449251861133634, + "grad_norm": 35.1093864440918, + "learning_rate": 8e-05, + "loss": 72.6963, + "num_input_tokens_seen": 80934068, + "step": 1572 + }, + { + "epoch": 0.15478735166212132, + "grad_norm": 35.02631378173828, + "learning_rate": 8e-05, + "loss": 72.4826, + "num_input_tokens_seen": 81103204, + "step": 1575 + }, + { + "epoch": 0.15508218471290633, + "grad_norm": 39.04288101196289, + "learning_rate": 8e-05, + "loss": 71.8602, + "num_input_tokens_seen": 81257460, + "step": 1578 + }, + { + "epoch": 0.1553770177636913, + "grad_norm": 34.65717697143555, + "learning_rate": 8e-05, + "loss": 71.3285, + "num_input_tokens_seen": 81408692, + "step": 1581 + }, + { + "epoch": 0.1556718508144763, + "grad_norm": 33.63228988647461, + "learning_rate": 8e-05, + "loss": 72.5402, + "num_input_tokens_seen": 81561644, + "step": 1584 + }, + { + "epoch": 0.1559666838652613, + "grad_norm": 34.98646545410156, + "learning_rate": 8e-05, + "loss": 72.3205, + "num_input_tokens_seen": 81714672, + "step": 1587 + }, + { + "epoch": 0.15626151691604628, + "grad_norm": 34.4061279296875, + "learning_rate": 8e-05, + "loss": 70.3614, + "num_input_tokens_seen": 81869272, + "step": 1590 + }, + { + "epoch": 0.1565563499668313, + "grad_norm": 33.87858200073242, + "learning_rate": 8e-05, + "loss": 69.5967, + "num_input_tokens_seen": 82036368, + "step": 1593 + }, + { + "epoch": 0.15685118301761627, + "grad_norm": 42.29884338378906, + "learning_rate": 8e-05, + "loss": 74.2783, + "num_input_tokens_seen": 82182920, + "step": 1596 + }, + { + "epoch": 0.15714601606840126, + "grad_norm": 49.82851791381836, + "learning_rate": 8e-05, + "loss": 70.0678, + "num_input_tokens_seen": 82345176, + "step": 1599 + }, + { + "epoch": 0.15744084911918627, + "grad_norm": 34.39692306518555, + "learning_rate": 8e-05, + "loss": 71.5913, + "num_input_tokens_seen": 82512116, + "step": 1602 + }, + { + "epoch": 0.15773568216997125, + "grad_norm": 35.24238204956055, + "learning_rate": 8e-05, + "loss": 72.218, + "num_input_tokens_seen": 82679872, + "step": 1605 + }, + { + "epoch": 0.15803051522075626, + "grad_norm": 33.634464263916016, + "learning_rate": 8e-05, + "loss": 72.6587, + "num_input_tokens_seen": 82819184, + "step": 1608 + }, + { + "epoch": 0.15832534827154124, + "grad_norm": 34.30915832519531, + "learning_rate": 8e-05, + "loss": 71.4704, + "num_input_tokens_seen": 82982680, + "step": 1611 + }, + { + "epoch": 0.15862018132232622, + "grad_norm": 40.74231719970703, + "learning_rate": 8e-05, + "loss": 70.3003, + "num_input_tokens_seen": 83135768, + "step": 1614 + }, + { + "epoch": 0.15891501437311123, + "grad_norm": 55.77992630004883, + "learning_rate": 8e-05, + "loss": 71.88, + "num_input_tokens_seen": 83308712, + "step": 1617 + }, + { + "epoch": 0.1592098474238962, + "grad_norm": 39.83135223388672, + "learning_rate": 8e-05, + "loss": 71.1506, + "num_input_tokens_seen": 83483420, + "step": 1620 + }, + { + "epoch": 0.15950468047468122, + "grad_norm": 34.45026779174805, + "learning_rate": 8e-05, + "loss": 73.5146, + "num_input_tokens_seen": 83641592, + "step": 1623 + }, + { + "epoch": 0.1597995135254662, + "grad_norm": 42.277793884277344, + "learning_rate": 8e-05, + "loss": 75.9387, + "num_input_tokens_seen": 83804164, + "step": 1626 + }, + { + "epoch": 0.16009434657625118, + "grad_norm": 39.62792205810547, + "learning_rate": 8e-05, + "loss": 71.2239, + "num_input_tokens_seen": 83966016, + "step": 1629 + }, + { + "epoch": 0.1603891796270362, + "grad_norm": 49.15724182128906, + "learning_rate": 8e-05, + "loss": 70.4015, + "num_input_tokens_seen": 84129672, + "step": 1632 + }, + { + "epoch": 0.16068401267782118, + "grad_norm": 38.876102447509766, + "learning_rate": 8e-05, + "loss": 70.0127, + "num_input_tokens_seen": 84278492, + "step": 1635 + }, + { + "epoch": 0.16097884572860618, + "grad_norm": 34.91112518310547, + "learning_rate": 8e-05, + "loss": 72.3853, + "num_input_tokens_seen": 84439544, + "step": 1638 + }, + { + "epoch": 0.16127367877939117, + "grad_norm": 35.54895782470703, + "learning_rate": 8e-05, + "loss": 69.1896, + "num_input_tokens_seen": 84600056, + "step": 1641 + }, + { + "epoch": 0.16156851183017618, + "grad_norm": 39.978538513183594, + "learning_rate": 8e-05, + "loss": 70.4133, + "num_input_tokens_seen": 84750168, + "step": 1644 + }, + { + "epoch": 0.16186334488096116, + "grad_norm": 33.363372802734375, + "learning_rate": 8e-05, + "loss": 71.4283, + "num_input_tokens_seen": 84899072, + "step": 1647 + }, + { + "epoch": 0.16215817793174614, + "grad_norm": 33.02912521362305, + "learning_rate": 8e-05, + "loss": 70.59, + "num_input_tokens_seen": 85040176, + "step": 1650 + }, + { + "epoch": 0.16245301098253115, + "grad_norm": 34.67600631713867, + "learning_rate": 8e-05, + "loss": 72.9926, + "num_input_tokens_seen": 85202388, + "step": 1653 + }, + { + "epoch": 0.16274784403331613, + "grad_norm": 88.48844909667969, + "learning_rate": 8e-05, + "loss": 71.2351, + "num_input_tokens_seen": 85369400, + "step": 1656 + }, + { + "epoch": 0.16304267708410114, + "grad_norm": 38.78783416748047, + "learning_rate": 8e-05, + "loss": 67.4387, + "num_input_tokens_seen": 85535784, + "step": 1659 + }, + { + "epoch": 0.16333751013488612, + "grad_norm": 169.82952880859375, + "learning_rate": 8e-05, + "loss": 69.9607, + "num_input_tokens_seen": 85706820, + "step": 1662 + }, + { + "epoch": 0.1636323431856711, + "grad_norm": 40.37202453613281, + "learning_rate": 8e-05, + "loss": 71.8307, + "num_input_tokens_seen": 85849804, + "step": 1665 + }, + { + "epoch": 0.1639271762364561, + "grad_norm": 67.48583984375, + "learning_rate": 8e-05, + "loss": 69.4109, + "num_input_tokens_seen": 85998468, + "step": 1668 + }, + { + "epoch": 0.1642220092872411, + "grad_norm": 35.94486999511719, + "learning_rate": 8e-05, + "loss": 70.1855, + "num_input_tokens_seen": 86158700, + "step": 1671 + }, + { + "epoch": 0.1645168423380261, + "grad_norm": 49.25463104248047, + "learning_rate": 8e-05, + "loss": 69.7001, + "num_input_tokens_seen": 86312436, + "step": 1674 + }, + { + "epoch": 0.16481167538881109, + "grad_norm": 35.989192962646484, + "learning_rate": 8e-05, + "loss": 70.2536, + "num_input_tokens_seen": 86492368, + "step": 1677 + }, + { + "epoch": 0.16510650843959607, + "grad_norm": 34.7452392578125, + "learning_rate": 8e-05, + "loss": 70.5204, + "num_input_tokens_seen": 86620228, + "step": 1680 + }, + { + "epoch": 0.16540134149038108, + "grad_norm": 36.26546096801758, + "learning_rate": 8e-05, + "loss": 69.9993, + "num_input_tokens_seen": 86761548, + "step": 1683 + }, + { + "epoch": 0.16569617454116606, + "grad_norm": 73.40389251708984, + "learning_rate": 8e-05, + "loss": 71.2321, + "num_input_tokens_seen": 86896420, + "step": 1686 + }, + { + "epoch": 0.16599100759195107, + "grad_norm": 37.27740478515625, + "learning_rate": 8e-05, + "loss": 71.3492, + "num_input_tokens_seen": 87064216, + "step": 1689 + }, + { + "epoch": 0.16628584064273605, + "grad_norm": 43.147308349609375, + "learning_rate": 8e-05, + "loss": 70.2469, + "num_input_tokens_seen": 87196748, + "step": 1692 + }, + { + "epoch": 0.16658067369352103, + "grad_norm": 48.543495178222656, + "learning_rate": 8e-05, + "loss": 69.6811, + "num_input_tokens_seen": 87350500, + "step": 1695 + }, + { + "epoch": 0.16687550674430604, + "grad_norm": 41.09067153930664, + "learning_rate": 8e-05, + "loss": 68.8399, + "num_input_tokens_seen": 87498152, + "step": 1698 + }, + { + "epoch": 0.16717033979509102, + "grad_norm": 41.002784729003906, + "learning_rate": 8e-05, + "loss": 70.0191, + "num_input_tokens_seen": 87657492, + "step": 1701 + }, + { + "epoch": 0.16746517284587603, + "grad_norm": 42.89789581298828, + "learning_rate": 8e-05, + "loss": 67.6702, + "num_input_tokens_seen": 87799916, + "step": 1704 + }, + { + "epoch": 0.167760005896661, + "grad_norm": 38.87138366699219, + "learning_rate": 8e-05, + "loss": 68.2145, + "num_input_tokens_seen": 87935648, + "step": 1707 + }, + { + "epoch": 0.16805483894744602, + "grad_norm": 256.7908630371094, + "learning_rate": 8e-05, + "loss": 68.1492, + "num_input_tokens_seen": 88081728, + "step": 1710 + }, + { + "epoch": 0.168349671998231, + "grad_norm": 37.66705322265625, + "learning_rate": 8e-05, + "loss": 67.9451, + "num_input_tokens_seen": 88240552, + "step": 1713 + }, + { + "epoch": 0.16864450504901599, + "grad_norm": 33.00398635864258, + "learning_rate": 8e-05, + "loss": 72.0741, + "num_input_tokens_seen": 88397404, + "step": 1716 + }, + { + "epoch": 0.168939338099801, + "grad_norm": 32.67354965209961, + "learning_rate": 8e-05, + "loss": 67.1234, + "num_input_tokens_seen": 88559332, + "step": 1719 + }, + { + "epoch": 0.16923417115058598, + "grad_norm": 37.69291687011719, + "learning_rate": 8e-05, + "loss": 68.6124, + "num_input_tokens_seen": 88707332, + "step": 1722 + }, + { + "epoch": 0.169529004201371, + "grad_norm": 34.9688720703125, + "learning_rate": 8e-05, + "loss": 64.365, + "num_input_tokens_seen": 88830788, + "step": 1725 + }, + { + "epoch": 0.16982383725215597, + "grad_norm": 38.50293731689453, + "learning_rate": 8e-05, + "loss": 67.384, + "num_input_tokens_seen": 88982944, + "step": 1728 + }, + { + "epoch": 0.17011867030294095, + "grad_norm": 54.379638671875, + "learning_rate": 8e-05, + "loss": 69.4878, + "num_input_tokens_seen": 89120944, + "step": 1731 + }, + { + "epoch": 0.17041350335372596, + "grad_norm": 56.456138610839844, + "learning_rate": 8e-05, + "loss": 67.5283, + "num_input_tokens_seen": 89290008, + "step": 1734 + }, + { + "epoch": 0.17070833640451094, + "grad_norm": 34.861175537109375, + "learning_rate": 8e-05, + "loss": 68.7829, + "num_input_tokens_seen": 89442452, + "step": 1737 + }, + { + "epoch": 0.17100316945529595, + "grad_norm": 35.11691665649414, + "learning_rate": 8e-05, + "loss": 67.3596, + "num_input_tokens_seen": 89587496, + "step": 1740 + }, + { + "epoch": 0.17129800250608093, + "grad_norm": 43.15464782714844, + "learning_rate": 8e-05, + "loss": 66.5695, + "num_input_tokens_seen": 89743128, + "step": 1743 + }, + { + "epoch": 0.1715928355568659, + "grad_norm": 46.82964324951172, + "learning_rate": 8e-05, + "loss": 68.5125, + "num_input_tokens_seen": 89900992, + "step": 1746 + }, + { + "epoch": 0.17188766860765092, + "grad_norm": 38.2276725769043, + "learning_rate": 8e-05, + "loss": 68.039, + "num_input_tokens_seen": 90059608, + "step": 1749 + }, + { + "epoch": 0.1721825016584359, + "grad_norm": 39.11660385131836, + "learning_rate": 8e-05, + "loss": 69.9983, + "num_input_tokens_seen": 90221080, + "step": 1752 + }, + { + "epoch": 0.17247733470922091, + "grad_norm": 38.58439254760742, + "learning_rate": 8e-05, + "loss": 69.3442, + "num_input_tokens_seen": 90376304, + "step": 1755 + }, + { + "epoch": 0.1727721677600059, + "grad_norm": 36.8914680480957, + "learning_rate": 8e-05, + "loss": 66.3412, + "num_input_tokens_seen": 90535060, + "step": 1758 + }, + { + "epoch": 0.17306700081079088, + "grad_norm": 40.14888381958008, + "learning_rate": 8e-05, + "loss": 67.6869, + "num_input_tokens_seen": 90717824, + "step": 1761 + }, + { + "epoch": 0.1733618338615759, + "grad_norm": 34.87165451049805, + "learning_rate": 8e-05, + "loss": 68.2913, + "num_input_tokens_seen": 90864896, + "step": 1764 + }, + { + "epoch": 0.17365666691236087, + "grad_norm": 36.20130920410156, + "learning_rate": 8e-05, + "loss": 67.6237, + "num_input_tokens_seen": 91011224, + "step": 1767 + }, + { + "epoch": 0.17395149996314588, + "grad_norm": 41.79694747924805, + "learning_rate": 8e-05, + "loss": 67.4999, + "num_input_tokens_seen": 91156916, + "step": 1770 + }, + { + "epoch": 0.17424633301393086, + "grad_norm": 37.9937629699707, + "learning_rate": 8e-05, + "loss": 69.9868, + "num_input_tokens_seen": 91276748, + "step": 1773 + }, + { + "epoch": 0.17454116606471584, + "grad_norm": 37.94075012207031, + "learning_rate": 8e-05, + "loss": 66.9292, + "num_input_tokens_seen": 91426680, + "step": 1776 + }, + { + "epoch": 0.17483599911550085, + "grad_norm": 41.27400588989258, + "learning_rate": 8e-05, + "loss": 68.5003, + "num_input_tokens_seen": 91580812, + "step": 1779 + }, + { + "epoch": 0.17513083216628583, + "grad_norm": 41.00275802612305, + "learning_rate": 8e-05, + "loss": 67.9984, + "num_input_tokens_seen": 91741720, + "step": 1782 + }, + { + "epoch": 0.17542566521707084, + "grad_norm": 701.8508911132812, + "learning_rate": 8e-05, + "loss": 67.7746, + "num_input_tokens_seen": 91900988, + "step": 1785 + }, + { + "epoch": 0.17572049826785582, + "grad_norm": 36.433135986328125, + "learning_rate": 8e-05, + "loss": 65.6489, + "num_input_tokens_seen": 92069236, + "step": 1788 + }, + { + "epoch": 0.17601533131864083, + "grad_norm": 32.772438049316406, + "learning_rate": 8e-05, + "loss": 66.0876, + "num_input_tokens_seen": 92212252, + "step": 1791 + }, + { + "epoch": 0.17631016436942581, + "grad_norm": 34.46598434448242, + "learning_rate": 8e-05, + "loss": 66.1039, + "num_input_tokens_seen": 92363816, + "step": 1794 + }, + { + "epoch": 0.1766049974202108, + "grad_norm": 36.973140716552734, + "learning_rate": 8e-05, + "loss": 68.7322, + "num_input_tokens_seen": 92514400, + "step": 1797 + }, + { + "epoch": 0.1768998304709958, + "grad_norm": 33.909847259521484, + "learning_rate": 8e-05, + "loss": 68.1551, + "num_input_tokens_seen": 92658624, + "step": 1800 + }, + { + "epoch": 0.1771946635217808, + "grad_norm": 33.61487579345703, + "learning_rate": 8e-05, + "loss": 68.2057, + "num_input_tokens_seen": 92813240, + "step": 1803 + }, + { + "epoch": 0.1774894965725658, + "grad_norm": 48.34735107421875, + "learning_rate": 8e-05, + "loss": 67.1163, + "num_input_tokens_seen": 92961244, + "step": 1806 + }, + { + "epoch": 0.17778432962335078, + "grad_norm": 37.34203338623047, + "learning_rate": 8e-05, + "loss": 68.1642, + "num_input_tokens_seen": 93126052, + "step": 1809 + }, + { + "epoch": 0.17807916267413576, + "grad_norm": 36.56318283081055, + "learning_rate": 8e-05, + "loss": 67.9583, + "num_input_tokens_seen": 93291776, + "step": 1812 + }, + { + "epoch": 0.17837399572492077, + "grad_norm": 32.81214141845703, + "learning_rate": 8e-05, + "loss": 68.2649, + "num_input_tokens_seen": 93437804, + "step": 1815 + }, + { + "epoch": 0.17866882877570575, + "grad_norm": 39.49382781982422, + "learning_rate": 8e-05, + "loss": 66.587, + "num_input_tokens_seen": 93590280, + "step": 1818 + }, + { + "epoch": 0.17896366182649076, + "grad_norm": 37.001686096191406, + "learning_rate": 8e-05, + "loss": 68.0501, + "num_input_tokens_seen": 93759692, + "step": 1821 + }, + { + "epoch": 0.17925849487727574, + "grad_norm": 43.17896270751953, + "learning_rate": 8e-05, + "loss": 70.3608, + "num_input_tokens_seen": 93912212, + "step": 1824 + }, + { + "epoch": 0.17955332792806072, + "grad_norm": 31.220149993896484, + "learning_rate": 8e-05, + "loss": 67.3302, + "num_input_tokens_seen": 94050348, + "step": 1827 + }, + { + "epoch": 0.17984816097884573, + "grad_norm": 35.91094207763672, + "learning_rate": 8e-05, + "loss": 66.1876, + "num_input_tokens_seen": 94215336, + "step": 1830 + }, + { + "epoch": 0.18014299402963072, + "grad_norm": 39.12143325805664, + "learning_rate": 8e-05, + "loss": 67.2838, + "num_input_tokens_seen": 94394660, + "step": 1833 + }, + { + "epoch": 0.18043782708041572, + "grad_norm": 36.33224105834961, + "learning_rate": 8e-05, + "loss": 68.2328, + "num_input_tokens_seen": 94554828, + "step": 1836 + }, + { + "epoch": 0.1807326601312007, + "grad_norm": 33.85842514038086, + "learning_rate": 8e-05, + "loss": 64.8147, + "num_input_tokens_seen": 94714520, + "step": 1839 + }, + { + "epoch": 0.1810274931819857, + "grad_norm": 36.62459945678711, + "learning_rate": 8e-05, + "loss": 65.6021, + "num_input_tokens_seen": 94861320, + "step": 1842 + }, + { + "epoch": 0.1813223262327707, + "grad_norm": 33.322139739990234, + "learning_rate": 8e-05, + "loss": 63.1429, + "num_input_tokens_seen": 95014704, + "step": 1845 + }, + { + "epoch": 0.18161715928355568, + "grad_norm": 34.736106872558594, + "learning_rate": 8e-05, + "loss": 66.7304, + "num_input_tokens_seen": 95170048, + "step": 1848 + }, + { + "epoch": 0.1819119923343407, + "grad_norm": 37.72227096557617, + "learning_rate": 8e-05, + "loss": 63.1081, + "num_input_tokens_seen": 95333344, + "step": 1851 + }, + { + "epoch": 0.18220682538512567, + "grad_norm": 32.574039459228516, + "learning_rate": 8e-05, + "loss": 69.0083, + "num_input_tokens_seen": 95504940, + "step": 1854 + }, + { + "epoch": 0.18250165843591065, + "grad_norm": 33.293087005615234, + "learning_rate": 8e-05, + "loss": 68.1676, + "num_input_tokens_seen": 95668432, + "step": 1857 + }, + { + "epoch": 0.18279649148669566, + "grad_norm": 36.98726272583008, + "learning_rate": 8e-05, + "loss": 69.9773, + "num_input_tokens_seen": 95811784, + "step": 1860 + }, + { + "epoch": 0.18309132453748064, + "grad_norm": 40.061279296875, + "learning_rate": 8e-05, + "loss": 65.8954, + "num_input_tokens_seen": 95986592, + "step": 1863 + }, + { + "epoch": 0.18338615758826565, + "grad_norm": 56.59510803222656, + "learning_rate": 8e-05, + "loss": 67.293, + "num_input_tokens_seen": 96161604, + "step": 1866 + }, + { + "epoch": 0.18368099063905063, + "grad_norm": 37.23174285888672, + "learning_rate": 8e-05, + "loss": 65.7594, + "num_input_tokens_seen": 96310164, + "step": 1869 + }, + { + "epoch": 0.18397582368983564, + "grad_norm": 40.08700942993164, + "learning_rate": 8e-05, + "loss": 65.6641, + "num_input_tokens_seen": 96432028, + "step": 1872 + }, + { + "epoch": 0.18427065674062063, + "grad_norm": 36.7364501953125, + "learning_rate": 8e-05, + "loss": 66.722, + "num_input_tokens_seen": 96585280, + "step": 1875 + }, + { + "epoch": 0.1845654897914056, + "grad_norm": 34.42292404174805, + "learning_rate": 8e-05, + "loss": 65.9969, + "num_input_tokens_seen": 96761724, + "step": 1878 + }, + { + "epoch": 0.18486032284219062, + "grad_norm": 36.30381393432617, + "learning_rate": 8e-05, + "loss": 65.4957, + "num_input_tokens_seen": 96911572, + "step": 1881 + }, + { + "epoch": 0.1851551558929756, + "grad_norm": 35.37347412109375, + "learning_rate": 8e-05, + "loss": 64.8689, + "num_input_tokens_seen": 97064136, + "step": 1884 + }, + { + "epoch": 0.1854499889437606, + "grad_norm": 35.22996139526367, + "learning_rate": 8e-05, + "loss": 62.8097, + "num_input_tokens_seen": 97225564, + "step": 1887 + }, + { + "epoch": 0.1857448219945456, + "grad_norm": 45.67708969116211, + "learning_rate": 8e-05, + "loss": 64.6179, + "num_input_tokens_seen": 97374648, + "step": 1890 + }, + { + "epoch": 0.18603965504533057, + "grad_norm": 31.792390823364258, + "learning_rate": 8e-05, + "loss": 66.9663, + "num_input_tokens_seen": 97534584, + "step": 1893 + }, + { + "epoch": 0.18633448809611558, + "grad_norm": 41.057430267333984, + "learning_rate": 8e-05, + "loss": 65.637, + "num_input_tokens_seen": 97684144, + "step": 1896 + }, + { + "epoch": 0.18662932114690056, + "grad_norm": 31.76915168762207, + "learning_rate": 8e-05, + "loss": 66.0923, + "num_input_tokens_seen": 97843812, + "step": 1899 + }, + { + "epoch": 0.18692415419768557, + "grad_norm": 38.473182678222656, + "learning_rate": 8e-05, + "loss": 62.1717, + "num_input_tokens_seen": 98004360, + "step": 1902 + }, + { + "epoch": 0.18721898724847055, + "grad_norm": 32.907623291015625, + "learning_rate": 8e-05, + "loss": 64.183, + "num_input_tokens_seen": 98157508, + "step": 1905 + }, + { + "epoch": 0.18751382029925553, + "grad_norm": 88.73799896240234, + "learning_rate": 8e-05, + "loss": 60.3581, + "num_input_tokens_seen": 98300784, + "step": 1908 + }, + { + "epoch": 0.18780865335004054, + "grad_norm": 34.77318572998047, + "learning_rate": 8e-05, + "loss": 65.4762, + "num_input_tokens_seen": 98447492, + "step": 1911 + }, + { + "epoch": 0.18810348640082553, + "grad_norm": 41.78057861328125, + "learning_rate": 8e-05, + "loss": 61.738, + "num_input_tokens_seen": 98612760, + "step": 1914 + }, + { + "epoch": 0.18839831945161054, + "grad_norm": 42.5244026184082, + "learning_rate": 8e-05, + "loss": 63.2749, + "num_input_tokens_seen": 98775560, + "step": 1917 + }, + { + "epoch": 0.18869315250239552, + "grad_norm": 35.50346374511719, + "learning_rate": 8e-05, + "loss": 62.9821, + "num_input_tokens_seen": 98923580, + "step": 1920 + }, + { + "epoch": 0.1889879855531805, + "grad_norm": 39.55344772338867, + "learning_rate": 8e-05, + "loss": 63.3595, + "num_input_tokens_seen": 99070644, + "step": 1923 + }, + { + "epoch": 0.1892828186039655, + "grad_norm": 59.06232833862305, + "learning_rate": 8e-05, + "loss": 67.8251, + "num_input_tokens_seen": 99221436, + "step": 1926 + }, + { + "epoch": 0.1895776516547505, + "grad_norm": 35.27766418457031, + "learning_rate": 8e-05, + "loss": 65.3675, + "num_input_tokens_seen": 99358224, + "step": 1929 + }, + { + "epoch": 0.1898724847055355, + "grad_norm": 35.66068649291992, + "learning_rate": 8e-05, + "loss": 61.189, + "num_input_tokens_seen": 99499396, + "step": 1932 + }, + { + "epoch": 0.19016731775632048, + "grad_norm": 41.50188064575195, + "learning_rate": 8e-05, + "loss": 66.0529, + "num_input_tokens_seen": 99641728, + "step": 1935 + }, + { + "epoch": 0.1904621508071055, + "grad_norm": 35.998775482177734, + "learning_rate": 8e-05, + "loss": 61.1702, + "num_input_tokens_seen": 99775096, + "step": 1938 + }, + { + "epoch": 0.19075698385789047, + "grad_norm": 125.73440551757812, + "learning_rate": 8e-05, + "loss": 62.1858, + "num_input_tokens_seen": 99931240, + "step": 1941 + }, + { + "epoch": 0.19105181690867545, + "grad_norm": 39.456817626953125, + "learning_rate": 8e-05, + "loss": 65.4074, + "num_input_tokens_seen": 100080408, + "step": 1944 + }, + { + "epoch": 0.19134664995946046, + "grad_norm": 47.685970306396484, + "learning_rate": 8e-05, + "loss": 60.8324, + "num_input_tokens_seen": 100230596, + "step": 1947 + }, + { + "epoch": 0.19164148301024544, + "grad_norm": 48.207733154296875, + "learning_rate": 8e-05, + "loss": 64.0891, + "num_input_tokens_seen": 100387164, + "step": 1950 + }, + { + "epoch": 0.19193631606103045, + "grad_norm": 97.84766387939453, + "learning_rate": 8e-05, + "loss": 67.9796, + "num_input_tokens_seen": 100531688, + "step": 1953 + }, + { + "epoch": 0.19223114911181544, + "grad_norm": 53.36372756958008, + "learning_rate": 8e-05, + "loss": 62.5783, + "num_input_tokens_seen": 100688964, + "step": 1956 + }, + { + "epoch": 0.19252598216260042, + "grad_norm": 42.483978271484375, + "learning_rate": 8e-05, + "loss": 63.2629, + "num_input_tokens_seen": 100848816, + "step": 1959 + }, + { + "epoch": 0.19282081521338543, + "grad_norm": 100.14268493652344, + "learning_rate": 8e-05, + "loss": 64.8723, + "num_input_tokens_seen": 100998108, + "step": 1962 + }, + { + "epoch": 0.1931156482641704, + "grad_norm": 51.66161346435547, + "learning_rate": 8e-05, + "loss": 65.4101, + "num_input_tokens_seen": 101160364, + "step": 1965 + }, + { + "epoch": 0.19341048131495542, + "grad_norm": 86.82875061035156, + "learning_rate": 8e-05, + "loss": 64.7327, + "num_input_tokens_seen": 101314144, + "step": 1968 + }, + { + "epoch": 0.1937053143657404, + "grad_norm": 37.22882843017578, + "learning_rate": 8e-05, + "loss": 64.4545, + "num_input_tokens_seen": 101461764, + "step": 1971 + }, + { + "epoch": 0.19400014741652538, + "grad_norm": 54.93822479248047, + "learning_rate": 8e-05, + "loss": 67.0109, + "num_input_tokens_seen": 101594676, + "step": 1974 + }, + { + "epoch": 0.1942949804673104, + "grad_norm": 37.24103927612305, + "learning_rate": 8e-05, + "loss": 63.0324, + "num_input_tokens_seen": 101758344, + "step": 1977 + }, + { + "epoch": 0.19458981351809537, + "grad_norm": 32.88393783569336, + "learning_rate": 8e-05, + "loss": 63.9217, + "num_input_tokens_seen": 101909124, + "step": 1980 + }, + { + "epoch": 0.19488464656888038, + "grad_norm": 73.78339385986328, + "learning_rate": 8e-05, + "loss": 60.9234, + "num_input_tokens_seen": 102057364, + "step": 1983 + }, + { + "epoch": 0.19517947961966536, + "grad_norm": 40.5337028503418, + "learning_rate": 8e-05, + "loss": 64.0662, + "num_input_tokens_seen": 102223364, + "step": 1986 + }, + { + "epoch": 0.19547431267045035, + "grad_norm": 82.47228240966797, + "learning_rate": 8e-05, + "loss": 65.104, + "num_input_tokens_seen": 102379700, + "step": 1989 + }, + { + "epoch": 0.19576914572123535, + "grad_norm": 48.52934265136719, + "learning_rate": 8e-05, + "loss": 63.7737, + "num_input_tokens_seen": 102538772, + "step": 1992 + }, + { + "epoch": 0.19606397877202034, + "grad_norm": 105.58226013183594, + "learning_rate": 8e-05, + "loss": 61.4124, + "num_input_tokens_seen": 102692596, + "step": 1995 + }, + { + "epoch": 0.19635881182280535, + "grad_norm": 132.78990173339844, + "learning_rate": 8e-05, + "loss": 65.7669, + "num_input_tokens_seen": 102836944, + "step": 1998 + }, + { + "epoch": 0.19655536718999533, + "eval_gen_len": 41.59, + "eval_loss": 4.052365779876709, + "eval_rouge1": 27.4318, + "eval_rouge2": 11.4034, + "eval_rougeL": 24.5864, + "eval_rougeLsum": 24.8835, + "eval_runtime": 146.6637, + "eval_samples_per_second": 1.364, + "eval_steps_per_second": 0.341, + "num_input_tokens_seen": 102933044, + "step": 2000 + }, + { + "epoch": 0.19665364487359033, + "grad_norm": 50.9435920715332, + "learning_rate": 8e-05, + "loss": 65.0293, + "num_input_tokens_seen": 102991844, + "step": 2001 + }, + { + "epoch": 0.1969484779243753, + "grad_norm": 47.73625564575195, + "learning_rate": 8e-05, + "loss": 63.8356, + "num_input_tokens_seen": 103117500, + "step": 2004 + }, + { + "epoch": 0.19724331097516032, + "grad_norm": 34.227874755859375, + "learning_rate": 8e-05, + "loss": 61.5276, + "num_input_tokens_seen": 103281116, + "step": 2007 + }, + { + "epoch": 0.1975381440259453, + "grad_norm": 74.65849304199219, + "learning_rate": 8e-05, + "loss": 62.754, + "num_input_tokens_seen": 103437116, + "step": 2010 + }, + { + "epoch": 0.1978329770767303, + "grad_norm": 33.34313201904297, + "learning_rate": 8e-05, + "loss": 63.4937, + "num_input_tokens_seen": 103608180, + "step": 2013 + }, + { + "epoch": 0.1981278101275153, + "grad_norm": 40.12400817871094, + "learning_rate": 8e-05, + "loss": 64.5351, + "num_input_tokens_seen": 103747448, + "step": 2016 + }, + { + "epoch": 0.1984226431783003, + "grad_norm": 77.02454376220703, + "learning_rate": 8e-05, + "loss": 63.297, + "num_input_tokens_seen": 103911272, + "step": 2019 + }, + { + "epoch": 0.19871747622908528, + "grad_norm": 133.56906127929688, + "learning_rate": 8e-05, + "loss": 60.8824, + "num_input_tokens_seen": 104053508, + "step": 2022 + }, + { + "epoch": 0.19901230927987026, + "grad_norm": 41.40303421020508, + "learning_rate": 8e-05, + "loss": 60.8633, + "num_input_tokens_seen": 104197836, + "step": 2025 + }, + { + "epoch": 0.19930714233065527, + "grad_norm": 44.42088317871094, + "learning_rate": 8e-05, + "loss": 63.3138, + "num_input_tokens_seen": 104321044, + "step": 2028 + }, + { + "epoch": 0.19960197538144026, + "grad_norm": 48.47317886352539, + "learning_rate": 8e-05, + "loss": 64.958, + "num_input_tokens_seen": 104487040, + "step": 2031 + }, + { + "epoch": 0.19989680843222526, + "grad_norm": 40.33289337158203, + "learning_rate": 8e-05, + "loss": 65.5737, + "num_input_tokens_seen": 104656620, + "step": 2034 + }, + { + "epoch": 0.20019164148301025, + "grad_norm": 37.34440612792969, + "learning_rate": 8e-05, + "loss": 63.7026, + "num_input_tokens_seen": 104807568, + "step": 2037 + }, + { + "epoch": 0.20048647453379523, + "grad_norm": 40.222557067871094, + "learning_rate": 8e-05, + "loss": 60.0065, + "num_input_tokens_seen": 104976912, + "step": 2040 + }, + { + "epoch": 0.20078130758458024, + "grad_norm": 115.03577423095703, + "learning_rate": 8e-05, + "loss": 63.1647, + "num_input_tokens_seen": 105134276, + "step": 2043 + }, + { + "epoch": 0.20107614063536522, + "grad_norm": 36.157569885253906, + "learning_rate": 8e-05, + "loss": 62.082, + "num_input_tokens_seen": 105299968, + "step": 2046 + }, + { + "epoch": 0.20137097368615023, + "grad_norm": 37.57674789428711, + "learning_rate": 8e-05, + "loss": 62.3581, + "num_input_tokens_seen": 105448460, + "step": 2049 + }, + { + "epoch": 0.2016658067369352, + "grad_norm": 36.60391616821289, + "learning_rate": 8e-05, + "loss": 60.2257, + "num_input_tokens_seen": 105620848, + "step": 2052 + }, + { + "epoch": 0.2019606397877202, + "grad_norm": 33.81732177734375, + "learning_rate": 8e-05, + "loss": 63.3224, + "num_input_tokens_seen": 105769236, + "step": 2055 + }, + { + "epoch": 0.2022554728385052, + "grad_norm": 40.38296890258789, + "learning_rate": 8e-05, + "loss": 63.6429, + "num_input_tokens_seen": 105921540, + "step": 2058 + }, + { + "epoch": 0.20255030588929018, + "grad_norm": 36.80983352661133, + "learning_rate": 8e-05, + "loss": 60.2437, + "num_input_tokens_seen": 106082260, + "step": 2061 + }, + { + "epoch": 0.2028451389400752, + "grad_norm": 67.15686798095703, + "learning_rate": 8e-05, + "loss": 60.4693, + "num_input_tokens_seen": 106241504, + "step": 2064 + }, + { + "epoch": 0.20313997199086017, + "grad_norm": 34.85142135620117, + "learning_rate": 8e-05, + "loss": 62.0851, + "num_input_tokens_seen": 106395036, + "step": 2067 + }, + { + "epoch": 0.20343480504164516, + "grad_norm": 34.57643508911133, + "learning_rate": 8e-05, + "loss": 67.5032, + "num_input_tokens_seen": 106552940, + "step": 2070 + }, + { + "epoch": 0.20372963809243017, + "grad_norm": 41.602684020996094, + "learning_rate": 8e-05, + "loss": 59.1889, + "num_input_tokens_seen": 106705432, + "step": 2073 + }, + { + "epoch": 0.20402447114321515, + "grad_norm": 35.24937438964844, + "learning_rate": 8e-05, + "loss": 64.8413, + "num_input_tokens_seen": 106862652, + "step": 2076 + }, + { + "epoch": 0.20431930419400016, + "grad_norm": 33.99971389770508, + "learning_rate": 8e-05, + "loss": 65.4623, + "num_input_tokens_seen": 107025212, + "step": 2079 + }, + { + "epoch": 0.20461413724478514, + "grad_norm": 37.50223922729492, + "learning_rate": 8e-05, + "loss": 61.1447, + "num_input_tokens_seen": 107200104, + "step": 2082 + }, + { + "epoch": 0.20490897029557012, + "grad_norm": 39.0561637878418, + "learning_rate": 8e-05, + "loss": 64.2402, + "num_input_tokens_seen": 107351344, + "step": 2085 + }, + { + "epoch": 0.20520380334635513, + "grad_norm": 36.96356201171875, + "learning_rate": 8e-05, + "loss": 64.7978, + "num_input_tokens_seen": 107502508, + "step": 2088 + }, + { + "epoch": 0.2054986363971401, + "grad_norm": 34.923370361328125, + "learning_rate": 8e-05, + "loss": 62.0066, + "num_input_tokens_seen": 107659476, + "step": 2091 + }, + { + "epoch": 0.20579346944792512, + "grad_norm": 33.66584014892578, + "learning_rate": 8e-05, + "loss": 62.7879, + "num_input_tokens_seen": 107813464, + "step": 2094 + }, + { + "epoch": 0.2060883024987101, + "grad_norm": 43.05538558959961, + "learning_rate": 8e-05, + "loss": 65.1983, + "num_input_tokens_seen": 107945632, + "step": 2097 + }, + { + "epoch": 0.2063831355494951, + "grad_norm": 46.15825653076172, + "learning_rate": 8e-05, + "loss": 62.3188, + "num_input_tokens_seen": 108105348, + "step": 2100 + }, + { + "epoch": 0.2066779686002801, + "grad_norm": 38.95362091064453, + "learning_rate": 8e-05, + "loss": 63.0942, + "num_input_tokens_seen": 108263528, + "step": 2103 + }, + { + "epoch": 0.20697280165106507, + "grad_norm": 30.686132431030273, + "learning_rate": 8e-05, + "loss": 60.3785, + "num_input_tokens_seen": 108411408, + "step": 2106 + }, + { + "epoch": 0.20726763470185008, + "grad_norm": 47.61467742919922, + "learning_rate": 8e-05, + "loss": 61.1736, + "num_input_tokens_seen": 108577660, + "step": 2109 + }, + { + "epoch": 0.20756246775263507, + "grad_norm": 35.973411560058594, + "learning_rate": 8e-05, + "loss": 65.4033, + "num_input_tokens_seen": 108731036, + "step": 2112 + }, + { + "epoch": 0.20785730080342008, + "grad_norm": 36.56504440307617, + "learning_rate": 8e-05, + "loss": 59.0829, + "num_input_tokens_seen": 108874528, + "step": 2115 + }, + { + "epoch": 0.20815213385420506, + "grad_norm": 31.043546676635742, + "learning_rate": 8e-05, + "loss": 61.4159, + "num_input_tokens_seen": 109028200, + "step": 2118 + }, + { + "epoch": 0.20844696690499004, + "grad_norm": 28.701793670654297, + "learning_rate": 8e-05, + "loss": 59.8725, + "num_input_tokens_seen": 109193424, + "step": 2121 + }, + { + "epoch": 0.20874179995577505, + "grad_norm": 36.71512985229492, + "learning_rate": 8e-05, + "loss": 61.5933, + "num_input_tokens_seen": 109348680, + "step": 2124 + }, + { + "epoch": 0.20903663300656003, + "grad_norm": 33.578277587890625, + "learning_rate": 8e-05, + "loss": 62.275, + "num_input_tokens_seen": 109490436, + "step": 2127 + }, + { + "epoch": 0.20933146605734504, + "grad_norm": 35.8470573425293, + "learning_rate": 8e-05, + "loss": 60.6259, + "num_input_tokens_seen": 109651880, + "step": 2130 + }, + { + "epoch": 0.20962629910813002, + "grad_norm": 30.997970581054688, + "learning_rate": 8e-05, + "loss": 58.4895, + "num_input_tokens_seen": 109786272, + "step": 2133 + }, + { + "epoch": 0.209921132158915, + "grad_norm": 85.87825012207031, + "learning_rate": 8e-05, + "loss": 62.2673, + "num_input_tokens_seen": 109946588, + "step": 2136 + }, + { + "epoch": 0.2102159652097, + "grad_norm": 46.45925521850586, + "learning_rate": 8e-05, + "loss": 61.9533, + "num_input_tokens_seen": 110106728, + "step": 2139 + }, + { + "epoch": 0.210510798260485, + "grad_norm": 35.03594970703125, + "learning_rate": 8e-05, + "loss": 61.7035, + "num_input_tokens_seen": 110278860, + "step": 2142 + }, + { + "epoch": 0.21080563131127, + "grad_norm": 35.61140441894531, + "learning_rate": 8e-05, + "loss": 58.79, + "num_input_tokens_seen": 110423348, + "step": 2145 + }, + { + "epoch": 0.21110046436205498, + "grad_norm": 48.249786376953125, + "learning_rate": 8e-05, + "loss": 60.4113, + "num_input_tokens_seen": 110576468, + "step": 2148 + }, + { + "epoch": 0.21139529741283997, + "grad_norm": 40.02042007446289, + "learning_rate": 8e-05, + "loss": 60.7596, + "num_input_tokens_seen": 110727952, + "step": 2151 + }, + { + "epoch": 0.21169013046362498, + "grad_norm": 36.13483810424805, + "learning_rate": 8e-05, + "loss": 62.302, + "num_input_tokens_seen": 110873488, + "step": 2154 + }, + { + "epoch": 0.21198496351440996, + "grad_norm": 37.476776123046875, + "learning_rate": 8e-05, + "loss": 61.4339, + "num_input_tokens_seen": 111019072, + "step": 2157 + }, + { + "epoch": 0.21227979656519497, + "grad_norm": 31.12283706665039, + "learning_rate": 8e-05, + "loss": 57.8079, + "num_input_tokens_seen": 111175896, + "step": 2160 + }, + { + "epoch": 0.21257462961597995, + "grad_norm": 33.844364166259766, + "learning_rate": 8e-05, + "loss": 62.2772, + "num_input_tokens_seen": 111309704, + "step": 2163 + }, + { + "epoch": 0.21286946266676496, + "grad_norm": 39.86318588256836, + "learning_rate": 8e-05, + "loss": 56.3056, + "num_input_tokens_seen": 111484440, + "step": 2166 + }, + { + "epoch": 0.21316429571754994, + "grad_norm": 36.426536560058594, + "learning_rate": 8e-05, + "loss": 64.5189, + "num_input_tokens_seen": 111623400, + "step": 2169 + }, + { + "epoch": 0.21345912876833492, + "grad_norm": 32.7545051574707, + "learning_rate": 8e-05, + "loss": 60.6215, + "num_input_tokens_seen": 111775816, + "step": 2172 + }, + { + "epoch": 0.21375396181911993, + "grad_norm": 80.18124389648438, + "learning_rate": 8e-05, + "loss": 61.9464, + "num_input_tokens_seen": 111924912, + "step": 2175 + }, + { + "epoch": 0.2140487948699049, + "grad_norm": 40.502376556396484, + "learning_rate": 8e-05, + "loss": 63.1164, + "num_input_tokens_seen": 112065796, + "step": 2178 + }, + { + "epoch": 0.21434362792068992, + "grad_norm": 61.75509262084961, + "learning_rate": 8e-05, + "loss": 57.283, + "num_input_tokens_seen": 112235840, + "step": 2181 + }, + { + "epoch": 0.2146384609714749, + "grad_norm": 29.798152923583984, + "learning_rate": 8e-05, + "loss": 58.6705, + "num_input_tokens_seen": 112382700, + "step": 2184 + }, + { + "epoch": 0.21493329402225989, + "grad_norm": 35.83757781982422, + "learning_rate": 8e-05, + "loss": 56.2894, + "num_input_tokens_seen": 112533292, + "step": 2187 + }, + { + "epoch": 0.2152281270730449, + "grad_norm": 42.107147216796875, + "learning_rate": 8e-05, + "loss": 63.6896, + "num_input_tokens_seen": 112687044, + "step": 2190 + }, + { + "epoch": 0.21552296012382988, + "grad_norm": 37.97938537597656, + "learning_rate": 8e-05, + "loss": 61.2909, + "num_input_tokens_seen": 112824956, + "step": 2193 + }, + { + "epoch": 0.21581779317461489, + "grad_norm": 87.84455871582031, + "learning_rate": 8e-05, + "loss": 61.1541, + "num_input_tokens_seen": 112998488, + "step": 2196 + }, + { + "epoch": 0.21611262622539987, + "grad_norm": 49.55474853515625, + "learning_rate": 8e-05, + "loss": 60.0391, + "num_input_tokens_seen": 113155280, + "step": 2199 + }, + { + "epoch": 0.21640745927618485, + "grad_norm": 90.66659545898438, + "learning_rate": 8e-05, + "loss": 60.7969, + "num_input_tokens_seen": 113330904, + "step": 2202 + }, + { + "epoch": 0.21670229232696986, + "grad_norm": 38.3773078918457, + "learning_rate": 8e-05, + "loss": 62.971, + "num_input_tokens_seen": 113484604, + "step": 2205 + }, + { + "epoch": 0.21699712537775484, + "grad_norm": 36.9621696472168, + "learning_rate": 8e-05, + "loss": 61.0875, + "num_input_tokens_seen": 113649252, + "step": 2208 + }, + { + "epoch": 0.21729195842853985, + "grad_norm": 44.32843017578125, + "learning_rate": 8e-05, + "loss": 56.9424, + "num_input_tokens_seen": 113793828, + "step": 2211 + }, + { + "epoch": 0.21758679147932483, + "grad_norm": 34.17706298828125, + "learning_rate": 8e-05, + "loss": 56.7555, + "num_input_tokens_seen": 113955836, + "step": 2214 + }, + { + "epoch": 0.2178816245301098, + "grad_norm": 33.80354309082031, + "learning_rate": 8e-05, + "loss": 59.7268, + "num_input_tokens_seen": 114119876, + "step": 2217 + }, + { + "epoch": 0.21817645758089482, + "grad_norm": 52.79160690307617, + "learning_rate": 8e-05, + "loss": 59.4334, + "num_input_tokens_seen": 114271076, + "step": 2220 + }, + { + "epoch": 0.2184712906316798, + "grad_norm": 31.56442642211914, + "learning_rate": 8e-05, + "loss": 61.6311, + "num_input_tokens_seen": 114431644, + "step": 2223 + }, + { + "epoch": 0.2187661236824648, + "grad_norm": 37.805503845214844, + "learning_rate": 8e-05, + "loss": 59.168, + "num_input_tokens_seen": 114598224, + "step": 2226 + }, + { + "epoch": 0.2190609567332498, + "grad_norm": 36.063968658447266, + "learning_rate": 8e-05, + "loss": 58.9546, + "num_input_tokens_seen": 114751272, + "step": 2229 + }, + { + "epoch": 0.21935578978403478, + "grad_norm": 278.02459716796875, + "learning_rate": 8e-05, + "loss": 55.4246, + "num_input_tokens_seen": 114878676, + "step": 2232 + }, + { + "epoch": 0.2196506228348198, + "grad_norm": 34.540164947509766, + "learning_rate": 8e-05, + "loss": 61.1208, + "num_input_tokens_seen": 115022248, + "step": 2235 + }, + { + "epoch": 0.21994545588560477, + "grad_norm": 43.42741775512695, + "learning_rate": 8e-05, + "loss": 59.6594, + "num_input_tokens_seen": 115202884, + "step": 2238 + }, + { + "epoch": 0.22024028893638978, + "grad_norm": 53.860260009765625, + "learning_rate": 8e-05, + "loss": 59.714, + "num_input_tokens_seen": 115338936, + "step": 2241 + }, + { + "epoch": 0.22053512198717476, + "grad_norm": 37.03718185424805, + "learning_rate": 8e-05, + "loss": 61.3456, + "num_input_tokens_seen": 115484732, + "step": 2244 + }, + { + "epoch": 0.22082995503795977, + "grad_norm": 41.423885345458984, + "learning_rate": 8e-05, + "loss": 62.5182, + "num_input_tokens_seen": 115633372, + "step": 2247 + }, + { + "epoch": 0.22112478808874475, + "grad_norm": 38.18406295776367, + "learning_rate": 8e-05, + "loss": 58.236, + "num_input_tokens_seen": 115796768, + "step": 2250 + }, + { + "epoch": 0.22141962113952973, + "grad_norm": 35.71892166137695, + "learning_rate": 8e-05, + "loss": 56.6222, + "num_input_tokens_seen": 115939736, + "step": 2253 + }, + { + "epoch": 0.22171445419031474, + "grad_norm": 37.585693359375, + "learning_rate": 8e-05, + "loss": 58.3685, + "num_input_tokens_seen": 116096544, + "step": 2256 + }, + { + "epoch": 0.22200928724109972, + "grad_norm": 31.749067306518555, + "learning_rate": 8e-05, + "loss": 59.9537, + "num_input_tokens_seen": 116267532, + "step": 2259 + }, + { + "epoch": 0.22230412029188473, + "grad_norm": 32.64338302612305, + "learning_rate": 8e-05, + "loss": 60.2476, + "num_input_tokens_seen": 116429168, + "step": 2262 + }, + { + "epoch": 0.22259895334266971, + "grad_norm": 36.87150573730469, + "learning_rate": 8e-05, + "loss": 59.6034, + "num_input_tokens_seen": 116583948, + "step": 2265 + }, + { + "epoch": 0.2228937863934547, + "grad_norm": 35.45121383666992, + "learning_rate": 8e-05, + "loss": 61.7586, + "num_input_tokens_seen": 116727856, + "step": 2268 + }, + { + "epoch": 0.2231886194442397, + "grad_norm": 57.21012878417969, + "learning_rate": 8e-05, + "loss": 57.9647, + "num_input_tokens_seen": 116878736, + "step": 2271 + }, + { + "epoch": 0.2234834524950247, + "grad_norm": 37.94432830810547, + "learning_rate": 8e-05, + "loss": 60.7469, + "num_input_tokens_seen": 117057792, + "step": 2274 + }, + { + "epoch": 0.2237782855458097, + "grad_norm": 30.563480377197266, + "learning_rate": 8e-05, + "loss": 56.5987, + "num_input_tokens_seen": 117225848, + "step": 2277 + }, + { + "epoch": 0.22407311859659468, + "grad_norm": 31.604551315307617, + "learning_rate": 8e-05, + "loss": 60.5782, + "num_input_tokens_seen": 117357464, + "step": 2280 + }, + { + "epoch": 0.22436795164737966, + "grad_norm": 36.0341682434082, + "learning_rate": 8e-05, + "loss": 59.2931, + "num_input_tokens_seen": 117506908, + "step": 2283 + }, + { + "epoch": 0.22466278469816467, + "grad_norm": 35.426395416259766, + "learning_rate": 8e-05, + "loss": 61.6121, + "num_input_tokens_seen": 117663704, + "step": 2286 + }, + { + "epoch": 0.22495761774894965, + "grad_norm": 43.54740524291992, + "learning_rate": 8e-05, + "loss": 58.8303, + "num_input_tokens_seen": 117807716, + "step": 2289 + }, + { + "epoch": 0.22525245079973466, + "grad_norm": 35.09873962402344, + "learning_rate": 8e-05, + "loss": 57.9195, + "num_input_tokens_seen": 117980380, + "step": 2292 + }, + { + "epoch": 0.22554728385051964, + "grad_norm": 37.0145263671875, + "learning_rate": 8e-05, + "loss": 61.547, + "num_input_tokens_seen": 118156204, + "step": 2295 + }, + { + "epoch": 0.22584211690130462, + "grad_norm": 37.9959831237793, + "learning_rate": 8e-05, + "loss": 57.3029, + "num_input_tokens_seen": 118287944, + "step": 2298 + }, + { + "epoch": 0.22613694995208963, + "grad_norm": 42.40966033935547, + "learning_rate": 8e-05, + "loss": 61.7216, + "num_input_tokens_seen": 118442436, + "step": 2301 + }, + { + "epoch": 0.22643178300287461, + "grad_norm": 34.89582061767578, + "learning_rate": 8e-05, + "loss": 62.3091, + "num_input_tokens_seen": 118595600, + "step": 2304 + }, + { + "epoch": 0.22672661605365962, + "grad_norm": 32.897647857666016, + "learning_rate": 8e-05, + "loss": 56.707, + "num_input_tokens_seen": 118755620, + "step": 2307 + }, + { + "epoch": 0.2270214491044446, + "grad_norm": 50.87218475341797, + "learning_rate": 8e-05, + "loss": 58.4343, + "num_input_tokens_seen": 118917800, + "step": 2310 + }, + { + "epoch": 0.2273162821552296, + "grad_norm": 36.530635833740234, + "learning_rate": 8e-05, + "loss": 55.3719, + "num_input_tokens_seen": 119066396, + "step": 2313 + }, + { + "epoch": 0.2276111152060146, + "grad_norm": 38.81621551513672, + "learning_rate": 8e-05, + "loss": 62.1988, + "num_input_tokens_seen": 119226812, + "step": 2316 + }, + { + "epoch": 0.22790594825679958, + "grad_norm": 33.9807243347168, + "learning_rate": 8e-05, + "loss": 55.9138, + "num_input_tokens_seen": 119388700, + "step": 2319 + }, + { + "epoch": 0.2282007813075846, + "grad_norm": 45.930171966552734, + "learning_rate": 8e-05, + "loss": 58.6786, + "num_input_tokens_seen": 119529208, + "step": 2322 + }, + { + "epoch": 0.22849561435836957, + "grad_norm": 38.361358642578125, + "learning_rate": 8e-05, + "loss": 60.9187, + "num_input_tokens_seen": 119673064, + "step": 2325 + }, + { + "epoch": 0.22879044740915458, + "grad_norm": 33.112945556640625, + "learning_rate": 8e-05, + "loss": 59.7167, + "num_input_tokens_seen": 119812848, + "step": 2328 + }, + { + "epoch": 0.22908528045993956, + "grad_norm": 37.15597915649414, + "learning_rate": 8e-05, + "loss": 62.2376, + "num_input_tokens_seen": 119964060, + "step": 2331 + }, + { + "epoch": 0.22938011351072454, + "grad_norm": 38.77449035644531, + "learning_rate": 8e-05, + "loss": 61.0384, + "num_input_tokens_seen": 120133004, + "step": 2334 + }, + { + "epoch": 0.22967494656150955, + "grad_norm": 35.40461730957031, + "learning_rate": 8e-05, + "loss": 57.5117, + "num_input_tokens_seen": 120282564, + "step": 2337 + }, + { + "epoch": 0.22996977961229453, + "grad_norm": 39.79495620727539, + "learning_rate": 8e-05, + "loss": 61.9955, + "num_input_tokens_seen": 120457480, + "step": 2340 + }, + { + "epoch": 0.23026461266307954, + "grad_norm": 33.81966018676758, + "learning_rate": 8e-05, + "loss": 57.1984, + "num_input_tokens_seen": 120604636, + "step": 2343 + }, + { + "epoch": 0.23055944571386452, + "grad_norm": 33.839759826660156, + "learning_rate": 8e-05, + "loss": 59.4311, + "num_input_tokens_seen": 120754696, + "step": 2346 + }, + { + "epoch": 0.2308542787646495, + "grad_norm": 34.786277770996094, + "learning_rate": 8e-05, + "loss": 61.8979, + "num_input_tokens_seen": 120895720, + "step": 2349 + }, + { + "epoch": 0.23114911181543452, + "grad_norm": 36.697044372558594, + "learning_rate": 8e-05, + "loss": 56.9062, + "num_input_tokens_seen": 121047480, + "step": 2352 + }, + { + "epoch": 0.2314439448662195, + "grad_norm": 37.01709747314453, + "learning_rate": 8e-05, + "loss": 61.0646, + "num_input_tokens_seen": 121209004, + "step": 2355 + }, + { + "epoch": 0.2317387779170045, + "grad_norm": 32.902713775634766, + "learning_rate": 8e-05, + "loss": 57.6042, + "num_input_tokens_seen": 121344680, + "step": 2358 + }, + { + "epoch": 0.2320336109677895, + "grad_norm": 48.72361755371094, + "learning_rate": 8e-05, + "loss": 59.6539, + "num_input_tokens_seen": 121501404, + "step": 2361 + }, + { + "epoch": 0.23232844401857447, + "grad_norm": 33.87553405761719, + "learning_rate": 8e-05, + "loss": 57.2243, + "num_input_tokens_seen": 121660472, + "step": 2364 + }, + { + "epoch": 0.23262327706935948, + "grad_norm": 35.26018142700195, + "learning_rate": 8e-05, + "loss": 57.8523, + "num_input_tokens_seen": 121818284, + "step": 2367 + }, + { + "epoch": 0.23291811012014446, + "grad_norm": 36.639827728271484, + "learning_rate": 8e-05, + "loss": 59.1539, + "num_input_tokens_seen": 121966424, + "step": 2370 + }, + { + "epoch": 0.23321294317092947, + "grad_norm": 39.612640380859375, + "learning_rate": 8e-05, + "loss": 60.4686, + "num_input_tokens_seen": 122139672, + "step": 2373 + }, + { + "epoch": 0.23350777622171445, + "grad_norm": 37.069705963134766, + "learning_rate": 8e-05, + "loss": 58.0885, + "num_input_tokens_seen": 122294392, + "step": 2376 + }, + { + "epoch": 0.23380260927249943, + "grad_norm": 35.78841781616211, + "learning_rate": 8e-05, + "loss": 56.5211, + "num_input_tokens_seen": 122444948, + "step": 2379 + }, + { + "epoch": 0.23409744232328444, + "grad_norm": 116.17157745361328, + "learning_rate": 8e-05, + "loss": 59.1547, + "num_input_tokens_seen": 122598056, + "step": 2382 + }, + { + "epoch": 0.23439227537406943, + "grad_norm": 35.66117477416992, + "learning_rate": 8e-05, + "loss": 57.1662, + "num_input_tokens_seen": 122744792, + "step": 2385 + }, + { + "epoch": 0.23468710842485443, + "grad_norm": 40.41534423828125, + "learning_rate": 8e-05, + "loss": 59.0607, + "num_input_tokens_seen": 122896372, + "step": 2388 + }, + { + "epoch": 0.23498194147563942, + "grad_norm": 38.96350860595703, + "learning_rate": 8e-05, + "loss": 61.4238, + "num_input_tokens_seen": 123069076, + "step": 2391 + }, + { + "epoch": 0.23527677452642443, + "grad_norm": 31.967472076416016, + "learning_rate": 8e-05, + "loss": 57.8557, + "num_input_tokens_seen": 123226456, + "step": 2394 + }, + { + "epoch": 0.2355716075772094, + "grad_norm": 31.418663024902344, + "learning_rate": 8e-05, + "loss": 56.2966, + "num_input_tokens_seen": 123376080, + "step": 2397 + }, + { + "epoch": 0.2358664406279944, + "grad_norm": 32.175514221191406, + "learning_rate": 8e-05, + "loss": 57.1659, + "num_input_tokens_seen": 123541412, + "step": 2400 + }, + { + "epoch": 0.2361612736787794, + "grad_norm": 34.742496490478516, + "learning_rate": 8e-05, + "loss": 59.9754, + "num_input_tokens_seen": 123706440, + "step": 2403 + }, + { + "epoch": 0.23645610672956438, + "grad_norm": 29.358062744140625, + "learning_rate": 8e-05, + "loss": 54.3595, + "num_input_tokens_seen": 123858644, + "step": 2406 + }, + { + "epoch": 0.2367509397803494, + "grad_norm": 33.7983283996582, + "learning_rate": 8e-05, + "loss": 55.3638, + "num_input_tokens_seen": 124025808, + "step": 2409 + }, + { + "epoch": 0.23704577283113437, + "grad_norm": 39.6863899230957, + "learning_rate": 8e-05, + "loss": 56.7066, + "num_input_tokens_seen": 124198856, + "step": 2412 + }, + { + "epoch": 0.23734060588191935, + "grad_norm": 35.42102813720703, + "learning_rate": 8e-05, + "loss": 58.0268, + "num_input_tokens_seen": 124340672, + "step": 2415 + }, + { + "epoch": 0.23763543893270436, + "grad_norm": 33.165950775146484, + "learning_rate": 8e-05, + "loss": 59.3338, + "num_input_tokens_seen": 124474688, + "step": 2418 + }, + { + "epoch": 0.23793027198348934, + "grad_norm": 35.83243179321289, + "learning_rate": 8e-05, + "loss": 56.8695, + "num_input_tokens_seen": 124628504, + "step": 2421 + }, + { + "epoch": 0.23822510503427435, + "grad_norm": 30.844099044799805, + "learning_rate": 8e-05, + "loss": 53.5109, + "num_input_tokens_seen": 124770256, + "step": 2424 + }, + { + "epoch": 0.23851993808505934, + "grad_norm": 47.932647705078125, + "learning_rate": 8e-05, + "loss": 55.0004, + "num_input_tokens_seen": 124920860, + "step": 2427 + }, + { + "epoch": 0.23881477113584432, + "grad_norm": 37.77656173706055, + "learning_rate": 8e-05, + "loss": 56.5262, + "num_input_tokens_seen": 125088196, + "step": 2430 + }, + { + "epoch": 0.23910960418662933, + "grad_norm": 36.80366897583008, + "learning_rate": 8e-05, + "loss": 56.8007, + "num_input_tokens_seen": 125243848, + "step": 2433 + }, + { + "epoch": 0.2394044372374143, + "grad_norm": 34.99855041503906, + "learning_rate": 8e-05, + "loss": 58.5576, + "num_input_tokens_seen": 125404240, + "step": 2436 + }, + { + "epoch": 0.23969927028819932, + "grad_norm": 338.2091979980469, + "learning_rate": 8e-05, + "loss": 57.9598, + "num_input_tokens_seen": 125566664, + "step": 2439 + }, + { + "epoch": 0.2399941033389843, + "grad_norm": 41.78690719604492, + "learning_rate": 8e-05, + "loss": 56.5716, + "num_input_tokens_seen": 125708800, + "step": 2442 + }, + { + "epoch": 0.24028893638976928, + "grad_norm": 33.10038757324219, + "learning_rate": 8e-05, + "loss": 55.7882, + "num_input_tokens_seen": 125865424, + "step": 2445 + }, + { + "epoch": 0.2405837694405543, + "grad_norm": 32.90071487426758, + "learning_rate": 8e-05, + "loss": 54.1681, + "num_input_tokens_seen": 125992848, + "step": 2448 + }, + { + "epoch": 0.24087860249133927, + "grad_norm": 38.925724029541016, + "learning_rate": 8e-05, + "loss": 56.0581, + "num_input_tokens_seen": 126151048, + "step": 2451 + }, + { + "epoch": 0.24117343554212428, + "grad_norm": 36.73653793334961, + "learning_rate": 8e-05, + "loss": 57.7794, + "num_input_tokens_seen": 126319592, + "step": 2454 + }, + { + "epoch": 0.24146826859290926, + "grad_norm": 41.0250244140625, + "learning_rate": 8e-05, + "loss": 54.5532, + "num_input_tokens_seen": 126481832, + "step": 2457 + }, + { + "epoch": 0.24176310164369424, + "grad_norm": 30.037355422973633, + "learning_rate": 8e-05, + "loss": 58.5533, + "num_input_tokens_seen": 126632916, + "step": 2460 + }, + { + "epoch": 0.24205793469447925, + "grad_norm": 44.85157775878906, + "learning_rate": 8e-05, + "loss": 58.7409, + "num_input_tokens_seen": 126777564, + "step": 2463 + }, + { + "epoch": 0.24235276774526424, + "grad_norm": 29.838281631469727, + "learning_rate": 8e-05, + "loss": 55.2551, + "num_input_tokens_seen": 126935656, + "step": 2466 + }, + { + "epoch": 0.24264760079604925, + "grad_norm": 35.31085205078125, + "learning_rate": 8e-05, + "loss": 55.8649, + "num_input_tokens_seen": 127082204, + "step": 2469 + }, + { + "epoch": 0.24294243384683423, + "grad_norm": 36.91703796386719, + "learning_rate": 8e-05, + "loss": 57.6778, + "num_input_tokens_seen": 127241464, + "step": 2472 + }, + { + "epoch": 0.24323726689761924, + "grad_norm": 31.66358757019043, + "learning_rate": 8e-05, + "loss": 56.7686, + "num_input_tokens_seen": 127397924, + "step": 2475 + }, + { + "epoch": 0.24353209994840422, + "grad_norm": 35.33116912841797, + "learning_rate": 8e-05, + "loss": 56.6367, + "num_input_tokens_seen": 127559408, + "step": 2478 + }, + { + "epoch": 0.2438269329991892, + "grad_norm": 34.71982192993164, + "learning_rate": 8e-05, + "loss": 60.8994, + "num_input_tokens_seen": 127728204, + "step": 2481 + }, + { + "epoch": 0.2441217660499742, + "grad_norm": 35.25178146362305, + "learning_rate": 8e-05, + "loss": 58.6995, + "num_input_tokens_seen": 127885936, + "step": 2484 + }, + { + "epoch": 0.2444165991007592, + "grad_norm": 39.45205307006836, + "learning_rate": 8e-05, + "loss": 55.208, + "num_input_tokens_seen": 128030752, + "step": 2487 + }, + { + "epoch": 0.2447114321515442, + "grad_norm": 36.48596954345703, + "learning_rate": 8e-05, + "loss": 55.4749, + "num_input_tokens_seen": 128191944, + "step": 2490 + }, + { + "epoch": 0.24500626520232918, + "grad_norm": 34.035316467285156, + "learning_rate": 8e-05, + "loss": 53.0131, + "num_input_tokens_seen": 128345520, + "step": 2493 + }, + { + "epoch": 0.24530109825311416, + "grad_norm": 30.92428207397461, + "learning_rate": 8e-05, + "loss": 54.5648, + "num_input_tokens_seen": 128490620, + "step": 2496 + }, + { + "epoch": 0.24559593130389917, + "grad_norm": 35.92305374145508, + "learning_rate": 8e-05, + "loss": 57.5855, + "num_input_tokens_seen": 128639248, + "step": 2499 + }, + { + "epoch": 0.24589076435468415, + "grad_norm": 35.16416931152344, + "learning_rate": 8e-05, + "loss": 55.8813, + "num_input_tokens_seen": 128806916, + "step": 2502 + }, + { + "epoch": 0.24618559740546916, + "grad_norm": 35.76316452026367, + "learning_rate": 8e-05, + "loss": 56.8321, + "num_input_tokens_seen": 128949452, + "step": 2505 + }, + { + "epoch": 0.24648043045625415, + "grad_norm": 32.33674240112305, + "learning_rate": 8e-05, + "loss": 53.35, + "num_input_tokens_seen": 129100852, + "step": 2508 + }, + { + "epoch": 0.24677526350703913, + "grad_norm": 33.43571472167969, + "learning_rate": 8e-05, + "loss": 57.0671, + "num_input_tokens_seen": 129252204, + "step": 2511 + }, + { + "epoch": 0.24707009655782414, + "grad_norm": 140.02317810058594, + "learning_rate": 8e-05, + "loss": 55.7291, + "num_input_tokens_seen": 129397452, + "step": 2514 + }, + { + "epoch": 0.24736492960860912, + "grad_norm": 37.19168472290039, + "learning_rate": 8e-05, + "loss": 53.1093, + "num_input_tokens_seen": 129535376, + "step": 2517 + }, + { + "epoch": 0.24765976265939413, + "grad_norm": 48.09902572631836, + "learning_rate": 8e-05, + "loss": 56.053, + "num_input_tokens_seen": 129676028, + "step": 2520 + }, + { + "epoch": 0.2479545957101791, + "grad_norm": 37.414947509765625, + "learning_rate": 8e-05, + "loss": 54.0977, + "num_input_tokens_seen": 129844620, + "step": 2523 + }, + { + "epoch": 0.2482494287609641, + "grad_norm": 77.83485412597656, + "learning_rate": 8e-05, + "loss": 53.7719, + "num_input_tokens_seen": 129978612, + "step": 2526 + }, + { + "epoch": 0.2485442618117491, + "grad_norm": 33.74465560913086, + "learning_rate": 8e-05, + "loss": 52.7374, + "num_input_tokens_seen": 130135364, + "step": 2529 + }, + { + "epoch": 0.24883909486253408, + "grad_norm": 35.31605911254883, + "learning_rate": 8e-05, + "loss": 55.9591, + "num_input_tokens_seen": 130296404, + "step": 2532 + }, + { + "epoch": 0.2491339279133191, + "grad_norm": 28.435291290283203, + "learning_rate": 8e-05, + "loss": 50.9603, + "num_input_tokens_seen": 130467904, + "step": 2535 + }, + { + "epoch": 0.24942876096410407, + "grad_norm": 40.538909912109375, + "learning_rate": 8e-05, + "loss": 55.7693, + "num_input_tokens_seen": 130627960, + "step": 2538 + }, + { + "epoch": 0.24972359401488906, + "grad_norm": 34.60548782348633, + "learning_rate": 8e-05, + "loss": 53.3829, + "num_input_tokens_seen": 130777832, + "step": 2541 + }, + { + "epoch": 0.25001842706567406, + "grad_norm": 36.60821533203125, + "learning_rate": 8e-05, + "loss": 56.8237, + "num_input_tokens_seen": 130921848, + "step": 2544 + }, + { + "epoch": 0.25031326011645905, + "grad_norm": 31.987855911254883, + "learning_rate": 8e-05, + "loss": 53.2189, + "num_input_tokens_seen": 131084284, + "step": 2547 + }, + { + "epoch": 0.25060809316724403, + "grad_norm": 40.75633239746094, + "learning_rate": 8e-05, + "loss": 57.2221, + "num_input_tokens_seen": 131236140, + "step": 2550 + }, + { + "epoch": 0.25090292621802907, + "grad_norm": 34.93666076660156, + "learning_rate": 8e-05, + "loss": 58.1397, + "num_input_tokens_seen": 131394640, + "step": 2553 + }, + { + "epoch": 0.25119775926881405, + "grad_norm": 30.523983001708984, + "learning_rate": 8e-05, + "loss": 56.3125, + "num_input_tokens_seen": 131544288, + "step": 2556 + }, + { + "epoch": 0.25149259231959903, + "grad_norm": 36.5615348815918, + "learning_rate": 8e-05, + "loss": 58.1565, + "num_input_tokens_seen": 131694888, + "step": 2559 + }, + { + "epoch": 0.251787425370384, + "grad_norm": 41.425392150878906, + "learning_rate": 8e-05, + "loss": 53.5081, + "num_input_tokens_seen": 131859732, + "step": 2562 + }, + { + "epoch": 0.252082258421169, + "grad_norm": 39.53923034667969, + "learning_rate": 8e-05, + "loss": 52.4747, + "num_input_tokens_seen": 131998368, + "step": 2565 + }, + { + "epoch": 0.25237709147195403, + "grad_norm": 36.601383209228516, + "learning_rate": 8e-05, + "loss": 56.8967, + "num_input_tokens_seen": 132155796, + "step": 2568 + }, + { + "epoch": 0.252671924522739, + "grad_norm": 31.163291931152344, + "learning_rate": 8e-05, + "loss": 54.3867, + "num_input_tokens_seen": 132304292, + "step": 2571 + }, + { + "epoch": 0.252966757573524, + "grad_norm": 49.13053512573242, + "learning_rate": 8e-05, + "loss": 57.8686, + "num_input_tokens_seen": 132467932, + "step": 2574 + }, + { + "epoch": 0.253261590624309, + "grad_norm": 38.996482849121094, + "learning_rate": 8e-05, + "loss": 52.0673, + "num_input_tokens_seen": 132617624, + "step": 2577 + }, + { + "epoch": 0.25355642367509396, + "grad_norm": 43.19272994995117, + "learning_rate": 8e-05, + "loss": 55.1637, + "num_input_tokens_seen": 132774048, + "step": 2580 + }, + { + "epoch": 0.253851256725879, + "grad_norm": 34.15322494506836, + "learning_rate": 8e-05, + "loss": 58.8447, + "num_input_tokens_seen": 132933688, + "step": 2583 + }, + { + "epoch": 0.254146089776664, + "grad_norm": 31.180694580078125, + "learning_rate": 8e-05, + "loss": 55.6801, + "num_input_tokens_seen": 133091016, + "step": 2586 + }, + { + "epoch": 0.25444092282744896, + "grad_norm": 35.01714324951172, + "learning_rate": 8e-05, + "loss": 54.9311, + "num_input_tokens_seen": 133224776, + "step": 2589 + }, + { + "epoch": 0.25473575587823394, + "grad_norm": 39.41815185546875, + "learning_rate": 8e-05, + "loss": 55.9144, + "num_input_tokens_seen": 133376372, + "step": 2592 + }, + { + "epoch": 0.2550305889290189, + "grad_norm": 36.53614044189453, + "learning_rate": 8e-05, + "loss": 57.2327, + "num_input_tokens_seen": 133537224, + "step": 2595 + }, + { + "epoch": 0.25532542197980396, + "grad_norm": 34.72364807128906, + "learning_rate": 8e-05, + "loss": 56.9296, + "num_input_tokens_seen": 133704596, + "step": 2598 + }, + { + "epoch": 0.25562025503058894, + "grad_norm": 34.47258758544922, + "learning_rate": 8e-05, + "loss": 52.5994, + "num_input_tokens_seen": 133853996, + "step": 2601 + }, + { + "epoch": 0.2559150880813739, + "grad_norm": 38.56904220581055, + "learning_rate": 8e-05, + "loss": 53.1313, + "num_input_tokens_seen": 134009856, + "step": 2604 + }, + { + "epoch": 0.2562099211321589, + "grad_norm": 32.298828125, + "learning_rate": 8e-05, + "loss": 55.5707, + "num_input_tokens_seen": 134152368, + "step": 2607 + }, + { + "epoch": 0.2565047541829439, + "grad_norm": 38.61540985107422, + "learning_rate": 8e-05, + "loss": 57.645, + "num_input_tokens_seen": 134319592, + "step": 2610 + }, + { + "epoch": 0.2567995872337289, + "grad_norm": 49.49209213256836, + "learning_rate": 8e-05, + "loss": 52.0047, + "num_input_tokens_seen": 134464892, + "step": 2613 + }, + { + "epoch": 0.2570944202845139, + "grad_norm": 51.2288932800293, + "learning_rate": 8e-05, + "loss": 54.2843, + "num_input_tokens_seen": 134616000, + "step": 2616 + }, + { + "epoch": 0.2573892533352989, + "grad_norm": 31.375375747680664, + "learning_rate": 8e-05, + "loss": 53.1026, + "num_input_tokens_seen": 134774184, + "step": 2619 + }, + { + "epoch": 0.25768408638608387, + "grad_norm": 34.40566635131836, + "learning_rate": 8e-05, + "loss": 56.4985, + "num_input_tokens_seen": 134940048, + "step": 2622 + }, + { + "epoch": 0.25797891943686885, + "grad_norm": 33.19921112060547, + "learning_rate": 8e-05, + "loss": 53.7378, + "num_input_tokens_seen": 135099000, + "step": 2625 + }, + { + "epoch": 0.2582737524876539, + "grad_norm": 35.480255126953125, + "learning_rate": 8e-05, + "loss": 51.1283, + "num_input_tokens_seen": 135251292, + "step": 2628 + }, + { + "epoch": 0.25856858553843887, + "grad_norm": 51.189632415771484, + "learning_rate": 8e-05, + "loss": 58.2569, + "num_input_tokens_seen": 135414044, + "step": 2631 + }, + { + "epoch": 0.25886341858922385, + "grad_norm": 33.631473541259766, + "learning_rate": 8e-05, + "loss": 54.2228, + "num_input_tokens_seen": 135587632, + "step": 2634 + }, + { + "epoch": 0.25915825164000883, + "grad_norm": 35.20783615112305, + "learning_rate": 8e-05, + "loss": 50.3905, + "num_input_tokens_seen": 135736856, + "step": 2637 + }, + { + "epoch": 0.2594530846907938, + "grad_norm": 41.462608337402344, + "learning_rate": 8e-05, + "loss": 53.096, + "num_input_tokens_seen": 135891044, + "step": 2640 + }, + { + "epoch": 0.25974791774157885, + "grad_norm": 36.87857437133789, + "learning_rate": 8e-05, + "loss": 57.6014, + "num_input_tokens_seen": 136053652, + "step": 2643 + }, + { + "epoch": 0.26004275079236383, + "grad_norm": 34.586448669433594, + "learning_rate": 8e-05, + "loss": 56.0422, + "num_input_tokens_seen": 136177132, + "step": 2646 + }, + { + "epoch": 0.2603375838431488, + "grad_norm": 34.89237594604492, + "learning_rate": 8e-05, + "loss": 54.252, + "num_input_tokens_seen": 136318884, + "step": 2649 + }, + { + "epoch": 0.2606324168939338, + "grad_norm": 30.432071685791016, + "learning_rate": 8e-05, + "loss": 54.7018, + "num_input_tokens_seen": 136478960, + "step": 2652 + }, + { + "epoch": 0.2609272499447188, + "grad_norm": 37.170528411865234, + "learning_rate": 8e-05, + "loss": 52.9007, + "num_input_tokens_seen": 136650348, + "step": 2655 + }, + { + "epoch": 0.2612220829955038, + "grad_norm": 27.782188415527344, + "learning_rate": 8e-05, + "loss": 53.955, + "num_input_tokens_seen": 136797140, + "step": 2658 + }, + { + "epoch": 0.2615169160462888, + "grad_norm": 34.65721893310547, + "learning_rate": 8e-05, + "loss": 54.2823, + "num_input_tokens_seen": 136949440, + "step": 2661 + }, + { + "epoch": 0.2618117490970738, + "grad_norm": 37.2616081237793, + "learning_rate": 8e-05, + "loss": 56.0293, + "num_input_tokens_seen": 137098148, + "step": 2664 + }, + { + "epoch": 0.26210658214785876, + "grad_norm": 34.62560272216797, + "learning_rate": 8e-05, + "loss": 53.8384, + "num_input_tokens_seen": 137253928, + "step": 2667 + }, + { + "epoch": 0.2624014151986438, + "grad_norm": 31.226455688476562, + "learning_rate": 8e-05, + "loss": 55.0638, + "num_input_tokens_seen": 137390820, + "step": 2670 + }, + { + "epoch": 0.2626962482494288, + "grad_norm": 48.27977752685547, + "learning_rate": 8e-05, + "loss": 53.2792, + "num_input_tokens_seen": 137559424, + "step": 2673 + }, + { + "epoch": 0.26299108130021376, + "grad_norm": 32.660499572753906, + "learning_rate": 8e-05, + "loss": 52.7827, + "num_input_tokens_seen": 137719552, + "step": 2676 + }, + { + "epoch": 0.26328591435099874, + "grad_norm": 33.29769515991211, + "learning_rate": 8e-05, + "loss": 52.1215, + "num_input_tokens_seen": 137877180, + "step": 2679 + }, + { + "epoch": 0.2635807474017837, + "grad_norm": 33.17557144165039, + "learning_rate": 8e-05, + "loss": 54.5644, + "num_input_tokens_seen": 138048340, + "step": 2682 + }, + { + "epoch": 0.26387558045256876, + "grad_norm": 29.700239181518555, + "learning_rate": 8e-05, + "loss": 49.6223, + "num_input_tokens_seen": 138212264, + "step": 2685 + }, + { + "epoch": 0.26417041350335374, + "grad_norm": 33.049434661865234, + "learning_rate": 8e-05, + "loss": 56.5728, + "num_input_tokens_seen": 138395368, + "step": 2688 + }, + { + "epoch": 0.2644652465541387, + "grad_norm": 35.083763122558594, + "learning_rate": 8e-05, + "loss": 54.3003, + "num_input_tokens_seen": 138527808, + "step": 2691 + }, + { + "epoch": 0.2647600796049237, + "grad_norm": 29.783483505249023, + "learning_rate": 8e-05, + "loss": 51.9218, + "num_input_tokens_seen": 138693232, + "step": 2694 + }, + { + "epoch": 0.2650549126557087, + "grad_norm": 28.74640464782715, + "learning_rate": 8e-05, + "loss": 52.2437, + "num_input_tokens_seen": 138844128, + "step": 2697 + }, + { + "epoch": 0.2653497457064937, + "grad_norm": 33.24985122680664, + "learning_rate": 8e-05, + "loss": 54.1239, + "num_input_tokens_seen": 138994916, + "step": 2700 + }, + { + "epoch": 0.2656445787572787, + "grad_norm": 29.64424705505371, + "learning_rate": 8e-05, + "loss": 53.2436, + "num_input_tokens_seen": 139143228, + "step": 2703 + }, + { + "epoch": 0.2659394118080637, + "grad_norm": 35.86513137817383, + "learning_rate": 8e-05, + "loss": 56.2783, + "num_input_tokens_seen": 139297640, + "step": 2706 + }, + { + "epoch": 0.26623424485884867, + "grad_norm": 31.42705726623535, + "learning_rate": 8e-05, + "loss": 56.3895, + "num_input_tokens_seen": 139449136, + "step": 2709 + }, + { + "epoch": 0.26652907790963365, + "grad_norm": 170.20506286621094, + "learning_rate": 8e-05, + "loss": 53.5718, + "num_input_tokens_seen": 139597664, + "step": 2712 + }, + { + "epoch": 0.2668239109604187, + "grad_norm": 37.06209945678711, + "learning_rate": 8e-05, + "loss": 51.3754, + "num_input_tokens_seen": 139744708, + "step": 2715 + }, + { + "epoch": 0.26711874401120367, + "grad_norm": 32.0537223815918, + "learning_rate": 8e-05, + "loss": 54.4363, + "num_input_tokens_seen": 139911304, + "step": 2718 + }, + { + "epoch": 0.26741357706198865, + "grad_norm": 31.959020614624023, + "learning_rate": 8e-05, + "loss": 57.0125, + "num_input_tokens_seen": 140068588, + "step": 2721 + }, + { + "epoch": 0.26770841011277363, + "grad_norm": 32.1031608581543, + "learning_rate": 8e-05, + "loss": 48.1442, + "num_input_tokens_seen": 140220864, + "step": 2724 + }, + { + "epoch": 0.2680032431635586, + "grad_norm": 53.579872131347656, + "learning_rate": 8e-05, + "loss": 53.3052, + "num_input_tokens_seen": 140390072, + "step": 2727 + }, + { + "epoch": 0.26829807621434365, + "grad_norm": 32.496768951416016, + "learning_rate": 8e-05, + "loss": 55.885, + "num_input_tokens_seen": 140551612, + "step": 2730 + }, + { + "epoch": 0.26859290926512863, + "grad_norm": 33.73159408569336, + "learning_rate": 8e-05, + "loss": 49.2143, + "num_input_tokens_seen": 140705784, + "step": 2733 + }, + { + "epoch": 0.2688877423159136, + "grad_norm": 44.1909294128418, + "learning_rate": 8e-05, + "loss": 51.0975, + "num_input_tokens_seen": 140851328, + "step": 2736 + }, + { + "epoch": 0.2691825753666986, + "grad_norm": 35.7469367980957, + "learning_rate": 8e-05, + "loss": 52.4786, + "num_input_tokens_seen": 141018260, + "step": 2739 + }, + { + "epoch": 0.2694774084174836, + "grad_norm": 35.578575134277344, + "learning_rate": 8e-05, + "loss": 54.4309, + "num_input_tokens_seen": 141157520, + "step": 2742 + }, + { + "epoch": 0.2697722414682686, + "grad_norm": 31.34946060180664, + "learning_rate": 8e-05, + "loss": 54.8189, + "num_input_tokens_seen": 141322616, + "step": 2745 + }, + { + "epoch": 0.2700670745190536, + "grad_norm": 29.354907989501953, + "learning_rate": 8e-05, + "loss": 55.8781, + "num_input_tokens_seen": 141496008, + "step": 2748 + }, + { + "epoch": 0.2703619075698386, + "grad_norm": 30.352764129638672, + "learning_rate": 8e-05, + "loss": 52.979, + "num_input_tokens_seen": 141654120, + "step": 2751 + }, + { + "epoch": 0.27065674062062356, + "grad_norm": 29.738435745239258, + "learning_rate": 8e-05, + "loss": 50.5746, + "num_input_tokens_seen": 141794060, + "step": 2754 + }, + { + "epoch": 0.27095157367140854, + "grad_norm": 30.069454193115234, + "learning_rate": 8e-05, + "loss": 53.589, + "num_input_tokens_seen": 141943596, + "step": 2757 + }, + { + "epoch": 0.2712464067221936, + "grad_norm": 33.227054595947266, + "learning_rate": 8e-05, + "loss": 49.3517, + "num_input_tokens_seen": 142090492, + "step": 2760 + }, + { + "epoch": 0.27154123977297856, + "grad_norm": 32.27566909790039, + "learning_rate": 8e-05, + "loss": 53.5934, + "num_input_tokens_seen": 142261100, + "step": 2763 + }, + { + "epoch": 0.27183607282376354, + "grad_norm": 64.10994720458984, + "learning_rate": 8e-05, + "loss": 52.9152, + "num_input_tokens_seen": 142407892, + "step": 2766 + }, + { + "epoch": 0.2721309058745485, + "grad_norm": 96.94121551513672, + "learning_rate": 8e-05, + "loss": 51.15, + "num_input_tokens_seen": 142552576, + "step": 2769 + }, + { + "epoch": 0.2724257389253335, + "grad_norm": 40.26789093017578, + "learning_rate": 8e-05, + "loss": 52.0754, + "num_input_tokens_seen": 142702412, + "step": 2772 + }, + { + "epoch": 0.27272057197611854, + "grad_norm": 31.475791931152344, + "learning_rate": 8e-05, + "loss": 55.3629, + "num_input_tokens_seen": 142858748, + "step": 2775 + }, + { + "epoch": 0.2730154050269035, + "grad_norm": 32.535865783691406, + "learning_rate": 8e-05, + "loss": 48.8535, + "num_input_tokens_seen": 143027580, + "step": 2778 + }, + { + "epoch": 0.2733102380776885, + "grad_norm": 31.21608543395996, + "learning_rate": 8e-05, + "loss": 52.8587, + "num_input_tokens_seen": 143182316, + "step": 2781 + }, + { + "epoch": 0.2736050711284735, + "grad_norm": 33.09787368774414, + "learning_rate": 8e-05, + "loss": 52.7387, + "num_input_tokens_seen": 143350384, + "step": 2784 + }, + { + "epoch": 0.27389990417925847, + "grad_norm": 32.19013977050781, + "learning_rate": 8e-05, + "loss": 55.4064, + "num_input_tokens_seen": 143494136, + "step": 2787 + }, + { + "epoch": 0.2741947372300435, + "grad_norm": 33.92564010620117, + "learning_rate": 8e-05, + "loss": 52.2523, + "num_input_tokens_seen": 143658476, + "step": 2790 + }, + { + "epoch": 0.2744895702808285, + "grad_norm": 28.820104598999023, + "learning_rate": 8e-05, + "loss": 52.7883, + "num_input_tokens_seen": 143809704, + "step": 2793 + }, + { + "epoch": 0.27478440333161347, + "grad_norm": 32.38165283203125, + "learning_rate": 8e-05, + "loss": 55.9081, + "num_input_tokens_seen": 143979960, + "step": 2796 + }, + { + "epoch": 0.27507923638239845, + "grad_norm": 30.862777709960938, + "learning_rate": 8e-05, + "loss": 52.7154, + "num_input_tokens_seen": 144132200, + "step": 2799 + }, + { + "epoch": 0.27537406943318343, + "grad_norm": 31.817401885986328, + "learning_rate": 8e-05, + "loss": 52.9121, + "num_input_tokens_seen": 144283840, + "step": 2802 + }, + { + "epoch": 0.27566890248396847, + "grad_norm": 30.313737869262695, + "learning_rate": 8e-05, + "loss": 55.0518, + "num_input_tokens_seen": 144446960, + "step": 2805 + }, + { + "epoch": 0.27596373553475345, + "grad_norm": 35.830440521240234, + "learning_rate": 8e-05, + "loss": 53.0562, + "num_input_tokens_seen": 144599380, + "step": 2808 + }, + { + "epoch": 0.27625856858553843, + "grad_norm": 42.904781341552734, + "learning_rate": 8e-05, + "loss": 51.9012, + "num_input_tokens_seen": 144765832, + "step": 2811 + }, + { + "epoch": 0.2765534016363234, + "grad_norm": 36.148956298828125, + "learning_rate": 8e-05, + "loss": 50.594, + "num_input_tokens_seen": 144901240, + "step": 2814 + }, + { + "epoch": 0.27684823468710845, + "grad_norm": 28.935997009277344, + "learning_rate": 8e-05, + "loss": 52.4193, + "num_input_tokens_seen": 145063040, + "step": 2817 + }, + { + "epoch": 0.27714306773789343, + "grad_norm": 33.863101959228516, + "learning_rate": 8e-05, + "loss": 56.4542, + "num_input_tokens_seen": 145217092, + "step": 2820 + }, + { + "epoch": 0.2774379007886784, + "grad_norm": 30.103792190551758, + "learning_rate": 8e-05, + "loss": 47.8212, + "num_input_tokens_seen": 145380040, + "step": 2823 + }, + { + "epoch": 0.2777327338394634, + "grad_norm": 34.798763275146484, + "learning_rate": 8e-05, + "loss": 53.4973, + "num_input_tokens_seen": 145519664, + "step": 2826 + }, + { + "epoch": 0.2780275668902484, + "grad_norm": 32.65745544433594, + "learning_rate": 8e-05, + "loss": 51.7497, + "num_input_tokens_seen": 145678828, + "step": 2829 + }, + { + "epoch": 0.2783223999410334, + "grad_norm": 29.108938217163086, + "learning_rate": 8e-05, + "loss": 53.0858, + "num_input_tokens_seen": 145853384, + "step": 2832 + }, + { + "epoch": 0.2786172329918184, + "grad_norm": 32.11029815673828, + "learning_rate": 8e-05, + "loss": 47.0779, + "num_input_tokens_seen": 146002992, + "step": 2835 + }, + { + "epoch": 0.2789120660426034, + "grad_norm": 37.39936828613281, + "learning_rate": 8e-05, + "loss": 51.646, + "num_input_tokens_seen": 146148328, + "step": 2838 + }, + { + "epoch": 0.27920689909338836, + "grad_norm": 34.64923858642578, + "learning_rate": 8e-05, + "loss": 50.4198, + "num_input_tokens_seen": 146291240, + "step": 2841 + }, + { + "epoch": 0.27950173214417334, + "grad_norm": 34.844642639160156, + "learning_rate": 8e-05, + "loss": 53.106, + "num_input_tokens_seen": 146413520, + "step": 2844 + }, + { + "epoch": 0.2797965651949584, + "grad_norm": 41.19792556762695, + "learning_rate": 8e-05, + "loss": 51.1535, + "num_input_tokens_seen": 146537460, + "step": 2847 + }, + { + "epoch": 0.28009139824574336, + "grad_norm": 30.525279998779297, + "learning_rate": 8e-05, + "loss": 48.1843, + "num_input_tokens_seen": 146686000, + "step": 2850 + }, + { + "epoch": 0.28038623129652834, + "grad_norm": 32.820343017578125, + "learning_rate": 8e-05, + "loss": 51.6588, + "num_input_tokens_seen": 146833596, + "step": 2853 + }, + { + "epoch": 0.2806810643473133, + "grad_norm": 30.4845027923584, + "learning_rate": 8e-05, + "loss": 53.6651, + "num_input_tokens_seen": 146986500, + "step": 2856 + }, + { + "epoch": 0.2809758973980983, + "grad_norm": 28.711761474609375, + "learning_rate": 8e-05, + "loss": 48.2291, + "num_input_tokens_seen": 147126368, + "step": 2859 + }, + { + "epoch": 0.28127073044888334, + "grad_norm": 35.70820999145508, + "learning_rate": 8e-05, + "loss": 52.7987, + "num_input_tokens_seen": 147291712, + "step": 2862 + }, + { + "epoch": 0.2815655634996683, + "grad_norm": 36.2431526184082, + "learning_rate": 8e-05, + "loss": 52.1996, + "num_input_tokens_seen": 147430972, + "step": 2865 + }, + { + "epoch": 0.2818603965504533, + "grad_norm": 41.38965606689453, + "learning_rate": 8e-05, + "loss": 51.0413, + "num_input_tokens_seen": 147601428, + "step": 2868 + }, + { + "epoch": 0.2821552296012383, + "grad_norm": 34.36966323852539, + "learning_rate": 8e-05, + "loss": 49.9894, + "num_input_tokens_seen": 147741796, + "step": 2871 + }, + { + "epoch": 0.28245006265202327, + "grad_norm": 29.51498794555664, + "learning_rate": 8e-05, + "loss": 51.1804, + "num_input_tokens_seen": 147896472, + "step": 2874 + }, + { + "epoch": 0.2827448957028083, + "grad_norm": 31.154193878173828, + "learning_rate": 8e-05, + "loss": 50.2609, + "num_input_tokens_seen": 148044756, + "step": 2877 + }, + { + "epoch": 0.2830397287535933, + "grad_norm": 44.29144287109375, + "learning_rate": 8e-05, + "loss": 50.1375, + "num_input_tokens_seen": 148207236, + "step": 2880 + }, + { + "epoch": 0.28333456180437827, + "grad_norm": 32.92866897583008, + "learning_rate": 8e-05, + "loss": 50.9692, + "num_input_tokens_seen": 148347756, + "step": 2883 + }, + { + "epoch": 0.28362939485516325, + "grad_norm": 82.20730590820312, + "learning_rate": 8e-05, + "loss": 52.9466, + "num_input_tokens_seen": 148507104, + "step": 2886 + }, + { + "epoch": 0.28392422790594823, + "grad_norm": 31.4622859954834, + "learning_rate": 8e-05, + "loss": 52.708, + "num_input_tokens_seen": 148645608, + "step": 2889 + }, + { + "epoch": 0.28421906095673327, + "grad_norm": 31.294513702392578, + "learning_rate": 8e-05, + "loss": 51.7986, + "num_input_tokens_seen": 148807032, + "step": 2892 + }, + { + "epoch": 0.28451389400751825, + "grad_norm": 32.09864044189453, + "learning_rate": 8e-05, + "loss": 51.8296, + "num_input_tokens_seen": 148955096, + "step": 2895 + }, + { + "epoch": 0.28480872705830323, + "grad_norm": 32.155982971191406, + "learning_rate": 8e-05, + "loss": 51.9833, + "num_input_tokens_seen": 149115876, + "step": 2898 + }, + { + "epoch": 0.2851035601090882, + "grad_norm": 30.856664657592773, + "learning_rate": 8e-05, + "loss": 45.5968, + "num_input_tokens_seen": 149283748, + "step": 2901 + }, + { + "epoch": 0.2853983931598732, + "grad_norm": 37.33554458618164, + "learning_rate": 8e-05, + "loss": 53.588, + "num_input_tokens_seen": 149424712, + "step": 2904 + }, + { + "epoch": 0.28569322621065824, + "grad_norm": 34.55818557739258, + "learning_rate": 8e-05, + "loss": 52.8435, + "num_input_tokens_seen": 149576360, + "step": 2907 + }, + { + "epoch": 0.2859880592614432, + "grad_norm": 33.608280181884766, + "learning_rate": 8e-05, + "loss": 49.3937, + "num_input_tokens_seen": 149747444, + "step": 2910 + }, + { + "epoch": 0.2862828923122282, + "grad_norm": 43.21138000488281, + "learning_rate": 8e-05, + "loss": 53.2086, + "num_input_tokens_seen": 149905412, + "step": 2913 + }, + { + "epoch": 0.2865777253630132, + "grad_norm": 31.36229705810547, + "learning_rate": 8e-05, + "loss": 54.6847, + "num_input_tokens_seen": 150066696, + "step": 2916 + }, + { + "epoch": 0.28687255841379816, + "grad_norm": 34.487369537353516, + "learning_rate": 8e-05, + "loss": 50.4352, + "num_input_tokens_seen": 150233700, + "step": 2919 + }, + { + "epoch": 0.2871673914645832, + "grad_norm": 31.998851776123047, + "learning_rate": 8e-05, + "loss": 49.0981, + "num_input_tokens_seen": 150388200, + "step": 2922 + }, + { + "epoch": 0.2874622245153682, + "grad_norm": 29.5037841796875, + "learning_rate": 8e-05, + "loss": 52.8468, + "num_input_tokens_seen": 150534184, + "step": 2925 + }, + { + "epoch": 0.28775705756615316, + "grad_norm": 37.64830017089844, + "learning_rate": 8e-05, + "loss": 54.9494, + "num_input_tokens_seen": 150687716, + "step": 2928 + }, + { + "epoch": 0.28805189061693814, + "grad_norm": 29.77463722229004, + "learning_rate": 8e-05, + "loss": 51.2639, + "num_input_tokens_seen": 150848480, + "step": 2931 + }, + { + "epoch": 0.2883467236677231, + "grad_norm": 41.943302154541016, + "learning_rate": 8e-05, + "loss": 50.5784, + "num_input_tokens_seen": 151012276, + "step": 2934 + }, + { + "epoch": 0.28864155671850816, + "grad_norm": 29.216768264770508, + "learning_rate": 8e-05, + "loss": 48.4823, + "num_input_tokens_seen": 151147724, + "step": 2937 + }, + { + "epoch": 0.28893638976929314, + "grad_norm": 31.684417724609375, + "learning_rate": 8e-05, + "loss": 51.4178, + "num_input_tokens_seen": 151286992, + "step": 2940 + }, + { + "epoch": 0.2892312228200781, + "grad_norm": 29.715232849121094, + "learning_rate": 8e-05, + "loss": 52.4719, + "num_input_tokens_seen": 151434844, + "step": 2943 + }, + { + "epoch": 0.2895260558708631, + "grad_norm": 29.671850204467773, + "learning_rate": 8e-05, + "loss": 48.1795, + "num_input_tokens_seen": 151572760, + "step": 2946 + }, + { + "epoch": 0.2898208889216481, + "grad_norm": 30.16161346435547, + "learning_rate": 8e-05, + "loss": 50.0184, + "num_input_tokens_seen": 151719104, + "step": 2949 + }, + { + "epoch": 0.2901157219724331, + "grad_norm": 32.42359161376953, + "learning_rate": 8e-05, + "loss": 53.1842, + "num_input_tokens_seen": 151864812, + "step": 2952 + }, + { + "epoch": 0.2904105550232181, + "grad_norm": 37.412479400634766, + "learning_rate": 8e-05, + "loss": 52.6939, + "num_input_tokens_seen": 152000620, + "step": 2955 + }, + { + "epoch": 0.2907053880740031, + "grad_norm": 27.60318946838379, + "learning_rate": 8e-05, + "loss": 48.8245, + "num_input_tokens_seen": 152163128, + "step": 2958 + }, + { + "epoch": 0.29100022112478807, + "grad_norm": 31.002967834472656, + "learning_rate": 8e-05, + "loss": 52.472, + "num_input_tokens_seen": 152312572, + "step": 2961 + }, + { + "epoch": 0.2912950541755731, + "grad_norm": 32.0164680480957, + "learning_rate": 8e-05, + "loss": 51.9189, + "num_input_tokens_seen": 152464844, + "step": 2964 + }, + { + "epoch": 0.2915898872263581, + "grad_norm": 34.06315231323242, + "learning_rate": 8e-05, + "loss": 51.1857, + "num_input_tokens_seen": 152627208, + "step": 2967 + }, + { + "epoch": 0.2918847202771431, + "grad_norm": 34.70756530761719, + "learning_rate": 8e-05, + "loss": 52.5184, + "num_input_tokens_seen": 152788952, + "step": 2970 + }, + { + "epoch": 0.29217955332792805, + "grad_norm": 32.645355224609375, + "learning_rate": 8e-05, + "loss": 54.0848, + "num_input_tokens_seen": 152941456, + "step": 2973 + }, + { + "epoch": 0.29247438637871304, + "grad_norm": 35.99182891845703, + "learning_rate": 8e-05, + "loss": 49.7962, + "num_input_tokens_seen": 153102644, + "step": 2976 + }, + { + "epoch": 0.2927692194294981, + "grad_norm": 30.75571632385254, + "learning_rate": 8e-05, + "loss": 51.04, + "num_input_tokens_seen": 153244812, + "step": 2979 + }, + { + "epoch": 0.29306405248028305, + "grad_norm": 31.208608627319336, + "learning_rate": 8e-05, + "loss": 48.5619, + "num_input_tokens_seen": 153404796, + "step": 2982 + }, + { + "epoch": 0.29335888553106804, + "grad_norm": 35.94192123413086, + "learning_rate": 8e-05, + "loss": 49.4916, + "num_input_tokens_seen": 153544740, + "step": 2985 + }, + { + "epoch": 0.293653718581853, + "grad_norm": 33.36006546020508, + "learning_rate": 8e-05, + "loss": 52.1915, + "num_input_tokens_seen": 153695968, + "step": 2988 + }, + { + "epoch": 0.293948551632638, + "grad_norm": 29.09237289428711, + "learning_rate": 8e-05, + "loss": 50.3356, + "num_input_tokens_seen": 153874444, + "step": 2991 + }, + { + "epoch": 0.29424338468342304, + "grad_norm": 44.539207458496094, + "learning_rate": 8e-05, + "loss": 49.5421, + "num_input_tokens_seen": 154035432, + "step": 2994 + }, + { + "epoch": 0.294538217734208, + "grad_norm": 31.166061401367188, + "learning_rate": 8e-05, + "loss": 48.624, + "num_input_tokens_seen": 154182420, + "step": 2997 + }, + { + "epoch": 0.294833050784993, + "grad_norm": 27.469114303588867, + "learning_rate": 8e-05, + "loss": 51.9327, + "num_input_tokens_seen": 154351440, + "step": 3000 + }, + { + "epoch": 0.294833050784993, + "eval_gen_len": 30.495, + "eval_loss": 3.2430038452148438, + "eval_rouge1": 40.1723, + "eval_rouge2": 21.3863, + "eval_rougeL": 36.5277, + "eval_rougeLsum": 36.8678, + "eval_runtime": 91.7301, + "eval_samples_per_second": 2.18, + "eval_steps_per_second": 0.545, + "num_input_tokens_seen": 154351440, + "step": 3000 + }, + { + "epoch": 0.295127883835778, + "grad_norm": 27.834260940551758, + "learning_rate": 8e-05, + "loss": 51.2899, + "num_input_tokens_seen": 154512120, + "step": 3003 + }, + { + "epoch": 0.29542271688656296, + "grad_norm": 30.697715759277344, + "learning_rate": 8e-05, + "loss": 48.9938, + "num_input_tokens_seen": 154684184, + "step": 3006 + }, + { + "epoch": 0.295717549937348, + "grad_norm": 28.035795211791992, + "learning_rate": 8e-05, + "loss": 50.7757, + "num_input_tokens_seen": 154848936, + "step": 3009 + }, + { + "epoch": 0.296012382988133, + "grad_norm": 31.7027645111084, + "learning_rate": 8e-05, + "loss": 48.1236, + "num_input_tokens_seen": 155004780, + "step": 3012 + }, + { + "epoch": 0.29630721603891796, + "grad_norm": 30.000329971313477, + "learning_rate": 8e-05, + "loss": 49.6937, + "num_input_tokens_seen": 155170616, + "step": 3015 + }, + { + "epoch": 0.29660204908970295, + "grad_norm": 31.308128356933594, + "learning_rate": 8e-05, + "loss": 51.6159, + "num_input_tokens_seen": 155325012, + "step": 3018 + }, + { + "epoch": 0.2968968821404879, + "grad_norm": 40.389461517333984, + "learning_rate": 8e-05, + "loss": 45.708, + "num_input_tokens_seen": 155476104, + "step": 3021 + }, + { + "epoch": 0.29719171519127296, + "grad_norm": 93.52242279052734, + "learning_rate": 8e-05, + "loss": 45.6057, + "num_input_tokens_seen": 155620040, + "step": 3024 + }, + { + "epoch": 0.29748654824205795, + "grad_norm": 30.37114906311035, + "learning_rate": 8e-05, + "loss": 50.7246, + "num_input_tokens_seen": 155776340, + "step": 3027 + }, + { + "epoch": 0.29778138129284293, + "grad_norm": 32.123416900634766, + "learning_rate": 8e-05, + "loss": 47.2173, + "num_input_tokens_seen": 155892028, + "step": 3030 + }, + { + "epoch": 0.2980762143436279, + "grad_norm": 67.35137939453125, + "learning_rate": 8e-05, + "loss": 47.236, + "num_input_tokens_seen": 156037452, + "step": 3033 + }, + { + "epoch": 0.2983710473944129, + "grad_norm": 31.5852108001709, + "learning_rate": 8e-05, + "loss": 48.5519, + "num_input_tokens_seen": 156179356, + "step": 3036 + }, + { + "epoch": 0.29866588044519793, + "grad_norm": 31.071697235107422, + "learning_rate": 8e-05, + "loss": 52.0375, + "num_input_tokens_seen": 156335380, + "step": 3039 + }, + { + "epoch": 0.2989607134959829, + "grad_norm": 48.0234489440918, + "learning_rate": 8e-05, + "loss": 51.3958, + "num_input_tokens_seen": 156519724, + "step": 3042 + }, + { + "epoch": 0.2992555465467679, + "grad_norm": 32.876136779785156, + "learning_rate": 8e-05, + "loss": 50.8297, + "num_input_tokens_seen": 156690848, + "step": 3045 + }, + { + "epoch": 0.2995503795975529, + "grad_norm": 31.397253036499023, + "learning_rate": 8e-05, + "loss": 50.5913, + "num_input_tokens_seen": 156848912, + "step": 3048 + }, + { + "epoch": 0.29984521264833786, + "grad_norm": 30.054244995117188, + "learning_rate": 8e-05, + "loss": 45.5249, + "num_input_tokens_seen": 157005116, + "step": 3051 + }, + { + "epoch": 0.3001400456991229, + "grad_norm": 32.175376892089844, + "learning_rate": 8e-05, + "loss": 54.039, + "num_input_tokens_seen": 157163468, + "step": 3054 + }, + { + "epoch": 0.3004348787499079, + "grad_norm": 27.12468910217285, + "learning_rate": 8e-05, + "loss": 52.333, + "num_input_tokens_seen": 157321272, + "step": 3057 + }, + { + "epoch": 0.30072971180069286, + "grad_norm": 30.750747680664062, + "learning_rate": 8e-05, + "loss": 48.1032, + "num_input_tokens_seen": 157498128, + "step": 3060 + }, + { + "epoch": 0.30102454485147784, + "grad_norm": 35.073326110839844, + "learning_rate": 8e-05, + "loss": 47.2467, + "num_input_tokens_seen": 157654200, + "step": 3063 + }, + { + "epoch": 0.3013193779022628, + "grad_norm": 36.32673645019531, + "learning_rate": 8e-05, + "loss": 54.9197, + "num_input_tokens_seen": 157818088, + "step": 3066 + }, + { + "epoch": 0.30161421095304786, + "grad_norm": 26.123674392700195, + "learning_rate": 8e-05, + "loss": 50.2599, + "num_input_tokens_seen": 157982596, + "step": 3069 + }, + { + "epoch": 0.30190904400383284, + "grad_norm": 33.341209411621094, + "learning_rate": 8e-05, + "loss": 50.698, + "num_input_tokens_seen": 158152676, + "step": 3072 + }, + { + "epoch": 0.3022038770546178, + "grad_norm": 26.35466194152832, + "learning_rate": 8e-05, + "loss": 46.5832, + "num_input_tokens_seen": 158311820, + "step": 3075 + }, + { + "epoch": 0.3024987101054028, + "grad_norm": 34.99408721923828, + "learning_rate": 8e-05, + "loss": 51.7367, + "num_input_tokens_seen": 158483932, + "step": 3078 + }, + { + "epoch": 0.3027935431561878, + "grad_norm": 35.281272888183594, + "learning_rate": 8e-05, + "loss": 48.9734, + "num_input_tokens_seen": 158620712, + "step": 3081 + }, + { + "epoch": 0.3030883762069728, + "grad_norm": 28.188995361328125, + "learning_rate": 8e-05, + "loss": 47.357, + "num_input_tokens_seen": 158771128, + "step": 3084 + }, + { + "epoch": 0.3033832092577578, + "grad_norm": 31.777433395385742, + "learning_rate": 8e-05, + "loss": 51.7049, + "num_input_tokens_seen": 158922868, + "step": 3087 + }, + { + "epoch": 0.3036780423085428, + "grad_norm": 29.47471046447754, + "learning_rate": 8e-05, + "loss": 50.8354, + "num_input_tokens_seen": 159070688, + "step": 3090 + }, + { + "epoch": 0.30397287535932777, + "grad_norm": 31.329797744750977, + "learning_rate": 8e-05, + "loss": 48.8359, + "num_input_tokens_seen": 159204564, + "step": 3093 + }, + { + "epoch": 0.30426770841011275, + "grad_norm": 36.26081466674805, + "learning_rate": 8e-05, + "loss": 51.5042, + "num_input_tokens_seen": 159341748, + "step": 3096 + }, + { + "epoch": 0.3045625414608978, + "grad_norm": 46.48896789550781, + "learning_rate": 8e-05, + "loss": 48.3833, + "num_input_tokens_seen": 159487756, + "step": 3099 + }, + { + "epoch": 0.30485737451168277, + "grad_norm": 27.038835525512695, + "learning_rate": 8e-05, + "loss": 45.7966, + "num_input_tokens_seen": 159646172, + "step": 3102 + }, + { + "epoch": 0.30515220756246775, + "grad_norm": 29.98095703125, + "learning_rate": 8e-05, + "loss": 43.3869, + "num_input_tokens_seen": 159823372, + "step": 3105 + }, + { + "epoch": 0.30544704061325273, + "grad_norm": 30.19727897644043, + "learning_rate": 8e-05, + "loss": 47.5798, + "num_input_tokens_seen": 159975388, + "step": 3108 + }, + { + "epoch": 0.3057418736640377, + "grad_norm": 29.022533416748047, + "learning_rate": 8e-05, + "loss": 52.4457, + "num_input_tokens_seen": 160103260, + "step": 3111 + }, + { + "epoch": 0.30603670671482275, + "grad_norm": 90.77041625976562, + "learning_rate": 8e-05, + "loss": 54.8584, + "num_input_tokens_seen": 160259356, + "step": 3114 + }, + { + "epoch": 0.30633153976560773, + "grad_norm": 41.759761810302734, + "learning_rate": 8e-05, + "loss": 48.1346, + "num_input_tokens_seen": 160427328, + "step": 3117 + }, + { + "epoch": 0.3066263728163927, + "grad_norm": 38.2276496887207, + "learning_rate": 8e-05, + "loss": 48.8283, + "num_input_tokens_seen": 160572824, + "step": 3120 + }, + { + "epoch": 0.3069212058671777, + "grad_norm": 31.891613006591797, + "learning_rate": 8e-05, + "loss": 47.1253, + "num_input_tokens_seen": 160751252, + "step": 3123 + }, + { + "epoch": 0.30721603891796273, + "grad_norm": 29.48491668701172, + "learning_rate": 8e-05, + "loss": 47.7418, + "num_input_tokens_seen": 160934284, + "step": 3126 + }, + { + "epoch": 0.3075108719687477, + "grad_norm": 30.440406799316406, + "learning_rate": 8e-05, + "loss": 48.4942, + "num_input_tokens_seen": 161106808, + "step": 3129 + }, + { + "epoch": 0.3078057050195327, + "grad_norm": 32.63410186767578, + "learning_rate": 8e-05, + "loss": 51.9228, + "num_input_tokens_seen": 161259732, + "step": 3132 + }, + { + "epoch": 0.3081005380703177, + "grad_norm": 30.53582000732422, + "learning_rate": 8e-05, + "loss": 54.3659, + "num_input_tokens_seen": 161410464, + "step": 3135 + }, + { + "epoch": 0.30839537112110266, + "grad_norm": 31.640932083129883, + "learning_rate": 8e-05, + "loss": 49.5831, + "num_input_tokens_seen": 161557152, + "step": 3138 + }, + { + "epoch": 0.3086902041718877, + "grad_norm": 32.702518463134766, + "learning_rate": 8e-05, + "loss": 50.4781, + "num_input_tokens_seen": 161707312, + "step": 3141 + }, + { + "epoch": 0.3089850372226727, + "grad_norm": 30.084012985229492, + "learning_rate": 8e-05, + "loss": 51.5196, + "num_input_tokens_seen": 161855552, + "step": 3144 + }, + { + "epoch": 0.30927987027345766, + "grad_norm": 33.723819732666016, + "learning_rate": 8e-05, + "loss": 49.4337, + "num_input_tokens_seen": 161991804, + "step": 3147 + }, + { + "epoch": 0.30957470332424264, + "grad_norm": 37.2187614440918, + "learning_rate": 8e-05, + "loss": 52.0158, + "num_input_tokens_seen": 162132524, + "step": 3150 + }, + { + "epoch": 0.3098695363750276, + "grad_norm": 31.11838722229004, + "learning_rate": 8e-05, + "loss": 50.1612, + "num_input_tokens_seen": 162286832, + "step": 3153 + }, + { + "epoch": 0.31016436942581266, + "grad_norm": 29.569923400878906, + "learning_rate": 8e-05, + "loss": 45.6304, + "num_input_tokens_seen": 162426504, + "step": 3156 + }, + { + "epoch": 0.31045920247659764, + "grad_norm": 33.4188232421875, + "learning_rate": 8e-05, + "loss": 48.863, + "num_input_tokens_seen": 162582048, + "step": 3159 + }, + { + "epoch": 0.3107540355273826, + "grad_norm": 30.900367736816406, + "learning_rate": 8e-05, + "loss": 45.2414, + "num_input_tokens_seen": 162735952, + "step": 3162 + }, + { + "epoch": 0.3110488685781676, + "grad_norm": 30.14768409729004, + "learning_rate": 8e-05, + "loss": 51.5442, + "num_input_tokens_seen": 162880820, + "step": 3165 + }, + { + "epoch": 0.3113437016289526, + "grad_norm": 30.156339645385742, + "learning_rate": 8e-05, + "loss": 52.2109, + "num_input_tokens_seen": 163037460, + "step": 3168 + }, + { + "epoch": 0.3116385346797376, + "grad_norm": 29.608909606933594, + "learning_rate": 8e-05, + "loss": 48.732, + "num_input_tokens_seen": 163150724, + "step": 3171 + }, + { + "epoch": 0.3119333677305226, + "grad_norm": 30.16400718688965, + "learning_rate": 8e-05, + "loss": 51.2494, + "num_input_tokens_seen": 163317320, + "step": 3174 + }, + { + "epoch": 0.3122282007813076, + "grad_norm": 33.92653274536133, + "learning_rate": 8e-05, + "loss": 51.0058, + "num_input_tokens_seen": 163478164, + "step": 3177 + }, + { + "epoch": 0.31252303383209257, + "grad_norm": 34.29814910888672, + "learning_rate": 8e-05, + "loss": 49.5705, + "num_input_tokens_seen": 163593236, + "step": 3180 + }, + { + "epoch": 0.31281786688287755, + "grad_norm": 43.815528869628906, + "learning_rate": 8e-05, + "loss": 47.95, + "num_input_tokens_seen": 163751472, + "step": 3183 + }, + { + "epoch": 0.3131126999336626, + "grad_norm": 34.95302200317383, + "learning_rate": 8e-05, + "loss": 48.482, + "num_input_tokens_seen": 163909016, + "step": 3186 + }, + { + "epoch": 0.31340753298444757, + "grad_norm": 32.39896774291992, + "learning_rate": 8e-05, + "loss": 47.6268, + "num_input_tokens_seen": 164061776, + "step": 3189 + }, + { + "epoch": 0.31370236603523255, + "grad_norm": 29.34246063232422, + "learning_rate": 8e-05, + "loss": 44.7197, + "num_input_tokens_seen": 164228080, + "step": 3192 + }, + { + "epoch": 0.31399719908601753, + "grad_norm": 32.78367233276367, + "learning_rate": 8e-05, + "loss": 49.5443, + "num_input_tokens_seen": 164372944, + "step": 3195 + }, + { + "epoch": 0.3142920321368025, + "grad_norm": 29.293537139892578, + "learning_rate": 8e-05, + "loss": 48.2356, + "num_input_tokens_seen": 164539204, + "step": 3198 + }, + { + "epoch": 0.31458686518758755, + "grad_norm": 27.77086067199707, + "learning_rate": 8e-05, + "loss": 48.6342, + "num_input_tokens_seen": 164700484, + "step": 3201 + }, + { + "epoch": 0.31488169823837253, + "grad_norm": 30.448762893676758, + "learning_rate": 8e-05, + "loss": 50.1135, + "num_input_tokens_seen": 164858072, + "step": 3204 + }, + { + "epoch": 0.3151765312891575, + "grad_norm": 31.73259925842285, + "learning_rate": 8e-05, + "loss": 48.8052, + "num_input_tokens_seen": 164994416, + "step": 3207 + }, + { + "epoch": 0.3154713643399425, + "grad_norm": 24.573047637939453, + "learning_rate": 8e-05, + "loss": 48.3418, + "num_input_tokens_seen": 165168004, + "step": 3210 + }, + { + "epoch": 0.3157661973907275, + "grad_norm": 30.762184143066406, + "learning_rate": 8e-05, + "loss": 51.0315, + "num_input_tokens_seen": 165320468, + "step": 3213 + }, + { + "epoch": 0.3160610304415125, + "grad_norm": 31.318260192871094, + "learning_rate": 8e-05, + "loss": 50.0707, + "num_input_tokens_seen": 165487880, + "step": 3216 + }, + { + "epoch": 0.3163558634922975, + "grad_norm": 34.796592712402344, + "learning_rate": 8e-05, + "loss": 47.4512, + "num_input_tokens_seen": 165647096, + "step": 3219 + }, + { + "epoch": 0.3166506965430825, + "grad_norm": 31.09071159362793, + "learning_rate": 8e-05, + "loss": 48.0372, + "num_input_tokens_seen": 165813096, + "step": 3222 + }, + { + "epoch": 0.31694552959386746, + "grad_norm": 29.88203239440918, + "learning_rate": 8e-05, + "loss": 44.7359, + "num_input_tokens_seen": 165981372, + "step": 3225 + }, + { + "epoch": 0.31724036264465244, + "grad_norm": 29.312549591064453, + "learning_rate": 8e-05, + "loss": 47.5664, + "num_input_tokens_seen": 166140500, + "step": 3228 + }, + { + "epoch": 0.3175351956954375, + "grad_norm": 34.14030456542969, + "learning_rate": 8e-05, + "loss": 47.5588, + "num_input_tokens_seen": 166289596, + "step": 3231 + }, + { + "epoch": 0.31783002874622246, + "grad_norm": 32.20860290527344, + "learning_rate": 8e-05, + "loss": 50.7921, + "num_input_tokens_seen": 166451524, + "step": 3234 + }, + { + "epoch": 0.31812486179700744, + "grad_norm": 31.75952911376953, + "learning_rate": 8e-05, + "loss": 49.2161, + "num_input_tokens_seen": 166602148, + "step": 3237 + }, + { + "epoch": 0.3184196948477924, + "grad_norm": 29.15970802307129, + "learning_rate": 8e-05, + "loss": 49.655, + "num_input_tokens_seen": 166750708, + "step": 3240 + }, + { + "epoch": 0.3187145278985774, + "grad_norm": 28.13986587524414, + "learning_rate": 8e-05, + "loss": 50.4226, + "num_input_tokens_seen": 166894808, + "step": 3243 + }, + { + "epoch": 0.31900936094936244, + "grad_norm": 31.35550308227539, + "learning_rate": 8e-05, + "loss": 48.0589, + "num_input_tokens_seen": 167055288, + "step": 3246 + }, + { + "epoch": 0.3193041940001474, + "grad_norm": 41.20752716064453, + "learning_rate": 8e-05, + "loss": 46.9058, + "num_input_tokens_seen": 167214404, + "step": 3249 + }, + { + "epoch": 0.3195990270509324, + "grad_norm": 31.76648712158203, + "learning_rate": 8e-05, + "loss": 51.1456, + "num_input_tokens_seen": 167352292, + "step": 3252 + }, + { + "epoch": 0.3198938601017174, + "grad_norm": 25.93441390991211, + "learning_rate": 8e-05, + "loss": 45.1572, + "num_input_tokens_seen": 167508120, + "step": 3255 + }, + { + "epoch": 0.32018869315250237, + "grad_norm": 31.922340393066406, + "learning_rate": 8e-05, + "loss": 48.1204, + "num_input_tokens_seen": 167658524, + "step": 3258 + }, + { + "epoch": 0.3204835262032874, + "grad_norm": 31.73157501220703, + "learning_rate": 8e-05, + "loss": 47.138, + "num_input_tokens_seen": 167802924, + "step": 3261 + }, + { + "epoch": 0.3207783592540724, + "grad_norm": 32.577701568603516, + "learning_rate": 8e-05, + "loss": 47.1694, + "num_input_tokens_seen": 167933868, + "step": 3264 + }, + { + "epoch": 0.32107319230485737, + "grad_norm": 29.493078231811523, + "learning_rate": 8e-05, + "loss": 46.81, + "num_input_tokens_seen": 168090452, + "step": 3267 + }, + { + "epoch": 0.32136802535564235, + "grad_norm": 29.012908935546875, + "learning_rate": 8e-05, + "loss": 43.3255, + "num_input_tokens_seen": 168232328, + "step": 3270 + }, + { + "epoch": 0.3216628584064274, + "grad_norm": 32.09306716918945, + "learning_rate": 8e-05, + "loss": 45.4934, + "num_input_tokens_seen": 168361412, + "step": 3273 + }, + { + "epoch": 0.32195769145721237, + "grad_norm": 31.578340530395508, + "learning_rate": 8e-05, + "loss": 44.2819, + "num_input_tokens_seen": 168491948, + "step": 3276 + }, + { + "epoch": 0.32225252450799735, + "grad_norm": 32.133750915527344, + "learning_rate": 8e-05, + "loss": 48.465, + "num_input_tokens_seen": 168631568, + "step": 3279 + }, + { + "epoch": 0.32254735755878233, + "grad_norm": 29.717243194580078, + "learning_rate": 8e-05, + "loss": 50.4961, + "num_input_tokens_seen": 168768060, + "step": 3282 + }, + { + "epoch": 0.3228421906095673, + "grad_norm": 30.31847381591797, + "learning_rate": 8e-05, + "loss": 51.0925, + "num_input_tokens_seen": 168917240, + "step": 3285 + }, + { + "epoch": 0.32313702366035235, + "grad_norm": 28.687097549438477, + "learning_rate": 8e-05, + "loss": 50.5125, + "num_input_tokens_seen": 169092584, + "step": 3288 + }, + { + "epoch": 0.32343185671113733, + "grad_norm": 35.459190368652344, + "learning_rate": 8e-05, + "loss": 49.7695, + "num_input_tokens_seen": 169246108, + "step": 3291 + }, + { + "epoch": 0.3237266897619223, + "grad_norm": 29.45003318786621, + "learning_rate": 8e-05, + "loss": 53.2221, + "num_input_tokens_seen": 169419860, + "step": 3294 + }, + { + "epoch": 0.3240215228127073, + "grad_norm": 37.26349639892578, + "learning_rate": 8e-05, + "loss": 47.2518, + "num_input_tokens_seen": 169574904, + "step": 3297 + }, + { + "epoch": 0.3243163558634923, + "grad_norm": 31.292814254760742, + "learning_rate": 8e-05, + "loss": 48.2314, + "num_input_tokens_seen": 169734168, + "step": 3300 + }, + { + "epoch": 0.3246111889142773, + "grad_norm": 32.58222961425781, + "learning_rate": 8e-05, + "loss": 49.1216, + "num_input_tokens_seen": 169904416, + "step": 3303 + }, + { + "epoch": 0.3249060219650623, + "grad_norm": 31.01238250732422, + "learning_rate": 8e-05, + "loss": 45.0155, + "num_input_tokens_seen": 170060788, + "step": 3306 + }, + { + "epoch": 0.3252008550158473, + "grad_norm": 34.21432876586914, + "learning_rate": 8e-05, + "loss": 48.9929, + "num_input_tokens_seen": 170202916, + "step": 3309 + }, + { + "epoch": 0.32549568806663226, + "grad_norm": 28.22661590576172, + "learning_rate": 8e-05, + "loss": 45.7777, + "num_input_tokens_seen": 170362740, + "step": 3312 + }, + { + "epoch": 0.32579052111741724, + "grad_norm": 26.78307342529297, + "learning_rate": 8e-05, + "loss": 44.7937, + "num_input_tokens_seen": 170521040, + "step": 3315 + }, + { + "epoch": 0.3260853541682023, + "grad_norm": 29.794517517089844, + "learning_rate": 8e-05, + "loss": 47.9855, + "num_input_tokens_seen": 170682436, + "step": 3318 + }, + { + "epoch": 0.32638018721898726, + "grad_norm": 32.7954216003418, + "learning_rate": 8e-05, + "loss": 43.2529, + "num_input_tokens_seen": 170844748, + "step": 3321 + }, + { + "epoch": 0.32667502026977224, + "grad_norm": 29.20732307434082, + "learning_rate": 8e-05, + "loss": 46.1486, + "num_input_tokens_seen": 171001688, + "step": 3324 + }, + { + "epoch": 0.3269698533205572, + "grad_norm": 30.342622756958008, + "learning_rate": 8e-05, + "loss": 49.8051, + "num_input_tokens_seen": 171153672, + "step": 3327 + }, + { + "epoch": 0.3272646863713422, + "grad_norm": 59.030792236328125, + "learning_rate": 8e-05, + "loss": 46.6331, + "num_input_tokens_seen": 171301812, + "step": 3330 + }, + { + "epoch": 0.32755951942212724, + "grad_norm": 33.16861343383789, + "learning_rate": 8e-05, + "loss": 44.3875, + "num_input_tokens_seen": 171457740, + "step": 3333 + }, + { + "epoch": 0.3278543524729122, + "grad_norm": 31.25284194946289, + "learning_rate": 8e-05, + "loss": 47.5177, + "num_input_tokens_seen": 171607032, + "step": 3336 + }, + { + "epoch": 0.3281491855236972, + "grad_norm": 33.96405029296875, + "learning_rate": 8e-05, + "loss": 46.1177, + "num_input_tokens_seen": 171742676, + "step": 3339 + }, + { + "epoch": 0.3284440185744822, + "grad_norm": 31.06889533996582, + "learning_rate": 8e-05, + "loss": 46.4838, + "num_input_tokens_seen": 171906656, + "step": 3342 + }, + { + "epoch": 0.32873885162526717, + "grad_norm": 31.36388397216797, + "learning_rate": 8e-05, + "loss": 48.7096, + "num_input_tokens_seen": 172072992, + "step": 3345 + }, + { + "epoch": 0.3290336846760522, + "grad_norm": 30.852571487426758, + "learning_rate": 8e-05, + "loss": 50.6002, + "num_input_tokens_seen": 172230152, + "step": 3348 + }, + { + "epoch": 0.3293285177268372, + "grad_norm": 30.819143295288086, + "learning_rate": 8e-05, + "loss": 49.1265, + "num_input_tokens_seen": 172386988, + "step": 3351 + }, + { + "epoch": 0.32962335077762217, + "grad_norm": 41.371429443359375, + "learning_rate": 8e-05, + "loss": 46.3821, + "num_input_tokens_seen": 172524532, + "step": 3354 + }, + { + "epoch": 0.32991818382840715, + "grad_norm": 28.965272903442383, + "learning_rate": 8e-05, + "loss": 47.383, + "num_input_tokens_seen": 172683024, + "step": 3357 + }, + { + "epoch": 0.33021301687919213, + "grad_norm": 53.77383804321289, + "learning_rate": 8e-05, + "loss": 47.8472, + "num_input_tokens_seen": 172821612, + "step": 3360 + }, + { + "epoch": 0.33050784992997717, + "grad_norm": 30.76011085510254, + "learning_rate": 8e-05, + "loss": 45.129, + "num_input_tokens_seen": 172967344, + "step": 3363 + }, + { + "epoch": 0.33080268298076215, + "grad_norm": 27.747955322265625, + "learning_rate": 8e-05, + "loss": 46.2741, + "num_input_tokens_seen": 173115604, + "step": 3366 + }, + { + "epoch": 0.33109751603154713, + "grad_norm": 34.3202018737793, + "learning_rate": 8e-05, + "loss": 51.4671, + "num_input_tokens_seen": 173262284, + "step": 3369 + }, + { + "epoch": 0.3313923490823321, + "grad_norm": 40.12452697753906, + "learning_rate": 8e-05, + "loss": 41.3438, + "num_input_tokens_seen": 173425756, + "step": 3372 + }, + { + "epoch": 0.3316871821331171, + "grad_norm": 37.801883697509766, + "learning_rate": 8e-05, + "loss": 48.3023, + "num_input_tokens_seen": 173566892, + "step": 3375 + }, + { + "epoch": 0.33198201518390213, + "grad_norm": 35.19266891479492, + "learning_rate": 8e-05, + "loss": 49.593, + "num_input_tokens_seen": 173737684, + "step": 3378 + }, + { + "epoch": 0.3322768482346871, + "grad_norm": 28.83980369567871, + "learning_rate": 8e-05, + "loss": 44.9847, + "num_input_tokens_seen": 173916400, + "step": 3381 + }, + { + "epoch": 0.3325716812854721, + "grad_norm": 28.931739807128906, + "learning_rate": 8e-05, + "loss": 46.7851, + "num_input_tokens_seen": 174045520, + "step": 3384 + }, + { + "epoch": 0.3328665143362571, + "grad_norm": 32.318565368652344, + "learning_rate": 8e-05, + "loss": 48.6233, + "num_input_tokens_seen": 174204072, + "step": 3387 + }, + { + "epoch": 0.33316134738704206, + "grad_norm": 31.32320213317871, + "learning_rate": 8e-05, + "loss": 48.7227, + "num_input_tokens_seen": 174375316, + "step": 3390 + }, + { + "epoch": 0.3334561804378271, + "grad_norm": 34.406246185302734, + "learning_rate": 8e-05, + "loss": 43.7779, + "num_input_tokens_seen": 174514908, + "step": 3393 + }, + { + "epoch": 0.3337510134886121, + "grad_norm": 29.341218948364258, + "learning_rate": 8e-05, + "loss": 44.7992, + "num_input_tokens_seen": 174646708, + "step": 3396 + }, + { + "epoch": 0.33404584653939706, + "grad_norm": 29.529809951782227, + "learning_rate": 8e-05, + "loss": 44.8276, + "num_input_tokens_seen": 174789488, + "step": 3399 + }, + { + "epoch": 0.33434067959018204, + "grad_norm": 31.401277542114258, + "learning_rate": 8e-05, + "loss": 53.1048, + "num_input_tokens_seen": 174963152, + "step": 3402 + }, + { + "epoch": 0.334635512640967, + "grad_norm": 85.02721405029297, + "learning_rate": 8e-05, + "loss": 42.7275, + "num_input_tokens_seen": 175137252, + "step": 3405 + }, + { + "epoch": 0.33493034569175206, + "grad_norm": 36.67193603515625, + "learning_rate": 8e-05, + "loss": 48.8682, + "num_input_tokens_seen": 175315524, + "step": 3408 + }, + { + "epoch": 0.33522517874253704, + "grad_norm": 34.90581512451172, + "learning_rate": 8e-05, + "loss": 45.0905, + "num_input_tokens_seen": 175465152, + "step": 3411 + }, + { + "epoch": 0.335520011793322, + "grad_norm": 30.830902099609375, + "learning_rate": 8e-05, + "loss": 52.4906, + "num_input_tokens_seen": 175642048, + "step": 3414 + }, + { + "epoch": 0.335814844844107, + "grad_norm": 31.64945411682129, + "learning_rate": 8e-05, + "loss": 49.6005, + "num_input_tokens_seen": 175791692, + "step": 3417 + }, + { + "epoch": 0.33610967789489204, + "grad_norm": 30.259660720825195, + "learning_rate": 8e-05, + "loss": 45.7076, + "num_input_tokens_seen": 175928204, + "step": 3420 + }, + { + "epoch": 0.336404510945677, + "grad_norm": 32.44395065307617, + "learning_rate": 8e-05, + "loss": 49.5071, + "num_input_tokens_seen": 176079484, + "step": 3423 + }, + { + "epoch": 0.336699343996462, + "grad_norm": 494.6167907714844, + "learning_rate": 8e-05, + "loss": 47.7877, + "num_input_tokens_seen": 176223024, + "step": 3426 + }, + { + "epoch": 0.336994177047247, + "grad_norm": 31.663257598876953, + "learning_rate": 8e-05, + "loss": 47.6809, + "num_input_tokens_seen": 176362164, + "step": 3429 + }, + { + "epoch": 0.33728901009803197, + "grad_norm": 32.32041549682617, + "learning_rate": 8e-05, + "loss": 48.758, + "num_input_tokens_seen": 176521920, + "step": 3432 + }, + { + "epoch": 0.337583843148817, + "grad_norm": 30.0501766204834, + "learning_rate": 8e-05, + "loss": 47.7477, + "num_input_tokens_seen": 176662252, + "step": 3435 + }, + { + "epoch": 0.337878676199602, + "grad_norm": 34.510536193847656, + "learning_rate": 8e-05, + "loss": 47.1626, + "num_input_tokens_seen": 176824600, + "step": 3438 + }, + { + "epoch": 0.33817350925038697, + "grad_norm": 28.301755905151367, + "learning_rate": 8e-05, + "loss": 43.903, + "num_input_tokens_seen": 177002840, + "step": 3441 + }, + { + "epoch": 0.33846834230117195, + "grad_norm": 29.808353424072266, + "learning_rate": 8e-05, + "loss": 45.0669, + "num_input_tokens_seen": 177133472, + "step": 3444 + }, + { + "epoch": 0.33876317535195694, + "grad_norm": 34.070655822753906, + "learning_rate": 8e-05, + "loss": 48.548, + "num_input_tokens_seen": 177298748, + "step": 3447 + }, + { + "epoch": 0.339058008402742, + "grad_norm": 37.89089584350586, + "learning_rate": 8e-05, + "loss": 53.683, + "num_input_tokens_seen": 177438948, + "step": 3450 + }, + { + "epoch": 0.33935284145352695, + "grad_norm": 29.450450897216797, + "learning_rate": 8e-05, + "loss": 44.3855, + "num_input_tokens_seen": 177602176, + "step": 3453 + }, + { + "epoch": 0.33964767450431194, + "grad_norm": 31.299362182617188, + "learning_rate": 8e-05, + "loss": 48.8359, + "num_input_tokens_seen": 177758804, + "step": 3456 + }, + { + "epoch": 0.3399425075550969, + "grad_norm": 34.62590408325195, + "learning_rate": 8e-05, + "loss": 50.2165, + "num_input_tokens_seen": 177913172, + "step": 3459 + }, + { + "epoch": 0.3402373406058819, + "grad_norm": 30.38800811767578, + "learning_rate": 8e-05, + "loss": 49.167, + "num_input_tokens_seen": 178064440, + "step": 3462 + }, + { + "epoch": 0.34053217365666694, + "grad_norm": 31.36223030090332, + "learning_rate": 8e-05, + "loss": 48.3271, + "num_input_tokens_seen": 178230792, + "step": 3465 + }, + { + "epoch": 0.3408270067074519, + "grad_norm": 28.67354965209961, + "learning_rate": 8e-05, + "loss": 44.4037, + "num_input_tokens_seen": 178367876, + "step": 3468 + }, + { + "epoch": 0.3411218397582369, + "grad_norm": 27.097543716430664, + "learning_rate": 8e-05, + "loss": 46.3705, + "num_input_tokens_seen": 178521252, + "step": 3471 + }, + { + "epoch": 0.3414166728090219, + "grad_norm": 28.45320701599121, + "learning_rate": 8e-05, + "loss": 47.9262, + "num_input_tokens_seen": 178673632, + "step": 3474 + }, + { + "epoch": 0.34171150585980686, + "grad_norm": 34.57194137573242, + "learning_rate": 8e-05, + "loss": 46.3569, + "num_input_tokens_seen": 178831004, + "step": 3477 + }, + { + "epoch": 0.3420063389105919, + "grad_norm": 30.40506935119629, + "learning_rate": 8e-05, + "loss": 43.096, + "num_input_tokens_seen": 178980504, + "step": 3480 + }, + { + "epoch": 0.3423011719613769, + "grad_norm": 31.882638931274414, + "learning_rate": 8e-05, + "loss": 48.569, + "num_input_tokens_seen": 179121392, + "step": 3483 + }, + { + "epoch": 0.34259600501216186, + "grad_norm": 28.25156593322754, + "learning_rate": 8e-05, + "loss": 47.3365, + "num_input_tokens_seen": 179268492, + "step": 3486 + }, + { + "epoch": 0.34289083806294685, + "grad_norm": 30.753129959106445, + "learning_rate": 8e-05, + "loss": 47.6602, + "num_input_tokens_seen": 179410660, + "step": 3489 + }, + { + "epoch": 0.3431856711137318, + "grad_norm": 30.181903839111328, + "learning_rate": 8e-05, + "loss": 50.688, + "num_input_tokens_seen": 179595220, + "step": 3492 + }, + { + "epoch": 0.34348050416451686, + "grad_norm": 31.859561920166016, + "learning_rate": 8e-05, + "loss": 46.9394, + "num_input_tokens_seen": 179753316, + "step": 3495 + }, + { + "epoch": 0.34377533721530185, + "grad_norm": 58.434322357177734, + "learning_rate": 8e-05, + "loss": 49.7456, + "num_input_tokens_seen": 179901772, + "step": 3498 + }, + { + "epoch": 0.3440701702660868, + "grad_norm": 35.47056579589844, + "learning_rate": 8e-05, + "loss": 46.3298, + "num_input_tokens_seen": 180042052, + "step": 3501 + }, + { + "epoch": 0.3443650033168718, + "grad_norm": 28.941143035888672, + "learning_rate": 8e-05, + "loss": 46.3121, + "num_input_tokens_seen": 180183700, + "step": 3504 + }, + { + "epoch": 0.3446598363676568, + "grad_norm": 31.060523986816406, + "learning_rate": 8e-05, + "loss": 47.7124, + "num_input_tokens_seen": 180345032, + "step": 3507 + }, + { + "epoch": 0.34495466941844183, + "grad_norm": 32.34244918823242, + "learning_rate": 8e-05, + "loss": 45.1072, + "num_input_tokens_seen": 180500748, + "step": 3510 + }, + { + "epoch": 0.3452495024692268, + "grad_norm": 102.28973388671875, + "learning_rate": 8e-05, + "loss": 49.6808, + "num_input_tokens_seen": 180648820, + "step": 3513 + }, + { + "epoch": 0.3455443355200118, + "grad_norm": 31.596895217895508, + "learning_rate": 8e-05, + "loss": 47.586, + "num_input_tokens_seen": 180816336, + "step": 3516 + }, + { + "epoch": 0.3458391685707968, + "grad_norm": 84.94961547851562, + "learning_rate": 8e-05, + "loss": 48.6347, + "num_input_tokens_seen": 180981176, + "step": 3519 + }, + { + "epoch": 0.34613400162158175, + "grad_norm": 30.54353141784668, + "learning_rate": 8e-05, + "loss": 45.0397, + "num_input_tokens_seen": 181162392, + "step": 3522 + }, + { + "epoch": 0.3464288346723668, + "grad_norm": 29.32094383239746, + "learning_rate": 8e-05, + "loss": 41.4492, + "num_input_tokens_seen": 181312360, + "step": 3525 + }, + { + "epoch": 0.3467236677231518, + "grad_norm": 31.889060974121094, + "learning_rate": 8e-05, + "loss": 47.0884, + "num_input_tokens_seen": 181481268, + "step": 3528 + }, + { + "epoch": 0.34701850077393676, + "grad_norm": 28.774999618530273, + "learning_rate": 8e-05, + "loss": 46.5325, + "num_input_tokens_seen": 181645000, + "step": 3531 + }, + { + "epoch": 0.34731333382472174, + "grad_norm": 35.49319839477539, + "learning_rate": 8e-05, + "loss": 49.6535, + "num_input_tokens_seen": 181804300, + "step": 3534 + }, + { + "epoch": 0.3476081668755067, + "grad_norm": 29.114641189575195, + "learning_rate": 8e-05, + "loss": 48.0461, + "num_input_tokens_seen": 181956832, + "step": 3537 + }, + { + "epoch": 0.34790299992629176, + "grad_norm": 31.43871307373047, + "learning_rate": 8e-05, + "loss": 45.1125, + "num_input_tokens_seen": 182101356, + "step": 3540 + }, + { + "epoch": 0.34819783297707674, + "grad_norm": 36.504127502441406, + "learning_rate": 8e-05, + "loss": 46.4158, + "num_input_tokens_seen": 182254904, + "step": 3543 + }, + { + "epoch": 0.3484926660278617, + "grad_norm": 29.40753746032715, + "learning_rate": 8e-05, + "loss": 48.6721, + "num_input_tokens_seen": 182427084, + "step": 3546 + }, + { + "epoch": 0.3487874990786467, + "grad_norm": 33.908546447753906, + "learning_rate": 8e-05, + "loss": 47.7057, + "num_input_tokens_seen": 182574352, + "step": 3549 + }, + { + "epoch": 0.3490823321294317, + "grad_norm": 32.480323791503906, + "learning_rate": 8e-05, + "loss": 50.7387, + "num_input_tokens_seen": 182704996, + "step": 3552 + }, + { + "epoch": 0.3493771651802167, + "grad_norm": 34.784576416015625, + "learning_rate": 8e-05, + "loss": 47.4257, + "num_input_tokens_seen": 182855420, + "step": 3555 + }, + { + "epoch": 0.3496719982310017, + "grad_norm": 30.89937400817871, + "learning_rate": 8e-05, + "loss": 46.5586, + "num_input_tokens_seen": 182987680, + "step": 3558 + }, + { + "epoch": 0.3499668312817867, + "grad_norm": 28.803722381591797, + "learning_rate": 8e-05, + "loss": 46.2221, + "num_input_tokens_seen": 183137976, + "step": 3561 + }, + { + "epoch": 0.35026166433257166, + "grad_norm": 30.891504287719727, + "learning_rate": 8e-05, + "loss": 48.2359, + "num_input_tokens_seen": 183293772, + "step": 3564 + }, + { + "epoch": 0.35055649738335665, + "grad_norm": 38.08857727050781, + "learning_rate": 8e-05, + "loss": 47.384, + "num_input_tokens_seen": 183426252, + "step": 3567 + }, + { + "epoch": 0.3508513304341417, + "grad_norm": 33.9434928894043, + "learning_rate": 8e-05, + "loss": 49.6966, + "num_input_tokens_seen": 183569584, + "step": 3570 + }, + { + "epoch": 0.35114616348492667, + "grad_norm": 33.663787841796875, + "learning_rate": 8e-05, + "loss": 41.6853, + "num_input_tokens_seen": 183716720, + "step": 3573 + }, + { + "epoch": 0.35144099653571165, + "grad_norm": 35.80391311645508, + "learning_rate": 8e-05, + "loss": 48.0787, + "num_input_tokens_seen": 183861440, + "step": 3576 + }, + { + "epoch": 0.35173582958649663, + "grad_norm": 48.684268951416016, + "learning_rate": 8e-05, + "loss": 50.4618, + "num_input_tokens_seen": 184031560, + "step": 3579 + }, + { + "epoch": 0.35203066263728167, + "grad_norm": 33.56404113769531, + "learning_rate": 8e-05, + "loss": 46.1341, + "num_input_tokens_seen": 184182688, + "step": 3582 + }, + { + "epoch": 0.35232549568806665, + "grad_norm": 32.63978958129883, + "learning_rate": 8e-05, + "loss": 45.7058, + "num_input_tokens_seen": 184364896, + "step": 3585 + }, + { + "epoch": 0.35262032873885163, + "grad_norm": 34.40457534790039, + "learning_rate": 8e-05, + "loss": 49.1106, + "num_input_tokens_seen": 184508572, + "step": 3588 + }, + { + "epoch": 0.3529151617896366, + "grad_norm": 32.84516906738281, + "learning_rate": 8e-05, + "loss": 50.3652, + "num_input_tokens_seen": 184686200, + "step": 3591 + }, + { + "epoch": 0.3532099948404216, + "grad_norm": 81.30366516113281, + "learning_rate": 8e-05, + "loss": 46.5863, + "num_input_tokens_seen": 184814416, + "step": 3594 + }, + { + "epoch": 0.35350482789120663, + "grad_norm": 32.37525177001953, + "learning_rate": 8e-05, + "loss": 44.6372, + "num_input_tokens_seen": 184982744, + "step": 3597 + }, + { + "epoch": 0.3537996609419916, + "grad_norm": 34.05870056152344, + "learning_rate": 8e-05, + "loss": 44.6898, + "num_input_tokens_seen": 185135216, + "step": 3600 + }, + { + "epoch": 0.3540944939927766, + "grad_norm": 38.33725357055664, + "learning_rate": 8e-05, + "loss": 42.983, + "num_input_tokens_seen": 185304348, + "step": 3603 + }, + { + "epoch": 0.3543893270435616, + "grad_norm": 204.94512939453125, + "learning_rate": 8e-05, + "loss": 45.561, + "num_input_tokens_seen": 185459268, + "step": 3606 + }, + { + "epoch": 0.35468416009434656, + "grad_norm": 31.346824645996094, + "learning_rate": 8e-05, + "loss": 45.4419, + "num_input_tokens_seen": 185624232, + "step": 3609 + }, + { + "epoch": 0.3549789931451316, + "grad_norm": 29.82939338684082, + "learning_rate": 8e-05, + "loss": 45.6828, + "num_input_tokens_seen": 185787720, + "step": 3612 + }, + { + "epoch": 0.3552738261959166, + "grad_norm": 31.14798355102539, + "learning_rate": 8e-05, + "loss": 44.7127, + "num_input_tokens_seen": 185941444, + "step": 3615 + }, + { + "epoch": 0.35556865924670156, + "grad_norm": 29.58880615234375, + "learning_rate": 8e-05, + "loss": 50.7512, + "num_input_tokens_seen": 186088412, + "step": 3618 + }, + { + "epoch": 0.35586349229748654, + "grad_norm": 107.43943786621094, + "learning_rate": 8e-05, + "loss": 40.4814, + "num_input_tokens_seen": 186255008, + "step": 3621 + }, + { + "epoch": 0.3561583253482715, + "grad_norm": 32.718505859375, + "learning_rate": 8e-05, + "loss": 46.1933, + "num_input_tokens_seen": 186415012, + "step": 3624 + }, + { + "epoch": 0.35645315839905656, + "grad_norm": 35.0687370300293, + "learning_rate": 8e-05, + "loss": 47.947, + "num_input_tokens_seen": 186574040, + "step": 3627 + }, + { + "epoch": 0.35674799144984154, + "grad_norm": 33.25041961669922, + "learning_rate": 8e-05, + "loss": 47.4406, + "num_input_tokens_seen": 186759404, + "step": 3630 + }, + { + "epoch": 0.3570428245006265, + "grad_norm": 34.515506744384766, + "learning_rate": 8e-05, + "loss": 45.0484, + "num_input_tokens_seen": 186909100, + "step": 3633 + }, + { + "epoch": 0.3573376575514115, + "grad_norm": 33.785972595214844, + "learning_rate": 8e-05, + "loss": 42.7564, + "num_input_tokens_seen": 187064440, + "step": 3636 + }, + { + "epoch": 0.3576324906021965, + "grad_norm": 29.22235870361328, + "learning_rate": 8e-05, + "loss": 45.6804, + "num_input_tokens_seen": 187185516, + "step": 3639 + }, + { + "epoch": 0.3579273236529815, + "grad_norm": 24.949066162109375, + "learning_rate": 8e-05, + "loss": 44.1007, + "num_input_tokens_seen": 187337652, + "step": 3642 + }, + { + "epoch": 0.3582221567037665, + "grad_norm": 26.641557693481445, + "learning_rate": 8e-05, + "loss": 47.4558, + "num_input_tokens_seen": 187493736, + "step": 3645 + }, + { + "epoch": 0.3585169897545515, + "grad_norm": 32.34521484375, + "learning_rate": 8e-05, + "loss": 44.5307, + "num_input_tokens_seen": 187669108, + "step": 3648 + }, + { + "epoch": 0.35881182280533647, + "grad_norm": 31.242477416992188, + "learning_rate": 8e-05, + "loss": 49.4255, + "num_input_tokens_seen": 187822444, + "step": 3651 + }, + { + "epoch": 0.35910665585612145, + "grad_norm": 28.323322296142578, + "learning_rate": 8e-05, + "loss": 47.169, + "num_input_tokens_seen": 187977892, + "step": 3654 + }, + { + "epoch": 0.3594014889069065, + "grad_norm": 32.317771911621094, + "learning_rate": 8e-05, + "loss": 42.7603, + "num_input_tokens_seen": 188121380, + "step": 3657 + }, + { + "epoch": 0.35969632195769147, + "grad_norm": 55.2293586730957, + "learning_rate": 8e-05, + "loss": 42.8748, + "num_input_tokens_seen": 188246832, + "step": 3660 + }, + { + "epoch": 0.35999115500847645, + "grad_norm": 32.014766693115234, + "learning_rate": 8e-05, + "loss": 41.6695, + "num_input_tokens_seen": 188404404, + "step": 3663 + }, + { + "epoch": 0.36028598805926143, + "grad_norm": 30.330198287963867, + "learning_rate": 8e-05, + "loss": 45.1016, + "num_input_tokens_seen": 188555360, + "step": 3666 + }, + { + "epoch": 0.3605808211100464, + "grad_norm": 29.632488250732422, + "learning_rate": 8e-05, + "loss": 49.119, + "num_input_tokens_seen": 188718512, + "step": 3669 + }, + { + "epoch": 0.36087565416083145, + "grad_norm": 29.9589786529541, + "learning_rate": 8e-05, + "loss": 50.1, + "num_input_tokens_seen": 188897648, + "step": 3672 + }, + { + "epoch": 0.36117048721161643, + "grad_norm": 28.243938446044922, + "learning_rate": 8e-05, + "loss": 46.6535, + "num_input_tokens_seen": 189052008, + "step": 3675 + }, + { + "epoch": 0.3614653202624014, + "grad_norm": 28.271282196044922, + "learning_rate": 8e-05, + "loss": 44.8027, + "num_input_tokens_seen": 189184824, + "step": 3678 + }, + { + "epoch": 0.3617601533131864, + "grad_norm": 31.67430305480957, + "learning_rate": 8e-05, + "loss": 42.1192, + "num_input_tokens_seen": 189313464, + "step": 3681 + }, + { + "epoch": 0.3620549863639714, + "grad_norm": 31.47785186767578, + "learning_rate": 8e-05, + "loss": 45.3955, + "num_input_tokens_seen": 189454948, + "step": 3684 + }, + { + "epoch": 0.3623498194147564, + "grad_norm": 31.909595489501953, + "learning_rate": 8e-05, + "loss": 44.9947, + "num_input_tokens_seen": 189609212, + "step": 3687 + }, + { + "epoch": 0.3626446524655414, + "grad_norm": 32.878177642822266, + "learning_rate": 8e-05, + "loss": 47.5328, + "num_input_tokens_seen": 189758364, + "step": 3690 + }, + { + "epoch": 0.3629394855163264, + "grad_norm": 30.43533706665039, + "learning_rate": 8e-05, + "loss": 45.8225, + "num_input_tokens_seen": 189911720, + "step": 3693 + }, + { + "epoch": 0.36323431856711136, + "grad_norm": 29.782085418701172, + "learning_rate": 8e-05, + "loss": 46.4523, + "num_input_tokens_seen": 190058724, + "step": 3696 + }, + { + "epoch": 0.36352915161789634, + "grad_norm": 36.34353256225586, + "learning_rate": 8e-05, + "loss": 45.7055, + "num_input_tokens_seen": 190232372, + "step": 3699 + }, + { + "epoch": 0.3638239846686814, + "grad_norm": 31.679168701171875, + "learning_rate": 8e-05, + "loss": 43.4331, + "num_input_tokens_seen": 190397056, + "step": 3702 + }, + { + "epoch": 0.36411881771946636, + "grad_norm": 30.544857025146484, + "learning_rate": 8e-05, + "loss": 48.4687, + "num_input_tokens_seen": 190567812, + "step": 3705 + }, + { + "epoch": 0.36441365077025134, + "grad_norm": 34.39679718017578, + "learning_rate": 8e-05, + "loss": 46.3253, + "num_input_tokens_seen": 190727292, + "step": 3708 + }, + { + "epoch": 0.3647084838210363, + "grad_norm": 28.82375144958496, + "learning_rate": 8e-05, + "loss": 44.8942, + "num_input_tokens_seen": 190882620, + "step": 3711 + }, + { + "epoch": 0.3650033168718213, + "grad_norm": 34.30107498168945, + "learning_rate": 8e-05, + "loss": 48.7777, + "num_input_tokens_seen": 191060832, + "step": 3714 + }, + { + "epoch": 0.36529814992260634, + "grad_norm": 26.91672134399414, + "learning_rate": 8e-05, + "loss": 44.2889, + "num_input_tokens_seen": 191229352, + "step": 3717 + }, + { + "epoch": 0.3655929829733913, + "grad_norm": 31.14057159423828, + "learning_rate": 8e-05, + "loss": 44.6834, + "num_input_tokens_seen": 191379112, + "step": 3720 + }, + { + "epoch": 0.3658878160241763, + "grad_norm": 29.06418800354004, + "learning_rate": 8e-05, + "loss": 43.0235, + "num_input_tokens_seen": 191536348, + "step": 3723 + }, + { + "epoch": 0.3661826490749613, + "grad_norm": 29.524255752563477, + "learning_rate": 8e-05, + "loss": 48.2543, + "num_input_tokens_seen": 191688912, + "step": 3726 + }, + { + "epoch": 0.3664774821257463, + "grad_norm": 30.087139129638672, + "learning_rate": 8e-05, + "loss": 47.2003, + "num_input_tokens_seen": 191845312, + "step": 3729 + }, + { + "epoch": 0.3667723151765313, + "grad_norm": 30.029184341430664, + "learning_rate": 8e-05, + "loss": 40.1694, + "num_input_tokens_seen": 192019360, + "step": 3732 + }, + { + "epoch": 0.3670671482273163, + "grad_norm": 27.986467361450195, + "learning_rate": 8e-05, + "loss": 47.4301, + "num_input_tokens_seen": 192174176, + "step": 3735 + }, + { + "epoch": 0.36736198127810127, + "grad_norm": 28.899826049804688, + "learning_rate": 8e-05, + "loss": 42.4361, + "num_input_tokens_seen": 192312400, + "step": 3738 + }, + { + "epoch": 0.36765681432888625, + "grad_norm": 33.11496353149414, + "learning_rate": 8e-05, + "loss": 46.0006, + "num_input_tokens_seen": 192488888, + "step": 3741 + }, + { + "epoch": 0.3679516473796713, + "grad_norm": 28.488046646118164, + "learning_rate": 8e-05, + "loss": 45.7107, + "num_input_tokens_seen": 192646096, + "step": 3744 + }, + { + "epoch": 0.36824648043045627, + "grad_norm": 42.096595764160156, + "learning_rate": 8e-05, + "loss": 45.9505, + "num_input_tokens_seen": 192802116, + "step": 3747 + }, + { + "epoch": 0.36854131348124125, + "grad_norm": 34.51819610595703, + "learning_rate": 8e-05, + "loss": 46.8947, + "num_input_tokens_seen": 192960540, + "step": 3750 + }, + { + "epoch": 0.36883614653202623, + "grad_norm": 30.888742446899414, + "learning_rate": 8e-05, + "loss": 46.85, + "num_input_tokens_seen": 193127332, + "step": 3753 + }, + { + "epoch": 0.3691309795828112, + "grad_norm": 29.665699005126953, + "learning_rate": 8e-05, + "loss": 42.1983, + "num_input_tokens_seen": 193276052, + "step": 3756 + }, + { + "epoch": 0.36942581263359625, + "grad_norm": 30.412384033203125, + "learning_rate": 8e-05, + "loss": 45.8014, + "num_input_tokens_seen": 193437632, + "step": 3759 + }, + { + "epoch": 0.36972064568438123, + "grad_norm": 29.54482078552246, + "learning_rate": 8e-05, + "loss": 42.5811, + "num_input_tokens_seen": 193583420, + "step": 3762 + }, + { + "epoch": 0.3700154787351662, + "grad_norm": 39.85192108154297, + "learning_rate": 8e-05, + "loss": 46.2251, + "num_input_tokens_seen": 193717944, + "step": 3765 + }, + { + "epoch": 0.3703103117859512, + "grad_norm": 34.36831283569336, + "learning_rate": 8e-05, + "loss": 44.9958, + "num_input_tokens_seen": 193871112, + "step": 3768 + }, + { + "epoch": 0.3706051448367362, + "grad_norm": 29.24009132385254, + "learning_rate": 8e-05, + "loss": 43.8158, + "num_input_tokens_seen": 194048240, + "step": 3771 + }, + { + "epoch": 0.3708999778875212, + "grad_norm": 31.27372169494629, + "learning_rate": 8e-05, + "loss": 45.5604, + "num_input_tokens_seen": 194196752, + "step": 3774 + }, + { + "epoch": 0.3711948109383062, + "grad_norm": 51.641380310058594, + "learning_rate": 8e-05, + "loss": 40.0764, + "num_input_tokens_seen": 194380572, + "step": 3777 + }, + { + "epoch": 0.3714896439890912, + "grad_norm": 31.002944946289062, + "learning_rate": 8e-05, + "loss": 51.0142, + "num_input_tokens_seen": 194532176, + "step": 3780 + }, + { + "epoch": 0.37178447703987616, + "grad_norm": 62.89687728881836, + "learning_rate": 8e-05, + "loss": 40.5023, + "num_input_tokens_seen": 194676504, + "step": 3783 + }, + { + "epoch": 0.37207931009066114, + "grad_norm": 30.240619659423828, + "learning_rate": 8e-05, + "loss": 42.1637, + "num_input_tokens_seen": 194828120, + "step": 3786 + }, + { + "epoch": 0.3723741431414462, + "grad_norm": 30.373289108276367, + "learning_rate": 8e-05, + "loss": 48.1604, + "num_input_tokens_seen": 195013152, + "step": 3789 + }, + { + "epoch": 0.37266897619223116, + "grad_norm": 27.48198127746582, + "learning_rate": 8e-05, + "loss": 46.033, + "num_input_tokens_seen": 195186880, + "step": 3792 + }, + { + "epoch": 0.37296380924301614, + "grad_norm": 30.48610496520996, + "learning_rate": 8e-05, + "loss": 44.5144, + "num_input_tokens_seen": 195357388, + "step": 3795 + }, + { + "epoch": 0.3732586422938011, + "grad_norm": 37.86562728881836, + "learning_rate": 8e-05, + "loss": 46.8633, + "num_input_tokens_seen": 195523872, + "step": 3798 + }, + { + "epoch": 0.3735534753445861, + "grad_norm": 25.786422729492188, + "learning_rate": 8e-05, + "loss": 42.4548, + "num_input_tokens_seen": 195677988, + "step": 3801 + }, + { + "epoch": 0.37384830839537114, + "grad_norm": 26.696611404418945, + "learning_rate": 8e-05, + "loss": 43.9501, + "num_input_tokens_seen": 195837960, + "step": 3804 + }, + { + "epoch": 0.3741431414461561, + "grad_norm": 28.53610610961914, + "learning_rate": 8e-05, + "loss": 47.663, + "num_input_tokens_seen": 195989804, + "step": 3807 + }, + { + "epoch": 0.3744379744969411, + "grad_norm": 31.87685203552246, + "learning_rate": 8e-05, + "loss": 47.7918, + "num_input_tokens_seen": 196165920, + "step": 3810 + }, + { + "epoch": 0.3747328075477261, + "grad_norm": 28.79266357421875, + "learning_rate": 8e-05, + "loss": 40.603, + "num_input_tokens_seen": 196325116, + "step": 3813 + }, + { + "epoch": 0.37502764059851107, + "grad_norm": 27.765893936157227, + "learning_rate": 8e-05, + "loss": 44.3665, + "num_input_tokens_seen": 196467160, + "step": 3816 + }, + { + "epoch": 0.3753224736492961, + "grad_norm": 31.322372436523438, + "learning_rate": 8e-05, + "loss": 40.6007, + "num_input_tokens_seen": 196608832, + "step": 3819 + }, + { + "epoch": 0.3756173067000811, + "grad_norm": 26.41129493713379, + "learning_rate": 8e-05, + "loss": 39.7225, + "num_input_tokens_seen": 196769656, + "step": 3822 + }, + { + "epoch": 0.37591213975086607, + "grad_norm": 32.98222351074219, + "learning_rate": 8e-05, + "loss": 46.4776, + "num_input_tokens_seen": 196926960, + "step": 3825 + }, + { + "epoch": 0.37620697280165105, + "grad_norm": 31.1025333404541, + "learning_rate": 8e-05, + "loss": 48.5201, + "num_input_tokens_seen": 197081808, + "step": 3828 + }, + { + "epoch": 0.37650180585243603, + "grad_norm": 35.10872268676758, + "learning_rate": 8e-05, + "loss": 47.4361, + "num_input_tokens_seen": 197228816, + "step": 3831 + }, + { + "epoch": 0.37679663890322107, + "grad_norm": 63.819210052490234, + "learning_rate": 8e-05, + "loss": 44.656, + "num_input_tokens_seen": 197366464, + "step": 3834 + }, + { + "epoch": 0.37709147195400605, + "grad_norm": 88.14225006103516, + "learning_rate": 8e-05, + "loss": 39.3489, + "num_input_tokens_seen": 197515700, + "step": 3837 + }, + { + "epoch": 0.37738630500479103, + "grad_norm": 30.590761184692383, + "learning_rate": 8e-05, + "loss": 48.5193, + "num_input_tokens_seen": 197661556, + "step": 3840 + }, + { + "epoch": 0.377681138055576, + "grad_norm": 27.369964599609375, + "learning_rate": 8e-05, + "loss": 46.7822, + "num_input_tokens_seen": 197807528, + "step": 3843 + }, + { + "epoch": 0.377975971106361, + "grad_norm": 27.713287353515625, + "learning_rate": 8e-05, + "loss": 44.0746, + "num_input_tokens_seen": 197986864, + "step": 3846 + }, + { + "epoch": 0.37827080415714603, + "grad_norm": 27.770008087158203, + "learning_rate": 8e-05, + "loss": 42.8452, + "num_input_tokens_seen": 198150944, + "step": 3849 + }, + { + "epoch": 0.378565637207931, + "grad_norm": 31.2003173828125, + "learning_rate": 8e-05, + "loss": 41.3652, + "num_input_tokens_seen": 198302148, + "step": 3852 + }, + { + "epoch": 0.378860470258716, + "grad_norm": 27.300168991088867, + "learning_rate": 8e-05, + "loss": 40.8527, + "num_input_tokens_seen": 198452080, + "step": 3855 + }, + { + "epoch": 0.379155303309501, + "grad_norm": 27.111713409423828, + "learning_rate": 8e-05, + "loss": 45.4798, + "num_input_tokens_seen": 198600556, + "step": 3858 + }, + { + "epoch": 0.37945013636028596, + "grad_norm": 34.327232360839844, + "learning_rate": 8e-05, + "loss": 40.3459, + "num_input_tokens_seen": 198756508, + "step": 3861 + }, + { + "epoch": 0.379744969411071, + "grad_norm": 35.697811126708984, + "learning_rate": 8e-05, + "loss": 45.5265, + "num_input_tokens_seen": 198906604, + "step": 3864 + }, + { + "epoch": 0.380039802461856, + "grad_norm": 24.162860870361328, + "learning_rate": 8e-05, + "loss": 42.994, + "num_input_tokens_seen": 199062408, + "step": 3867 + }, + { + "epoch": 0.38033463551264096, + "grad_norm": 29.72947883605957, + "learning_rate": 8e-05, + "loss": 46.2078, + "num_input_tokens_seen": 199224256, + "step": 3870 + }, + { + "epoch": 0.38062946856342594, + "grad_norm": 29.636106491088867, + "learning_rate": 8e-05, + "loss": 41.394, + "num_input_tokens_seen": 199377520, + "step": 3873 + }, + { + "epoch": 0.380924301614211, + "grad_norm": 33.16059112548828, + "learning_rate": 8e-05, + "loss": 45.7516, + "num_input_tokens_seen": 199522628, + "step": 3876 + }, + { + "epoch": 0.38121913466499596, + "grad_norm": 29.432018280029297, + "learning_rate": 8e-05, + "loss": 46.2355, + "num_input_tokens_seen": 199680984, + "step": 3879 + }, + { + "epoch": 0.38151396771578094, + "grad_norm": 26.524246215820312, + "learning_rate": 8e-05, + "loss": 43.8269, + "num_input_tokens_seen": 199827376, + "step": 3882 + }, + { + "epoch": 0.3818088007665659, + "grad_norm": 27.051618576049805, + "learning_rate": 8e-05, + "loss": 41.0068, + "num_input_tokens_seen": 199965272, + "step": 3885 + }, + { + "epoch": 0.3821036338173509, + "grad_norm": 25.889406204223633, + "learning_rate": 8e-05, + "loss": 41.8811, + "num_input_tokens_seen": 200108116, + "step": 3888 + }, + { + "epoch": 0.38239846686813594, + "grad_norm": 27.41223907470703, + "learning_rate": 8e-05, + "loss": 44.4539, + "num_input_tokens_seen": 200259528, + "step": 3891 + }, + { + "epoch": 0.3826932999189209, + "grad_norm": 29.76633071899414, + "learning_rate": 8e-05, + "loss": 45.3152, + "num_input_tokens_seen": 200425348, + "step": 3894 + }, + { + "epoch": 0.3829881329697059, + "grad_norm": 30.8220272064209, + "learning_rate": 8e-05, + "loss": 44.7649, + "num_input_tokens_seen": 200581448, + "step": 3897 + }, + { + "epoch": 0.3832829660204909, + "grad_norm": 34.690086364746094, + "learning_rate": 8e-05, + "loss": 42.6207, + "num_input_tokens_seen": 200735580, + "step": 3900 + }, + { + "epoch": 0.38357779907127587, + "grad_norm": 31.798429489135742, + "learning_rate": 8e-05, + "loss": 42.9762, + "num_input_tokens_seen": 200871456, + "step": 3903 + }, + { + "epoch": 0.3838726321220609, + "grad_norm": 31.97600746154785, + "learning_rate": 8e-05, + "loss": 45.5173, + "num_input_tokens_seen": 201020476, + "step": 3906 + }, + { + "epoch": 0.3841674651728459, + "grad_norm": 28.01764488220215, + "learning_rate": 8e-05, + "loss": 46.0845, + "num_input_tokens_seen": 201189760, + "step": 3909 + }, + { + "epoch": 0.38446229822363087, + "grad_norm": 24.853553771972656, + "learning_rate": 8e-05, + "loss": 45.8736, + "num_input_tokens_seen": 201347932, + "step": 3912 + }, + { + "epoch": 0.38475713127441585, + "grad_norm": 33.82841873168945, + "learning_rate": 8e-05, + "loss": 43.8107, + "num_input_tokens_seen": 201498192, + "step": 3915 + }, + { + "epoch": 0.38505196432520084, + "grad_norm": 30.44049644470215, + "learning_rate": 8e-05, + "loss": 47.1945, + "num_input_tokens_seen": 201641892, + "step": 3918 + }, + { + "epoch": 0.38534679737598587, + "grad_norm": 33.982540130615234, + "learning_rate": 8e-05, + "loss": 42.0145, + "num_input_tokens_seen": 201791476, + "step": 3921 + }, + { + "epoch": 0.38564163042677085, + "grad_norm": 34.017024993896484, + "learning_rate": 8e-05, + "loss": 45.6285, + "num_input_tokens_seen": 201937304, + "step": 3924 + }, + { + "epoch": 0.38593646347755584, + "grad_norm": 188.57647705078125, + "learning_rate": 8e-05, + "loss": 42.9968, + "num_input_tokens_seen": 202069036, + "step": 3927 + }, + { + "epoch": 0.3862312965283408, + "grad_norm": 29.63610076904297, + "learning_rate": 8e-05, + "loss": 46.5057, + "num_input_tokens_seen": 202218752, + "step": 3930 + }, + { + "epoch": 0.3865261295791258, + "grad_norm": 29.076887130737305, + "learning_rate": 8e-05, + "loss": 43.2685, + "num_input_tokens_seen": 202393000, + "step": 3933 + }, + { + "epoch": 0.38682096262991084, + "grad_norm": 28.38823127746582, + "learning_rate": 8e-05, + "loss": 42.1467, + "num_input_tokens_seen": 202534764, + "step": 3936 + }, + { + "epoch": 0.3871157956806958, + "grad_norm": 27.43347930908203, + "learning_rate": 8e-05, + "loss": 44.0863, + "num_input_tokens_seen": 202698572, + "step": 3939 + }, + { + "epoch": 0.3874106287314808, + "grad_norm": 30.09478187561035, + "learning_rate": 8e-05, + "loss": 46.4954, + "num_input_tokens_seen": 202857640, + "step": 3942 + }, + { + "epoch": 0.3877054617822658, + "grad_norm": 32.25828170776367, + "learning_rate": 8e-05, + "loss": 42.9919, + "num_input_tokens_seen": 203014816, + "step": 3945 + }, + { + "epoch": 0.38800029483305076, + "grad_norm": 30.99892234802246, + "learning_rate": 8e-05, + "loss": 49.7405, + "num_input_tokens_seen": 203188984, + "step": 3948 + }, + { + "epoch": 0.3882951278838358, + "grad_norm": 32.622127532958984, + "learning_rate": 8e-05, + "loss": 44.2072, + "num_input_tokens_seen": 203340960, + "step": 3951 + }, + { + "epoch": 0.3885899609346208, + "grad_norm": 27.56840705871582, + "learning_rate": 8e-05, + "loss": 42.6712, + "num_input_tokens_seen": 203509848, + "step": 3954 + }, + { + "epoch": 0.38888479398540576, + "grad_norm": 34.69350814819336, + "learning_rate": 8e-05, + "loss": 44.4039, + "num_input_tokens_seen": 203668996, + "step": 3957 + }, + { + "epoch": 0.38917962703619075, + "grad_norm": 31.97882652282715, + "learning_rate": 8e-05, + "loss": 44.0893, + "num_input_tokens_seen": 203823772, + "step": 3960 + }, + { + "epoch": 0.3894744600869757, + "grad_norm": 29.57229995727539, + "learning_rate": 8e-05, + "loss": 42.1541, + "num_input_tokens_seen": 203977648, + "step": 3963 + }, + { + "epoch": 0.38976929313776076, + "grad_norm": 29.408475875854492, + "learning_rate": 8e-05, + "loss": 41.2887, + "num_input_tokens_seen": 204153060, + "step": 3966 + }, + { + "epoch": 0.39006412618854575, + "grad_norm": 25.34873390197754, + "learning_rate": 8e-05, + "loss": 44.2761, + "num_input_tokens_seen": 204311144, + "step": 3969 + }, + { + "epoch": 0.3903589592393307, + "grad_norm": 31.603593826293945, + "learning_rate": 8e-05, + "loss": 44.0217, + "num_input_tokens_seen": 204461156, + "step": 3972 + }, + { + "epoch": 0.3906537922901157, + "grad_norm": 29.555217742919922, + "learning_rate": 8e-05, + "loss": 43.9669, + "num_input_tokens_seen": 204626196, + "step": 3975 + }, + { + "epoch": 0.3909486253409007, + "grad_norm": 33.9174690246582, + "learning_rate": 8e-05, + "loss": 42.4694, + "num_input_tokens_seen": 204789336, + "step": 3978 + }, + { + "epoch": 0.39124345839168573, + "grad_norm": 30.75056266784668, + "learning_rate": 8e-05, + "loss": 44.382, + "num_input_tokens_seen": 204947000, + "step": 3981 + }, + { + "epoch": 0.3915382914424707, + "grad_norm": 26.133014678955078, + "learning_rate": 8e-05, + "loss": 40.7875, + "num_input_tokens_seen": 205113876, + "step": 3984 + }, + { + "epoch": 0.3918331244932557, + "grad_norm": 30.40180015563965, + "learning_rate": 8e-05, + "loss": 44.6807, + "num_input_tokens_seen": 205288396, + "step": 3987 + }, + { + "epoch": 0.3921279575440407, + "grad_norm": 25.849224090576172, + "learning_rate": 8e-05, + "loss": 44.2802, + "num_input_tokens_seen": 205448184, + "step": 3990 + }, + { + "epoch": 0.39242279059482565, + "grad_norm": 29.1865291595459, + "learning_rate": 8e-05, + "loss": 40.1745, + "num_input_tokens_seen": 205600468, + "step": 3993 + }, + { + "epoch": 0.3927176236456107, + "grad_norm": 28.203125, + "learning_rate": 8e-05, + "loss": 43.8436, + "num_input_tokens_seen": 205759340, + "step": 3996 + }, + { + "epoch": 0.3930124566963957, + "grad_norm": 28.577714920043945, + "learning_rate": 8e-05, + "loss": 41.8728, + "num_input_tokens_seen": 205922228, + "step": 3999 + }, + { + "epoch": 0.39311073437999067, + "eval_gen_len": 30.17, + "eval_loss": 2.8102006912231445, + "eval_rouge1": 43.9268, + "eval_rouge2": 26.793, + "eval_rougeL": 40.1378, + "eval_rougeLsum": 40.7026, + "eval_runtime": 87.0957, + "eval_samples_per_second": 2.296, + "eval_steps_per_second": 0.574, + "num_input_tokens_seen": 205979564, + "step": 4000 + }, + { + "epoch": 0.39330728974718066, + "grad_norm": 34.915348052978516, + "learning_rate": 8e-05, + "loss": 46.0877, + "num_input_tokens_seen": 206074996, + "step": 4002 + }, + { + "epoch": 0.39360212279796564, + "grad_norm": 28.753887176513672, + "learning_rate": 8e-05, + "loss": 43.9296, + "num_input_tokens_seen": 206246808, + "step": 4005 + }, + { + "epoch": 0.3938969558487506, + "grad_norm": 82.86294555664062, + "learning_rate": 8e-05, + "loss": 44.0698, + "num_input_tokens_seen": 206390204, + "step": 4008 + }, + { + "epoch": 0.39419178889953566, + "grad_norm": 31.36377716064453, + "learning_rate": 8e-05, + "loss": 43.5785, + "num_input_tokens_seen": 206545372, + "step": 4011 + }, + { + "epoch": 0.39448662195032064, + "grad_norm": 29.626392364501953, + "learning_rate": 8e-05, + "loss": 41.7103, + "num_input_tokens_seen": 206689100, + "step": 4014 + }, + { + "epoch": 0.3947814550011056, + "grad_norm": 28.289608001708984, + "learning_rate": 8e-05, + "loss": 43.7036, + "num_input_tokens_seen": 206850696, + "step": 4017 + }, + { + "epoch": 0.3950762880518906, + "grad_norm": 29.048423767089844, + "learning_rate": 8e-05, + "loss": 45.4158, + "num_input_tokens_seen": 207013176, + "step": 4020 + }, + { + "epoch": 0.3953711211026756, + "grad_norm": 32.74452209472656, + "learning_rate": 8e-05, + "loss": 44.8341, + "num_input_tokens_seen": 207154572, + "step": 4023 + }, + { + "epoch": 0.3956659541534606, + "grad_norm": 30.65328025817871, + "learning_rate": 8e-05, + "loss": 41.3659, + "num_input_tokens_seen": 207302600, + "step": 4026 + }, + { + "epoch": 0.3959607872042456, + "grad_norm": 29.986570358276367, + "learning_rate": 8e-05, + "loss": 46.9741, + "num_input_tokens_seen": 207460036, + "step": 4029 + }, + { + "epoch": 0.3962556202550306, + "grad_norm": 30.823116302490234, + "learning_rate": 8e-05, + "loss": 46.9343, + "num_input_tokens_seen": 207606992, + "step": 4032 + }, + { + "epoch": 0.39655045330581556, + "grad_norm": 43.00481033325195, + "learning_rate": 8e-05, + "loss": 42.793, + "num_input_tokens_seen": 207755144, + "step": 4035 + }, + { + "epoch": 0.3968452863566006, + "grad_norm": 35.6334342956543, + "learning_rate": 8e-05, + "loss": 40.773, + "num_input_tokens_seen": 207923108, + "step": 4038 + }, + { + "epoch": 0.3971401194073856, + "grad_norm": 33.873558044433594, + "learning_rate": 8e-05, + "loss": 44.2082, + "num_input_tokens_seen": 208070340, + "step": 4041 + }, + { + "epoch": 0.39743495245817057, + "grad_norm": 48.23017501831055, + "learning_rate": 8e-05, + "loss": 42.1629, + "num_input_tokens_seen": 208231296, + "step": 4044 + }, + { + "epoch": 0.39772978550895555, + "grad_norm": 32.64248275756836, + "learning_rate": 8e-05, + "loss": 42.2485, + "num_input_tokens_seen": 208386000, + "step": 4047 + }, + { + "epoch": 0.39802461855974053, + "grad_norm": 52.33050537109375, + "learning_rate": 8e-05, + "loss": 42.8339, + "num_input_tokens_seen": 208543976, + "step": 4050 + }, + { + "epoch": 0.39831945161052557, + "grad_norm": 27.67913055419922, + "learning_rate": 8e-05, + "loss": 46.7309, + "num_input_tokens_seen": 208710496, + "step": 4053 + }, + { + "epoch": 0.39861428466131055, + "grad_norm": 72.23770904541016, + "learning_rate": 8e-05, + "loss": 45.5283, + "num_input_tokens_seen": 208855908, + "step": 4056 + }, + { + "epoch": 0.39890911771209553, + "grad_norm": 63.03166198730469, + "learning_rate": 8e-05, + "loss": 45.1794, + "num_input_tokens_seen": 209009292, + "step": 4059 + }, + { + "epoch": 0.3992039507628805, + "grad_norm": 34.75033950805664, + "learning_rate": 8e-05, + "loss": 46.1443, + "num_input_tokens_seen": 209156760, + "step": 4062 + }, + { + "epoch": 0.3994987838136655, + "grad_norm": 31.29888916015625, + "learning_rate": 8e-05, + "loss": 42.9468, + "num_input_tokens_seen": 209310320, + "step": 4065 + }, + { + "epoch": 0.39979361686445053, + "grad_norm": 30.495737075805664, + "learning_rate": 8e-05, + "loss": 44.2227, + "num_input_tokens_seen": 209461532, + "step": 4068 + }, + { + "epoch": 0.4000884499152355, + "grad_norm": 29.618284225463867, + "learning_rate": 8e-05, + "loss": 40.4326, + "num_input_tokens_seen": 209602624, + "step": 4071 + }, + { + "epoch": 0.4003832829660205, + "grad_norm": 33.6846923828125, + "learning_rate": 8e-05, + "loss": 43.9579, + "num_input_tokens_seen": 209776420, + "step": 4074 + }, + { + "epoch": 0.4006781160168055, + "grad_norm": 26.407421112060547, + "learning_rate": 8e-05, + "loss": 43.0974, + "num_input_tokens_seen": 209937612, + "step": 4077 + }, + { + "epoch": 0.40097294906759046, + "grad_norm": 27.561464309692383, + "learning_rate": 8e-05, + "loss": 40.6611, + "num_input_tokens_seen": 210068780, + "step": 4080 + }, + { + "epoch": 0.4012677821183755, + "grad_norm": 27.589263916015625, + "learning_rate": 8e-05, + "loss": 42.478, + "num_input_tokens_seen": 210205900, + "step": 4083 + }, + { + "epoch": 0.4015626151691605, + "grad_norm": 33.44409942626953, + "learning_rate": 8e-05, + "loss": 41.9828, + "num_input_tokens_seen": 210337076, + "step": 4086 + }, + { + "epoch": 0.40185744821994546, + "grad_norm": 28.39436149597168, + "learning_rate": 8e-05, + "loss": 47.7314, + "num_input_tokens_seen": 210488940, + "step": 4089 + }, + { + "epoch": 0.40215228127073044, + "grad_norm": 30.144987106323242, + "learning_rate": 8e-05, + "loss": 43.2947, + "num_input_tokens_seen": 210640804, + "step": 4092 + }, + { + "epoch": 0.4024471143215154, + "grad_norm": 31.10938262939453, + "learning_rate": 8e-05, + "loss": 43.9627, + "num_input_tokens_seen": 210806580, + "step": 4095 + }, + { + "epoch": 0.40274194737230046, + "grad_norm": 29.420190811157227, + "learning_rate": 8e-05, + "loss": 40.8033, + "num_input_tokens_seen": 210966164, + "step": 4098 + }, + { + "epoch": 0.40303678042308544, + "grad_norm": 32.46398162841797, + "learning_rate": 8e-05, + "loss": 45.5083, + "num_input_tokens_seen": 211131532, + "step": 4101 + }, + { + "epoch": 0.4033316134738704, + "grad_norm": 25.459470748901367, + "learning_rate": 8e-05, + "loss": 44.1949, + "num_input_tokens_seen": 211277416, + "step": 4104 + }, + { + "epoch": 0.4036264465246554, + "grad_norm": 26.684600830078125, + "learning_rate": 8e-05, + "loss": 41.2496, + "num_input_tokens_seen": 211434740, + "step": 4107 + }, + { + "epoch": 0.4039212795754404, + "grad_norm": 30.317163467407227, + "learning_rate": 8e-05, + "loss": 46.2663, + "num_input_tokens_seen": 211590464, + "step": 4110 + }, + { + "epoch": 0.4042161126262254, + "grad_norm": 28.028276443481445, + "learning_rate": 8e-05, + "loss": 43.1111, + "num_input_tokens_seen": 211754212, + "step": 4113 + }, + { + "epoch": 0.4045109456770104, + "grad_norm": 25.986167907714844, + "learning_rate": 8e-05, + "loss": 42.6019, + "num_input_tokens_seen": 211931260, + "step": 4116 + }, + { + "epoch": 0.4048057787277954, + "grad_norm": 28.613994598388672, + "learning_rate": 8e-05, + "loss": 43.5144, + "num_input_tokens_seen": 212103276, + "step": 4119 + }, + { + "epoch": 0.40510061177858037, + "grad_norm": 31.608320236206055, + "learning_rate": 8e-05, + "loss": 43.4318, + "num_input_tokens_seen": 212264392, + "step": 4122 + }, + { + "epoch": 0.40539544482936535, + "grad_norm": 29.455188751220703, + "learning_rate": 8e-05, + "loss": 45.3574, + "num_input_tokens_seen": 212403240, + "step": 4125 + }, + { + "epoch": 0.4056902778801504, + "grad_norm": 27.21038818359375, + "learning_rate": 8e-05, + "loss": 41.9461, + "num_input_tokens_seen": 212547536, + "step": 4128 + }, + { + "epoch": 0.40598511093093537, + "grad_norm": 26.4250545501709, + "learning_rate": 8e-05, + "loss": 43.9856, + "num_input_tokens_seen": 212699424, + "step": 4131 + }, + { + "epoch": 0.40627994398172035, + "grad_norm": 27.913158416748047, + "learning_rate": 8e-05, + "loss": 43.1773, + "num_input_tokens_seen": 212856380, + "step": 4134 + }, + { + "epoch": 0.40657477703250533, + "grad_norm": 26.234939575195312, + "learning_rate": 8e-05, + "loss": 42.7789, + "num_input_tokens_seen": 213029056, + "step": 4137 + }, + { + "epoch": 0.4068696100832903, + "grad_norm": 31.88821029663086, + "learning_rate": 8e-05, + "loss": 44.5928, + "num_input_tokens_seen": 213185192, + "step": 4140 + }, + { + "epoch": 0.40716444313407535, + "grad_norm": 30.04606056213379, + "learning_rate": 8e-05, + "loss": 45.7953, + "num_input_tokens_seen": 213323056, + "step": 4143 + }, + { + "epoch": 0.40745927618486033, + "grad_norm": 30.563106536865234, + "learning_rate": 8e-05, + "loss": 47.0879, + "num_input_tokens_seen": 213479292, + "step": 4146 + }, + { + "epoch": 0.4077541092356453, + "grad_norm": 29.05677604675293, + "learning_rate": 8e-05, + "loss": 43.9433, + "num_input_tokens_seen": 213626828, + "step": 4149 + }, + { + "epoch": 0.4080489422864303, + "grad_norm": 28.374475479125977, + "learning_rate": 8e-05, + "loss": 46.5581, + "num_input_tokens_seen": 213772128, + "step": 4152 + }, + { + "epoch": 0.4083437753372153, + "grad_norm": 27.08576774597168, + "learning_rate": 8e-05, + "loss": 46.3264, + "num_input_tokens_seen": 213917224, + "step": 4155 + }, + { + "epoch": 0.4086386083880003, + "grad_norm": 27.269411087036133, + "learning_rate": 8e-05, + "loss": 42.0309, + "num_input_tokens_seen": 214065624, + "step": 4158 + }, + { + "epoch": 0.4089334414387853, + "grad_norm": 29.288543701171875, + "learning_rate": 8e-05, + "loss": 42.4004, + "num_input_tokens_seen": 214230256, + "step": 4161 + }, + { + "epoch": 0.4092282744895703, + "grad_norm": 25.37685775756836, + "learning_rate": 8e-05, + "loss": 41.6511, + "num_input_tokens_seen": 214381884, + "step": 4164 + }, + { + "epoch": 0.40952310754035526, + "grad_norm": 25.916948318481445, + "learning_rate": 8e-05, + "loss": 39.7151, + "num_input_tokens_seen": 214540220, + "step": 4167 + }, + { + "epoch": 0.40981794059114024, + "grad_norm": 32.05540466308594, + "learning_rate": 8e-05, + "loss": 42.6181, + "num_input_tokens_seen": 214689304, + "step": 4170 + }, + { + "epoch": 0.4101127736419253, + "grad_norm": 27.41069984436035, + "learning_rate": 8e-05, + "loss": 42.0901, + "num_input_tokens_seen": 214841736, + "step": 4173 + }, + { + "epoch": 0.41040760669271026, + "grad_norm": 23.779354095458984, + "learning_rate": 8e-05, + "loss": 40.2321, + "num_input_tokens_seen": 215011564, + "step": 4176 + }, + { + "epoch": 0.41070243974349524, + "grad_norm": 26.35748291015625, + "learning_rate": 8e-05, + "loss": 40.0868, + "num_input_tokens_seen": 215159752, + "step": 4179 + }, + { + "epoch": 0.4109972727942802, + "grad_norm": 25.243736267089844, + "learning_rate": 8e-05, + "loss": 42.1468, + "num_input_tokens_seen": 215309800, + "step": 4182 + }, + { + "epoch": 0.41129210584506526, + "grad_norm": 40.49623107910156, + "learning_rate": 8e-05, + "loss": 42.6442, + "num_input_tokens_seen": 215459680, + "step": 4185 + }, + { + "epoch": 0.41158693889585024, + "grad_norm": 27.0998477935791, + "learning_rate": 8e-05, + "loss": 44.6483, + "num_input_tokens_seen": 215603644, + "step": 4188 + }, + { + "epoch": 0.4118817719466352, + "grad_norm": 32.15782928466797, + "learning_rate": 8e-05, + "loss": 44.5412, + "num_input_tokens_seen": 215768376, + "step": 4191 + }, + { + "epoch": 0.4121766049974202, + "grad_norm": 30.283430099487305, + "learning_rate": 8e-05, + "loss": 43.9261, + "num_input_tokens_seen": 215907312, + "step": 4194 + }, + { + "epoch": 0.4124714380482052, + "grad_norm": 27.773475646972656, + "learning_rate": 8e-05, + "loss": 40.9794, + "num_input_tokens_seen": 216057128, + "step": 4197 + }, + { + "epoch": 0.4127662710989902, + "grad_norm": 72.8736801147461, + "learning_rate": 8e-05, + "loss": 44.0748, + "num_input_tokens_seen": 216211216, + "step": 4200 + }, + { + "epoch": 0.4130611041497752, + "grad_norm": 32.80086898803711, + "learning_rate": 8e-05, + "loss": 42.9728, + "num_input_tokens_seen": 216349276, + "step": 4203 + }, + { + "epoch": 0.4133559372005602, + "grad_norm": 30.961280822753906, + "learning_rate": 8e-05, + "loss": 46.0797, + "num_input_tokens_seen": 216517136, + "step": 4206 + }, + { + "epoch": 0.41365077025134517, + "grad_norm": 32.26238250732422, + "learning_rate": 8e-05, + "loss": 38.8171, + "num_input_tokens_seen": 216665920, + "step": 4209 + }, + { + "epoch": 0.41394560330213015, + "grad_norm": 28.24716567993164, + "learning_rate": 8e-05, + "loss": 44.8195, + "num_input_tokens_seen": 216825804, + "step": 4212 + }, + { + "epoch": 0.4142404363529152, + "grad_norm": 28.280357360839844, + "learning_rate": 8e-05, + "loss": 42.5102, + "num_input_tokens_seen": 216990172, + "step": 4215 + }, + { + "epoch": 0.41453526940370017, + "grad_norm": 26.093664169311523, + "learning_rate": 8e-05, + "loss": 46.2611, + "num_input_tokens_seen": 217147152, + "step": 4218 + }, + { + "epoch": 0.41483010245448515, + "grad_norm": 28.501445770263672, + "learning_rate": 8e-05, + "loss": 40.6934, + "num_input_tokens_seen": 217337740, + "step": 4221 + }, + { + "epoch": 0.41512493550527013, + "grad_norm": 25.08894157409668, + "learning_rate": 8e-05, + "loss": 44.784, + "num_input_tokens_seen": 217471444, + "step": 4224 + }, + { + "epoch": 0.4154197685560551, + "grad_norm": 30.653091430664062, + "learning_rate": 8e-05, + "loss": 43.6489, + "num_input_tokens_seen": 217620536, + "step": 4227 + }, + { + "epoch": 0.41571460160684015, + "grad_norm": 30.691848754882812, + "learning_rate": 8e-05, + "loss": 41.9049, + "num_input_tokens_seen": 217765936, + "step": 4230 + }, + { + "epoch": 0.41600943465762513, + "grad_norm": 30.53278160095215, + "learning_rate": 8e-05, + "loss": 39.0308, + "num_input_tokens_seen": 217949424, + "step": 4233 + }, + { + "epoch": 0.4163042677084101, + "grad_norm": 28.368154525756836, + "learning_rate": 8e-05, + "loss": 42.2364, + "num_input_tokens_seen": 218101452, + "step": 4236 + }, + { + "epoch": 0.4165991007591951, + "grad_norm": 30.020780563354492, + "learning_rate": 8e-05, + "loss": 45.0031, + "num_input_tokens_seen": 218280512, + "step": 4239 + }, + { + "epoch": 0.4168939338099801, + "grad_norm": 24.755632400512695, + "learning_rate": 8e-05, + "loss": 39.9639, + "num_input_tokens_seen": 218440480, + "step": 4242 + }, + { + "epoch": 0.4171887668607651, + "grad_norm": 28.246004104614258, + "learning_rate": 8e-05, + "loss": 48.4517, + "num_input_tokens_seen": 218584376, + "step": 4245 + }, + { + "epoch": 0.4174835999115501, + "grad_norm": 29.797582626342773, + "learning_rate": 8e-05, + "loss": 43.5954, + "num_input_tokens_seen": 218739188, + "step": 4248 + }, + { + "epoch": 0.4177784329623351, + "grad_norm": 30.69160270690918, + "learning_rate": 8e-05, + "loss": 42.1095, + "num_input_tokens_seen": 218895136, + "step": 4251 + }, + { + "epoch": 0.41807326601312006, + "grad_norm": 33.107330322265625, + "learning_rate": 8e-05, + "loss": 44.5923, + "num_input_tokens_seen": 219049092, + "step": 4254 + }, + { + "epoch": 0.41836809906390504, + "grad_norm": 35.53581237792969, + "learning_rate": 8e-05, + "loss": 39.4062, + "num_input_tokens_seen": 219199092, + "step": 4257 + }, + { + "epoch": 0.4186629321146901, + "grad_norm": 27.54696273803711, + "learning_rate": 8e-05, + "loss": 42.3835, + "num_input_tokens_seen": 219367748, + "step": 4260 + }, + { + "epoch": 0.41895776516547506, + "grad_norm": 27.74174690246582, + "learning_rate": 8e-05, + "loss": 43.5722, + "num_input_tokens_seen": 219543440, + "step": 4263 + }, + { + "epoch": 0.41925259821626004, + "grad_norm": 28.04417610168457, + "learning_rate": 8e-05, + "loss": 46.6899, + "num_input_tokens_seen": 219691600, + "step": 4266 + }, + { + "epoch": 0.419547431267045, + "grad_norm": 27.233768463134766, + "learning_rate": 8e-05, + "loss": 44.2905, + "num_input_tokens_seen": 219840124, + "step": 4269 + }, + { + "epoch": 0.41984226431783, + "grad_norm": 31.485761642456055, + "learning_rate": 8e-05, + "loss": 43.1472, + "num_input_tokens_seen": 219978572, + "step": 4272 + }, + { + "epoch": 0.42013709736861504, + "grad_norm": 27.343427658081055, + "learning_rate": 8e-05, + "loss": 40.3824, + "num_input_tokens_seen": 220118616, + "step": 4275 + }, + { + "epoch": 0.4204319304194, + "grad_norm": 28.364166259765625, + "learning_rate": 8e-05, + "loss": 41.5778, + "num_input_tokens_seen": 220275124, + "step": 4278 + }, + { + "epoch": 0.420726763470185, + "grad_norm": 31.391408920288086, + "learning_rate": 8e-05, + "loss": 45.4767, + "num_input_tokens_seen": 220432824, + "step": 4281 + }, + { + "epoch": 0.42102159652097, + "grad_norm": 27.511157989501953, + "learning_rate": 8e-05, + "loss": 44.5242, + "num_input_tokens_seen": 220564500, + "step": 4284 + }, + { + "epoch": 0.42131642957175497, + "grad_norm": 27.571096420288086, + "learning_rate": 8e-05, + "loss": 43.9533, + "num_input_tokens_seen": 220720920, + "step": 4287 + }, + { + "epoch": 0.42161126262254, + "grad_norm": 31.85966682434082, + "learning_rate": 8e-05, + "loss": 40.4081, + "num_input_tokens_seen": 220865572, + "step": 4290 + }, + { + "epoch": 0.421906095673325, + "grad_norm": 30.02121353149414, + "learning_rate": 8e-05, + "loss": 45.8891, + "num_input_tokens_seen": 221000948, + "step": 4293 + }, + { + "epoch": 0.42220092872410997, + "grad_norm": 34.88405227661133, + "learning_rate": 8e-05, + "loss": 43.8774, + "num_input_tokens_seen": 221152948, + "step": 4296 + }, + { + "epoch": 0.42249576177489495, + "grad_norm": 29.184438705444336, + "learning_rate": 8e-05, + "loss": 45.0154, + "num_input_tokens_seen": 221312820, + "step": 4299 + }, + { + "epoch": 0.42279059482567993, + "grad_norm": 28.55838394165039, + "learning_rate": 8e-05, + "loss": 41.2003, + "num_input_tokens_seen": 221458184, + "step": 4302 + }, + { + "epoch": 0.42308542787646497, + "grad_norm": 27.928804397583008, + "learning_rate": 8e-05, + "loss": 42.1826, + "num_input_tokens_seen": 221616952, + "step": 4305 + }, + { + "epoch": 0.42338026092724995, + "grad_norm": 26.76680564880371, + "learning_rate": 8e-05, + "loss": 40.2512, + "num_input_tokens_seen": 221761448, + "step": 4308 + }, + { + "epoch": 0.42367509397803493, + "grad_norm": 28.121938705444336, + "learning_rate": 8e-05, + "loss": 44.0944, + "num_input_tokens_seen": 221938332, + "step": 4311 + }, + { + "epoch": 0.4239699270288199, + "grad_norm": 31.460044860839844, + "learning_rate": 8e-05, + "loss": 41.3197, + "num_input_tokens_seen": 222095052, + "step": 4314 + }, + { + "epoch": 0.4242647600796049, + "grad_norm": 28.240819931030273, + "learning_rate": 8e-05, + "loss": 43.4479, + "num_input_tokens_seen": 222245720, + "step": 4317 + }, + { + "epoch": 0.42455959313038993, + "grad_norm": 29.07929229736328, + "learning_rate": 8e-05, + "loss": 47.6133, + "num_input_tokens_seen": 222377240, + "step": 4320 + }, + { + "epoch": 0.4248544261811749, + "grad_norm": 29.97142791748047, + "learning_rate": 8e-05, + "loss": 43.3428, + "num_input_tokens_seen": 222550048, + "step": 4323 + }, + { + "epoch": 0.4251492592319599, + "grad_norm": 34.53770446777344, + "learning_rate": 8e-05, + "loss": 38.3135, + "num_input_tokens_seen": 222694712, + "step": 4326 + }, + { + "epoch": 0.4254440922827449, + "grad_norm": 27.796838760375977, + "learning_rate": 8e-05, + "loss": 41.4311, + "num_input_tokens_seen": 222846660, + "step": 4329 + }, + { + "epoch": 0.4257389253335299, + "grad_norm": 30.644004821777344, + "learning_rate": 8e-05, + "loss": 46.5237, + "num_input_tokens_seen": 222995488, + "step": 4332 + }, + { + "epoch": 0.4260337583843149, + "grad_norm": 25.572084426879883, + "learning_rate": 8e-05, + "loss": 41.2685, + "num_input_tokens_seen": 223141724, + "step": 4335 + }, + { + "epoch": 0.4263285914350999, + "grad_norm": 26.065940856933594, + "learning_rate": 8e-05, + "loss": 42.0256, + "num_input_tokens_seen": 223288620, + "step": 4338 + }, + { + "epoch": 0.42662342448588486, + "grad_norm": 35.70410919189453, + "learning_rate": 8e-05, + "loss": 45.6876, + "num_input_tokens_seen": 223451668, + "step": 4341 + }, + { + "epoch": 0.42691825753666984, + "grad_norm": 21.78587532043457, + "learning_rate": 8e-05, + "loss": 39.8147, + "num_input_tokens_seen": 223600916, + "step": 4344 + }, + { + "epoch": 0.4272130905874549, + "grad_norm": 26.8822078704834, + "learning_rate": 8e-05, + "loss": 44.0621, + "num_input_tokens_seen": 223762988, + "step": 4347 + }, + { + "epoch": 0.42750792363823986, + "grad_norm": 26.29570960998535, + "learning_rate": 8e-05, + "loss": 42.4211, + "num_input_tokens_seen": 223927452, + "step": 4350 + }, + { + "epoch": 0.42780275668902484, + "grad_norm": 27.68350601196289, + "learning_rate": 8e-05, + "loss": 42.0873, + "num_input_tokens_seen": 224105740, + "step": 4353 + }, + { + "epoch": 0.4280975897398098, + "grad_norm": 25.874874114990234, + "learning_rate": 8e-05, + "loss": 37.8914, + "num_input_tokens_seen": 224268092, + "step": 4356 + }, + { + "epoch": 0.4283924227905948, + "grad_norm": 29.17751121520996, + "learning_rate": 8e-05, + "loss": 42.4487, + "num_input_tokens_seen": 224435168, + "step": 4359 + }, + { + "epoch": 0.42868725584137984, + "grad_norm": 30.268043518066406, + "learning_rate": 8e-05, + "loss": 44.7985, + "num_input_tokens_seen": 224577196, + "step": 4362 + }, + { + "epoch": 0.4289820888921648, + "grad_norm": 24.906023025512695, + "learning_rate": 8e-05, + "loss": 38.5836, + "num_input_tokens_seen": 224734276, + "step": 4365 + }, + { + "epoch": 0.4292769219429498, + "grad_norm": 37.84556198120117, + "learning_rate": 8e-05, + "loss": 44.6142, + "num_input_tokens_seen": 224892396, + "step": 4368 + }, + { + "epoch": 0.4295717549937348, + "grad_norm": 26.417160034179688, + "learning_rate": 8e-05, + "loss": 39.8458, + "num_input_tokens_seen": 225043828, + "step": 4371 + }, + { + "epoch": 0.42986658804451977, + "grad_norm": 27.29924774169922, + "learning_rate": 8e-05, + "loss": 38.1589, + "num_input_tokens_seen": 225198824, + "step": 4374 + }, + { + "epoch": 0.4301614210953048, + "grad_norm": 39.75818634033203, + "learning_rate": 8e-05, + "loss": 47.3034, + "num_input_tokens_seen": 225340136, + "step": 4377 + }, + { + "epoch": 0.4304562541460898, + "grad_norm": 28.143962860107422, + "learning_rate": 8e-05, + "loss": 42.0063, + "num_input_tokens_seen": 225469012, + "step": 4380 + }, + { + "epoch": 0.43075108719687477, + "grad_norm": 31.327285766601562, + "learning_rate": 8e-05, + "loss": 42.4765, + "num_input_tokens_seen": 225608504, + "step": 4383 + }, + { + "epoch": 0.43104592024765975, + "grad_norm": 27.978796005249023, + "learning_rate": 8e-05, + "loss": 39.2096, + "num_input_tokens_seen": 225760756, + "step": 4386 + }, + { + "epoch": 0.43134075329844473, + "grad_norm": 28.254173278808594, + "learning_rate": 8e-05, + "loss": 45.1552, + "num_input_tokens_seen": 225918220, + "step": 4389 + }, + { + "epoch": 0.43163558634922977, + "grad_norm": 28.915897369384766, + "learning_rate": 8e-05, + "loss": 43.1089, + "num_input_tokens_seen": 226082244, + "step": 4392 + }, + { + "epoch": 0.43193041940001475, + "grad_norm": 28.157947540283203, + "learning_rate": 8e-05, + "loss": 41.2043, + "num_input_tokens_seen": 226247280, + "step": 4395 + }, + { + "epoch": 0.43222525245079974, + "grad_norm": 28.28232765197754, + "learning_rate": 8e-05, + "loss": 43.0462, + "num_input_tokens_seen": 226400352, + "step": 4398 + }, + { + "epoch": 0.4325200855015847, + "grad_norm": 25.633373260498047, + "learning_rate": 8e-05, + "loss": 43.7799, + "num_input_tokens_seen": 226565792, + "step": 4401 + }, + { + "epoch": 0.4328149185523697, + "grad_norm": 27.941856384277344, + "learning_rate": 8e-05, + "loss": 43.6231, + "num_input_tokens_seen": 226710228, + "step": 4404 + }, + { + "epoch": 0.43310975160315474, + "grad_norm": 28.795846939086914, + "learning_rate": 8e-05, + "loss": 43.6229, + "num_input_tokens_seen": 226848448, + "step": 4407 + }, + { + "epoch": 0.4334045846539397, + "grad_norm": 33.9630241394043, + "learning_rate": 8e-05, + "loss": 39.9804, + "num_input_tokens_seen": 226998428, + "step": 4410 + }, + { + "epoch": 0.4336994177047247, + "grad_norm": 24.979286193847656, + "learning_rate": 8e-05, + "loss": 40.9833, + "num_input_tokens_seen": 227156348, + "step": 4413 + }, + { + "epoch": 0.4339942507555097, + "grad_norm": 29.209545135498047, + "learning_rate": 8e-05, + "loss": 38.7767, + "num_input_tokens_seen": 227288052, + "step": 4416 + }, + { + "epoch": 0.43428908380629466, + "grad_norm": 33.29966354370117, + "learning_rate": 8e-05, + "loss": 42.0673, + "num_input_tokens_seen": 227429904, + "step": 4419 + }, + { + "epoch": 0.4345839168570797, + "grad_norm": 24.679658889770508, + "learning_rate": 8e-05, + "loss": 42.4092, + "num_input_tokens_seen": 227604476, + "step": 4422 + }, + { + "epoch": 0.4348787499078647, + "grad_norm": 29.85538673400879, + "learning_rate": 8e-05, + "loss": 39.9113, + "num_input_tokens_seen": 227752576, + "step": 4425 + }, + { + "epoch": 0.43517358295864966, + "grad_norm": 29.75106430053711, + "learning_rate": 8e-05, + "loss": 44.6768, + "num_input_tokens_seen": 227894956, + "step": 4428 + }, + { + "epoch": 0.43546841600943464, + "grad_norm": 25.508901596069336, + "learning_rate": 8e-05, + "loss": 39.05, + "num_input_tokens_seen": 228065148, + "step": 4431 + }, + { + "epoch": 0.4357632490602196, + "grad_norm": 28.596651077270508, + "learning_rate": 8e-05, + "loss": 40.1365, + "num_input_tokens_seen": 228228056, + "step": 4434 + }, + { + "epoch": 0.43605808211100466, + "grad_norm": 45.46236038208008, + "learning_rate": 8e-05, + "loss": 42.0655, + "num_input_tokens_seen": 228375532, + "step": 4437 + }, + { + "epoch": 0.43635291516178965, + "grad_norm": 105.65780639648438, + "learning_rate": 8e-05, + "loss": 39.8979, + "num_input_tokens_seen": 228520848, + "step": 4440 + }, + { + "epoch": 0.4366477482125746, + "grad_norm": 27.998929977416992, + "learning_rate": 8e-05, + "loss": 45.1238, + "num_input_tokens_seen": 228682420, + "step": 4443 + }, + { + "epoch": 0.4369425812633596, + "grad_norm": 31.12704086303711, + "learning_rate": 8e-05, + "loss": 42.5966, + "num_input_tokens_seen": 228826300, + "step": 4446 + }, + { + "epoch": 0.4372374143141446, + "grad_norm": 29.78077507019043, + "learning_rate": 8e-05, + "loss": 43.7748, + "num_input_tokens_seen": 228999788, + "step": 4449 + }, + { + "epoch": 0.4375322473649296, + "grad_norm": 26.121967315673828, + "learning_rate": 8e-05, + "loss": 44.1384, + "num_input_tokens_seen": 229163636, + "step": 4452 + }, + { + "epoch": 0.4378270804157146, + "grad_norm": 27.11899757385254, + "learning_rate": 8e-05, + "loss": 42.037, + "num_input_tokens_seen": 229301104, + "step": 4455 + }, + { + "epoch": 0.4381219134664996, + "grad_norm": 27.865236282348633, + "learning_rate": 8e-05, + "loss": 41.5314, + "num_input_tokens_seen": 229470728, + "step": 4458 + }, + { + "epoch": 0.4384167465172846, + "grad_norm": 26.886306762695312, + "learning_rate": 8e-05, + "loss": 43.424, + "num_input_tokens_seen": 229639428, + "step": 4461 + }, + { + "epoch": 0.43871157956806955, + "grad_norm": 30.441774368286133, + "learning_rate": 8e-05, + "loss": 44.4104, + "num_input_tokens_seen": 229773216, + "step": 4464 + }, + { + "epoch": 0.4390064126188546, + "grad_norm": 26.78765869140625, + "learning_rate": 8e-05, + "loss": 44.3086, + "num_input_tokens_seen": 229930740, + "step": 4467 + }, + { + "epoch": 0.4393012456696396, + "grad_norm": 28.085712432861328, + "learning_rate": 8e-05, + "loss": 39.6616, + "num_input_tokens_seen": 230100392, + "step": 4470 + }, + { + "epoch": 0.43959607872042455, + "grad_norm": 28.30702781677246, + "learning_rate": 8e-05, + "loss": 42.1998, + "num_input_tokens_seen": 230261584, + "step": 4473 + }, + { + "epoch": 0.43989091177120954, + "grad_norm": 26.2158260345459, + "learning_rate": 8e-05, + "loss": 42.0976, + "num_input_tokens_seen": 230425084, + "step": 4476 + }, + { + "epoch": 0.4401857448219945, + "grad_norm": 32.34695053100586, + "learning_rate": 8e-05, + "loss": 46.1886, + "num_input_tokens_seen": 230576716, + "step": 4479 + }, + { + "epoch": 0.44048057787277956, + "grad_norm": 26.81767463684082, + "learning_rate": 8e-05, + "loss": 40.8735, + "num_input_tokens_seen": 230736824, + "step": 4482 + }, + { + "epoch": 0.44077541092356454, + "grad_norm": 26.143571853637695, + "learning_rate": 8e-05, + "loss": 40.0293, + "num_input_tokens_seen": 230870868, + "step": 4485 + }, + { + "epoch": 0.4410702439743495, + "grad_norm": 26.403610229492188, + "learning_rate": 8e-05, + "loss": 41.7636, + "num_input_tokens_seen": 231008172, + "step": 4488 + }, + { + "epoch": 0.4413650770251345, + "grad_norm": 26.66645050048828, + "learning_rate": 8e-05, + "loss": 42.2956, + "num_input_tokens_seen": 231173044, + "step": 4491 + }, + { + "epoch": 0.44165991007591954, + "grad_norm": 26.82290267944336, + "learning_rate": 8e-05, + "loss": 41.3145, + "num_input_tokens_seen": 231342420, + "step": 4494 + }, + { + "epoch": 0.4419547431267045, + "grad_norm": 26.77899169921875, + "learning_rate": 8e-05, + "loss": 37.7972, + "num_input_tokens_seen": 231483824, + "step": 4497 + }, + { + "epoch": 0.4422495761774895, + "grad_norm": 27.854171752929688, + "learning_rate": 8e-05, + "loss": 41.4553, + "num_input_tokens_seen": 231629132, + "step": 4500 + }, + { + "epoch": 0.4425444092282745, + "grad_norm": 26.862014770507812, + "learning_rate": 8e-05, + "loss": 40.6043, + "num_input_tokens_seen": 231776520, + "step": 4503 + }, + { + "epoch": 0.44283924227905946, + "grad_norm": 24.268115997314453, + "learning_rate": 8e-05, + "loss": 40.1136, + "num_input_tokens_seen": 231962636, + "step": 4506 + }, + { + "epoch": 0.4431340753298445, + "grad_norm": 30.513568878173828, + "learning_rate": 8e-05, + "loss": 40.4665, + "num_input_tokens_seen": 232125376, + "step": 4509 + }, + { + "epoch": 0.4434289083806295, + "grad_norm": 26.28969955444336, + "learning_rate": 8e-05, + "loss": 39.5851, + "num_input_tokens_seen": 232266024, + "step": 4512 + }, + { + "epoch": 0.44372374143141446, + "grad_norm": 27.46841812133789, + "learning_rate": 8e-05, + "loss": 44.978, + "num_input_tokens_seen": 232420048, + "step": 4515 + }, + { + "epoch": 0.44401857448219945, + "grad_norm": 25.948152542114258, + "learning_rate": 8e-05, + "loss": 43.6427, + "num_input_tokens_seen": 232570936, + "step": 4518 + }, + { + "epoch": 0.44431340753298443, + "grad_norm": 27.499948501586914, + "learning_rate": 8e-05, + "loss": 44.9228, + "num_input_tokens_seen": 232737172, + "step": 4521 + }, + { + "epoch": 0.44460824058376947, + "grad_norm": 28.01111602783203, + "learning_rate": 8e-05, + "loss": 43.929, + "num_input_tokens_seen": 232890980, + "step": 4524 + }, + { + "epoch": 0.44490307363455445, + "grad_norm": 27.918964385986328, + "learning_rate": 8e-05, + "loss": 38.9864, + "num_input_tokens_seen": 233027788, + "step": 4527 + }, + { + "epoch": 0.44519790668533943, + "grad_norm": 29.82014274597168, + "learning_rate": 8e-05, + "loss": 42.5304, + "num_input_tokens_seen": 233198452, + "step": 4530 + }, + { + "epoch": 0.4454927397361244, + "grad_norm": 26.141271591186523, + "learning_rate": 8e-05, + "loss": 41.4499, + "num_input_tokens_seen": 233350428, + "step": 4533 + }, + { + "epoch": 0.4457875727869094, + "grad_norm": 57.809268951416016, + "learning_rate": 8e-05, + "loss": 39.5788, + "num_input_tokens_seen": 233493092, + "step": 4536 + }, + { + "epoch": 0.44608240583769443, + "grad_norm": 56.92467498779297, + "learning_rate": 8e-05, + "loss": 39.9422, + "num_input_tokens_seen": 233646260, + "step": 4539 + }, + { + "epoch": 0.4463772388884794, + "grad_norm": 25.191301345825195, + "learning_rate": 8e-05, + "loss": 42.8789, + "num_input_tokens_seen": 233789360, + "step": 4542 + }, + { + "epoch": 0.4466720719392644, + "grad_norm": 31.032283782958984, + "learning_rate": 8e-05, + "loss": 42.8801, + "num_input_tokens_seen": 233939884, + "step": 4545 + }, + { + "epoch": 0.4469669049900494, + "grad_norm": 24.119443893432617, + "learning_rate": 8e-05, + "loss": 38.5954, + "num_input_tokens_seen": 234093744, + "step": 4548 + }, + { + "epoch": 0.44726173804083436, + "grad_norm": 30.361286163330078, + "learning_rate": 8e-05, + "loss": 42.6672, + "num_input_tokens_seen": 234253980, + "step": 4551 + }, + { + "epoch": 0.4475565710916194, + "grad_norm": 26.771743774414062, + "learning_rate": 8e-05, + "loss": 38.8298, + "num_input_tokens_seen": 234412916, + "step": 4554 + }, + { + "epoch": 0.4478514041424044, + "grad_norm": 31.644166946411133, + "learning_rate": 8e-05, + "loss": 45.0399, + "num_input_tokens_seen": 234564704, + "step": 4557 + }, + { + "epoch": 0.44814623719318936, + "grad_norm": 27.465606689453125, + "learning_rate": 8e-05, + "loss": 37.7804, + "num_input_tokens_seen": 234719408, + "step": 4560 + }, + { + "epoch": 0.44844107024397434, + "grad_norm": 25.798643112182617, + "learning_rate": 8e-05, + "loss": 41.3184, + "num_input_tokens_seen": 234877952, + "step": 4563 + }, + { + "epoch": 0.4487359032947593, + "grad_norm": 25.45863914489746, + "learning_rate": 8e-05, + "loss": 41.4128, + "num_input_tokens_seen": 235018628, + "step": 4566 + }, + { + "epoch": 0.44903073634554436, + "grad_norm": 27.793365478515625, + "learning_rate": 8e-05, + "loss": 45.1726, + "num_input_tokens_seen": 235183500, + "step": 4569 + }, + { + "epoch": 0.44932556939632934, + "grad_norm": 24.252897262573242, + "learning_rate": 8e-05, + "loss": 42.8122, + "num_input_tokens_seen": 235353172, + "step": 4572 + }, + { + "epoch": 0.4496204024471143, + "grad_norm": 26.199750900268555, + "learning_rate": 8e-05, + "loss": 41.2429, + "num_input_tokens_seen": 235498740, + "step": 4575 + }, + { + "epoch": 0.4499152354978993, + "grad_norm": 26.499221801757812, + "learning_rate": 8e-05, + "loss": 42.1125, + "num_input_tokens_seen": 235684608, + "step": 4578 + }, + { + "epoch": 0.4502100685486843, + "grad_norm": 26.480804443359375, + "learning_rate": 8e-05, + "loss": 40.9153, + "num_input_tokens_seen": 235850480, + "step": 4581 + }, + { + "epoch": 0.4505049015994693, + "grad_norm": 26.42413902282715, + "learning_rate": 8e-05, + "loss": 42.1996, + "num_input_tokens_seen": 236034140, + "step": 4584 + }, + { + "epoch": 0.4507997346502543, + "grad_norm": 24.659976959228516, + "learning_rate": 8e-05, + "loss": 39.7046, + "num_input_tokens_seen": 236180908, + "step": 4587 + }, + { + "epoch": 0.4510945677010393, + "grad_norm": 31.82207679748535, + "learning_rate": 8e-05, + "loss": 40.1822, + "num_input_tokens_seen": 236331080, + "step": 4590 + }, + { + "epoch": 0.45138940075182427, + "grad_norm": 22.85356903076172, + "learning_rate": 8e-05, + "loss": 37.4057, + "num_input_tokens_seen": 236491800, + "step": 4593 + }, + { + "epoch": 0.45168423380260925, + "grad_norm": 35.563621520996094, + "learning_rate": 8e-05, + "loss": 41.0396, + "num_input_tokens_seen": 236637464, + "step": 4596 + }, + { + "epoch": 0.4519790668533943, + "grad_norm": 36.02199172973633, + "learning_rate": 8e-05, + "loss": 45.6636, + "num_input_tokens_seen": 236803816, + "step": 4599 + }, + { + "epoch": 0.45227389990417927, + "grad_norm": 28.48723793029785, + "learning_rate": 8e-05, + "loss": 39.8946, + "num_input_tokens_seen": 236949088, + "step": 4602 + }, + { + "epoch": 0.45256873295496425, + "grad_norm": 29.46847915649414, + "learning_rate": 8e-05, + "loss": 40.2725, + "num_input_tokens_seen": 237105976, + "step": 4605 + }, + { + "epoch": 0.45286356600574923, + "grad_norm": 24.744489669799805, + "learning_rate": 8e-05, + "loss": 42.5878, + "num_input_tokens_seen": 237259064, + "step": 4608 + }, + { + "epoch": 0.4531583990565342, + "grad_norm": 28.40730857849121, + "learning_rate": 8e-05, + "loss": 41.7184, + "num_input_tokens_seen": 237429340, + "step": 4611 + }, + { + "epoch": 0.45345323210731925, + "grad_norm": 26.473224639892578, + "learning_rate": 8e-05, + "loss": 43.0435, + "num_input_tokens_seen": 237587500, + "step": 4614 + }, + { + "epoch": 0.45374806515810423, + "grad_norm": 26.486181259155273, + "learning_rate": 8e-05, + "loss": 40.3067, + "num_input_tokens_seen": 237748580, + "step": 4617 + }, + { + "epoch": 0.4540428982088892, + "grad_norm": 38.76091766357422, + "learning_rate": 8e-05, + "loss": 39.2949, + "num_input_tokens_seen": 237924340, + "step": 4620 + }, + { + "epoch": 0.4543377312596742, + "grad_norm": 27.670326232910156, + "learning_rate": 8e-05, + "loss": 40.5704, + "num_input_tokens_seen": 238066828, + "step": 4623 + }, + { + "epoch": 0.4546325643104592, + "grad_norm": 40.92820739746094, + "learning_rate": 8e-05, + "loss": 44.0541, + "num_input_tokens_seen": 238220812, + "step": 4626 + }, + { + "epoch": 0.4549273973612442, + "grad_norm": 27.452932357788086, + "learning_rate": 8e-05, + "loss": 42.7587, + "num_input_tokens_seen": 238384084, + "step": 4629 + }, + { + "epoch": 0.4552222304120292, + "grad_norm": 25.271718978881836, + "learning_rate": 8e-05, + "loss": 42.8695, + "num_input_tokens_seen": 238523988, + "step": 4632 + }, + { + "epoch": 0.4555170634628142, + "grad_norm": 23.655532836914062, + "learning_rate": 8e-05, + "loss": 41.0458, + "num_input_tokens_seen": 238669068, + "step": 4635 + }, + { + "epoch": 0.45581189651359916, + "grad_norm": 28.211811065673828, + "learning_rate": 8e-05, + "loss": 43.0585, + "num_input_tokens_seen": 238825804, + "step": 4638 + }, + { + "epoch": 0.4561067295643842, + "grad_norm": 28.751880645751953, + "learning_rate": 8e-05, + "loss": 40.9551, + "num_input_tokens_seen": 238982256, + "step": 4641 + }, + { + "epoch": 0.4564015626151692, + "grad_norm": 25.98259925842285, + "learning_rate": 8e-05, + "loss": 39.8824, + "num_input_tokens_seen": 239180620, + "step": 4644 + }, + { + "epoch": 0.45669639566595416, + "grad_norm": 28.03835678100586, + "learning_rate": 8e-05, + "loss": 44.4102, + "num_input_tokens_seen": 239351024, + "step": 4647 + }, + { + "epoch": 0.45699122871673914, + "grad_norm": 26.699846267700195, + "learning_rate": 8e-05, + "loss": 44.0684, + "num_input_tokens_seen": 239523072, + "step": 4650 + }, + { + "epoch": 0.4572860617675241, + "grad_norm": 24.692668914794922, + "learning_rate": 8e-05, + "loss": 36.3624, + "num_input_tokens_seen": 239675340, + "step": 4653 + }, + { + "epoch": 0.45758089481830916, + "grad_norm": 45.91609191894531, + "learning_rate": 8e-05, + "loss": 42.5926, + "num_input_tokens_seen": 239824032, + "step": 4656 + }, + { + "epoch": 0.45787572786909414, + "grad_norm": 29.405675888061523, + "learning_rate": 8e-05, + "loss": 37.3812, + "num_input_tokens_seen": 239957640, + "step": 4659 + }, + { + "epoch": 0.4581705609198791, + "grad_norm": 27.97926139831543, + "learning_rate": 8e-05, + "loss": 40.5367, + "num_input_tokens_seen": 240118696, + "step": 4662 + }, + { + "epoch": 0.4584653939706641, + "grad_norm": 41.609432220458984, + "learning_rate": 8e-05, + "loss": 39.9011, + "num_input_tokens_seen": 240257624, + "step": 4665 + }, + { + "epoch": 0.4587602270214491, + "grad_norm": 28.44548797607422, + "learning_rate": 8e-05, + "loss": 40.396, + "num_input_tokens_seen": 240413784, + "step": 4668 + }, + { + "epoch": 0.4590550600722341, + "grad_norm": 27.161584854125977, + "learning_rate": 8e-05, + "loss": 41.7049, + "num_input_tokens_seen": 240559368, + "step": 4671 + }, + { + "epoch": 0.4593498931230191, + "grad_norm": 28.079381942749023, + "learning_rate": 8e-05, + "loss": 45.1337, + "num_input_tokens_seen": 240711084, + "step": 4674 + }, + { + "epoch": 0.4596447261738041, + "grad_norm": 31.016523361206055, + "learning_rate": 8e-05, + "loss": 40.1331, + "num_input_tokens_seen": 240873416, + "step": 4677 + }, + { + "epoch": 0.45993955922458907, + "grad_norm": 29.084857940673828, + "learning_rate": 8e-05, + "loss": 43.6345, + "num_input_tokens_seen": 241011492, + "step": 4680 + }, + { + "epoch": 0.46023439227537405, + "grad_norm": 24.55000877380371, + "learning_rate": 8e-05, + "loss": 40.8207, + "num_input_tokens_seen": 241161368, + "step": 4683 + }, + { + "epoch": 0.4605292253261591, + "grad_norm": 28.673322677612305, + "learning_rate": 8e-05, + "loss": 40.0417, + "num_input_tokens_seen": 241311936, + "step": 4686 + }, + { + "epoch": 0.46082405837694407, + "grad_norm": 25.411218643188477, + "learning_rate": 8e-05, + "loss": 38.8036, + "num_input_tokens_seen": 241464440, + "step": 4689 + }, + { + "epoch": 0.46111889142772905, + "grad_norm": 20.186403274536133, + "learning_rate": 8e-05, + "loss": 36.3097, + "num_input_tokens_seen": 241636584, + "step": 4692 + }, + { + "epoch": 0.46141372447851403, + "grad_norm": 30.097230911254883, + "learning_rate": 8e-05, + "loss": 40.7838, + "num_input_tokens_seen": 241809340, + "step": 4695 + }, + { + "epoch": 0.461708557529299, + "grad_norm": 31.64427947998047, + "learning_rate": 8e-05, + "loss": 46.0323, + "num_input_tokens_seen": 241963564, + "step": 4698 + }, + { + "epoch": 0.46200339058008405, + "grad_norm": 28.308427810668945, + "learning_rate": 8e-05, + "loss": 40.2799, + "num_input_tokens_seen": 242148272, + "step": 4701 + }, + { + "epoch": 0.46229822363086903, + "grad_norm": 25.227632522583008, + "learning_rate": 8e-05, + "loss": 41.1382, + "num_input_tokens_seen": 242279320, + "step": 4704 + }, + { + "epoch": 0.462593056681654, + "grad_norm": 28.968778610229492, + "learning_rate": 8e-05, + "loss": 42.7915, + "num_input_tokens_seen": 242449204, + "step": 4707 + }, + { + "epoch": 0.462887889732439, + "grad_norm": 31.740821838378906, + "learning_rate": 8e-05, + "loss": 45.0049, + "num_input_tokens_seen": 242613292, + "step": 4710 + }, + { + "epoch": 0.463182722783224, + "grad_norm": 29.082109451293945, + "learning_rate": 8e-05, + "loss": 39.2121, + "num_input_tokens_seen": 242768116, + "step": 4713 + }, + { + "epoch": 0.463477555834009, + "grad_norm": 24.574909210205078, + "learning_rate": 8e-05, + "loss": 41.5347, + "num_input_tokens_seen": 242905276, + "step": 4716 + }, + { + "epoch": 0.463772388884794, + "grad_norm": 28.00779914855957, + "learning_rate": 8e-05, + "loss": 41.8152, + "num_input_tokens_seen": 243051876, + "step": 4719 + }, + { + "epoch": 0.464067221935579, + "grad_norm": 25.955181121826172, + "learning_rate": 8e-05, + "loss": 38.6152, + "num_input_tokens_seen": 243200648, + "step": 4722 + }, + { + "epoch": 0.46436205498636396, + "grad_norm": 29.894330978393555, + "learning_rate": 8e-05, + "loss": 37.0003, + "num_input_tokens_seen": 243360924, + "step": 4725 + }, + { + "epoch": 0.46465688803714894, + "grad_norm": 27.062658309936523, + "learning_rate": 8e-05, + "loss": 41.574, + "num_input_tokens_seen": 243529096, + "step": 4728 + }, + { + "epoch": 0.464951721087934, + "grad_norm": 32.036903381347656, + "learning_rate": 8e-05, + "loss": 43.9895, + "num_input_tokens_seen": 243677944, + "step": 4731 + }, + { + "epoch": 0.46524655413871896, + "grad_norm": 29.033061981201172, + "learning_rate": 8e-05, + "loss": 40.0667, + "num_input_tokens_seen": 243829408, + "step": 4734 + }, + { + "epoch": 0.46554138718950394, + "grad_norm": 27.246234893798828, + "learning_rate": 8e-05, + "loss": 38.1965, + "num_input_tokens_seen": 243986544, + "step": 4737 + }, + { + "epoch": 0.4658362202402889, + "grad_norm": 30.715890884399414, + "learning_rate": 8e-05, + "loss": 41.338, + "num_input_tokens_seen": 244140580, + "step": 4740 + }, + { + "epoch": 0.4661310532910739, + "grad_norm": 25.131591796875, + "learning_rate": 8e-05, + "loss": 39.5073, + "num_input_tokens_seen": 244284620, + "step": 4743 + }, + { + "epoch": 0.46642588634185894, + "grad_norm": 27.105449676513672, + "learning_rate": 8e-05, + "loss": 42.4629, + "num_input_tokens_seen": 244422736, + "step": 4746 + }, + { + "epoch": 0.4667207193926439, + "grad_norm": 25.411161422729492, + "learning_rate": 8e-05, + "loss": 39.7882, + "num_input_tokens_seen": 244575136, + "step": 4749 + }, + { + "epoch": 0.4670155524434289, + "grad_norm": 27.345781326293945, + "learning_rate": 8e-05, + "loss": 38.6773, + "num_input_tokens_seen": 244722500, + "step": 4752 + }, + { + "epoch": 0.4673103854942139, + "grad_norm": 29.45749282836914, + "learning_rate": 8e-05, + "loss": 40.3697, + "num_input_tokens_seen": 244873336, + "step": 4755 + }, + { + "epoch": 0.46760521854499887, + "grad_norm": 29.298477172851562, + "learning_rate": 8e-05, + "loss": 42.2155, + "num_input_tokens_seen": 245010504, + "step": 4758 + }, + { + "epoch": 0.4679000515957839, + "grad_norm": 27.368099212646484, + "learning_rate": 8e-05, + "loss": 41.6353, + "num_input_tokens_seen": 245187116, + "step": 4761 + }, + { + "epoch": 0.4681948846465689, + "grad_norm": 25.604515075683594, + "learning_rate": 8e-05, + "loss": 41.4931, + "num_input_tokens_seen": 245352472, + "step": 4764 + }, + { + "epoch": 0.46848971769735387, + "grad_norm": 26.513587951660156, + "learning_rate": 8e-05, + "loss": 40.9986, + "num_input_tokens_seen": 245524364, + "step": 4767 + }, + { + "epoch": 0.46878455074813885, + "grad_norm": 23.54816246032715, + "learning_rate": 8e-05, + "loss": 41.5214, + "num_input_tokens_seen": 245676924, + "step": 4770 + }, + { + "epoch": 0.46907938379892383, + "grad_norm": 26.805103302001953, + "learning_rate": 8e-05, + "loss": 37.1733, + "num_input_tokens_seen": 245828260, + "step": 4773 + }, + { + "epoch": 0.46937421684970887, + "grad_norm": 30.842025756835938, + "learning_rate": 8e-05, + "loss": 40.2481, + "num_input_tokens_seen": 245993472, + "step": 4776 + }, + { + "epoch": 0.46966904990049385, + "grad_norm": 26.739294052124023, + "learning_rate": 8e-05, + "loss": 36.3944, + "num_input_tokens_seen": 246172744, + "step": 4779 + }, + { + "epoch": 0.46996388295127883, + "grad_norm": 33.225921630859375, + "learning_rate": 8e-05, + "loss": 44.1665, + "num_input_tokens_seen": 246331176, + "step": 4782 + }, + { + "epoch": 0.4702587160020638, + "grad_norm": 28.183168411254883, + "learning_rate": 8e-05, + "loss": 43.6505, + "num_input_tokens_seen": 246481616, + "step": 4785 + }, + { + "epoch": 0.47055354905284885, + "grad_norm": 26.746055603027344, + "learning_rate": 8e-05, + "loss": 42.6295, + "num_input_tokens_seen": 246633420, + "step": 4788 + }, + { + "epoch": 0.47084838210363383, + "grad_norm": 30.00533103942871, + "learning_rate": 8e-05, + "loss": 46.9378, + "num_input_tokens_seen": 246794108, + "step": 4791 + }, + { + "epoch": 0.4711432151544188, + "grad_norm": 24.430938720703125, + "learning_rate": 8e-05, + "loss": 42.2782, + "num_input_tokens_seen": 246945580, + "step": 4794 + }, + { + "epoch": 0.4714380482052038, + "grad_norm": 34.13113021850586, + "learning_rate": 8e-05, + "loss": 42.1327, + "num_input_tokens_seen": 247093080, + "step": 4797 + }, + { + "epoch": 0.4717328812559888, + "grad_norm": 26.586788177490234, + "learning_rate": 8e-05, + "loss": 38.0685, + "num_input_tokens_seen": 247236392, + "step": 4800 + }, + { + "epoch": 0.4720277143067738, + "grad_norm": 31.26131820678711, + "learning_rate": 8e-05, + "loss": 39.4935, + "num_input_tokens_seen": 247378920, + "step": 4803 + }, + { + "epoch": 0.4723225473575588, + "grad_norm": 27.38518714904785, + "learning_rate": 8e-05, + "loss": 40.852, + "num_input_tokens_seen": 247532256, + "step": 4806 + }, + { + "epoch": 0.4726173804083438, + "grad_norm": 27.600831985473633, + "learning_rate": 8e-05, + "loss": 37.4856, + "num_input_tokens_seen": 247698124, + "step": 4809 + }, + { + "epoch": 0.47291221345912876, + "grad_norm": 27.928665161132812, + "learning_rate": 8e-05, + "loss": 40.5996, + "num_input_tokens_seen": 247855388, + "step": 4812 + }, + { + "epoch": 0.47320704650991374, + "grad_norm": 29.014537811279297, + "learning_rate": 8e-05, + "loss": 45.1572, + "num_input_tokens_seen": 248008312, + "step": 4815 + }, + { + "epoch": 0.4735018795606988, + "grad_norm": 24.911880493164062, + "learning_rate": 8e-05, + "loss": 44.1829, + "num_input_tokens_seen": 248169076, + "step": 4818 + }, + { + "epoch": 0.47379671261148376, + "grad_norm": 28.43665313720703, + "learning_rate": 8e-05, + "loss": 38.3031, + "num_input_tokens_seen": 248336860, + "step": 4821 + }, + { + "epoch": 0.47409154566226874, + "grad_norm": 29.070302963256836, + "learning_rate": 8e-05, + "loss": 40.3561, + "num_input_tokens_seen": 248490776, + "step": 4824 + }, + { + "epoch": 0.4743863787130537, + "grad_norm": 26.32981300354004, + "learning_rate": 8e-05, + "loss": 41.947, + "num_input_tokens_seen": 248638392, + "step": 4827 + }, + { + "epoch": 0.4746812117638387, + "grad_norm": 28.874282836914062, + "learning_rate": 8e-05, + "loss": 43.5164, + "num_input_tokens_seen": 248789532, + "step": 4830 + }, + { + "epoch": 0.47497604481462374, + "grad_norm": 26.124303817749023, + "learning_rate": 8e-05, + "loss": 40.266, + "num_input_tokens_seen": 248957436, + "step": 4833 + }, + { + "epoch": 0.4752708778654087, + "grad_norm": 25.164854049682617, + "learning_rate": 8e-05, + "loss": 41.942, + "num_input_tokens_seen": 249109452, + "step": 4836 + }, + { + "epoch": 0.4755657109161937, + "grad_norm": 25.05072784423828, + "learning_rate": 8e-05, + "loss": 37.8196, + "num_input_tokens_seen": 249255184, + "step": 4839 + }, + { + "epoch": 0.4758605439669787, + "grad_norm": 27.30223846435547, + "learning_rate": 8e-05, + "loss": 39.4101, + "num_input_tokens_seen": 249397136, + "step": 4842 + }, + { + "epoch": 0.47615537701776367, + "grad_norm": 33.2149658203125, + "learning_rate": 8e-05, + "loss": 43.5654, + "num_input_tokens_seen": 249558888, + "step": 4845 + }, + { + "epoch": 0.4764502100685487, + "grad_norm": 25.12885856628418, + "learning_rate": 8e-05, + "loss": 41.83, + "num_input_tokens_seen": 249706876, + "step": 4848 + }, + { + "epoch": 0.4767450431193337, + "grad_norm": 27.43396759033203, + "learning_rate": 8e-05, + "loss": 41.058, + "num_input_tokens_seen": 249894004, + "step": 4851 + }, + { + "epoch": 0.47703987617011867, + "grad_norm": 27.396812438964844, + "learning_rate": 8e-05, + "loss": 35.1668, + "num_input_tokens_seen": 250050096, + "step": 4854 + }, + { + "epoch": 0.47733470922090365, + "grad_norm": 31.309123992919922, + "learning_rate": 8e-05, + "loss": 41.7648, + "num_input_tokens_seen": 250215436, + "step": 4857 + }, + { + "epoch": 0.47762954227168863, + "grad_norm": 34.78447341918945, + "learning_rate": 8e-05, + "loss": 43.7872, + "num_input_tokens_seen": 250357180, + "step": 4860 + }, + { + "epoch": 0.47792437532247367, + "grad_norm": 30.251394271850586, + "learning_rate": 8e-05, + "loss": 42.9092, + "num_input_tokens_seen": 250499540, + "step": 4863 + }, + { + "epoch": 0.47821920837325865, + "grad_norm": 31.96343231201172, + "learning_rate": 8e-05, + "loss": 42.077, + "num_input_tokens_seen": 250649300, + "step": 4866 + }, + { + "epoch": 0.47851404142404363, + "grad_norm": 26.872812271118164, + "learning_rate": 8e-05, + "loss": 41.2801, + "num_input_tokens_seen": 250813788, + "step": 4869 + }, + { + "epoch": 0.4788088744748286, + "grad_norm": 26.450227737426758, + "learning_rate": 8e-05, + "loss": 42.2355, + "num_input_tokens_seen": 250984424, + "step": 4872 + }, + { + "epoch": 0.4791037075256136, + "grad_norm": 28.50580596923828, + "learning_rate": 8e-05, + "loss": 42.1893, + "num_input_tokens_seen": 251143068, + "step": 4875 + }, + { + "epoch": 0.47939854057639864, + "grad_norm": 29.96875762939453, + "learning_rate": 8e-05, + "loss": 41.5831, + "num_input_tokens_seen": 251301592, + "step": 4878 + }, + { + "epoch": 0.4796933736271836, + "grad_norm": 25.501129150390625, + "learning_rate": 8e-05, + "loss": 39.8773, + "num_input_tokens_seen": 251466600, + "step": 4881 + }, + { + "epoch": 0.4799882066779686, + "grad_norm": 27.416034698486328, + "learning_rate": 8e-05, + "loss": 39.085, + "num_input_tokens_seen": 251626984, + "step": 4884 + }, + { + "epoch": 0.4802830397287536, + "grad_norm": 27.230443954467773, + "learning_rate": 8e-05, + "loss": 42.5695, + "num_input_tokens_seen": 251801908, + "step": 4887 + }, + { + "epoch": 0.48057787277953856, + "grad_norm": 24.647361755371094, + "learning_rate": 8e-05, + "loss": 40.7212, + "num_input_tokens_seen": 251952904, + "step": 4890 + }, + { + "epoch": 0.4808727058303236, + "grad_norm": 28.154359817504883, + "learning_rate": 8e-05, + "loss": 40.6429, + "num_input_tokens_seen": 252107504, + "step": 4893 + }, + { + "epoch": 0.4811675388811086, + "grad_norm": 29.207487106323242, + "learning_rate": 8e-05, + "loss": 41.0142, + "num_input_tokens_seen": 252256372, + "step": 4896 + }, + { + "epoch": 0.48146237193189356, + "grad_norm": 26.0395450592041, + "learning_rate": 8e-05, + "loss": 39.9626, + "num_input_tokens_seen": 252403820, + "step": 4899 + }, + { + "epoch": 0.48175720498267854, + "grad_norm": 24.243412017822266, + "learning_rate": 8e-05, + "loss": 39.2682, + "num_input_tokens_seen": 252560904, + "step": 4902 + }, + { + "epoch": 0.4820520380334635, + "grad_norm": 32.20317459106445, + "learning_rate": 8e-05, + "loss": 42.3904, + "num_input_tokens_seen": 252696252, + "step": 4905 + }, + { + "epoch": 0.48234687108424856, + "grad_norm": 33.271995544433594, + "learning_rate": 8e-05, + "loss": 42.833, + "num_input_tokens_seen": 252835252, + "step": 4908 + }, + { + "epoch": 0.48264170413503354, + "grad_norm": 32.50652313232422, + "learning_rate": 8e-05, + "loss": 40.7188, + "num_input_tokens_seen": 252977368, + "step": 4911 + }, + { + "epoch": 0.4829365371858185, + "grad_norm": 21.94767189025879, + "learning_rate": 8e-05, + "loss": 39.2929, + "num_input_tokens_seen": 253131608, + "step": 4914 + }, + { + "epoch": 0.4832313702366035, + "grad_norm": 25.962169647216797, + "learning_rate": 8e-05, + "loss": 40.0814, + "num_input_tokens_seen": 253289132, + "step": 4917 + }, + { + "epoch": 0.4835262032873885, + "grad_norm": 26.929094314575195, + "learning_rate": 8e-05, + "loss": 38.078, + "num_input_tokens_seen": 253461620, + "step": 4920 + }, + { + "epoch": 0.4838210363381735, + "grad_norm": 28.08810043334961, + "learning_rate": 8e-05, + "loss": 41.8377, + "num_input_tokens_seen": 253609532, + "step": 4923 + }, + { + "epoch": 0.4841158693889585, + "grad_norm": 27.889724731445312, + "learning_rate": 8e-05, + "loss": 39.9923, + "num_input_tokens_seen": 253759116, + "step": 4926 + }, + { + "epoch": 0.4844107024397435, + "grad_norm": 23.67461585998535, + "learning_rate": 8e-05, + "loss": 38.6749, + "num_input_tokens_seen": 253940252, + "step": 4929 + }, + { + "epoch": 0.48470553549052847, + "grad_norm": 25.49771499633789, + "learning_rate": 8e-05, + "loss": 39.5138, + "num_input_tokens_seen": 254101524, + "step": 4932 + }, + { + "epoch": 0.48500036854131345, + "grad_norm": 27.745712280273438, + "learning_rate": 8e-05, + "loss": 42.9393, + "num_input_tokens_seen": 254265708, + "step": 4935 + }, + { + "epoch": 0.4852952015920985, + "grad_norm": 27.445417404174805, + "learning_rate": 8e-05, + "loss": 43.3623, + "num_input_tokens_seen": 254427608, + "step": 4938 + }, + { + "epoch": 0.4855900346428835, + "grad_norm": 23.81502342224121, + "learning_rate": 8e-05, + "loss": 39.5791, + "num_input_tokens_seen": 254585740, + "step": 4941 + }, + { + "epoch": 0.48588486769366845, + "grad_norm": 27.11995506286621, + "learning_rate": 8e-05, + "loss": 41.631, + "num_input_tokens_seen": 254735444, + "step": 4944 + }, + { + "epoch": 0.48617970074445344, + "grad_norm": 28.302568435668945, + "learning_rate": 8e-05, + "loss": 39.621, + "num_input_tokens_seen": 254873632, + "step": 4947 + }, + { + "epoch": 0.4864745337952385, + "grad_norm": 25.918787002563477, + "learning_rate": 8e-05, + "loss": 36.2581, + "num_input_tokens_seen": 255018324, + "step": 4950 + }, + { + "epoch": 0.48676936684602345, + "grad_norm": 26.976848602294922, + "learning_rate": 8e-05, + "loss": 39.609, + "num_input_tokens_seen": 255170504, + "step": 4953 + }, + { + "epoch": 0.48706419989680844, + "grad_norm": 39.22269821166992, + "learning_rate": 8e-05, + "loss": 42.433, + "num_input_tokens_seen": 255327264, + "step": 4956 + }, + { + "epoch": 0.4873590329475934, + "grad_norm": 31.607404708862305, + "learning_rate": 8e-05, + "loss": 41.0775, + "num_input_tokens_seen": 255484364, + "step": 4959 + }, + { + "epoch": 0.4876538659983784, + "grad_norm": 25.68681526184082, + "learning_rate": 8e-05, + "loss": 39.5636, + "num_input_tokens_seen": 255642880, + "step": 4962 + }, + { + "epoch": 0.48794869904916344, + "grad_norm": 30.098154067993164, + "learning_rate": 8e-05, + "loss": 38.4692, + "num_input_tokens_seen": 255793688, + "step": 4965 + }, + { + "epoch": 0.4882435320999484, + "grad_norm": 32.68474197387695, + "learning_rate": 8e-05, + "loss": 38.1969, + "num_input_tokens_seen": 255951492, + "step": 4968 + }, + { + "epoch": 0.4885383651507334, + "grad_norm": 26.910491943359375, + "learning_rate": 8e-05, + "loss": 40.7008, + "num_input_tokens_seen": 256109592, + "step": 4971 + }, + { + "epoch": 0.4888331982015184, + "grad_norm": 29.480241775512695, + "learning_rate": 8e-05, + "loss": 40.1681, + "num_input_tokens_seen": 256263356, + "step": 4974 + }, + { + "epoch": 0.48912803125230336, + "grad_norm": 23.866363525390625, + "learning_rate": 8e-05, + "loss": 38.0946, + "num_input_tokens_seen": 256423220, + "step": 4977 + }, + { + "epoch": 0.4894228643030884, + "grad_norm": 41.37398910522461, + "learning_rate": 8e-05, + "loss": 40.5847, + "num_input_tokens_seen": 256572760, + "step": 4980 + }, + { + "epoch": 0.4897176973538734, + "grad_norm": 27.146371841430664, + "learning_rate": 8e-05, + "loss": 41.5017, + "num_input_tokens_seen": 256729552, + "step": 4983 + }, + { + "epoch": 0.49001253040465836, + "grad_norm": 25.587785720825195, + "learning_rate": 8e-05, + "loss": 38.1012, + "num_input_tokens_seen": 256882972, + "step": 4986 + }, + { + "epoch": 0.49030736345544335, + "grad_norm": 24.43895149230957, + "learning_rate": 8e-05, + "loss": 39.9138, + "num_input_tokens_seen": 257060216, + "step": 4989 + }, + { + "epoch": 0.4906021965062283, + "grad_norm": 24.547412872314453, + "learning_rate": 8e-05, + "loss": 38.8881, + "num_input_tokens_seen": 257207456, + "step": 4992 + }, + { + "epoch": 0.49089702955701336, + "grad_norm": 32.7952880859375, + "learning_rate": 8e-05, + "loss": 39.1508, + "num_input_tokens_seen": 257373996, + "step": 4995 + }, + { + "epoch": 0.49119186260779835, + "grad_norm": 29.346824645996094, + "learning_rate": 8e-05, + "loss": 41.7305, + "num_input_tokens_seen": 257536592, + "step": 4998 + }, + { + "epoch": 0.49138841797498833, + "eval_gen_len": 32.985, + "eval_loss": 2.6099908351898193, + "eval_rouge1": 44.4312, + "eval_rouge2": 27.6447, + "eval_rougeL": 40.525, + "eval_rougeLsum": 40.7945, + "eval_runtime": 96.8114, + "eval_samples_per_second": 2.066, + "eval_steps_per_second": 0.516, + "num_input_tokens_seen": 257628708, + "step": 5000 + }, + { + "epoch": 0.49148669565858333, + "grad_norm": 33.791316986083984, + "learning_rate": 8e-05, + "loss": 42.0111, + "num_input_tokens_seen": 257674852, + "step": 5001 + }, + { + "epoch": 0.4917815287093683, + "grad_norm": 28.114267349243164, + "learning_rate": 8e-05, + "loss": 37.5229, + "num_input_tokens_seen": 257807336, + "step": 5004 + }, + { + "epoch": 0.4920763617601533, + "grad_norm": 26.51993751525879, + "learning_rate": 8e-05, + "loss": 35.2494, + "num_input_tokens_seen": 257960640, + "step": 5007 + }, + { + "epoch": 0.49237119481093833, + "grad_norm": 28.530141830444336, + "learning_rate": 8e-05, + "loss": 41.4613, + "num_input_tokens_seen": 258099104, + "step": 5010 + }, + { + "epoch": 0.4926660278617233, + "grad_norm": 23.611448287963867, + "learning_rate": 8e-05, + "loss": 42.0885, + "num_input_tokens_seen": 258290644, + "step": 5013 + }, + { + "epoch": 0.4929608609125083, + "grad_norm": 24.45577049255371, + "learning_rate": 8e-05, + "loss": 40.2475, + "num_input_tokens_seen": 258448336, + "step": 5016 + }, + { + "epoch": 0.4932556939632933, + "grad_norm": 27.3090877532959, + "learning_rate": 8e-05, + "loss": 38.2567, + "num_input_tokens_seen": 258608620, + "step": 5019 + }, + { + "epoch": 0.49355052701407826, + "grad_norm": 23.795654296875, + "learning_rate": 8e-05, + "loss": 42.4711, + "num_input_tokens_seen": 258771812, + "step": 5022 + }, + { + "epoch": 0.4938453600648633, + "grad_norm": 29.56174087524414, + "learning_rate": 8e-05, + "loss": 43.0615, + "num_input_tokens_seen": 258923164, + "step": 5025 + }, + { + "epoch": 0.4941401931156483, + "grad_norm": 37.0699577331543, + "learning_rate": 8e-05, + "loss": 38.6001, + "num_input_tokens_seen": 259096096, + "step": 5028 + }, + { + "epoch": 0.49443502616643326, + "grad_norm": 26.982820510864258, + "learning_rate": 8e-05, + "loss": 39.4593, + "num_input_tokens_seen": 259260152, + "step": 5031 + }, + { + "epoch": 0.49472985921721824, + "grad_norm": 28.042970657348633, + "learning_rate": 8e-05, + "loss": 39.738, + "num_input_tokens_seen": 259412384, + "step": 5034 + }, + { + "epoch": 0.4950246922680032, + "grad_norm": 28.633056640625, + "learning_rate": 8e-05, + "loss": 39.8979, + "num_input_tokens_seen": 259573016, + "step": 5037 + }, + { + "epoch": 0.49531952531878826, + "grad_norm": 28.497529983520508, + "learning_rate": 8e-05, + "loss": 40.1717, + "num_input_tokens_seen": 259706416, + "step": 5040 + }, + { + "epoch": 0.49561435836957324, + "grad_norm": 25.843650817871094, + "learning_rate": 8e-05, + "loss": 39.9081, + "num_input_tokens_seen": 259860744, + "step": 5043 + }, + { + "epoch": 0.4959091914203582, + "grad_norm": 24.251995086669922, + "learning_rate": 8e-05, + "loss": 40.0976, + "num_input_tokens_seen": 260015464, + "step": 5046 + }, + { + "epoch": 0.4962040244711432, + "grad_norm": 32.20277786254883, + "learning_rate": 8e-05, + "loss": 40.7806, + "num_input_tokens_seen": 260181292, + "step": 5049 + }, + { + "epoch": 0.4964988575219282, + "grad_norm": 42.7421875, + "learning_rate": 8e-05, + "loss": 38.9794, + "num_input_tokens_seen": 260325680, + "step": 5052 + }, + { + "epoch": 0.4967936905727132, + "grad_norm": 25.865787506103516, + "learning_rate": 8e-05, + "loss": 41.3198, + "num_input_tokens_seen": 260462880, + "step": 5055 + }, + { + "epoch": 0.4970885236234982, + "grad_norm": 25.28655433654785, + "learning_rate": 8e-05, + "loss": 41.7386, + "num_input_tokens_seen": 260644488, + "step": 5058 + }, + { + "epoch": 0.4973833566742832, + "grad_norm": 27.9177303314209, + "learning_rate": 8e-05, + "loss": 41.1107, + "num_input_tokens_seen": 260798904, + "step": 5061 + }, + { + "epoch": 0.49767818972506817, + "grad_norm": 39.34760284423828, + "learning_rate": 8e-05, + "loss": 34.9837, + "num_input_tokens_seen": 260955768, + "step": 5064 + }, + { + "epoch": 0.49797302277585315, + "grad_norm": 29.348892211914062, + "learning_rate": 8e-05, + "loss": 39.5821, + "num_input_tokens_seen": 261126252, + "step": 5067 + }, + { + "epoch": 0.4982678558266382, + "grad_norm": 35.165401458740234, + "learning_rate": 8e-05, + "loss": 41.6351, + "num_input_tokens_seen": 261285852, + "step": 5070 + }, + { + "epoch": 0.49856268887742317, + "grad_norm": 30.85509490966797, + "learning_rate": 8e-05, + "loss": 40.2213, + "num_input_tokens_seen": 261453024, + "step": 5073 + }, + { + "epoch": 0.49885752192820815, + "grad_norm": 58.00692367553711, + "learning_rate": 8e-05, + "loss": 39.2461, + "num_input_tokens_seen": 261609220, + "step": 5076 + }, + { + "epoch": 0.49915235497899313, + "grad_norm": 27.91078758239746, + "learning_rate": 8e-05, + "loss": 42.8827, + "num_input_tokens_seen": 261769640, + "step": 5079 + }, + { + "epoch": 0.4994471880297781, + "grad_norm": 26.631685256958008, + "learning_rate": 8e-05, + "loss": 39.4102, + "num_input_tokens_seen": 261931560, + "step": 5082 + }, + { + "epoch": 0.49974202108056315, + "grad_norm": 26.376930236816406, + "learning_rate": 8e-05, + "loss": 41.2661, + "num_input_tokens_seen": 262073004, + "step": 5085 + }, + { + "epoch": 0.5000368541313481, + "grad_norm": 28.25586700439453, + "learning_rate": 8e-05, + "loss": 41.2577, + "num_input_tokens_seen": 262208488, + "step": 5088 + }, + { + "epoch": 0.5003316871821332, + "grad_norm": 23.296409606933594, + "learning_rate": 8e-05, + "loss": 39.7283, + "num_input_tokens_seen": 262367280, + "step": 5091 + }, + { + "epoch": 0.5006265202329181, + "grad_norm": 28.338619232177734, + "learning_rate": 8e-05, + "loss": 38.4883, + "num_input_tokens_seen": 262520148, + "step": 5094 + }, + { + "epoch": 0.5009213532837031, + "grad_norm": 30.3863582611084, + "learning_rate": 8e-05, + "loss": 43.1131, + "num_input_tokens_seen": 262690632, + "step": 5097 + }, + { + "epoch": 0.5012161863344881, + "grad_norm": 28.374164581298828, + "learning_rate": 8e-05, + "loss": 40.7878, + "num_input_tokens_seen": 262837896, + "step": 5100 + }, + { + "epoch": 0.5015110193852731, + "grad_norm": 25.643203735351562, + "learning_rate": 8e-05, + "loss": 39.6263, + "num_input_tokens_seen": 263004624, + "step": 5103 + }, + { + "epoch": 0.5018058524360581, + "grad_norm": 26.574125289916992, + "learning_rate": 8e-05, + "loss": 39.7303, + "num_input_tokens_seen": 263162740, + "step": 5106 + }, + { + "epoch": 0.5021006854868431, + "grad_norm": 27.768224716186523, + "learning_rate": 8e-05, + "loss": 40.8774, + "num_input_tokens_seen": 263300304, + "step": 5109 + }, + { + "epoch": 0.5023955185376281, + "grad_norm": 25.535547256469727, + "learning_rate": 8e-05, + "loss": 40.4186, + "num_input_tokens_seen": 263434248, + "step": 5112 + }, + { + "epoch": 0.502690351588413, + "grad_norm": 30.100975036621094, + "learning_rate": 8e-05, + "loss": 41.7356, + "num_input_tokens_seen": 263587684, + "step": 5115 + }, + { + "epoch": 0.5029851846391981, + "grad_norm": 28.115644454956055, + "learning_rate": 8e-05, + "loss": 38.5396, + "num_input_tokens_seen": 263755936, + "step": 5118 + }, + { + "epoch": 0.5032800176899831, + "grad_norm": 24.499034881591797, + "learning_rate": 8e-05, + "loss": 39.5854, + "num_input_tokens_seen": 263900592, + "step": 5121 + }, + { + "epoch": 0.503574850740768, + "grad_norm": 30.202110290527344, + "learning_rate": 8e-05, + "loss": 39.8082, + "num_input_tokens_seen": 264063248, + "step": 5124 + }, + { + "epoch": 0.5038696837915531, + "grad_norm": 29.38237762451172, + "learning_rate": 8e-05, + "loss": 37.7482, + "num_input_tokens_seen": 264214796, + "step": 5127 + }, + { + "epoch": 0.504164516842338, + "grad_norm": 21.479717254638672, + "learning_rate": 8e-05, + "loss": 39.0967, + "num_input_tokens_seen": 264383492, + "step": 5130 + }, + { + "epoch": 0.504459349893123, + "grad_norm": 23.876405715942383, + "learning_rate": 8e-05, + "loss": 41.2343, + "num_input_tokens_seen": 264528208, + "step": 5133 + }, + { + "epoch": 0.5047541829439081, + "grad_norm": 26.351425170898438, + "learning_rate": 8e-05, + "loss": 41.1, + "num_input_tokens_seen": 264677424, + "step": 5136 + }, + { + "epoch": 0.505049015994693, + "grad_norm": 33.22755813598633, + "learning_rate": 8e-05, + "loss": 42.8171, + "num_input_tokens_seen": 264823504, + "step": 5139 + }, + { + "epoch": 0.505343849045478, + "grad_norm": 27.761079788208008, + "learning_rate": 8e-05, + "loss": 40.8142, + "num_input_tokens_seen": 264996748, + "step": 5142 + }, + { + "epoch": 0.505638682096263, + "grad_norm": 27.130084991455078, + "learning_rate": 8e-05, + "loss": 38.0833, + "num_input_tokens_seen": 265131520, + "step": 5145 + }, + { + "epoch": 0.505933515147048, + "grad_norm": 26.463390350341797, + "learning_rate": 8e-05, + "loss": 41.0535, + "num_input_tokens_seen": 265291396, + "step": 5148 + }, + { + "epoch": 0.506228348197833, + "grad_norm": 26.453123092651367, + "learning_rate": 8e-05, + "loss": 40.6416, + "num_input_tokens_seen": 265462212, + "step": 5151 + }, + { + "epoch": 0.506523181248618, + "grad_norm": 64.61177825927734, + "learning_rate": 8e-05, + "loss": 41.241, + "num_input_tokens_seen": 265616688, + "step": 5154 + }, + { + "epoch": 0.506818014299403, + "grad_norm": 31.10032844543457, + "learning_rate": 8e-05, + "loss": 36.3523, + "num_input_tokens_seen": 265765660, + "step": 5157 + }, + { + "epoch": 0.5071128473501879, + "grad_norm": 25.7410831451416, + "learning_rate": 8e-05, + "loss": 38.3414, + "num_input_tokens_seen": 265919712, + "step": 5160 + }, + { + "epoch": 0.507407680400973, + "grad_norm": 29.47089958190918, + "learning_rate": 8e-05, + "loss": 41.5284, + "num_input_tokens_seen": 266091432, + "step": 5163 + }, + { + "epoch": 0.507702513451758, + "grad_norm": 28.647890090942383, + "learning_rate": 8e-05, + "loss": 41.0231, + "num_input_tokens_seen": 266275472, + "step": 5166 + }, + { + "epoch": 0.5079973465025429, + "grad_norm": 26.218273162841797, + "learning_rate": 8e-05, + "loss": 42.1872, + "num_input_tokens_seen": 266422520, + "step": 5169 + }, + { + "epoch": 0.508292179553328, + "grad_norm": 24.72483253479004, + "learning_rate": 8e-05, + "loss": 39.21, + "num_input_tokens_seen": 266575292, + "step": 5172 + }, + { + "epoch": 0.5085870126041129, + "grad_norm": 24.378684997558594, + "learning_rate": 8e-05, + "loss": 41.5883, + "num_input_tokens_seen": 266727956, + "step": 5175 + }, + { + "epoch": 0.5088818456548979, + "grad_norm": 26.51215362548828, + "learning_rate": 8e-05, + "loss": 37.2083, + "num_input_tokens_seen": 266907284, + "step": 5178 + }, + { + "epoch": 0.509176678705683, + "grad_norm": 62.5871696472168, + "learning_rate": 8e-05, + "loss": 38.3234, + "num_input_tokens_seen": 267052280, + "step": 5181 + }, + { + "epoch": 0.5094715117564679, + "grad_norm": 29.808427810668945, + "learning_rate": 8e-05, + "loss": 39.3261, + "num_input_tokens_seen": 267225372, + "step": 5184 + }, + { + "epoch": 0.5097663448072529, + "grad_norm": 28.35696792602539, + "learning_rate": 8e-05, + "loss": 39.9704, + "num_input_tokens_seen": 267380868, + "step": 5187 + }, + { + "epoch": 0.5100611778580378, + "grad_norm": 30.325870513916016, + "learning_rate": 8e-05, + "loss": 39.2605, + "num_input_tokens_seen": 267536984, + "step": 5190 + }, + { + "epoch": 0.5103560109088229, + "grad_norm": 41.838809967041016, + "learning_rate": 8e-05, + "loss": 38.4647, + "num_input_tokens_seen": 267685792, + "step": 5193 + }, + { + "epoch": 0.5106508439596079, + "grad_norm": 25.790443420410156, + "learning_rate": 8e-05, + "loss": 41.0957, + "num_input_tokens_seen": 267849848, + "step": 5196 + }, + { + "epoch": 0.5109456770103928, + "grad_norm": 27.69879913330078, + "learning_rate": 8e-05, + "loss": 37.9932, + "num_input_tokens_seen": 267990528, + "step": 5199 + }, + { + "epoch": 0.5112405100611779, + "grad_norm": 28.44215965270996, + "learning_rate": 8e-05, + "loss": 39.1879, + "num_input_tokens_seen": 268122272, + "step": 5202 + }, + { + "epoch": 0.5115353431119628, + "grad_norm": 27.362436294555664, + "learning_rate": 8e-05, + "loss": 41.4922, + "num_input_tokens_seen": 268290084, + "step": 5205 + }, + { + "epoch": 0.5118301761627478, + "grad_norm": 25.73969268798828, + "learning_rate": 8e-05, + "loss": 38.0888, + "num_input_tokens_seen": 268466960, + "step": 5208 + }, + { + "epoch": 0.5121250092135329, + "grad_norm": 26.495397567749023, + "learning_rate": 8e-05, + "loss": 34.8907, + "num_input_tokens_seen": 268619264, + "step": 5211 + }, + { + "epoch": 0.5124198422643178, + "grad_norm": 28.732437133789062, + "learning_rate": 8e-05, + "loss": 41.5123, + "num_input_tokens_seen": 268765252, + "step": 5214 + }, + { + "epoch": 0.5127146753151028, + "grad_norm": 28.184734344482422, + "learning_rate": 8e-05, + "loss": 43.1832, + "num_input_tokens_seen": 268926384, + "step": 5217 + }, + { + "epoch": 0.5130095083658878, + "grad_norm": 38.591148376464844, + "learning_rate": 8e-05, + "loss": 41.2605, + "num_input_tokens_seen": 269076024, + "step": 5220 + }, + { + "epoch": 0.5133043414166728, + "grad_norm": 28.382539749145508, + "learning_rate": 8e-05, + "loss": 40.6811, + "num_input_tokens_seen": 269236980, + "step": 5223 + }, + { + "epoch": 0.5135991744674578, + "grad_norm": 34.059329986572266, + "learning_rate": 8e-05, + "loss": 42.5639, + "num_input_tokens_seen": 269408772, + "step": 5226 + }, + { + "epoch": 0.5138940075182428, + "grad_norm": 43.94654083251953, + "learning_rate": 8e-05, + "loss": 38.8053, + "num_input_tokens_seen": 269570864, + "step": 5229 + }, + { + "epoch": 0.5141888405690278, + "grad_norm": 34.34612274169922, + "learning_rate": 8e-05, + "loss": 39.3392, + "num_input_tokens_seen": 269725332, + "step": 5232 + }, + { + "epoch": 0.5144836736198127, + "grad_norm": 24.64056968688965, + "learning_rate": 8e-05, + "loss": 35.7388, + "num_input_tokens_seen": 269860884, + "step": 5235 + }, + { + "epoch": 0.5147785066705978, + "grad_norm": 31.60344123840332, + "learning_rate": 8e-05, + "loss": 36.9823, + "num_input_tokens_seen": 270002004, + "step": 5238 + }, + { + "epoch": 0.5150733397213828, + "grad_norm": 37.74430465698242, + "learning_rate": 8e-05, + "loss": 41.4918, + "num_input_tokens_seen": 270167456, + "step": 5241 + }, + { + "epoch": 0.5153681727721677, + "grad_norm": 28.199970245361328, + "learning_rate": 8e-05, + "loss": 40.808, + "num_input_tokens_seen": 270324812, + "step": 5244 + }, + { + "epoch": 0.5156630058229528, + "grad_norm": 65.03779602050781, + "learning_rate": 8e-05, + "loss": 38.1227, + "num_input_tokens_seen": 270470852, + "step": 5247 + }, + { + "epoch": 0.5159578388737377, + "grad_norm": 24.968914031982422, + "learning_rate": 8e-05, + "loss": 40.2922, + "num_input_tokens_seen": 270619848, + "step": 5250 + }, + { + "epoch": 0.5162526719245227, + "grad_norm": 28.412931442260742, + "learning_rate": 8e-05, + "loss": 41.0161, + "num_input_tokens_seen": 270776848, + "step": 5253 + }, + { + "epoch": 0.5165475049753078, + "grad_norm": 30.238481521606445, + "learning_rate": 8e-05, + "loss": 37.7615, + "num_input_tokens_seen": 270937948, + "step": 5256 + }, + { + "epoch": 0.5168423380260927, + "grad_norm": 27.592735290527344, + "learning_rate": 8e-05, + "loss": 40.6843, + "num_input_tokens_seen": 271093220, + "step": 5259 + }, + { + "epoch": 0.5171371710768777, + "grad_norm": 27.805315017700195, + "learning_rate": 8e-05, + "loss": 38.7642, + "num_input_tokens_seen": 271235880, + "step": 5262 + }, + { + "epoch": 0.5174320041276627, + "grad_norm": 64.01702880859375, + "learning_rate": 8e-05, + "loss": 40.7117, + "num_input_tokens_seen": 271373476, + "step": 5265 + }, + { + "epoch": 0.5177268371784477, + "grad_norm": 29.8955135345459, + "learning_rate": 8e-05, + "loss": 38.2741, + "num_input_tokens_seen": 271531824, + "step": 5268 + }, + { + "epoch": 0.5180216702292327, + "grad_norm": 26.748109817504883, + "learning_rate": 8e-05, + "loss": 38.4718, + "num_input_tokens_seen": 271685428, + "step": 5271 + }, + { + "epoch": 0.5183165032800177, + "grad_norm": 29.331661224365234, + "learning_rate": 8e-05, + "loss": 40.647, + "num_input_tokens_seen": 271834884, + "step": 5274 + }, + { + "epoch": 0.5186113363308027, + "grad_norm": 25.69898796081543, + "learning_rate": 8e-05, + "loss": 45.1337, + "num_input_tokens_seen": 271984356, + "step": 5277 + }, + { + "epoch": 0.5189061693815876, + "grad_norm": 29.571247100830078, + "learning_rate": 8e-05, + "loss": 38.5389, + "num_input_tokens_seen": 272155252, + "step": 5280 + }, + { + "epoch": 0.5192010024323727, + "grad_norm": 32.80549240112305, + "learning_rate": 8e-05, + "loss": 40.5171, + "num_input_tokens_seen": 272295720, + "step": 5283 + }, + { + "epoch": 0.5194958354831577, + "grad_norm": 33.686519622802734, + "learning_rate": 8e-05, + "loss": 39.3815, + "num_input_tokens_seen": 272456400, + "step": 5286 + }, + { + "epoch": 0.5197906685339426, + "grad_norm": 27.436264038085938, + "learning_rate": 8e-05, + "loss": 40.2075, + "num_input_tokens_seen": 272612176, + "step": 5289 + }, + { + "epoch": 0.5200855015847277, + "grad_norm": 29.439571380615234, + "learning_rate": 8e-05, + "loss": 41.1885, + "num_input_tokens_seen": 272764716, + "step": 5292 + }, + { + "epoch": 0.5203803346355126, + "grad_norm": 29.08220672607422, + "learning_rate": 8e-05, + "loss": 41.7982, + "num_input_tokens_seen": 272908684, + "step": 5295 + }, + { + "epoch": 0.5206751676862976, + "grad_norm": 29.931116104125977, + "learning_rate": 8e-05, + "loss": 40.7925, + "num_input_tokens_seen": 273075112, + "step": 5298 + }, + { + "epoch": 0.5209700007370827, + "grad_norm": 26.853681564331055, + "learning_rate": 8e-05, + "loss": 37.7096, + "num_input_tokens_seen": 273216512, + "step": 5301 + }, + { + "epoch": 0.5212648337878676, + "grad_norm": 25.798200607299805, + "learning_rate": 8e-05, + "loss": 38.8098, + "num_input_tokens_seen": 273352688, + "step": 5304 + }, + { + "epoch": 0.5215596668386526, + "grad_norm": 28.257875442504883, + "learning_rate": 8e-05, + "loss": 41.5688, + "num_input_tokens_seen": 273502340, + "step": 5307 + }, + { + "epoch": 0.5218544998894376, + "grad_norm": 23.321849822998047, + "learning_rate": 8e-05, + "loss": 37.043, + "num_input_tokens_seen": 273653772, + "step": 5310 + }, + { + "epoch": 0.5221493329402226, + "grad_norm": 27.79505729675293, + "learning_rate": 8e-05, + "loss": 38.3766, + "num_input_tokens_seen": 273808488, + "step": 5313 + }, + { + "epoch": 0.5224441659910076, + "grad_norm": 23.351730346679688, + "learning_rate": 8e-05, + "loss": 41.9723, + "num_input_tokens_seen": 273983832, + "step": 5316 + }, + { + "epoch": 0.5227389990417926, + "grad_norm": 30.498489379882812, + "learning_rate": 8e-05, + "loss": 38.6851, + "num_input_tokens_seen": 274136064, + "step": 5319 + }, + { + "epoch": 0.5230338320925776, + "grad_norm": 27.102434158325195, + "learning_rate": 8e-05, + "loss": 39.6621, + "num_input_tokens_seen": 274299484, + "step": 5322 + }, + { + "epoch": 0.5233286651433626, + "grad_norm": 30.713741302490234, + "learning_rate": 8e-05, + "loss": 36.4895, + "num_input_tokens_seen": 274437288, + "step": 5325 + }, + { + "epoch": 0.5236234981941476, + "grad_norm": 25.068334579467773, + "learning_rate": 8e-05, + "loss": 34.8924, + "num_input_tokens_seen": 274585476, + "step": 5328 + }, + { + "epoch": 0.5239183312449326, + "grad_norm": 28.447080612182617, + "learning_rate": 8e-05, + "loss": 36.4545, + "num_input_tokens_seen": 274737720, + "step": 5331 + }, + { + "epoch": 0.5242131642957175, + "grad_norm": 26.790447235107422, + "learning_rate": 8e-05, + "loss": 38.516, + "num_input_tokens_seen": 274876548, + "step": 5334 + }, + { + "epoch": 0.5245079973465026, + "grad_norm": 31.397802352905273, + "learning_rate": 8e-05, + "loss": 40.8084, + "num_input_tokens_seen": 275038836, + "step": 5337 + }, + { + "epoch": 0.5248028303972876, + "grad_norm": 30.601938247680664, + "learning_rate": 8e-05, + "loss": 40.2357, + "num_input_tokens_seen": 275189300, + "step": 5340 + }, + { + "epoch": 0.5250976634480725, + "grad_norm": 24.533906936645508, + "learning_rate": 8e-05, + "loss": 40.7632, + "num_input_tokens_seen": 275331292, + "step": 5343 + }, + { + "epoch": 0.5253924964988576, + "grad_norm": 28.960649490356445, + "learning_rate": 8e-05, + "loss": 36.632, + "num_input_tokens_seen": 275483224, + "step": 5346 + }, + { + "epoch": 0.5256873295496425, + "grad_norm": 28.03540802001953, + "learning_rate": 8e-05, + "loss": 40.7449, + "num_input_tokens_seen": 275641832, + "step": 5349 + }, + { + "epoch": 0.5259821626004275, + "grad_norm": 25.529850006103516, + "learning_rate": 8e-05, + "loss": 38.6161, + "num_input_tokens_seen": 275804628, + "step": 5352 + }, + { + "epoch": 0.5262769956512126, + "grad_norm": 24.701250076293945, + "learning_rate": 8e-05, + "loss": 38.1369, + "num_input_tokens_seen": 275967540, + "step": 5355 + }, + { + "epoch": 0.5265718287019975, + "grad_norm": 23.570316314697266, + "learning_rate": 8e-05, + "loss": 32.6844, + "num_input_tokens_seen": 276122968, + "step": 5358 + }, + { + "epoch": 0.5268666617527825, + "grad_norm": 29.606403350830078, + "learning_rate": 8e-05, + "loss": 38.6557, + "num_input_tokens_seen": 276278380, + "step": 5361 + }, + { + "epoch": 0.5271614948035674, + "grad_norm": 25.70172882080078, + "learning_rate": 8e-05, + "loss": 36.8258, + "num_input_tokens_seen": 276430096, + "step": 5364 + }, + { + "epoch": 0.5274563278543525, + "grad_norm": 26.238239288330078, + "learning_rate": 8e-05, + "loss": 35.7773, + "num_input_tokens_seen": 276591252, + "step": 5367 + }, + { + "epoch": 0.5277511609051375, + "grad_norm": 27.975414276123047, + "learning_rate": 8e-05, + "loss": 36.2485, + "num_input_tokens_seen": 276746740, + "step": 5370 + }, + { + "epoch": 0.5280459939559224, + "grad_norm": 27.955827713012695, + "learning_rate": 8e-05, + "loss": 38.1719, + "num_input_tokens_seen": 276897792, + "step": 5373 + }, + { + "epoch": 0.5283408270067075, + "grad_norm": 29.52302360534668, + "learning_rate": 8e-05, + "loss": 39.6888, + "num_input_tokens_seen": 277027124, + "step": 5376 + }, + { + "epoch": 0.5286356600574924, + "grad_norm": 27.78631591796875, + "learning_rate": 8e-05, + "loss": 37.4463, + "num_input_tokens_seen": 277181080, + "step": 5379 + }, + { + "epoch": 0.5289304931082774, + "grad_norm": 26.559904098510742, + "learning_rate": 8e-05, + "loss": 38.7405, + "num_input_tokens_seen": 277323696, + "step": 5382 + }, + { + "epoch": 0.5292253261590625, + "grad_norm": 24.3076171875, + "learning_rate": 8e-05, + "loss": 39.4059, + "num_input_tokens_seen": 277470088, + "step": 5385 + }, + { + "epoch": 0.5295201592098474, + "grad_norm": 33.071311950683594, + "learning_rate": 8e-05, + "loss": 42.2808, + "num_input_tokens_seen": 277613564, + "step": 5388 + }, + { + "epoch": 0.5298149922606324, + "grad_norm": 36.52372360229492, + "learning_rate": 8e-05, + "loss": 41.923, + "num_input_tokens_seen": 277801728, + "step": 5391 + }, + { + "epoch": 0.5301098253114174, + "grad_norm": 24.653995513916016, + "learning_rate": 8e-05, + "loss": 38.6579, + "num_input_tokens_seen": 277942992, + "step": 5394 + }, + { + "epoch": 0.5304046583622024, + "grad_norm": 22.624168395996094, + "learning_rate": 8e-05, + "loss": 37.3442, + "num_input_tokens_seen": 278105864, + "step": 5397 + }, + { + "epoch": 0.5306994914129874, + "grad_norm": 25.598228454589844, + "learning_rate": 8e-05, + "loss": 41.0039, + "num_input_tokens_seen": 278261296, + "step": 5400 + }, + { + "epoch": 0.5309943244637724, + "grad_norm": 26.133739471435547, + "learning_rate": 8e-05, + "loss": 44.2222, + "num_input_tokens_seen": 278411744, + "step": 5403 + }, + { + "epoch": 0.5312891575145574, + "grad_norm": 26.089935302734375, + "learning_rate": 8e-05, + "loss": 37.4215, + "num_input_tokens_seen": 278548772, + "step": 5406 + }, + { + "epoch": 0.5315839905653423, + "grad_norm": 26.627443313598633, + "learning_rate": 8e-05, + "loss": 38.2373, + "num_input_tokens_seen": 278710876, + "step": 5409 + }, + { + "epoch": 0.5318788236161274, + "grad_norm": 27.520763397216797, + "learning_rate": 8e-05, + "loss": 44.3792, + "num_input_tokens_seen": 278877056, + "step": 5412 + }, + { + "epoch": 0.5321736566669124, + "grad_norm": 29.1707763671875, + "learning_rate": 8e-05, + "loss": 38.5313, + "num_input_tokens_seen": 279032424, + "step": 5415 + }, + { + "epoch": 0.5324684897176973, + "grad_norm": 31.549942016601562, + "learning_rate": 8e-05, + "loss": 38.1194, + "num_input_tokens_seen": 279190248, + "step": 5418 + }, + { + "epoch": 0.5327633227684824, + "grad_norm": 28.314420700073242, + "learning_rate": 8e-05, + "loss": 39.1158, + "num_input_tokens_seen": 279324216, + "step": 5421 + }, + { + "epoch": 0.5330581558192673, + "grad_norm": 23.590492248535156, + "learning_rate": 8e-05, + "loss": 39.1442, + "num_input_tokens_seen": 279463144, + "step": 5424 + }, + { + "epoch": 0.5333529888700523, + "grad_norm": 24.80891990661621, + "learning_rate": 8e-05, + "loss": 42.6141, + "num_input_tokens_seen": 279603700, + "step": 5427 + }, + { + "epoch": 0.5336478219208374, + "grad_norm": 29.72337532043457, + "learning_rate": 8e-05, + "loss": 41.3286, + "num_input_tokens_seen": 279745296, + "step": 5430 + }, + { + "epoch": 0.5339426549716223, + "grad_norm": 20.50883674621582, + "learning_rate": 8e-05, + "loss": 35.5831, + "num_input_tokens_seen": 279890616, + "step": 5433 + }, + { + "epoch": 0.5342374880224073, + "grad_norm": 32.3402099609375, + "learning_rate": 8e-05, + "loss": 35.8485, + "num_input_tokens_seen": 280027824, + "step": 5436 + }, + { + "epoch": 0.5345323210731923, + "grad_norm": 25.347986221313477, + "learning_rate": 8e-05, + "loss": 40.0463, + "num_input_tokens_seen": 280194656, + "step": 5439 + }, + { + "epoch": 0.5348271541239773, + "grad_norm": 24.94376564025879, + "learning_rate": 8e-05, + "loss": 44.0798, + "num_input_tokens_seen": 280348208, + "step": 5442 + }, + { + "epoch": 0.5351219871747623, + "grad_norm": 25.23859214782715, + "learning_rate": 8e-05, + "loss": 44.2179, + "num_input_tokens_seen": 280535056, + "step": 5445 + }, + { + "epoch": 0.5354168202255473, + "grad_norm": 26.1563777923584, + "learning_rate": 8e-05, + "loss": 36.5628, + "num_input_tokens_seen": 280695328, + "step": 5448 + }, + { + "epoch": 0.5357116532763323, + "grad_norm": 25.728984832763672, + "learning_rate": 8e-05, + "loss": 40.156, + "num_input_tokens_seen": 280851680, + "step": 5451 + }, + { + "epoch": 0.5360064863271172, + "grad_norm": 28.252504348754883, + "learning_rate": 8e-05, + "loss": 42.1762, + "num_input_tokens_seen": 281001832, + "step": 5454 + }, + { + "epoch": 0.5363013193779023, + "grad_norm": 22.63050079345703, + "learning_rate": 8e-05, + "loss": 35.0196, + "num_input_tokens_seen": 281148572, + "step": 5457 + }, + { + "epoch": 0.5365961524286873, + "grad_norm": 55.22769546508789, + "learning_rate": 8e-05, + "loss": 38.8902, + "num_input_tokens_seen": 281315884, + "step": 5460 + }, + { + "epoch": 0.5368909854794722, + "grad_norm": 27.13551139831543, + "learning_rate": 8e-05, + "loss": 36.7139, + "num_input_tokens_seen": 281458016, + "step": 5463 + }, + { + "epoch": 0.5371858185302573, + "grad_norm": 25.429719924926758, + "learning_rate": 8e-05, + "loss": 35.9278, + "num_input_tokens_seen": 281624088, + "step": 5466 + }, + { + "epoch": 0.5374806515810422, + "grad_norm": 28.88753318786621, + "learning_rate": 8e-05, + "loss": 41.2427, + "num_input_tokens_seen": 281753072, + "step": 5469 + }, + { + "epoch": 0.5377754846318272, + "grad_norm": 31.249370574951172, + "learning_rate": 8e-05, + "loss": 38.2066, + "num_input_tokens_seen": 281881532, + "step": 5472 + }, + { + "epoch": 0.5380703176826123, + "grad_norm": 23.7862491607666, + "learning_rate": 8e-05, + "loss": 41.0832, + "num_input_tokens_seen": 282036740, + "step": 5475 + }, + { + "epoch": 0.5383651507333972, + "grad_norm": 20.942829132080078, + "learning_rate": 8e-05, + "loss": 37.1708, + "num_input_tokens_seen": 282192456, + "step": 5478 + }, + { + "epoch": 0.5386599837841822, + "grad_norm": 24.284210205078125, + "learning_rate": 8e-05, + "loss": 40.6561, + "num_input_tokens_seen": 282320636, + "step": 5481 + }, + { + "epoch": 0.5389548168349672, + "grad_norm": 26.83125877380371, + "learning_rate": 8e-05, + "loss": 44.5538, + "num_input_tokens_seen": 282455496, + "step": 5484 + }, + { + "epoch": 0.5392496498857522, + "grad_norm": 25.163188934326172, + "learning_rate": 8e-05, + "loss": 38.9524, + "num_input_tokens_seen": 282598584, + "step": 5487 + }, + { + "epoch": 0.5395444829365372, + "grad_norm": 25.366954803466797, + "learning_rate": 8e-05, + "loss": 40.435, + "num_input_tokens_seen": 282751360, + "step": 5490 + }, + { + "epoch": 0.5398393159873222, + "grad_norm": 26.106281280517578, + "learning_rate": 8e-05, + "loss": 38.6259, + "num_input_tokens_seen": 282882692, + "step": 5493 + }, + { + "epoch": 0.5401341490381072, + "grad_norm": 24.25537872314453, + "learning_rate": 8e-05, + "loss": 39.2648, + "num_input_tokens_seen": 283045352, + "step": 5496 + }, + { + "epoch": 0.5404289820888921, + "grad_norm": 24.524158477783203, + "learning_rate": 8e-05, + "loss": 34.532, + "num_input_tokens_seen": 283198744, + "step": 5499 + }, + { + "epoch": 0.5407238151396772, + "grad_norm": 27.091638565063477, + "learning_rate": 8e-05, + "loss": 40.4187, + "num_input_tokens_seen": 283367704, + "step": 5502 + }, + { + "epoch": 0.5410186481904622, + "grad_norm": 27.984676361083984, + "learning_rate": 8e-05, + "loss": 39.8698, + "num_input_tokens_seen": 283504976, + "step": 5505 + }, + { + "epoch": 0.5413134812412471, + "grad_norm": 39.2283935546875, + "learning_rate": 8e-05, + "loss": 39.939, + "num_input_tokens_seen": 283677932, + "step": 5508 + }, + { + "epoch": 0.5416083142920322, + "grad_norm": 25.75412368774414, + "learning_rate": 8e-05, + "loss": 39.2238, + "num_input_tokens_seen": 283826660, + "step": 5511 + }, + { + "epoch": 0.5419031473428171, + "grad_norm": 25.7208194732666, + "learning_rate": 8e-05, + "loss": 38.1721, + "num_input_tokens_seen": 283989516, + "step": 5514 + }, + { + "epoch": 0.5421979803936021, + "grad_norm": 24.40972328186035, + "learning_rate": 8e-05, + "loss": 38.1401, + "num_input_tokens_seen": 284151680, + "step": 5517 + }, + { + "epoch": 0.5424928134443872, + "grad_norm": 28.160717010498047, + "learning_rate": 8e-05, + "loss": 37.8972, + "num_input_tokens_seen": 284318576, + "step": 5520 + }, + { + "epoch": 0.5427876464951721, + "grad_norm": 23.772441864013672, + "learning_rate": 8e-05, + "loss": 40.8021, + "num_input_tokens_seen": 284479420, + "step": 5523 + }, + { + "epoch": 0.5430824795459571, + "grad_norm": 26.15060043334961, + "learning_rate": 8e-05, + "loss": 38.6573, + "num_input_tokens_seen": 284650476, + "step": 5526 + }, + { + "epoch": 0.543377312596742, + "grad_norm": 22.040708541870117, + "learning_rate": 8e-05, + "loss": 35.2758, + "num_input_tokens_seen": 284813264, + "step": 5529 + }, + { + "epoch": 0.5436721456475271, + "grad_norm": 25.442060470581055, + "learning_rate": 8e-05, + "loss": 40.1561, + "num_input_tokens_seen": 284955988, + "step": 5532 + }, + { + "epoch": 0.5439669786983121, + "grad_norm": 41.13759231567383, + "learning_rate": 8e-05, + "loss": 37.8666, + "num_input_tokens_seen": 285102048, + "step": 5535 + }, + { + "epoch": 0.544261811749097, + "grad_norm": 24.975589752197266, + "learning_rate": 8e-05, + "loss": 42.4398, + "num_input_tokens_seen": 285270928, + "step": 5538 + }, + { + "epoch": 0.5445566447998821, + "grad_norm": 22.179227828979492, + "learning_rate": 8e-05, + "loss": 36.8113, + "num_input_tokens_seen": 285419668, + "step": 5541 + }, + { + "epoch": 0.544851477850667, + "grad_norm": 26.153484344482422, + "learning_rate": 8e-05, + "loss": 37.6961, + "num_input_tokens_seen": 285605556, + "step": 5544 + }, + { + "epoch": 0.545146310901452, + "grad_norm": 26.558202743530273, + "learning_rate": 8e-05, + "loss": 39.7822, + "num_input_tokens_seen": 285758500, + "step": 5547 + }, + { + "epoch": 0.5454411439522371, + "grad_norm": 26.40770149230957, + "learning_rate": 8e-05, + "loss": 35.4442, + "num_input_tokens_seen": 285907832, + "step": 5550 + }, + { + "epoch": 0.545735977003022, + "grad_norm": 24.307918548583984, + "learning_rate": 8e-05, + "loss": 41.7595, + "num_input_tokens_seen": 286077336, + "step": 5553 + }, + { + "epoch": 0.546030810053807, + "grad_norm": 32.74250411987305, + "learning_rate": 8e-05, + "loss": 45.9219, + "num_input_tokens_seen": 286245036, + "step": 5556 + }, + { + "epoch": 0.546325643104592, + "grad_norm": 32.992637634277344, + "learning_rate": 8e-05, + "loss": 38.9375, + "num_input_tokens_seen": 286388136, + "step": 5559 + }, + { + "epoch": 0.546620476155377, + "grad_norm": 49.21634292602539, + "learning_rate": 8e-05, + "loss": 37.6279, + "num_input_tokens_seen": 286549512, + "step": 5562 + }, + { + "epoch": 0.546915309206162, + "grad_norm": 24.97784996032715, + "learning_rate": 8e-05, + "loss": 37.8887, + "num_input_tokens_seen": 286677032, + "step": 5565 + }, + { + "epoch": 0.547210142256947, + "grad_norm": 28.935182571411133, + "learning_rate": 8e-05, + "loss": 39.9942, + "num_input_tokens_seen": 286821188, + "step": 5568 + }, + { + "epoch": 0.547504975307732, + "grad_norm": 345.0877990722656, + "learning_rate": 8e-05, + "loss": 35.3478, + "num_input_tokens_seen": 286976392, + "step": 5571 + }, + { + "epoch": 0.5477998083585169, + "grad_norm": 30.12696647644043, + "learning_rate": 8e-05, + "loss": 43.53, + "num_input_tokens_seen": 287129980, + "step": 5574 + }, + { + "epoch": 0.548094641409302, + "grad_norm": 45.15864944458008, + "learning_rate": 8e-05, + "loss": 38.9156, + "num_input_tokens_seen": 287279712, + "step": 5577 + }, + { + "epoch": 0.548389474460087, + "grad_norm": 22.9730224609375, + "learning_rate": 8e-05, + "loss": 41.7912, + "num_input_tokens_seen": 287439780, + "step": 5580 + }, + { + "epoch": 0.5486843075108719, + "grad_norm": 25.395217895507812, + "learning_rate": 8e-05, + "loss": 36.8195, + "num_input_tokens_seen": 287598532, + "step": 5583 + }, + { + "epoch": 0.548979140561657, + "grad_norm": 24.853181838989258, + "learning_rate": 8e-05, + "loss": 37.4314, + "num_input_tokens_seen": 287745808, + "step": 5586 + }, + { + "epoch": 0.5492739736124419, + "grad_norm": 35.19365310668945, + "learning_rate": 8e-05, + "loss": 42.1886, + "num_input_tokens_seen": 287914132, + "step": 5589 + }, + { + "epoch": 0.5495688066632269, + "grad_norm": 27.032106399536133, + "learning_rate": 8e-05, + "loss": 37.3116, + "num_input_tokens_seen": 288060960, + "step": 5592 + }, + { + "epoch": 0.549863639714012, + "grad_norm": 25.702754974365234, + "learning_rate": 8e-05, + "loss": 37.2868, + "num_input_tokens_seen": 288202716, + "step": 5595 + }, + { + "epoch": 0.5501584727647969, + "grad_norm": 30.17405891418457, + "learning_rate": 8e-05, + "loss": 40.0769, + "num_input_tokens_seen": 288361760, + "step": 5598 + }, + { + "epoch": 0.5504533058155819, + "grad_norm": 34.64820861816406, + "learning_rate": 8e-05, + "loss": 36.3623, + "num_input_tokens_seen": 288520980, + "step": 5601 + }, + { + "epoch": 0.5507481388663669, + "grad_norm": 23.259239196777344, + "learning_rate": 8e-05, + "loss": 37.8757, + "num_input_tokens_seen": 288704772, + "step": 5604 + }, + { + "epoch": 0.5510429719171519, + "grad_norm": 26.68288803100586, + "learning_rate": 8e-05, + "loss": 37.9769, + "num_input_tokens_seen": 288846252, + "step": 5607 + }, + { + "epoch": 0.5513378049679369, + "grad_norm": 27.598785400390625, + "learning_rate": 8e-05, + "loss": 40.1756, + "num_input_tokens_seen": 288999052, + "step": 5610 + }, + { + "epoch": 0.5516326380187219, + "grad_norm": 25.46784210205078, + "learning_rate": 8e-05, + "loss": 44.6034, + "num_input_tokens_seen": 289152324, + "step": 5613 + }, + { + "epoch": 0.5519274710695069, + "grad_norm": 30.939964294433594, + "learning_rate": 8e-05, + "loss": 42.1413, + "num_input_tokens_seen": 289317628, + "step": 5616 + }, + { + "epoch": 0.5522223041202918, + "grad_norm": 24.61944007873535, + "learning_rate": 8e-05, + "loss": 38.7557, + "num_input_tokens_seen": 289449632, + "step": 5619 + }, + { + "epoch": 0.5525171371710769, + "grad_norm": 24.867664337158203, + "learning_rate": 8e-05, + "loss": 39.5265, + "num_input_tokens_seen": 289611300, + "step": 5622 + }, + { + "epoch": 0.5528119702218619, + "grad_norm": 22.521526336669922, + "learning_rate": 8e-05, + "loss": 39.4616, + "num_input_tokens_seen": 289771848, + "step": 5625 + }, + { + "epoch": 0.5531068032726468, + "grad_norm": 23.784482955932617, + "learning_rate": 8e-05, + "loss": 38.105, + "num_input_tokens_seen": 289938788, + "step": 5628 + }, + { + "epoch": 0.5534016363234319, + "grad_norm": 27.707380294799805, + "learning_rate": 8e-05, + "loss": 37.8987, + "num_input_tokens_seen": 290092404, + "step": 5631 + }, + { + "epoch": 0.5536964693742169, + "grad_norm": 23.659029006958008, + "learning_rate": 8e-05, + "loss": 37.5206, + "num_input_tokens_seen": 290262368, + "step": 5634 + }, + { + "epoch": 0.5539913024250018, + "grad_norm": 33.99428176879883, + "learning_rate": 8e-05, + "loss": 39.2867, + "num_input_tokens_seen": 290424784, + "step": 5637 + }, + { + "epoch": 0.5542861354757869, + "grad_norm": 29.01900291442871, + "learning_rate": 8e-05, + "loss": 36.2682, + "num_input_tokens_seen": 290579540, + "step": 5640 + }, + { + "epoch": 0.5545809685265718, + "grad_norm": 28.39067268371582, + "learning_rate": 8e-05, + "loss": 38.8032, + "num_input_tokens_seen": 290725264, + "step": 5643 + }, + { + "epoch": 0.5548758015773568, + "grad_norm": 28.794584274291992, + "learning_rate": 8e-05, + "loss": 42.0969, + "num_input_tokens_seen": 290897136, + "step": 5646 + }, + { + "epoch": 0.5551706346281419, + "grad_norm": 88.10995483398438, + "learning_rate": 8e-05, + "loss": 39.5867, + "num_input_tokens_seen": 291060408, + "step": 5649 + }, + { + "epoch": 0.5554654676789268, + "grad_norm": 44.05754089355469, + "learning_rate": 8e-05, + "loss": 39.5617, + "num_input_tokens_seen": 291232472, + "step": 5652 + }, + { + "epoch": 0.5557603007297118, + "grad_norm": 28.9007625579834, + "learning_rate": 8e-05, + "loss": 39.7402, + "num_input_tokens_seen": 291382420, + "step": 5655 + }, + { + "epoch": 0.5560551337804968, + "grad_norm": 32.80019760131836, + "learning_rate": 8e-05, + "loss": 41.9413, + "num_input_tokens_seen": 291516656, + "step": 5658 + }, + { + "epoch": 0.5563499668312818, + "grad_norm": 47.72719955444336, + "learning_rate": 8e-05, + "loss": 33.9347, + "num_input_tokens_seen": 291667636, + "step": 5661 + }, + { + "epoch": 0.5566447998820668, + "grad_norm": 66.66146850585938, + "learning_rate": 8e-05, + "loss": 43.2117, + "num_input_tokens_seen": 291800104, + "step": 5664 + }, + { + "epoch": 0.5569396329328518, + "grad_norm": 25.505979537963867, + "learning_rate": 8e-05, + "loss": 38.9609, + "num_input_tokens_seen": 291938656, + "step": 5667 + }, + { + "epoch": 0.5572344659836368, + "grad_norm": 27.359821319580078, + "learning_rate": 8e-05, + "loss": 35.8896, + "num_input_tokens_seen": 292071348, + "step": 5670 + }, + { + "epoch": 0.5575292990344217, + "grad_norm": 27.773998260498047, + "learning_rate": 8e-05, + "loss": 43.5044, + "num_input_tokens_seen": 292231860, + "step": 5673 + }, + { + "epoch": 0.5578241320852068, + "grad_norm": 29.372474670410156, + "learning_rate": 8e-05, + "loss": 34.7316, + "num_input_tokens_seen": 292390360, + "step": 5676 + }, + { + "epoch": 0.5581189651359918, + "grad_norm": 32.9428596496582, + "learning_rate": 8e-05, + "loss": 42.433, + "num_input_tokens_seen": 292555324, + "step": 5679 + }, + { + "epoch": 0.5584137981867767, + "grad_norm": 26.41676139831543, + "learning_rate": 8e-05, + "loss": 36.0981, + "num_input_tokens_seen": 292720044, + "step": 5682 + }, + { + "epoch": 0.5587086312375618, + "grad_norm": 24.948963165283203, + "learning_rate": 8e-05, + "loss": 41.193, + "num_input_tokens_seen": 292871312, + "step": 5685 + }, + { + "epoch": 0.5590034642883467, + "grad_norm": 26.889123916625977, + "learning_rate": 8e-05, + "loss": 40.1803, + "num_input_tokens_seen": 293044880, + "step": 5688 + }, + { + "epoch": 0.5592982973391317, + "grad_norm": 26.73687744140625, + "learning_rate": 8e-05, + "loss": 39.5284, + "num_input_tokens_seen": 293182688, + "step": 5691 + }, + { + "epoch": 0.5595931303899168, + "grad_norm": 24.835887908935547, + "learning_rate": 8e-05, + "loss": 38.3529, + "num_input_tokens_seen": 293343424, + "step": 5694 + }, + { + "epoch": 0.5598879634407017, + "grad_norm": 32.0245361328125, + "learning_rate": 8e-05, + "loss": 39.3238, + "num_input_tokens_seen": 293502672, + "step": 5697 + }, + { + "epoch": 0.5601827964914867, + "grad_norm": 30.10677146911621, + "learning_rate": 8e-05, + "loss": 40.3795, + "num_input_tokens_seen": 293684264, + "step": 5700 + }, + { + "epoch": 0.5604776295422716, + "grad_norm": 32.71257400512695, + "learning_rate": 8e-05, + "loss": 41.1775, + "num_input_tokens_seen": 293870016, + "step": 5703 + }, + { + "epoch": 0.5607724625930567, + "grad_norm": 21.719552993774414, + "learning_rate": 8e-05, + "loss": 35.4193, + "num_input_tokens_seen": 294033016, + "step": 5706 + }, + { + "epoch": 0.5610672956438417, + "grad_norm": 53.36152267456055, + "learning_rate": 8e-05, + "loss": 37.5693, + "num_input_tokens_seen": 294184876, + "step": 5709 + }, + { + "epoch": 0.5613621286946266, + "grad_norm": 36.04158020019531, + "learning_rate": 8e-05, + "loss": 35.0886, + "num_input_tokens_seen": 294352844, + "step": 5712 + }, + { + "epoch": 0.5616569617454117, + "grad_norm": 22.147815704345703, + "learning_rate": 8e-05, + "loss": 37.5072, + "num_input_tokens_seen": 294515272, + "step": 5715 + }, + { + "epoch": 0.5619517947961966, + "grad_norm": 30.99330711364746, + "learning_rate": 8e-05, + "loss": 41.2776, + "num_input_tokens_seen": 294669808, + "step": 5718 + }, + { + "epoch": 0.5622466278469817, + "grad_norm": 41.663692474365234, + "learning_rate": 8e-05, + "loss": 41.1665, + "num_input_tokens_seen": 294833380, + "step": 5721 + }, + { + "epoch": 0.5625414608977667, + "grad_norm": 24.908546447753906, + "learning_rate": 8e-05, + "loss": 37.6573, + "num_input_tokens_seen": 294981576, + "step": 5724 + }, + { + "epoch": 0.5628362939485516, + "grad_norm": 28.312366485595703, + "learning_rate": 8e-05, + "loss": 44.9366, + "num_input_tokens_seen": 295159120, + "step": 5727 + }, + { + "epoch": 0.5631311269993367, + "grad_norm": 32.46244812011719, + "learning_rate": 8e-05, + "loss": 32.9336, + "num_input_tokens_seen": 295319032, + "step": 5730 + }, + { + "epoch": 0.5634259600501216, + "grad_norm": 27.558486938476562, + "learning_rate": 8e-05, + "loss": 33.7036, + "num_input_tokens_seen": 295485416, + "step": 5733 + }, + { + "epoch": 0.5637207931009066, + "grad_norm": 35.52197265625, + "learning_rate": 8e-05, + "loss": 37.3093, + "num_input_tokens_seen": 295622688, + "step": 5736 + }, + { + "epoch": 0.5640156261516917, + "grad_norm": 26.202285766601562, + "learning_rate": 8e-05, + "loss": 36.7879, + "num_input_tokens_seen": 295773952, + "step": 5739 + }, + { + "epoch": 0.5643104592024766, + "grad_norm": 26.322406768798828, + "learning_rate": 8e-05, + "loss": 43.3144, + "num_input_tokens_seen": 295932736, + "step": 5742 + }, + { + "epoch": 0.5646052922532616, + "grad_norm": 35.22903823852539, + "learning_rate": 8e-05, + "loss": 40.8689, + "num_input_tokens_seen": 296092228, + "step": 5745 + }, + { + "epoch": 0.5649001253040465, + "grad_norm": 25.881752014160156, + "learning_rate": 8e-05, + "loss": 41.6622, + "num_input_tokens_seen": 296261440, + "step": 5748 + }, + { + "epoch": 0.5651949583548316, + "grad_norm": 25.221384048461914, + "learning_rate": 8e-05, + "loss": 39.834, + "num_input_tokens_seen": 296402096, + "step": 5751 + }, + { + "epoch": 0.5654897914056166, + "grad_norm": 46.59519958496094, + "learning_rate": 8e-05, + "loss": 36.7, + "num_input_tokens_seen": 296558864, + "step": 5754 + }, + { + "epoch": 0.5657846244564015, + "grad_norm": 25.258193969726562, + "learning_rate": 8e-05, + "loss": 39.4638, + "num_input_tokens_seen": 296705336, + "step": 5757 + }, + { + "epoch": 0.5660794575071866, + "grad_norm": 26.750333786010742, + "learning_rate": 8e-05, + "loss": 41.9382, + "num_input_tokens_seen": 296854004, + "step": 5760 + }, + { + "epoch": 0.5663742905579715, + "grad_norm": 28.431350708007812, + "learning_rate": 8e-05, + "loss": 39.838, + "num_input_tokens_seen": 297019044, + "step": 5763 + }, + { + "epoch": 0.5666691236087565, + "grad_norm": 27.761972427368164, + "learning_rate": 8e-05, + "loss": 35.0378, + "num_input_tokens_seen": 297178948, + "step": 5766 + }, + { + "epoch": 0.5669639566595416, + "grad_norm": 34.624732971191406, + "learning_rate": 8e-05, + "loss": 37.0351, + "num_input_tokens_seen": 297351116, + "step": 5769 + }, + { + "epoch": 0.5672587897103265, + "grad_norm": 24.31560707092285, + "learning_rate": 8e-05, + "loss": 37.4997, + "num_input_tokens_seen": 297504184, + "step": 5772 + }, + { + "epoch": 0.5675536227611115, + "grad_norm": 24.228517532348633, + "learning_rate": 8e-05, + "loss": 36.6194, + "num_input_tokens_seen": 297677124, + "step": 5775 + }, + { + "epoch": 0.5678484558118965, + "grad_norm": 39.17292785644531, + "learning_rate": 8e-05, + "loss": 39.9698, + "num_input_tokens_seen": 297827896, + "step": 5778 + }, + { + "epoch": 0.5681432888626815, + "grad_norm": 25.8333683013916, + "learning_rate": 8e-05, + "loss": 37.8598, + "num_input_tokens_seen": 297975784, + "step": 5781 + }, + { + "epoch": 0.5684381219134665, + "grad_norm": 30.754497528076172, + "learning_rate": 8e-05, + "loss": 35.6395, + "num_input_tokens_seen": 298138908, + "step": 5784 + }, + { + "epoch": 0.5687329549642515, + "grad_norm": 26.73647689819336, + "learning_rate": 8e-05, + "loss": 39.9627, + "num_input_tokens_seen": 298317840, + "step": 5787 + }, + { + "epoch": 0.5690277880150365, + "grad_norm": 128.11517333984375, + "learning_rate": 8e-05, + "loss": 38.1483, + "num_input_tokens_seen": 298477572, + "step": 5790 + }, + { + "epoch": 0.5693226210658214, + "grad_norm": 26.494300842285156, + "learning_rate": 8e-05, + "loss": 36.4683, + "num_input_tokens_seen": 298633940, + "step": 5793 + }, + { + "epoch": 0.5696174541166065, + "grad_norm": 62.236572265625, + "learning_rate": 8e-05, + "loss": 38.026, + "num_input_tokens_seen": 298784060, + "step": 5796 + }, + { + "epoch": 0.5699122871673915, + "grad_norm": 42.91933822631836, + "learning_rate": 8e-05, + "loss": 42.2562, + "num_input_tokens_seen": 298958820, + "step": 5799 + }, + { + "epoch": 0.5702071202181764, + "grad_norm": 50.08773422241211, + "learning_rate": 8e-05, + "loss": 41.6174, + "num_input_tokens_seen": 299099592, + "step": 5802 + }, + { + "epoch": 0.5705019532689615, + "grad_norm": 53.82916259765625, + "learning_rate": 8e-05, + "loss": 37.39, + "num_input_tokens_seen": 299235808, + "step": 5805 + }, + { + "epoch": 0.5707967863197464, + "grad_norm": 55.40131378173828, + "learning_rate": 8e-05, + "loss": 34.8677, + "num_input_tokens_seen": 299383952, + "step": 5808 + }, + { + "epoch": 0.5710916193705314, + "grad_norm": 37.975929260253906, + "learning_rate": 8e-05, + "loss": 41.1252, + "num_input_tokens_seen": 299550972, + "step": 5811 + }, + { + "epoch": 0.5713864524213165, + "grad_norm": 26.348045349121094, + "learning_rate": 8e-05, + "loss": 40.0975, + "num_input_tokens_seen": 299721044, + "step": 5814 + }, + { + "epoch": 0.5716812854721014, + "grad_norm": 80.32865905761719, + "learning_rate": 8e-05, + "loss": 33.7063, + "num_input_tokens_seen": 299875096, + "step": 5817 + }, + { + "epoch": 0.5719761185228864, + "grad_norm": 25.530847549438477, + "learning_rate": 8e-05, + "loss": 36.4637, + "num_input_tokens_seen": 300014580, + "step": 5820 + }, + { + "epoch": 0.5722709515736714, + "grad_norm": 29.553768157958984, + "learning_rate": 8e-05, + "loss": 40.6987, + "num_input_tokens_seen": 300195144, + "step": 5823 + }, + { + "epoch": 0.5725657846244564, + "grad_norm": 61.815162658691406, + "learning_rate": 8e-05, + "loss": 40.1162, + "num_input_tokens_seen": 300351100, + "step": 5826 + }, + { + "epoch": 0.5728606176752414, + "grad_norm": 29.161407470703125, + "learning_rate": 8e-05, + "loss": 40.9905, + "num_input_tokens_seen": 300521468, + "step": 5829 + }, + { + "epoch": 0.5731554507260264, + "grad_norm": 27.665782928466797, + "learning_rate": 8e-05, + "loss": 39.3531, + "num_input_tokens_seen": 300677212, + "step": 5832 + }, + { + "epoch": 0.5734502837768114, + "grad_norm": 27.592004776000977, + "learning_rate": 8e-05, + "loss": 37.4998, + "num_input_tokens_seen": 300817448, + "step": 5835 + }, + { + "epoch": 0.5737451168275963, + "grad_norm": 27.71179962158203, + "learning_rate": 8e-05, + "loss": 38.4142, + "num_input_tokens_seen": 300989444, + "step": 5838 + }, + { + "epoch": 0.5740399498783814, + "grad_norm": 34.622825622558594, + "learning_rate": 8e-05, + "loss": 40.1114, + "num_input_tokens_seen": 301118496, + "step": 5841 + }, + { + "epoch": 0.5743347829291664, + "grad_norm": 24.764328002929688, + "learning_rate": 8e-05, + "loss": 41.068, + "num_input_tokens_seen": 301279716, + "step": 5844 + }, + { + "epoch": 0.5746296159799513, + "grad_norm": 30.744380950927734, + "learning_rate": 8e-05, + "loss": 36.6774, + "num_input_tokens_seen": 301427464, + "step": 5847 + }, + { + "epoch": 0.5749244490307364, + "grad_norm": 200.72665405273438, + "learning_rate": 8e-05, + "loss": 40.9329, + "num_input_tokens_seen": 301587056, + "step": 5850 + }, + { + "epoch": 0.5752192820815213, + "grad_norm": 31.990041732788086, + "learning_rate": 8e-05, + "loss": 32.8286, + "num_input_tokens_seen": 301730364, + "step": 5853 + }, + { + "epoch": 0.5755141151323063, + "grad_norm": 29.088926315307617, + "learning_rate": 8e-05, + "loss": 38.0881, + "num_input_tokens_seen": 301900344, + "step": 5856 + }, + { + "epoch": 0.5758089481830914, + "grad_norm": 142.562255859375, + "learning_rate": 8e-05, + "loss": 41.9923, + "num_input_tokens_seen": 302048480, + "step": 5859 + }, + { + "epoch": 0.5761037812338763, + "grad_norm": 34.03981399536133, + "learning_rate": 8e-05, + "loss": 40.1016, + "num_input_tokens_seen": 302207504, + "step": 5862 + }, + { + "epoch": 0.5763986142846613, + "grad_norm": 26.095165252685547, + "learning_rate": 8e-05, + "loss": 35.2068, + "num_input_tokens_seen": 302378048, + "step": 5865 + }, + { + "epoch": 0.5766934473354463, + "grad_norm": 26.977977752685547, + "learning_rate": 8e-05, + "loss": 39.1896, + "num_input_tokens_seen": 302527312, + "step": 5868 + }, + { + "epoch": 0.5769882803862313, + "grad_norm": 36.75228500366211, + "learning_rate": 8e-05, + "loss": 36.731, + "num_input_tokens_seen": 302663536, + "step": 5871 + }, + { + "epoch": 0.5772831134370163, + "grad_norm": 51.440711975097656, + "learning_rate": 8e-05, + "loss": 38.3708, + "num_input_tokens_seen": 302832764, + "step": 5874 + }, + { + "epoch": 0.5775779464878013, + "grad_norm": 24.259418487548828, + "learning_rate": 8e-05, + "loss": 36.0445, + "num_input_tokens_seen": 302984212, + "step": 5877 + }, + { + "epoch": 0.5778727795385863, + "grad_norm": 69.90709686279297, + "learning_rate": 8e-05, + "loss": 38.2045, + "num_input_tokens_seen": 303133324, + "step": 5880 + }, + { + "epoch": 0.5781676125893712, + "grad_norm": 26.804569244384766, + "learning_rate": 8e-05, + "loss": 37.7684, + "num_input_tokens_seen": 303275508, + "step": 5883 + }, + { + "epoch": 0.5784624456401563, + "grad_norm": 27.848865509033203, + "learning_rate": 8e-05, + "loss": 36.8484, + "num_input_tokens_seen": 303443396, + "step": 5886 + }, + { + "epoch": 0.5787572786909413, + "grad_norm": 22.267452239990234, + "learning_rate": 8e-05, + "loss": 38.3815, + "num_input_tokens_seen": 303595712, + "step": 5889 + }, + { + "epoch": 0.5790521117417262, + "grad_norm": 41.911293029785156, + "learning_rate": 8e-05, + "loss": 42.0451, + "num_input_tokens_seen": 303737876, + "step": 5892 + }, + { + "epoch": 0.5793469447925113, + "grad_norm": 32.45378494262695, + "learning_rate": 8e-05, + "loss": 37.216, + "num_input_tokens_seen": 303900824, + "step": 5895 + }, + { + "epoch": 0.5796417778432962, + "grad_norm": 40.52968215942383, + "learning_rate": 8e-05, + "loss": 39.0123, + "num_input_tokens_seen": 304048528, + "step": 5898 + }, + { + "epoch": 0.5799366108940812, + "grad_norm": 39.96379470825195, + "learning_rate": 8e-05, + "loss": 38.7395, + "num_input_tokens_seen": 304218184, + "step": 5901 + }, + { + "epoch": 0.5802314439448663, + "grad_norm": 26.308931350708008, + "learning_rate": 8e-05, + "loss": 40.2331, + "num_input_tokens_seen": 304377180, + "step": 5904 + }, + { + "epoch": 0.5805262769956512, + "grad_norm": 26.30266761779785, + "learning_rate": 8e-05, + "loss": 37.2614, + "num_input_tokens_seen": 304539968, + "step": 5907 + }, + { + "epoch": 0.5808211100464362, + "grad_norm": 26.29747200012207, + "learning_rate": 8e-05, + "loss": 41.3342, + "num_input_tokens_seen": 304706308, + "step": 5910 + }, + { + "epoch": 0.5811159430972211, + "grad_norm": 27.593076705932617, + "learning_rate": 8e-05, + "loss": 38.9741, + "num_input_tokens_seen": 304848948, + "step": 5913 + }, + { + "epoch": 0.5814107761480062, + "grad_norm": 25.446651458740234, + "learning_rate": 8e-05, + "loss": 37.807, + "num_input_tokens_seen": 304998176, + "step": 5916 + }, + { + "epoch": 0.5817056091987912, + "grad_norm": 29.21063995361328, + "learning_rate": 8e-05, + "loss": 36.8093, + "num_input_tokens_seen": 305159976, + "step": 5919 + }, + { + "epoch": 0.5820004422495761, + "grad_norm": 34.09979248046875, + "learning_rate": 8e-05, + "loss": 36.7409, + "num_input_tokens_seen": 305308808, + "step": 5922 + }, + { + "epoch": 0.5822952753003612, + "grad_norm": 32.126407623291016, + "learning_rate": 8e-05, + "loss": 38.0265, + "num_input_tokens_seen": 305461308, + "step": 5925 + }, + { + "epoch": 0.5825901083511462, + "grad_norm": 27.993932723999023, + "learning_rate": 8e-05, + "loss": 42.3727, + "num_input_tokens_seen": 305601456, + "step": 5928 + }, + { + "epoch": 0.5828849414019311, + "grad_norm": 32.2390022277832, + "learning_rate": 8e-05, + "loss": 39.7092, + "num_input_tokens_seen": 305746720, + "step": 5931 + }, + { + "epoch": 0.5831797744527162, + "grad_norm": 77.48739624023438, + "learning_rate": 8e-05, + "loss": 34.6095, + "num_input_tokens_seen": 305876036, + "step": 5934 + }, + { + "epoch": 0.5834746075035011, + "grad_norm": 130.36790466308594, + "learning_rate": 8e-05, + "loss": 33.3341, + "num_input_tokens_seen": 306028364, + "step": 5937 + }, + { + "epoch": 0.5837694405542861, + "grad_norm": 76.7963638305664, + "learning_rate": 8e-05, + "loss": 37.3014, + "num_input_tokens_seen": 306192280, + "step": 5940 + }, + { + "epoch": 0.5840642736050712, + "grad_norm": 30.70940399169922, + "learning_rate": 8e-05, + "loss": 36.3198, + "num_input_tokens_seen": 306329048, + "step": 5943 + }, + { + "epoch": 0.5843591066558561, + "grad_norm": 61.189674377441406, + "learning_rate": 8e-05, + "loss": 43.0917, + "num_input_tokens_seen": 306471744, + "step": 5946 + }, + { + "epoch": 0.5846539397066411, + "grad_norm": 31.423398971557617, + "learning_rate": 8e-05, + "loss": 39.6598, + "num_input_tokens_seen": 306616996, + "step": 5949 + }, + { + "epoch": 0.5849487727574261, + "grad_norm": 283.5697021484375, + "learning_rate": 8e-05, + "loss": 40.6435, + "num_input_tokens_seen": 306763840, + "step": 5952 + }, + { + "epoch": 0.5852436058082111, + "grad_norm": 42.0274543762207, + "learning_rate": 8e-05, + "loss": 38.9727, + "num_input_tokens_seen": 306925976, + "step": 5955 + }, + { + "epoch": 0.5855384388589961, + "grad_norm": 29.649784088134766, + "learning_rate": 8e-05, + "loss": 37.5955, + "num_input_tokens_seen": 307096356, + "step": 5958 + }, + { + "epoch": 0.5858332719097811, + "grad_norm": 34.0390625, + "learning_rate": 8e-05, + "loss": 41.1632, + "num_input_tokens_seen": 307248556, + "step": 5961 + }, + { + "epoch": 0.5861281049605661, + "grad_norm": 36.18404769897461, + "learning_rate": 8e-05, + "loss": 36.7726, + "num_input_tokens_seen": 307399908, + "step": 5964 + }, + { + "epoch": 0.586422938011351, + "grad_norm": 27.936336517333984, + "learning_rate": 8e-05, + "loss": 35.0917, + "num_input_tokens_seen": 307577524, + "step": 5967 + }, + { + "epoch": 0.5867177710621361, + "grad_norm": 25.70185661315918, + "learning_rate": 8e-05, + "loss": 40.0438, + "num_input_tokens_seen": 307716788, + "step": 5970 + }, + { + "epoch": 0.5870126041129211, + "grad_norm": 32.73580551147461, + "learning_rate": 8e-05, + "loss": 37.6364, + "num_input_tokens_seen": 307876480, + "step": 5973 + }, + { + "epoch": 0.587307437163706, + "grad_norm": 34.62762451171875, + "learning_rate": 8e-05, + "loss": 39.1228, + "num_input_tokens_seen": 308014456, + "step": 5976 + }, + { + "epoch": 0.5876022702144911, + "grad_norm": 46.6585693359375, + "learning_rate": 8e-05, + "loss": 34.5612, + "num_input_tokens_seen": 308170444, + "step": 5979 + }, + { + "epoch": 0.587897103265276, + "grad_norm": 29.657745361328125, + "learning_rate": 8e-05, + "loss": 43.7115, + "num_input_tokens_seen": 308332068, + "step": 5982 + }, + { + "epoch": 0.588191936316061, + "grad_norm": 30.494150161743164, + "learning_rate": 8e-05, + "loss": 38.0942, + "num_input_tokens_seen": 308493208, + "step": 5985 + }, + { + "epoch": 0.5884867693668461, + "grad_norm": 31.967741012573242, + "learning_rate": 8e-05, + "loss": 40.2974, + "num_input_tokens_seen": 308640716, + "step": 5988 + }, + { + "epoch": 0.588781602417631, + "grad_norm": 29.766223907470703, + "learning_rate": 8e-05, + "loss": 39.3607, + "num_input_tokens_seen": 308776512, + "step": 5991 + }, + { + "epoch": 0.589076435468416, + "grad_norm": 51.61750793457031, + "learning_rate": 8e-05, + "loss": 36.1088, + "num_input_tokens_seen": 308914184, + "step": 5994 + }, + { + "epoch": 0.589371268519201, + "grad_norm": 38.873905181884766, + "learning_rate": 8e-05, + "loss": 34.3019, + "num_input_tokens_seen": 309069328, + "step": 5997 + }, + { + "epoch": 0.589666101569986, + "grad_norm": 62.840354919433594, + "learning_rate": 8e-05, + "loss": 41.428, + "num_input_tokens_seen": 309218384, + "step": 6000 + }, + { + "epoch": 0.589666101569986, + "eval_gen_len": 35.03, + "eval_loss": 2.4841418266296387, + "eval_rouge1": 44.7711, + "eval_rouge2": 28.0903, + "eval_rougeL": 40.7346, + "eval_rougeLsum": 40.9658, + "eval_runtime": 111.847, + "eval_samples_per_second": 1.788, + "eval_steps_per_second": 0.447, + "num_input_tokens_seen": 309218384, + "step": 6000 + }, + { + "epoch": 0.589960934620771, + "grad_norm": 29.889745712280273, + "learning_rate": 8e-05, + "loss": 39.2844, + "num_input_tokens_seen": 309394920, + "step": 6003 + }, + { + "epoch": 0.590255767671556, + "grad_norm": 27.655027389526367, + "learning_rate": 8e-05, + "loss": 36.9213, + "num_input_tokens_seen": 309554232, + "step": 6006 + }, + { + "epoch": 0.590550600722341, + "grad_norm": 29.12769317626953, + "learning_rate": 8e-05, + "loss": 36.2834, + "num_input_tokens_seen": 309700692, + "step": 6009 + }, + { + "epoch": 0.5908454337731259, + "grad_norm": 30.606224060058594, + "learning_rate": 8e-05, + "loss": 41.0752, + "num_input_tokens_seen": 309863240, + "step": 6012 + }, + { + "epoch": 0.591140266823911, + "grad_norm": 26.596939086914062, + "learning_rate": 8e-05, + "loss": 39.0024, + "num_input_tokens_seen": 310023020, + "step": 6015 + }, + { + "epoch": 0.591435099874696, + "grad_norm": 32.189170837402344, + "learning_rate": 8e-05, + "loss": 40.5805, + "num_input_tokens_seen": 310161712, + "step": 6018 + }, + { + "epoch": 0.5917299329254809, + "grad_norm": 24.282939910888672, + "learning_rate": 8e-05, + "loss": 37.5122, + "num_input_tokens_seen": 310313060, + "step": 6021 + }, + { + "epoch": 0.592024765976266, + "grad_norm": 26.229461669921875, + "learning_rate": 8e-05, + "loss": 38.3322, + "num_input_tokens_seen": 310472164, + "step": 6024 + }, + { + "epoch": 0.5923195990270509, + "grad_norm": 24.86408805847168, + "learning_rate": 8e-05, + "loss": 39.9136, + "num_input_tokens_seen": 310640152, + "step": 6027 + }, + { + "epoch": 0.5926144320778359, + "grad_norm": 56.45329666137695, + "learning_rate": 8e-05, + "loss": 36.4206, + "num_input_tokens_seen": 310796800, + "step": 6030 + }, + { + "epoch": 0.592909265128621, + "grad_norm": 26.79425048828125, + "learning_rate": 8e-05, + "loss": 39.6583, + "num_input_tokens_seen": 310954172, + "step": 6033 + }, + { + "epoch": 0.5932040981794059, + "grad_norm": 25.8710880279541, + "learning_rate": 8e-05, + "loss": 36.8764, + "num_input_tokens_seen": 311108384, + "step": 6036 + }, + { + "epoch": 0.5934989312301909, + "grad_norm": 39.8235969543457, + "learning_rate": 8e-05, + "loss": 40.9173, + "num_input_tokens_seen": 311274412, + "step": 6039 + }, + { + "epoch": 0.5937937642809759, + "grad_norm": 27.080610275268555, + "learning_rate": 8e-05, + "loss": 39.3676, + "num_input_tokens_seen": 311438424, + "step": 6042 + }, + { + "epoch": 0.5940885973317609, + "grad_norm": 28.4052677154541, + "learning_rate": 8e-05, + "loss": 36.2356, + "num_input_tokens_seen": 311595236, + "step": 6045 + }, + { + "epoch": 0.5943834303825459, + "grad_norm": 24.13104820251465, + "learning_rate": 8e-05, + "loss": 34.4232, + "num_input_tokens_seen": 311741048, + "step": 6048 + }, + { + "epoch": 0.5946782634333309, + "grad_norm": 24.363216400146484, + "learning_rate": 8e-05, + "loss": 38.3822, + "num_input_tokens_seen": 311916556, + "step": 6051 + }, + { + "epoch": 0.5949730964841159, + "grad_norm": 26.450504302978516, + "learning_rate": 8e-05, + "loss": 42.0056, + "num_input_tokens_seen": 312087284, + "step": 6054 + }, + { + "epoch": 0.5952679295349008, + "grad_norm": 25.021625518798828, + "learning_rate": 8e-05, + "loss": 34.846, + "num_input_tokens_seen": 312239932, + "step": 6057 + }, + { + "epoch": 0.5955627625856859, + "grad_norm": 41.91618728637695, + "learning_rate": 8e-05, + "loss": 34.7107, + "num_input_tokens_seen": 312380608, + "step": 6060 + }, + { + "epoch": 0.5958575956364709, + "grad_norm": 28.016080856323242, + "learning_rate": 8e-05, + "loss": 35.2353, + "num_input_tokens_seen": 312542388, + "step": 6063 + }, + { + "epoch": 0.5961524286872558, + "grad_norm": 29.0964412689209, + "learning_rate": 8e-05, + "loss": 38.4793, + "num_input_tokens_seen": 312704672, + "step": 6066 + }, + { + "epoch": 0.5964472617380409, + "grad_norm": 29.767629623413086, + "learning_rate": 8e-05, + "loss": 41.6059, + "num_input_tokens_seen": 312866032, + "step": 6069 + }, + { + "epoch": 0.5967420947888258, + "grad_norm": 27.889211654663086, + "learning_rate": 8e-05, + "loss": 37.9207, + "num_input_tokens_seen": 313003224, + "step": 6072 + }, + { + "epoch": 0.5970369278396108, + "grad_norm": 25.92238998413086, + "learning_rate": 8e-05, + "loss": 37.741, + "num_input_tokens_seen": 313140552, + "step": 6075 + }, + { + "epoch": 0.5973317608903959, + "grad_norm": 31.18004035949707, + "learning_rate": 8e-05, + "loss": 40.4069, + "num_input_tokens_seen": 313296652, + "step": 6078 + }, + { + "epoch": 0.5976265939411808, + "grad_norm": 31.6116943359375, + "learning_rate": 8e-05, + "loss": 39.7078, + "num_input_tokens_seen": 313443404, + "step": 6081 + }, + { + "epoch": 0.5979214269919658, + "grad_norm": 33.67839050292969, + "learning_rate": 8e-05, + "loss": 40.8099, + "num_input_tokens_seen": 313616320, + "step": 6084 + }, + { + "epoch": 0.5982162600427507, + "grad_norm": 25.062952041625977, + "learning_rate": 8e-05, + "loss": 39.9874, + "num_input_tokens_seen": 313772216, + "step": 6087 + }, + { + "epoch": 0.5985110930935358, + "grad_norm": 23.47759246826172, + "learning_rate": 8e-05, + "loss": 37.2121, + "num_input_tokens_seen": 313914312, + "step": 6090 + }, + { + "epoch": 0.5988059261443208, + "grad_norm": 23.165061950683594, + "learning_rate": 8e-05, + "loss": 36.6471, + "num_input_tokens_seen": 314070528, + "step": 6093 + }, + { + "epoch": 0.5991007591951057, + "grad_norm": 26.055076599121094, + "learning_rate": 8e-05, + "loss": 38.3907, + "num_input_tokens_seen": 314206052, + "step": 6096 + }, + { + "epoch": 0.5993955922458908, + "grad_norm": 30.39374351501465, + "learning_rate": 8e-05, + "loss": 36.5309, + "num_input_tokens_seen": 314366628, + "step": 6099 + }, + { + "epoch": 0.5996904252966757, + "grad_norm": 25.340938568115234, + "learning_rate": 8e-05, + "loss": 35.0818, + "num_input_tokens_seen": 314491676, + "step": 6102 + }, + { + "epoch": 0.5999852583474607, + "grad_norm": 27.407005310058594, + "learning_rate": 8e-05, + "loss": 38.7906, + "num_input_tokens_seen": 314658508, + "step": 6105 + }, + { + "epoch": 0.6002800913982458, + "grad_norm": 35.49182891845703, + "learning_rate": 8e-05, + "loss": 39.3374, + "num_input_tokens_seen": 314821812, + "step": 6108 + }, + { + "epoch": 0.6005749244490307, + "grad_norm": 30.370254516601562, + "learning_rate": 8e-05, + "loss": 39.6636, + "num_input_tokens_seen": 314976568, + "step": 6111 + }, + { + "epoch": 0.6008697574998157, + "grad_norm": 25.774280548095703, + "learning_rate": 8e-05, + "loss": 36.3847, + "num_input_tokens_seen": 315122416, + "step": 6114 + }, + { + "epoch": 0.6011645905506007, + "grad_norm": 29.883007049560547, + "learning_rate": 8e-05, + "loss": 42.0746, + "num_input_tokens_seen": 315307612, + "step": 6117 + }, + { + "epoch": 0.6014594236013857, + "grad_norm": 25.592613220214844, + "learning_rate": 8e-05, + "loss": 38.5165, + "num_input_tokens_seen": 315455472, + "step": 6120 + }, + { + "epoch": 0.6017542566521707, + "grad_norm": 24.037139892578125, + "learning_rate": 8e-05, + "loss": 36.9325, + "num_input_tokens_seen": 315587648, + "step": 6123 + }, + { + "epoch": 0.6020490897029557, + "grad_norm": 26.017288208007812, + "learning_rate": 8e-05, + "loss": 38.6846, + "num_input_tokens_seen": 315739604, + "step": 6126 + }, + { + "epoch": 0.6023439227537407, + "grad_norm": 25.019466400146484, + "learning_rate": 8e-05, + "loss": 33.4267, + "num_input_tokens_seen": 315901624, + "step": 6129 + }, + { + "epoch": 0.6026387558045256, + "grad_norm": 44.18895721435547, + "learning_rate": 8e-05, + "loss": 39.0203, + "num_input_tokens_seen": 316061572, + "step": 6132 + }, + { + "epoch": 0.6029335888553107, + "grad_norm": 29.336572647094727, + "learning_rate": 8e-05, + "loss": 37.8433, + "num_input_tokens_seen": 316219240, + "step": 6135 + }, + { + "epoch": 0.6032284219060957, + "grad_norm": 54.04864501953125, + "learning_rate": 8e-05, + "loss": 39.3524, + "num_input_tokens_seen": 316358264, + "step": 6138 + }, + { + "epoch": 0.6035232549568806, + "grad_norm": 59.748313903808594, + "learning_rate": 8e-05, + "loss": 41.2101, + "num_input_tokens_seen": 316524004, + "step": 6141 + }, + { + "epoch": 0.6038180880076657, + "grad_norm": 27.957366943359375, + "learning_rate": 8e-05, + "loss": 35.7421, + "num_input_tokens_seen": 316698732, + "step": 6144 + }, + { + "epoch": 0.6041129210584506, + "grad_norm": 25.74254035949707, + "learning_rate": 8e-05, + "loss": 37.5683, + "num_input_tokens_seen": 316845568, + "step": 6147 + }, + { + "epoch": 0.6044077541092356, + "grad_norm": 22.848718643188477, + "learning_rate": 8e-05, + "loss": 39.1254, + "num_input_tokens_seen": 317006924, + "step": 6150 + }, + { + "epoch": 0.6047025871600207, + "grad_norm": 23.866653442382812, + "learning_rate": 8e-05, + "loss": 34.2666, + "num_input_tokens_seen": 317151316, + "step": 6153 + }, + { + "epoch": 0.6049974202108056, + "grad_norm": 26.67428207397461, + "learning_rate": 8e-05, + "loss": 38.899, + "num_input_tokens_seen": 317308624, + "step": 6156 + }, + { + "epoch": 0.6052922532615906, + "grad_norm": 24.60082244873047, + "learning_rate": 8e-05, + "loss": 37.5207, + "num_input_tokens_seen": 317467004, + "step": 6159 + }, + { + "epoch": 0.6055870863123756, + "grad_norm": 26.759737014770508, + "learning_rate": 8e-05, + "loss": 41.3066, + "num_input_tokens_seen": 317632616, + "step": 6162 + }, + { + "epoch": 0.6058819193631606, + "grad_norm": 25.69239044189453, + "learning_rate": 8e-05, + "loss": 36.3191, + "num_input_tokens_seen": 317795032, + "step": 6165 + }, + { + "epoch": 0.6061767524139456, + "grad_norm": 25.120275497436523, + "learning_rate": 8e-05, + "loss": 39.3155, + "num_input_tokens_seen": 317938576, + "step": 6168 + }, + { + "epoch": 0.6064715854647306, + "grad_norm": 27.428653717041016, + "learning_rate": 8e-05, + "loss": 39.5125, + "num_input_tokens_seen": 318095440, + "step": 6171 + }, + { + "epoch": 0.6067664185155156, + "grad_norm": 26.741228103637695, + "learning_rate": 8e-05, + "loss": 34.3678, + "num_input_tokens_seen": 318268832, + "step": 6174 + }, + { + "epoch": 0.6070612515663005, + "grad_norm": 33.27923583984375, + "learning_rate": 8e-05, + "loss": 37.9281, + "num_input_tokens_seen": 318438624, + "step": 6177 + }, + { + "epoch": 0.6073560846170856, + "grad_norm": 33.121944427490234, + "learning_rate": 8e-05, + "loss": 38.8562, + "num_input_tokens_seen": 318587356, + "step": 6180 + }, + { + "epoch": 0.6076509176678706, + "grad_norm": 25.744670867919922, + "learning_rate": 8e-05, + "loss": 35.9841, + "num_input_tokens_seen": 318738416, + "step": 6183 + }, + { + "epoch": 0.6079457507186555, + "grad_norm": 35.156089782714844, + "learning_rate": 8e-05, + "loss": 39.9863, + "num_input_tokens_seen": 318900128, + "step": 6186 + }, + { + "epoch": 0.6082405837694406, + "grad_norm": 25.188457489013672, + "learning_rate": 8e-05, + "loss": 37.4518, + "num_input_tokens_seen": 319047520, + "step": 6189 + }, + { + "epoch": 0.6085354168202255, + "grad_norm": 31.52955436706543, + "learning_rate": 8e-05, + "loss": 41.9319, + "num_input_tokens_seen": 319213976, + "step": 6192 + }, + { + "epoch": 0.6088302498710105, + "grad_norm": 24.839195251464844, + "learning_rate": 8e-05, + "loss": 38.5171, + "num_input_tokens_seen": 319363700, + "step": 6195 + }, + { + "epoch": 0.6091250829217956, + "grad_norm": 26.186077117919922, + "learning_rate": 8e-05, + "loss": 38.4776, + "num_input_tokens_seen": 319510404, + "step": 6198 + }, + { + "epoch": 0.6094199159725805, + "grad_norm": 28.5213623046875, + "learning_rate": 8e-05, + "loss": 40.5308, + "num_input_tokens_seen": 319679432, + "step": 6201 + }, + { + "epoch": 0.6097147490233655, + "grad_norm": 25.307594299316406, + "learning_rate": 8e-05, + "loss": 36.822, + "num_input_tokens_seen": 319803740, + "step": 6204 + }, + { + "epoch": 0.6100095820741505, + "grad_norm": 28.7880802154541, + "learning_rate": 8e-05, + "loss": 37.6342, + "num_input_tokens_seen": 319950552, + "step": 6207 + }, + { + "epoch": 0.6103044151249355, + "grad_norm": 31.373199462890625, + "learning_rate": 8e-05, + "loss": 40.4808, + "num_input_tokens_seen": 320084952, + "step": 6210 + }, + { + "epoch": 0.6105992481757205, + "grad_norm": 22.3890438079834, + "learning_rate": 8e-05, + "loss": 38.3386, + "num_input_tokens_seen": 320259860, + "step": 6213 + }, + { + "epoch": 0.6108940812265055, + "grad_norm": 22.981782913208008, + "learning_rate": 8e-05, + "loss": 37.9938, + "num_input_tokens_seen": 320413164, + "step": 6216 + }, + { + "epoch": 0.6111889142772905, + "grad_norm": 28.02817153930664, + "learning_rate": 8e-05, + "loss": 37.9879, + "num_input_tokens_seen": 320565088, + "step": 6219 + }, + { + "epoch": 0.6114837473280754, + "grad_norm": 26.173526763916016, + "learning_rate": 8e-05, + "loss": 40.7751, + "num_input_tokens_seen": 320720500, + "step": 6222 + }, + { + "epoch": 0.6117785803788605, + "grad_norm": 24.532196044921875, + "learning_rate": 8e-05, + "loss": 38.0332, + "num_input_tokens_seen": 320866336, + "step": 6225 + }, + { + "epoch": 0.6120734134296455, + "grad_norm": 24.801034927368164, + "learning_rate": 8e-05, + "loss": 33.5498, + "num_input_tokens_seen": 321025468, + "step": 6228 + }, + { + "epoch": 0.6123682464804304, + "grad_norm": 24.207077026367188, + "learning_rate": 8e-05, + "loss": 35.1215, + "num_input_tokens_seen": 321190080, + "step": 6231 + }, + { + "epoch": 0.6126630795312155, + "grad_norm": 23.834396362304688, + "learning_rate": 8e-05, + "loss": 37.5769, + "num_input_tokens_seen": 321342708, + "step": 6234 + }, + { + "epoch": 0.6129579125820005, + "grad_norm": 30.3687744140625, + "learning_rate": 8e-05, + "loss": 37.4518, + "num_input_tokens_seen": 321499416, + "step": 6237 + }, + { + "epoch": 0.6132527456327854, + "grad_norm": 33.319515228271484, + "learning_rate": 8e-05, + "loss": 33.2451, + "num_input_tokens_seen": 321653200, + "step": 6240 + }, + { + "epoch": 0.6135475786835705, + "grad_norm": 22.197099685668945, + "learning_rate": 8e-05, + "loss": 37.2735, + "num_input_tokens_seen": 321814068, + "step": 6243 + }, + { + "epoch": 0.6138424117343554, + "grad_norm": 24.674034118652344, + "learning_rate": 8e-05, + "loss": 41.9069, + "num_input_tokens_seen": 321963460, + "step": 6246 + }, + { + "epoch": 0.6141372447851404, + "grad_norm": 25.704599380493164, + "learning_rate": 8e-05, + "loss": 35.3355, + "num_input_tokens_seen": 322105484, + "step": 6249 + }, + { + "epoch": 0.6144320778359255, + "grad_norm": 68.95780181884766, + "learning_rate": 8e-05, + "loss": 36.8214, + "num_input_tokens_seen": 322262320, + "step": 6252 + }, + { + "epoch": 0.6147269108867104, + "grad_norm": 24.768585205078125, + "learning_rate": 8e-05, + "loss": 38.9012, + "num_input_tokens_seen": 322430264, + "step": 6255 + }, + { + "epoch": 0.6150217439374954, + "grad_norm": 23.958892822265625, + "learning_rate": 8e-05, + "loss": 36.9476, + "num_input_tokens_seen": 322584252, + "step": 6258 + }, + { + "epoch": 0.6153165769882804, + "grad_norm": 22.662036895751953, + "learning_rate": 8e-05, + "loss": 35.6357, + "num_input_tokens_seen": 322735532, + "step": 6261 + }, + { + "epoch": 0.6156114100390654, + "grad_norm": 32.92559814453125, + "learning_rate": 8e-05, + "loss": 39.298, + "num_input_tokens_seen": 322882780, + "step": 6264 + }, + { + "epoch": 0.6159062430898504, + "grad_norm": 25.98016357421875, + "learning_rate": 8e-05, + "loss": 39.7257, + "num_input_tokens_seen": 323034520, + "step": 6267 + }, + { + "epoch": 0.6162010761406354, + "grad_norm": 25.833646774291992, + "learning_rate": 8e-05, + "loss": 36.4177, + "num_input_tokens_seen": 323181256, + "step": 6270 + }, + { + "epoch": 0.6164959091914204, + "grad_norm": 26.506811141967773, + "learning_rate": 8e-05, + "loss": 39.6621, + "num_input_tokens_seen": 323347496, + "step": 6273 + }, + { + "epoch": 0.6167907422422053, + "grad_norm": 24.709272384643555, + "learning_rate": 8e-05, + "loss": 38.6652, + "num_input_tokens_seen": 323495496, + "step": 6276 + }, + { + "epoch": 0.6170855752929904, + "grad_norm": 23.8092041015625, + "learning_rate": 8e-05, + "loss": 30.2883, + "num_input_tokens_seen": 323672384, + "step": 6279 + }, + { + "epoch": 0.6173804083437754, + "grad_norm": 26.053071975708008, + "learning_rate": 8e-05, + "loss": 38.5194, + "num_input_tokens_seen": 323816864, + "step": 6282 + }, + { + "epoch": 0.6176752413945603, + "grad_norm": 22.75402069091797, + "learning_rate": 8e-05, + "loss": 35.6399, + "num_input_tokens_seen": 323975752, + "step": 6285 + }, + { + "epoch": 0.6179700744453454, + "grad_norm": 28.435270309448242, + "learning_rate": 8e-05, + "loss": 40.2471, + "num_input_tokens_seen": 324128112, + "step": 6288 + }, + { + "epoch": 0.6182649074961303, + "grad_norm": 26.416706085205078, + "learning_rate": 8e-05, + "loss": 38.8477, + "num_input_tokens_seen": 324308656, + "step": 6291 + }, + { + "epoch": 0.6185597405469153, + "grad_norm": 27.599233627319336, + "learning_rate": 8e-05, + "loss": 38.1784, + "num_input_tokens_seen": 324462412, + "step": 6294 + }, + { + "epoch": 0.6188545735977004, + "grad_norm": 27.04106330871582, + "learning_rate": 8e-05, + "loss": 40.2296, + "num_input_tokens_seen": 324615088, + "step": 6297 + }, + { + "epoch": 0.6191494066484853, + "grad_norm": 25.297691345214844, + "learning_rate": 8e-05, + "loss": 35.8062, + "num_input_tokens_seen": 324758700, + "step": 6300 + }, + { + "epoch": 0.6194442396992703, + "grad_norm": 40.44841003417969, + "learning_rate": 8e-05, + "loss": 36.9515, + "num_input_tokens_seen": 324912956, + "step": 6303 + }, + { + "epoch": 0.6197390727500552, + "grad_norm": 22.532255172729492, + "learning_rate": 8e-05, + "loss": 35.7363, + "num_input_tokens_seen": 325065772, + "step": 6306 + }, + { + "epoch": 0.6200339058008403, + "grad_norm": 30.187307357788086, + "learning_rate": 8e-05, + "loss": 37.0602, + "num_input_tokens_seen": 325211688, + "step": 6309 + }, + { + "epoch": 0.6203287388516253, + "grad_norm": 29.660106658935547, + "learning_rate": 8e-05, + "loss": 42.3845, + "num_input_tokens_seen": 325346160, + "step": 6312 + }, + { + "epoch": 0.6206235719024102, + "grad_norm": 23.635356903076172, + "learning_rate": 8e-05, + "loss": 34.5203, + "num_input_tokens_seen": 325485300, + "step": 6315 + }, + { + "epoch": 0.6209184049531953, + "grad_norm": 25.654155731201172, + "learning_rate": 8e-05, + "loss": 34.3353, + "num_input_tokens_seen": 325640184, + "step": 6318 + }, + { + "epoch": 0.6212132380039802, + "grad_norm": 26.068546295166016, + "learning_rate": 8e-05, + "loss": 38.6867, + "num_input_tokens_seen": 325788596, + "step": 6321 + }, + { + "epoch": 0.6215080710547652, + "grad_norm": 26.19140625, + "learning_rate": 8e-05, + "loss": 39.2954, + "num_input_tokens_seen": 325935036, + "step": 6324 + }, + { + "epoch": 0.6218029041055503, + "grad_norm": 25.26287841796875, + "learning_rate": 8e-05, + "loss": 39.2243, + "num_input_tokens_seen": 326107384, + "step": 6327 + }, + { + "epoch": 0.6220977371563352, + "grad_norm": 42.484676361083984, + "learning_rate": 8e-05, + "loss": 38.0859, + "num_input_tokens_seen": 326258240, + "step": 6330 + }, + { + "epoch": 0.6223925702071202, + "grad_norm": 28.638322830200195, + "learning_rate": 8e-05, + "loss": 38.5841, + "num_input_tokens_seen": 326408044, + "step": 6333 + }, + { + "epoch": 0.6226874032579052, + "grad_norm": 32.85466384887695, + "learning_rate": 8e-05, + "loss": 38.6516, + "num_input_tokens_seen": 326574272, + "step": 6336 + }, + { + "epoch": 0.6229822363086902, + "grad_norm": 25.046932220458984, + "learning_rate": 8e-05, + "loss": 36.5864, + "num_input_tokens_seen": 326728284, + "step": 6339 + }, + { + "epoch": 0.6232770693594752, + "grad_norm": 25.095548629760742, + "learning_rate": 8e-05, + "loss": 39.4286, + "num_input_tokens_seen": 326868400, + "step": 6342 + }, + { + "epoch": 0.6235719024102602, + "grad_norm": 27.86627960205078, + "learning_rate": 8e-05, + "loss": 36.5636, + "num_input_tokens_seen": 327017628, + "step": 6345 + }, + { + "epoch": 0.6238667354610452, + "grad_norm": 26.67412567138672, + "learning_rate": 8e-05, + "loss": 34.9673, + "num_input_tokens_seen": 327176840, + "step": 6348 + }, + { + "epoch": 0.6241615685118301, + "grad_norm": 21.75296974182129, + "learning_rate": 8e-05, + "loss": 37.4662, + "num_input_tokens_seen": 327348448, + "step": 6351 + }, + { + "epoch": 0.6244564015626152, + "grad_norm": 24.94108009338379, + "learning_rate": 8e-05, + "loss": 37.269, + "num_input_tokens_seen": 327502008, + "step": 6354 + }, + { + "epoch": 0.6247512346134002, + "grad_norm": 25.871793746948242, + "learning_rate": 8e-05, + "loss": 39.1569, + "num_input_tokens_seen": 327664740, + "step": 6357 + }, + { + "epoch": 0.6250460676641851, + "grad_norm": 25.637754440307617, + "learning_rate": 8e-05, + "loss": 35.1057, + "num_input_tokens_seen": 327805228, + "step": 6360 + }, + { + "epoch": 0.6253409007149702, + "grad_norm": 24.5385799407959, + "learning_rate": 8e-05, + "loss": 36.3612, + "num_input_tokens_seen": 327975752, + "step": 6363 + }, + { + "epoch": 0.6256357337657551, + "grad_norm": 22.548460006713867, + "learning_rate": 8e-05, + "loss": 37.2956, + "num_input_tokens_seen": 328108840, + "step": 6366 + }, + { + "epoch": 0.6259305668165401, + "grad_norm": 27.364675521850586, + "learning_rate": 8e-05, + "loss": 37.3522, + "num_input_tokens_seen": 328267088, + "step": 6369 + }, + { + "epoch": 0.6262253998673252, + "grad_norm": 25.948596954345703, + "learning_rate": 8e-05, + "loss": 37.2168, + "num_input_tokens_seen": 328409936, + "step": 6372 + }, + { + "epoch": 0.6265202329181101, + "grad_norm": 25.288524627685547, + "learning_rate": 8e-05, + "loss": 39.4214, + "num_input_tokens_seen": 328557584, + "step": 6375 + }, + { + "epoch": 0.6268150659688951, + "grad_norm": 24.71294593811035, + "learning_rate": 8e-05, + "loss": 40.4651, + "num_input_tokens_seen": 328723240, + "step": 6378 + }, + { + "epoch": 0.6271098990196801, + "grad_norm": 28.631532669067383, + "learning_rate": 8e-05, + "loss": 35.7814, + "num_input_tokens_seen": 328869796, + "step": 6381 + }, + { + "epoch": 0.6274047320704651, + "grad_norm": 26.28819465637207, + "learning_rate": 8e-05, + "loss": 39.6646, + "num_input_tokens_seen": 329019092, + "step": 6384 + }, + { + "epoch": 0.6276995651212501, + "grad_norm": 24.583751678466797, + "learning_rate": 8e-05, + "loss": 36.6027, + "num_input_tokens_seen": 329150600, + "step": 6387 + }, + { + "epoch": 0.6279943981720351, + "grad_norm": 26.986366271972656, + "learning_rate": 8e-05, + "loss": 38.0961, + "num_input_tokens_seen": 329290408, + "step": 6390 + }, + { + "epoch": 0.6282892312228201, + "grad_norm": 24.84749984741211, + "learning_rate": 8e-05, + "loss": 39.3869, + "num_input_tokens_seen": 329450420, + "step": 6393 + }, + { + "epoch": 0.628584064273605, + "grad_norm": 28.76380157470703, + "learning_rate": 8e-05, + "loss": 36.9979, + "num_input_tokens_seen": 329609004, + "step": 6396 + }, + { + "epoch": 0.6288788973243901, + "grad_norm": 23.334930419921875, + "learning_rate": 8e-05, + "loss": 37.4399, + "num_input_tokens_seen": 329778436, + "step": 6399 + }, + { + "epoch": 0.6291737303751751, + "grad_norm": 25.433956146240234, + "learning_rate": 8e-05, + "loss": 36.705, + "num_input_tokens_seen": 329929608, + "step": 6402 + }, + { + "epoch": 0.62946856342596, + "grad_norm": 27.61007308959961, + "learning_rate": 8e-05, + "loss": 40.0462, + "num_input_tokens_seen": 330078108, + "step": 6405 + }, + { + "epoch": 0.6297633964767451, + "grad_norm": 35.30503845214844, + "learning_rate": 8e-05, + "loss": 40.6168, + "num_input_tokens_seen": 330244708, + "step": 6408 + }, + { + "epoch": 0.63005822952753, + "grad_norm": 35.08722686767578, + "learning_rate": 8e-05, + "loss": 38.4732, + "num_input_tokens_seen": 330386136, + "step": 6411 + }, + { + "epoch": 0.630353062578315, + "grad_norm": 37.645442962646484, + "learning_rate": 8e-05, + "loss": 38.4413, + "num_input_tokens_seen": 330530964, + "step": 6414 + }, + { + "epoch": 0.6306478956291001, + "grad_norm": 27.654539108276367, + "learning_rate": 8e-05, + "loss": 37.3986, + "num_input_tokens_seen": 330677096, + "step": 6417 + }, + { + "epoch": 0.630942728679885, + "grad_norm": 32.413185119628906, + "learning_rate": 8e-05, + "loss": 36.1898, + "num_input_tokens_seen": 330847940, + "step": 6420 + }, + { + "epoch": 0.63123756173067, + "grad_norm": 32.70518112182617, + "learning_rate": 8e-05, + "loss": 35.777, + "num_input_tokens_seen": 331014036, + "step": 6423 + }, + { + "epoch": 0.631532394781455, + "grad_norm": 27.792139053344727, + "learning_rate": 8e-05, + "loss": 41.193, + "num_input_tokens_seen": 331203516, + "step": 6426 + }, + { + "epoch": 0.63182722783224, + "grad_norm": 28.370925903320312, + "learning_rate": 8e-05, + "loss": 35.5362, + "num_input_tokens_seen": 331365772, + "step": 6429 + }, + { + "epoch": 0.632122060883025, + "grad_norm": 26.484954833984375, + "learning_rate": 8e-05, + "loss": 34.9362, + "num_input_tokens_seen": 331512204, + "step": 6432 + }, + { + "epoch": 0.63241689393381, + "grad_norm": 27.78957748413086, + "learning_rate": 8e-05, + "loss": 38.4179, + "num_input_tokens_seen": 331654600, + "step": 6435 + }, + { + "epoch": 0.632711726984595, + "grad_norm": 24.038084030151367, + "learning_rate": 8e-05, + "loss": 40.5222, + "num_input_tokens_seen": 331813988, + "step": 6438 + }, + { + "epoch": 0.6330065600353799, + "grad_norm": 25.796804428100586, + "learning_rate": 8e-05, + "loss": 37.0437, + "num_input_tokens_seen": 331970924, + "step": 6441 + }, + { + "epoch": 0.633301393086165, + "grad_norm": 24.9102725982666, + "learning_rate": 8e-05, + "loss": 38.9791, + "num_input_tokens_seen": 332128504, + "step": 6444 + }, + { + "epoch": 0.63359622613695, + "grad_norm": 36.67914581298828, + "learning_rate": 8e-05, + "loss": 38.2588, + "num_input_tokens_seen": 332294876, + "step": 6447 + }, + { + "epoch": 0.6338910591877349, + "grad_norm": 24.813623428344727, + "learning_rate": 8e-05, + "loss": 36.5417, + "num_input_tokens_seen": 332435244, + "step": 6450 + }, + { + "epoch": 0.63418589223852, + "grad_norm": 25.02754020690918, + "learning_rate": 8e-05, + "loss": 38.0281, + "num_input_tokens_seen": 332585000, + "step": 6453 + }, + { + "epoch": 0.6344807252893049, + "grad_norm": 28.043087005615234, + "learning_rate": 8e-05, + "loss": 36.1322, + "num_input_tokens_seen": 332746468, + "step": 6456 + }, + { + "epoch": 0.6347755583400899, + "grad_norm": 26.460355758666992, + "learning_rate": 8e-05, + "loss": 37.103, + "num_input_tokens_seen": 332895956, + "step": 6459 + }, + { + "epoch": 0.635070391390875, + "grad_norm": 25.79451560974121, + "learning_rate": 8e-05, + "loss": 41.6694, + "num_input_tokens_seen": 333051188, + "step": 6462 + }, + { + "epoch": 0.6353652244416599, + "grad_norm": 33.36618423461914, + "learning_rate": 8e-05, + "loss": 39.547, + "num_input_tokens_seen": 333195848, + "step": 6465 + }, + { + "epoch": 0.6356600574924449, + "grad_norm": 36.27760314941406, + "learning_rate": 8e-05, + "loss": 32.7442, + "num_input_tokens_seen": 333349552, + "step": 6468 + }, + { + "epoch": 0.6359548905432298, + "grad_norm": 34.557708740234375, + "learning_rate": 8e-05, + "loss": 36.7705, + "num_input_tokens_seen": 333490864, + "step": 6471 + }, + { + "epoch": 0.6362497235940149, + "grad_norm": 40.10776138305664, + "learning_rate": 8e-05, + "loss": 38.0712, + "num_input_tokens_seen": 333639924, + "step": 6474 + }, + { + "epoch": 0.6365445566447999, + "grad_norm": 27.282073974609375, + "learning_rate": 8e-05, + "loss": 37.5385, + "num_input_tokens_seen": 333774356, + "step": 6477 + }, + { + "epoch": 0.6368393896955848, + "grad_norm": 35.44887924194336, + "learning_rate": 8e-05, + "loss": 38.0226, + "num_input_tokens_seen": 333934716, + "step": 6480 + }, + { + "epoch": 0.6371342227463699, + "grad_norm": 25.15498161315918, + "learning_rate": 8e-05, + "loss": 40.2181, + "num_input_tokens_seen": 334098292, + "step": 6483 + }, + { + "epoch": 0.6374290557971548, + "grad_norm": 23.980417251586914, + "learning_rate": 8e-05, + "loss": 38.9857, + "num_input_tokens_seen": 334266788, + "step": 6486 + }, + { + "epoch": 0.6377238888479398, + "grad_norm": 26.44454002380371, + "learning_rate": 8e-05, + "loss": 37.4722, + "num_input_tokens_seen": 334416364, + "step": 6489 + }, + { + "epoch": 0.6380187218987249, + "grad_norm": 30.107446670532227, + "learning_rate": 8e-05, + "loss": 39.8594, + "num_input_tokens_seen": 334556328, + "step": 6492 + }, + { + "epoch": 0.6383135549495098, + "grad_norm": 25.60369300842285, + "learning_rate": 8e-05, + "loss": 38.6356, + "num_input_tokens_seen": 334715092, + "step": 6495 + }, + { + "epoch": 0.6386083880002948, + "grad_norm": 23.690120697021484, + "learning_rate": 8e-05, + "loss": 37.2159, + "num_input_tokens_seen": 334872568, + "step": 6498 + }, + { + "epoch": 0.6389032210510798, + "grad_norm": 25.064729690551758, + "learning_rate": 8e-05, + "loss": 38.6088, + "num_input_tokens_seen": 335003784, + "step": 6501 + }, + { + "epoch": 0.6391980541018648, + "grad_norm": 25.150876998901367, + "learning_rate": 8e-05, + "loss": 34.2671, + "num_input_tokens_seen": 335149108, + "step": 6504 + }, + { + "epoch": 0.6394928871526498, + "grad_norm": 27.666296005249023, + "learning_rate": 8e-05, + "loss": 39.7811, + "num_input_tokens_seen": 335302152, + "step": 6507 + }, + { + "epoch": 0.6397877202034348, + "grad_norm": 29.01523208618164, + "learning_rate": 8e-05, + "loss": 35.9424, + "num_input_tokens_seen": 335457552, + "step": 6510 + }, + { + "epoch": 0.6400825532542198, + "grad_norm": 20.90727996826172, + "learning_rate": 8e-05, + "loss": 32.7056, + "num_input_tokens_seen": 335620096, + "step": 6513 + }, + { + "epoch": 0.6403773863050047, + "grad_norm": 22.06777572631836, + "learning_rate": 8e-05, + "loss": 38.796, + "num_input_tokens_seen": 335769336, + "step": 6516 + }, + { + "epoch": 0.6406722193557898, + "grad_norm": 23.688201904296875, + "learning_rate": 8e-05, + "loss": 35.616, + "num_input_tokens_seen": 335919728, + "step": 6519 + }, + { + "epoch": 0.6409670524065748, + "grad_norm": 33.622276306152344, + "learning_rate": 8e-05, + "loss": 37.4969, + "num_input_tokens_seen": 336067104, + "step": 6522 + }, + { + "epoch": 0.6412618854573597, + "grad_norm": 26.230440139770508, + "learning_rate": 8e-05, + "loss": 38.4008, + "num_input_tokens_seen": 336207588, + "step": 6525 + }, + { + "epoch": 0.6415567185081448, + "grad_norm": 46.02008819580078, + "learning_rate": 8e-05, + "loss": 35.475, + "num_input_tokens_seen": 336358368, + "step": 6528 + }, + { + "epoch": 0.6418515515589298, + "grad_norm": 26.169721603393555, + "learning_rate": 8e-05, + "loss": 41.3722, + "num_input_tokens_seen": 336497716, + "step": 6531 + }, + { + "epoch": 0.6421463846097147, + "grad_norm": 26.567733764648438, + "learning_rate": 8e-05, + "loss": 38.4448, + "num_input_tokens_seen": 336646012, + "step": 6534 + }, + { + "epoch": 0.6424412176604998, + "grad_norm": 26.18414306640625, + "learning_rate": 8e-05, + "loss": 33.4987, + "num_input_tokens_seen": 336785844, + "step": 6537 + }, + { + "epoch": 0.6427360507112847, + "grad_norm": 27.476449966430664, + "learning_rate": 8e-05, + "loss": 38.427, + "num_input_tokens_seen": 336945340, + "step": 6540 + }, + { + "epoch": 0.6430308837620697, + "grad_norm": 24.81687355041504, + "learning_rate": 8e-05, + "loss": 38.1676, + "num_input_tokens_seen": 337073892, + "step": 6543 + }, + { + "epoch": 0.6433257168128548, + "grad_norm": 28.00323486328125, + "learning_rate": 8e-05, + "loss": 38.1138, + "num_input_tokens_seen": 337223080, + "step": 6546 + }, + { + "epoch": 0.6436205498636397, + "grad_norm": 31.119070053100586, + "learning_rate": 8e-05, + "loss": 36.6106, + "num_input_tokens_seen": 337369276, + "step": 6549 + }, + { + "epoch": 0.6439153829144247, + "grad_norm": 26.03184700012207, + "learning_rate": 8e-05, + "loss": 34.1828, + "num_input_tokens_seen": 337537580, + "step": 6552 + }, + { + "epoch": 0.6442102159652097, + "grad_norm": 25.940513610839844, + "learning_rate": 8e-05, + "loss": 37.7545, + "num_input_tokens_seen": 337735200, + "step": 6555 + }, + { + "epoch": 0.6445050490159947, + "grad_norm": 27.552566528320312, + "learning_rate": 8e-05, + "loss": 36.6377, + "num_input_tokens_seen": 337885680, + "step": 6558 + }, + { + "epoch": 0.6447998820667797, + "grad_norm": 25.005868911743164, + "learning_rate": 8e-05, + "loss": 38.7374, + "num_input_tokens_seen": 338018188, + "step": 6561 + }, + { + "epoch": 0.6450947151175647, + "grad_norm": 26.592487335205078, + "learning_rate": 8e-05, + "loss": 39.5499, + "num_input_tokens_seen": 338171564, + "step": 6564 + }, + { + "epoch": 0.6453895481683497, + "grad_norm": 27.41492462158203, + "learning_rate": 8e-05, + "loss": 36.1295, + "num_input_tokens_seen": 338342292, + "step": 6567 + }, + { + "epoch": 0.6456843812191346, + "grad_norm": 30.22612190246582, + "learning_rate": 8e-05, + "loss": 39.7426, + "num_input_tokens_seen": 338498444, + "step": 6570 + }, + { + "epoch": 0.6459792142699197, + "grad_norm": 28.708240509033203, + "learning_rate": 8e-05, + "loss": 39.4286, + "num_input_tokens_seen": 338668028, + "step": 6573 + }, + { + "epoch": 0.6462740473207047, + "grad_norm": 23.029550552368164, + "learning_rate": 8e-05, + "loss": 36.3736, + "num_input_tokens_seen": 338820384, + "step": 6576 + }, + { + "epoch": 0.6465688803714896, + "grad_norm": 37.06300735473633, + "learning_rate": 8e-05, + "loss": 38.5943, + "num_input_tokens_seen": 338972760, + "step": 6579 + }, + { + "epoch": 0.6468637134222747, + "grad_norm": 32.833038330078125, + "learning_rate": 8e-05, + "loss": 37.1746, + "num_input_tokens_seen": 339144456, + "step": 6582 + }, + { + "epoch": 0.6471585464730596, + "grad_norm": 29.135648727416992, + "learning_rate": 8e-05, + "loss": 38.2403, + "num_input_tokens_seen": 339275080, + "step": 6585 + }, + { + "epoch": 0.6474533795238446, + "grad_norm": 28.29663848876953, + "learning_rate": 8e-05, + "loss": 41.0382, + "num_input_tokens_seen": 339421484, + "step": 6588 + }, + { + "epoch": 0.6477482125746297, + "grad_norm": 28.951173782348633, + "learning_rate": 8e-05, + "loss": 36.4348, + "num_input_tokens_seen": 339600020, + "step": 6591 + }, + { + "epoch": 0.6480430456254146, + "grad_norm": 28.281675338745117, + "learning_rate": 8e-05, + "loss": 38.8957, + "num_input_tokens_seen": 339757804, + "step": 6594 + }, + { + "epoch": 0.6483378786761996, + "grad_norm": 34.26532745361328, + "learning_rate": 8e-05, + "loss": 35.5098, + "num_input_tokens_seen": 339923508, + "step": 6597 + }, + { + "epoch": 0.6486327117269846, + "grad_norm": 23.300016403198242, + "learning_rate": 8e-05, + "loss": 33.0052, + "num_input_tokens_seen": 340083980, + "step": 6600 + }, + { + "epoch": 0.6489275447777696, + "grad_norm": 27.63614845275879, + "learning_rate": 8e-05, + "loss": 38.4134, + "num_input_tokens_seen": 340239028, + "step": 6603 + }, + { + "epoch": 0.6492223778285546, + "grad_norm": 33.11357498168945, + "learning_rate": 8e-05, + "loss": 42.3899, + "num_input_tokens_seen": 340405744, + "step": 6606 + }, + { + "epoch": 0.6495172108793396, + "grad_norm": 25.984708786010742, + "learning_rate": 8e-05, + "loss": 40.0451, + "num_input_tokens_seen": 340542892, + "step": 6609 + }, + { + "epoch": 0.6498120439301246, + "grad_norm": 24.340328216552734, + "learning_rate": 8e-05, + "loss": 41.1376, + "num_input_tokens_seen": 340678260, + "step": 6612 + }, + { + "epoch": 0.6501068769809095, + "grad_norm": 28.840999603271484, + "learning_rate": 8e-05, + "loss": 29.5664, + "num_input_tokens_seen": 340839812, + "step": 6615 + }, + { + "epoch": 0.6504017100316946, + "grad_norm": 29.218814849853516, + "learning_rate": 8e-05, + "loss": 35.1434, + "num_input_tokens_seen": 340998380, + "step": 6618 + }, + { + "epoch": 0.6506965430824796, + "grad_norm": 221.42181396484375, + "learning_rate": 8e-05, + "loss": 39.2533, + "num_input_tokens_seen": 341162476, + "step": 6621 + }, + { + "epoch": 0.6509913761332645, + "grad_norm": 33.743621826171875, + "learning_rate": 8e-05, + "loss": 35.9878, + "num_input_tokens_seen": 341301756, + "step": 6624 + }, + { + "epoch": 0.6512862091840496, + "grad_norm": 31.24152183532715, + "learning_rate": 8e-05, + "loss": 36.8398, + "num_input_tokens_seen": 341458280, + "step": 6627 + }, + { + "epoch": 0.6515810422348345, + "grad_norm": 29.4301815032959, + "learning_rate": 8e-05, + "loss": 36.0449, + "num_input_tokens_seen": 341624296, + "step": 6630 + }, + { + "epoch": 0.6518758752856195, + "grad_norm": 22.053237915039062, + "learning_rate": 8e-05, + "loss": 34.3092, + "num_input_tokens_seen": 341804404, + "step": 6633 + }, + { + "epoch": 0.6521707083364046, + "grad_norm": 23.275348663330078, + "learning_rate": 8e-05, + "loss": 36.9765, + "num_input_tokens_seen": 341964004, + "step": 6636 + }, + { + "epoch": 0.6524655413871895, + "grad_norm": 26.191434860229492, + "learning_rate": 8e-05, + "loss": 39.8154, + "num_input_tokens_seen": 342114328, + "step": 6639 + }, + { + "epoch": 0.6527603744379745, + "grad_norm": 38.87674331665039, + "learning_rate": 8e-05, + "loss": 41.3826, + "num_input_tokens_seen": 342277812, + "step": 6642 + }, + { + "epoch": 0.6530552074887594, + "grad_norm": 24.583362579345703, + "learning_rate": 8e-05, + "loss": 35.0064, + "num_input_tokens_seen": 342421428, + "step": 6645 + }, + { + "epoch": 0.6533500405395445, + "grad_norm": 441.7207946777344, + "learning_rate": 8e-05, + "loss": 32.8596, + "num_input_tokens_seen": 342567780, + "step": 6648 + }, + { + "epoch": 0.6536448735903295, + "grad_norm": 24.749300003051758, + "learning_rate": 8e-05, + "loss": 37.5455, + "num_input_tokens_seen": 342740652, + "step": 6651 + }, + { + "epoch": 0.6539397066411144, + "grad_norm": 26.746244430541992, + "learning_rate": 8e-05, + "loss": 36.2025, + "num_input_tokens_seen": 342880932, + "step": 6654 + }, + { + "epoch": 0.6542345396918995, + "grad_norm": 72.51927947998047, + "learning_rate": 8e-05, + "loss": 39.4842, + "num_input_tokens_seen": 343042572, + "step": 6657 + }, + { + "epoch": 0.6545293727426844, + "grad_norm": 24.986759185791016, + "learning_rate": 8e-05, + "loss": 34.271, + "num_input_tokens_seen": 343196592, + "step": 6660 + }, + { + "epoch": 0.6548242057934694, + "grad_norm": 23.670583724975586, + "learning_rate": 8e-05, + "loss": 38.3177, + "num_input_tokens_seen": 343341208, + "step": 6663 + }, + { + "epoch": 0.6551190388442545, + "grad_norm": 22.490602493286133, + "learning_rate": 8e-05, + "loss": 37.3046, + "num_input_tokens_seen": 343512440, + "step": 6666 + }, + { + "epoch": 0.6554138718950394, + "grad_norm": 29.031286239624023, + "learning_rate": 8e-05, + "loss": 42.1513, + "num_input_tokens_seen": 343664580, + "step": 6669 + }, + { + "epoch": 0.6557087049458244, + "grad_norm": 27.807151794433594, + "learning_rate": 8e-05, + "loss": 36.1717, + "num_input_tokens_seen": 343818232, + "step": 6672 + }, + { + "epoch": 0.6560035379966094, + "grad_norm": 41.40616226196289, + "learning_rate": 8e-05, + "loss": 33.0627, + "num_input_tokens_seen": 343975788, + "step": 6675 + }, + { + "epoch": 0.6562983710473944, + "grad_norm": 27.92901611328125, + "learning_rate": 8e-05, + "loss": 32.2173, + "num_input_tokens_seen": 344110348, + "step": 6678 + }, + { + "epoch": 0.6565932040981795, + "grad_norm": 27.682600021362305, + "learning_rate": 8e-05, + "loss": 39.627, + "num_input_tokens_seen": 344263136, + "step": 6681 + }, + { + "epoch": 0.6568880371489644, + "grad_norm": 25.742345809936523, + "learning_rate": 8e-05, + "loss": 36.5412, + "num_input_tokens_seen": 344415904, + "step": 6684 + }, + { + "epoch": 0.6571828701997494, + "grad_norm": 23.010465621948242, + "learning_rate": 8e-05, + "loss": 37.7615, + "num_input_tokens_seen": 344578904, + "step": 6687 + }, + { + "epoch": 0.6574777032505343, + "grad_norm": 25.112548828125, + "learning_rate": 8e-05, + "loss": 34.0272, + "num_input_tokens_seen": 344717536, + "step": 6690 + }, + { + "epoch": 0.6577725363013194, + "grad_norm": 26.201248168945312, + "learning_rate": 8e-05, + "loss": 35.2759, + "num_input_tokens_seen": 344869812, + "step": 6693 + }, + { + "epoch": 0.6580673693521044, + "grad_norm": 25.718441009521484, + "learning_rate": 8e-05, + "loss": 38.9038, + "num_input_tokens_seen": 345037436, + "step": 6696 + }, + { + "epoch": 0.6583622024028893, + "grad_norm": 31.64290428161621, + "learning_rate": 8e-05, + "loss": 34.065, + "num_input_tokens_seen": 345174996, + "step": 6699 + }, + { + "epoch": 0.6586570354536744, + "grad_norm": 27.697294235229492, + "learning_rate": 8e-05, + "loss": 34.2109, + "num_input_tokens_seen": 345324196, + "step": 6702 + }, + { + "epoch": 0.6589518685044593, + "grad_norm": 30.634992599487305, + "learning_rate": 8e-05, + "loss": 35.4552, + "num_input_tokens_seen": 345477268, + "step": 6705 + }, + { + "epoch": 0.6592467015552443, + "grad_norm": 25.306087493896484, + "learning_rate": 8e-05, + "loss": 39.5236, + "num_input_tokens_seen": 345619356, + "step": 6708 + }, + { + "epoch": 0.6595415346060294, + "grad_norm": 26.60824966430664, + "learning_rate": 8e-05, + "loss": 38.9368, + "num_input_tokens_seen": 345756940, + "step": 6711 + }, + { + "epoch": 0.6598363676568143, + "grad_norm": 25.774150848388672, + "learning_rate": 8e-05, + "loss": 40.7406, + "num_input_tokens_seen": 345918180, + "step": 6714 + }, + { + "epoch": 0.6601312007075993, + "grad_norm": 23.252714157104492, + "learning_rate": 8e-05, + "loss": 38.6923, + "num_input_tokens_seen": 346046492, + "step": 6717 + }, + { + "epoch": 0.6604260337583843, + "grad_norm": 23.236473083496094, + "learning_rate": 8e-05, + "loss": 37.0205, + "num_input_tokens_seen": 346207188, + "step": 6720 + }, + { + "epoch": 0.6607208668091693, + "grad_norm": 22.641433715820312, + "learning_rate": 8e-05, + "loss": 35.8195, + "num_input_tokens_seen": 346358216, + "step": 6723 + }, + { + "epoch": 0.6610156998599543, + "grad_norm": 32.38287353515625, + "learning_rate": 8e-05, + "loss": 37.4374, + "num_input_tokens_seen": 346508352, + "step": 6726 + }, + { + "epoch": 0.6613105329107393, + "grad_norm": 23.829994201660156, + "learning_rate": 8e-05, + "loss": 38.3075, + "num_input_tokens_seen": 346658684, + "step": 6729 + }, + { + "epoch": 0.6616053659615243, + "grad_norm": 20.449419021606445, + "learning_rate": 8e-05, + "loss": 32.9127, + "num_input_tokens_seen": 346829520, + "step": 6732 + }, + { + "epoch": 0.6619001990123092, + "grad_norm": 25.241458892822266, + "learning_rate": 8e-05, + "loss": 37.9814, + "num_input_tokens_seen": 346978620, + "step": 6735 + }, + { + "epoch": 0.6621950320630943, + "grad_norm": 40.08452224731445, + "learning_rate": 8e-05, + "loss": 35.3977, + "num_input_tokens_seen": 347129724, + "step": 6738 + }, + { + "epoch": 0.6624898651138793, + "grad_norm": 25.62432289123535, + "learning_rate": 8e-05, + "loss": 37.1599, + "num_input_tokens_seen": 347281612, + "step": 6741 + }, + { + "epoch": 0.6627846981646642, + "grad_norm": 26.09302520751953, + "learning_rate": 8e-05, + "loss": 35.738, + "num_input_tokens_seen": 347432960, + "step": 6744 + }, + { + "epoch": 0.6630795312154493, + "grad_norm": 28.28545570373535, + "learning_rate": 8e-05, + "loss": 37.3327, + "num_input_tokens_seen": 347581192, + "step": 6747 + }, + { + "epoch": 0.6633743642662342, + "grad_norm": 65.24707794189453, + "learning_rate": 8e-05, + "loss": 34.8417, + "num_input_tokens_seen": 347696544, + "step": 6750 + }, + { + "epoch": 0.6636691973170192, + "grad_norm": 37.428218841552734, + "learning_rate": 8e-05, + "loss": 39.3613, + "num_input_tokens_seen": 347855764, + "step": 6753 + }, + { + "epoch": 0.6639640303678043, + "grad_norm": 23.67691993713379, + "learning_rate": 8e-05, + "loss": 38.6025, + "num_input_tokens_seen": 348022128, + "step": 6756 + }, + { + "epoch": 0.6642588634185892, + "grad_norm": 25.112234115600586, + "learning_rate": 8e-05, + "loss": 40.1824, + "num_input_tokens_seen": 348192976, + "step": 6759 + }, + { + "epoch": 0.6645536964693742, + "grad_norm": 29.855688095092773, + "learning_rate": 8e-05, + "loss": 41.8466, + "num_input_tokens_seen": 348363652, + "step": 6762 + }, + { + "epoch": 0.6648485295201592, + "grad_norm": 25.25415802001953, + "learning_rate": 8e-05, + "loss": 37.8966, + "num_input_tokens_seen": 348530436, + "step": 6765 + }, + { + "epoch": 0.6651433625709442, + "grad_norm": 25.98432159423828, + "learning_rate": 8e-05, + "loss": 36.1825, + "num_input_tokens_seen": 348684660, + "step": 6768 + }, + { + "epoch": 0.6654381956217292, + "grad_norm": 29.960161209106445, + "learning_rate": 8e-05, + "loss": 37.6181, + "num_input_tokens_seen": 348820632, + "step": 6771 + }, + { + "epoch": 0.6657330286725142, + "grad_norm": 36.65909957885742, + "learning_rate": 8e-05, + "loss": 35.4403, + "num_input_tokens_seen": 348985784, + "step": 6774 + }, + { + "epoch": 0.6660278617232992, + "grad_norm": 34.468509674072266, + "learning_rate": 8e-05, + "loss": 40.4557, + "num_input_tokens_seen": 349137960, + "step": 6777 + }, + { + "epoch": 0.6663226947740841, + "grad_norm": 24.24013328552246, + "learning_rate": 8e-05, + "loss": 37.4488, + "num_input_tokens_seen": 349282196, + "step": 6780 + }, + { + "epoch": 0.6666175278248692, + "grad_norm": 38.20613479614258, + "learning_rate": 8e-05, + "loss": 39.7592, + "num_input_tokens_seen": 349430124, + "step": 6783 + }, + { + "epoch": 0.6669123608756542, + "grad_norm": 39.29615783691406, + "learning_rate": 8e-05, + "loss": 36.0311, + "num_input_tokens_seen": 349590896, + "step": 6786 + }, + { + "epoch": 0.6672071939264391, + "grad_norm": 27.678237915039062, + "learning_rate": 8e-05, + "loss": 41.4173, + "num_input_tokens_seen": 349741976, + "step": 6789 + }, + { + "epoch": 0.6675020269772242, + "grad_norm": 23.54377555847168, + "learning_rate": 8e-05, + "loss": 35.8037, + "num_input_tokens_seen": 349891584, + "step": 6792 + }, + { + "epoch": 0.6677968600280091, + "grad_norm": 22.418794631958008, + "learning_rate": 8e-05, + "loss": 37.0058, + "num_input_tokens_seen": 350048688, + "step": 6795 + }, + { + "epoch": 0.6680916930787941, + "grad_norm": 27.08586883544922, + "learning_rate": 8e-05, + "loss": 35.055, + "num_input_tokens_seen": 350209312, + "step": 6798 + }, + { + "epoch": 0.6683865261295792, + "grad_norm": 32.486961364746094, + "learning_rate": 8e-05, + "loss": 40.9057, + "num_input_tokens_seen": 350366576, + "step": 6801 + }, + { + "epoch": 0.6686813591803641, + "grad_norm": 29.93039321899414, + "learning_rate": 8e-05, + "loss": 41.0914, + "num_input_tokens_seen": 350534288, + "step": 6804 + }, + { + "epoch": 0.6689761922311491, + "grad_norm": 26.834508895874023, + "learning_rate": 8e-05, + "loss": 35.2028, + "num_input_tokens_seen": 350683428, + "step": 6807 + }, + { + "epoch": 0.669271025281934, + "grad_norm": 23.54694175720215, + "learning_rate": 8e-05, + "loss": 37.2137, + "num_input_tokens_seen": 350839888, + "step": 6810 + }, + { + "epoch": 0.6695658583327191, + "grad_norm": 53.113704681396484, + "learning_rate": 8e-05, + "loss": 35.2484, + "num_input_tokens_seen": 350995556, + "step": 6813 + }, + { + "epoch": 0.6698606913835041, + "grad_norm": 28.740299224853516, + "learning_rate": 8e-05, + "loss": 38.2213, + "num_input_tokens_seen": 351154384, + "step": 6816 + }, + { + "epoch": 0.670155524434289, + "grad_norm": 25.740116119384766, + "learning_rate": 8e-05, + "loss": 36.9882, + "num_input_tokens_seen": 351302160, + "step": 6819 + }, + { + "epoch": 0.6704503574850741, + "grad_norm": 26.176889419555664, + "learning_rate": 8e-05, + "loss": 35.5851, + "num_input_tokens_seen": 351441112, + "step": 6822 + }, + { + "epoch": 0.670745190535859, + "grad_norm": 24.30504608154297, + "learning_rate": 8e-05, + "loss": 36.2588, + "num_input_tokens_seen": 351586372, + "step": 6825 + }, + { + "epoch": 0.671040023586644, + "grad_norm": 28.314659118652344, + "learning_rate": 8e-05, + "loss": 39.4041, + "num_input_tokens_seen": 351748720, + "step": 6828 + }, + { + "epoch": 0.6713348566374291, + "grad_norm": 25.78873062133789, + "learning_rate": 8e-05, + "loss": 35.5167, + "num_input_tokens_seen": 351929488, + "step": 6831 + }, + { + "epoch": 0.671629689688214, + "grad_norm": 29.422073364257812, + "learning_rate": 8e-05, + "loss": 38.3875, + "num_input_tokens_seen": 352077568, + "step": 6834 + }, + { + "epoch": 0.671924522738999, + "grad_norm": 25.27082633972168, + "learning_rate": 8e-05, + "loss": 35.4953, + "num_input_tokens_seen": 352236692, + "step": 6837 + }, + { + "epoch": 0.6722193557897841, + "grad_norm": 25.59380340576172, + "learning_rate": 8e-05, + "loss": 38.9723, + "num_input_tokens_seen": 352393932, + "step": 6840 + }, + { + "epoch": 0.672514188840569, + "grad_norm": 29.449504852294922, + "learning_rate": 8e-05, + "loss": 43.4062, + "num_input_tokens_seen": 352557020, + "step": 6843 + }, + { + "epoch": 0.672809021891354, + "grad_norm": 23.7353458404541, + "learning_rate": 8e-05, + "loss": 35.9395, + "num_input_tokens_seen": 352720728, + "step": 6846 + }, + { + "epoch": 0.673103854942139, + "grad_norm": 20.535308837890625, + "learning_rate": 8e-05, + "loss": 37.3628, + "num_input_tokens_seen": 352873480, + "step": 6849 + }, + { + "epoch": 0.673398687992924, + "grad_norm": 24.83087158203125, + "learning_rate": 8e-05, + "loss": 36.0663, + "num_input_tokens_seen": 353039368, + "step": 6852 + }, + { + "epoch": 0.673693521043709, + "grad_norm": 27.142587661743164, + "learning_rate": 8e-05, + "loss": 39.2065, + "num_input_tokens_seen": 353182328, + "step": 6855 + }, + { + "epoch": 0.673988354094494, + "grad_norm": 26.528867721557617, + "learning_rate": 8e-05, + "loss": 37.2345, + "num_input_tokens_seen": 353341440, + "step": 6858 + }, + { + "epoch": 0.674283187145279, + "grad_norm": 26.108625411987305, + "learning_rate": 8e-05, + "loss": 35.812, + "num_input_tokens_seen": 353509616, + "step": 6861 + }, + { + "epoch": 0.6745780201960639, + "grad_norm": 23.73592758178711, + "learning_rate": 8e-05, + "loss": 36.0127, + "num_input_tokens_seen": 353648016, + "step": 6864 + }, + { + "epoch": 0.674872853246849, + "grad_norm": 22.257362365722656, + "learning_rate": 8e-05, + "loss": 39.6556, + "num_input_tokens_seen": 353797716, + "step": 6867 + }, + { + "epoch": 0.675167686297634, + "grad_norm": 24.35578727722168, + "learning_rate": 8e-05, + "loss": 36.7126, + "num_input_tokens_seen": 353957804, + "step": 6870 + }, + { + "epoch": 0.6754625193484189, + "grad_norm": 26.841032028198242, + "learning_rate": 8e-05, + "loss": 35.9122, + "num_input_tokens_seen": 354111364, + "step": 6873 + }, + { + "epoch": 0.675757352399204, + "grad_norm": 33.47208023071289, + "learning_rate": 8e-05, + "loss": 40.0805, + "num_input_tokens_seen": 354243644, + "step": 6876 + }, + { + "epoch": 0.6760521854499889, + "grad_norm": 23.950185775756836, + "learning_rate": 8e-05, + "loss": 39.7134, + "num_input_tokens_seen": 354412388, + "step": 6879 + }, + { + "epoch": 0.6763470185007739, + "grad_norm": 23.655397415161133, + "learning_rate": 8e-05, + "loss": 39.7507, + "num_input_tokens_seen": 354566740, + "step": 6882 + }, + { + "epoch": 0.676641851551559, + "grad_norm": 25.052453994750977, + "learning_rate": 8e-05, + "loss": 40.9295, + "num_input_tokens_seen": 354715756, + "step": 6885 + }, + { + "epoch": 0.6769366846023439, + "grad_norm": 36.31843948364258, + "learning_rate": 8e-05, + "loss": 36.5385, + "num_input_tokens_seen": 354867004, + "step": 6888 + }, + { + "epoch": 0.677231517653129, + "grad_norm": 32.722171783447266, + "learning_rate": 8e-05, + "loss": 34.0506, + "num_input_tokens_seen": 355026580, + "step": 6891 + }, + { + "epoch": 0.6775263507039139, + "grad_norm": 45.20307922363281, + "learning_rate": 8e-05, + "loss": 35.6528, + "num_input_tokens_seen": 355162704, + "step": 6894 + }, + { + "epoch": 0.6778211837546989, + "grad_norm": 24.55483055114746, + "learning_rate": 8e-05, + "loss": 37.1783, + "num_input_tokens_seen": 355307844, + "step": 6897 + }, + { + "epoch": 0.678116016805484, + "grad_norm": 24.945140838623047, + "learning_rate": 8e-05, + "loss": 38.1234, + "num_input_tokens_seen": 355464488, + "step": 6900 + }, + { + "epoch": 0.6784108498562689, + "grad_norm": 31.348560333251953, + "learning_rate": 8e-05, + "loss": 35.7855, + "num_input_tokens_seen": 355599336, + "step": 6903 + }, + { + "epoch": 0.6787056829070539, + "grad_norm": 86.59671783447266, + "learning_rate": 8e-05, + "loss": 35.6197, + "num_input_tokens_seen": 355755832, + "step": 6906 + }, + { + "epoch": 0.6790005159578388, + "grad_norm": 28.258621215820312, + "learning_rate": 8e-05, + "loss": 32.1772, + "num_input_tokens_seen": 355903376, + "step": 6909 + }, + { + "epoch": 0.6792953490086239, + "grad_norm": 25.099407196044922, + "learning_rate": 8e-05, + "loss": 36.2638, + "num_input_tokens_seen": 356063268, + "step": 6912 + }, + { + "epoch": 0.6795901820594089, + "grad_norm": 23.317407608032227, + "learning_rate": 8e-05, + "loss": 36.8374, + "num_input_tokens_seen": 356208332, + "step": 6915 + }, + { + "epoch": 0.6798850151101938, + "grad_norm": 23.524396896362305, + "learning_rate": 8e-05, + "loss": 34.3438, + "num_input_tokens_seen": 356350224, + "step": 6918 + }, + { + "epoch": 0.6801798481609789, + "grad_norm": 40.0164794921875, + "learning_rate": 8e-05, + "loss": 35.6302, + "num_input_tokens_seen": 356505032, + "step": 6921 + }, + { + "epoch": 0.6804746812117638, + "grad_norm": 30.623050689697266, + "learning_rate": 8e-05, + "loss": 35.4187, + "num_input_tokens_seen": 356672508, + "step": 6924 + }, + { + "epoch": 0.6807695142625488, + "grad_norm": 25.530649185180664, + "learning_rate": 8e-05, + "loss": 40.2457, + "num_input_tokens_seen": 356835012, + "step": 6927 + }, + { + "epoch": 0.6810643473133339, + "grad_norm": 23.30413818359375, + "learning_rate": 8e-05, + "loss": 36.5556, + "num_input_tokens_seen": 356974760, + "step": 6930 + }, + { + "epoch": 0.6813591803641188, + "grad_norm": 26.084671020507812, + "learning_rate": 8e-05, + "loss": 38.8782, + "num_input_tokens_seen": 357150712, + "step": 6933 + }, + { + "epoch": 0.6816540134149038, + "grad_norm": 30.265125274658203, + "learning_rate": 8e-05, + "loss": 36.9992, + "num_input_tokens_seen": 357317688, + "step": 6936 + }, + { + "epoch": 0.6819488464656888, + "grad_norm": 26.261505126953125, + "learning_rate": 8e-05, + "loss": 39.1126, + "num_input_tokens_seen": 357463828, + "step": 6939 + }, + { + "epoch": 0.6822436795164738, + "grad_norm": 25.053955078125, + "learning_rate": 8e-05, + "loss": 34.9128, + "num_input_tokens_seen": 357611476, + "step": 6942 + }, + { + "epoch": 0.6825385125672588, + "grad_norm": 26.563329696655273, + "learning_rate": 8e-05, + "loss": 34.9007, + "num_input_tokens_seen": 357749340, + "step": 6945 + }, + { + "epoch": 0.6828333456180438, + "grad_norm": 23.786418914794922, + "learning_rate": 8e-05, + "loss": 36.6414, + "num_input_tokens_seen": 357896284, + "step": 6948 + }, + { + "epoch": 0.6831281786688288, + "grad_norm": 21.139434814453125, + "learning_rate": 8e-05, + "loss": 32.9007, + "num_input_tokens_seen": 358048648, + "step": 6951 + }, + { + "epoch": 0.6834230117196137, + "grad_norm": 28.922199249267578, + "learning_rate": 8e-05, + "loss": 39.5529, + "num_input_tokens_seen": 358200792, + "step": 6954 + }, + { + "epoch": 0.6837178447703988, + "grad_norm": 52.790138244628906, + "learning_rate": 8e-05, + "loss": 39.8963, + "num_input_tokens_seen": 358374328, + "step": 6957 + }, + { + "epoch": 0.6840126778211838, + "grad_norm": 38.69890594482422, + "learning_rate": 8e-05, + "loss": 34.4037, + "num_input_tokens_seen": 358541504, + "step": 6960 + }, + { + "epoch": 0.6843075108719687, + "grad_norm": 31.100345611572266, + "learning_rate": 8e-05, + "loss": 37.5846, + "num_input_tokens_seen": 358692296, + "step": 6963 + }, + { + "epoch": 0.6846023439227538, + "grad_norm": 33.161014556884766, + "learning_rate": 8e-05, + "loss": 35.0341, + "num_input_tokens_seen": 358851288, + "step": 6966 + }, + { + "epoch": 0.6848971769735387, + "grad_norm": 24.75032615661621, + "learning_rate": 8e-05, + "loss": 34.567, + "num_input_tokens_seen": 359008992, + "step": 6969 + }, + { + "epoch": 0.6851920100243237, + "grad_norm": 39.028160095214844, + "learning_rate": 8e-05, + "loss": 38.9036, + "num_input_tokens_seen": 359165044, + "step": 6972 + }, + { + "epoch": 0.6854868430751088, + "grad_norm": 23.45261001586914, + "learning_rate": 8e-05, + "loss": 37.1691, + "num_input_tokens_seen": 359314900, + "step": 6975 + }, + { + "epoch": 0.6857816761258937, + "grad_norm": 28.804195404052734, + "learning_rate": 8e-05, + "loss": 39.5592, + "num_input_tokens_seen": 359479956, + "step": 6978 + }, + { + "epoch": 0.6860765091766787, + "grad_norm": 44.11396408081055, + "learning_rate": 8e-05, + "loss": 37.8574, + "num_input_tokens_seen": 359625208, + "step": 6981 + }, + { + "epoch": 0.6863713422274637, + "grad_norm": 25.33087921142578, + "learning_rate": 8e-05, + "loss": 38.5327, + "num_input_tokens_seen": 359761980, + "step": 6984 + }, + { + "epoch": 0.6866661752782487, + "grad_norm": 28.133926391601562, + "learning_rate": 8e-05, + "loss": 37.843, + "num_input_tokens_seen": 359908984, + "step": 6987 + }, + { + "epoch": 0.6869610083290337, + "grad_norm": 26.1141300201416, + "learning_rate": 8e-05, + "loss": 40.021, + "num_input_tokens_seen": 360079052, + "step": 6990 + }, + { + "epoch": 0.6872558413798187, + "grad_norm": 28.791933059692383, + "learning_rate": 8e-05, + "loss": 34.2899, + "num_input_tokens_seen": 360206192, + "step": 6993 + }, + { + "epoch": 0.6875506744306037, + "grad_norm": 28.28318977355957, + "learning_rate": 8e-05, + "loss": 38.1626, + "num_input_tokens_seen": 360341872, + "step": 6996 + }, + { + "epoch": 0.6878455074813886, + "grad_norm": 23.083023071289062, + "learning_rate": 8e-05, + "loss": 36.5789, + "num_input_tokens_seen": 360499676, + "step": 6999 + }, + { + "epoch": 0.6879437851649837, + "eval_gen_len": 30.805, + "eval_loss": 2.384366750717163, + "eval_rouge1": 44.8011, + "eval_rouge2": 28.0367, + "eval_rougeL": 40.8555, + "eval_rougeLsum": 41.1516, + "eval_runtime": 93.8691, + "eval_samples_per_second": 2.131, + "eval_steps_per_second": 0.533, + "num_input_tokens_seen": 360560352, + "step": 7000 + }, + { + "epoch": 0.6881403405321737, + "grad_norm": 21.221134185791016, + "learning_rate": 8e-05, + "loss": 31.7868, + "num_input_tokens_seen": 360664356, + "step": 7002 + }, + { + "epoch": 0.6884351735829587, + "grad_norm": 28.533222198486328, + "learning_rate": 8e-05, + "loss": 39.2583, + "num_input_tokens_seen": 360821316, + "step": 7005 + }, + { + "epoch": 0.6887300066337436, + "grad_norm": 27.524171829223633, + "learning_rate": 8e-05, + "loss": 37.4306, + "num_input_tokens_seen": 360971048, + "step": 7008 + }, + { + "epoch": 0.6890248396845287, + "grad_norm": 28.358911514282227, + "learning_rate": 8e-05, + "loss": 35.6021, + "num_input_tokens_seen": 361120852, + "step": 7011 + }, + { + "epoch": 0.6893196727353136, + "grad_norm": 21.3004207611084, + "learning_rate": 8e-05, + "loss": 37.0142, + "num_input_tokens_seen": 361290472, + "step": 7014 + }, + { + "epoch": 0.6896145057860986, + "grad_norm": 26.657236099243164, + "learning_rate": 8e-05, + "loss": 38.8584, + "num_input_tokens_seen": 361439848, + "step": 7017 + }, + { + "epoch": 0.6899093388368837, + "grad_norm": 26.211191177368164, + "learning_rate": 8e-05, + "loss": 36.6387, + "num_input_tokens_seen": 361582308, + "step": 7020 + }, + { + "epoch": 0.6902041718876686, + "grad_norm": 28.06895637512207, + "learning_rate": 8e-05, + "loss": 38.8741, + "num_input_tokens_seen": 361734312, + "step": 7023 + }, + { + "epoch": 0.6904990049384536, + "grad_norm": 23.55493927001953, + "learning_rate": 8e-05, + "loss": 40.4776, + "num_input_tokens_seen": 361888652, + "step": 7026 + }, + { + "epoch": 0.6907938379892385, + "grad_norm": 23.64281463623047, + "learning_rate": 8e-05, + "loss": 34.3881, + "num_input_tokens_seen": 362026428, + "step": 7029 + }, + { + "epoch": 0.6910886710400236, + "grad_norm": 46.84697723388672, + "learning_rate": 8e-05, + "loss": 38.394, + "num_input_tokens_seen": 362179384, + "step": 7032 + }, + { + "epoch": 0.6913835040908086, + "grad_norm": 24.269325256347656, + "learning_rate": 8e-05, + "loss": 33.2829, + "num_input_tokens_seen": 362321776, + "step": 7035 + }, + { + "epoch": 0.6916783371415935, + "grad_norm": 39.650611877441406, + "learning_rate": 8e-05, + "loss": 36.2346, + "num_input_tokens_seen": 362471104, + "step": 7038 + }, + { + "epoch": 0.6919731701923786, + "grad_norm": 23.979177474975586, + "learning_rate": 8e-05, + "loss": 37.6138, + "num_input_tokens_seen": 362605908, + "step": 7041 + }, + { + "epoch": 0.6922680032431635, + "grad_norm": 40.64398956298828, + "learning_rate": 8e-05, + "loss": 38.7976, + "num_input_tokens_seen": 362781236, + "step": 7044 + }, + { + "epoch": 0.6925628362939485, + "grad_norm": 33.43219757080078, + "learning_rate": 8e-05, + "loss": 35.2467, + "num_input_tokens_seen": 362947916, + "step": 7047 + }, + { + "epoch": 0.6928576693447336, + "grad_norm": 26.71827507019043, + "learning_rate": 8e-05, + "loss": 39.5095, + "num_input_tokens_seen": 363100592, + "step": 7050 + }, + { + "epoch": 0.6931525023955185, + "grad_norm": 24.0466365814209, + "learning_rate": 8e-05, + "loss": 40.0637, + "num_input_tokens_seen": 363263896, + "step": 7053 + }, + { + "epoch": 0.6934473354463035, + "grad_norm": 29.3724308013916, + "learning_rate": 8e-05, + "loss": 36.4387, + "num_input_tokens_seen": 363397720, + "step": 7056 + }, + { + "epoch": 0.6937421684970885, + "grad_norm": 26.97890281677246, + "learning_rate": 8e-05, + "loss": 36.6981, + "num_input_tokens_seen": 363549992, + "step": 7059 + }, + { + "epoch": 0.6940370015478735, + "grad_norm": 25.043485641479492, + "learning_rate": 8e-05, + "loss": 37.4143, + "num_input_tokens_seen": 363724944, + "step": 7062 + }, + { + "epoch": 0.6943318345986585, + "grad_norm": 22.60007095336914, + "learning_rate": 8e-05, + "loss": 34.5618, + "num_input_tokens_seen": 363885164, + "step": 7065 + }, + { + "epoch": 0.6946266676494435, + "grad_norm": 25.03348159790039, + "learning_rate": 8e-05, + "loss": 37.147, + "num_input_tokens_seen": 364053588, + "step": 7068 + }, + { + "epoch": 0.6949215007002285, + "grad_norm": 24.149120330810547, + "learning_rate": 8e-05, + "loss": 36.2789, + "num_input_tokens_seen": 364200544, + "step": 7071 + }, + { + "epoch": 0.6952163337510134, + "grad_norm": 24.27806282043457, + "learning_rate": 8e-05, + "loss": 37.3367, + "num_input_tokens_seen": 364365032, + "step": 7074 + }, + { + "epoch": 0.6955111668017985, + "grad_norm": 27.3250732421875, + "learning_rate": 8e-05, + "loss": 38.3819, + "num_input_tokens_seen": 364500012, + "step": 7077 + }, + { + "epoch": 0.6958059998525835, + "grad_norm": 24.45925521850586, + "learning_rate": 8e-05, + "loss": 39.4937, + "num_input_tokens_seen": 364660220, + "step": 7080 + }, + { + "epoch": 0.6961008329033684, + "grad_norm": 32.768638610839844, + "learning_rate": 8e-05, + "loss": 38.4864, + "num_input_tokens_seen": 364805288, + "step": 7083 + }, + { + "epoch": 0.6963956659541535, + "grad_norm": 25.589962005615234, + "learning_rate": 8e-05, + "loss": 37.92, + "num_input_tokens_seen": 364960252, + "step": 7086 + }, + { + "epoch": 0.6966904990049384, + "grad_norm": 29.774845123291016, + "learning_rate": 8e-05, + "loss": 33.9415, + "num_input_tokens_seen": 365121300, + "step": 7089 + }, + { + "epoch": 0.6969853320557234, + "grad_norm": 24.776142120361328, + "learning_rate": 8e-05, + "loss": 37.4063, + "num_input_tokens_seen": 365267452, + "step": 7092 + }, + { + "epoch": 0.6972801651065085, + "grad_norm": 23.382896423339844, + "learning_rate": 8e-05, + "loss": 34.9722, + "num_input_tokens_seen": 365402260, + "step": 7095 + }, + { + "epoch": 0.6975749981572934, + "grad_norm": 24.839080810546875, + "learning_rate": 8e-05, + "loss": 35.6654, + "num_input_tokens_seen": 365544004, + "step": 7098 + }, + { + "epoch": 0.6978698312080784, + "grad_norm": 22.468021392822266, + "learning_rate": 8e-05, + "loss": 36.9169, + "num_input_tokens_seen": 365711024, + "step": 7101 + }, + { + "epoch": 0.6981646642588634, + "grad_norm": 55.25310134887695, + "learning_rate": 8e-05, + "loss": 38.9989, + "num_input_tokens_seen": 365853052, + "step": 7104 + }, + { + "epoch": 0.6984594973096484, + "grad_norm": 37.976470947265625, + "learning_rate": 8e-05, + "loss": 37.0344, + "num_input_tokens_seen": 366006644, + "step": 7107 + }, + { + "epoch": 0.6987543303604334, + "grad_norm": 25.312318801879883, + "learning_rate": 8e-05, + "loss": 37.1849, + "num_input_tokens_seen": 366154148, + "step": 7110 + }, + { + "epoch": 0.6990491634112184, + "grad_norm": 22.66741943359375, + "learning_rate": 8e-05, + "loss": 39.7188, + "num_input_tokens_seen": 366303968, + "step": 7113 + }, + { + "epoch": 0.6993439964620034, + "grad_norm": 26.739595413208008, + "learning_rate": 8e-05, + "loss": 39.6954, + "num_input_tokens_seen": 366450852, + "step": 7116 + }, + { + "epoch": 0.6996388295127883, + "grad_norm": 26.273880004882812, + "learning_rate": 8e-05, + "loss": 38.4002, + "num_input_tokens_seen": 366619148, + "step": 7119 + }, + { + "epoch": 0.6999336625635734, + "grad_norm": 28.57887840270996, + "learning_rate": 8e-05, + "loss": 35.176, + "num_input_tokens_seen": 366776044, + "step": 7122 + }, + { + "epoch": 0.7002284956143584, + "grad_norm": 34.28238296508789, + "learning_rate": 8e-05, + "loss": 39.8619, + "num_input_tokens_seen": 366925244, + "step": 7125 + }, + { + "epoch": 0.7005233286651433, + "grad_norm": 28.01417350769043, + "learning_rate": 8e-05, + "loss": 35.6729, + "num_input_tokens_seen": 367080504, + "step": 7128 + }, + { + "epoch": 0.7008181617159284, + "grad_norm": 22.694480895996094, + "learning_rate": 8e-05, + "loss": 37.7513, + "num_input_tokens_seen": 367246724, + "step": 7131 + }, + { + "epoch": 0.7011129947667133, + "grad_norm": 22.39271354675293, + "learning_rate": 8e-05, + "loss": 34.6424, + "num_input_tokens_seen": 367386972, + "step": 7134 + }, + { + "epoch": 0.7014078278174983, + "grad_norm": 24.380094528198242, + "learning_rate": 8e-05, + "loss": 34.6489, + "num_input_tokens_seen": 367571520, + "step": 7137 + }, + { + "epoch": 0.7017026608682834, + "grad_norm": 22.910329818725586, + "learning_rate": 8e-05, + "loss": 34.3629, + "num_input_tokens_seen": 367711476, + "step": 7140 + }, + { + "epoch": 0.7019974939190683, + "grad_norm": 26.820985794067383, + "learning_rate": 8e-05, + "loss": 38.2424, + "num_input_tokens_seen": 367863076, + "step": 7143 + }, + { + "epoch": 0.7022923269698533, + "grad_norm": 55.195499420166016, + "learning_rate": 8e-05, + "loss": 36.6898, + "num_input_tokens_seen": 368022532, + "step": 7146 + }, + { + "epoch": 0.7025871600206384, + "grad_norm": 23.07423210144043, + "learning_rate": 8e-05, + "loss": 39.7422, + "num_input_tokens_seen": 368191684, + "step": 7149 + }, + { + "epoch": 0.7028819930714233, + "grad_norm": 25.3260555267334, + "learning_rate": 8e-05, + "loss": 37.7671, + "num_input_tokens_seen": 368350552, + "step": 7152 + }, + { + "epoch": 0.7031768261222083, + "grad_norm": 27.317092895507812, + "learning_rate": 8e-05, + "loss": 36.9679, + "num_input_tokens_seen": 368497796, + "step": 7155 + }, + { + "epoch": 0.7034716591729933, + "grad_norm": 24.9680233001709, + "learning_rate": 8e-05, + "loss": 35.1555, + "num_input_tokens_seen": 368641600, + "step": 7158 + }, + { + "epoch": 0.7037664922237783, + "grad_norm": 23.467241287231445, + "learning_rate": 8e-05, + "loss": 36.8177, + "num_input_tokens_seen": 368788680, + "step": 7161 + }, + { + "epoch": 0.7040613252745633, + "grad_norm": 27.392885208129883, + "learning_rate": 8e-05, + "loss": 38.4683, + "num_input_tokens_seen": 368962924, + "step": 7164 + }, + { + "epoch": 0.7043561583253483, + "grad_norm": 25.242094039916992, + "learning_rate": 8e-05, + "loss": 36.1093, + "num_input_tokens_seen": 369141044, + "step": 7167 + }, + { + "epoch": 0.7046509913761333, + "grad_norm": 24.719947814941406, + "learning_rate": 8e-05, + "loss": 37.4259, + "num_input_tokens_seen": 369280964, + "step": 7170 + }, + { + "epoch": 0.7049458244269182, + "grad_norm": 22.53449058532715, + "learning_rate": 8e-05, + "loss": 34.1221, + "num_input_tokens_seen": 369436492, + "step": 7173 + }, + { + "epoch": 0.7052406574777033, + "grad_norm": 26.51321792602539, + "learning_rate": 8e-05, + "loss": 35.0682, + "num_input_tokens_seen": 369583500, + "step": 7176 + }, + { + "epoch": 0.7055354905284883, + "grad_norm": 21.050174713134766, + "learning_rate": 8e-05, + "loss": 35.6422, + "num_input_tokens_seen": 369748328, + "step": 7179 + }, + { + "epoch": 0.7058303235792732, + "grad_norm": 27.241004943847656, + "learning_rate": 8e-05, + "loss": 39.6832, + "num_input_tokens_seen": 369907856, + "step": 7182 + }, + { + "epoch": 0.7061251566300583, + "grad_norm": 25.388357162475586, + "learning_rate": 8e-05, + "loss": 41.2166, + "num_input_tokens_seen": 370097256, + "step": 7185 + }, + { + "epoch": 0.7064199896808432, + "grad_norm": 24.880773544311523, + "learning_rate": 8e-05, + "loss": 36.4923, + "num_input_tokens_seen": 370255600, + "step": 7188 + }, + { + "epoch": 0.7067148227316282, + "grad_norm": 28.246479034423828, + "learning_rate": 8e-05, + "loss": 35.7393, + "num_input_tokens_seen": 370389872, + "step": 7191 + }, + { + "epoch": 0.7070096557824133, + "grad_norm": 24.847553253173828, + "learning_rate": 8e-05, + "loss": 35.294, + "num_input_tokens_seen": 370519424, + "step": 7194 + }, + { + "epoch": 0.7073044888331982, + "grad_norm": 26.302785873413086, + "learning_rate": 8e-05, + "loss": 39.1979, + "num_input_tokens_seen": 370689432, + "step": 7197 + }, + { + "epoch": 0.7075993218839832, + "grad_norm": 21.54235076904297, + "learning_rate": 8e-05, + "loss": 35.4982, + "num_input_tokens_seen": 370857272, + "step": 7200 + }, + { + "epoch": 0.7078941549347681, + "grad_norm": 29.961332321166992, + "learning_rate": 8e-05, + "loss": 35.7793, + "num_input_tokens_seen": 370983976, + "step": 7203 + }, + { + "epoch": 0.7081889879855532, + "grad_norm": 24.172889709472656, + "learning_rate": 8e-05, + "loss": 41.917, + "num_input_tokens_seen": 371140492, + "step": 7206 + }, + { + "epoch": 0.7084838210363382, + "grad_norm": 25.67815399169922, + "learning_rate": 8e-05, + "loss": 41.9321, + "num_input_tokens_seen": 371300236, + "step": 7209 + }, + { + "epoch": 0.7087786540871231, + "grad_norm": 27.027847290039062, + "learning_rate": 8e-05, + "loss": 39.0412, + "num_input_tokens_seen": 371459420, + "step": 7212 + }, + { + "epoch": 0.7090734871379082, + "grad_norm": 180.06234741210938, + "learning_rate": 8e-05, + "loss": 38.4666, + "num_input_tokens_seen": 371615372, + "step": 7215 + }, + { + "epoch": 0.7093683201886931, + "grad_norm": 116.11947631835938, + "learning_rate": 8e-05, + "loss": 35.4061, + "num_input_tokens_seen": 371772936, + "step": 7218 + }, + { + "epoch": 0.7096631532394782, + "grad_norm": 25.75341796875, + "learning_rate": 8e-05, + "loss": 38.7179, + "num_input_tokens_seen": 371928012, + "step": 7221 + }, + { + "epoch": 0.7099579862902632, + "grad_norm": 22.445053100585938, + "learning_rate": 8e-05, + "loss": 35.927, + "num_input_tokens_seen": 372086308, + "step": 7224 + }, + { + "epoch": 0.7102528193410481, + "grad_norm": 29.624927520751953, + "learning_rate": 8e-05, + "loss": 37.9876, + "num_input_tokens_seen": 372253944, + "step": 7227 + }, + { + "epoch": 0.7105476523918332, + "grad_norm": 23.438127517700195, + "learning_rate": 8e-05, + "loss": 35.5025, + "num_input_tokens_seen": 372408864, + "step": 7230 + }, + { + "epoch": 0.7108424854426181, + "grad_norm": 29.223533630371094, + "learning_rate": 8e-05, + "loss": 32.6952, + "num_input_tokens_seen": 372563228, + "step": 7233 + }, + { + "epoch": 0.7111373184934031, + "grad_norm": 23.90278434753418, + "learning_rate": 8e-05, + "loss": 38.4112, + "num_input_tokens_seen": 372716044, + "step": 7236 + }, + { + "epoch": 0.7114321515441882, + "grad_norm": 24.182998657226562, + "learning_rate": 8e-05, + "loss": 33.5361, + "num_input_tokens_seen": 372868828, + "step": 7239 + }, + { + "epoch": 0.7117269845949731, + "grad_norm": 32.440555572509766, + "learning_rate": 8e-05, + "loss": 35.9163, + "num_input_tokens_seen": 372999444, + "step": 7242 + }, + { + "epoch": 0.7120218176457581, + "grad_norm": 47.51459884643555, + "learning_rate": 8e-05, + "loss": 35.3844, + "num_input_tokens_seen": 373163628, + "step": 7245 + }, + { + "epoch": 0.712316650696543, + "grad_norm": 31.754146575927734, + "learning_rate": 8e-05, + "loss": 38.3827, + "num_input_tokens_seen": 373315648, + "step": 7248 + }, + { + "epoch": 0.7126114837473281, + "grad_norm": 23.967334747314453, + "learning_rate": 8e-05, + "loss": 36.374, + "num_input_tokens_seen": 373463044, + "step": 7251 + }, + { + "epoch": 0.7129063167981131, + "grad_norm": 26.474748611450195, + "learning_rate": 8e-05, + "loss": 36.7464, + "num_input_tokens_seen": 373622436, + "step": 7254 + }, + { + "epoch": 0.713201149848898, + "grad_norm": 25.624454498291016, + "learning_rate": 8e-05, + "loss": 36.7132, + "num_input_tokens_seen": 373771608, + "step": 7257 + }, + { + "epoch": 0.7134959828996831, + "grad_norm": 21.67783546447754, + "learning_rate": 8e-05, + "loss": 35.3883, + "num_input_tokens_seen": 373929268, + "step": 7260 + }, + { + "epoch": 0.713790815950468, + "grad_norm": 31.39051628112793, + "learning_rate": 8e-05, + "loss": 38.0616, + "num_input_tokens_seen": 374095272, + "step": 7263 + }, + { + "epoch": 0.714085649001253, + "grad_norm": 23.98355484008789, + "learning_rate": 8e-05, + "loss": 35.5447, + "num_input_tokens_seen": 374229768, + "step": 7266 + }, + { + "epoch": 0.7143804820520381, + "grad_norm": 28.991931915283203, + "learning_rate": 8e-05, + "loss": 38.2876, + "num_input_tokens_seen": 374376744, + "step": 7269 + }, + { + "epoch": 0.714675315102823, + "grad_norm": 25.730976104736328, + "learning_rate": 8e-05, + "loss": 36.7238, + "num_input_tokens_seen": 374525608, + "step": 7272 + }, + { + "epoch": 0.714970148153608, + "grad_norm": 27.842273712158203, + "learning_rate": 8e-05, + "loss": 40.2203, + "num_input_tokens_seen": 374684756, + "step": 7275 + }, + { + "epoch": 0.715264981204393, + "grad_norm": 37.037139892578125, + "learning_rate": 8e-05, + "loss": 38.8265, + "num_input_tokens_seen": 374826580, + "step": 7278 + }, + { + "epoch": 0.715559814255178, + "grad_norm": 26.971187591552734, + "learning_rate": 8e-05, + "loss": 35.9391, + "num_input_tokens_seen": 374989824, + "step": 7281 + }, + { + "epoch": 0.715854647305963, + "grad_norm": 46.22418975830078, + "learning_rate": 8e-05, + "loss": 36.5632, + "num_input_tokens_seen": 375159888, + "step": 7284 + }, + { + "epoch": 0.716149480356748, + "grad_norm": 23.984878540039062, + "learning_rate": 8e-05, + "loss": 35.8848, + "num_input_tokens_seen": 375315320, + "step": 7287 + }, + { + "epoch": 0.716444313407533, + "grad_norm": 23.67920684814453, + "learning_rate": 8e-05, + "loss": 36.7169, + "num_input_tokens_seen": 375480140, + "step": 7290 + }, + { + "epoch": 0.7167391464583179, + "grad_norm": 24.245229721069336, + "learning_rate": 8e-05, + "loss": 36.4315, + "num_input_tokens_seen": 375655892, + "step": 7293 + }, + { + "epoch": 0.717033979509103, + "grad_norm": 24.02587127685547, + "learning_rate": 8e-05, + "loss": 37.9774, + "num_input_tokens_seen": 375809988, + "step": 7296 + }, + { + "epoch": 0.717328812559888, + "grad_norm": 26.92994499206543, + "learning_rate": 8e-05, + "loss": 40.8964, + "num_input_tokens_seen": 375970568, + "step": 7299 + }, + { + "epoch": 0.7176236456106729, + "grad_norm": 22.372398376464844, + "learning_rate": 8e-05, + "loss": 36.488, + "num_input_tokens_seen": 376134096, + "step": 7302 + }, + { + "epoch": 0.717918478661458, + "grad_norm": 23.640180587768555, + "learning_rate": 8e-05, + "loss": 33.9567, + "num_input_tokens_seen": 376277268, + "step": 7305 + }, + { + "epoch": 0.7182133117122429, + "grad_norm": 22.062767028808594, + "learning_rate": 8e-05, + "loss": 38.0409, + "num_input_tokens_seen": 376454000, + "step": 7308 + }, + { + "epoch": 0.7185081447630279, + "grad_norm": 22.81031608581543, + "learning_rate": 8e-05, + "loss": 36.8436, + "num_input_tokens_seen": 376615928, + "step": 7311 + }, + { + "epoch": 0.718802977813813, + "grad_norm": 32.10073471069336, + "learning_rate": 8e-05, + "loss": 38.9314, + "num_input_tokens_seen": 376772032, + "step": 7314 + }, + { + "epoch": 0.7190978108645979, + "grad_norm": 28.078523635864258, + "learning_rate": 8e-05, + "loss": 41.4978, + "num_input_tokens_seen": 376893188, + "step": 7317 + }, + { + "epoch": 0.7193926439153829, + "grad_norm": 43.02801513671875, + "learning_rate": 8e-05, + "loss": 38.946, + "num_input_tokens_seen": 377049156, + "step": 7320 + }, + { + "epoch": 0.7196874769661679, + "grad_norm": 34.06877517700195, + "learning_rate": 8e-05, + "loss": 40.1617, + "num_input_tokens_seen": 377217384, + "step": 7323 + }, + { + "epoch": 0.7199823100169529, + "grad_norm": 26.74899673461914, + "learning_rate": 8e-05, + "loss": 38.5652, + "num_input_tokens_seen": 377380312, + "step": 7326 + }, + { + "epoch": 0.7202771430677379, + "grad_norm": 33.939579010009766, + "learning_rate": 8e-05, + "loss": 37.3883, + "num_input_tokens_seen": 377536968, + "step": 7329 + }, + { + "epoch": 0.7205719761185229, + "grad_norm": 123.61614227294922, + "learning_rate": 8e-05, + "loss": 35.7296, + "num_input_tokens_seen": 377700640, + "step": 7332 + }, + { + "epoch": 0.7208668091693079, + "grad_norm": 24.66319465637207, + "learning_rate": 8e-05, + "loss": 35.3646, + "num_input_tokens_seen": 377846188, + "step": 7335 + }, + { + "epoch": 0.7211616422200928, + "grad_norm": 23.91818618774414, + "learning_rate": 8e-05, + "loss": 38.0516, + "num_input_tokens_seen": 377998996, + "step": 7338 + }, + { + "epoch": 0.7214564752708779, + "grad_norm": 27.984975814819336, + "learning_rate": 8e-05, + "loss": 36.3984, + "num_input_tokens_seen": 378149336, + "step": 7341 + }, + { + "epoch": 0.7217513083216629, + "grad_norm": 22.733352661132812, + "learning_rate": 8e-05, + "loss": 35.5865, + "num_input_tokens_seen": 378306384, + "step": 7344 + }, + { + "epoch": 0.7220461413724478, + "grad_norm": 46.85588836669922, + "learning_rate": 8e-05, + "loss": 34.2926, + "num_input_tokens_seen": 378455268, + "step": 7347 + }, + { + "epoch": 0.7223409744232329, + "grad_norm": 22.355670928955078, + "learning_rate": 8e-05, + "loss": 38.9565, + "num_input_tokens_seen": 378615884, + "step": 7350 + }, + { + "epoch": 0.7226358074740178, + "grad_norm": 23.133432388305664, + "learning_rate": 8e-05, + "loss": 39.7225, + "num_input_tokens_seen": 378767252, + "step": 7353 + }, + { + "epoch": 0.7229306405248028, + "grad_norm": 25.17456817626953, + "learning_rate": 8e-05, + "loss": 37.6457, + "num_input_tokens_seen": 378915276, + "step": 7356 + }, + { + "epoch": 0.7232254735755879, + "grad_norm": 22.047069549560547, + "learning_rate": 8e-05, + "loss": 38.0102, + "num_input_tokens_seen": 379071112, + "step": 7359 + }, + { + "epoch": 0.7235203066263728, + "grad_norm": 23.050128936767578, + "learning_rate": 8e-05, + "loss": 33.5328, + "num_input_tokens_seen": 379239648, + "step": 7362 + }, + { + "epoch": 0.7238151396771578, + "grad_norm": 22.13313865661621, + "learning_rate": 8e-05, + "loss": 35.4146, + "num_input_tokens_seen": 379379252, + "step": 7365 + }, + { + "epoch": 0.7241099727279428, + "grad_norm": 109.38043212890625, + "learning_rate": 8e-05, + "loss": 35.2421, + "num_input_tokens_seen": 379521348, + "step": 7368 + }, + { + "epoch": 0.7244048057787278, + "grad_norm": 28.971952438354492, + "learning_rate": 8e-05, + "loss": 37.7952, + "num_input_tokens_seen": 379636528, + "step": 7371 + }, + { + "epoch": 0.7246996388295128, + "grad_norm": 25.20716094970703, + "learning_rate": 8e-05, + "loss": 41.4213, + "num_input_tokens_seen": 379788648, + "step": 7374 + }, + { + "epoch": 0.7249944718802978, + "grad_norm": 24.693050384521484, + "learning_rate": 8e-05, + "loss": 40.8675, + "num_input_tokens_seen": 379958864, + "step": 7377 + }, + { + "epoch": 0.7252893049310828, + "grad_norm": 24.950502395629883, + "learning_rate": 8e-05, + "loss": 36.5391, + "num_input_tokens_seen": 380118128, + "step": 7380 + }, + { + "epoch": 0.7255841379818677, + "grad_norm": 44.643550872802734, + "learning_rate": 8e-05, + "loss": 37.2319, + "num_input_tokens_seen": 380272692, + "step": 7383 + }, + { + "epoch": 0.7258789710326528, + "grad_norm": 20.62811279296875, + "learning_rate": 8e-05, + "loss": 35.3777, + "num_input_tokens_seen": 380421916, + "step": 7386 + }, + { + "epoch": 0.7261738040834378, + "grad_norm": 29.82221221923828, + "learning_rate": 8e-05, + "loss": 40.0476, + "num_input_tokens_seen": 380595664, + "step": 7389 + }, + { + "epoch": 0.7264686371342227, + "grad_norm": 24.89944839477539, + "learning_rate": 8e-05, + "loss": 36.1097, + "num_input_tokens_seen": 380733088, + "step": 7392 + }, + { + "epoch": 0.7267634701850078, + "grad_norm": 26.290952682495117, + "learning_rate": 8e-05, + "loss": 38.7241, + "num_input_tokens_seen": 380878792, + "step": 7395 + }, + { + "epoch": 0.7270583032357927, + "grad_norm": 24.127761840820312, + "learning_rate": 8e-05, + "loss": 36.7168, + "num_input_tokens_seen": 381028172, + "step": 7398 + }, + { + "epoch": 0.7273531362865777, + "grad_norm": 24.939001083374023, + "learning_rate": 8e-05, + "loss": 40.1282, + "num_input_tokens_seen": 381186648, + "step": 7401 + }, + { + "epoch": 0.7276479693373628, + "grad_norm": 23.647138595581055, + "learning_rate": 8e-05, + "loss": 34.5384, + "num_input_tokens_seen": 381351084, + "step": 7404 + }, + { + "epoch": 0.7279428023881477, + "grad_norm": 31.73210906982422, + "learning_rate": 8e-05, + "loss": 39.659, + "num_input_tokens_seen": 381497584, + "step": 7407 + }, + { + "epoch": 0.7282376354389327, + "grad_norm": 23.531461715698242, + "learning_rate": 8e-05, + "loss": 40.1637, + "num_input_tokens_seen": 381669016, + "step": 7410 + }, + { + "epoch": 0.7285324684897176, + "grad_norm": 23.627595901489258, + "learning_rate": 8e-05, + "loss": 37.0926, + "num_input_tokens_seen": 381820036, + "step": 7413 + }, + { + "epoch": 0.7288273015405027, + "grad_norm": 27.529569625854492, + "learning_rate": 8e-05, + "loss": 40.0547, + "num_input_tokens_seen": 381981184, + "step": 7416 + }, + { + "epoch": 0.7291221345912877, + "grad_norm": 22.73372459411621, + "learning_rate": 8e-05, + "loss": 38.2892, + "num_input_tokens_seen": 382138552, + "step": 7419 + }, + { + "epoch": 0.7294169676420726, + "grad_norm": 28.180679321289062, + "learning_rate": 8e-05, + "loss": 31.7725, + "num_input_tokens_seen": 382298084, + "step": 7422 + }, + { + "epoch": 0.7297118006928577, + "grad_norm": 34.909271240234375, + "learning_rate": 8e-05, + "loss": 36.6046, + "num_input_tokens_seen": 382461200, + "step": 7425 + }, + { + "epoch": 0.7300066337436426, + "grad_norm": 25.757848739624023, + "learning_rate": 8e-05, + "loss": 35.9155, + "num_input_tokens_seen": 382609888, + "step": 7428 + }, + { + "epoch": 0.7303014667944276, + "grad_norm": 27.34947395324707, + "learning_rate": 8e-05, + "loss": 37.7583, + "num_input_tokens_seen": 382776656, + "step": 7431 + }, + { + "epoch": 0.7305962998452127, + "grad_norm": 1279.990234375, + "learning_rate": 8e-05, + "loss": 37.3187, + "num_input_tokens_seen": 382937396, + "step": 7434 + }, + { + "epoch": 0.7308911328959976, + "grad_norm": 21.36997413635254, + "learning_rate": 8e-05, + "loss": 34.1862, + "num_input_tokens_seen": 383099080, + "step": 7437 + }, + { + "epoch": 0.7311859659467826, + "grad_norm": 23.636608123779297, + "learning_rate": 8e-05, + "loss": 35.8693, + "num_input_tokens_seen": 383258328, + "step": 7440 + }, + { + "epoch": 0.7314807989975677, + "grad_norm": 27.117843627929688, + "learning_rate": 8e-05, + "loss": 33.6174, + "num_input_tokens_seen": 383413472, + "step": 7443 + }, + { + "epoch": 0.7317756320483526, + "grad_norm": 23.628999710083008, + "learning_rate": 8e-05, + "loss": 34.2789, + "num_input_tokens_seen": 383569168, + "step": 7446 + }, + { + "epoch": 0.7320704650991376, + "grad_norm": 28.383270263671875, + "learning_rate": 8e-05, + "loss": 34.7162, + "num_input_tokens_seen": 383741804, + "step": 7449 + }, + { + "epoch": 0.7323652981499226, + "grad_norm": 22.602317810058594, + "learning_rate": 8e-05, + "loss": 36.2921, + "num_input_tokens_seen": 383906228, + "step": 7452 + }, + { + "epoch": 0.7326601312007076, + "grad_norm": 34.03465270996094, + "learning_rate": 8e-05, + "loss": 35.5612, + "num_input_tokens_seen": 384033912, + "step": 7455 + }, + { + "epoch": 0.7329549642514926, + "grad_norm": 25.446636199951172, + "learning_rate": 8e-05, + "loss": 36.915, + "num_input_tokens_seen": 384194144, + "step": 7458 + }, + { + "epoch": 0.7332497973022776, + "grad_norm": 23.60262680053711, + "learning_rate": 8e-05, + "loss": 38.7143, + "num_input_tokens_seen": 384353784, + "step": 7461 + }, + { + "epoch": 0.7335446303530626, + "grad_norm": 22.44038200378418, + "learning_rate": 8e-05, + "loss": 35.0795, + "num_input_tokens_seen": 384487928, + "step": 7464 + }, + { + "epoch": 0.7338394634038475, + "grad_norm": 23.007976531982422, + "learning_rate": 8e-05, + "loss": 39.2289, + "num_input_tokens_seen": 384646676, + "step": 7467 + }, + { + "epoch": 0.7341342964546326, + "grad_norm": 25.747835159301758, + "learning_rate": 8e-05, + "loss": 38.7275, + "num_input_tokens_seen": 384803580, + "step": 7470 + }, + { + "epoch": 0.7344291295054176, + "grad_norm": 34.949222564697266, + "learning_rate": 8e-05, + "loss": 38.5794, + "num_input_tokens_seen": 384950216, + "step": 7473 + }, + { + "epoch": 0.7347239625562025, + "grad_norm": 25.456125259399414, + "learning_rate": 8e-05, + "loss": 36.9796, + "num_input_tokens_seen": 385096200, + "step": 7476 + }, + { + "epoch": 0.7350187956069876, + "grad_norm": 22.768922805786133, + "learning_rate": 8e-05, + "loss": 35.3669, + "num_input_tokens_seen": 385247424, + "step": 7479 + }, + { + "epoch": 0.7353136286577725, + "grad_norm": 25.837984085083008, + "learning_rate": 8e-05, + "loss": 34.7811, + "num_input_tokens_seen": 385401552, + "step": 7482 + }, + { + "epoch": 0.7356084617085575, + "grad_norm": 28.84712791442871, + "learning_rate": 8e-05, + "loss": 36.131, + "num_input_tokens_seen": 385575468, + "step": 7485 + }, + { + "epoch": 0.7359032947593426, + "grad_norm": 27.865156173706055, + "learning_rate": 8e-05, + "loss": 40.8759, + "num_input_tokens_seen": 385722096, + "step": 7488 + }, + { + "epoch": 0.7361981278101275, + "grad_norm": 25.10171890258789, + "learning_rate": 8e-05, + "loss": 34.8859, + "num_input_tokens_seen": 385912764, + "step": 7491 + }, + { + "epoch": 0.7364929608609125, + "grad_norm": 27.673572540283203, + "learning_rate": 8e-05, + "loss": 36.1053, + "num_input_tokens_seen": 386058424, + "step": 7494 + }, + { + "epoch": 0.7367877939116975, + "grad_norm": 24.04930305480957, + "learning_rate": 8e-05, + "loss": 38.4439, + "num_input_tokens_seen": 386203764, + "step": 7497 + }, + { + "epoch": 0.7370826269624825, + "grad_norm": 24.450237274169922, + "learning_rate": 8e-05, + "loss": 37.3987, + "num_input_tokens_seen": 386386776, + "step": 7500 + }, + { + "epoch": 0.7373774600132675, + "grad_norm": 22.757221221923828, + "learning_rate": 8e-05, + "loss": 37.2368, + "num_input_tokens_seen": 386534316, + "step": 7503 + }, + { + "epoch": 0.7376722930640525, + "grad_norm": 23.33014488220215, + "learning_rate": 8e-05, + "loss": 32.5769, + "num_input_tokens_seen": 386681532, + "step": 7506 + }, + { + "epoch": 0.7379671261148375, + "grad_norm": 29.790695190429688, + "learning_rate": 8e-05, + "loss": 41.2972, + "num_input_tokens_seen": 386835200, + "step": 7509 + }, + { + "epoch": 0.7382619591656224, + "grad_norm": 23.4545841217041, + "learning_rate": 8e-05, + "loss": 34.3512, + "num_input_tokens_seen": 386983248, + "step": 7512 + }, + { + "epoch": 0.7385567922164075, + "grad_norm": 24.699615478515625, + "learning_rate": 8e-05, + "loss": 35.651, + "num_input_tokens_seen": 387119612, + "step": 7515 + }, + { + "epoch": 0.7388516252671925, + "grad_norm": 23.103931427001953, + "learning_rate": 8e-05, + "loss": 37.3486, + "num_input_tokens_seen": 387275496, + "step": 7518 + }, + { + "epoch": 0.7391464583179774, + "grad_norm": 22.726150512695312, + "learning_rate": 8e-05, + "loss": 35.1715, + "num_input_tokens_seen": 387428384, + "step": 7521 + }, + { + "epoch": 0.7394412913687625, + "grad_norm": 22.575422286987305, + "learning_rate": 8e-05, + "loss": 36.0418, + "num_input_tokens_seen": 387570884, + "step": 7524 + }, + { + "epoch": 0.7397361244195474, + "grad_norm": 28.128877639770508, + "learning_rate": 8e-05, + "loss": 34.6036, + "num_input_tokens_seen": 387737660, + "step": 7527 + }, + { + "epoch": 0.7400309574703324, + "grad_norm": 25.145950317382812, + "learning_rate": 8e-05, + "loss": 31.6659, + "num_input_tokens_seen": 387893116, + "step": 7530 + }, + { + "epoch": 0.7403257905211175, + "grad_norm": 27.49928092956543, + "learning_rate": 8e-05, + "loss": 35.0847, + "num_input_tokens_seen": 388043176, + "step": 7533 + }, + { + "epoch": 0.7406206235719024, + "grad_norm": 27.566316604614258, + "learning_rate": 8e-05, + "loss": 37.3643, + "num_input_tokens_seen": 388189188, + "step": 7536 + }, + { + "epoch": 0.7409154566226874, + "grad_norm": 25.077884674072266, + "learning_rate": 8e-05, + "loss": 38.4539, + "num_input_tokens_seen": 388340236, + "step": 7539 + }, + { + "epoch": 0.7412102896734724, + "grad_norm": 26.655046463012695, + "learning_rate": 8e-05, + "loss": 31.6471, + "num_input_tokens_seen": 388481968, + "step": 7542 + }, + { + "epoch": 0.7415051227242574, + "grad_norm": 23.787466049194336, + "learning_rate": 8e-05, + "loss": 39.579, + "num_input_tokens_seen": 388656976, + "step": 7545 + }, + { + "epoch": 0.7417999557750424, + "grad_norm": 25.586387634277344, + "learning_rate": 8e-05, + "loss": 30.4443, + "num_input_tokens_seen": 388807476, + "step": 7548 + }, + { + "epoch": 0.7420947888258274, + "grad_norm": 27.737192153930664, + "learning_rate": 8e-05, + "loss": 34.7925, + "num_input_tokens_seen": 388959276, + "step": 7551 + }, + { + "epoch": 0.7423896218766124, + "grad_norm": 33.840476989746094, + "learning_rate": 8e-05, + "loss": 38.1397, + "num_input_tokens_seen": 389100868, + "step": 7554 + }, + { + "epoch": 0.7426844549273973, + "grad_norm": 24.45082664489746, + "learning_rate": 8e-05, + "loss": 36.8545, + "num_input_tokens_seen": 389255556, + "step": 7557 + }, + { + "epoch": 0.7429792879781824, + "grad_norm": 27.7251033782959, + "learning_rate": 8e-05, + "loss": 36.6671, + "num_input_tokens_seen": 389404368, + "step": 7560 + }, + { + "epoch": 0.7432741210289674, + "grad_norm": 25.771398544311523, + "learning_rate": 8e-05, + "loss": 34.4683, + "num_input_tokens_seen": 389579920, + "step": 7563 + }, + { + "epoch": 0.7435689540797523, + "grad_norm": 24.235137939453125, + "learning_rate": 8e-05, + "loss": 35.3631, + "num_input_tokens_seen": 389748364, + "step": 7566 + }, + { + "epoch": 0.7438637871305374, + "grad_norm": 22.721860885620117, + "learning_rate": 8e-05, + "loss": 34.7737, + "num_input_tokens_seen": 389891996, + "step": 7569 + }, + { + "epoch": 0.7441586201813223, + "grad_norm": 24.083951950073242, + "learning_rate": 8e-05, + "loss": 35.6209, + "num_input_tokens_seen": 390052128, + "step": 7572 + }, + { + "epoch": 0.7444534532321073, + "grad_norm": 28.595867156982422, + "learning_rate": 8e-05, + "loss": 34.9005, + "num_input_tokens_seen": 390217948, + "step": 7575 + }, + { + "epoch": 0.7447482862828924, + "grad_norm": 23.231950759887695, + "learning_rate": 8e-05, + "loss": 34.9005, + "num_input_tokens_seen": 390363036, + "step": 7578 + }, + { + "epoch": 0.7450431193336773, + "grad_norm": 152.58456420898438, + "learning_rate": 8e-05, + "loss": 32.4912, + "num_input_tokens_seen": 390528868, + "step": 7581 + }, + { + "epoch": 0.7453379523844623, + "grad_norm": 29.192127227783203, + "learning_rate": 8e-05, + "loss": 33.675, + "num_input_tokens_seen": 390681416, + "step": 7584 + }, + { + "epoch": 0.7456327854352472, + "grad_norm": 26.711700439453125, + "learning_rate": 8e-05, + "loss": 37.3324, + "num_input_tokens_seen": 390834660, + "step": 7587 + }, + { + "epoch": 0.7459276184860323, + "grad_norm": 31.674036026000977, + "learning_rate": 8e-05, + "loss": 36.7416, + "num_input_tokens_seen": 390990760, + "step": 7590 + }, + { + "epoch": 0.7462224515368173, + "grad_norm": 59.84733963012695, + "learning_rate": 8e-05, + "loss": 35.4196, + "num_input_tokens_seen": 391138464, + "step": 7593 + }, + { + "epoch": 0.7465172845876022, + "grad_norm": 33.0748405456543, + "learning_rate": 8e-05, + "loss": 35.9725, + "num_input_tokens_seen": 391293492, + "step": 7596 + }, + { + "epoch": 0.7468121176383873, + "grad_norm": 34.8997688293457, + "learning_rate": 8e-05, + "loss": 36.4004, + "num_input_tokens_seen": 391473388, + "step": 7599 + }, + { + "epoch": 0.7471069506891722, + "grad_norm": 27.16301918029785, + "learning_rate": 8e-05, + "loss": 36.9327, + "num_input_tokens_seen": 391645156, + "step": 7602 + }, + { + "epoch": 0.7474017837399572, + "grad_norm": 30.942466735839844, + "learning_rate": 8e-05, + "loss": 33.8328, + "num_input_tokens_seen": 391786612, + "step": 7605 + }, + { + "epoch": 0.7476966167907423, + "grad_norm": 35.61334991455078, + "learning_rate": 8e-05, + "loss": 38.7541, + "num_input_tokens_seen": 391931904, + "step": 7608 + }, + { + "epoch": 0.7479914498415272, + "grad_norm": 105.30634307861328, + "learning_rate": 8e-05, + "loss": 39.6637, + "num_input_tokens_seen": 392086016, + "step": 7611 + }, + { + "epoch": 0.7482862828923122, + "grad_norm": 49.63365936279297, + "learning_rate": 8e-05, + "loss": 34.3093, + "num_input_tokens_seen": 392214580, + "step": 7614 + }, + { + "epoch": 0.7485811159430972, + "grad_norm": 25.027118682861328, + "learning_rate": 8e-05, + "loss": 34.7893, + "num_input_tokens_seen": 392371752, + "step": 7617 + }, + { + "epoch": 0.7488759489938822, + "grad_norm": 27.529848098754883, + "learning_rate": 8e-05, + "loss": 34.8015, + "num_input_tokens_seen": 392525552, + "step": 7620 + }, + { + "epoch": 0.7491707820446672, + "grad_norm": 26.25337791442871, + "learning_rate": 8e-05, + "loss": 36.5017, + "num_input_tokens_seen": 392649924, + "step": 7623 + }, + { + "epoch": 0.7494656150954522, + "grad_norm": 52.97135925292969, + "learning_rate": 8e-05, + "loss": 40.43, + "num_input_tokens_seen": 392801924, + "step": 7626 + }, + { + "epoch": 0.7497604481462372, + "grad_norm": 21.912799835205078, + "learning_rate": 8e-05, + "loss": 34.4445, + "num_input_tokens_seen": 392964832, + "step": 7629 + }, + { + "epoch": 0.7500552811970221, + "grad_norm": 23.80307960510254, + "learning_rate": 8e-05, + "loss": 36.7419, + "num_input_tokens_seen": 393123684, + "step": 7632 + }, + { + "epoch": 0.7503501142478072, + "grad_norm": 23.64950180053711, + "learning_rate": 8e-05, + "loss": 35.1361, + "num_input_tokens_seen": 393286600, + "step": 7635 + }, + { + "epoch": 0.7506449472985922, + "grad_norm": 24.831253051757812, + "learning_rate": 8e-05, + "loss": 34.1056, + "num_input_tokens_seen": 393443688, + "step": 7638 + }, + { + "epoch": 0.7509397803493771, + "grad_norm": 23.34168815612793, + "learning_rate": 8e-05, + "loss": 36.7828, + "num_input_tokens_seen": 393601180, + "step": 7641 + }, + { + "epoch": 0.7512346134001622, + "grad_norm": 30.1611328125, + "learning_rate": 8e-05, + "loss": 33.5108, + "num_input_tokens_seen": 393738140, + "step": 7644 + }, + { + "epoch": 0.7515294464509471, + "grad_norm": 27.13521385192871, + "learning_rate": 8e-05, + "loss": 40.2198, + "num_input_tokens_seen": 393886392, + "step": 7647 + }, + { + "epoch": 0.7518242795017321, + "grad_norm": 27.22330093383789, + "learning_rate": 8e-05, + "loss": 37.5227, + "num_input_tokens_seen": 394037900, + "step": 7650 + }, + { + "epoch": 0.7521191125525172, + "grad_norm": 21.45113182067871, + "learning_rate": 8e-05, + "loss": 37.1464, + "num_input_tokens_seen": 394194376, + "step": 7653 + }, + { + "epoch": 0.7524139456033021, + "grad_norm": 20.75296974182129, + "learning_rate": 8e-05, + "loss": 31.1379, + "num_input_tokens_seen": 394340592, + "step": 7656 + }, + { + "epoch": 0.7527087786540871, + "grad_norm": 25.353015899658203, + "learning_rate": 8e-05, + "loss": 32.2977, + "num_input_tokens_seen": 394499732, + "step": 7659 + }, + { + "epoch": 0.7530036117048721, + "grad_norm": 23.255964279174805, + "learning_rate": 8e-05, + "loss": 38.8994, + "num_input_tokens_seen": 394655136, + "step": 7662 + }, + { + "epoch": 0.7532984447556571, + "grad_norm": 26.22134017944336, + "learning_rate": 8e-05, + "loss": 40.9635, + "num_input_tokens_seen": 394797248, + "step": 7665 + }, + { + "epoch": 0.7535932778064421, + "grad_norm": 23.839248657226562, + "learning_rate": 8e-05, + "loss": 35.2768, + "num_input_tokens_seen": 394952920, + "step": 7668 + }, + { + "epoch": 0.7538881108572271, + "grad_norm": 26.770469665527344, + "learning_rate": 8e-05, + "loss": 34.1852, + "num_input_tokens_seen": 395082960, + "step": 7671 + }, + { + "epoch": 0.7541829439080121, + "grad_norm": 26.16197967529297, + "learning_rate": 8e-05, + "loss": 39.2824, + "num_input_tokens_seen": 395222292, + "step": 7674 + }, + { + "epoch": 0.754477776958797, + "grad_norm": 22.692888259887695, + "learning_rate": 8e-05, + "loss": 35.6573, + "num_input_tokens_seen": 395378916, + "step": 7677 + }, + { + "epoch": 0.7547726100095821, + "grad_norm": 29.958852767944336, + "learning_rate": 8e-05, + "loss": 37.3705, + "num_input_tokens_seen": 395524172, + "step": 7680 + }, + { + "epoch": 0.7550674430603671, + "grad_norm": 25.045122146606445, + "learning_rate": 8e-05, + "loss": 37.268, + "num_input_tokens_seen": 395680420, + "step": 7683 + }, + { + "epoch": 0.755362276111152, + "grad_norm": 21.826213836669922, + "learning_rate": 8e-05, + "loss": 33.6373, + "num_input_tokens_seen": 395855216, + "step": 7686 + }, + { + "epoch": 0.7556571091619371, + "grad_norm": 27.030338287353516, + "learning_rate": 8e-05, + "loss": 35.9748, + "num_input_tokens_seen": 396020044, + "step": 7689 + }, + { + "epoch": 0.755951942212722, + "grad_norm": 32.4162483215332, + "learning_rate": 8e-05, + "loss": 37.4893, + "num_input_tokens_seen": 396178644, + "step": 7692 + }, + { + "epoch": 0.756246775263507, + "grad_norm": 39.71582794189453, + "learning_rate": 8e-05, + "loss": 34.4656, + "num_input_tokens_seen": 396337480, + "step": 7695 + }, + { + "epoch": 0.7565416083142921, + "grad_norm": 39.37307357788086, + "learning_rate": 8e-05, + "loss": 37.185, + "num_input_tokens_seen": 396513736, + "step": 7698 + }, + { + "epoch": 0.756836441365077, + "grad_norm": 26.994949340820312, + "learning_rate": 8e-05, + "loss": 36.2526, + "num_input_tokens_seen": 396694672, + "step": 7701 + }, + { + "epoch": 0.757131274415862, + "grad_norm": 25.13683319091797, + "learning_rate": 8e-05, + "loss": 33.1264, + "num_input_tokens_seen": 396859188, + "step": 7704 + }, + { + "epoch": 0.757426107466647, + "grad_norm": 28.42525291442871, + "learning_rate": 8e-05, + "loss": 36.3175, + "num_input_tokens_seen": 396999932, + "step": 7707 + }, + { + "epoch": 0.757720940517432, + "grad_norm": 25.426481246948242, + "learning_rate": 8e-05, + "loss": 35.1644, + "num_input_tokens_seen": 397149508, + "step": 7710 + }, + { + "epoch": 0.758015773568217, + "grad_norm": 25.66521644592285, + "learning_rate": 8e-05, + "loss": 33.9718, + "num_input_tokens_seen": 397309256, + "step": 7713 + }, + { + "epoch": 0.758310606619002, + "grad_norm": 25.486326217651367, + "learning_rate": 8e-05, + "loss": 36.1877, + "num_input_tokens_seen": 397470364, + "step": 7716 + }, + { + "epoch": 0.758605439669787, + "grad_norm": 25.189796447753906, + "learning_rate": 8e-05, + "loss": 33.6886, + "num_input_tokens_seen": 397634620, + "step": 7719 + }, + { + "epoch": 0.7589002727205719, + "grad_norm": 48.64822769165039, + "learning_rate": 8e-05, + "loss": 38.49, + "num_input_tokens_seen": 397795032, + "step": 7722 + }, + { + "epoch": 0.759195105771357, + "grad_norm": 29.638479232788086, + "learning_rate": 8e-05, + "loss": 37.9385, + "num_input_tokens_seen": 397962232, + "step": 7725 + }, + { + "epoch": 0.759489938822142, + "grad_norm": 32.13875961303711, + "learning_rate": 8e-05, + "loss": 35.1371, + "num_input_tokens_seen": 398092100, + "step": 7728 + }, + { + "epoch": 0.7597847718729269, + "grad_norm": 34.010276794433594, + "learning_rate": 8e-05, + "loss": 35.8284, + "num_input_tokens_seen": 398269600, + "step": 7731 + }, + { + "epoch": 0.760079604923712, + "grad_norm": 27.5978946685791, + "learning_rate": 8e-05, + "loss": 39.5413, + "num_input_tokens_seen": 398418832, + "step": 7734 + }, + { + "epoch": 0.7603744379744969, + "grad_norm": 28.058717727661133, + "learning_rate": 8e-05, + "loss": 38.8165, + "num_input_tokens_seen": 398581128, + "step": 7737 + }, + { + "epoch": 0.7606692710252819, + "grad_norm": 22.413394927978516, + "learning_rate": 8e-05, + "loss": 36.9416, + "num_input_tokens_seen": 398735244, + "step": 7740 + }, + { + "epoch": 0.760964104076067, + "grad_norm": 22.70956802368164, + "learning_rate": 8e-05, + "loss": 33.7998, + "num_input_tokens_seen": 398882056, + "step": 7743 + }, + { + "epoch": 0.7612589371268519, + "grad_norm": 34.98361587524414, + "learning_rate": 8e-05, + "loss": 37.2277, + "num_input_tokens_seen": 399034480, + "step": 7746 + }, + { + "epoch": 0.7615537701776369, + "grad_norm": 21.232566833496094, + "learning_rate": 8e-05, + "loss": 37.4406, + "num_input_tokens_seen": 399183192, + "step": 7749 + }, + { + "epoch": 0.761848603228422, + "grad_norm": 23.489994049072266, + "learning_rate": 8e-05, + "loss": 37.4302, + "num_input_tokens_seen": 399360340, + "step": 7752 + }, + { + "epoch": 0.7621434362792069, + "grad_norm": 29.505022048950195, + "learning_rate": 8e-05, + "loss": 35.7313, + "num_input_tokens_seen": 399537496, + "step": 7755 + }, + { + "epoch": 0.7624382693299919, + "grad_norm": 22.35037612915039, + "learning_rate": 8e-05, + "loss": 36.3991, + "num_input_tokens_seen": 399697048, + "step": 7758 + }, + { + "epoch": 0.7627331023807769, + "grad_norm": 22.932846069335938, + "learning_rate": 8e-05, + "loss": 41.2068, + "num_input_tokens_seen": 399855616, + "step": 7761 + }, + { + "epoch": 0.7630279354315619, + "grad_norm": 26.854496002197266, + "learning_rate": 8e-05, + "loss": 32.3901, + "num_input_tokens_seen": 400006544, + "step": 7764 + }, + { + "epoch": 0.7633227684823469, + "grad_norm": 26.923646926879883, + "learning_rate": 8e-05, + "loss": 39.4342, + "num_input_tokens_seen": 400169056, + "step": 7767 + }, + { + "epoch": 0.7636176015331319, + "grad_norm": 24.59294891357422, + "learning_rate": 8e-05, + "loss": 35.3785, + "num_input_tokens_seen": 400324716, + "step": 7770 + }, + { + "epoch": 0.7639124345839169, + "grad_norm": 44.60205841064453, + "learning_rate": 8e-05, + "loss": 36.1143, + "num_input_tokens_seen": 400469124, + "step": 7773 + }, + { + "epoch": 0.7642072676347018, + "grad_norm": 30.42624855041504, + "learning_rate": 8e-05, + "loss": 37.0372, + "num_input_tokens_seen": 400632716, + "step": 7776 + }, + { + "epoch": 0.7645021006854869, + "grad_norm": 23.208675384521484, + "learning_rate": 8e-05, + "loss": 32.5705, + "num_input_tokens_seen": 400798540, + "step": 7779 + }, + { + "epoch": 0.7647969337362719, + "grad_norm": 29.40182113647461, + "learning_rate": 8e-05, + "loss": 42.2359, + "num_input_tokens_seen": 400970840, + "step": 7782 + }, + { + "epoch": 0.7650917667870568, + "grad_norm": 22.822834014892578, + "learning_rate": 8e-05, + "loss": 35.2608, + "num_input_tokens_seen": 401137072, + "step": 7785 + }, + { + "epoch": 0.7653865998378419, + "grad_norm": 22.621362686157227, + "learning_rate": 8e-05, + "loss": 35.9634, + "num_input_tokens_seen": 401297236, + "step": 7788 + }, + { + "epoch": 0.7656814328886268, + "grad_norm": 23.71027183532715, + "learning_rate": 8e-05, + "loss": 37.8802, + "num_input_tokens_seen": 401453132, + "step": 7791 + }, + { + "epoch": 0.7659762659394118, + "grad_norm": 21.791362762451172, + "learning_rate": 8e-05, + "loss": 36.5754, + "num_input_tokens_seen": 401615016, + "step": 7794 + }, + { + "epoch": 0.7662710989901969, + "grad_norm": 22.979232788085938, + "learning_rate": 8e-05, + "loss": 37.7297, + "num_input_tokens_seen": 401752296, + "step": 7797 + }, + { + "epoch": 0.7665659320409818, + "grad_norm": 28.699113845825195, + "learning_rate": 8e-05, + "loss": 34.315, + "num_input_tokens_seen": 401908632, + "step": 7800 + }, + { + "epoch": 0.7668607650917668, + "grad_norm": 25.213712692260742, + "learning_rate": 8e-05, + "loss": 35.3176, + "num_input_tokens_seen": 402057280, + "step": 7803 + }, + { + "epoch": 0.7671555981425517, + "grad_norm": 23.474750518798828, + "learning_rate": 8e-05, + "loss": 38.3909, + "num_input_tokens_seen": 402225972, + "step": 7806 + }, + { + "epoch": 0.7674504311933368, + "grad_norm": 32.31821060180664, + "learning_rate": 8e-05, + "loss": 39.6794, + "num_input_tokens_seen": 402383744, + "step": 7809 + }, + { + "epoch": 0.7677452642441218, + "grad_norm": 26.248733520507812, + "learning_rate": 8e-05, + "loss": 35.504, + "num_input_tokens_seen": 402537292, + "step": 7812 + }, + { + "epoch": 0.7680400972949067, + "grad_norm": 31.621732711791992, + "learning_rate": 8e-05, + "loss": 33.5155, + "num_input_tokens_seen": 402678688, + "step": 7815 + }, + { + "epoch": 0.7683349303456918, + "grad_norm": 22.54973793029785, + "learning_rate": 8e-05, + "loss": 35.1997, + "num_input_tokens_seen": 402824144, + "step": 7818 + }, + { + "epoch": 0.7686297633964767, + "grad_norm": 23.661958694458008, + "learning_rate": 8e-05, + "loss": 39.7663, + "num_input_tokens_seen": 402990972, + "step": 7821 + }, + { + "epoch": 0.7689245964472617, + "grad_norm": 48.45589828491211, + "learning_rate": 8e-05, + "loss": 35.8389, + "num_input_tokens_seen": 403146060, + "step": 7824 + }, + { + "epoch": 0.7692194294980468, + "grad_norm": 28.95506477355957, + "learning_rate": 8e-05, + "loss": 38.4829, + "num_input_tokens_seen": 403293764, + "step": 7827 + }, + { + "epoch": 0.7695142625488317, + "grad_norm": 24.518653869628906, + "learning_rate": 8e-05, + "loss": 36.5637, + "num_input_tokens_seen": 403467032, + "step": 7830 + }, + { + "epoch": 0.7698090955996167, + "grad_norm": 187.11962890625, + "learning_rate": 8e-05, + "loss": 35.7274, + "num_input_tokens_seen": 403621044, + "step": 7833 + }, + { + "epoch": 0.7701039286504017, + "grad_norm": 29.056535720825195, + "learning_rate": 8e-05, + "loss": 34.5355, + "num_input_tokens_seen": 403776084, + "step": 7836 + }, + { + "epoch": 0.7703987617011867, + "grad_norm": 25.228620529174805, + "learning_rate": 8e-05, + "loss": 36.6865, + "num_input_tokens_seen": 403929324, + "step": 7839 + }, + { + "epoch": 0.7706935947519717, + "grad_norm": 29.44222640991211, + "learning_rate": 8e-05, + "loss": 36.8643, + "num_input_tokens_seen": 404084380, + "step": 7842 + }, + { + "epoch": 0.7709884278027567, + "grad_norm": 27.131668090820312, + "learning_rate": 8e-05, + "loss": 37.6504, + "num_input_tokens_seen": 404237284, + "step": 7845 + }, + { + "epoch": 0.7712832608535417, + "grad_norm": 268.1426086425781, + "learning_rate": 8e-05, + "loss": 33.8298, + "num_input_tokens_seen": 404377928, + "step": 7848 + }, + { + "epoch": 0.7715780939043266, + "grad_norm": 24.818805694580078, + "learning_rate": 8e-05, + "loss": 39.3884, + "num_input_tokens_seen": 404542604, + "step": 7851 + }, + { + "epoch": 0.7718729269551117, + "grad_norm": 29.92938995361328, + "learning_rate": 8e-05, + "loss": 34.5737, + "num_input_tokens_seen": 404701704, + "step": 7854 + }, + { + "epoch": 0.7721677600058967, + "grad_norm": 22.9791259765625, + "learning_rate": 8e-05, + "loss": 29.5506, + "num_input_tokens_seen": 404848704, + "step": 7857 + }, + { + "epoch": 0.7724625930566816, + "grad_norm": 56.71049880981445, + "learning_rate": 8e-05, + "loss": 38.5634, + "num_input_tokens_seen": 405007396, + "step": 7860 + }, + { + "epoch": 0.7727574261074667, + "grad_norm": 40.992950439453125, + "learning_rate": 8e-05, + "loss": 37.3678, + "num_input_tokens_seen": 405178424, + "step": 7863 + }, + { + "epoch": 0.7730522591582516, + "grad_norm": 29.336580276489258, + "learning_rate": 8e-05, + "loss": 36.4169, + "num_input_tokens_seen": 405340732, + "step": 7866 + }, + { + "epoch": 0.7733470922090366, + "grad_norm": 24.261932373046875, + "learning_rate": 8e-05, + "loss": 36.5978, + "num_input_tokens_seen": 405507356, + "step": 7869 + }, + { + "epoch": 0.7736419252598217, + "grad_norm": 26.533891677856445, + "learning_rate": 8e-05, + "loss": 38.5256, + "num_input_tokens_seen": 405644324, + "step": 7872 + }, + { + "epoch": 0.7739367583106066, + "grad_norm": 26.40947723388672, + "learning_rate": 8e-05, + "loss": 38.562, + "num_input_tokens_seen": 405801196, + "step": 7875 + }, + { + "epoch": 0.7742315913613916, + "grad_norm": 23.67729377746582, + "learning_rate": 8e-05, + "loss": 34.4802, + "num_input_tokens_seen": 405938068, + "step": 7878 + }, + { + "epoch": 0.7745264244121766, + "grad_norm": 30.940881729125977, + "learning_rate": 8e-05, + "loss": 35.3962, + "num_input_tokens_seen": 406090084, + "step": 7881 + }, + { + "epoch": 0.7748212574629616, + "grad_norm": 38.28596115112305, + "learning_rate": 8e-05, + "loss": 37.0547, + "num_input_tokens_seen": 406233952, + "step": 7884 + }, + { + "epoch": 0.7751160905137466, + "grad_norm": 26.43083381652832, + "learning_rate": 8e-05, + "loss": 37.8771, + "num_input_tokens_seen": 406400136, + "step": 7887 + }, + { + "epoch": 0.7754109235645316, + "grad_norm": 50.69662094116211, + "learning_rate": 8e-05, + "loss": 34.8928, + "num_input_tokens_seen": 406577860, + "step": 7890 + }, + { + "epoch": 0.7757057566153166, + "grad_norm": 26.12557029724121, + "learning_rate": 8e-05, + "loss": 34.7082, + "num_input_tokens_seen": 406727848, + "step": 7893 + }, + { + "epoch": 0.7760005896661015, + "grad_norm": 26.89127540588379, + "learning_rate": 8e-05, + "loss": 35.4053, + "num_input_tokens_seen": 406882000, + "step": 7896 + }, + { + "epoch": 0.7762954227168866, + "grad_norm": 25.696537017822266, + "learning_rate": 8e-05, + "loss": 32.8174, + "num_input_tokens_seen": 407037656, + "step": 7899 + }, + { + "epoch": 0.7765902557676716, + "grad_norm": 21.538856506347656, + "learning_rate": 8e-05, + "loss": 33.3996, + "num_input_tokens_seen": 407201080, + "step": 7902 + }, + { + "epoch": 0.7768850888184565, + "grad_norm": 28.80377769470215, + "learning_rate": 8e-05, + "loss": 36.5668, + "num_input_tokens_seen": 407377780, + "step": 7905 + }, + { + "epoch": 0.7771799218692416, + "grad_norm": 30.080427169799805, + "learning_rate": 8e-05, + "loss": 37.3908, + "num_input_tokens_seen": 407533544, + "step": 7908 + }, + { + "epoch": 0.7774747549200265, + "grad_norm": 35.93389892578125, + "learning_rate": 8e-05, + "loss": 36.7938, + "num_input_tokens_seen": 407698404, + "step": 7911 + }, + { + "epoch": 0.7777695879708115, + "grad_norm": 26.772830963134766, + "learning_rate": 8e-05, + "loss": 35.0283, + "num_input_tokens_seen": 407853008, + "step": 7914 + }, + { + "epoch": 0.7780644210215966, + "grad_norm": 32.96590042114258, + "learning_rate": 8e-05, + "loss": 36.3987, + "num_input_tokens_seen": 408013784, + "step": 7917 + }, + { + "epoch": 0.7783592540723815, + "grad_norm": 26.807809829711914, + "learning_rate": 8e-05, + "loss": 37.276, + "num_input_tokens_seen": 408172336, + "step": 7920 + }, + { + "epoch": 0.7786540871231665, + "grad_norm": 25.73488998413086, + "learning_rate": 8e-05, + "loss": 34.6074, + "num_input_tokens_seen": 408320856, + "step": 7923 + }, + { + "epoch": 0.7789489201739515, + "grad_norm": 23.03981590270996, + "learning_rate": 8e-05, + "loss": 39.2311, + "num_input_tokens_seen": 408461160, + "step": 7926 + }, + { + "epoch": 0.7792437532247365, + "grad_norm": 24.349489212036133, + "learning_rate": 8e-05, + "loss": 36.4847, + "num_input_tokens_seen": 408610200, + "step": 7929 + }, + { + "epoch": 0.7795385862755215, + "grad_norm": 25.687152862548828, + "learning_rate": 8e-05, + "loss": 37.7461, + "num_input_tokens_seen": 408765556, + "step": 7932 + }, + { + "epoch": 0.7798334193263065, + "grad_norm": 25.157957077026367, + "learning_rate": 8e-05, + "loss": 38.5068, + "num_input_tokens_seen": 408937720, + "step": 7935 + }, + { + "epoch": 0.7801282523770915, + "grad_norm": 21.769214630126953, + "learning_rate": 8e-05, + "loss": 36.2921, + "num_input_tokens_seen": 409109784, + "step": 7938 + }, + { + "epoch": 0.7804230854278764, + "grad_norm": 20.435977935791016, + "learning_rate": 8e-05, + "loss": 35.1186, + "num_input_tokens_seen": 409279036, + "step": 7941 + }, + { + "epoch": 0.7807179184786615, + "grad_norm": 25.90553092956543, + "learning_rate": 8e-05, + "loss": 36.176, + "num_input_tokens_seen": 409425764, + "step": 7944 + }, + { + "epoch": 0.7810127515294465, + "grad_norm": 23.491077423095703, + "learning_rate": 8e-05, + "loss": 38.1959, + "num_input_tokens_seen": 409588992, + "step": 7947 + }, + { + "epoch": 0.7813075845802314, + "grad_norm": 24.66498565673828, + "learning_rate": 8e-05, + "loss": 38.0055, + "num_input_tokens_seen": 409767288, + "step": 7950 + }, + { + "epoch": 0.7816024176310165, + "grad_norm": 27.49262046813965, + "learning_rate": 8e-05, + "loss": 35.4031, + "num_input_tokens_seen": 409910484, + "step": 7953 + }, + { + "epoch": 0.7818972506818014, + "grad_norm": 21.672725677490234, + "learning_rate": 8e-05, + "loss": 34.6222, + "num_input_tokens_seen": 410071500, + "step": 7956 + }, + { + "epoch": 0.7821920837325864, + "grad_norm": 25.07799530029297, + "learning_rate": 8e-05, + "loss": 33.0897, + "num_input_tokens_seen": 410234160, + "step": 7959 + }, + { + "epoch": 0.7824869167833715, + "grad_norm": 24.90636444091797, + "learning_rate": 8e-05, + "loss": 38.1614, + "num_input_tokens_seen": 410373068, + "step": 7962 + }, + { + "epoch": 0.7827817498341564, + "grad_norm": 20.44605827331543, + "learning_rate": 8e-05, + "loss": 32.2739, + "num_input_tokens_seen": 410509172, + "step": 7965 + }, + { + "epoch": 0.7830765828849414, + "grad_norm": 26.52505111694336, + "learning_rate": 8e-05, + "loss": 36.0589, + "num_input_tokens_seen": 410680432, + "step": 7968 + }, + { + "epoch": 0.7833714159357263, + "grad_norm": 56.93138122558594, + "learning_rate": 8e-05, + "loss": 32.2403, + "num_input_tokens_seen": 410834396, + "step": 7971 + }, + { + "epoch": 0.7836662489865114, + "grad_norm": 24.301488876342773, + "learning_rate": 8e-05, + "loss": 35.1784, + "num_input_tokens_seen": 410997416, + "step": 7974 + }, + { + "epoch": 0.7839610820372964, + "grad_norm": 22.580333709716797, + "learning_rate": 8e-05, + "loss": 31.316, + "num_input_tokens_seen": 411129556, + "step": 7977 + }, + { + "epoch": 0.7842559150880813, + "grad_norm": 20.43970489501953, + "learning_rate": 8e-05, + "loss": 37.1739, + "num_input_tokens_seen": 411311424, + "step": 7980 + }, + { + "epoch": 0.7845507481388664, + "grad_norm": 42.44162368774414, + "learning_rate": 8e-05, + "loss": 37.3535, + "num_input_tokens_seen": 411484096, + "step": 7983 + }, + { + "epoch": 0.7848455811896513, + "grad_norm": 26.987335205078125, + "learning_rate": 8e-05, + "loss": 39.5258, + "num_input_tokens_seen": 411633808, + "step": 7986 + }, + { + "epoch": 0.7851404142404363, + "grad_norm": 20.923507690429688, + "learning_rate": 8e-05, + "loss": 34.7262, + "num_input_tokens_seen": 411790656, + "step": 7989 + }, + { + "epoch": 0.7854352472912214, + "grad_norm": 21.965740203857422, + "learning_rate": 8e-05, + "loss": 37.8557, + "num_input_tokens_seen": 411961344, + "step": 7992 + }, + { + "epoch": 0.7857300803420063, + "grad_norm": 23.653724670410156, + "learning_rate": 8e-05, + "loss": 35.9979, + "num_input_tokens_seen": 412100232, + "step": 7995 + }, + { + "epoch": 0.7860249133927913, + "grad_norm": 20.85638427734375, + "learning_rate": 8e-05, + "loss": 36.1657, + "num_input_tokens_seen": 412259464, + "step": 7998 + }, + { + "epoch": 0.7862214687599813, + "eval_gen_len": 35.32, + "eval_loss": 2.3185036182403564, + "eval_rouge1": 46.647, + "eval_rouge2": 29.8361, + "eval_rougeL": 42.7361, + "eval_rougeLsum": 43.0175, + "eval_runtime": 104.6242, + "eval_samples_per_second": 1.912, + "eval_steps_per_second": 0.478, + "num_input_tokens_seen": 412353688, + "step": 8000 + }, + { + "epoch": 0.7863197464435763, + "grad_norm": 22.64118194580078, + "learning_rate": 8e-05, + "loss": 36.9061, + "num_input_tokens_seen": 412401800, + "step": 8001 + }, + { + "epoch": 0.7866145794943613, + "grad_norm": 24.01075553894043, + "learning_rate": 8e-05, + "loss": 37.7225, + "num_input_tokens_seen": 412571928, + "step": 8004 + }, + { + "epoch": 0.7869094125451463, + "grad_norm": 24.196077346801758, + "learning_rate": 8e-05, + "loss": 36.2338, + "num_input_tokens_seen": 412716236, + "step": 8007 + }, + { + "epoch": 0.7872042455959313, + "grad_norm": 23.942615509033203, + "learning_rate": 8e-05, + "loss": 36.0792, + "num_input_tokens_seen": 412874488, + "step": 8010 + }, + { + "epoch": 0.7874990786467163, + "grad_norm": 23.444011688232422, + "learning_rate": 8e-05, + "loss": 34.9672, + "num_input_tokens_seen": 413001512, + "step": 8013 + }, + { + "epoch": 0.7877939116975012, + "grad_norm": 24.958213806152344, + "learning_rate": 8e-05, + "loss": 32.6181, + "num_input_tokens_seen": 413153004, + "step": 8016 + }, + { + "epoch": 0.7880887447482863, + "grad_norm": 29.166627883911133, + "learning_rate": 8e-05, + "loss": 35.5564, + "num_input_tokens_seen": 413308188, + "step": 8019 + }, + { + "epoch": 0.7883835777990713, + "grad_norm": 26.949697494506836, + "learning_rate": 8e-05, + "loss": 36.4201, + "num_input_tokens_seen": 413440888, + "step": 8022 + }, + { + "epoch": 0.7886784108498562, + "grad_norm": 41.7303352355957, + "learning_rate": 8e-05, + "loss": 31.4589, + "num_input_tokens_seen": 413578524, + "step": 8025 + }, + { + "epoch": 0.7889732439006413, + "grad_norm": 37.500732421875, + "learning_rate": 8e-05, + "loss": 35.4136, + "num_input_tokens_seen": 413735264, + "step": 8028 + }, + { + "epoch": 0.7892680769514262, + "grad_norm": 33.136962890625, + "learning_rate": 8e-05, + "loss": 35.4613, + "num_input_tokens_seen": 413891816, + "step": 8031 + }, + { + "epoch": 0.7895629100022112, + "grad_norm": 20.599218368530273, + "learning_rate": 8e-05, + "loss": 36.9574, + "num_input_tokens_seen": 414043084, + "step": 8034 + }, + { + "epoch": 0.7898577430529963, + "grad_norm": 21.741914749145508, + "learning_rate": 8e-05, + "loss": 32.622, + "num_input_tokens_seen": 414202624, + "step": 8037 + }, + { + "epoch": 0.7901525761037812, + "grad_norm": 20.87574005126953, + "learning_rate": 8e-05, + "loss": 36.8067, + "num_input_tokens_seen": 414367648, + "step": 8040 + }, + { + "epoch": 0.7904474091545662, + "grad_norm": 23.262584686279297, + "learning_rate": 8e-05, + "loss": 38.4502, + "num_input_tokens_seen": 414525216, + "step": 8043 + }, + { + "epoch": 0.7907422422053512, + "grad_norm": 25.928823471069336, + "learning_rate": 8e-05, + "loss": 40.3445, + "num_input_tokens_seen": 414672264, + "step": 8046 + }, + { + "epoch": 0.7910370752561362, + "grad_norm": 24.576786041259766, + "learning_rate": 8e-05, + "loss": 36.1848, + "num_input_tokens_seen": 414833988, + "step": 8049 + }, + { + "epoch": 0.7913319083069212, + "grad_norm": 60.29381561279297, + "learning_rate": 8e-05, + "loss": 38.3726, + "num_input_tokens_seen": 414964680, + "step": 8052 + }, + { + "epoch": 0.7916267413577062, + "grad_norm": 21.849842071533203, + "learning_rate": 8e-05, + "loss": 34.8281, + "num_input_tokens_seen": 415133388, + "step": 8055 + }, + { + "epoch": 0.7919215744084912, + "grad_norm": 30.273502349853516, + "learning_rate": 8e-05, + "loss": 39.4244, + "num_input_tokens_seen": 415283668, + "step": 8058 + }, + { + "epoch": 0.7922164074592762, + "grad_norm": 24.501184463500977, + "learning_rate": 8e-05, + "loss": 37.741, + "num_input_tokens_seen": 415433676, + "step": 8061 + }, + { + "epoch": 0.7925112405100612, + "grad_norm": 24.874170303344727, + "learning_rate": 8e-05, + "loss": 34.0968, + "num_input_tokens_seen": 415591080, + "step": 8064 + }, + { + "epoch": 0.7928060735608462, + "grad_norm": 31.724990844726562, + "learning_rate": 8e-05, + "loss": 35.3581, + "num_input_tokens_seen": 415754600, + "step": 8067 + }, + { + "epoch": 0.7931009066116311, + "grad_norm": 28.470970153808594, + "learning_rate": 8e-05, + "loss": 34.9877, + "num_input_tokens_seen": 415916000, + "step": 8070 + }, + { + "epoch": 0.7933957396624162, + "grad_norm": 28.722963333129883, + "learning_rate": 8e-05, + "loss": 36.3972, + "num_input_tokens_seen": 416085508, + "step": 8073 + }, + { + "epoch": 0.7936905727132012, + "grad_norm": 30.28119659423828, + "learning_rate": 8e-05, + "loss": 38.09, + "num_input_tokens_seen": 416220544, + "step": 8076 + }, + { + "epoch": 0.7939854057639861, + "grad_norm": 27.0206241607666, + "learning_rate": 8e-05, + "loss": 41.4609, + "num_input_tokens_seen": 416375272, + "step": 8079 + }, + { + "epoch": 0.7942802388147712, + "grad_norm": 23.414587020874023, + "learning_rate": 8e-05, + "loss": 35.2382, + "num_input_tokens_seen": 416544904, + "step": 8082 + }, + { + "epoch": 0.7945750718655561, + "grad_norm": 24.774656295776367, + "learning_rate": 8e-05, + "loss": 36.4614, + "num_input_tokens_seen": 416707732, + "step": 8085 + }, + { + "epoch": 0.7948699049163411, + "grad_norm": 26.332231521606445, + "learning_rate": 8e-05, + "loss": 34.6727, + "num_input_tokens_seen": 416846512, + "step": 8088 + }, + { + "epoch": 0.7951647379671262, + "grad_norm": 23.728219985961914, + "learning_rate": 8e-05, + "loss": 38.4909, + "num_input_tokens_seen": 417024300, + "step": 8091 + }, + { + "epoch": 0.7954595710179111, + "grad_norm": 22.694969177246094, + "learning_rate": 8e-05, + "loss": 33.3829, + "num_input_tokens_seen": 417169300, + "step": 8094 + }, + { + "epoch": 0.7957544040686961, + "grad_norm": 21.721418380737305, + "learning_rate": 8e-05, + "loss": 37.5297, + "num_input_tokens_seen": 417310940, + "step": 8097 + }, + { + "epoch": 0.7960492371194811, + "grad_norm": 25.580392837524414, + "learning_rate": 8e-05, + "loss": 35.933, + "num_input_tokens_seen": 417492312, + "step": 8100 + }, + { + "epoch": 0.7963440701702661, + "grad_norm": 24.695316314697266, + "learning_rate": 8e-05, + "loss": 39.1894, + "num_input_tokens_seen": 417636016, + "step": 8103 + }, + { + "epoch": 0.7966389032210511, + "grad_norm": 27.233322143554688, + "learning_rate": 8e-05, + "loss": 36.8955, + "num_input_tokens_seen": 417801812, + "step": 8106 + }, + { + "epoch": 0.7969337362718361, + "grad_norm": 24.737655639648438, + "learning_rate": 8e-05, + "loss": 33.5013, + "num_input_tokens_seen": 417945108, + "step": 8109 + }, + { + "epoch": 0.7972285693226211, + "grad_norm": 26.381988525390625, + "learning_rate": 8e-05, + "loss": 37.7245, + "num_input_tokens_seen": 418084320, + "step": 8112 + }, + { + "epoch": 0.797523402373406, + "grad_norm": 20.971017837524414, + "learning_rate": 8e-05, + "loss": 32.9355, + "num_input_tokens_seen": 418243772, + "step": 8115 + }, + { + "epoch": 0.7978182354241911, + "grad_norm": 56.757423400878906, + "learning_rate": 8e-05, + "loss": 33.5788, + "num_input_tokens_seen": 418376468, + "step": 8118 + }, + { + "epoch": 0.7981130684749761, + "grad_norm": 23.269350051879883, + "learning_rate": 8e-05, + "loss": 33.1199, + "num_input_tokens_seen": 418523632, + "step": 8121 + }, + { + "epoch": 0.798407901525761, + "grad_norm": 27.43719482421875, + "learning_rate": 8e-05, + "loss": 37.3559, + "num_input_tokens_seen": 418683152, + "step": 8124 + }, + { + "epoch": 0.7987027345765461, + "grad_norm": 22.803434371948242, + "learning_rate": 8e-05, + "loss": 36.1157, + "num_input_tokens_seen": 418843020, + "step": 8127 + }, + { + "epoch": 0.798997567627331, + "grad_norm": 21.03816032409668, + "learning_rate": 8e-05, + "loss": 34.2601, + "num_input_tokens_seen": 419014256, + "step": 8130 + }, + { + "epoch": 0.799292400678116, + "grad_norm": 71.1124038696289, + "learning_rate": 8e-05, + "loss": 35.6755, + "num_input_tokens_seen": 419153168, + "step": 8133 + }, + { + "epoch": 0.7995872337289011, + "grad_norm": 21.754837036132812, + "learning_rate": 8e-05, + "loss": 33.553, + "num_input_tokens_seen": 419317532, + "step": 8136 + }, + { + "epoch": 0.799882066779686, + "grad_norm": 23.042221069335938, + "learning_rate": 8e-05, + "loss": 33.131, + "num_input_tokens_seen": 419481224, + "step": 8139 + }, + { + "epoch": 0.800176899830471, + "grad_norm": 24.852903366088867, + "learning_rate": 8e-05, + "loss": 32.4917, + "num_input_tokens_seen": 419639672, + "step": 8142 + }, + { + "epoch": 0.800471732881256, + "grad_norm": 48.62297058105469, + "learning_rate": 8e-05, + "loss": 33.921, + "num_input_tokens_seen": 419791948, + "step": 8145 + }, + { + "epoch": 0.800766565932041, + "grad_norm": 26.25032615661621, + "learning_rate": 8e-05, + "loss": 32.9486, + "num_input_tokens_seen": 419946984, + "step": 8148 + }, + { + "epoch": 0.801061398982826, + "grad_norm": 49.425872802734375, + "learning_rate": 8e-05, + "loss": 33.7382, + "num_input_tokens_seen": 420080392, + "step": 8151 + }, + { + "epoch": 0.801356232033611, + "grad_norm": 22.22492790222168, + "learning_rate": 8e-05, + "loss": 31.2606, + "num_input_tokens_seen": 420221328, + "step": 8154 + }, + { + "epoch": 0.801651065084396, + "grad_norm": 23.21967315673828, + "learning_rate": 8e-05, + "loss": 35.8521, + "num_input_tokens_seen": 420374892, + "step": 8157 + }, + { + "epoch": 0.8019458981351809, + "grad_norm": 21.875211715698242, + "learning_rate": 8e-05, + "loss": 37.2863, + "num_input_tokens_seen": 420528800, + "step": 8160 + }, + { + "epoch": 0.802240731185966, + "grad_norm": 23.19055938720703, + "learning_rate": 8e-05, + "loss": 39.5513, + "num_input_tokens_seen": 420685724, + "step": 8163 + }, + { + "epoch": 0.802535564236751, + "grad_norm": 53.719539642333984, + "learning_rate": 8e-05, + "loss": 37.4689, + "num_input_tokens_seen": 420843844, + "step": 8166 + }, + { + "epoch": 0.8028303972875359, + "grad_norm": 23.912782669067383, + "learning_rate": 8e-05, + "loss": 32.1369, + "num_input_tokens_seen": 421008060, + "step": 8169 + }, + { + "epoch": 0.803125230338321, + "grad_norm": 45.83617401123047, + "learning_rate": 8e-05, + "loss": 34.3147, + "num_input_tokens_seen": 421145352, + "step": 8172 + }, + { + "epoch": 0.8034200633891059, + "grad_norm": 37.71995544433594, + "learning_rate": 8e-05, + "loss": 35.0719, + "num_input_tokens_seen": 421296184, + "step": 8175 + }, + { + "epoch": 0.8037148964398909, + "grad_norm": 24.80175018310547, + "learning_rate": 8e-05, + "loss": 35.9003, + "num_input_tokens_seen": 421432260, + "step": 8178 + }, + { + "epoch": 0.804009729490676, + "grad_norm": 18.523286819458008, + "learning_rate": 8e-05, + "loss": 33.581, + "num_input_tokens_seen": 421601412, + "step": 8181 + }, + { + "epoch": 0.8043045625414609, + "grad_norm": 26.430187225341797, + "learning_rate": 8e-05, + "loss": 39.4251, + "num_input_tokens_seen": 421768668, + "step": 8184 + }, + { + "epoch": 0.8045993955922459, + "grad_norm": 19.468984603881836, + "learning_rate": 8e-05, + "loss": 35.396, + "num_input_tokens_seen": 421925492, + "step": 8187 + }, + { + "epoch": 0.8048942286430308, + "grad_norm": 24.663354873657227, + "learning_rate": 8e-05, + "loss": 36.6025, + "num_input_tokens_seen": 422102116, + "step": 8190 + }, + { + "epoch": 0.8051890616938159, + "grad_norm": 24.94499397277832, + "learning_rate": 8e-05, + "loss": 33.9554, + "num_input_tokens_seen": 422254632, + "step": 8193 + }, + { + "epoch": 0.8054838947446009, + "grad_norm": 27.692119598388672, + "learning_rate": 8e-05, + "loss": 36.4076, + "num_input_tokens_seen": 422399280, + "step": 8196 + }, + { + "epoch": 0.8057787277953858, + "grad_norm": 24.469003677368164, + "learning_rate": 8e-05, + "loss": 36.6231, + "num_input_tokens_seen": 422564724, + "step": 8199 + }, + { + "epoch": 0.8060735608461709, + "grad_norm": 24.645511627197266, + "learning_rate": 8e-05, + "loss": 33.4182, + "num_input_tokens_seen": 422693920, + "step": 8202 + }, + { + "epoch": 0.8063683938969558, + "grad_norm": 24.154550552368164, + "learning_rate": 8e-05, + "loss": 37.1308, + "num_input_tokens_seen": 422852320, + "step": 8205 + }, + { + "epoch": 0.8066632269477408, + "grad_norm": 20.739830017089844, + "learning_rate": 8e-05, + "loss": 34.6883, + "num_input_tokens_seen": 423036660, + "step": 8208 + }, + { + "epoch": 0.8069580599985259, + "grad_norm": 22.852832794189453, + "learning_rate": 8e-05, + "loss": 35.2849, + "num_input_tokens_seen": 423171956, + "step": 8211 + }, + { + "epoch": 0.8072528930493108, + "grad_norm": 22.849538803100586, + "learning_rate": 8e-05, + "loss": 37.724, + "num_input_tokens_seen": 423331536, + "step": 8214 + }, + { + "epoch": 0.8075477261000958, + "grad_norm": 26.494213104248047, + "learning_rate": 8e-05, + "loss": 34.4807, + "num_input_tokens_seen": 423458460, + "step": 8217 + }, + { + "epoch": 0.8078425591508808, + "grad_norm": 25.496356964111328, + "learning_rate": 8e-05, + "loss": 38.5775, + "num_input_tokens_seen": 423610528, + "step": 8220 + }, + { + "epoch": 0.8081373922016658, + "grad_norm": 20.508310317993164, + "learning_rate": 8e-05, + "loss": 31.4715, + "num_input_tokens_seen": 423747124, + "step": 8223 + }, + { + "epoch": 0.8084322252524508, + "grad_norm": 216.43284606933594, + "learning_rate": 8e-05, + "loss": 31.5416, + "num_input_tokens_seen": 423890224, + "step": 8226 + }, + { + "epoch": 0.8087270583032358, + "grad_norm": 24.748380661010742, + "learning_rate": 8e-05, + "loss": 35.1916, + "num_input_tokens_seen": 424043772, + "step": 8229 + }, + { + "epoch": 0.8090218913540208, + "grad_norm": 57.81846618652344, + "learning_rate": 8e-05, + "loss": 33.6554, + "num_input_tokens_seen": 424210108, + "step": 8232 + }, + { + "epoch": 0.8093167244048057, + "grad_norm": 24.118955612182617, + "learning_rate": 8e-05, + "loss": 34.5261, + "num_input_tokens_seen": 424371540, + "step": 8235 + }, + { + "epoch": 0.8096115574555908, + "grad_norm": 27.515722274780273, + "learning_rate": 8e-05, + "loss": 33.6207, + "num_input_tokens_seen": 424535776, + "step": 8238 + }, + { + "epoch": 0.8099063905063758, + "grad_norm": 26.33366584777832, + "learning_rate": 8e-05, + "loss": 38.9131, + "num_input_tokens_seen": 424710928, + "step": 8241 + }, + { + "epoch": 0.8102012235571607, + "grad_norm": 23.612756729125977, + "learning_rate": 8e-05, + "loss": 35.7337, + "num_input_tokens_seen": 424873100, + "step": 8244 + }, + { + "epoch": 0.8104960566079458, + "grad_norm": 27.79807472229004, + "learning_rate": 8e-05, + "loss": 35.9495, + "num_input_tokens_seen": 425019436, + "step": 8247 + }, + { + "epoch": 0.8107908896587307, + "grad_norm": 27.202306747436523, + "learning_rate": 8e-05, + "loss": 35.9226, + "num_input_tokens_seen": 425193320, + "step": 8250 + }, + { + "epoch": 0.8110857227095157, + "grad_norm": 34.29397964477539, + "learning_rate": 8e-05, + "loss": 35.7965, + "num_input_tokens_seen": 425337520, + "step": 8253 + }, + { + "epoch": 0.8113805557603008, + "grad_norm": 24.603361129760742, + "learning_rate": 8e-05, + "loss": 35.9459, + "num_input_tokens_seen": 425479424, + "step": 8256 + }, + { + "epoch": 0.8116753888110857, + "grad_norm": 26.088085174560547, + "learning_rate": 8e-05, + "loss": 36.1599, + "num_input_tokens_seen": 425633808, + "step": 8259 + }, + { + "epoch": 0.8119702218618707, + "grad_norm": 24.88553810119629, + "learning_rate": 8e-05, + "loss": 36.8701, + "num_input_tokens_seen": 425789476, + "step": 8262 + }, + { + "epoch": 0.8122650549126557, + "grad_norm": 26.12474250793457, + "learning_rate": 8e-05, + "loss": 36.3213, + "num_input_tokens_seen": 425921604, + "step": 8265 + }, + { + "epoch": 0.8125598879634407, + "grad_norm": 24.976924896240234, + "learning_rate": 8e-05, + "loss": 38.2543, + "num_input_tokens_seen": 426076664, + "step": 8268 + }, + { + "epoch": 0.8128547210142257, + "grad_norm": 25.509859085083008, + "learning_rate": 8e-05, + "loss": 35.7021, + "num_input_tokens_seen": 426225572, + "step": 8271 + }, + { + "epoch": 0.8131495540650107, + "grad_norm": 23.46617317199707, + "learning_rate": 8e-05, + "loss": 36.6647, + "num_input_tokens_seen": 426368208, + "step": 8274 + }, + { + "epoch": 0.8134443871157957, + "grad_norm": 23.891382217407227, + "learning_rate": 8e-05, + "loss": 36.64, + "num_input_tokens_seen": 426528880, + "step": 8277 + }, + { + "epoch": 0.8137392201665806, + "grad_norm": 25.366939544677734, + "learning_rate": 8e-05, + "loss": 36.1003, + "num_input_tokens_seen": 426708144, + "step": 8280 + }, + { + "epoch": 0.8140340532173657, + "grad_norm": 24.68431854248047, + "learning_rate": 8e-05, + "loss": 36.2022, + "num_input_tokens_seen": 426870136, + "step": 8283 + }, + { + "epoch": 0.8143288862681507, + "grad_norm": 21.63142204284668, + "learning_rate": 8e-05, + "loss": 33.1946, + "num_input_tokens_seen": 427021880, + "step": 8286 + }, + { + "epoch": 0.8146237193189356, + "grad_norm": 20.929309844970703, + "learning_rate": 8e-05, + "loss": 36.1105, + "num_input_tokens_seen": 427174376, + "step": 8289 + }, + { + "epoch": 0.8149185523697207, + "grad_norm": 23.457090377807617, + "learning_rate": 8e-05, + "loss": 35.412, + "num_input_tokens_seen": 427319372, + "step": 8292 + }, + { + "epoch": 0.8152133854205056, + "grad_norm": 24.533313751220703, + "learning_rate": 8e-05, + "loss": 39.5555, + "num_input_tokens_seen": 427484580, + "step": 8295 + }, + { + "epoch": 0.8155082184712906, + "grad_norm": 19.934356689453125, + "learning_rate": 8e-05, + "loss": 33.2841, + "num_input_tokens_seen": 427633100, + "step": 8298 + }, + { + "epoch": 0.8158030515220757, + "grad_norm": 25.337459564208984, + "learning_rate": 8e-05, + "loss": 39.6166, + "num_input_tokens_seen": 427768212, + "step": 8301 + }, + { + "epoch": 0.8160978845728606, + "grad_norm": 24.47291374206543, + "learning_rate": 8e-05, + "loss": 36.4094, + "num_input_tokens_seen": 427922476, + "step": 8304 + }, + { + "epoch": 0.8163927176236456, + "grad_norm": 21.60689353942871, + "learning_rate": 8e-05, + "loss": 34.1587, + "num_input_tokens_seen": 428073292, + "step": 8307 + }, + { + "epoch": 0.8166875506744306, + "grad_norm": 25.36764907836914, + "learning_rate": 8e-05, + "loss": 38.1677, + "num_input_tokens_seen": 428248860, + "step": 8310 + }, + { + "epoch": 0.8169823837252156, + "grad_norm": 27.005537033081055, + "learning_rate": 8e-05, + "loss": 38.2046, + "num_input_tokens_seen": 428395584, + "step": 8313 + }, + { + "epoch": 0.8172772167760006, + "grad_norm": 21.089380264282227, + "learning_rate": 8e-05, + "loss": 32.8911, + "num_input_tokens_seen": 428549272, + "step": 8316 + }, + { + "epoch": 0.8175720498267856, + "grad_norm": 24.877689361572266, + "learning_rate": 8e-05, + "loss": 35.3285, + "num_input_tokens_seen": 428698504, + "step": 8319 + }, + { + "epoch": 0.8178668828775706, + "grad_norm": 29.171049118041992, + "learning_rate": 8e-05, + "loss": 34.9713, + "num_input_tokens_seen": 428851904, + "step": 8322 + }, + { + "epoch": 0.8181617159283555, + "grad_norm": 25.88189697265625, + "learning_rate": 8e-05, + "loss": 30.8571, + "num_input_tokens_seen": 429018916, + "step": 8325 + }, + { + "epoch": 0.8184565489791406, + "grad_norm": 23.980445861816406, + "learning_rate": 8e-05, + "loss": 37.0643, + "num_input_tokens_seen": 429173112, + "step": 8328 + }, + { + "epoch": 0.8187513820299256, + "grad_norm": 21.506813049316406, + "learning_rate": 8e-05, + "loss": 37.2138, + "num_input_tokens_seen": 429331240, + "step": 8331 + }, + { + "epoch": 0.8190462150807105, + "grad_norm": 20.923595428466797, + "learning_rate": 8e-05, + "loss": 33.7857, + "num_input_tokens_seen": 429477164, + "step": 8334 + }, + { + "epoch": 0.8193410481314956, + "grad_norm": 24.962444305419922, + "learning_rate": 8e-05, + "loss": 36.0604, + "num_input_tokens_seen": 429642332, + "step": 8337 + }, + { + "epoch": 0.8196358811822805, + "grad_norm": 21.74043846130371, + "learning_rate": 8e-05, + "loss": 41.008, + "num_input_tokens_seen": 429789980, + "step": 8340 + }, + { + "epoch": 0.8199307142330655, + "grad_norm": 33.76387405395508, + "learning_rate": 8e-05, + "loss": 35.5676, + "num_input_tokens_seen": 429946564, + "step": 8343 + }, + { + "epoch": 0.8202255472838506, + "grad_norm": 19.511245727539062, + "learning_rate": 8e-05, + "loss": 34.3012, + "num_input_tokens_seen": 430082288, + "step": 8346 + }, + { + "epoch": 0.8205203803346355, + "grad_norm": 25.11481285095215, + "learning_rate": 8e-05, + "loss": 35.0994, + "num_input_tokens_seen": 430241712, + "step": 8349 + }, + { + "epoch": 0.8208152133854205, + "grad_norm": 28.375946044921875, + "learning_rate": 8e-05, + "loss": 35.1421, + "num_input_tokens_seen": 430369624, + "step": 8352 + }, + { + "epoch": 0.8211100464362056, + "grad_norm": 23.08304214477539, + "learning_rate": 8e-05, + "loss": 36.7995, + "num_input_tokens_seen": 430523416, + "step": 8355 + }, + { + "epoch": 0.8214048794869905, + "grad_norm": 21.1984806060791, + "learning_rate": 8e-05, + "loss": 35.5492, + "num_input_tokens_seen": 430676216, + "step": 8358 + }, + { + "epoch": 0.8216997125377755, + "grad_norm": 25.081396102905273, + "learning_rate": 8e-05, + "loss": 33.4143, + "num_input_tokens_seen": 430813496, + "step": 8361 + }, + { + "epoch": 0.8219945455885604, + "grad_norm": 22.87725257873535, + "learning_rate": 8e-05, + "loss": 35.4886, + "num_input_tokens_seen": 430970352, + "step": 8364 + }, + { + "epoch": 0.8222893786393455, + "grad_norm": 22.641258239746094, + "learning_rate": 8e-05, + "loss": 36.1933, + "num_input_tokens_seen": 431131588, + "step": 8367 + }, + { + "epoch": 0.8225842116901305, + "grad_norm": 20.716995239257812, + "learning_rate": 8e-05, + "loss": 33.6509, + "num_input_tokens_seen": 431276432, + "step": 8370 + }, + { + "epoch": 0.8228790447409154, + "grad_norm": 41.984832763671875, + "learning_rate": 8e-05, + "loss": 33.6686, + "num_input_tokens_seen": 431435588, + "step": 8373 + }, + { + "epoch": 0.8231738777917005, + "grad_norm": 23.140209197998047, + "learning_rate": 8e-05, + "loss": 37.4149, + "num_input_tokens_seen": 431600448, + "step": 8376 + }, + { + "epoch": 0.8234687108424854, + "grad_norm": 30.463956832885742, + "learning_rate": 8e-05, + "loss": 36.6518, + "num_input_tokens_seen": 431745244, + "step": 8379 + }, + { + "epoch": 0.8237635438932704, + "grad_norm": 27.39523696899414, + "learning_rate": 8e-05, + "loss": 37.3414, + "num_input_tokens_seen": 431892992, + "step": 8382 + }, + { + "epoch": 0.8240583769440555, + "grad_norm": 22.793724060058594, + "learning_rate": 8e-05, + "loss": 37.5179, + "num_input_tokens_seen": 432059272, + "step": 8385 + }, + { + "epoch": 0.8243532099948404, + "grad_norm": 19.464582443237305, + "learning_rate": 8e-05, + "loss": 33.5467, + "num_input_tokens_seen": 432208184, + "step": 8388 + }, + { + "epoch": 0.8246480430456254, + "grad_norm": 21.68764305114746, + "learning_rate": 8e-05, + "loss": 34.8924, + "num_input_tokens_seen": 432345912, + "step": 8391 + }, + { + "epoch": 0.8249428760964104, + "grad_norm": 22.38986587524414, + "learning_rate": 8e-05, + "loss": 32.9228, + "num_input_tokens_seen": 432490172, + "step": 8394 + }, + { + "epoch": 0.8252377091471954, + "grad_norm": 25.4932861328125, + "learning_rate": 8e-05, + "loss": 37.1201, + "num_input_tokens_seen": 432661932, + "step": 8397 + }, + { + "epoch": 0.8255325421979804, + "grad_norm": 24.564067840576172, + "learning_rate": 8e-05, + "loss": 37.7674, + "num_input_tokens_seen": 432830700, + "step": 8400 + }, + { + "epoch": 0.8258273752487654, + "grad_norm": 23.00069808959961, + "learning_rate": 8e-05, + "loss": 36.0192, + "num_input_tokens_seen": 432990432, + "step": 8403 + }, + { + "epoch": 0.8261222082995504, + "grad_norm": 23.691499710083008, + "learning_rate": 8e-05, + "loss": 36.5851, + "num_input_tokens_seen": 433140292, + "step": 8406 + }, + { + "epoch": 0.8264170413503353, + "grad_norm": 23.410005569458008, + "learning_rate": 8e-05, + "loss": 34.4295, + "num_input_tokens_seen": 433277012, + "step": 8409 + }, + { + "epoch": 0.8267118744011204, + "grad_norm": 23.00798988342285, + "learning_rate": 8e-05, + "loss": 35.6506, + "num_input_tokens_seen": 433426164, + "step": 8412 + }, + { + "epoch": 0.8270067074519054, + "grad_norm": 21.08866310119629, + "learning_rate": 8e-05, + "loss": 32.9966, + "num_input_tokens_seen": 433586000, + "step": 8415 + }, + { + "epoch": 0.8273015405026903, + "grad_norm": 24.12877655029297, + "learning_rate": 8e-05, + "loss": 37.5246, + "num_input_tokens_seen": 433742360, + "step": 8418 + }, + { + "epoch": 0.8275963735534754, + "grad_norm": 23.096635818481445, + "learning_rate": 8e-05, + "loss": 35.271, + "num_input_tokens_seen": 433883892, + "step": 8421 + }, + { + "epoch": 0.8278912066042603, + "grad_norm": 23.10941505432129, + "learning_rate": 8e-05, + "loss": 33.7624, + "num_input_tokens_seen": 434026752, + "step": 8424 + }, + { + "epoch": 0.8281860396550453, + "grad_norm": 25.72688102722168, + "learning_rate": 8e-05, + "loss": 37.7114, + "num_input_tokens_seen": 434179836, + "step": 8427 + }, + { + "epoch": 0.8284808727058304, + "grad_norm": 23.44504737854004, + "learning_rate": 8e-05, + "loss": 37.4356, + "num_input_tokens_seen": 434342732, + "step": 8430 + }, + { + "epoch": 0.8287757057566153, + "grad_norm": 23.88396644592285, + "learning_rate": 8e-05, + "loss": 34.6538, + "num_input_tokens_seen": 434483484, + "step": 8433 + }, + { + "epoch": 0.8290705388074003, + "grad_norm": 24.157764434814453, + "learning_rate": 8e-05, + "loss": 35.9479, + "num_input_tokens_seen": 434631000, + "step": 8436 + }, + { + "epoch": 0.8293653718581853, + "grad_norm": 25.783918380737305, + "learning_rate": 8e-05, + "loss": 34.5447, + "num_input_tokens_seen": 434777308, + "step": 8439 + }, + { + "epoch": 0.8296602049089703, + "grad_norm": 22.558677673339844, + "learning_rate": 8e-05, + "loss": 34.3088, + "num_input_tokens_seen": 434959584, + "step": 8442 + }, + { + "epoch": 0.8299550379597553, + "grad_norm": 22.762332916259766, + "learning_rate": 8e-05, + "loss": 37.8718, + "num_input_tokens_seen": 435115636, + "step": 8445 + }, + { + "epoch": 0.8302498710105403, + "grad_norm": 21.599580764770508, + "learning_rate": 8e-05, + "loss": 33.6288, + "num_input_tokens_seen": 435268344, + "step": 8448 + }, + { + "epoch": 0.8305447040613253, + "grad_norm": 23.022197723388672, + "learning_rate": 8e-05, + "loss": 39.1464, + "num_input_tokens_seen": 435408740, + "step": 8451 + }, + { + "epoch": 0.8308395371121102, + "grad_norm": 21.746292114257812, + "learning_rate": 8e-05, + "loss": 34.105, + "num_input_tokens_seen": 435567116, + "step": 8454 + }, + { + "epoch": 0.8311343701628953, + "grad_norm": 28.59079360961914, + "learning_rate": 8e-05, + "loss": 36.243, + "num_input_tokens_seen": 435724948, + "step": 8457 + }, + { + "epoch": 0.8314292032136803, + "grad_norm": 25.476356506347656, + "learning_rate": 8e-05, + "loss": 37.6103, + "num_input_tokens_seen": 435870588, + "step": 8460 + }, + { + "epoch": 0.8317240362644652, + "grad_norm": 23.399967193603516, + "learning_rate": 8e-05, + "loss": 33.0977, + "num_input_tokens_seen": 436014732, + "step": 8463 + }, + { + "epoch": 0.8320188693152503, + "grad_norm": 29.340322494506836, + "learning_rate": 8e-05, + "loss": 35.883, + "num_input_tokens_seen": 436163556, + "step": 8466 + }, + { + "epoch": 0.8323137023660352, + "grad_norm": 25.349374771118164, + "learning_rate": 8e-05, + "loss": 38.2025, + "num_input_tokens_seen": 436319124, + "step": 8469 + }, + { + "epoch": 0.8326085354168202, + "grad_norm": 25.654415130615234, + "learning_rate": 8e-05, + "loss": 35.1885, + "num_input_tokens_seen": 436452984, + "step": 8472 + }, + { + "epoch": 0.8329033684676053, + "grad_norm": 34.60222625732422, + "learning_rate": 8e-05, + "loss": 34.6596, + "num_input_tokens_seen": 436597484, + "step": 8475 + }, + { + "epoch": 0.8331982015183902, + "grad_norm": 21.619489669799805, + "learning_rate": 8e-05, + "loss": 33.9803, + "num_input_tokens_seen": 436738324, + "step": 8478 + }, + { + "epoch": 0.8334930345691752, + "grad_norm": 27.300823211669922, + "learning_rate": 8e-05, + "loss": 37.2664, + "num_input_tokens_seen": 436902392, + "step": 8481 + }, + { + "epoch": 0.8337878676199602, + "grad_norm": 18.91798210144043, + "learning_rate": 8e-05, + "loss": 32.6294, + "num_input_tokens_seen": 437055092, + "step": 8484 + }, + { + "epoch": 0.8340827006707452, + "grad_norm": 22.521039962768555, + "learning_rate": 8e-05, + "loss": 38.8009, + "num_input_tokens_seen": 437212152, + "step": 8487 + }, + { + "epoch": 0.8343775337215302, + "grad_norm": 39.985023498535156, + "learning_rate": 8e-05, + "loss": 33.9871, + "num_input_tokens_seen": 437373860, + "step": 8490 + }, + { + "epoch": 0.8346723667723152, + "grad_norm": 23.792661666870117, + "learning_rate": 8e-05, + "loss": 41.1392, + "num_input_tokens_seen": 437543544, + "step": 8493 + }, + { + "epoch": 0.8349671998231002, + "grad_norm": 22.138065338134766, + "learning_rate": 8e-05, + "loss": 35.8017, + "num_input_tokens_seen": 437715852, + "step": 8496 + }, + { + "epoch": 0.8352620328738851, + "grad_norm": 21.686561584472656, + "learning_rate": 8e-05, + "loss": 34.2745, + "num_input_tokens_seen": 437861004, + "step": 8499 + }, + { + "epoch": 0.8355568659246702, + "grad_norm": 24.75675392150879, + "learning_rate": 8e-05, + "loss": 33.8077, + "num_input_tokens_seen": 438041664, + "step": 8502 + }, + { + "epoch": 0.8358516989754552, + "grad_norm": 27.919275283813477, + "learning_rate": 8e-05, + "loss": 35.537, + "num_input_tokens_seen": 438220600, + "step": 8505 + }, + { + "epoch": 0.8361465320262401, + "grad_norm": 25.631576538085938, + "learning_rate": 8e-05, + "loss": 35.2494, + "num_input_tokens_seen": 438362040, + "step": 8508 + }, + { + "epoch": 0.8364413650770252, + "grad_norm": 23.383817672729492, + "learning_rate": 8e-05, + "loss": 38.0361, + "num_input_tokens_seen": 438518572, + "step": 8511 + }, + { + "epoch": 0.8367361981278101, + "grad_norm": 28.416120529174805, + "learning_rate": 8e-05, + "loss": 37.6057, + "num_input_tokens_seen": 438686732, + "step": 8514 + }, + { + "epoch": 0.8370310311785951, + "grad_norm": 23.688934326171875, + "learning_rate": 8e-05, + "loss": 34.7264, + "num_input_tokens_seen": 438825692, + "step": 8517 + }, + { + "epoch": 0.8373258642293802, + "grad_norm": 23.2996883392334, + "learning_rate": 8e-05, + "loss": 32.613, + "num_input_tokens_seen": 438966772, + "step": 8520 + }, + { + "epoch": 0.8376206972801651, + "grad_norm": 23.35087776184082, + "learning_rate": 8e-05, + "loss": 35.096, + "num_input_tokens_seen": 439151532, + "step": 8523 + }, + { + "epoch": 0.8379155303309501, + "grad_norm": 24.346364974975586, + "learning_rate": 8e-05, + "loss": 37.9784, + "num_input_tokens_seen": 439316152, + "step": 8526 + }, + { + "epoch": 0.838210363381735, + "grad_norm": 24.31075668334961, + "learning_rate": 8e-05, + "loss": 39.3151, + "num_input_tokens_seen": 439476964, + "step": 8529 + }, + { + "epoch": 0.8385051964325201, + "grad_norm": 24.63071060180664, + "learning_rate": 8e-05, + "loss": 36.7107, + "num_input_tokens_seen": 439631960, + "step": 8532 + }, + { + "epoch": 0.8388000294833051, + "grad_norm": 24.59788703918457, + "learning_rate": 8e-05, + "loss": 35.1999, + "num_input_tokens_seen": 439790576, + "step": 8535 + }, + { + "epoch": 0.83909486253409, + "grad_norm": 23.76664924621582, + "learning_rate": 8e-05, + "loss": 36.6624, + "num_input_tokens_seen": 439941332, + "step": 8538 + }, + { + "epoch": 0.8393896955848751, + "grad_norm": 33.22909927368164, + "learning_rate": 8e-05, + "loss": 33.9235, + "num_input_tokens_seen": 440082820, + "step": 8541 + }, + { + "epoch": 0.83968452863566, + "grad_norm": 26.295854568481445, + "learning_rate": 8e-05, + "loss": 35.2236, + "num_input_tokens_seen": 440242188, + "step": 8544 + }, + { + "epoch": 0.839979361686445, + "grad_norm": 21.903047561645508, + "learning_rate": 8e-05, + "loss": 34.7781, + "num_input_tokens_seen": 440408484, + "step": 8547 + }, + { + "epoch": 0.8402741947372301, + "grad_norm": 28.9826717376709, + "learning_rate": 8e-05, + "loss": 36.0424, + "num_input_tokens_seen": 440564132, + "step": 8550 + }, + { + "epoch": 0.840569027788015, + "grad_norm": 26.4545955657959, + "learning_rate": 8e-05, + "loss": 33.648, + "num_input_tokens_seen": 440725396, + "step": 8553 + }, + { + "epoch": 0.8408638608388, + "grad_norm": 21.50971221923828, + "learning_rate": 8e-05, + "loss": 33.2473, + "num_input_tokens_seen": 440871052, + "step": 8556 + }, + { + "epoch": 0.841158693889585, + "grad_norm": 21.93988800048828, + "learning_rate": 8e-05, + "loss": 36.9516, + "num_input_tokens_seen": 441011984, + "step": 8559 + }, + { + "epoch": 0.84145352694037, + "grad_norm": 25.327268600463867, + "learning_rate": 8e-05, + "loss": 34.4141, + "num_input_tokens_seen": 441164920, + "step": 8562 + }, + { + "epoch": 0.841748359991155, + "grad_norm": 25.629833221435547, + "learning_rate": 8e-05, + "loss": 35.3238, + "num_input_tokens_seen": 441309320, + "step": 8565 + }, + { + "epoch": 0.84204319304194, + "grad_norm": 31.811267852783203, + "learning_rate": 8e-05, + "loss": 35.6597, + "num_input_tokens_seen": 441475116, + "step": 8568 + }, + { + "epoch": 0.842338026092725, + "grad_norm": 24.13558006286621, + "learning_rate": 8e-05, + "loss": 32.54, + "num_input_tokens_seen": 441604948, + "step": 8571 + }, + { + "epoch": 0.8426328591435099, + "grad_norm": 22.01540756225586, + "learning_rate": 8e-05, + "loss": 35.1774, + "num_input_tokens_seen": 441776644, + "step": 8574 + }, + { + "epoch": 0.842927692194295, + "grad_norm": 20.971839904785156, + "learning_rate": 8e-05, + "loss": 37.2688, + "num_input_tokens_seen": 441924832, + "step": 8577 + }, + { + "epoch": 0.84322252524508, + "grad_norm": 21.542495727539062, + "learning_rate": 8e-05, + "loss": 37.1687, + "num_input_tokens_seen": 442075104, + "step": 8580 + }, + { + "epoch": 0.8435173582958649, + "grad_norm": 26.712690353393555, + "learning_rate": 8e-05, + "loss": 33.4731, + "num_input_tokens_seen": 442214348, + "step": 8583 + }, + { + "epoch": 0.84381219134665, + "grad_norm": 24.981464385986328, + "learning_rate": 8e-05, + "loss": 35.7284, + "num_input_tokens_seen": 442369648, + "step": 8586 + }, + { + "epoch": 0.8441070243974349, + "grad_norm": 24.999849319458008, + "learning_rate": 8e-05, + "loss": 35.2039, + "num_input_tokens_seen": 442540736, + "step": 8589 + }, + { + "epoch": 0.8444018574482199, + "grad_norm": 24.549636840820312, + "learning_rate": 8e-05, + "loss": 35.5619, + "num_input_tokens_seen": 442713112, + "step": 8592 + }, + { + "epoch": 0.844696690499005, + "grad_norm": 25.977907180786133, + "learning_rate": 8e-05, + "loss": 37.0179, + "num_input_tokens_seen": 442869764, + "step": 8595 + }, + { + "epoch": 0.8449915235497899, + "grad_norm": 30.217334747314453, + "learning_rate": 8e-05, + "loss": 34.2794, + "num_input_tokens_seen": 443017248, + "step": 8598 + }, + { + "epoch": 0.8452863566005749, + "grad_norm": 21.913938522338867, + "learning_rate": 8e-05, + "loss": 37.9163, + "num_input_tokens_seen": 443159376, + "step": 8601 + }, + { + "epoch": 0.8455811896513599, + "grad_norm": 23.640243530273438, + "learning_rate": 8e-05, + "loss": 34.5981, + "num_input_tokens_seen": 443288428, + "step": 8604 + }, + { + "epoch": 0.8458760227021449, + "grad_norm": 20.208879470825195, + "learning_rate": 8e-05, + "loss": 33.8835, + "num_input_tokens_seen": 443449712, + "step": 8607 + }, + { + "epoch": 0.8461708557529299, + "grad_norm": 21.805574417114258, + "learning_rate": 8e-05, + "loss": 36.3991, + "num_input_tokens_seen": 443612872, + "step": 8610 + }, + { + "epoch": 0.8464656888037149, + "grad_norm": 22.304637908935547, + "learning_rate": 8e-05, + "loss": 38.1468, + "num_input_tokens_seen": 443766068, + "step": 8613 + }, + { + "epoch": 0.8467605218544999, + "grad_norm": 24.58745765686035, + "learning_rate": 8e-05, + "loss": 37.1603, + "num_input_tokens_seen": 443937012, + "step": 8616 + }, + { + "epoch": 0.8470553549052848, + "grad_norm": 24.43326187133789, + "learning_rate": 8e-05, + "loss": 34.2103, + "num_input_tokens_seen": 444088396, + "step": 8619 + }, + { + "epoch": 0.8473501879560699, + "grad_norm": 24.7650089263916, + "learning_rate": 8e-05, + "loss": 36.8356, + "num_input_tokens_seen": 444250968, + "step": 8622 + }, + { + "epoch": 0.8476450210068549, + "grad_norm": 20.300933837890625, + "learning_rate": 8e-05, + "loss": 35.2136, + "num_input_tokens_seen": 444411484, + "step": 8625 + }, + { + "epoch": 0.8479398540576398, + "grad_norm": 21.370935440063477, + "learning_rate": 8e-05, + "loss": 36.4811, + "num_input_tokens_seen": 444578168, + "step": 8628 + }, + { + "epoch": 0.8482346871084249, + "grad_norm": 24.828235626220703, + "learning_rate": 8e-05, + "loss": 36.1291, + "num_input_tokens_seen": 444727816, + "step": 8631 + }, + { + "epoch": 0.8485295201592098, + "grad_norm": 25.074390411376953, + "learning_rate": 8e-05, + "loss": 36.5011, + "num_input_tokens_seen": 444885136, + "step": 8634 + }, + { + "epoch": 0.8488243532099948, + "grad_norm": 24.60084342956543, + "learning_rate": 8e-05, + "loss": 40.491, + "num_input_tokens_seen": 445047136, + "step": 8637 + }, + { + "epoch": 0.8491191862607799, + "grad_norm": 19.854074478149414, + "learning_rate": 8e-05, + "loss": 36.0442, + "num_input_tokens_seen": 445217352, + "step": 8640 + }, + { + "epoch": 0.8494140193115648, + "grad_norm": 24.5972843170166, + "learning_rate": 8e-05, + "loss": 35.3802, + "num_input_tokens_seen": 445358576, + "step": 8643 + }, + { + "epoch": 0.8497088523623498, + "grad_norm": 24.3794002532959, + "learning_rate": 8e-05, + "loss": 34.9128, + "num_input_tokens_seen": 445496724, + "step": 8646 + }, + { + "epoch": 0.8500036854131348, + "grad_norm": 44.509586334228516, + "learning_rate": 8e-05, + "loss": 37.9704, + "num_input_tokens_seen": 445634380, + "step": 8649 + }, + { + "epoch": 0.8502985184639198, + "grad_norm": 22.681522369384766, + "learning_rate": 8e-05, + "loss": 38.7094, + "num_input_tokens_seen": 445791312, + "step": 8652 + }, + { + "epoch": 0.8505933515147048, + "grad_norm": 31.228179931640625, + "learning_rate": 8e-05, + "loss": 35.9362, + "num_input_tokens_seen": 445939464, + "step": 8655 + }, + { + "epoch": 0.8508881845654898, + "grad_norm": 26.90880584716797, + "learning_rate": 8e-05, + "loss": 36.8482, + "num_input_tokens_seen": 446091944, + "step": 8658 + }, + { + "epoch": 0.8511830176162748, + "grad_norm": 23.329309463500977, + "learning_rate": 8e-05, + "loss": 33.0599, + "num_input_tokens_seen": 446275900, + "step": 8661 + }, + { + "epoch": 0.8514778506670598, + "grad_norm": 25.813411712646484, + "learning_rate": 8e-05, + "loss": 37.8099, + "num_input_tokens_seen": 446417860, + "step": 8664 + }, + { + "epoch": 0.8517726837178448, + "grad_norm": 26.515968322753906, + "learning_rate": 8e-05, + "loss": 35.298, + "num_input_tokens_seen": 446582760, + "step": 8667 + }, + { + "epoch": 0.8520675167686298, + "grad_norm": 32.57109832763672, + "learning_rate": 8e-05, + "loss": 34.4064, + "num_input_tokens_seen": 446744728, + "step": 8670 + }, + { + "epoch": 0.8523623498194147, + "grad_norm": 21.738887786865234, + "learning_rate": 8e-05, + "loss": 29.7742, + "num_input_tokens_seen": 446889608, + "step": 8673 + }, + { + "epoch": 0.8526571828701998, + "grad_norm": 24.397785186767578, + "learning_rate": 8e-05, + "loss": 34.2036, + "num_input_tokens_seen": 447058880, + "step": 8676 + }, + { + "epoch": 0.8529520159209848, + "grad_norm": 25.896568298339844, + "learning_rate": 8e-05, + "loss": 36.5112, + "num_input_tokens_seen": 447198284, + "step": 8679 + }, + { + "epoch": 0.8532468489717697, + "grad_norm": 25.91545867919922, + "learning_rate": 8e-05, + "loss": 35.4539, + "num_input_tokens_seen": 447326096, + "step": 8682 + }, + { + "epoch": 0.8535416820225548, + "grad_norm": 235.16880798339844, + "learning_rate": 8e-05, + "loss": 30.8254, + "num_input_tokens_seen": 447471968, + "step": 8685 + }, + { + "epoch": 0.8538365150733397, + "grad_norm": 25.915328979492188, + "learning_rate": 8e-05, + "loss": 34.4214, + "num_input_tokens_seen": 447621200, + "step": 8688 + }, + { + "epoch": 0.8541313481241247, + "grad_norm": 22.61929702758789, + "learning_rate": 8e-05, + "loss": 33.0711, + "num_input_tokens_seen": 447759144, + "step": 8691 + }, + { + "epoch": 0.8544261811749098, + "grad_norm": 64.58021545410156, + "learning_rate": 8e-05, + "loss": 35.5584, + "num_input_tokens_seen": 447912668, + "step": 8694 + }, + { + "epoch": 0.8547210142256947, + "grad_norm": 23.018518447875977, + "learning_rate": 8e-05, + "loss": 36.516, + "num_input_tokens_seen": 448061476, + "step": 8697 + }, + { + "epoch": 0.8550158472764797, + "grad_norm": 26.483489990234375, + "learning_rate": 8e-05, + "loss": 37.8034, + "num_input_tokens_seen": 448227060, + "step": 8700 + }, + { + "epoch": 0.8553106803272646, + "grad_norm": 24.367496490478516, + "learning_rate": 8e-05, + "loss": 33.9807, + "num_input_tokens_seen": 448373212, + "step": 8703 + }, + { + "epoch": 0.8556055133780497, + "grad_norm": 23.358795166015625, + "learning_rate": 8e-05, + "loss": 33.6428, + "num_input_tokens_seen": 448524656, + "step": 8706 + }, + { + "epoch": 0.8559003464288347, + "grad_norm": 23.506254196166992, + "learning_rate": 8e-05, + "loss": 38.0909, + "num_input_tokens_seen": 448690976, + "step": 8709 + }, + { + "epoch": 0.8561951794796197, + "grad_norm": 22.54275131225586, + "learning_rate": 8e-05, + "loss": 35.3043, + "num_input_tokens_seen": 448850068, + "step": 8712 + }, + { + "epoch": 0.8564900125304047, + "grad_norm": 28.30064582824707, + "learning_rate": 8e-05, + "loss": 36.4259, + "num_input_tokens_seen": 448999892, + "step": 8715 + }, + { + "epoch": 0.8567848455811896, + "grad_norm": 23.732145309448242, + "learning_rate": 8e-05, + "loss": 36.0793, + "num_input_tokens_seen": 449151824, + "step": 8718 + }, + { + "epoch": 0.8570796786319747, + "grad_norm": 26.53993797302246, + "learning_rate": 8e-05, + "loss": 35.8181, + "num_input_tokens_seen": 449326968, + "step": 8721 + }, + { + "epoch": 0.8573745116827597, + "grad_norm": 21.584989547729492, + "learning_rate": 8e-05, + "loss": 33.9483, + "num_input_tokens_seen": 449489480, + "step": 8724 + }, + { + "epoch": 0.8576693447335446, + "grad_norm": 20.787649154663086, + "learning_rate": 8e-05, + "loss": 35.5196, + "num_input_tokens_seen": 449643276, + "step": 8727 + }, + { + "epoch": 0.8579641777843297, + "grad_norm": 36.44699478149414, + "learning_rate": 8e-05, + "loss": 35.5228, + "num_input_tokens_seen": 449813676, + "step": 8730 + }, + { + "epoch": 0.8582590108351146, + "grad_norm": 24.554771423339844, + "learning_rate": 8e-05, + "loss": 33.7362, + "num_input_tokens_seen": 449952952, + "step": 8733 + }, + { + "epoch": 0.8585538438858996, + "grad_norm": 21.766437530517578, + "learning_rate": 8e-05, + "loss": 36.0747, + "num_input_tokens_seen": 450100052, + "step": 8736 + }, + { + "epoch": 0.8588486769366847, + "grad_norm": 25.506664276123047, + "learning_rate": 8e-05, + "loss": 36.1313, + "num_input_tokens_seen": 450234676, + "step": 8739 + }, + { + "epoch": 0.8591435099874696, + "grad_norm": 20.974655151367188, + "learning_rate": 8e-05, + "loss": 36.4953, + "num_input_tokens_seen": 450399724, + "step": 8742 + }, + { + "epoch": 0.8594383430382546, + "grad_norm": 21.562509536743164, + "learning_rate": 8e-05, + "loss": 34.0653, + "num_input_tokens_seen": 450567292, + "step": 8745 + }, + { + "epoch": 0.8597331760890395, + "grad_norm": 27.16927146911621, + "learning_rate": 8e-05, + "loss": 32.6643, + "num_input_tokens_seen": 450756072, + "step": 8748 + }, + { + "epoch": 0.8600280091398246, + "grad_norm": 27.619579315185547, + "learning_rate": 8e-05, + "loss": 32.074, + "num_input_tokens_seen": 450934544, + "step": 8751 + }, + { + "epoch": 0.8603228421906096, + "grad_norm": 20.881961822509766, + "learning_rate": 8e-05, + "loss": 34.7699, + "num_input_tokens_seen": 451095764, + "step": 8754 + }, + { + "epoch": 0.8606176752413945, + "grad_norm": 20.835350036621094, + "learning_rate": 8e-05, + "loss": 33.1017, + "num_input_tokens_seen": 451260184, + "step": 8757 + }, + { + "epoch": 0.8609125082921796, + "grad_norm": 38.06818771362305, + "learning_rate": 8e-05, + "loss": 35.8208, + "num_input_tokens_seen": 451409368, + "step": 8760 + }, + { + "epoch": 0.8612073413429645, + "grad_norm": 28.952232360839844, + "learning_rate": 8e-05, + "loss": 33.0805, + "num_input_tokens_seen": 451566212, + "step": 8763 + }, + { + "epoch": 0.8615021743937495, + "grad_norm": 22.913902282714844, + "learning_rate": 8e-05, + "loss": 35.8183, + "num_input_tokens_seen": 451722756, + "step": 8766 + }, + { + "epoch": 0.8617970074445346, + "grad_norm": 21.72722625732422, + "learning_rate": 8e-05, + "loss": 30.1934, + "num_input_tokens_seen": 451869548, + "step": 8769 + }, + { + "epoch": 0.8620918404953195, + "grad_norm": 20.529747009277344, + "learning_rate": 8e-05, + "loss": 37.0564, + "num_input_tokens_seen": 452039380, + "step": 8772 + }, + { + "epoch": 0.8623866735461045, + "grad_norm": 25.314756393432617, + "learning_rate": 8e-05, + "loss": 35.3177, + "num_input_tokens_seen": 452186872, + "step": 8775 + }, + { + "epoch": 0.8626815065968895, + "grad_norm": 22.90513801574707, + "learning_rate": 8e-05, + "loss": 37.0339, + "num_input_tokens_seen": 452336444, + "step": 8778 + }, + { + "epoch": 0.8629763396476745, + "grad_norm": 23.360958099365234, + "learning_rate": 8e-05, + "loss": 34.8022, + "num_input_tokens_seen": 452475436, + "step": 8781 + }, + { + "epoch": 0.8632711726984595, + "grad_norm": 21.173948287963867, + "learning_rate": 8e-05, + "loss": 32.445, + "num_input_tokens_seen": 452653208, + "step": 8784 + }, + { + "epoch": 0.8635660057492445, + "grad_norm": 21.700834274291992, + "learning_rate": 8e-05, + "loss": 33.6184, + "num_input_tokens_seen": 452806456, + "step": 8787 + }, + { + "epoch": 0.8638608388000295, + "grad_norm": 22.173959732055664, + "learning_rate": 8e-05, + "loss": 34.0303, + "num_input_tokens_seen": 452968584, + "step": 8790 + }, + { + "epoch": 0.8641556718508144, + "grad_norm": 24.537803649902344, + "learning_rate": 8e-05, + "loss": 39.1815, + "num_input_tokens_seen": 453119356, + "step": 8793 + }, + { + "epoch": 0.8644505049015995, + "grad_norm": 20.4522762298584, + "learning_rate": 8e-05, + "loss": 35.7433, + "num_input_tokens_seen": 453280312, + "step": 8796 + }, + { + "epoch": 0.8647453379523845, + "grad_norm": 22.030925750732422, + "learning_rate": 8e-05, + "loss": 34.1829, + "num_input_tokens_seen": 453433044, + "step": 8799 + }, + { + "epoch": 0.8650401710031694, + "grad_norm": 22.265579223632812, + "learning_rate": 8e-05, + "loss": 32.7075, + "num_input_tokens_seen": 453573752, + "step": 8802 + }, + { + "epoch": 0.8653350040539545, + "grad_norm": 33.58341598510742, + "learning_rate": 8e-05, + "loss": 35.9601, + "num_input_tokens_seen": 453732492, + "step": 8805 + }, + { + "epoch": 0.8656298371047394, + "grad_norm": 20.91718292236328, + "learning_rate": 8e-05, + "loss": 36.4737, + "num_input_tokens_seen": 453883240, + "step": 8808 + }, + { + "epoch": 0.8659246701555244, + "grad_norm": 17.35066795349121, + "learning_rate": 8e-05, + "loss": 31.8058, + "num_input_tokens_seen": 454042468, + "step": 8811 + }, + { + "epoch": 0.8662195032063095, + "grad_norm": 23.744104385375977, + "learning_rate": 8e-05, + "loss": 34.8083, + "num_input_tokens_seen": 454194980, + "step": 8814 + }, + { + "epoch": 0.8665143362570944, + "grad_norm": 21.31110954284668, + "learning_rate": 8e-05, + "loss": 36.2735, + "num_input_tokens_seen": 454358280, + "step": 8817 + }, + { + "epoch": 0.8668091693078794, + "grad_norm": 31.886720657348633, + "learning_rate": 8e-05, + "loss": 34.488, + "num_input_tokens_seen": 454501644, + "step": 8820 + }, + { + "epoch": 0.8671040023586644, + "grad_norm": 26.252796173095703, + "learning_rate": 8e-05, + "loss": 34.3354, + "num_input_tokens_seen": 454654120, + "step": 8823 + }, + { + "epoch": 0.8673988354094494, + "grad_norm": 26.040729522705078, + "learning_rate": 8e-05, + "loss": 35.9982, + "num_input_tokens_seen": 454844272, + "step": 8826 + }, + { + "epoch": 0.8676936684602344, + "grad_norm": 23.30434226989746, + "learning_rate": 8e-05, + "loss": 34.3949, + "num_input_tokens_seen": 454992640, + "step": 8829 + }, + { + "epoch": 0.8679885015110194, + "grad_norm": 22.62255096435547, + "learning_rate": 8e-05, + "loss": 35.0844, + "num_input_tokens_seen": 455155308, + "step": 8832 + }, + { + "epoch": 0.8682833345618044, + "grad_norm": 20.258991241455078, + "learning_rate": 8e-05, + "loss": 34.8521, + "num_input_tokens_seen": 455306464, + "step": 8835 + }, + { + "epoch": 0.8685781676125893, + "grad_norm": 37.642086029052734, + "learning_rate": 8e-05, + "loss": 31.3964, + "num_input_tokens_seen": 455458584, + "step": 8838 + }, + { + "epoch": 0.8688730006633744, + "grad_norm": 20.638486862182617, + "learning_rate": 8e-05, + "loss": 35.4098, + "num_input_tokens_seen": 455603556, + "step": 8841 + }, + { + "epoch": 0.8691678337141594, + "grad_norm": 19.583791732788086, + "learning_rate": 8e-05, + "loss": 33.9662, + "num_input_tokens_seen": 455750708, + "step": 8844 + }, + { + "epoch": 0.8694626667649443, + "grad_norm": 22.0402774810791, + "learning_rate": 8e-05, + "loss": 39.1685, + "num_input_tokens_seen": 455912140, + "step": 8847 + }, + { + "epoch": 0.8697574998157294, + "grad_norm": 23.528413772583008, + "learning_rate": 8e-05, + "loss": 37.4517, + "num_input_tokens_seen": 456084876, + "step": 8850 + }, + { + "epoch": 0.8700523328665143, + "grad_norm": 22.013490676879883, + "learning_rate": 8e-05, + "loss": 34.666, + "num_input_tokens_seen": 456246060, + "step": 8853 + }, + { + "epoch": 0.8703471659172993, + "grad_norm": 22.675901412963867, + "learning_rate": 8e-05, + "loss": 34.4143, + "num_input_tokens_seen": 456389620, + "step": 8856 + }, + { + "epoch": 0.8706419989680844, + "grad_norm": 23.27752113342285, + "learning_rate": 8e-05, + "loss": 35.8503, + "num_input_tokens_seen": 456567184, + "step": 8859 + }, + { + "epoch": 0.8709368320188693, + "grad_norm": 20.808027267456055, + "learning_rate": 8e-05, + "loss": 36.6624, + "num_input_tokens_seen": 456726572, + "step": 8862 + }, + { + "epoch": 0.8712316650696543, + "grad_norm": 21.771181106567383, + "learning_rate": 8e-05, + "loss": 32.4634, + "num_input_tokens_seen": 456871844, + "step": 8865 + }, + { + "epoch": 0.8715264981204393, + "grad_norm": 28.904388427734375, + "learning_rate": 8e-05, + "loss": 33.9851, + "num_input_tokens_seen": 457027984, + "step": 8868 + }, + { + "epoch": 0.8718213311712243, + "grad_norm": 32.225040435791016, + "learning_rate": 8e-05, + "loss": 33.2702, + "num_input_tokens_seen": 457181788, + "step": 8871 + }, + { + "epoch": 0.8721161642220093, + "grad_norm": 22.55632209777832, + "learning_rate": 8e-05, + "loss": 35.1886, + "num_input_tokens_seen": 457349848, + "step": 8874 + }, + { + "epoch": 0.8724109972727943, + "grad_norm": 22.402708053588867, + "learning_rate": 8e-05, + "loss": 28.6777, + "num_input_tokens_seen": 457518292, + "step": 8877 + }, + { + "epoch": 0.8727058303235793, + "grad_norm": 21.531339645385742, + "learning_rate": 8e-05, + "loss": 32.3938, + "num_input_tokens_seen": 457672928, + "step": 8880 + }, + { + "epoch": 0.8730006633743642, + "grad_norm": 23.64670181274414, + "learning_rate": 8e-05, + "loss": 35.7909, + "num_input_tokens_seen": 457831076, + "step": 8883 + }, + { + "epoch": 0.8732954964251493, + "grad_norm": 27.580236434936523, + "learning_rate": 8e-05, + "loss": 35.1965, + "num_input_tokens_seen": 457994408, + "step": 8886 + }, + { + "epoch": 0.8735903294759343, + "grad_norm": 20.507062911987305, + "learning_rate": 8e-05, + "loss": 34.1822, + "num_input_tokens_seen": 458163696, + "step": 8889 + }, + { + "epoch": 0.8738851625267192, + "grad_norm": 22.15749168395996, + "learning_rate": 8e-05, + "loss": 33.8258, + "num_input_tokens_seen": 458312932, + "step": 8892 + }, + { + "epoch": 0.8741799955775043, + "grad_norm": 40.3120002746582, + "learning_rate": 8e-05, + "loss": 34.9081, + "num_input_tokens_seen": 458438996, + "step": 8895 + }, + { + "epoch": 0.8744748286282892, + "grad_norm": 25.363845825195312, + "learning_rate": 8e-05, + "loss": 35.5967, + "num_input_tokens_seen": 458578764, + "step": 8898 + }, + { + "epoch": 0.8747696616790742, + "grad_norm": 22.670642852783203, + "learning_rate": 8e-05, + "loss": 33.0508, + "num_input_tokens_seen": 458726804, + "step": 8901 + }, + { + "epoch": 0.8750644947298593, + "grad_norm": 24.797653198242188, + "learning_rate": 8e-05, + "loss": 32.97, + "num_input_tokens_seen": 458882348, + "step": 8904 + }, + { + "epoch": 0.8753593277806442, + "grad_norm": 26.080148696899414, + "learning_rate": 8e-05, + "loss": 34.2268, + "num_input_tokens_seen": 459035532, + "step": 8907 + }, + { + "epoch": 0.8756541608314292, + "grad_norm": 27.912677764892578, + "learning_rate": 8e-05, + "loss": 30.2107, + "num_input_tokens_seen": 459189060, + "step": 8910 + }, + { + "epoch": 0.8759489938822141, + "grad_norm": 23.80186653137207, + "learning_rate": 8e-05, + "loss": 36.9899, + "num_input_tokens_seen": 459339772, + "step": 8913 + }, + { + "epoch": 0.8762438269329992, + "grad_norm": 23.507909774780273, + "learning_rate": 8e-05, + "loss": 38.24, + "num_input_tokens_seen": 459525644, + "step": 8916 + }, + { + "epoch": 0.8765386599837842, + "grad_norm": 20.912324905395508, + "learning_rate": 8e-05, + "loss": 35.2431, + "num_input_tokens_seen": 459672648, + "step": 8919 + }, + { + "epoch": 0.8768334930345691, + "grad_norm": 23.98154640197754, + "learning_rate": 8e-05, + "loss": 34.0235, + "num_input_tokens_seen": 459825628, + "step": 8922 + }, + { + "epoch": 0.8771283260853542, + "grad_norm": 20.608129501342773, + "learning_rate": 8e-05, + "loss": 35.8222, + "num_input_tokens_seen": 459977856, + "step": 8925 + }, + { + "epoch": 0.8774231591361391, + "grad_norm": 27.58140754699707, + "learning_rate": 8e-05, + "loss": 34.5154, + "num_input_tokens_seen": 460137076, + "step": 8928 + }, + { + "epoch": 0.8777179921869241, + "grad_norm": 24.529319763183594, + "learning_rate": 8e-05, + "loss": 38.3434, + "num_input_tokens_seen": 460300812, + "step": 8931 + }, + { + "epoch": 0.8780128252377092, + "grad_norm": 26.84439468383789, + "learning_rate": 8e-05, + "loss": 31.479, + "num_input_tokens_seen": 460439784, + "step": 8934 + }, + { + "epoch": 0.8783076582884941, + "grad_norm": 22.796037673950195, + "learning_rate": 8e-05, + "loss": 36.2946, + "num_input_tokens_seen": 460583252, + "step": 8937 + }, + { + "epoch": 0.8786024913392791, + "grad_norm": 25.805952072143555, + "learning_rate": 8e-05, + "loss": 35.5886, + "num_input_tokens_seen": 460727672, + "step": 8940 + }, + { + "epoch": 0.8788973243900641, + "grad_norm": 20.5937557220459, + "learning_rate": 8e-05, + "loss": 36.3242, + "num_input_tokens_seen": 460884440, + "step": 8943 + }, + { + "epoch": 0.8791921574408491, + "grad_norm": 20.25699234008789, + "learning_rate": 8e-05, + "loss": 32.9865, + "num_input_tokens_seen": 461038208, + "step": 8946 + }, + { + "epoch": 0.8794869904916341, + "grad_norm": 25.006460189819336, + "learning_rate": 8e-05, + "loss": 37.2056, + "num_input_tokens_seen": 461195296, + "step": 8949 + }, + { + "epoch": 0.8797818235424191, + "grad_norm": 23.678054809570312, + "learning_rate": 8e-05, + "loss": 33.6716, + "num_input_tokens_seen": 461355160, + "step": 8952 + }, + { + "epoch": 0.8800766565932041, + "grad_norm": 24.38894271850586, + "learning_rate": 8e-05, + "loss": 39.0701, + "num_input_tokens_seen": 461502236, + "step": 8955 + }, + { + "epoch": 0.880371489643989, + "grad_norm": 22.873247146606445, + "learning_rate": 8e-05, + "loss": 36.7938, + "num_input_tokens_seen": 461676032, + "step": 8958 + }, + { + "epoch": 0.8806663226947741, + "grad_norm": 24.20355796813965, + "learning_rate": 8e-05, + "loss": 38.5804, + "num_input_tokens_seen": 461823272, + "step": 8961 + }, + { + "epoch": 0.8809611557455591, + "grad_norm": 23.963706970214844, + "learning_rate": 8e-05, + "loss": 34.3477, + "num_input_tokens_seen": 461965960, + "step": 8964 + }, + { + "epoch": 0.881255988796344, + "grad_norm": 21.74570655822754, + "learning_rate": 8e-05, + "loss": 38.7081, + "num_input_tokens_seen": 462104796, + "step": 8967 + }, + { + "epoch": 0.8815508218471291, + "grad_norm": 23.517004013061523, + "learning_rate": 8e-05, + "loss": 38.1686, + "num_input_tokens_seen": 462271072, + "step": 8970 + }, + { + "epoch": 0.8818456548979141, + "grad_norm": 23.73151206970215, + "learning_rate": 8e-05, + "loss": 29.5466, + "num_input_tokens_seen": 462399608, + "step": 8973 + }, + { + "epoch": 0.882140487948699, + "grad_norm": 27.569501876831055, + "learning_rate": 8e-05, + "loss": 38.4012, + "num_input_tokens_seen": 462539888, + "step": 8976 + }, + { + "epoch": 0.8824353209994841, + "grad_norm": 58.874732971191406, + "learning_rate": 8e-05, + "loss": 34.0895, + "num_input_tokens_seen": 462694848, + "step": 8979 + }, + { + "epoch": 0.882730154050269, + "grad_norm": 21.32411003112793, + "learning_rate": 8e-05, + "loss": 37.7922, + "num_input_tokens_seen": 462872488, + "step": 8982 + }, + { + "epoch": 0.883024987101054, + "grad_norm": 24.835783004760742, + "learning_rate": 8e-05, + "loss": 36.0279, + "num_input_tokens_seen": 463015268, + "step": 8985 + }, + { + "epoch": 0.8833198201518391, + "grad_norm": 34.98569869995117, + "learning_rate": 8e-05, + "loss": 38.1697, + "num_input_tokens_seen": 463182416, + "step": 8988 + }, + { + "epoch": 0.883614653202624, + "grad_norm": 20.140106201171875, + "learning_rate": 8e-05, + "loss": 33.6916, + "num_input_tokens_seen": 463330240, + "step": 8991 + }, + { + "epoch": 0.883909486253409, + "grad_norm": 22.520090103149414, + "learning_rate": 8e-05, + "loss": 35.2222, + "num_input_tokens_seen": 463491564, + "step": 8994 + }, + { + "epoch": 0.884204319304194, + "grad_norm": 22.722288131713867, + "learning_rate": 8e-05, + "loss": 38.6709, + "num_input_tokens_seen": 463644744, + "step": 8997 + }, + { + "epoch": 0.884499152354979, + "grad_norm": 19.819669723510742, + "learning_rate": 8e-05, + "loss": 33.1455, + "num_input_tokens_seen": 463798308, + "step": 9000 + }, + { + "epoch": 0.884499152354979, + "eval_gen_len": 36.3, + "eval_loss": 2.2607827186584473, + "eval_rouge1": 48.6856, + "eval_rouge2": 32.331, + "eval_rougeL": 44.6585, + "eval_rougeLsum": 45.0587, + "eval_runtime": 119.5771, + "eval_samples_per_second": 1.673, + "eval_steps_per_second": 0.418, + "num_input_tokens_seen": 463798308, + "step": 9000 + }, + { + "epoch": 0.884793985405764, + "grad_norm": 23.7694149017334, + "learning_rate": 8e-05, + "loss": 37.3976, + "num_input_tokens_seen": 463964648, + "step": 9003 + }, + { + "epoch": 0.885088818456549, + "grad_norm": 22.439407348632812, + "learning_rate": 8e-05, + "loss": 34.1451, + "num_input_tokens_seen": 464146436, + "step": 9006 + }, + { + "epoch": 0.885383651507334, + "grad_norm": 18.74619483947754, + "learning_rate": 8e-05, + "loss": 32.3557, + "num_input_tokens_seen": 464315752, + "step": 9009 + }, + { + "epoch": 0.8856784845581189, + "grad_norm": 24.110286712646484, + "learning_rate": 8e-05, + "loss": 34.7745, + "num_input_tokens_seen": 464469780, + "step": 9012 + }, + { + "epoch": 0.885973317608904, + "grad_norm": 21.31003189086914, + "learning_rate": 8e-05, + "loss": 33.854, + "num_input_tokens_seen": 464621000, + "step": 9015 + }, + { + "epoch": 0.886268150659689, + "grad_norm": 21.028785705566406, + "learning_rate": 8e-05, + "loss": 33.7743, + "num_input_tokens_seen": 464771256, + "step": 9018 + }, + { + "epoch": 0.8865629837104739, + "grad_norm": 22.16194725036621, + "learning_rate": 8e-05, + "loss": 35.7378, + "num_input_tokens_seen": 464938064, + "step": 9021 + }, + { + "epoch": 0.886857816761259, + "grad_norm": 23.61879539489746, + "learning_rate": 8e-05, + "loss": 32.3147, + "num_input_tokens_seen": 465078100, + "step": 9024 + }, + { + "epoch": 0.8871526498120439, + "grad_norm": 23.015331268310547, + "learning_rate": 8e-05, + "loss": 36.0143, + "num_input_tokens_seen": 465224376, + "step": 9027 + }, + { + "epoch": 0.8874474828628289, + "grad_norm": 25.31874656677246, + "learning_rate": 8e-05, + "loss": 35.5241, + "num_input_tokens_seen": 465373676, + "step": 9030 + }, + { + "epoch": 0.887742315913614, + "grad_norm": 26.688716888427734, + "learning_rate": 8e-05, + "loss": 35.9495, + "num_input_tokens_seen": 465511792, + "step": 9033 + }, + { + "epoch": 0.8880371489643989, + "grad_norm": 22.31456184387207, + "learning_rate": 8e-05, + "loss": 38.7842, + "num_input_tokens_seen": 465648152, + "step": 9036 + }, + { + "epoch": 0.8883319820151839, + "grad_norm": 21.19448471069336, + "learning_rate": 8e-05, + "loss": 37.5663, + "num_input_tokens_seen": 465808344, + "step": 9039 + }, + { + "epoch": 0.8886268150659689, + "grad_norm": 24.864177703857422, + "learning_rate": 8e-05, + "loss": 36.8419, + "num_input_tokens_seen": 465973604, + "step": 9042 + }, + { + "epoch": 0.8889216481167539, + "grad_norm": 22.296798706054688, + "learning_rate": 8e-05, + "loss": 36.3395, + "num_input_tokens_seen": 466124000, + "step": 9045 + }, + { + "epoch": 0.8892164811675389, + "grad_norm": 20.440757751464844, + "learning_rate": 8e-05, + "loss": 38.0934, + "num_input_tokens_seen": 466280940, + "step": 9048 + }, + { + "epoch": 0.8895113142183239, + "grad_norm": 21.70374870300293, + "learning_rate": 8e-05, + "loss": 34.7552, + "num_input_tokens_seen": 466427172, + "step": 9051 + }, + { + "epoch": 0.8898061472691089, + "grad_norm": 21.369592666625977, + "learning_rate": 8e-05, + "loss": 33.519, + "num_input_tokens_seen": 466596836, + "step": 9054 + }, + { + "epoch": 0.8901009803198938, + "grad_norm": 21.856243133544922, + "learning_rate": 8e-05, + "loss": 37.3898, + "num_input_tokens_seen": 466776248, + "step": 9057 + }, + { + "epoch": 0.8903958133706789, + "grad_norm": 23.62397003173828, + "learning_rate": 8e-05, + "loss": 37.3079, + "num_input_tokens_seen": 466937708, + "step": 9060 + }, + { + "epoch": 0.8906906464214639, + "grad_norm": 20.472566604614258, + "learning_rate": 8e-05, + "loss": 32.3763, + "num_input_tokens_seen": 467100392, + "step": 9063 + }, + { + "epoch": 0.8909854794722488, + "grad_norm": 21.549814224243164, + "learning_rate": 8e-05, + "loss": 35.363, + "num_input_tokens_seen": 467240956, + "step": 9066 + }, + { + "epoch": 0.8912803125230339, + "grad_norm": 23.798681259155273, + "learning_rate": 8e-05, + "loss": 33.1227, + "num_input_tokens_seen": 467400232, + "step": 9069 + }, + { + "epoch": 0.8915751455738188, + "grad_norm": 22.498197555541992, + "learning_rate": 8e-05, + "loss": 37.3493, + "num_input_tokens_seen": 467548700, + "step": 9072 + }, + { + "epoch": 0.8918699786246038, + "grad_norm": 27.08021354675293, + "learning_rate": 8e-05, + "loss": 37.5852, + "num_input_tokens_seen": 467701476, + "step": 9075 + }, + { + "epoch": 0.8921648116753889, + "grad_norm": 21.05506706237793, + "learning_rate": 8e-05, + "loss": 32.7373, + "num_input_tokens_seen": 467854984, + "step": 9078 + }, + { + "epoch": 0.8924596447261738, + "grad_norm": 22.088733673095703, + "learning_rate": 8e-05, + "loss": 34.3866, + "num_input_tokens_seen": 468016524, + "step": 9081 + }, + { + "epoch": 0.8927544777769588, + "grad_norm": 34.91509246826172, + "learning_rate": 8e-05, + "loss": 32.2981, + "num_input_tokens_seen": 468170612, + "step": 9084 + }, + { + "epoch": 0.8930493108277437, + "grad_norm": 21.367778778076172, + "learning_rate": 8e-05, + "loss": 33.8347, + "num_input_tokens_seen": 468329472, + "step": 9087 + }, + { + "epoch": 0.8933441438785288, + "grad_norm": 20.561328887939453, + "learning_rate": 8e-05, + "loss": 32.3517, + "num_input_tokens_seen": 468483892, + "step": 9090 + }, + { + "epoch": 0.8936389769293138, + "grad_norm": 22.309688568115234, + "learning_rate": 8e-05, + "loss": 35.2497, + "num_input_tokens_seen": 468605508, + "step": 9093 + }, + { + "epoch": 0.8939338099800987, + "grad_norm": 26.220806121826172, + "learning_rate": 8e-05, + "loss": 37.4711, + "num_input_tokens_seen": 468758424, + "step": 9096 + }, + { + "epoch": 0.8942286430308838, + "grad_norm": 25.198400497436523, + "learning_rate": 8e-05, + "loss": 32.77, + "num_input_tokens_seen": 468913748, + "step": 9099 + }, + { + "epoch": 0.8945234760816687, + "grad_norm": 24.01615333557129, + "learning_rate": 8e-05, + "loss": 33.9408, + "num_input_tokens_seen": 469049056, + "step": 9102 + }, + { + "epoch": 0.8948183091324537, + "grad_norm": 25.380462646484375, + "learning_rate": 8e-05, + "loss": 37.3165, + "num_input_tokens_seen": 469189400, + "step": 9105 + }, + { + "epoch": 0.8951131421832388, + "grad_norm": 23.461862564086914, + "learning_rate": 8e-05, + "loss": 28.8939, + "num_input_tokens_seen": 469344408, + "step": 9108 + }, + { + "epoch": 0.8954079752340237, + "grad_norm": 23.690677642822266, + "learning_rate": 8e-05, + "loss": 35.6869, + "num_input_tokens_seen": 469513492, + "step": 9111 + }, + { + "epoch": 0.8957028082848087, + "grad_norm": 19.593894958496094, + "learning_rate": 8e-05, + "loss": 33.1535, + "num_input_tokens_seen": 469666376, + "step": 9114 + }, + { + "epoch": 0.8959976413355937, + "grad_norm": 100.64704895019531, + "learning_rate": 8e-05, + "loss": 32.9694, + "num_input_tokens_seen": 469842892, + "step": 9117 + }, + { + "epoch": 0.8962924743863787, + "grad_norm": 24.357036590576172, + "learning_rate": 8e-05, + "loss": 36.882, + "num_input_tokens_seen": 470014480, + "step": 9120 + }, + { + "epoch": 0.8965873074371637, + "grad_norm": 21.07242774963379, + "learning_rate": 8e-05, + "loss": 32.1337, + "num_input_tokens_seen": 470179932, + "step": 9123 + }, + { + "epoch": 0.8968821404879487, + "grad_norm": 20.460813522338867, + "learning_rate": 8e-05, + "loss": 31.8768, + "num_input_tokens_seen": 470352160, + "step": 9126 + }, + { + "epoch": 0.8971769735387337, + "grad_norm": 24.652921676635742, + "learning_rate": 8e-05, + "loss": 36.6415, + "num_input_tokens_seen": 470513512, + "step": 9129 + }, + { + "epoch": 0.8974718065895186, + "grad_norm": 23.364362716674805, + "learning_rate": 8e-05, + "loss": 36.4622, + "num_input_tokens_seen": 470668512, + "step": 9132 + }, + { + "epoch": 0.8977666396403037, + "grad_norm": 20.231983184814453, + "learning_rate": 8e-05, + "loss": 37.4117, + "num_input_tokens_seen": 470838268, + "step": 9135 + }, + { + "epoch": 0.8980614726910887, + "grad_norm": 18.93602752685547, + "learning_rate": 8e-05, + "loss": 34.4865, + "num_input_tokens_seen": 470981460, + "step": 9138 + }, + { + "epoch": 0.8983563057418736, + "grad_norm": 20.244422912597656, + "learning_rate": 8e-05, + "loss": 36.4161, + "num_input_tokens_seen": 471131972, + "step": 9141 + }, + { + "epoch": 0.8986511387926587, + "grad_norm": 28.71910858154297, + "learning_rate": 8e-05, + "loss": 35.6331, + "num_input_tokens_seen": 471275176, + "step": 9144 + }, + { + "epoch": 0.8989459718434436, + "grad_norm": 20.917680740356445, + "learning_rate": 8e-05, + "loss": 37.4839, + "num_input_tokens_seen": 471451868, + "step": 9147 + }, + { + "epoch": 0.8992408048942286, + "grad_norm": 22.487375259399414, + "learning_rate": 8e-05, + "loss": 36.7346, + "num_input_tokens_seen": 471614456, + "step": 9150 + }, + { + "epoch": 0.8995356379450137, + "grad_norm": 18.380311965942383, + "learning_rate": 8e-05, + "loss": 33.7283, + "num_input_tokens_seen": 471782060, + "step": 9153 + }, + { + "epoch": 0.8998304709957986, + "grad_norm": 22.763168334960938, + "learning_rate": 8e-05, + "loss": 35.6685, + "num_input_tokens_seen": 471939804, + "step": 9156 + }, + { + "epoch": 0.9001253040465836, + "grad_norm": 25.03064727783203, + "learning_rate": 8e-05, + "loss": 30.6905, + "num_input_tokens_seen": 472067844, + "step": 9159 + }, + { + "epoch": 0.9004201370973686, + "grad_norm": 20.84009552001953, + "learning_rate": 8e-05, + "loss": 39.4543, + "num_input_tokens_seen": 472217136, + "step": 9162 + }, + { + "epoch": 0.9007149701481536, + "grad_norm": 21.24103546142578, + "learning_rate": 8e-05, + "loss": 32.795, + "num_input_tokens_seen": 472381476, + "step": 9165 + }, + { + "epoch": 0.9010098031989386, + "grad_norm": 29.891502380371094, + "learning_rate": 8e-05, + "loss": 35.9941, + "num_input_tokens_seen": 472536628, + "step": 9168 + }, + { + "epoch": 0.9013046362497236, + "grad_norm": 20.72770118713379, + "learning_rate": 8e-05, + "loss": 34.2618, + "num_input_tokens_seen": 472696500, + "step": 9171 + }, + { + "epoch": 0.9015994693005086, + "grad_norm": 21.615867614746094, + "learning_rate": 8e-05, + "loss": 35.2642, + "num_input_tokens_seen": 472862236, + "step": 9174 + }, + { + "epoch": 0.9018943023512935, + "grad_norm": 17.681720733642578, + "learning_rate": 8e-05, + "loss": 33.8807, + "num_input_tokens_seen": 473041936, + "step": 9177 + }, + { + "epoch": 0.9021891354020786, + "grad_norm": 18.36534309387207, + "learning_rate": 8e-05, + "loss": 32.7494, + "num_input_tokens_seen": 473215748, + "step": 9180 + }, + { + "epoch": 0.9024839684528636, + "grad_norm": 21.646133422851562, + "learning_rate": 8e-05, + "loss": 33.7257, + "num_input_tokens_seen": 473363992, + "step": 9183 + }, + { + "epoch": 0.9027788015036485, + "grad_norm": 20.912612915039062, + "learning_rate": 8e-05, + "loss": 32.9426, + "num_input_tokens_seen": 473505952, + "step": 9186 + }, + { + "epoch": 0.9030736345544336, + "grad_norm": 22.79306983947754, + "learning_rate": 8e-05, + "loss": 37.0397, + "num_input_tokens_seen": 473665556, + "step": 9189 + }, + { + "epoch": 0.9033684676052185, + "grad_norm": 21.08576202392578, + "learning_rate": 8e-05, + "loss": 34.9447, + "num_input_tokens_seen": 473829616, + "step": 9192 + }, + { + "epoch": 0.9036633006560035, + "grad_norm": 21.607378005981445, + "learning_rate": 8e-05, + "loss": 32.5755, + "num_input_tokens_seen": 473981004, + "step": 9195 + }, + { + "epoch": 0.9039581337067886, + "grad_norm": 19.860673904418945, + "learning_rate": 8e-05, + "loss": 32.9528, + "num_input_tokens_seen": 474139044, + "step": 9198 + }, + { + "epoch": 0.9042529667575735, + "grad_norm": 20.233686447143555, + "learning_rate": 8e-05, + "loss": 34.9714, + "num_input_tokens_seen": 474286740, + "step": 9201 + }, + { + "epoch": 0.9045477998083585, + "grad_norm": 24.46356201171875, + "learning_rate": 8e-05, + "loss": 33.5463, + "num_input_tokens_seen": 474438148, + "step": 9204 + }, + { + "epoch": 0.9048426328591435, + "grad_norm": 21.49631690979004, + "learning_rate": 8e-05, + "loss": 37.0035, + "num_input_tokens_seen": 474570456, + "step": 9207 + }, + { + "epoch": 0.9051374659099285, + "grad_norm": 20.027463912963867, + "learning_rate": 8e-05, + "loss": 34.2798, + "num_input_tokens_seen": 474720192, + "step": 9210 + }, + { + "epoch": 0.9054322989607135, + "grad_norm": 22.349233627319336, + "learning_rate": 8e-05, + "loss": 35.0284, + "num_input_tokens_seen": 474892740, + "step": 9213 + }, + { + "epoch": 0.9057271320114985, + "grad_norm": 20.50714683532715, + "learning_rate": 8e-05, + "loss": 32.0477, + "num_input_tokens_seen": 475034648, + "step": 9216 + }, + { + "epoch": 0.9060219650622835, + "grad_norm": 18.592439651489258, + "learning_rate": 8e-05, + "loss": 36.5711, + "num_input_tokens_seen": 475203076, + "step": 9219 + }, + { + "epoch": 0.9063167981130684, + "grad_norm": 22.731468200683594, + "learning_rate": 8e-05, + "loss": 35.8516, + "num_input_tokens_seen": 475359344, + "step": 9222 + }, + { + "epoch": 0.9066116311638535, + "grad_norm": 21.352195739746094, + "learning_rate": 8e-05, + "loss": 34.0699, + "num_input_tokens_seen": 475514552, + "step": 9225 + }, + { + "epoch": 0.9069064642146385, + "grad_norm": 22.647829055786133, + "learning_rate": 8e-05, + "loss": 35.6129, + "num_input_tokens_seen": 475658052, + "step": 9228 + }, + { + "epoch": 0.9072012972654234, + "grad_norm": 21.473758697509766, + "learning_rate": 8e-05, + "loss": 30.6012, + "num_input_tokens_seen": 475828848, + "step": 9231 + }, + { + "epoch": 0.9074961303162085, + "grad_norm": 22.36081886291504, + "learning_rate": 8e-05, + "loss": 36.0751, + "num_input_tokens_seen": 475984260, + "step": 9234 + }, + { + "epoch": 0.9077909633669934, + "grad_norm": 26.658342361450195, + "learning_rate": 8e-05, + "loss": 34.4761, + "num_input_tokens_seen": 476143672, + "step": 9237 + }, + { + "epoch": 0.9080857964177784, + "grad_norm": 23.835533142089844, + "learning_rate": 8e-05, + "loss": 39.206, + "num_input_tokens_seen": 476301540, + "step": 9240 + }, + { + "epoch": 0.9083806294685635, + "grad_norm": 20.80655288696289, + "learning_rate": 8e-05, + "loss": 33.6508, + "num_input_tokens_seen": 476446140, + "step": 9243 + }, + { + "epoch": 0.9086754625193484, + "grad_norm": 42.49269485473633, + "learning_rate": 8e-05, + "loss": 33.288, + "num_input_tokens_seen": 476600308, + "step": 9246 + }, + { + "epoch": 0.9089702955701334, + "grad_norm": 21.22631072998047, + "learning_rate": 8e-05, + "loss": 36.4164, + "num_input_tokens_seen": 476778056, + "step": 9249 + }, + { + "epoch": 0.9092651286209184, + "grad_norm": 24.015443801879883, + "learning_rate": 8e-05, + "loss": 35.8094, + "num_input_tokens_seen": 476921668, + "step": 9252 + }, + { + "epoch": 0.9095599616717034, + "grad_norm": 21.08645248413086, + "learning_rate": 8e-05, + "loss": 33.3251, + "num_input_tokens_seen": 477090876, + "step": 9255 + }, + { + "epoch": 0.9098547947224884, + "grad_norm": 29.930673599243164, + "learning_rate": 8e-05, + "loss": 34.2575, + "num_input_tokens_seen": 477265768, + "step": 9258 + }, + { + "epoch": 0.9101496277732734, + "grad_norm": 19.717514038085938, + "learning_rate": 8e-05, + "loss": 31.6078, + "num_input_tokens_seen": 477436472, + "step": 9261 + }, + { + "epoch": 0.9104444608240584, + "grad_norm": 24.465761184692383, + "learning_rate": 8e-05, + "loss": 32.7405, + "num_input_tokens_seen": 477603040, + "step": 9264 + }, + { + "epoch": 0.9107392938748434, + "grad_norm": 25.059980392456055, + "learning_rate": 8e-05, + "loss": 36.9096, + "num_input_tokens_seen": 477755684, + "step": 9267 + }, + { + "epoch": 0.9110341269256284, + "grad_norm": 21.387956619262695, + "learning_rate": 8e-05, + "loss": 35.4109, + "num_input_tokens_seen": 477917476, + "step": 9270 + }, + { + "epoch": 0.9113289599764134, + "grad_norm": 35.50089645385742, + "learning_rate": 8e-05, + "loss": 33.5537, + "num_input_tokens_seen": 478062884, + "step": 9273 + }, + { + "epoch": 0.9116237930271983, + "grad_norm": 19.14139747619629, + "learning_rate": 8e-05, + "loss": 32.7446, + "num_input_tokens_seen": 478226776, + "step": 9276 + }, + { + "epoch": 0.9119186260779834, + "grad_norm": 22.263626098632812, + "learning_rate": 8e-05, + "loss": 37.0658, + "num_input_tokens_seen": 478393780, + "step": 9279 + }, + { + "epoch": 0.9122134591287684, + "grad_norm": 23.79678726196289, + "learning_rate": 8e-05, + "loss": 37.1102, + "num_input_tokens_seen": 478563348, + "step": 9282 + }, + { + "epoch": 0.9125082921795533, + "grad_norm": 23.68954849243164, + "learning_rate": 8e-05, + "loss": 38.8632, + "num_input_tokens_seen": 478716036, + "step": 9285 + }, + { + "epoch": 0.9128031252303384, + "grad_norm": 29.16128921508789, + "learning_rate": 8e-05, + "loss": 33.1692, + "num_input_tokens_seen": 478861352, + "step": 9288 + }, + { + "epoch": 0.9130979582811233, + "grad_norm": 25.06321144104004, + "learning_rate": 8e-05, + "loss": 33.0169, + "num_input_tokens_seen": 479003680, + "step": 9291 + }, + { + "epoch": 0.9133927913319083, + "grad_norm": 23.62033462524414, + "learning_rate": 8e-05, + "loss": 37.3165, + "num_input_tokens_seen": 479174860, + "step": 9294 + }, + { + "epoch": 0.9136876243826934, + "grad_norm": 25.722423553466797, + "learning_rate": 8e-05, + "loss": 39.9271, + "num_input_tokens_seen": 479333436, + "step": 9297 + }, + { + "epoch": 0.9139824574334783, + "grad_norm": 21.272403717041016, + "learning_rate": 8e-05, + "loss": 35.6935, + "num_input_tokens_seen": 479494764, + "step": 9300 + }, + { + "epoch": 0.9142772904842633, + "grad_norm": 21.769561767578125, + "learning_rate": 8e-05, + "loss": 31.1685, + "num_input_tokens_seen": 479666964, + "step": 9303 + }, + { + "epoch": 0.9145721235350482, + "grad_norm": 22.09537124633789, + "learning_rate": 8e-05, + "loss": 32.3473, + "num_input_tokens_seen": 479846588, + "step": 9306 + }, + { + "epoch": 0.9148669565858333, + "grad_norm": 21.59555435180664, + "learning_rate": 8e-05, + "loss": 37.1373, + "num_input_tokens_seen": 480008892, + "step": 9309 + }, + { + "epoch": 0.9151617896366183, + "grad_norm": 28.728235244750977, + "learning_rate": 8e-05, + "loss": 35.4527, + "num_input_tokens_seen": 480182680, + "step": 9312 + }, + { + "epoch": 0.9154566226874032, + "grad_norm": 19.65328598022461, + "learning_rate": 8e-05, + "loss": 31.485, + "num_input_tokens_seen": 480319308, + "step": 9315 + }, + { + "epoch": 0.9157514557381883, + "grad_norm": 21.808292388916016, + "learning_rate": 8e-05, + "loss": 35.7691, + "num_input_tokens_seen": 480467744, + "step": 9318 + }, + { + "epoch": 0.9160462887889732, + "grad_norm": 23.805320739746094, + "learning_rate": 8e-05, + "loss": 34.7721, + "num_input_tokens_seen": 480607360, + "step": 9321 + }, + { + "epoch": 0.9163411218397582, + "grad_norm": 23.046770095825195, + "learning_rate": 8e-05, + "loss": 32.3187, + "num_input_tokens_seen": 480765240, + "step": 9324 + }, + { + "epoch": 0.9166359548905433, + "grad_norm": 21.922595977783203, + "learning_rate": 8e-05, + "loss": 35.2356, + "num_input_tokens_seen": 480915016, + "step": 9327 + }, + { + "epoch": 0.9169307879413282, + "grad_norm": 21.19693374633789, + "learning_rate": 8e-05, + "loss": 32.9518, + "num_input_tokens_seen": 481050844, + "step": 9330 + }, + { + "epoch": 0.9172256209921132, + "grad_norm": 30.573301315307617, + "learning_rate": 8e-05, + "loss": 35.4635, + "num_input_tokens_seen": 481198144, + "step": 9333 + }, + { + "epoch": 0.9175204540428982, + "grad_norm": 20.04607391357422, + "learning_rate": 8e-05, + "loss": 36.8555, + "num_input_tokens_seen": 481357400, + "step": 9336 + }, + { + "epoch": 0.9178152870936832, + "grad_norm": 22.094623565673828, + "learning_rate": 8e-05, + "loss": 37.6246, + "num_input_tokens_seen": 481501936, + "step": 9339 + }, + { + "epoch": 0.9181101201444682, + "grad_norm": 21.94713020324707, + "learning_rate": 8e-05, + "loss": 34.7503, + "num_input_tokens_seen": 481656604, + "step": 9342 + }, + { + "epoch": 0.9184049531952532, + "grad_norm": 18.993497848510742, + "learning_rate": 8e-05, + "loss": 34.0214, + "num_input_tokens_seen": 481831376, + "step": 9345 + }, + { + "epoch": 0.9186997862460382, + "grad_norm": 20.522165298461914, + "learning_rate": 8e-05, + "loss": 33.8274, + "num_input_tokens_seen": 481989768, + "step": 9348 + }, + { + "epoch": 0.9189946192968231, + "grad_norm": 22.100162506103516, + "learning_rate": 8e-05, + "loss": 32.6869, + "num_input_tokens_seen": 482148876, + "step": 9351 + }, + { + "epoch": 0.9192894523476082, + "grad_norm": 31.93699073791504, + "learning_rate": 8e-05, + "loss": 38.894, + "num_input_tokens_seen": 482302396, + "step": 9354 + }, + { + "epoch": 0.9195842853983932, + "grad_norm": 18.395660400390625, + "learning_rate": 8e-05, + "loss": 35.2791, + "num_input_tokens_seen": 482459164, + "step": 9357 + }, + { + "epoch": 0.9198791184491781, + "grad_norm": 20.613759994506836, + "learning_rate": 8e-05, + "loss": 32.9776, + "num_input_tokens_seen": 482634520, + "step": 9360 + }, + { + "epoch": 0.9201739514999632, + "grad_norm": 26.639373779296875, + "learning_rate": 8e-05, + "loss": 36.3913, + "num_input_tokens_seen": 482779476, + "step": 9363 + }, + { + "epoch": 0.9204687845507481, + "grad_norm": 21.73651885986328, + "learning_rate": 8e-05, + "loss": 33.269, + "num_input_tokens_seen": 482931304, + "step": 9366 + }, + { + "epoch": 0.9207636176015331, + "grad_norm": 19.23166847229004, + "learning_rate": 8e-05, + "loss": 34.0668, + "num_input_tokens_seen": 483111640, + "step": 9369 + }, + { + "epoch": 0.9210584506523182, + "grad_norm": 21.559226989746094, + "learning_rate": 8e-05, + "loss": 40.3595, + "num_input_tokens_seen": 483274320, + "step": 9372 + }, + { + "epoch": 0.9213532837031031, + "grad_norm": 22.628686904907227, + "learning_rate": 8e-05, + "loss": 33.3141, + "num_input_tokens_seen": 483416020, + "step": 9375 + }, + { + "epoch": 0.9216481167538881, + "grad_norm": 18.740367889404297, + "learning_rate": 8e-05, + "loss": 31.2498, + "num_input_tokens_seen": 483552496, + "step": 9378 + }, + { + "epoch": 0.9219429498046731, + "grad_norm": 20.061677932739258, + "learning_rate": 8e-05, + "loss": 34.287, + "num_input_tokens_seen": 483715768, + "step": 9381 + }, + { + "epoch": 0.9222377828554581, + "grad_norm": 19.16664695739746, + "learning_rate": 8e-05, + "loss": 33.4455, + "num_input_tokens_seen": 483880060, + "step": 9384 + }, + { + "epoch": 0.9225326159062431, + "grad_norm": 20.502145767211914, + "learning_rate": 8e-05, + "loss": 29.6471, + "num_input_tokens_seen": 484019340, + "step": 9387 + }, + { + "epoch": 0.9228274489570281, + "grad_norm": 21.350358963012695, + "learning_rate": 8e-05, + "loss": 35.5749, + "num_input_tokens_seen": 484185872, + "step": 9390 + }, + { + "epoch": 0.9231222820078131, + "grad_norm": 20.89253807067871, + "learning_rate": 8e-05, + "loss": 34.3302, + "num_input_tokens_seen": 484346760, + "step": 9393 + }, + { + "epoch": 0.923417115058598, + "grad_norm": 24.376724243164062, + "learning_rate": 8e-05, + "loss": 36.6064, + "num_input_tokens_seen": 484479972, + "step": 9396 + }, + { + "epoch": 0.9237119481093831, + "grad_norm": 19.522863388061523, + "learning_rate": 8e-05, + "loss": 34.7432, + "num_input_tokens_seen": 484613160, + "step": 9399 + }, + { + "epoch": 0.9240067811601681, + "grad_norm": 199.71450805664062, + "learning_rate": 8e-05, + "loss": 32.396, + "num_input_tokens_seen": 484759556, + "step": 9402 + }, + { + "epoch": 0.924301614210953, + "grad_norm": 21.016319274902344, + "learning_rate": 8e-05, + "loss": 33.2017, + "num_input_tokens_seen": 484918408, + "step": 9405 + }, + { + "epoch": 0.9245964472617381, + "grad_norm": 21.742298126220703, + "learning_rate": 8e-05, + "loss": 36.4643, + "num_input_tokens_seen": 485084048, + "step": 9408 + }, + { + "epoch": 0.924891280312523, + "grad_norm": 21.95002555847168, + "learning_rate": 8e-05, + "loss": 34.1213, + "num_input_tokens_seen": 485233412, + "step": 9411 + }, + { + "epoch": 0.925186113363308, + "grad_norm": 23.653345108032227, + "learning_rate": 8e-05, + "loss": 34.5019, + "num_input_tokens_seen": 485369200, + "step": 9414 + }, + { + "epoch": 0.9254809464140931, + "grad_norm": 19.90846061706543, + "learning_rate": 8e-05, + "loss": 33.594, + "num_input_tokens_seen": 485519936, + "step": 9417 + }, + { + "epoch": 0.925775779464878, + "grad_norm": 22.45507049560547, + "learning_rate": 8e-05, + "loss": 40.2271, + "num_input_tokens_seen": 485667392, + "step": 9420 + }, + { + "epoch": 0.926070612515663, + "grad_norm": 23.564464569091797, + "learning_rate": 8e-05, + "loss": 33.0576, + "num_input_tokens_seen": 485811800, + "step": 9423 + }, + { + "epoch": 0.926365445566448, + "grad_norm": 21.314844131469727, + "learning_rate": 8e-05, + "loss": 30.8356, + "num_input_tokens_seen": 485970732, + "step": 9426 + }, + { + "epoch": 0.926660278617233, + "grad_norm": 18.391977310180664, + "learning_rate": 8e-05, + "loss": 33.5133, + "num_input_tokens_seen": 486143860, + "step": 9429 + }, + { + "epoch": 0.926955111668018, + "grad_norm": 19.696285247802734, + "learning_rate": 8e-05, + "loss": 33.489, + "num_input_tokens_seen": 486277652, + "step": 9432 + }, + { + "epoch": 0.927249944718803, + "grad_norm": 22.3781681060791, + "learning_rate": 8e-05, + "loss": 36.3257, + "num_input_tokens_seen": 486453628, + "step": 9435 + }, + { + "epoch": 0.927544777769588, + "grad_norm": 21.230148315429688, + "learning_rate": 8e-05, + "loss": 33.2605, + "num_input_tokens_seen": 486595912, + "step": 9438 + }, + { + "epoch": 0.9278396108203729, + "grad_norm": 21.168331146240234, + "learning_rate": 8e-05, + "loss": 33.8501, + "num_input_tokens_seen": 486731804, + "step": 9441 + }, + { + "epoch": 0.928134443871158, + "grad_norm": 18.55724334716797, + "learning_rate": 8e-05, + "loss": 28.1622, + "num_input_tokens_seen": 486884636, + "step": 9444 + }, + { + "epoch": 0.928429276921943, + "grad_norm": 24.025060653686523, + "learning_rate": 8e-05, + "loss": 37.0399, + "num_input_tokens_seen": 487042888, + "step": 9447 + }, + { + "epoch": 0.9287241099727279, + "grad_norm": 27.229087829589844, + "learning_rate": 8e-05, + "loss": 37.3008, + "num_input_tokens_seen": 487211120, + "step": 9450 + }, + { + "epoch": 0.929018943023513, + "grad_norm": 18.948274612426758, + "learning_rate": 8e-05, + "loss": 33.0499, + "num_input_tokens_seen": 487353712, + "step": 9453 + }, + { + "epoch": 0.9293137760742979, + "grad_norm": 21.234724044799805, + "learning_rate": 8e-05, + "loss": 38.9776, + "num_input_tokens_seen": 487529332, + "step": 9456 + }, + { + "epoch": 0.9296086091250829, + "grad_norm": 25.416913986206055, + "learning_rate": 8e-05, + "loss": 34.834, + "num_input_tokens_seen": 487704728, + "step": 9459 + }, + { + "epoch": 0.929903442175868, + "grad_norm": 24.957584381103516, + "learning_rate": 8e-05, + "loss": 37.8111, + "num_input_tokens_seen": 487870980, + "step": 9462 + }, + { + "epoch": 0.9301982752266529, + "grad_norm": 21.949615478515625, + "learning_rate": 8e-05, + "loss": 37.4198, + "num_input_tokens_seen": 488027824, + "step": 9465 + }, + { + "epoch": 0.9304931082774379, + "grad_norm": 21.102859497070312, + "learning_rate": 8e-05, + "loss": 35.4306, + "num_input_tokens_seen": 488174928, + "step": 9468 + }, + { + "epoch": 0.9307879413282228, + "grad_norm": 20.244783401489258, + "learning_rate": 8e-05, + "loss": 34.1812, + "num_input_tokens_seen": 488327380, + "step": 9471 + }, + { + "epoch": 0.9310827743790079, + "grad_norm": 22.8046817779541, + "learning_rate": 8e-05, + "loss": 32.5576, + "num_input_tokens_seen": 488475732, + "step": 9474 + }, + { + "epoch": 0.9313776074297929, + "grad_norm": 24.047414779663086, + "learning_rate": 8e-05, + "loss": 36.3854, + "num_input_tokens_seen": 488624304, + "step": 9477 + }, + { + "epoch": 0.9316724404805778, + "grad_norm": 22.388870239257812, + "learning_rate": 8e-05, + "loss": 34.9084, + "num_input_tokens_seen": 488770808, + "step": 9480 + }, + { + "epoch": 0.9319672735313629, + "grad_norm": 22.250295639038086, + "learning_rate": 8e-05, + "loss": 35.0276, + "num_input_tokens_seen": 488912556, + "step": 9483 + }, + { + "epoch": 0.9322621065821478, + "grad_norm": 24.2645206451416, + "learning_rate": 8e-05, + "loss": 35.794, + "num_input_tokens_seen": 489060396, + "step": 9486 + }, + { + "epoch": 0.9325569396329328, + "grad_norm": 21.49842643737793, + "learning_rate": 8e-05, + "loss": 34.637, + "num_input_tokens_seen": 489218892, + "step": 9489 + }, + { + "epoch": 0.9328517726837179, + "grad_norm": 21.13848114013672, + "learning_rate": 8e-05, + "loss": 34.4821, + "num_input_tokens_seen": 489372688, + "step": 9492 + }, + { + "epoch": 0.9331466057345028, + "grad_norm": 24.668495178222656, + "learning_rate": 8e-05, + "loss": 32.8465, + "num_input_tokens_seen": 489505468, + "step": 9495 + }, + { + "epoch": 0.9334414387852878, + "grad_norm": 19.518667221069336, + "learning_rate": 8e-05, + "loss": 33.5588, + "num_input_tokens_seen": 489659504, + "step": 9498 + }, + { + "epoch": 0.9337362718360728, + "grad_norm": 19.34673309326172, + "learning_rate": 8e-05, + "loss": 32.4264, + "num_input_tokens_seen": 489810660, + "step": 9501 + }, + { + "epoch": 0.9340311048868578, + "grad_norm": 22.143505096435547, + "learning_rate": 8e-05, + "loss": 37.9556, + "num_input_tokens_seen": 489979256, + "step": 9504 + }, + { + "epoch": 0.9343259379376428, + "grad_norm": 20.706947326660156, + "learning_rate": 8e-05, + "loss": 35.6361, + "num_input_tokens_seen": 490134768, + "step": 9507 + }, + { + "epoch": 0.9346207709884278, + "grad_norm": 20.448240280151367, + "learning_rate": 8e-05, + "loss": 32.0137, + "num_input_tokens_seen": 490292724, + "step": 9510 + }, + { + "epoch": 0.9349156040392128, + "grad_norm": 22.021400451660156, + "learning_rate": 8e-05, + "loss": 36.6951, + "num_input_tokens_seen": 490450020, + "step": 9513 + }, + { + "epoch": 0.9352104370899977, + "grad_norm": 21.240678787231445, + "learning_rate": 8e-05, + "loss": 35.2441, + "num_input_tokens_seen": 490605652, + "step": 9516 + }, + { + "epoch": 0.9355052701407828, + "grad_norm": 21.042343139648438, + "learning_rate": 8e-05, + "loss": 36.4855, + "num_input_tokens_seen": 490769044, + "step": 9519 + }, + { + "epoch": 0.9358001031915678, + "grad_norm": 21.686046600341797, + "learning_rate": 8e-05, + "loss": 31.8773, + "num_input_tokens_seen": 490922684, + "step": 9522 + }, + { + "epoch": 0.9360949362423527, + "grad_norm": 31.692363739013672, + "learning_rate": 8e-05, + "loss": 38.7701, + "num_input_tokens_seen": 491073924, + "step": 9525 + }, + { + "epoch": 0.9363897692931378, + "grad_norm": 20.821338653564453, + "learning_rate": 8e-05, + "loss": 37.0108, + "num_input_tokens_seen": 491229148, + "step": 9528 + }, + { + "epoch": 0.9366846023439227, + "grad_norm": 24.331012725830078, + "learning_rate": 8e-05, + "loss": 36.7816, + "num_input_tokens_seen": 491401840, + "step": 9531 + }, + { + "epoch": 0.9369794353947077, + "grad_norm": 22.13974952697754, + "learning_rate": 8e-05, + "loss": 36.4159, + "num_input_tokens_seen": 491558052, + "step": 9534 + }, + { + "epoch": 0.9372742684454928, + "grad_norm": 20.529359817504883, + "learning_rate": 8e-05, + "loss": 33.4253, + "num_input_tokens_seen": 491710880, + "step": 9537 + }, + { + "epoch": 0.9375691014962777, + "grad_norm": 22.973304748535156, + "learning_rate": 8e-05, + "loss": 38.1455, + "num_input_tokens_seen": 491863964, + "step": 9540 + }, + { + "epoch": 0.9378639345470627, + "grad_norm": 20.182170867919922, + "learning_rate": 8e-05, + "loss": 36.4646, + "num_input_tokens_seen": 492011328, + "step": 9543 + }, + { + "epoch": 0.9381587675978477, + "grad_norm": 21.98539161682129, + "learning_rate": 8e-05, + "loss": 35.9327, + "num_input_tokens_seen": 492181992, + "step": 9546 + }, + { + "epoch": 0.9384536006486327, + "grad_norm": 22.3552303314209, + "learning_rate": 8e-05, + "loss": 36.1662, + "num_input_tokens_seen": 492328688, + "step": 9549 + }, + { + "epoch": 0.9387484336994177, + "grad_norm": 18.721784591674805, + "learning_rate": 8e-05, + "loss": 36.1254, + "num_input_tokens_seen": 492490200, + "step": 9552 + }, + { + "epoch": 0.9390432667502027, + "grad_norm": 22.69650650024414, + "learning_rate": 8e-05, + "loss": 34.9092, + "num_input_tokens_seen": 492650216, + "step": 9555 + }, + { + "epoch": 0.9393380998009877, + "grad_norm": 25.502473831176758, + "learning_rate": 8e-05, + "loss": 33.9575, + "num_input_tokens_seen": 492808952, + "step": 9558 + }, + { + "epoch": 0.9396329328517726, + "grad_norm": 19.732494354248047, + "learning_rate": 8e-05, + "loss": 32.2993, + "num_input_tokens_seen": 492963472, + "step": 9561 + }, + { + "epoch": 0.9399277659025577, + "grad_norm": 20.05377960205078, + "learning_rate": 8e-05, + "loss": 33.9659, + "num_input_tokens_seen": 493119448, + "step": 9564 + }, + { + "epoch": 0.9402225989533427, + "grad_norm": 30.67023277282715, + "learning_rate": 8e-05, + "loss": 34.9658, + "num_input_tokens_seen": 493269844, + "step": 9567 + }, + { + "epoch": 0.9405174320041276, + "grad_norm": 26.700212478637695, + "learning_rate": 8e-05, + "loss": 35.2712, + "num_input_tokens_seen": 493403864, + "step": 9570 + }, + { + "epoch": 0.9408122650549127, + "grad_norm": 24.358205795288086, + "learning_rate": 8e-05, + "loss": 34.8117, + "num_input_tokens_seen": 493562716, + "step": 9573 + }, + { + "epoch": 0.9411070981056977, + "grad_norm": 22.915876388549805, + "learning_rate": 8e-05, + "loss": 34.7899, + "num_input_tokens_seen": 493726740, + "step": 9576 + }, + { + "epoch": 0.9414019311564826, + "grad_norm": 25.79482650756836, + "learning_rate": 8e-05, + "loss": 36.4804, + "num_input_tokens_seen": 493875008, + "step": 9579 + }, + { + "epoch": 0.9416967642072677, + "grad_norm": 18.314313888549805, + "learning_rate": 8e-05, + "loss": 33.5188, + "num_input_tokens_seen": 494033616, + "step": 9582 + }, + { + "epoch": 0.9419915972580526, + "grad_norm": 22.25737953186035, + "learning_rate": 8e-05, + "loss": 37.1847, + "num_input_tokens_seen": 494174624, + "step": 9585 + }, + { + "epoch": 0.9422864303088376, + "grad_norm": 23.608041763305664, + "learning_rate": 8e-05, + "loss": 33.9381, + "num_input_tokens_seen": 494322532, + "step": 9588 + }, + { + "epoch": 0.9425812633596227, + "grad_norm": 20.048398971557617, + "learning_rate": 8e-05, + "loss": 36.2292, + "num_input_tokens_seen": 494474040, + "step": 9591 + }, + { + "epoch": 0.9428760964104076, + "grad_norm": 18.939929962158203, + "learning_rate": 8e-05, + "loss": 30.8911, + "num_input_tokens_seen": 494617224, + "step": 9594 + }, + { + "epoch": 0.9431709294611926, + "grad_norm": 17.95427894592285, + "learning_rate": 8e-05, + "loss": 35.8452, + "num_input_tokens_seen": 494774616, + "step": 9597 + }, + { + "epoch": 0.9434657625119776, + "grad_norm": 21.61246109008789, + "learning_rate": 8e-05, + "loss": 36.3985, + "num_input_tokens_seen": 494931148, + "step": 9600 + }, + { + "epoch": 0.9437605955627626, + "grad_norm": 21.577884674072266, + "learning_rate": 8e-05, + "loss": 35.3917, + "num_input_tokens_seen": 495082104, + "step": 9603 + }, + { + "epoch": 0.9440554286135476, + "grad_norm": 20.373001098632812, + "learning_rate": 8e-05, + "loss": 33.8704, + "num_input_tokens_seen": 495222352, + "step": 9606 + }, + { + "epoch": 0.9443502616643326, + "grad_norm": 20.00078010559082, + "learning_rate": 8e-05, + "loss": 37.882, + "num_input_tokens_seen": 495387952, + "step": 9609 + }, + { + "epoch": 0.9446450947151176, + "grad_norm": 20.51920509338379, + "learning_rate": 8e-05, + "loss": 34.1078, + "num_input_tokens_seen": 495531268, + "step": 9612 + }, + { + "epoch": 0.9449399277659025, + "grad_norm": 21.88003158569336, + "learning_rate": 8e-05, + "loss": 37.5921, + "num_input_tokens_seen": 495702868, + "step": 9615 + }, + { + "epoch": 0.9452347608166876, + "grad_norm": 28.009687423706055, + "learning_rate": 8e-05, + "loss": 36.7524, + "num_input_tokens_seen": 495851672, + "step": 9618 + }, + { + "epoch": 0.9455295938674726, + "grad_norm": 23.272294998168945, + "learning_rate": 8e-05, + "loss": 34.6014, + "num_input_tokens_seen": 495997208, + "step": 9621 + }, + { + "epoch": 0.9458244269182575, + "grad_norm": 27.648483276367188, + "learning_rate": 8e-05, + "loss": 35.1428, + "num_input_tokens_seen": 496153292, + "step": 9624 + }, + { + "epoch": 0.9461192599690426, + "grad_norm": 21.118453979492188, + "learning_rate": 8e-05, + "loss": 35.3225, + "num_input_tokens_seen": 496308760, + "step": 9627 + }, + { + "epoch": 0.9464140930198275, + "grad_norm": 19.653175354003906, + "learning_rate": 8e-05, + "loss": 28.756, + "num_input_tokens_seen": 496479760, + "step": 9630 + }, + { + "epoch": 0.9467089260706125, + "grad_norm": 24.253093719482422, + "learning_rate": 8e-05, + "loss": 35.9993, + "num_input_tokens_seen": 496618096, + "step": 9633 + }, + { + "epoch": 0.9470037591213976, + "grad_norm": 25.256576538085938, + "learning_rate": 8e-05, + "loss": 38.7862, + "num_input_tokens_seen": 496770116, + "step": 9636 + }, + { + "epoch": 0.9472985921721825, + "grad_norm": 28.437040328979492, + "learning_rate": 8e-05, + "loss": 31.186, + "num_input_tokens_seen": 496941048, + "step": 9639 + }, + { + "epoch": 0.9475934252229675, + "grad_norm": 20.87740707397461, + "learning_rate": 8e-05, + "loss": 37.1772, + "num_input_tokens_seen": 497102572, + "step": 9642 + }, + { + "epoch": 0.9478882582737524, + "grad_norm": 20.97877311706543, + "learning_rate": 8e-05, + "loss": 34.5502, + "num_input_tokens_seen": 497254912, + "step": 9645 + }, + { + "epoch": 0.9481830913245375, + "grad_norm": 23.761564254760742, + "learning_rate": 8e-05, + "loss": 38.1404, + "num_input_tokens_seen": 497407040, + "step": 9648 + }, + { + "epoch": 0.9484779243753225, + "grad_norm": 22.26776695251465, + "learning_rate": 8e-05, + "loss": 36.5098, + "num_input_tokens_seen": 497547540, + "step": 9651 + }, + { + "epoch": 0.9487727574261074, + "grad_norm": 22.641294479370117, + "learning_rate": 8e-05, + "loss": 31.9351, + "num_input_tokens_seen": 497700032, + "step": 9654 + }, + { + "epoch": 0.9490675904768925, + "grad_norm": 22.351139068603516, + "learning_rate": 8e-05, + "loss": 37.8459, + "num_input_tokens_seen": 497862068, + "step": 9657 + }, + { + "epoch": 0.9493624235276774, + "grad_norm": 19.74834632873535, + "learning_rate": 8e-05, + "loss": 36.4842, + "num_input_tokens_seen": 498024708, + "step": 9660 + }, + { + "epoch": 0.9496572565784624, + "grad_norm": 26.638202667236328, + "learning_rate": 8e-05, + "loss": 37.338, + "num_input_tokens_seen": 498177824, + "step": 9663 + }, + { + "epoch": 0.9499520896292475, + "grad_norm": 22.298736572265625, + "learning_rate": 8e-05, + "loss": 33.4881, + "num_input_tokens_seen": 498333556, + "step": 9666 + }, + { + "epoch": 0.9502469226800324, + "grad_norm": 19.22551155090332, + "learning_rate": 8e-05, + "loss": 30.2335, + "num_input_tokens_seen": 498469020, + "step": 9669 + }, + { + "epoch": 0.9505417557308175, + "grad_norm": 24.792818069458008, + "learning_rate": 8e-05, + "loss": 33.5654, + "num_input_tokens_seen": 498619116, + "step": 9672 + }, + { + "epoch": 0.9508365887816024, + "grad_norm": 20.585163116455078, + "learning_rate": 8e-05, + "loss": 30.6095, + "num_input_tokens_seen": 498767456, + "step": 9675 + }, + { + "epoch": 0.9511314218323874, + "grad_norm": 18.556869506835938, + "learning_rate": 8e-05, + "loss": 35.2455, + "num_input_tokens_seen": 498925584, + "step": 9678 + }, + { + "epoch": 0.9514262548831725, + "grad_norm": 23.33407974243164, + "learning_rate": 8e-05, + "loss": 35.0502, + "num_input_tokens_seen": 499102064, + "step": 9681 + }, + { + "epoch": 0.9517210879339574, + "grad_norm": 20.793521881103516, + "learning_rate": 8e-05, + "loss": 35.8787, + "num_input_tokens_seen": 499253736, + "step": 9684 + }, + { + "epoch": 0.9520159209847424, + "grad_norm": 20.79303741455078, + "learning_rate": 8e-05, + "loss": 38.7343, + "num_input_tokens_seen": 499411668, + "step": 9687 + }, + { + "epoch": 0.9523107540355273, + "grad_norm": 19.87205696105957, + "learning_rate": 8e-05, + "loss": 34.112, + "num_input_tokens_seen": 499575948, + "step": 9690 + }, + { + "epoch": 0.9526055870863124, + "grad_norm": 24.625654220581055, + "learning_rate": 8e-05, + "loss": 34.9216, + "num_input_tokens_seen": 499714888, + "step": 9693 + }, + { + "epoch": 0.9529004201370974, + "grad_norm": 21.79035758972168, + "learning_rate": 8e-05, + "loss": 35.6022, + "num_input_tokens_seen": 499867056, + "step": 9696 + }, + { + "epoch": 0.9531952531878823, + "grad_norm": 17.22002601623535, + "learning_rate": 8e-05, + "loss": 33.1651, + "num_input_tokens_seen": 500020932, + "step": 9699 + }, + { + "epoch": 0.9534900862386674, + "grad_norm": 20.668373107910156, + "learning_rate": 8e-05, + "loss": 28.9119, + "num_input_tokens_seen": 500185388, + "step": 9702 + }, + { + "epoch": 0.9537849192894523, + "grad_norm": 24.211820602416992, + "learning_rate": 8e-05, + "loss": 37.2794, + "num_input_tokens_seen": 500339608, + "step": 9705 + }, + { + "epoch": 0.9540797523402373, + "grad_norm": 22.269710540771484, + "learning_rate": 8e-05, + "loss": 33.7264, + "num_input_tokens_seen": 500474852, + "step": 9708 + }, + { + "epoch": 0.9543745853910224, + "grad_norm": 20.687097549438477, + "learning_rate": 8e-05, + "loss": 35.9343, + "num_input_tokens_seen": 500621900, + "step": 9711 + }, + { + "epoch": 0.9546694184418073, + "grad_norm": 18.96708869934082, + "learning_rate": 8e-05, + "loss": 34.9415, + "num_input_tokens_seen": 500796820, + "step": 9714 + }, + { + "epoch": 0.9549642514925923, + "grad_norm": 481.27459716796875, + "learning_rate": 8e-05, + "loss": 33.8589, + "num_input_tokens_seen": 500949388, + "step": 9717 + }, + { + "epoch": 0.9552590845433773, + "grad_norm": 23.094690322875977, + "learning_rate": 8e-05, + "loss": 32.8335, + "num_input_tokens_seen": 501096632, + "step": 9720 + }, + { + "epoch": 0.9555539175941623, + "grad_norm": 20.492908477783203, + "learning_rate": 8e-05, + "loss": 34.0337, + "num_input_tokens_seen": 501247176, + "step": 9723 + }, + { + "epoch": 0.9558487506449473, + "grad_norm": 20.346040725708008, + "learning_rate": 8e-05, + "loss": 34.3415, + "num_input_tokens_seen": 501389276, + "step": 9726 + }, + { + "epoch": 0.9561435836957323, + "grad_norm": 25.47278594970703, + "learning_rate": 8e-05, + "loss": 30.5468, + "num_input_tokens_seen": 501528648, + "step": 9729 + }, + { + "epoch": 0.9564384167465173, + "grad_norm": 21.43868064880371, + "learning_rate": 8e-05, + "loss": 37.3752, + "num_input_tokens_seen": 501666952, + "step": 9732 + }, + { + "epoch": 0.9567332497973022, + "grad_norm": 23.923418045043945, + "learning_rate": 8e-05, + "loss": 37.6275, + "num_input_tokens_seen": 501814412, + "step": 9735 + }, + { + "epoch": 0.9570280828480873, + "grad_norm": 22.2674503326416, + "learning_rate": 8e-05, + "loss": 39.7825, + "num_input_tokens_seen": 501952240, + "step": 9738 + }, + { + "epoch": 0.9573229158988723, + "grad_norm": 19.780776977539062, + "learning_rate": 8e-05, + "loss": 33.9042, + "num_input_tokens_seen": 502113492, + "step": 9741 + }, + { + "epoch": 0.9576177489496572, + "grad_norm": 18.62359046936035, + "learning_rate": 8e-05, + "loss": 29.9758, + "num_input_tokens_seen": 502272680, + "step": 9744 + }, + { + "epoch": 0.9579125820004423, + "grad_norm": 45.219757080078125, + "learning_rate": 8e-05, + "loss": 38.9523, + "num_input_tokens_seen": 502412496, + "step": 9747 + }, + { + "epoch": 0.9582074150512272, + "grad_norm": 21.53115463256836, + "learning_rate": 8e-05, + "loss": 35.1453, + "num_input_tokens_seen": 502565504, + "step": 9750 + }, + { + "epoch": 0.9585022481020122, + "grad_norm": 19.353473663330078, + "learning_rate": 8e-05, + "loss": 36.7383, + "num_input_tokens_seen": 502697288, + "step": 9753 + }, + { + "epoch": 0.9587970811527973, + "grad_norm": 18.89324951171875, + "learning_rate": 8e-05, + "loss": 33.598, + "num_input_tokens_seen": 502848920, + "step": 9756 + }, + { + "epoch": 0.9590919142035822, + "grad_norm": 21.069604873657227, + "learning_rate": 8e-05, + "loss": 34.0274, + "num_input_tokens_seen": 503030192, + "step": 9759 + }, + { + "epoch": 0.9593867472543672, + "grad_norm": 37.08634948730469, + "learning_rate": 8e-05, + "loss": 33.2065, + "num_input_tokens_seen": 503184856, + "step": 9762 + }, + { + "epoch": 0.9596815803051522, + "grad_norm": 22.28505516052246, + "learning_rate": 8e-05, + "loss": 35.7401, + "num_input_tokens_seen": 503320700, + "step": 9765 + }, + { + "epoch": 0.9599764133559372, + "grad_norm": 18.990753173828125, + "learning_rate": 8e-05, + "loss": 33.9176, + "num_input_tokens_seen": 503488576, + "step": 9768 + }, + { + "epoch": 0.9602712464067222, + "grad_norm": 19.933992385864258, + "learning_rate": 8e-05, + "loss": 35.2393, + "num_input_tokens_seen": 503654248, + "step": 9771 + }, + { + "epoch": 0.9605660794575072, + "grad_norm": 19.628435134887695, + "learning_rate": 8e-05, + "loss": 34.0539, + "num_input_tokens_seen": 503801032, + "step": 9774 + }, + { + "epoch": 0.9608609125082922, + "grad_norm": 24.20693016052246, + "learning_rate": 8e-05, + "loss": 33.1455, + "num_input_tokens_seen": 503960648, + "step": 9777 + }, + { + "epoch": 0.9611557455590771, + "grad_norm": 22.341066360473633, + "learning_rate": 8e-05, + "loss": 38.3758, + "num_input_tokens_seen": 504113264, + "step": 9780 + }, + { + "epoch": 0.9614505786098622, + "grad_norm": 20.44728660583496, + "learning_rate": 8e-05, + "loss": 34.0762, + "num_input_tokens_seen": 504251244, + "step": 9783 + }, + { + "epoch": 0.9617454116606472, + "grad_norm": 18.34883689880371, + "learning_rate": 8e-05, + "loss": 31.6663, + "num_input_tokens_seen": 504408236, + "step": 9786 + }, + { + "epoch": 0.9620402447114321, + "grad_norm": 19.12334442138672, + "learning_rate": 8e-05, + "loss": 37.1449, + "num_input_tokens_seen": 504583188, + "step": 9789 + }, + { + "epoch": 0.9623350777622172, + "grad_norm": 22.31780242919922, + "learning_rate": 8e-05, + "loss": 34.1645, + "num_input_tokens_seen": 504735980, + "step": 9792 + }, + { + "epoch": 0.9626299108130021, + "grad_norm": 21.41351318359375, + "learning_rate": 8e-05, + "loss": 35.3733, + "num_input_tokens_seen": 504874156, + "step": 9795 + }, + { + "epoch": 0.9629247438637871, + "grad_norm": 18.388166427612305, + "learning_rate": 8e-05, + "loss": 29.836, + "num_input_tokens_seen": 505025504, + "step": 9798 + }, + { + "epoch": 0.9632195769145722, + "grad_norm": 20.411054611206055, + "learning_rate": 8e-05, + "loss": 31.9745, + "num_input_tokens_seen": 505190108, + "step": 9801 + }, + { + "epoch": 0.9635144099653571, + "grad_norm": 25.66120147705078, + "learning_rate": 8e-05, + "loss": 39.5065, + "num_input_tokens_seen": 505335460, + "step": 9804 + }, + { + "epoch": 0.9638092430161421, + "grad_norm": 21.30845832824707, + "learning_rate": 8e-05, + "loss": 33.3261, + "num_input_tokens_seen": 505511100, + "step": 9807 + }, + { + "epoch": 0.964104076066927, + "grad_norm": 25.881977081298828, + "learning_rate": 8e-05, + "loss": 34.2817, + "num_input_tokens_seen": 505643444, + "step": 9810 + }, + { + "epoch": 0.9643989091177121, + "grad_norm": 20.899316787719727, + "learning_rate": 8e-05, + "loss": 34.6943, + "num_input_tokens_seen": 505804368, + "step": 9813 + }, + { + "epoch": 0.9646937421684971, + "grad_norm": 20.84662628173828, + "learning_rate": 8e-05, + "loss": 35.9952, + "num_input_tokens_seen": 505949192, + "step": 9816 + }, + { + "epoch": 0.964988575219282, + "grad_norm": 23.826128005981445, + "learning_rate": 8e-05, + "loss": 34.4256, + "num_input_tokens_seen": 506116204, + "step": 9819 + }, + { + "epoch": 0.9652834082700671, + "grad_norm": 21.089387893676758, + "learning_rate": 8e-05, + "loss": 37.0237, + "num_input_tokens_seen": 506268228, + "step": 9822 + }, + { + "epoch": 0.965578241320852, + "grad_norm": 20.195451736450195, + "learning_rate": 8e-05, + "loss": 32.2907, + "num_input_tokens_seen": 506423696, + "step": 9825 + }, + { + "epoch": 0.965873074371637, + "grad_norm": 20.46344757080078, + "learning_rate": 8e-05, + "loss": 34.8418, + "num_input_tokens_seen": 506570468, + "step": 9828 + }, + { + "epoch": 0.9661679074224221, + "grad_norm": 20.803773880004883, + "learning_rate": 8e-05, + "loss": 35.3394, + "num_input_tokens_seen": 506735356, + "step": 9831 + }, + { + "epoch": 0.966462740473207, + "grad_norm": 20.69495391845703, + "learning_rate": 8e-05, + "loss": 31.3141, + "num_input_tokens_seen": 506882236, + "step": 9834 + }, + { + "epoch": 0.966757573523992, + "grad_norm": 20.818252563476562, + "learning_rate": 8e-05, + "loss": 34.065, + "num_input_tokens_seen": 507036800, + "step": 9837 + }, + { + "epoch": 0.967052406574777, + "grad_norm": 19.533720016479492, + "learning_rate": 8e-05, + "loss": 36.8215, + "num_input_tokens_seen": 507189136, + "step": 9840 + }, + { + "epoch": 0.967347239625562, + "grad_norm": 24.33357810974121, + "learning_rate": 8e-05, + "loss": 31.5452, + "num_input_tokens_seen": 507332120, + "step": 9843 + }, + { + "epoch": 0.967642072676347, + "grad_norm": 21.880701065063477, + "learning_rate": 8e-05, + "loss": 37.1414, + "num_input_tokens_seen": 507486400, + "step": 9846 + }, + { + "epoch": 0.967936905727132, + "grad_norm": 23.587797164916992, + "learning_rate": 8e-05, + "loss": 33.8317, + "num_input_tokens_seen": 507651728, + "step": 9849 + }, + { + "epoch": 0.968231738777917, + "grad_norm": 16.616315841674805, + "learning_rate": 8e-05, + "loss": 34.2093, + "num_input_tokens_seen": 507812512, + "step": 9852 + }, + { + "epoch": 0.9685265718287019, + "grad_norm": 21.581829071044922, + "learning_rate": 8e-05, + "loss": 30.2119, + "num_input_tokens_seen": 507974420, + "step": 9855 + }, + { + "epoch": 0.968821404879487, + "grad_norm": 20.407085418701172, + "learning_rate": 8e-05, + "loss": 37.4761, + "num_input_tokens_seen": 508119500, + "step": 9858 + }, + { + "epoch": 0.969116237930272, + "grad_norm": 24.967605590820312, + "learning_rate": 8e-05, + "loss": 32.8561, + "num_input_tokens_seen": 508279472, + "step": 9861 + }, + { + "epoch": 0.9694110709810569, + "grad_norm": 28.922719955444336, + "learning_rate": 8e-05, + "loss": 31.4779, + "num_input_tokens_seen": 508415424, + "step": 9864 + }, + { + "epoch": 0.969705904031842, + "grad_norm": 24.99030303955078, + "learning_rate": 8e-05, + "loss": 35.4844, + "num_input_tokens_seen": 508561032, + "step": 9867 + }, + { + "epoch": 0.9700007370826269, + "grad_norm": 21.515268325805664, + "learning_rate": 8e-05, + "loss": 37.914, + "num_input_tokens_seen": 508724348, + "step": 9870 + }, + { + "epoch": 0.9702955701334119, + "grad_norm": 20.35093879699707, + "learning_rate": 8e-05, + "loss": 39.0067, + "num_input_tokens_seen": 508882664, + "step": 9873 + }, + { + "epoch": 0.970590403184197, + "grad_norm": 21.495630264282227, + "learning_rate": 8e-05, + "loss": 33.9999, + "num_input_tokens_seen": 509022868, + "step": 9876 + }, + { + "epoch": 0.9708852362349819, + "grad_norm": 22.823671340942383, + "learning_rate": 8e-05, + "loss": 36.8194, + "num_input_tokens_seen": 509156400, + "step": 9879 + }, + { + "epoch": 0.971180069285767, + "grad_norm": 19.8090877532959, + "learning_rate": 8e-05, + "loss": 33.599, + "num_input_tokens_seen": 509318684, + "step": 9882 + }, + { + "epoch": 0.971474902336552, + "grad_norm": 22.890684127807617, + "learning_rate": 8e-05, + "loss": 35.3154, + "num_input_tokens_seen": 509481856, + "step": 9885 + }, + { + "epoch": 0.9717697353873369, + "grad_norm": 20.67902374267578, + "learning_rate": 8e-05, + "loss": 37.2118, + "num_input_tokens_seen": 509665356, + "step": 9888 + }, + { + "epoch": 0.972064568438122, + "grad_norm": 25.929218292236328, + "learning_rate": 8e-05, + "loss": 34.125, + "num_input_tokens_seen": 509812800, + "step": 9891 + }, + { + "epoch": 0.9723594014889069, + "grad_norm": 22.32213592529297, + "learning_rate": 8e-05, + "loss": 38.672, + "num_input_tokens_seen": 509972556, + "step": 9894 + }, + { + "epoch": 0.9726542345396919, + "grad_norm": 17.61611557006836, + "learning_rate": 8e-05, + "loss": 32.8811, + "num_input_tokens_seen": 510141016, + "step": 9897 + }, + { + "epoch": 0.972949067590477, + "grad_norm": 22.887008666992188, + "learning_rate": 8e-05, + "loss": 35.0657, + "num_input_tokens_seen": 510288240, + "step": 9900 + }, + { + "epoch": 0.9732439006412619, + "grad_norm": 20.91583251953125, + "learning_rate": 8e-05, + "loss": 35.5109, + "num_input_tokens_seen": 510457332, + "step": 9903 + }, + { + "epoch": 0.9735387336920469, + "grad_norm": 22.92230224609375, + "learning_rate": 8e-05, + "loss": 35.5309, + "num_input_tokens_seen": 510609764, + "step": 9906 + }, + { + "epoch": 0.9738335667428318, + "grad_norm": 22.04583740234375, + "learning_rate": 8e-05, + "loss": 31.3168, + "num_input_tokens_seen": 510767660, + "step": 9909 + }, + { + "epoch": 0.9741283997936169, + "grad_norm": 20.284467697143555, + "learning_rate": 8e-05, + "loss": 34.3692, + "num_input_tokens_seen": 510936116, + "step": 9912 + }, + { + "epoch": 0.9744232328444019, + "grad_norm": 21.815916061401367, + "learning_rate": 8e-05, + "loss": 33.3647, + "num_input_tokens_seen": 511094124, + "step": 9915 + }, + { + "epoch": 0.9747180658951868, + "grad_norm": 26.54884910583496, + "learning_rate": 8e-05, + "loss": 34.5375, + "num_input_tokens_seen": 511243880, + "step": 9918 + }, + { + "epoch": 0.9750128989459719, + "grad_norm": 20.4998779296875, + "learning_rate": 8e-05, + "loss": 33.2815, + "num_input_tokens_seen": 511410856, + "step": 9921 + }, + { + "epoch": 0.9753077319967568, + "grad_norm": 19.39156723022461, + "learning_rate": 8e-05, + "loss": 31.9833, + "num_input_tokens_seen": 511576308, + "step": 9924 + }, + { + "epoch": 0.9756025650475418, + "grad_norm": 19.645227432250977, + "learning_rate": 8e-05, + "loss": 32.86, + "num_input_tokens_seen": 511739464, + "step": 9927 + }, + { + "epoch": 0.9758973980983269, + "grad_norm": 23.1904354095459, + "learning_rate": 8e-05, + "loss": 33.4071, + "num_input_tokens_seen": 511878860, + "step": 9930 + }, + { + "epoch": 0.9761922311491118, + "grad_norm": 18.9490966796875, + "learning_rate": 8e-05, + "loss": 36.429, + "num_input_tokens_seen": 512038108, + "step": 9933 + }, + { + "epoch": 0.9764870641998968, + "grad_norm": 20.765483856201172, + "learning_rate": 8e-05, + "loss": 34.1946, + "num_input_tokens_seen": 512178532, + "step": 9936 + }, + { + "epoch": 0.9767818972506818, + "grad_norm": 21.686630249023438, + "learning_rate": 8e-05, + "loss": 39.4583, + "num_input_tokens_seen": 512327616, + "step": 9939 + }, + { + "epoch": 0.9770767303014668, + "grad_norm": 19.82565689086914, + "learning_rate": 8e-05, + "loss": 32.1733, + "num_input_tokens_seen": 512489768, + "step": 9942 + }, + { + "epoch": 0.9773715633522518, + "grad_norm": 28.079265594482422, + "learning_rate": 8e-05, + "loss": 39.3916, + "num_input_tokens_seen": 512627556, + "step": 9945 + }, + { + "epoch": 0.9776663964030368, + "grad_norm": 22.157262802124023, + "learning_rate": 8e-05, + "loss": 31.2814, + "num_input_tokens_seen": 512799820, + "step": 9948 + }, + { + "epoch": 0.9779612294538218, + "grad_norm": 24.7800235748291, + "learning_rate": 8e-05, + "loss": 34.3267, + "num_input_tokens_seen": 512947652, + "step": 9951 + }, + { + "epoch": 0.9782560625046067, + "grad_norm": 21.06366729736328, + "learning_rate": 8e-05, + "loss": 35.091, + "num_input_tokens_seen": 513105068, + "step": 9954 + }, + { + "epoch": 0.9785508955553918, + "grad_norm": 21.90027618408203, + "learning_rate": 8e-05, + "loss": 35.5001, + "num_input_tokens_seen": 513270152, + "step": 9957 + }, + { + "epoch": 0.9788457286061768, + "grad_norm": 21.52861213684082, + "learning_rate": 8e-05, + "loss": 34.9564, + "num_input_tokens_seen": 513417040, + "step": 9960 + }, + { + "epoch": 0.9791405616569617, + "grad_norm": 21.92180824279785, + "learning_rate": 8e-05, + "loss": 34.5154, + "num_input_tokens_seen": 513567416, + "step": 9963 + }, + { + "epoch": 0.9794353947077468, + "grad_norm": 20.202768325805664, + "learning_rate": 8e-05, + "loss": 30.0621, + "num_input_tokens_seen": 513723628, + "step": 9966 + }, + { + "epoch": 0.9797302277585317, + "grad_norm": 21.07074546813965, + "learning_rate": 8e-05, + "loss": 35.3967, + "num_input_tokens_seen": 513884464, + "step": 9969 + }, + { + "epoch": 0.9800250608093167, + "grad_norm": 23.317293167114258, + "learning_rate": 8e-05, + "loss": 38.406, + "num_input_tokens_seen": 514050832, + "step": 9972 + }, + { + "epoch": 0.9803198938601018, + "grad_norm": 20.868349075317383, + "learning_rate": 8e-05, + "loss": 35.6753, + "num_input_tokens_seen": 514197228, + "step": 9975 + }, + { + "epoch": 0.9806147269108867, + "grad_norm": 23.616369247436523, + "learning_rate": 8e-05, + "loss": 35.2552, + "num_input_tokens_seen": 514350780, + "step": 9978 + }, + { + "epoch": 0.9809095599616717, + "grad_norm": 20.292543411254883, + "learning_rate": 8e-05, + "loss": 33.7011, + "num_input_tokens_seen": 514503084, + "step": 9981 + }, + { + "epoch": 0.9812043930124567, + "grad_norm": 25.273117065429688, + "learning_rate": 8e-05, + "loss": 35.5905, + "num_input_tokens_seen": 514659228, + "step": 9984 + }, + { + "epoch": 0.9814992260632417, + "grad_norm": 20.815622329711914, + "learning_rate": 8e-05, + "loss": 31.5263, + "num_input_tokens_seen": 514817380, + "step": 9987 + }, + { + "epoch": 0.9817940591140267, + "grad_norm": 20.29827308654785, + "learning_rate": 8e-05, + "loss": 35.3308, + "num_input_tokens_seen": 514996984, + "step": 9990 + }, + { + "epoch": 0.9820888921648117, + "grad_norm": 19.71322250366211, + "learning_rate": 8e-05, + "loss": 34.5728, + "num_input_tokens_seen": 515142160, + "step": 9993 + }, + { + "epoch": 0.9823837252155967, + "grad_norm": 20.120180130004883, + "learning_rate": 8e-05, + "loss": 37.7434, + "num_input_tokens_seen": 515298572, + "step": 9996 + }, + { + "epoch": 0.9826785582663816, + "grad_norm": 23.941017150878906, + "learning_rate": 8e-05, + "loss": 36.9318, + "num_input_tokens_seen": 515474328, + "step": 9999 + }, + { + "epoch": 0.9827768359499767, + "eval_gen_len": 30.6, + "eval_loss": 2.209508180618286, + "eval_rouge1": 50.3518, + "eval_rouge2": 33.9831, + "eval_rougeL": 46.3741, + "eval_rougeLsum": 46.7798, + "eval_runtime": 92.1739, + "eval_samples_per_second": 2.17, + "eval_steps_per_second": 0.542, + "num_input_tokens_seen": 515531508, + "step": 10000 + }, + { + "epoch": 0.9829733913171667, + "grad_norm": 19.42774772644043, + "learning_rate": 8e-05, + "loss": 34.3803, + "num_input_tokens_seen": 515630088, + "step": 10002 + }, + { + "epoch": 0.9832682243679517, + "grad_norm": 22.675596237182617, + "learning_rate": 8e-05, + "loss": 32.2009, + "num_input_tokens_seen": 515809936, + "step": 10005 + }, + { + "epoch": 0.9835630574187366, + "grad_norm": 17.020742416381836, + "learning_rate": 8e-05, + "loss": 32.3546, + "num_input_tokens_seen": 515963532, + "step": 10008 + }, + { + "epoch": 0.9838578904695217, + "grad_norm": 19.389602661132812, + "learning_rate": 8e-05, + "loss": 32.8277, + "num_input_tokens_seen": 516124864, + "step": 10011 + }, + { + "epoch": 0.9841527235203066, + "grad_norm": 27.274927139282227, + "learning_rate": 8e-05, + "loss": 37.2066, + "num_input_tokens_seen": 516280980, + "step": 10014 + }, + { + "epoch": 0.9844475565710916, + "grad_norm": 21.650020599365234, + "learning_rate": 8e-05, + "loss": 34.0361, + "num_input_tokens_seen": 516413736, + "step": 10017 + }, + { + "epoch": 0.9847423896218767, + "grad_norm": 23.6706485748291, + "learning_rate": 8e-05, + "loss": 37.021, + "num_input_tokens_seen": 516578896, + "step": 10020 + }, + { + "epoch": 0.9850372226726616, + "grad_norm": 19.572914123535156, + "learning_rate": 8e-05, + "loss": 32.311, + "num_input_tokens_seen": 516759116, + "step": 10023 + }, + { + "epoch": 0.9853320557234466, + "grad_norm": 21.539064407348633, + "learning_rate": 8e-05, + "loss": 33.711, + "num_input_tokens_seen": 516912860, + "step": 10026 + }, + { + "epoch": 0.9856268887742315, + "grad_norm": 29.89957618713379, + "learning_rate": 8e-05, + "loss": 36.039, + "num_input_tokens_seen": 517066956, + "step": 10029 + }, + { + "epoch": 0.9859217218250166, + "grad_norm": 40.08900833129883, + "learning_rate": 8e-05, + "loss": 37.2409, + "num_input_tokens_seen": 517232724, + "step": 10032 + }, + { + "epoch": 0.9862165548758016, + "grad_norm": 23.061819076538086, + "learning_rate": 8e-05, + "loss": 38.555, + "num_input_tokens_seen": 517389860, + "step": 10035 + }, + { + "epoch": 0.9865113879265865, + "grad_norm": 22.41363525390625, + "learning_rate": 8e-05, + "loss": 33.8079, + "num_input_tokens_seen": 517542184, + "step": 10038 + }, + { + "epoch": 0.9868062209773716, + "grad_norm": 19.55914878845215, + "learning_rate": 8e-05, + "loss": 35.6898, + "num_input_tokens_seen": 517694116, + "step": 10041 + }, + { + "epoch": 0.9871010540281565, + "grad_norm": 22.108427047729492, + "learning_rate": 8e-05, + "loss": 37.0617, + "num_input_tokens_seen": 517869780, + "step": 10044 + }, + { + "epoch": 0.9873958870789415, + "grad_norm": 22.998760223388672, + "learning_rate": 8e-05, + "loss": 35.7518, + "num_input_tokens_seen": 518023508, + "step": 10047 + }, + { + "epoch": 0.9876907201297266, + "grad_norm": 21.714473724365234, + "learning_rate": 8e-05, + "loss": 33.4973, + "num_input_tokens_seen": 518175596, + "step": 10050 + }, + { + "epoch": 0.9879855531805115, + "grad_norm": 21.371646881103516, + "learning_rate": 8e-05, + "loss": 31.9276, + "num_input_tokens_seen": 518322968, + "step": 10053 + }, + { + "epoch": 0.9882803862312965, + "grad_norm": 28.615001678466797, + "learning_rate": 8e-05, + "loss": 34.2391, + "num_input_tokens_seen": 518484088, + "step": 10056 + }, + { + "epoch": 0.9885752192820815, + "grad_norm": 20.553464889526367, + "learning_rate": 8e-05, + "loss": 35.4243, + "num_input_tokens_seen": 518626344, + "step": 10059 + }, + { + "epoch": 0.9888700523328665, + "grad_norm": 18.18429183959961, + "learning_rate": 8e-05, + "loss": 31.8329, + "num_input_tokens_seen": 518788096, + "step": 10062 + }, + { + "epoch": 0.9891648853836515, + "grad_norm": 18.766237258911133, + "learning_rate": 8e-05, + "loss": 29.4481, + "num_input_tokens_seen": 518957216, + "step": 10065 + }, + { + "epoch": 0.9894597184344365, + "grad_norm": 21.449504852294922, + "learning_rate": 8e-05, + "loss": 33.9893, + "num_input_tokens_seen": 519103360, + "step": 10068 + }, + { + "epoch": 0.9897545514852215, + "grad_norm": 21.447940826416016, + "learning_rate": 8e-05, + "loss": 37.1803, + "num_input_tokens_seen": 519259144, + "step": 10071 + }, + { + "epoch": 0.9900493845360064, + "grad_norm": 21.051067352294922, + "learning_rate": 8e-05, + "loss": 31.7947, + "num_input_tokens_seen": 519414808, + "step": 10074 + }, + { + "epoch": 0.9903442175867915, + "grad_norm": 23.860065460205078, + "learning_rate": 8e-05, + "loss": 37.1056, + "num_input_tokens_seen": 519583132, + "step": 10077 + }, + { + "epoch": 0.9906390506375765, + "grad_norm": 22.17423439025879, + "learning_rate": 8e-05, + "loss": 31.622, + "num_input_tokens_seen": 519756728, + "step": 10080 + }, + { + "epoch": 0.9909338836883614, + "grad_norm": 56.88254928588867, + "learning_rate": 8e-05, + "loss": 31.7862, + "num_input_tokens_seen": 519898204, + "step": 10083 + }, + { + "epoch": 0.9912287167391465, + "grad_norm": 21.94686508178711, + "learning_rate": 8e-05, + "loss": 34.7363, + "num_input_tokens_seen": 520060116, + "step": 10086 + }, + { + "epoch": 0.9915235497899314, + "grad_norm": 19.87834358215332, + "learning_rate": 8e-05, + "loss": 31.9671, + "num_input_tokens_seen": 520198240, + "step": 10089 + }, + { + "epoch": 0.9918183828407164, + "grad_norm": 21.034942626953125, + "learning_rate": 8e-05, + "loss": 35.371, + "num_input_tokens_seen": 520331472, + "step": 10092 + }, + { + "epoch": 0.9921132158915015, + "grad_norm": 21.12396240234375, + "learning_rate": 8e-05, + "loss": 35.7865, + "num_input_tokens_seen": 520478780, + "step": 10095 + }, + { + "epoch": 0.9924080489422864, + "grad_norm": 20.821521759033203, + "learning_rate": 8e-05, + "loss": 30.3265, + "num_input_tokens_seen": 520624224, + "step": 10098 + }, + { + "epoch": 0.9927028819930714, + "grad_norm": 22.01003074645996, + "learning_rate": 8e-05, + "loss": 31.763, + "num_input_tokens_seen": 520782620, + "step": 10101 + }, + { + "epoch": 0.9929977150438564, + "grad_norm": 22.409605026245117, + "learning_rate": 8e-05, + "loss": 32.1014, + "num_input_tokens_seen": 520933140, + "step": 10104 + }, + { + "epoch": 0.9932925480946414, + "grad_norm": 21.183134078979492, + "learning_rate": 8e-05, + "loss": 32.2919, + "num_input_tokens_seen": 521090992, + "step": 10107 + }, + { + "epoch": 0.9935873811454264, + "grad_norm": 25.62685775756836, + "learning_rate": 8e-05, + "loss": 34.3917, + "num_input_tokens_seen": 521225684, + "step": 10110 + }, + { + "epoch": 0.9938822141962114, + "grad_norm": 27.680646896362305, + "learning_rate": 8e-05, + "loss": 34.8568, + "num_input_tokens_seen": 521374296, + "step": 10113 + }, + { + "epoch": 0.9941770472469964, + "grad_norm": 19.09922981262207, + "learning_rate": 8e-05, + "loss": 34.4068, + "num_input_tokens_seen": 521563176, + "step": 10116 + }, + { + "epoch": 0.9944718802977813, + "grad_norm": 27.627317428588867, + "learning_rate": 8e-05, + "loss": 38.226, + "num_input_tokens_seen": 521719468, + "step": 10119 + }, + { + "epoch": 0.9947667133485664, + "grad_norm": 20.5487003326416, + "learning_rate": 8e-05, + "loss": 35.3753, + "num_input_tokens_seen": 521881312, + "step": 10122 + }, + { + "epoch": 0.9950615463993514, + "grad_norm": 26.295984268188477, + "learning_rate": 8e-05, + "loss": 33.185, + "num_input_tokens_seen": 522020508, + "step": 10125 + }, + { + "epoch": 0.9953563794501363, + "grad_norm": 25.220104217529297, + "learning_rate": 8e-05, + "loss": 33.8547, + "num_input_tokens_seen": 522183960, + "step": 10128 + }, + { + "epoch": 0.9956512125009214, + "grad_norm": 32.48232650756836, + "learning_rate": 8e-05, + "loss": 35.6518, + "num_input_tokens_seen": 522335996, + "step": 10131 + }, + { + "epoch": 0.9959460455517063, + "grad_norm": 21.6962947845459, + "learning_rate": 8e-05, + "loss": 33.5165, + "num_input_tokens_seen": 522504656, + "step": 10134 + }, + { + "epoch": 0.9962408786024913, + "grad_norm": 20.183027267456055, + "learning_rate": 8e-05, + "loss": 32.3433, + "num_input_tokens_seen": 522656296, + "step": 10137 + }, + { + "epoch": 0.9965357116532764, + "grad_norm": 25.173770904541016, + "learning_rate": 8e-05, + "loss": 36.5661, + "num_input_tokens_seen": 522813752, + "step": 10140 + }, + { + "epoch": 0.9968305447040613, + "grad_norm": 24.288291931152344, + "learning_rate": 8e-05, + "loss": 36.8847, + "num_input_tokens_seen": 522965148, + "step": 10143 + }, + { + "epoch": 0.9971253777548463, + "grad_norm": 20.81410026550293, + "learning_rate": 8e-05, + "loss": 37.2171, + "num_input_tokens_seen": 523100376, + "step": 10146 + }, + { + "epoch": 0.9974202108056313, + "grad_norm": 20.889305114746094, + "learning_rate": 8e-05, + "loss": 31.6453, + "num_input_tokens_seen": 523248276, + "step": 10149 + }, + { + "epoch": 0.9977150438564163, + "grad_norm": 22.11374282836914, + "learning_rate": 8e-05, + "loss": 37.9344, + "num_input_tokens_seen": 523429220, + "step": 10152 + }, + { + "epoch": 0.9980098769072013, + "grad_norm": 21.695816040039062, + "learning_rate": 8e-05, + "loss": 38.5086, + "num_input_tokens_seen": 523578488, + "step": 10155 + }, + { + "epoch": 0.9983047099579863, + "grad_norm": 20.22509002685547, + "learning_rate": 8e-05, + "loss": 36.4175, + "num_input_tokens_seen": 523725232, + "step": 10158 + }, + { + "epoch": 0.9985995430087713, + "grad_norm": 22.71636962890625, + "learning_rate": 8e-05, + "loss": 33.4109, + "num_input_tokens_seen": 523883028, + "step": 10161 + }, + { + "epoch": 0.9988943760595562, + "grad_norm": 19.05315589904785, + "learning_rate": 8e-05, + "loss": 34.5973, + "num_input_tokens_seen": 524037912, + "step": 10164 + }, + { + "epoch": 0.9991892091103413, + "grad_norm": 19.2845401763916, + "learning_rate": 8e-05, + "loss": 35.7759, + "num_input_tokens_seen": 524190012, + "step": 10167 + }, + { + "epoch": 0.9994840421611263, + "grad_norm": 23.7741641998291, + "learning_rate": 8e-05, + "loss": 32.828, + "num_input_tokens_seen": 524345060, + "step": 10170 + }, + { + "epoch": 0.9997788752119112, + "grad_norm": 20.280067443847656, + "learning_rate": 8e-05, + "loss": 32.2748, + "num_input_tokens_seen": 524511820, + "step": 10173 + }, + { + "epoch": 0.9999754305791012, + "num_input_tokens_seen": 524625736, + "step": 10175, + "total_flos": 1.8734435060870185e+18, + "train_loss": 51.37127453965696, + "train_runtime": 54286.9149, + "train_samples_per_second": 11.996, + "train_steps_per_second": 0.187, + "train_tokens_per_second": 9664.096 + } + ], + "logging_steps": 3, + "max_steps": 10175, + "num_input_tokens_seen": 524625736, + "num_train_epochs": 1, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.8734435060870185e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}