diff --git "a/checkpoint-7000/trainer_state.json" "b/checkpoint-7000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-7000/trainer_state.json" @@ -0,0 +1,56348 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.49033772010472215, + "eval_steps": 200, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 7.004824572924602e-05, + "grad_norm": 6.222772121429443, + "learning_rate": 9.99930017513135e-05, + "loss": 1.1076, + "num_input_tokens_seen": 16384, + "step": 1 + }, + { + "epoch": 0.00014009649145849205, + "grad_norm": 6.042057037353516, + "learning_rate": 9.998600350262697e-05, + "loss": 1.1086, + "num_input_tokens_seen": 32768, + "step": 2 + }, + { + "epoch": 0.00021014473718773804, + "grad_norm": 7.119229316711426, + "learning_rate": 9.997900525394046e-05, + "loss": 1.4047, + "num_input_tokens_seen": 49152, + "step": 3 + }, + { + "epoch": 0.0002801929829169841, + "grad_norm": 7.133191108703613, + "learning_rate": 9.997200700525395e-05, + "loss": 1.3921, + "num_input_tokens_seen": 65536, + "step": 4 + }, + { + "epoch": 0.0003502412286462301, + "grad_norm": 6.1078338623046875, + "learning_rate": 9.996500875656743e-05, + "loss": 1.3171, + "num_input_tokens_seen": 81920, + "step": 5 + }, + { + "epoch": 0.0004202894743754761, + "grad_norm": 6.466420650482178, + "learning_rate": 9.995801050788092e-05, + "loss": 1.0732, + "num_input_tokens_seen": 97344, + "step": 6 + }, + { + "epoch": 0.0004903377201047221, + "grad_norm": 5.578189849853516, + "learning_rate": 9.99510122591944e-05, + "loss": 0.9929, + "num_input_tokens_seen": 113728, + "step": 7 + }, + { + "epoch": 0.0005603859658339682, + "grad_norm": 7.197720527648926, + "learning_rate": 9.994401401050789e-05, + "loss": 1.2512, + "num_input_tokens_seen": 129528, + "step": 8 + }, + { + "epoch": 0.0006304342115632141, + "grad_norm": 6.618913650512695, + "learning_rate": 9.993701576182136e-05, + "loss": 1.3495, + "num_input_tokens_seen": 145704, + "step": 9 + }, + { + "epoch": 0.0007004824572924602, + "grad_norm": 6.955508232116699, + "learning_rate": 9.993001751313485e-05, + "loss": 1.1823, + "num_input_tokens_seen": 161664, + "step": 10 + }, + { + "epoch": 0.0007705307030217062, + "grad_norm": 6.6807074546813965, + "learning_rate": 9.992301926444835e-05, + "loss": 1.1693, + "num_input_tokens_seen": 177960, + "step": 11 + }, + { + "epoch": 0.0008405789487509522, + "grad_norm": 6.784447193145752, + "learning_rate": 9.991602101576183e-05, + "loss": 1.3744, + "num_input_tokens_seen": 194344, + "step": 12 + }, + { + "epoch": 0.0009106271944801982, + "grad_norm": 6.7418437004089355, + "learning_rate": 9.990902276707532e-05, + "loss": 1.22, + "num_input_tokens_seen": 210728, + "step": 13 + }, + { + "epoch": 0.0009806754402094443, + "grad_norm": 6.43395471572876, + "learning_rate": 9.990202451838879e-05, + "loss": 1.1772, + "num_input_tokens_seen": 227112, + "step": 14 + }, + { + "epoch": 0.0010507236859386903, + "grad_norm": 6.09422492980957, + "learning_rate": 9.989502626970228e-05, + "loss": 1.195, + "num_input_tokens_seen": 243496, + "step": 15 + }, + { + "epoch": 0.0011207719316679364, + "grad_norm": 6.238271236419678, + "learning_rate": 9.988802802101577e-05, + "loss": 1.2623, + "num_input_tokens_seen": 259744, + "step": 16 + }, + { + "epoch": 0.0011908201773971822, + "grad_norm": 6.56187629699707, + "learning_rate": 9.988102977232926e-05, + "loss": 1.2721, + "num_input_tokens_seen": 276128, + "step": 17 + }, + { + "epoch": 0.0012608684231264283, + "grad_norm": 6.818358898162842, + "learning_rate": 9.987403152364275e-05, + "loss": 1.2649, + "num_input_tokens_seen": 292512, + "step": 18 + }, + { + "epoch": 0.0013309166688556743, + "grad_norm": 5.950352191925049, + "learning_rate": 9.986703327495622e-05, + "loss": 1.0024, + "num_input_tokens_seen": 308632, + "step": 19 + }, + { + "epoch": 0.0014009649145849204, + "grad_norm": 6.387479305267334, + "learning_rate": 9.986003502626971e-05, + "loss": 1.2783, + "num_input_tokens_seen": 325016, + "step": 20 + }, + { + "epoch": 0.0014710131603141664, + "grad_norm": 6.187346458435059, + "learning_rate": 9.985303677758318e-05, + "loss": 1.1701, + "num_input_tokens_seen": 341384, + "step": 21 + }, + { + "epoch": 0.0015410614060434125, + "grad_norm": 5.371951103210449, + "learning_rate": 9.984603852889667e-05, + "loss": 1.0483, + "num_input_tokens_seen": 357768, + "step": 22 + }, + { + "epoch": 0.0016111096517726585, + "grad_norm": 6.2206807136535645, + "learning_rate": 9.983904028021016e-05, + "loss": 1.2516, + "num_input_tokens_seen": 374152, + "step": 23 + }, + { + "epoch": 0.0016811578975019044, + "grad_norm": 6.121264457702637, + "learning_rate": 9.983204203152365e-05, + "loss": 1.1506, + "num_input_tokens_seen": 390536, + "step": 24 + }, + { + "epoch": 0.0017512061432311504, + "grad_norm": 6.353756904602051, + "learning_rate": 9.982504378283714e-05, + "loss": 1.3118, + "num_input_tokens_seen": 406920, + "step": 25 + }, + { + "epoch": 0.0018212543889603965, + "grad_norm": 6.270686149597168, + "learning_rate": 9.981804553415061e-05, + "loss": 1.0883, + "num_input_tokens_seen": 422728, + "step": 26 + }, + { + "epoch": 0.0018913026346896425, + "grad_norm": 6.117632865905762, + "learning_rate": 9.98110472854641e-05, + "loss": 1.3346, + "num_input_tokens_seen": 439112, + "step": 27 + }, + { + "epoch": 0.0019613508804188886, + "grad_norm": 6.429015159606934, + "learning_rate": 9.980404903677759e-05, + "loss": 1.2494, + "num_input_tokens_seen": 455144, + "step": 28 + }, + { + "epoch": 0.0020313991261481346, + "grad_norm": 6.4467620849609375, + "learning_rate": 9.979705078809107e-05, + "loss": 1.3335, + "num_input_tokens_seen": 470360, + "step": 29 + }, + { + "epoch": 0.0021014473718773807, + "grad_norm": 6.57926082611084, + "learning_rate": 9.979005253940455e-05, + "loss": 1.2126, + "num_input_tokens_seen": 486120, + "step": 30 + }, + { + "epoch": 0.0021714956176066267, + "grad_norm": 5.650569915771484, + "learning_rate": 9.978305429071804e-05, + "loss": 1.1363, + "num_input_tokens_seen": 501896, + "step": 31 + }, + { + "epoch": 0.0022415438633358728, + "grad_norm": 6.380292892456055, + "learning_rate": 9.977605604203153e-05, + "loss": 1.2251, + "num_input_tokens_seen": 517752, + "step": 32 + }, + { + "epoch": 0.002311592109065119, + "grad_norm": 5.704173564910889, + "learning_rate": 9.976905779334502e-05, + "loss": 1.1685, + "num_input_tokens_seen": 534136, + "step": 33 + }, + { + "epoch": 0.0023816403547943644, + "grad_norm": 5.342978000640869, + "learning_rate": 9.97620595446585e-05, + "loss": 1.2012, + "num_input_tokens_seen": 550216, + "step": 34 + }, + { + "epoch": 0.0024516886005236105, + "grad_norm": 5.7014241218566895, + "learning_rate": 9.975506129597198e-05, + "loss": 1.2342, + "num_input_tokens_seen": 566600, + "step": 35 + }, + { + "epoch": 0.0025217368462528565, + "grad_norm": 6.26229190826416, + "learning_rate": 9.974806304728546e-05, + "loss": 1.2041, + "num_input_tokens_seen": 582984, + "step": 36 + }, + { + "epoch": 0.0025917850919821026, + "grad_norm": 6.583463191986084, + "learning_rate": 9.974106479859896e-05, + "loss": 1.3021, + "num_input_tokens_seen": 598968, + "step": 37 + }, + { + "epoch": 0.0026618333377113486, + "grad_norm": 5.58498477935791, + "learning_rate": 9.973406654991245e-05, + "loss": 1.1622, + "num_input_tokens_seen": 614840, + "step": 38 + }, + { + "epoch": 0.0027318815834405947, + "grad_norm": 5.906906604766846, + "learning_rate": 9.972706830122592e-05, + "loss": 1.1971, + "num_input_tokens_seen": 631224, + "step": 39 + }, + { + "epoch": 0.0028019298291698407, + "grad_norm": 5.962359428405762, + "learning_rate": 9.972007005253941e-05, + "loss": 1.1326, + "num_input_tokens_seen": 647000, + "step": 40 + }, + { + "epoch": 0.002871978074899087, + "grad_norm": 6.447500705718994, + "learning_rate": 9.971307180385289e-05, + "loss": 1.0905, + "num_input_tokens_seen": 662480, + "step": 41 + }, + { + "epoch": 0.002942026320628333, + "grad_norm": 5.7290520668029785, + "learning_rate": 9.970607355516638e-05, + "loss": 1.3585, + "num_input_tokens_seen": 678480, + "step": 42 + }, + { + "epoch": 0.003012074566357579, + "grad_norm": 6.063445568084717, + "learning_rate": 9.969907530647987e-05, + "loss": 1.2841, + "num_input_tokens_seen": 694256, + "step": 43 + }, + { + "epoch": 0.003082122812086825, + "grad_norm": 5.302809238433838, + "learning_rate": 9.969207705779335e-05, + "loss": 1.1168, + "num_input_tokens_seen": 710152, + "step": 44 + }, + { + "epoch": 0.003152171057816071, + "grad_norm": 5.634128093719482, + "learning_rate": 9.968507880910684e-05, + "loss": 1.0609, + "num_input_tokens_seen": 726184, + "step": 45 + }, + { + "epoch": 0.003222219303545317, + "grad_norm": 5.652642726898193, + "learning_rate": 9.967808056042032e-05, + "loss": 1.2228, + "num_input_tokens_seen": 742520, + "step": 46 + }, + { + "epoch": 0.0032922675492745627, + "grad_norm": 5.340751647949219, + "learning_rate": 9.96710823117338e-05, + "loss": 1.0595, + "num_input_tokens_seen": 758904, + "step": 47 + }, + { + "epoch": 0.0033623157950038087, + "grad_norm": 5.422239780426025, + "learning_rate": 9.966408406304728e-05, + "loss": 1.1161, + "num_input_tokens_seen": 775040, + "step": 48 + }, + { + "epoch": 0.0034323640407330548, + "grad_norm": 5.29241418838501, + "learning_rate": 9.965708581436077e-05, + "loss": 1.0255, + "num_input_tokens_seen": 790856, + "step": 49 + }, + { + "epoch": 0.003502412286462301, + "grad_norm": 5.146270275115967, + "learning_rate": 9.965008756567426e-05, + "loss": 0.9762, + "num_input_tokens_seen": 807064, + "step": 50 + }, + { + "epoch": 0.003572460532191547, + "grad_norm": 5.825758457183838, + "learning_rate": 9.964308931698775e-05, + "loss": 1.2108, + "num_input_tokens_seen": 823448, + "step": 51 + }, + { + "epoch": 0.003642508777920793, + "grad_norm": 6.179538726806641, + "learning_rate": 9.963609106830124e-05, + "loss": 1.322, + "num_input_tokens_seen": 838888, + "step": 52 + }, + { + "epoch": 0.003712557023650039, + "grad_norm": 6.464454174041748, + "learning_rate": 9.962909281961471e-05, + "loss": 1.5077, + "num_input_tokens_seen": 855272, + "step": 53 + }, + { + "epoch": 0.003782605269379285, + "grad_norm": 5.4227294921875, + "learning_rate": 9.96220945709282e-05, + "loss": 1.2679, + "num_input_tokens_seen": 871656, + "step": 54 + }, + { + "epoch": 0.003852653515108531, + "grad_norm": 5.949041366577148, + "learning_rate": 9.961509632224169e-05, + "loss": 1.3618, + "num_input_tokens_seen": 888040, + "step": 55 + }, + { + "epoch": 0.003922701760837777, + "grad_norm": 6.050904750823975, + "learning_rate": 9.960809807355516e-05, + "loss": 1.3155, + "num_input_tokens_seen": 904400, + "step": 56 + }, + { + "epoch": 0.003992750006567023, + "grad_norm": 6.048308849334717, + "learning_rate": 9.960109982486866e-05, + "loss": 1.3131, + "num_input_tokens_seen": 919952, + "step": 57 + }, + { + "epoch": 0.004062798252296269, + "grad_norm": 5.683863162994385, + "learning_rate": 9.959410157618214e-05, + "loss": 1.1692, + "num_input_tokens_seen": 936336, + "step": 58 + }, + { + "epoch": 0.004132846498025515, + "grad_norm": 5.449287414550781, + "learning_rate": 9.958710332749563e-05, + "loss": 1.0613, + "num_input_tokens_seen": 952152, + "step": 59 + }, + { + "epoch": 0.004202894743754761, + "grad_norm": 5.31496524810791, + "learning_rate": 9.958010507880912e-05, + "loss": 0.9605, + "num_input_tokens_seen": 967824, + "step": 60 + }, + { + "epoch": 0.004272942989484007, + "grad_norm": 5.57105016708374, + "learning_rate": 9.957310683012259e-05, + "loss": 1.1701, + "num_input_tokens_seen": 983864, + "step": 61 + }, + { + "epoch": 0.004342991235213253, + "grad_norm": 5.3456830978393555, + "learning_rate": 9.956610858143608e-05, + "loss": 1.0995, + "num_input_tokens_seen": 1000248, + "step": 62 + }, + { + "epoch": 0.004413039480942499, + "grad_norm": 5.453295707702637, + "learning_rate": 9.955911033274957e-05, + "loss": 1.2413, + "num_input_tokens_seen": 1016632, + "step": 63 + }, + { + "epoch": 0.0044830877266717455, + "grad_norm": 4.975449562072754, + "learning_rate": 9.955211208406306e-05, + "loss": 1.0961, + "num_input_tokens_seen": 1033016, + "step": 64 + }, + { + "epoch": 0.004553135972400991, + "grad_norm": 5.542137145996094, + "learning_rate": 9.954511383537655e-05, + "loss": 1.1171, + "num_input_tokens_seen": 1049400, + "step": 65 + }, + { + "epoch": 0.004623184218130238, + "grad_norm": 5.213950157165527, + "learning_rate": 9.953811558669002e-05, + "loss": 1.2228, + "num_input_tokens_seen": 1065784, + "step": 66 + }, + { + "epoch": 0.004693232463859483, + "grad_norm": 5.496099948883057, + "learning_rate": 9.953111733800351e-05, + "loss": 1.1529, + "num_input_tokens_seen": 1082168, + "step": 67 + }, + { + "epoch": 0.004763280709588729, + "grad_norm": 5.64145565032959, + "learning_rate": 9.952411908931698e-05, + "loss": 1.2301, + "num_input_tokens_seen": 1098024, + "step": 68 + }, + { + "epoch": 0.004833328955317975, + "grad_norm": 5.566709995269775, + "learning_rate": 9.951712084063047e-05, + "loss": 1.2679, + "num_input_tokens_seen": 1114408, + "step": 69 + }, + { + "epoch": 0.004903377201047221, + "grad_norm": 6.443673133850098, + "learning_rate": 9.951012259194396e-05, + "loss": 1.2313, + "num_input_tokens_seen": 1130792, + "step": 70 + }, + { + "epoch": 0.0049734254467764675, + "grad_norm": 5.882962226867676, + "learning_rate": 9.950312434325745e-05, + "loss": 1.4304, + "num_input_tokens_seen": 1147176, + "step": 71 + }, + { + "epoch": 0.005043473692505713, + "grad_norm": 6.0052666664123535, + "learning_rate": 9.949612609457094e-05, + "loss": 1.3027, + "num_input_tokens_seen": 1160968, + "step": 72 + }, + { + "epoch": 0.0051135219382349596, + "grad_norm": 5.260256767272949, + "learning_rate": 9.948912784588441e-05, + "loss": 1.1526, + "num_input_tokens_seen": 1177352, + "step": 73 + }, + { + "epoch": 0.005183570183964205, + "grad_norm": 5.641814708709717, + "learning_rate": 9.94821295971979e-05, + "loss": 1.0666, + "num_input_tokens_seen": 1193032, + "step": 74 + }, + { + "epoch": 0.005253618429693452, + "grad_norm": 5.121115207672119, + "learning_rate": 9.947513134851138e-05, + "loss": 1.2404, + "num_input_tokens_seen": 1208952, + "step": 75 + }, + { + "epoch": 0.005323666675422697, + "grad_norm": 5.63930082321167, + "learning_rate": 9.946813309982487e-05, + "loss": 1.5127, + "num_input_tokens_seen": 1225000, + "step": 76 + }, + { + "epoch": 0.005393714921151944, + "grad_norm": 4.880716800689697, + "learning_rate": 9.946113485113837e-05, + "loss": 1.1484, + "num_input_tokens_seen": 1241384, + "step": 77 + }, + { + "epoch": 0.005463763166881189, + "grad_norm": 5.59611177444458, + "learning_rate": 9.945413660245184e-05, + "loss": 1.1678, + "num_input_tokens_seen": 1257680, + "step": 78 + }, + { + "epoch": 0.005533811412610436, + "grad_norm": 5.052026271820068, + "learning_rate": 9.944713835376533e-05, + "loss": 1.2207, + "num_input_tokens_seen": 1274064, + "step": 79 + }, + { + "epoch": 0.0056038596583396815, + "grad_norm": 5.285096168518066, + "learning_rate": 9.944014010507881e-05, + "loss": 1.1457, + "num_input_tokens_seen": 1290448, + "step": 80 + }, + { + "epoch": 0.005673907904068927, + "grad_norm": 5.4286580085754395, + "learning_rate": 9.94331418563923e-05, + "loss": 1.3047, + "num_input_tokens_seen": 1306832, + "step": 81 + }, + { + "epoch": 0.005743956149798174, + "grad_norm": 5.937953472137451, + "learning_rate": 9.942614360770578e-05, + "loss": 1.4353, + "num_input_tokens_seen": 1323216, + "step": 82 + }, + { + "epoch": 0.005814004395527419, + "grad_norm": 5.129006385803223, + "learning_rate": 9.941914535901927e-05, + "loss": 1.1434, + "num_input_tokens_seen": 1339408, + "step": 83 + }, + { + "epoch": 0.005884052641256666, + "grad_norm": 5.179675102233887, + "learning_rate": 9.941214711033276e-05, + "loss": 1.2452, + "num_input_tokens_seen": 1355792, + "step": 84 + }, + { + "epoch": 0.005954100886985911, + "grad_norm": 4.912832736968994, + "learning_rate": 9.940514886164624e-05, + "loss": 1.1255, + "num_input_tokens_seen": 1372176, + "step": 85 + }, + { + "epoch": 0.006024149132715158, + "grad_norm": 5.190899848937988, + "learning_rate": 9.939815061295973e-05, + "loss": 1.2543, + "num_input_tokens_seen": 1388560, + "step": 86 + }, + { + "epoch": 0.006094197378444403, + "grad_norm": 5.1751275062561035, + "learning_rate": 9.939115236427321e-05, + "loss": 1.3145, + "num_input_tokens_seen": 1404944, + "step": 87 + }, + { + "epoch": 0.00616424562417365, + "grad_norm": 5.450705528259277, + "learning_rate": 9.938415411558669e-05, + "loss": 1.2844, + "num_input_tokens_seen": 1421328, + "step": 88 + }, + { + "epoch": 0.0062342938699028955, + "grad_norm": 5.593935012817383, + "learning_rate": 9.937715586690018e-05, + "loss": 1.3284, + "num_input_tokens_seen": 1437464, + "step": 89 + }, + { + "epoch": 0.006304342115632142, + "grad_norm": 5.156428813934326, + "learning_rate": 9.937015761821367e-05, + "loss": 1.1682, + "num_input_tokens_seen": 1452952, + "step": 90 + }, + { + "epoch": 0.006374390361361388, + "grad_norm": 4.673638820648193, + "learning_rate": 9.936315936952715e-05, + "loss": 1.004, + "num_input_tokens_seen": 1469336, + "step": 91 + }, + { + "epoch": 0.006444438607090634, + "grad_norm": 4.996700763702393, + "learning_rate": 9.935616112084064e-05, + "loss": 1.087, + "num_input_tokens_seen": 1485448, + "step": 92 + }, + { + "epoch": 0.00651448685281988, + "grad_norm": 4.817474365234375, + "learning_rate": 9.934916287215412e-05, + "loss": 1.151, + "num_input_tokens_seen": 1501472, + "step": 93 + }, + { + "epoch": 0.006584535098549125, + "grad_norm": 5.400479316711426, + "learning_rate": 9.934216462346761e-05, + "loss": 1.3144, + "num_input_tokens_seen": 1516424, + "step": 94 + }, + { + "epoch": 0.006654583344278372, + "grad_norm": 5.232216835021973, + "learning_rate": 9.933516637478108e-05, + "loss": 1.0019, + "num_input_tokens_seen": 1532792, + "step": 95 + }, + { + "epoch": 0.006724631590007617, + "grad_norm": 5.392521381378174, + "learning_rate": 9.932816812609457e-05, + "loss": 1.3195, + "num_input_tokens_seen": 1548600, + "step": 96 + }, + { + "epoch": 0.006794679835736864, + "grad_norm": 5.5280866622924805, + "learning_rate": 9.932116987740806e-05, + "loss": 1.283, + "num_input_tokens_seen": 1564088, + "step": 97 + }, + { + "epoch": 0.0068647280814661095, + "grad_norm": 4.963179588317871, + "learning_rate": 9.931417162872155e-05, + "loss": 1.2716, + "num_input_tokens_seen": 1580040, + "step": 98 + }, + { + "epoch": 0.006934776327195356, + "grad_norm": 4.920302391052246, + "learning_rate": 9.930717338003504e-05, + "loss": 1.088, + "num_input_tokens_seen": 1595880, + "step": 99 + }, + { + "epoch": 0.007004824572924602, + "grad_norm": 4.935486793518066, + "learning_rate": 9.930017513134851e-05, + "loss": 1.0122, + "num_input_tokens_seen": 1611864, + "step": 100 + }, + { + "epoch": 0.007074872818653848, + "grad_norm": 5.099087238311768, + "learning_rate": 9.9293176882662e-05, + "loss": 1.1605, + "num_input_tokens_seen": 1627472, + "step": 101 + }, + { + "epoch": 0.007144921064383094, + "grad_norm": 5.3764328956604, + "learning_rate": 9.928617863397548e-05, + "loss": 1.2225, + "num_input_tokens_seen": 1643856, + "step": 102 + }, + { + "epoch": 0.00721496931011234, + "grad_norm": 5.281564712524414, + "learning_rate": 9.927918038528898e-05, + "loss": 1.1483, + "num_input_tokens_seen": 1660240, + "step": 103 + }, + { + "epoch": 0.007285017555841586, + "grad_norm": 5.395167827606201, + "learning_rate": 9.927218213660247e-05, + "loss": 1.6014, + "num_input_tokens_seen": 1676624, + "step": 104 + }, + { + "epoch": 0.007355065801570832, + "grad_norm": 5.322319507598877, + "learning_rate": 9.926518388791594e-05, + "loss": 1.0933, + "num_input_tokens_seen": 1693008, + "step": 105 + }, + { + "epoch": 0.007425114047300078, + "grad_norm": 5.301229953765869, + "learning_rate": 9.925818563922943e-05, + "loss": 1.1998, + "num_input_tokens_seen": 1708424, + "step": 106 + }, + { + "epoch": 0.0074951622930293236, + "grad_norm": 4.958597183227539, + "learning_rate": 9.92511873905429e-05, + "loss": 1.3285, + "num_input_tokens_seen": 1724808, + "step": 107 + }, + { + "epoch": 0.00756521053875857, + "grad_norm": 4.3913960456848145, + "learning_rate": 9.924418914185639e-05, + "loss": 0.9017, + "num_input_tokens_seen": 1740752, + "step": 108 + }, + { + "epoch": 0.007635258784487816, + "grad_norm": 5.401021480560303, + "learning_rate": 9.923719089316988e-05, + "loss": 1.3646, + "num_input_tokens_seen": 1755176, + "step": 109 + }, + { + "epoch": 0.007705307030217062, + "grad_norm": 4.894444942474365, + "learning_rate": 9.923019264448337e-05, + "loss": 0.9955, + "num_input_tokens_seen": 1771560, + "step": 110 + }, + { + "epoch": 0.007775355275946308, + "grad_norm": 4.878688335418701, + "learning_rate": 9.922319439579686e-05, + "loss": 1.1766, + "num_input_tokens_seen": 1787944, + "step": 111 + }, + { + "epoch": 0.007845403521675554, + "grad_norm": 4.9379777908325195, + "learning_rate": 9.921619614711033e-05, + "loss": 1.1631, + "num_input_tokens_seen": 1803568, + "step": 112 + }, + { + "epoch": 0.0079154517674048, + "grad_norm": 5.101811408996582, + "learning_rate": 9.920919789842382e-05, + "loss": 1.2165, + "num_input_tokens_seen": 1819952, + "step": 113 + }, + { + "epoch": 0.007985500013134045, + "grad_norm": 5.32574987411499, + "learning_rate": 9.920219964973731e-05, + "loss": 1.3012, + "num_input_tokens_seen": 1835296, + "step": 114 + }, + { + "epoch": 0.008055548258863293, + "grad_norm": 5.2391180992126465, + "learning_rate": 9.919520140105079e-05, + "loss": 1.2451, + "num_input_tokens_seen": 1851224, + "step": 115 + }, + { + "epoch": 0.008125596504592538, + "grad_norm": 4.865017890930176, + "learning_rate": 9.918820315236427e-05, + "loss": 1.1683, + "num_input_tokens_seen": 1867608, + "step": 116 + }, + { + "epoch": 0.008195644750321784, + "grad_norm": 4.943136215209961, + "learning_rate": 9.918120490367776e-05, + "loss": 1.31, + "num_input_tokens_seen": 1883696, + "step": 117 + }, + { + "epoch": 0.00826569299605103, + "grad_norm": 4.769871711730957, + "learning_rate": 9.917420665499125e-05, + "loss": 1.1212, + "num_input_tokens_seen": 1900080, + "step": 118 + }, + { + "epoch": 0.008335741241780275, + "grad_norm": 4.785780429840088, + "learning_rate": 9.916720840630474e-05, + "loss": 1.2415, + "num_input_tokens_seen": 1916464, + "step": 119 + }, + { + "epoch": 0.008405789487509523, + "grad_norm": 4.802333831787109, + "learning_rate": 9.916021015761822e-05, + "loss": 1.0513, + "num_input_tokens_seen": 1932848, + "step": 120 + }, + { + "epoch": 0.008475837733238768, + "grad_norm": 5.22212553024292, + "learning_rate": 9.91532119089317e-05, + "loss": 1.2574, + "num_input_tokens_seen": 1949232, + "step": 121 + }, + { + "epoch": 0.008545885978968014, + "grad_norm": 5.104204177856445, + "learning_rate": 9.914621366024518e-05, + "loss": 1.0436, + "num_input_tokens_seen": 1964184, + "step": 122 + }, + { + "epoch": 0.00861593422469726, + "grad_norm": 5.11055326461792, + "learning_rate": 9.913921541155868e-05, + "loss": 1.1939, + "num_input_tokens_seen": 1980568, + "step": 123 + }, + { + "epoch": 0.008685982470426507, + "grad_norm": 4.784866809844971, + "learning_rate": 9.913221716287216e-05, + "loss": 1.2056, + "num_input_tokens_seen": 1996952, + "step": 124 + }, + { + "epoch": 0.008756030716155752, + "grad_norm": 4.763037204742432, + "learning_rate": 9.912521891418564e-05, + "loss": 1.1403, + "num_input_tokens_seen": 2013336, + "step": 125 + }, + { + "epoch": 0.008826078961884998, + "grad_norm": 4.813408851623535, + "learning_rate": 9.911822066549913e-05, + "loss": 1.1897, + "num_input_tokens_seen": 2029720, + "step": 126 + }, + { + "epoch": 0.008896127207614244, + "grad_norm": 4.79008674621582, + "learning_rate": 9.911122241681261e-05, + "loss": 1.2315, + "num_input_tokens_seen": 2046104, + "step": 127 + }, + { + "epoch": 0.008966175453343491, + "grad_norm": 4.843508720397949, + "learning_rate": 9.91042241681261e-05, + "loss": 1.0883, + "num_input_tokens_seen": 2061592, + "step": 128 + }, + { + "epoch": 0.009036223699072737, + "grad_norm": 4.917592525482178, + "learning_rate": 9.909722591943959e-05, + "loss": 1.2512, + "num_input_tokens_seen": 2077792, + "step": 129 + }, + { + "epoch": 0.009106271944801982, + "grad_norm": 4.9154133796691895, + "learning_rate": 9.909022767075307e-05, + "loss": 1.3284, + "num_input_tokens_seen": 2094176, + "step": 130 + }, + { + "epoch": 0.009176320190531228, + "grad_norm": 5.2125420570373535, + "learning_rate": 9.908322942206656e-05, + "loss": 1.3469, + "num_input_tokens_seen": 2110480, + "step": 131 + }, + { + "epoch": 0.009246368436260475, + "grad_norm": 4.715712547302246, + "learning_rate": 9.907623117338004e-05, + "loss": 1.0844, + "num_input_tokens_seen": 2126864, + "step": 132 + }, + { + "epoch": 0.009316416681989721, + "grad_norm": 4.805694580078125, + "learning_rate": 9.906923292469353e-05, + "loss": 1.069, + "num_input_tokens_seen": 2142848, + "step": 133 + }, + { + "epoch": 0.009386464927718966, + "grad_norm": 4.961355209350586, + "learning_rate": 9.9062234676007e-05, + "loss": 1.3387, + "num_input_tokens_seen": 2159232, + "step": 134 + }, + { + "epoch": 0.009456513173448212, + "grad_norm": 4.582219123840332, + "learning_rate": 9.905523642732049e-05, + "loss": 1.2013, + "num_input_tokens_seen": 2175616, + "step": 135 + }, + { + "epoch": 0.009526561419177458, + "grad_norm": 5.195998191833496, + "learning_rate": 9.904823817863398e-05, + "loss": 1.2552, + "num_input_tokens_seen": 2191872, + "step": 136 + }, + { + "epoch": 0.009596609664906705, + "grad_norm": 4.934189319610596, + "learning_rate": 9.904123992994747e-05, + "loss": 1.2961, + "num_input_tokens_seen": 2208208, + "step": 137 + }, + { + "epoch": 0.00966665791063595, + "grad_norm": 4.981037616729736, + "learning_rate": 9.903424168126096e-05, + "loss": 1.1546, + "num_input_tokens_seen": 2224592, + "step": 138 + }, + { + "epoch": 0.009736706156365196, + "grad_norm": 5.469496250152588, + "learning_rate": 9.902724343257443e-05, + "loss": 1.3833, + "num_input_tokens_seen": 2240976, + "step": 139 + }, + { + "epoch": 0.009806754402094442, + "grad_norm": 4.889583587646484, + "learning_rate": 9.902024518388792e-05, + "loss": 1.2095, + "num_input_tokens_seen": 2257360, + "step": 140 + }, + { + "epoch": 0.00987680264782369, + "grad_norm": 4.532052516937256, + "learning_rate": 9.901324693520141e-05, + "loss": 1.143, + "num_input_tokens_seen": 2272848, + "step": 141 + }, + { + "epoch": 0.009946850893552935, + "grad_norm": 5.278079032897949, + "learning_rate": 9.900624868651488e-05, + "loss": 1.2849, + "num_input_tokens_seen": 2289232, + "step": 142 + }, + { + "epoch": 0.01001689913928218, + "grad_norm": 4.549891948699951, + "learning_rate": 9.899925043782839e-05, + "loss": 1.0482, + "num_input_tokens_seen": 2305424, + "step": 143 + }, + { + "epoch": 0.010086947385011426, + "grad_norm": 4.7777180671691895, + "learning_rate": 9.899225218914186e-05, + "loss": 1.1926, + "num_input_tokens_seen": 2320968, + "step": 144 + }, + { + "epoch": 0.010156995630740673, + "grad_norm": 4.320313453674316, + "learning_rate": 9.898525394045535e-05, + "loss": 1.0468, + "num_input_tokens_seen": 2337352, + "step": 145 + }, + { + "epoch": 0.010227043876469919, + "grad_norm": 4.915202617645264, + "learning_rate": 9.897825569176882e-05, + "loss": 1.1326, + "num_input_tokens_seen": 2353064, + "step": 146 + }, + { + "epoch": 0.010297092122199165, + "grad_norm": 4.569783687591553, + "learning_rate": 9.897125744308231e-05, + "loss": 0.8586, + "num_input_tokens_seen": 2369128, + "step": 147 + }, + { + "epoch": 0.01036714036792841, + "grad_norm": 4.591664791107178, + "learning_rate": 9.89642591943958e-05, + "loss": 1.1369, + "num_input_tokens_seen": 2385512, + "step": 148 + }, + { + "epoch": 0.010437188613657656, + "grad_norm": 4.913016319274902, + "learning_rate": 9.895726094570929e-05, + "loss": 1.1564, + "num_input_tokens_seen": 2401208, + "step": 149 + }, + { + "epoch": 0.010507236859386903, + "grad_norm": 4.908018112182617, + "learning_rate": 9.895026269702278e-05, + "loss": 1.1247, + "num_input_tokens_seen": 2417592, + "step": 150 + }, + { + "epoch": 0.010577285105116149, + "grad_norm": 4.536910057067871, + "learning_rate": 9.894326444833625e-05, + "loss": 1.014, + "num_input_tokens_seen": 2433976, + "step": 151 + }, + { + "epoch": 0.010647333350845395, + "grad_norm": 4.899227142333984, + "learning_rate": 9.893626619964974e-05, + "loss": 1.0418, + "num_input_tokens_seen": 2448072, + "step": 152 + }, + { + "epoch": 0.01071738159657464, + "grad_norm": 4.600861072540283, + "learning_rate": 9.892926795096323e-05, + "loss": 1.0459, + "num_input_tokens_seen": 2464240, + "step": 153 + }, + { + "epoch": 0.010787429842303888, + "grad_norm": 4.707681179046631, + "learning_rate": 9.89222697022767e-05, + "loss": 1.0859, + "num_input_tokens_seen": 2480624, + "step": 154 + }, + { + "epoch": 0.010857478088033133, + "grad_norm": 4.748518466949463, + "learning_rate": 9.89152714535902e-05, + "loss": 1.0608, + "num_input_tokens_seen": 2497008, + "step": 155 + }, + { + "epoch": 0.010927526333762379, + "grad_norm": 4.794179439544678, + "learning_rate": 9.890827320490368e-05, + "loss": 1.2243, + "num_input_tokens_seen": 2513392, + "step": 156 + }, + { + "epoch": 0.010997574579491624, + "grad_norm": 4.593925476074219, + "learning_rate": 9.890127495621717e-05, + "loss": 1.1002, + "num_input_tokens_seen": 2529776, + "step": 157 + }, + { + "epoch": 0.011067622825220872, + "grad_norm": 4.318257808685303, + "learning_rate": 9.889427670753066e-05, + "loss": 0.9561, + "num_input_tokens_seen": 2546160, + "step": 158 + }, + { + "epoch": 0.011137671070950117, + "grad_norm": 4.631777286529541, + "learning_rate": 9.888727845884414e-05, + "loss": 1.1553, + "num_input_tokens_seen": 2562544, + "step": 159 + }, + { + "epoch": 0.011207719316679363, + "grad_norm": 4.896609783172607, + "learning_rate": 9.888028021015762e-05, + "loss": 1.1779, + "num_input_tokens_seen": 2578088, + "step": 160 + }, + { + "epoch": 0.011277767562408609, + "grad_norm": 4.3978681564331055, + "learning_rate": 9.88732819614711e-05, + "loss": 1.1778, + "num_input_tokens_seen": 2594416, + "step": 161 + }, + { + "epoch": 0.011347815808137854, + "grad_norm": 4.82927942276001, + "learning_rate": 9.886628371278459e-05, + "loss": 1.0339, + "num_input_tokens_seen": 2609776, + "step": 162 + }, + { + "epoch": 0.011417864053867102, + "grad_norm": 4.413319110870361, + "learning_rate": 9.885928546409809e-05, + "loss": 1.0992, + "num_input_tokens_seen": 2626160, + "step": 163 + }, + { + "epoch": 0.011487912299596347, + "grad_norm": 4.626354694366455, + "learning_rate": 9.885228721541156e-05, + "loss": 1.1948, + "num_input_tokens_seen": 2642464, + "step": 164 + }, + { + "epoch": 0.011557960545325593, + "grad_norm": 4.328434467315674, + "learning_rate": 9.884528896672505e-05, + "loss": 1.1493, + "num_input_tokens_seen": 2658528, + "step": 165 + }, + { + "epoch": 0.011628008791054838, + "grad_norm": 4.57839822769165, + "learning_rate": 9.883829071803853e-05, + "loss": 1.0775, + "num_input_tokens_seen": 2674912, + "step": 166 + }, + { + "epoch": 0.011698057036784086, + "grad_norm": 5.103973865509033, + "learning_rate": 9.883129246935202e-05, + "loss": 1.2458, + "num_input_tokens_seen": 2690792, + "step": 167 + }, + { + "epoch": 0.011768105282513331, + "grad_norm": 4.558016300201416, + "learning_rate": 9.88242942206655e-05, + "loss": 1.0122, + "num_input_tokens_seen": 2705616, + "step": 168 + }, + { + "epoch": 0.011838153528242577, + "grad_norm": 4.811260223388672, + "learning_rate": 9.8817295971979e-05, + "loss": 1.2989, + "num_input_tokens_seen": 2721704, + "step": 169 + }, + { + "epoch": 0.011908201773971823, + "grad_norm": 4.726966857910156, + "learning_rate": 9.881029772329248e-05, + "loss": 1.176, + "num_input_tokens_seen": 2738088, + "step": 170 + }, + { + "epoch": 0.01197825001970107, + "grad_norm": 4.874902725219727, + "learning_rate": 9.880329947460596e-05, + "loss": 1.2586, + "num_input_tokens_seen": 2754040, + "step": 171 + }, + { + "epoch": 0.012048298265430316, + "grad_norm": 4.379549980163574, + "learning_rate": 9.879630122591945e-05, + "loss": 1.1771, + "num_input_tokens_seen": 2770424, + "step": 172 + }, + { + "epoch": 0.012118346511159561, + "grad_norm": 4.455331802368164, + "learning_rate": 9.878930297723292e-05, + "loss": 1.0714, + "num_input_tokens_seen": 2786808, + "step": 173 + }, + { + "epoch": 0.012188394756888807, + "grad_norm": 4.42273473739624, + "learning_rate": 9.878230472854641e-05, + "loss": 1.1798, + "num_input_tokens_seen": 2803176, + "step": 174 + }, + { + "epoch": 0.012258443002618052, + "grad_norm": 4.4078874588012695, + "learning_rate": 9.87753064798599e-05, + "loss": 1.1672, + "num_input_tokens_seen": 2819448, + "step": 175 + }, + { + "epoch": 0.0123284912483473, + "grad_norm": 4.79048490524292, + "learning_rate": 9.876830823117339e-05, + "loss": 1.3331, + "num_input_tokens_seen": 2835832, + "step": 176 + }, + { + "epoch": 0.012398539494076545, + "grad_norm": 4.212133884429932, + "learning_rate": 9.876130998248688e-05, + "loss": 1.0007, + "num_input_tokens_seen": 2851776, + "step": 177 + }, + { + "epoch": 0.012468587739805791, + "grad_norm": 5.7587738037109375, + "learning_rate": 9.875431173380035e-05, + "loss": 1.4729, + "num_input_tokens_seen": 2867896, + "step": 178 + }, + { + "epoch": 0.012538635985535037, + "grad_norm": 4.3469462394714355, + "learning_rate": 9.874731348511384e-05, + "loss": 0.957, + "num_input_tokens_seen": 2884280, + "step": 179 + }, + { + "epoch": 0.012608684231264284, + "grad_norm": 4.584625244140625, + "learning_rate": 9.874031523642733e-05, + "loss": 1.0753, + "num_input_tokens_seen": 2899208, + "step": 180 + }, + { + "epoch": 0.01267873247699353, + "grad_norm": 4.544627666473389, + "learning_rate": 9.87333169877408e-05, + "loss": 1.1706, + "num_input_tokens_seen": 2915416, + "step": 181 + }, + { + "epoch": 0.012748780722722775, + "grad_norm": 4.8749237060546875, + "learning_rate": 9.872631873905429e-05, + "loss": 1.3382, + "num_input_tokens_seen": 2931360, + "step": 182 + }, + { + "epoch": 0.01281882896845202, + "grad_norm": 4.593903541564941, + "learning_rate": 9.871932049036778e-05, + "loss": 1.1588, + "num_input_tokens_seen": 2947744, + "step": 183 + }, + { + "epoch": 0.012888877214181268, + "grad_norm": 4.478219509124756, + "learning_rate": 9.871232224168127e-05, + "loss": 1.1013, + "num_input_tokens_seen": 2963664, + "step": 184 + }, + { + "epoch": 0.012958925459910514, + "grad_norm": 5.028106212615967, + "learning_rate": 9.870532399299476e-05, + "loss": 1.3223, + "num_input_tokens_seen": 2980048, + "step": 185 + }, + { + "epoch": 0.01302897370563976, + "grad_norm": 4.866946697235107, + "learning_rate": 9.869832574430823e-05, + "loss": 1.2376, + "num_input_tokens_seen": 2995992, + "step": 186 + }, + { + "epoch": 0.013099021951369005, + "grad_norm": 4.421341419219971, + "learning_rate": 9.869132749562172e-05, + "loss": 1.2252, + "num_input_tokens_seen": 3012000, + "step": 187 + }, + { + "epoch": 0.01316907019709825, + "grad_norm": 4.88083028793335, + "learning_rate": 9.86843292469352e-05, + "loss": 1.2951, + "num_input_tokens_seen": 3028384, + "step": 188 + }, + { + "epoch": 0.013239118442827498, + "grad_norm": 4.654318809509277, + "learning_rate": 9.86773309982487e-05, + "loss": 1.2839, + "num_input_tokens_seen": 3044768, + "step": 189 + }, + { + "epoch": 0.013309166688556744, + "grad_norm": 4.626763820648193, + "learning_rate": 9.867033274956219e-05, + "loss": 1.2389, + "num_input_tokens_seen": 3061152, + "step": 190 + }, + { + "epoch": 0.01337921493428599, + "grad_norm": 4.178484916687012, + "learning_rate": 9.866333450087566e-05, + "loss": 1.1186, + "num_input_tokens_seen": 3077056, + "step": 191 + }, + { + "epoch": 0.013449263180015235, + "grad_norm": 4.755034923553467, + "learning_rate": 9.865633625218915e-05, + "loss": 1.0594, + "num_input_tokens_seen": 3093400, + "step": 192 + }, + { + "epoch": 0.013519311425744482, + "grad_norm": 4.437506198883057, + "learning_rate": 9.864933800350263e-05, + "loss": 1.2078, + "num_input_tokens_seen": 3109784, + "step": 193 + }, + { + "epoch": 0.013589359671473728, + "grad_norm": 5.140488624572754, + "learning_rate": 9.864233975481611e-05, + "loss": 1.4312, + "num_input_tokens_seen": 3124976, + "step": 194 + }, + { + "epoch": 0.013659407917202973, + "grad_norm": 4.72155237197876, + "learning_rate": 9.86353415061296e-05, + "loss": 1.1752, + "num_input_tokens_seen": 3140632, + "step": 195 + }, + { + "epoch": 0.013729456162932219, + "grad_norm": 4.914645671844482, + "learning_rate": 9.862834325744309e-05, + "loss": 1.2464, + "num_input_tokens_seen": 3156616, + "step": 196 + }, + { + "epoch": 0.013799504408661466, + "grad_norm": 4.23387336730957, + "learning_rate": 9.862134500875658e-05, + "loss": 0.9722, + "num_input_tokens_seen": 3172840, + "step": 197 + }, + { + "epoch": 0.013869552654390712, + "grad_norm": 4.659370422363281, + "learning_rate": 9.861434676007005e-05, + "loss": 1.1981, + "num_input_tokens_seen": 3188584, + "step": 198 + }, + { + "epoch": 0.013939600900119958, + "grad_norm": 4.580902576446533, + "learning_rate": 9.860734851138354e-05, + "loss": 1.1913, + "num_input_tokens_seen": 3204432, + "step": 199 + }, + { + "epoch": 0.014009649145849203, + "grad_norm": 4.208237648010254, + "learning_rate": 9.860035026269702e-05, + "loss": 1.2056, + "num_input_tokens_seen": 3220816, + "step": 200 + }, + { + "epoch": 0.014009649145849203, + "eval_loss": 1.2226407527923584, + "eval_runtime": 0.3992, + "eval_samples_per_second": 2.505, + "eval_steps_per_second": 2.505, + "num_input_tokens_seen": 3220816, + "step": 200 + }, + { + "epoch": 0.014079697391578449, + "grad_norm": 4.526260852813721, + "learning_rate": 9.85933520140105e-05, + "loss": 1.0488, + "num_input_tokens_seen": 3237200, + "step": 201 + }, + { + "epoch": 0.014149745637307696, + "grad_norm": 4.46895170211792, + "learning_rate": 9.8586353765324e-05, + "loss": 1.1101, + "num_input_tokens_seen": 3253336, + "step": 202 + }, + { + "epoch": 0.014219793883036942, + "grad_norm": 4.367347717285156, + "learning_rate": 9.857935551663748e-05, + "loss": 1.0425, + "num_input_tokens_seen": 3269632, + "step": 203 + }, + { + "epoch": 0.014289842128766187, + "grad_norm": 4.860860347747803, + "learning_rate": 9.857235726795097e-05, + "loss": 1.4068, + "num_input_tokens_seen": 3285432, + "step": 204 + }, + { + "epoch": 0.014359890374495433, + "grad_norm": 4.336480617523193, + "learning_rate": 9.856535901926445e-05, + "loss": 1.2579, + "num_input_tokens_seen": 3301632, + "step": 205 + }, + { + "epoch": 0.01442993862022468, + "grad_norm": 4.587873458862305, + "learning_rate": 9.855836077057794e-05, + "loss": 1.1508, + "num_input_tokens_seen": 3318016, + "step": 206 + }, + { + "epoch": 0.014499986865953926, + "grad_norm": 4.719262599945068, + "learning_rate": 9.855136252189142e-05, + "loss": 1.0208, + "num_input_tokens_seen": 3333168, + "step": 207 + }, + { + "epoch": 0.014570035111683172, + "grad_norm": 4.419138431549072, + "learning_rate": 9.85443642732049e-05, + "loss": 1.2576, + "num_input_tokens_seen": 3349384, + "step": 208 + }, + { + "epoch": 0.014640083357412417, + "grad_norm": 4.3150835037231445, + "learning_rate": 9.85373660245184e-05, + "loss": 1.1786, + "num_input_tokens_seen": 3365768, + "step": 209 + }, + { + "epoch": 0.014710131603141665, + "grad_norm": 4.5917649269104, + "learning_rate": 9.853036777583188e-05, + "loss": 1.2821, + "num_input_tokens_seen": 3382152, + "step": 210 + }, + { + "epoch": 0.01478017984887091, + "grad_norm": 4.9094343185424805, + "learning_rate": 9.852336952714537e-05, + "loss": 1.2415, + "num_input_tokens_seen": 3397896, + "step": 211 + }, + { + "epoch": 0.014850228094600156, + "grad_norm": 4.394861698150635, + "learning_rate": 9.851637127845885e-05, + "loss": 1.1776, + "num_input_tokens_seen": 3414280, + "step": 212 + }, + { + "epoch": 0.014920276340329401, + "grad_norm": 4.196374416351318, + "learning_rate": 9.850937302977233e-05, + "loss": 1.065, + "num_input_tokens_seen": 3430584, + "step": 213 + }, + { + "epoch": 0.014990324586058647, + "grad_norm": 4.728682518005371, + "learning_rate": 9.850237478108582e-05, + "loss": 1.2686, + "num_input_tokens_seen": 3446968, + "step": 214 + }, + { + "epoch": 0.015060372831787894, + "grad_norm": 4.291411876678467, + "learning_rate": 9.84953765323993e-05, + "loss": 1.1877, + "num_input_tokens_seen": 3462568, + "step": 215 + }, + { + "epoch": 0.01513042107751714, + "grad_norm": 4.405060768127441, + "learning_rate": 9.84883782837128e-05, + "loss": 1.2873, + "num_input_tokens_seen": 3478952, + "step": 216 + }, + { + "epoch": 0.015200469323246386, + "grad_norm": 4.254365921020508, + "learning_rate": 9.848138003502628e-05, + "loss": 1.1062, + "num_input_tokens_seen": 3495304, + "step": 217 + }, + { + "epoch": 0.015270517568975631, + "grad_norm": 4.741672039031982, + "learning_rate": 9.847438178633976e-05, + "loss": 1.1983, + "num_input_tokens_seen": 3511688, + "step": 218 + }, + { + "epoch": 0.015340565814704879, + "grad_norm": 4.352742671966553, + "learning_rate": 9.846738353765325e-05, + "loss": 1.2028, + "num_input_tokens_seen": 3528072, + "step": 219 + }, + { + "epoch": 0.015410614060434124, + "grad_norm": 4.996603488922119, + "learning_rate": 9.846038528896672e-05, + "loss": 1.1561, + "num_input_tokens_seen": 3542904, + "step": 220 + }, + { + "epoch": 0.01548066230616337, + "grad_norm": 4.911815166473389, + "learning_rate": 9.845338704028021e-05, + "loss": 1.3375, + "num_input_tokens_seen": 3558352, + "step": 221 + }, + { + "epoch": 0.015550710551892616, + "grad_norm": 4.638419151306152, + "learning_rate": 9.84463887915937e-05, + "loss": 1.1963, + "num_input_tokens_seen": 3574736, + "step": 222 + }, + { + "epoch": 0.015620758797621863, + "grad_norm": 4.323521614074707, + "learning_rate": 9.843939054290719e-05, + "loss": 1.1224, + "num_input_tokens_seen": 3591120, + "step": 223 + }, + { + "epoch": 0.01569080704335111, + "grad_norm": 4.466544151306152, + "learning_rate": 9.843239229422068e-05, + "loss": 1.3988, + "num_input_tokens_seen": 3607392, + "step": 224 + }, + { + "epoch": 0.015760855289080354, + "grad_norm": 4.476973533630371, + "learning_rate": 9.842539404553415e-05, + "loss": 1.184, + "num_input_tokens_seen": 3623776, + "step": 225 + }, + { + "epoch": 0.0158309035348096, + "grad_norm": 4.648625373840332, + "learning_rate": 9.841839579684764e-05, + "loss": 1.1768, + "num_input_tokens_seen": 3640008, + "step": 226 + }, + { + "epoch": 0.015900951780538845, + "grad_norm": 4.364476203918457, + "learning_rate": 9.841139754816112e-05, + "loss": 1.0208, + "num_input_tokens_seen": 3656392, + "step": 227 + }, + { + "epoch": 0.01597100002626809, + "grad_norm": 4.3054633140563965, + "learning_rate": 9.84043992994746e-05, + "loss": 1.1215, + "num_input_tokens_seen": 3672392, + "step": 228 + }, + { + "epoch": 0.016041048271997337, + "grad_norm": 4.83436918258667, + "learning_rate": 9.83974010507881e-05, + "loss": 1.2284, + "num_input_tokens_seen": 3688776, + "step": 229 + }, + { + "epoch": 0.016111096517726586, + "grad_norm": 4.447519779205322, + "learning_rate": 9.839040280210158e-05, + "loss": 1.1765, + "num_input_tokens_seen": 3705080, + "step": 230 + }, + { + "epoch": 0.01618114476345583, + "grad_norm": 4.269217491149902, + "learning_rate": 9.838340455341507e-05, + "loss": 1.0466, + "num_input_tokens_seen": 3721464, + "step": 231 + }, + { + "epoch": 0.016251193009185077, + "grad_norm": 4.41223669052124, + "learning_rate": 9.837640630472854e-05, + "loss": 1.2098, + "num_input_tokens_seen": 3737184, + "step": 232 + }, + { + "epoch": 0.016321241254914323, + "grad_norm": 4.632737159729004, + "learning_rate": 9.836940805604203e-05, + "loss": 1.1562, + "num_input_tokens_seen": 3753192, + "step": 233 + }, + { + "epoch": 0.016391289500643568, + "grad_norm": 4.379425525665283, + "learning_rate": 9.836240980735552e-05, + "loss": 1.1219, + "num_input_tokens_seen": 3767976, + "step": 234 + }, + { + "epoch": 0.016461337746372814, + "grad_norm": 4.28551721572876, + "learning_rate": 9.835541155866901e-05, + "loss": 1.0259, + "num_input_tokens_seen": 3784008, + "step": 235 + }, + { + "epoch": 0.01653138599210206, + "grad_norm": 4.642453670501709, + "learning_rate": 9.83484133099825e-05, + "loss": 1.1684, + "num_input_tokens_seen": 3800000, + "step": 236 + }, + { + "epoch": 0.016601434237831305, + "grad_norm": 4.367178440093994, + "learning_rate": 9.834141506129597e-05, + "loss": 1.2877, + "num_input_tokens_seen": 3816384, + "step": 237 + }, + { + "epoch": 0.01667148248356055, + "grad_norm": 4.5724005699157715, + "learning_rate": 9.833441681260946e-05, + "loss": 1.1814, + "num_input_tokens_seen": 3830328, + "step": 238 + }, + { + "epoch": 0.0167415307292898, + "grad_norm": 4.318159580230713, + "learning_rate": 9.832741856392295e-05, + "loss": 1.1143, + "num_input_tokens_seen": 3846712, + "step": 239 + }, + { + "epoch": 0.016811578975019045, + "grad_norm": 4.408501625061035, + "learning_rate": 9.832042031523643e-05, + "loss": 1.1508, + "num_input_tokens_seen": 3861776, + "step": 240 + }, + { + "epoch": 0.01688162722074829, + "grad_norm": 4.20060920715332, + "learning_rate": 9.831342206654991e-05, + "loss": 1.209, + "num_input_tokens_seen": 3877736, + "step": 241 + }, + { + "epoch": 0.016951675466477537, + "grad_norm": 4.431649208068848, + "learning_rate": 9.83064238178634e-05, + "loss": 1.2458, + "num_input_tokens_seen": 3893320, + "step": 242 + }, + { + "epoch": 0.017021723712206782, + "grad_norm": 4.000490188598633, + "learning_rate": 9.829942556917689e-05, + "loss": 1.0274, + "num_input_tokens_seen": 3909704, + "step": 243 + }, + { + "epoch": 0.017091771957936028, + "grad_norm": 4.703495025634766, + "learning_rate": 9.829242732049038e-05, + "loss": 1.1711, + "num_input_tokens_seen": 3925808, + "step": 244 + }, + { + "epoch": 0.017161820203665273, + "grad_norm": 4.639338970184326, + "learning_rate": 9.828542907180386e-05, + "loss": 1.3046, + "num_input_tokens_seen": 3942192, + "step": 245 + }, + { + "epoch": 0.01723186844939452, + "grad_norm": 4.414276599884033, + "learning_rate": 9.827843082311734e-05, + "loss": 1.271, + "num_input_tokens_seen": 3958528, + "step": 246 + }, + { + "epoch": 0.017301916695123768, + "grad_norm": 4.404853820800781, + "learning_rate": 9.827143257443082e-05, + "loss": 1.0693, + "num_input_tokens_seen": 3974912, + "step": 247 + }, + { + "epoch": 0.017371964940853014, + "grad_norm": 4.519491195678711, + "learning_rate": 9.826443432574431e-05, + "loss": 1.2894, + "num_input_tokens_seen": 3991296, + "step": 248 + }, + { + "epoch": 0.01744201318658226, + "grad_norm": 4.261727809906006, + "learning_rate": 9.825743607705781e-05, + "loss": 1.2059, + "num_input_tokens_seen": 4006544, + "step": 249 + }, + { + "epoch": 0.017512061432311505, + "grad_norm": 4.102485656738281, + "learning_rate": 9.825043782837129e-05, + "loss": 0.9365, + "num_input_tokens_seen": 4022320, + "step": 250 + }, + { + "epoch": 0.01758210967804075, + "grad_norm": 4.804764270782471, + "learning_rate": 9.824343957968477e-05, + "loss": 1.3344, + "num_input_tokens_seen": 4037048, + "step": 251 + }, + { + "epoch": 0.017652157923769996, + "grad_norm": 4.130600452423096, + "learning_rate": 9.823644133099825e-05, + "loss": 1.2349, + "num_input_tokens_seen": 4053432, + "step": 252 + }, + { + "epoch": 0.017722206169499242, + "grad_norm": 4.234742641448975, + "learning_rate": 9.822944308231174e-05, + "loss": 1.1371, + "num_input_tokens_seen": 4069816, + "step": 253 + }, + { + "epoch": 0.017792254415228487, + "grad_norm": 4.754928112030029, + "learning_rate": 9.822244483362521e-05, + "loss": 1.5168, + "num_input_tokens_seen": 4085864, + "step": 254 + }, + { + "epoch": 0.017862302660957733, + "grad_norm": 4.542768478393555, + "learning_rate": 9.821544658493871e-05, + "loss": 1.1943, + "num_input_tokens_seen": 4102240, + "step": 255 + }, + { + "epoch": 0.017932350906686982, + "grad_norm": 4.411310195922852, + "learning_rate": 9.82084483362522e-05, + "loss": 1.2694, + "num_input_tokens_seen": 4118544, + "step": 256 + }, + { + "epoch": 0.018002399152416228, + "grad_norm": 4.205377101898193, + "learning_rate": 9.820145008756568e-05, + "loss": 1.1581, + "num_input_tokens_seen": 4134928, + "step": 257 + }, + { + "epoch": 0.018072447398145473, + "grad_norm": 4.451165199279785, + "learning_rate": 9.819445183887917e-05, + "loss": 1.089, + "num_input_tokens_seen": 4150848, + "step": 258 + }, + { + "epoch": 0.01814249564387472, + "grad_norm": 4.366336822509766, + "learning_rate": 9.818745359019264e-05, + "loss": 1.1767, + "num_input_tokens_seen": 4167184, + "step": 259 + }, + { + "epoch": 0.018212543889603965, + "grad_norm": 4.394649982452393, + "learning_rate": 9.818045534150613e-05, + "loss": 1.0741, + "num_input_tokens_seen": 4183376, + "step": 260 + }, + { + "epoch": 0.01828259213533321, + "grad_norm": 4.344518184661865, + "learning_rate": 9.817345709281962e-05, + "loss": 1.2282, + "num_input_tokens_seen": 4199760, + "step": 261 + }, + { + "epoch": 0.018352640381062456, + "grad_norm": 4.403041362762451, + "learning_rate": 9.816645884413311e-05, + "loss": 1.2317, + "num_input_tokens_seen": 4215816, + "step": 262 + }, + { + "epoch": 0.0184226886267917, + "grad_norm": 4.715320110321045, + "learning_rate": 9.81594605954466e-05, + "loss": 1.3074, + "num_input_tokens_seen": 4231504, + "step": 263 + }, + { + "epoch": 0.01849273687252095, + "grad_norm": 4.5754265785217285, + "learning_rate": 9.815246234676007e-05, + "loss": 1.253, + "num_input_tokens_seen": 4247888, + "step": 264 + }, + { + "epoch": 0.018562785118250196, + "grad_norm": 4.2346930503845215, + "learning_rate": 9.814546409807356e-05, + "loss": 1.1727, + "num_input_tokens_seen": 4264248, + "step": 265 + }, + { + "epoch": 0.018632833363979442, + "grad_norm": 4.186713218688965, + "learning_rate": 9.813846584938705e-05, + "loss": 1.2693, + "num_input_tokens_seen": 4280632, + "step": 266 + }, + { + "epoch": 0.018702881609708687, + "grad_norm": 4.6356706619262695, + "learning_rate": 9.813146760070052e-05, + "loss": 1.3755, + "num_input_tokens_seen": 4296648, + "step": 267 + }, + { + "epoch": 0.018772929855437933, + "grad_norm": 4.466466903686523, + "learning_rate": 9.812446935201401e-05, + "loss": 1.283, + "num_input_tokens_seen": 4311408, + "step": 268 + }, + { + "epoch": 0.01884297810116718, + "grad_norm": 4.3369140625, + "learning_rate": 9.81174711033275e-05, + "loss": 1.1555, + "num_input_tokens_seen": 4326736, + "step": 269 + }, + { + "epoch": 0.018913026346896424, + "grad_norm": 4.434782028198242, + "learning_rate": 9.811047285464099e-05, + "loss": 1.2859, + "num_input_tokens_seen": 4343120, + "step": 270 + }, + { + "epoch": 0.01898307459262567, + "grad_norm": 4.346708297729492, + "learning_rate": 9.810347460595448e-05, + "loss": 1.1421, + "num_input_tokens_seen": 4359504, + "step": 271 + }, + { + "epoch": 0.019053122838354915, + "grad_norm": 4.529878616333008, + "learning_rate": 9.809647635726795e-05, + "loss": 1.2654, + "num_input_tokens_seen": 4375888, + "step": 272 + }, + { + "epoch": 0.019123171084084165, + "grad_norm": 4.051745891571045, + "learning_rate": 9.808947810858144e-05, + "loss": 1.1469, + "num_input_tokens_seen": 4392224, + "step": 273 + }, + { + "epoch": 0.01919321932981341, + "grad_norm": 4.403522491455078, + "learning_rate": 9.808247985989492e-05, + "loss": 1.233, + "num_input_tokens_seen": 4408608, + "step": 274 + }, + { + "epoch": 0.019263267575542656, + "grad_norm": 4.166261196136475, + "learning_rate": 9.807548161120842e-05, + "loss": 1.1697, + "num_input_tokens_seen": 4424992, + "step": 275 + }, + { + "epoch": 0.0193333158212719, + "grad_norm": 4.29187536239624, + "learning_rate": 9.806848336252191e-05, + "loss": 1.0503, + "num_input_tokens_seen": 4441376, + "step": 276 + }, + { + "epoch": 0.019403364067001147, + "grad_norm": 4.4056172370910645, + "learning_rate": 9.806148511383538e-05, + "loss": 1.1965, + "num_input_tokens_seen": 4457760, + "step": 277 + }, + { + "epoch": 0.019473412312730393, + "grad_norm": 4.355875015258789, + "learning_rate": 9.805448686514887e-05, + "loss": 1.1024, + "num_input_tokens_seen": 4474144, + "step": 278 + }, + { + "epoch": 0.019543460558459638, + "grad_norm": 4.46420955657959, + "learning_rate": 9.804748861646235e-05, + "loss": 1.203, + "num_input_tokens_seen": 4488912, + "step": 279 + }, + { + "epoch": 0.019613508804188884, + "grad_norm": 4.48052453994751, + "learning_rate": 9.804049036777583e-05, + "loss": 1.2089, + "num_input_tokens_seen": 4505296, + "step": 280 + }, + { + "epoch": 0.01968355704991813, + "grad_norm": 4.458749294281006, + "learning_rate": 9.803349211908932e-05, + "loss": 1.1557, + "num_input_tokens_seen": 4520576, + "step": 281 + }, + { + "epoch": 0.01975360529564738, + "grad_norm": 4.551771640777588, + "learning_rate": 9.802649387040281e-05, + "loss": 1.1671, + "num_input_tokens_seen": 4536960, + "step": 282 + }, + { + "epoch": 0.019823653541376624, + "grad_norm": 4.038064956665039, + "learning_rate": 9.80194956217163e-05, + "loss": 1.1562, + "num_input_tokens_seen": 4553344, + "step": 283 + }, + { + "epoch": 0.01989370178710587, + "grad_norm": 4.647075653076172, + "learning_rate": 9.801249737302978e-05, + "loss": 1.3069, + "num_input_tokens_seen": 4568928, + "step": 284 + }, + { + "epoch": 0.019963750032835115, + "grad_norm": 4.258941650390625, + "learning_rate": 9.800549912434326e-05, + "loss": 1.0349, + "num_input_tokens_seen": 4585312, + "step": 285 + }, + { + "epoch": 0.02003379827856436, + "grad_norm": 4.348769664764404, + "learning_rate": 9.799850087565674e-05, + "loss": 1.1163, + "num_input_tokens_seen": 4601696, + "step": 286 + }, + { + "epoch": 0.020103846524293607, + "grad_norm": 4.105901718139648, + "learning_rate": 9.799150262697023e-05, + "loss": 1.0313, + "num_input_tokens_seen": 4617312, + "step": 287 + }, + { + "epoch": 0.020173894770022852, + "grad_norm": 4.079495429992676, + "learning_rate": 9.798450437828372e-05, + "loss": 1.0828, + "num_input_tokens_seen": 4633696, + "step": 288 + }, + { + "epoch": 0.020243943015752098, + "grad_norm": 4.03472375869751, + "learning_rate": 9.79775061295972e-05, + "loss": 0.9475, + "num_input_tokens_seen": 4650080, + "step": 289 + }, + { + "epoch": 0.020313991261481347, + "grad_norm": 4.077049732208252, + "learning_rate": 9.797050788091069e-05, + "loss": 1.1323, + "num_input_tokens_seen": 4666328, + "step": 290 + }, + { + "epoch": 0.020384039507210593, + "grad_norm": 4.086606025695801, + "learning_rate": 9.796350963222417e-05, + "loss": 1.1218, + "num_input_tokens_seen": 4682256, + "step": 291 + }, + { + "epoch": 0.020454087752939838, + "grad_norm": 4.296900749206543, + "learning_rate": 9.795651138353766e-05, + "loss": 1.2964, + "num_input_tokens_seen": 4698640, + "step": 292 + }, + { + "epoch": 0.020524135998669084, + "grad_norm": 4.040759086608887, + "learning_rate": 9.794951313485115e-05, + "loss": 1.1077, + "num_input_tokens_seen": 4714928, + "step": 293 + }, + { + "epoch": 0.02059418424439833, + "grad_norm": 3.8260273933410645, + "learning_rate": 9.794251488616462e-05, + "loss": 0.9667, + "num_input_tokens_seen": 4731312, + "step": 294 + }, + { + "epoch": 0.020664232490127575, + "grad_norm": 4.294517993927002, + "learning_rate": 9.793551663747811e-05, + "loss": 1.2704, + "num_input_tokens_seen": 4747544, + "step": 295 + }, + { + "epoch": 0.02073428073585682, + "grad_norm": 4.206037521362305, + "learning_rate": 9.79285183887916e-05, + "loss": 1.1593, + "num_input_tokens_seen": 4763928, + "step": 296 + }, + { + "epoch": 0.020804328981586066, + "grad_norm": 4.147867202758789, + "learning_rate": 9.792152014010509e-05, + "loss": 1.1256, + "num_input_tokens_seen": 4780312, + "step": 297 + }, + { + "epoch": 0.020874377227315312, + "grad_norm": 4.23718786239624, + "learning_rate": 9.791452189141857e-05, + "loss": 1.2353, + "num_input_tokens_seen": 4796384, + "step": 298 + }, + { + "epoch": 0.02094442547304456, + "grad_norm": 4.172685146331787, + "learning_rate": 9.790752364273205e-05, + "loss": 1.1868, + "num_input_tokens_seen": 4812768, + "step": 299 + }, + { + "epoch": 0.021014473718773807, + "grad_norm": 4.167289733886719, + "learning_rate": 9.790052539404554e-05, + "loss": 1.0606, + "num_input_tokens_seen": 4829152, + "step": 300 + }, + { + "epoch": 0.021084521964503052, + "grad_norm": 4.096963882446289, + "learning_rate": 9.789352714535903e-05, + "loss": 1.0557, + "num_input_tokens_seen": 4845384, + "step": 301 + }, + { + "epoch": 0.021154570210232298, + "grad_norm": 4.223779678344727, + "learning_rate": 9.788652889667252e-05, + "loss": 1.1485, + "num_input_tokens_seen": 4861768, + "step": 302 + }, + { + "epoch": 0.021224618455961543, + "grad_norm": 3.8243472576141357, + "learning_rate": 9.7879530647986e-05, + "loss": 1.004, + "num_input_tokens_seen": 4878152, + "step": 303 + }, + { + "epoch": 0.02129466670169079, + "grad_norm": 4.092590808868408, + "learning_rate": 9.787253239929948e-05, + "loss": 1.0211, + "num_input_tokens_seen": 4894536, + "step": 304 + }, + { + "epoch": 0.021364714947420035, + "grad_norm": 4.42412805557251, + "learning_rate": 9.786553415061297e-05, + "loss": 0.9915, + "num_input_tokens_seen": 4910320, + "step": 305 + }, + { + "epoch": 0.02143476319314928, + "grad_norm": 4.488316535949707, + "learning_rate": 9.785853590192644e-05, + "loss": 1.1782, + "num_input_tokens_seen": 4926704, + "step": 306 + }, + { + "epoch": 0.021504811438878526, + "grad_norm": 4.110256195068359, + "learning_rate": 9.785153765323993e-05, + "loss": 1.102, + "num_input_tokens_seen": 4943088, + "step": 307 + }, + { + "epoch": 0.021574859684607775, + "grad_norm": 4.246950149536133, + "learning_rate": 9.784453940455342e-05, + "loss": 1.067, + "num_input_tokens_seen": 4958736, + "step": 308 + }, + { + "epoch": 0.02164490793033702, + "grad_norm": 4.175214767456055, + "learning_rate": 9.783754115586691e-05, + "loss": 1.0638, + "num_input_tokens_seen": 4975120, + "step": 309 + }, + { + "epoch": 0.021714956176066266, + "grad_norm": 4.427795886993408, + "learning_rate": 9.78305429071804e-05, + "loss": 1.1347, + "num_input_tokens_seen": 4991504, + "step": 310 + }, + { + "epoch": 0.021785004421795512, + "grad_norm": 4.158191204071045, + "learning_rate": 9.782354465849387e-05, + "loss": 1.1662, + "num_input_tokens_seen": 5007152, + "step": 311 + }, + { + "epoch": 0.021855052667524758, + "grad_norm": 4.184347629547119, + "learning_rate": 9.781654640980736e-05, + "loss": 1.0791, + "num_input_tokens_seen": 5023536, + "step": 312 + }, + { + "epoch": 0.021925100913254003, + "grad_norm": 3.8506295680999756, + "learning_rate": 9.780954816112084e-05, + "loss": 1.0615, + "num_input_tokens_seen": 5039728, + "step": 313 + }, + { + "epoch": 0.02199514915898325, + "grad_norm": 4.310062408447266, + "learning_rate": 9.780254991243432e-05, + "loss": 1.1363, + "num_input_tokens_seen": 5056008, + "step": 314 + }, + { + "epoch": 0.022065197404712494, + "grad_norm": 4.215006351470947, + "learning_rate": 9.779555166374781e-05, + "loss": 1.1715, + "num_input_tokens_seen": 5072096, + "step": 315 + }, + { + "epoch": 0.022135245650441743, + "grad_norm": 4.219073295593262, + "learning_rate": 9.77885534150613e-05, + "loss": 1.219, + "num_input_tokens_seen": 5088432, + "step": 316 + }, + { + "epoch": 0.02220529389617099, + "grad_norm": 4.319522857666016, + "learning_rate": 9.778155516637479e-05, + "loss": 1.3085, + "num_input_tokens_seen": 5104240, + "step": 317 + }, + { + "epoch": 0.022275342141900235, + "grad_norm": 4.118961334228516, + "learning_rate": 9.777455691768827e-05, + "loss": 1.0926, + "num_input_tokens_seen": 5120624, + "step": 318 + }, + { + "epoch": 0.02234539038762948, + "grad_norm": 4.195051193237305, + "learning_rate": 9.776755866900175e-05, + "loss": 1.0894, + "num_input_tokens_seen": 5137008, + "step": 319 + }, + { + "epoch": 0.022415438633358726, + "grad_norm": 4.114197254180908, + "learning_rate": 9.776056042031524e-05, + "loss": 1.1897, + "num_input_tokens_seen": 5153272, + "step": 320 + }, + { + "epoch": 0.02248548687908797, + "grad_norm": 4.014908313751221, + "learning_rate": 9.775356217162872e-05, + "loss": 1.0932, + "num_input_tokens_seen": 5169472, + "step": 321 + }, + { + "epoch": 0.022555535124817217, + "grad_norm": 4.190642356872559, + "learning_rate": 9.774656392294222e-05, + "loss": 1.1413, + "num_input_tokens_seen": 5185856, + "step": 322 + }, + { + "epoch": 0.022625583370546463, + "grad_norm": 4.562993049621582, + "learning_rate": 9.77395656742557e-05, + "loss": 1.2865, + "num_input_tokens_seen": 5202240, + "step": 323 + }, + { + "epoch": 0.02269563161627571, + "grad_norm": 4.607022762298584, + "learning_rate": 9.773256742556918e-05, + "loss": 1.1465, + "num_input_tokens_seen": 5218168, + "step": 324 + }, + { + "epoch": 0.022765679862004957, + "grad_norm": 3.956439256668091, + "learning_rate": 9.772556917688267e-05, + "loss": 1.028, + "num_input_tokens_seen": 5234368, + "step": 325 + }, + { + "epoch": 0.022835728107734203, + "grad_norm": 4.20713472366333, + "learning_rate": 9.771857092819615e-05, + "loss": 1.2332, + "num_input_tokens_seen": 5249808, + "step": 326 + }, + { + "epoch": 0.02290577635346345, + "grad_norm": 4.4092864990234375, + "learning_rate": 9.771157267950964e-05, + "loss": 1.104, + "num_input_tokens_seen": 5266120, + "step": 327 + }, + { + "epoch": 0.022975824599192694, + "grad_norm": 4.529845237731934, + "learning_rate": 9.770457443082312e-05, + "loss": 1.3475, + "num_input_tokens_seen": 5282504, + "step": 328 + }, + { + "epoch": 0.02304587284492194, + "grad_norm": 4.221986293792725, + "learning_rate": 9.769757618213661e-05, + "loss": 1.4115, + "num_input_tokens_seen": 5298344, + "step": 329 + }, + { + "epoch": 0.023115921090651186, + "grad_norm": 4.29000186920166, + "learning_rate": 9.76905779334501e-05, + "loss": 1.2855, + "num_input_tokens_seen": 5314728, + "step": 330 + }, + { + "epoch": 0.02318596933638043, + "grad_norm": 4.426812648773193, + "learning_rate": 9.768357968476358e-05, + "loss": 1.514, + "num_input_tokens_seen": 5330816, + "step": 331 + }, + { + "epoch": 0.023256017582109677, + "grad_norm": 4.210752964019775, + "learning_rate": 9.767658143607706e-05, + "loss": 1.0854, + "num_input_tokens_seen": 5346552, + "step": 332 + }, + { + "epoch": 0.023326065827838922, + "grad_norm": 4.216427326202393, + "learning_rate": 9.766958318739054e-05, + "loss": 1.1573, + "num_input_tokens_seen": 5362936, + "step": 333 + }, + { + "epoch": 0.02339611407356817, + "grad_norm": 4.132325649261475, + "learning_rate": 9.766258493870403e-05, + "loss": 1.0942, + "num_input_tokens_seen": 5379320, + "step": 334 + }, + { + "epoch": 0.023466162319297417, + "grad_norm": 4.277027130126953, + "learning_rate": 9.765558669001752e-05, + "loss": 1.1227, + "num_input_tokens_seen": 5395704, + "step": 335 + }, + { + "epoch": 0.023536210565026663, + "grad_norm": 4.228096961975098, + "learning_rate": 9.7648588441331e-05, + "loss": 1.1094, + "num_input_tokens_seen": 5412088, + "step": 336 + }, + { + "epoch": 0.02360625881075591, + "grad_norm": 4.194522380828857, + "learning_rate": 9.76415901926445e-05, + "loss": 1.2066, + "num_input_tokens_seen": 5428472, + "step": 337 + }, + { + "epoch": 0.023676307056485154, + "grad_norm": 4.336326599121094, + "learning_rate": 9.763459194395797e-05, + "loss": 1.2251, + "num_input_tokens_seen": 5444856, + "step": 338 + }, + { + "epoch": 0.0237463553022144, + "grad_norm": 4.2723307609558105, + "learning_rate": 9.762759369527146e-05, + "loss": 1.0927, + "num_input_tokens_seen": 5460304, + "step": 339 + }, + { + "epoch": 0.023816403547943645, + "grad_norm": 4.190036773681641, + "learning_rate": 9.762059544658493e-05, + "loss": 1.2036, + "num_input_tokens_seen": 5476688, + "step": 340 + }, + { + "epoch": 0.02388645179367289, + "grad_norm": 4.477560043334961, + "learning_rate": 9.761359719789842e-05, + "loss": 1.362, + "num_input_tokens_seen": 5493072, + "step": 341 + }, + { + "epoch": 0.02395650003940214, + "grad_norm": 4.160232067108154, + "learning_rate": 9.760659894921192e-05, + "loss": 1.1602, + "num_input_tokens_seen": 5509456, + "step": 342 + }, + { + "epoch": 0.024026548285131386, + "grad_norm": 3.857335090637207, + "learning_rate": 9.75996007005254e-05, + "loss": 1.0963, + "num_input_tokens_seen": 5525840, + "step": 343 + }, + { + "epoch": 0.02409659653086063, + "grad_norm": 4.141246318817139, + "learning_rate": 9.759260245183889e-05, + "loss": 1.2009, + "num_input_tokens_seen": 5541888, + "step": 344 + }, + { + "epoch": 0.024166644776589877, + "grad_norm": 4.50364875793457, + "learning_rate": 9.758560420315236e-05, + "loss": 1.1483, + "num_input_tokens_seen": 5557848, + "step": 345 + }, + { + "epoch": 0.024236693022319122, + "grad_norm": 4.3343353271484375, + "learning_rate": 9.757860595446585e-05, + "loss": 1.3594, + "num_input_tokens_seen": 5573504, + "step": 346 + }, + { + "epoch": 0.024306741268048368, + "grad_norm": 4.050408363342285, + "learning_rate": 9.757160770577934e-05, + "loss": 1.0563, + "num_input_tokens_seen": 5589544, + "step": 347 + }, + { + "epoch": 0.024376789513777614, + "grad_norm": 4.051811695098877, + "learning_rate": 9.756460945709283e-05, + "loss": 1.0288, + "num_input_tokens_seen": 5605368, + "step": 348 + }, + { + "epoch": 0.02444683775950686, + "grad_norm": 4.365113258361816, + "learning_rate": 9.755761120840632e-05, + "loss": 1.3054, + "num_input_tokens_seen": 5621752, + "step": 349 + }, + { + "epoch": 0.024516886005236105, + "grad_norm": 4.0057501792907715, + "learning_rate": 9.755061295971979e-05, + "loss": 1.1302, + "num_input_tokens_seen": 5638136, + "step": 350 + }, + { + "epoch": 0.024586934250965354, + "grad_norm": 4.254896640777588, + "learning_rate": 9.754361471103328e-05, + "loss": 1.0495, + "num_input_tokens_seen": 5653168, + "step": 351 + }, + { + "epoch": 0.0246569824966946, + "grad_norm": 3.8119771480560303, + "learning_rate": 9.753661646234677e-05, + "loss": 1.0349, + "num_input_tokens_seen": 5669504, + "step": 352 + }, + { + "epoch": 0.024727030742423845, + "grad_norm": 4.5082621574401855, + "learning_rate": 9.752961821366024e-05, + "loss": 1.2537, + "num_input_tokens_seen": 5685168, + "step": 353 + }, + { + "epoch": 0.02479707898815309, + "grad_norm": 4.392731189727783, + "learning_rate": 9.752261996497373e-05, + "loss": 1.2534, + "num_input_tokens_seen": 5701240, + "step": 354 + }, + { + "epoch": 0.024867127233882336, + "grad_norm": 4.293395519256592, + "learning_rate": 9.751562171628722e-05, + "loss": 1.2774, + "num_input_tokens_seen": 5717624, + "step": 355 + }, + { + "epoch": 0.024937175479611582, + "grad_norm": 4.64813756942749, + "learning_rate": 9.750862346760071e-05, + "loss": 1.2795, + "num_input_tokens_seen": 5733104, + "step": 356 + }, + { + "epoch": 0.025007223725340828, + "grad_norm": 4.5166778564453125, + "learning_rate": 9.75016252189142e-05, + "loss": 1.1301, + "num_input_tokens_seen": 5749488, + "step": 357 + }, + { + "epoch": 0.025077271971070073, + "grad_norm": 3.894291400909424, + "learning_rate": 9.749462697022767e-05, + "loss": 0.901, + "num_input_tokens_seen": 5765872, + "step": 358 + }, + { + "epoch": 0.02514732021679932, + "grad_norm": 4.10056209564209, + "learning_rate": 9.748762872154116e-05, + "loss": 1.0529, + "num_input_tokens_seen": 5780856, + "step": 359 + }, + { + "epoch": 0.025217368462528568, + "grad_norm": 4.6277666091918945, + "learning_rate": 9.748063047285464e-05, + "loss": 1.3649, + "num_input_tokens_seen": 5796856, + "step": 360 + }, + { + "epoch": 0.025287416708257814, + "grad_norm": 4.029720306396484, + "learning_rate": 9.747363222416813e-05, + "loss": 0.8863, + "num_input_tokens_seen": 5812176, + "step": 361 + }, + { + "epoch": 0.02535746495398706, + "grad_norm": 3.7772202491760254, + "learning_rate": 9.746663397548161e-05, + "loss": 1.0448, + "num_input_tokens_seen": 5828064, + "step": 362 + }, + { + "epoch": 0.025427513199716305, + "grad_norm": 4.379861354827881, + "learning_rate": 9.74596357267951e-05, + "loss": 1.3274, + "num_input_tokens_seen": 5843680, + "step": 363 + }, + { + "epoch": 0.02549756144544555, + "grad_norm": 4.254587173461914, + "learning_rate": 9.745263747810859e-05, + "loss": 1.1502, + "num_input_tokens_seen": 5859024, + "step": 364 + }, + { + "epoch": 0.025567609691174796, + "grad_norm": 4.271276473999023, + "learning_rate": 9.744563922942207e-05, + "loss": 1.2785, + "num_input_tokens_seen": 5874320, + "step": 365 + }, + { + "epoch": 0.02563765793690404, + "grad_norm": 4.224324703216553, + "learning_rate": 9.743864098073555e-05, + "loss": 1.0926, + "num_input_tokens_seen": 5890704, + "step": 366 + }, + { + "epoch": 0.025707706182633287, + "grad_norm": 4.289444446563721, + "learning_rate": 9.743164273204903e-05, + "loss": 1.1913, + "num_input_tokens_seen": 5906016, + "step": 367 + }, + { + "epoch": 0.025777754428362536, + "grad_norm": 4.280707359313965, + "learning_rate": 9.742464448336253e-05, + "loss": 1.2238, + "num_input_tokens_seen": 5921784, + "step": 368 + }, + { + "epoch": 0.025847802674091782, + "grad_norm": 4.554803848266602, + "learning_rate": 9.741764623467602e-05, + "loss": 1.2491, + "num_input_tokens_seen": 5938072, + "step": 369 + }, + { + "epoch": 0.025917850919821028, + "grad_norm": 4.677784442901611, + "learning_rate": 9.74106479859895e-05, + "loss": 1.2387, + "num_input_tokens_seen": 5954456, + "step": 370 + }, + { + "epoch": 0.025987899165550273, + "grad_norm": 4.268225193023682, + "learning_rate": 9.740364973730298e-05, + "loss": 1.2983, + "num_input_tokens_seen": 5970664, + "step": 371 + }, + { + "epoch": 0.02605794741127952, + "grad_norm": 4.361818790435791, + "learning_rate": 9.739665148861646e-05, + "loss": 1.199, + "num_input_tokens_seen": 5987048, + "step": 372 + }, + { + "epoch": 0.026127995657008764, + "grad_norm": 3.9990735054016113, + "learning_rate": 9.738965323992995e-05, + "loss": 1.0777, + "num_input_tokens_seen": 6003432, + "step": 373 + }, + { + "epoch": 0.02619804390273801, + "grad_norm": 3.992142915725708, + "learning_rate": 9.738265499124344e-05, + "loss": 1.0443, + "num_input_tokens_seen": 6019816, + "step": 374 + }, + { + "epoch": 0.026268092148467256, + "grad_norm": 4.270167827606201, + "learning_rate": 9.737565674255693e-05, + "loss": 1.1764, + "num_input_tokens_seen": 6036200, + "step": 375 + }, + { + "epoch": 0.0263381403941965, + "grad_norm": 4.362086296081543, + "learning_rate": 9.736865849387041e-05, + "loss": 1.2735, + "num_input_tokens_seen": 6052120, + "step": 376 + }, + { + "epoch": 0.02640818863992575, + "grad_norm": 3.6900475025177, + "learning_rate": 9.736166024518389e-05, + "loss": 0.8729, + "num_input_tokens_seen": 6068264, + "step": 377 + }, + { + "epoch": 0.026478236885654996, + "grad_norm": 3.8281285762786865, + "learning_rate": 9.735466199649738e-05, + "loss": 1.1096, + "num_input_tokens_seen": 6084504, + "step": 378 + }, + { + "epoch": 0.02654828513138424, + "grad_norm": 3.9335553646087646, + "learning_rate": 9.734766374781087e-05, + "loss": 1.0763, + "num_input_tokens_seen": 6100592, + "step": 379 + }, + { + "epoch": 0.026618333377113487, + "grad_norm": 4.332645416259766, + "learning_rate": 9.734066549912434e-05, + "loss": 1.1751, + "num_input_tokens_seen": 6116976, + "step": 380 + }, + { + "epoch": 0.026688381622842733, + "grad_norm": 4.160863399505615, + "learning_rate": 9.733366725043783e-05, + "loss": 1.0778, + "num_input_tokens_seen": 6133360, + "step": 381 + }, + { + "epoch": 0.02675842986857198, + "grad_norm": 4.388178825378418, + "learning_rate": 9.732666900175132e-05, + "loss": 1.2214, + "num_input_tokens_seen": 6149744, + "step": 382 + }, + { + "epoch": 0.026828478114301224, + "grad_norm": 4.354910373687744, + "learning_rate": 9.73196707530648e-05, + "loss": 1.4115, + "num_input_tokens_seen": 6166048, + "step": 383 + }, + { + "epoch": 0.02689852636003047, + "grad_norm": 4.058071613311768, + "learning_rate": 9.73126725043783e-05, + "loss": 1.0934, + "num_input_tokens_seen": 6181840, + "step": 384 + }, + { + "epoch": 0.026968574605759715, + "grad_norm": 4.060855865478516, + "learning_rate": 9.730567425569177e-05, + "loss": 1.1395, + "num_input_tokens_seen": 6198224, + "step": 385 + }, + { + "epoch": 0.027038622851488964, + "grad_norm": 4.316681385040283, + "learning_rate": 9.729867600700526e-05, + "loss": 1.1052, + "num_input_tokens_seen": 6214608, + "step": 386 + }, + { + "epoch": 0.02710867109721821, + "grad_norm": 4.322516918182373, + "learning_rate": 9.729167775831873e-05, + "loss": 1.2512, + "num_input_tokens_seen": 6230992, + "step": 387 + }, + { + "epoch": 0.027178719342947456, + "grad_norm": 4.090857028961182, + "learning_rate": 9.728467950963224e-05, + "loss": 1.0772, + "num_input_tokens_seen": 6246760, + "step": 388 + }, + { + "epoch": 0.0272487675886767, + "grad_norm": 4.0143961906433105, + "learning_rate": 9.727768126094571e-05, + "loss": 1.0578, + "num_input_tokens_seen": 6261968, + "step": 389 + }, + { + "epoch": 0.027318815834405947, + "grad_norm": 4.911194324493408, + "learning_rate": 9.72706830122592e-05, + "loss": 1.3016, + "num_input_tokens_seen": 6276664, + "step": 390 + }, + { + "epoch": 0.027388864080135192, + "grad_norm": 4.057498931884766, + "learning_rate": 9.726368476357269e-05, + "loss": 1.026, + "num_input_tokens_seen": 6293048, + "step": 391 + }, + { + "epoch": 0.027458912325864438, + "grad_norm": 3.9827401638031006, + "learning_rate": 9.725668651488616e-05, + "loss": 1.136, + "num_input_tokens_seen": 6309432, + "step": 392 + }, + { + "epoch": 0.027528960571593684, + "grad_norm": 4.640822887420654, + "learning_rate": 9.724968826619965e-05, + "loss": 1.2823, + "num_input_tokens_seen": 6325568, + "step": 393 + }, + { + "epoch": 0.027599008817322933, + "grad_norm": 4.372538089752197, + "learning_rate": 9.724269001751314e-05, + "loss": 1.0354, + "num_input_tokens_seen": 6341952, + "step": 394 + }, + { + "epoch": 0.02766905706305218, + "grad_norm": 4.018289566040039, + "learning_rate": 9.723569176882663e-05, + "loss": 1.029, + "num_input_tokens_seen": 6358336, + "step": 395 + }, + { + "epoch": 0.027739105308781424, + "grad_norm": 4.440858364105225, + "learning_rate": 9.722869352014012e-05, + "loss": 1.2272, + "num_input_tokens_seen": 6374680, + "step": 396 + }, + { + "epoch": 0.02780915355451067, + "grad_norm": 4.246788024902344, + "learning_rate": 9.722169527145359e-05, + "loss": 1.0161, + "num_input_tokens_seen": 6390672, + "step": 397 + }, + { + "epoch": 0.027879201800239915, + "grad_norm": 4.27274751663208, + "learning_rate": 9.721469702276708e-05, + "loss": 1.293, + "num_input_tokens_seen": 6407056, + "step": 398 + }, + { + "epoch": 0.02794925004596916, + "grad_norm": 4.171760559082031, + "learning_rate": 9.720769877408056e-05, + "loss": 1.2766, + "num_input_tokens_seen": 6423440, + "step": 399 + }, + { + "epoch": 0.028019298291698407, + "grad_norm": 4.174622535705566, + "learning_rate": 9.720070052539405e-05, + "loss": 1.049, + "num_input_tokens_seen": 6439824, + "step": 400 + }, + { + "epoch": 0.028019298291698407, + "eval_loss": 1.1994441747665405, + "eval_runtime": 0.2131, + "eval_samples_per_second": 4.693, + "eval_steps_per_second": 4.693, + "num_input_tokens_seen": 6439824, + "step": 400 + }, + { + "epoch": 0.028089346537427652, + "grad_norm": 4.199150562286377, + "learning_rate": 9.719370227670753e-05, + "loss": 1.3432, + "num_input_tokens_seen": 6456208, + "step": 401 + }, + { + "epoch": 0.028159394783156898, + "grad_norm": 3.9011733531951904, + "learning_rate": 9.718670402802102e-05, + "loss": 1.0895, + "num_input_tokens_seen": 6472592, + "step": 402 + }, + { + "epoch": 0.028229443028886147, + "grad_norm": 4.142306327819824, + "learning_rate": 9.717970577933451e-05, + "loss": 0.9031, + "num_input_tokens_seen": 6488976, + "step": 403 + }, + { + "epoch": 0.028299491274615392, + "grad_norm": 3.9745633602142334, + "learning_rate": 9.717270753064799e-05, + "loss": 0.9951, + "num_input_tokens_seen": 6505360, + "step": 404 + }, + { + "epoch": 0.028369539520344638, + "grad_norm": 3.838865280151367, + "learning_rate": 9.716570928196147e-05, + "loss": 0.809, + "num_input_tokens_seen": 6521744, + "step": 405 + }, + { + "epoch": 0.028439587766073884, + "grad_norm": 4.48146390914917, + "learning_rate": 9.715871103327496e-05, + "loss": 1.4985, + "num_input_tokens_seen": 6538128, + "step": 406 + }, + { + "epoch": 0.02850963601180313, + "grad_norm": 4.393556594848633, + "learning_rate": 9.715171278458844e-05, + "loss": 1.2355, + "num_input_tokens_seen": 6554512, + "step": 407 + }, + { + "epoch": 0.028579684257532375, + "grad_norm": 3.970860004425049, + "learning_rate": 9.714471453590194e-05, + "loss": 1.1513, + "num_input_tokens_seen": 6570896, + "step": 408 + }, + { + "epoch": 0.02864973250326162, + "grad_norm": 4.166610240936279, + "learning_rate": 9.713771628721542e-05, + "loss": 1.108, + "num_input_tokens_seen": 6587216, + "step": 409 + }, + { + "epoch": 0.028719780748990866, + "grad_norm": 3.9887096881866455, + "learning_rate": 9.71307180385289e-05, + "loss": 1.1639, + "num_input_tokens_seen": 6603600, + "step": 410 + }, + { + "epoch": 0.028789828994720112, + "grad_norm": 4.195802211761475, + "learning_rate": 9.712371978984239e-05, + "loss": 1.1478, + "num_input_tokens_seen": 6619984, + "step": 411 + }, + { + "epoch": 0.02885987724044936, + "grad_norm": 4.011331081390381, + "learning_rate": 9.711672154115587e-05, + "loss": 0.9554, + "num_input_tokens_seen": 6635904, + "step": 412 + }, + { + "epoch": 0.028929925486178606, + "grad_norm": 4.4170026779174805, + "learning_rate": 9.710972329246936e-05, + "loss": 1.1452, + "num_input_tokens_seen": 6651944, + "step": 413 + }, + { + "epoch": 0.028999973731907852, + "grad_norm": 4.073450088500977, + "learning_rate": 9.710272504378284e-05, + "loss": 1.1187, + "num_input_tokens_seen": 6668096, + "step": 414 + }, + { + "epoch": 0.029070021977637098, + "grad_norm": 4.161722183227539, + "learning_rate": 9.709572679509633e-05, + "loss": 1.1603, + "num_input_tokens_seen": 6684480, + "step": 415 + }, + { + "epoch": 0.029140070223366343, + "grad_norm": 4.540097713470459, + "learning_rate": 9.708872854640981e-05, + "loss": 1.2143, + "num_input_tokens_seen": 6700536, + "step": 416 + }, + { + "epoch": 0.02921011846909559, + "grad_norm": 4.030871868133545, + "learning_rate": 9.70817302977233e-05, + "loss": 0.9791, + "num_input_tokens_seen": 6716920, + "step": 417 + }, + { + "epoch": 0.029280166714824835, + "grad_norm": 4.1743268966674805, + "learning_rate": 9.707473204903679e-05, + "loss": 0.9818, + "num_input_tokens_seen": 6733304, + "step": 418 + }, + { + "epoch": 0.02935021496055408, + "grad_norm": 4.227272987365723, + "learning_rate": 9.706773380035026e-05, + "loss": 1.0945, + "num_input_tokens_seen": 6749688, + "step": 419 + }, + { + "epoch": 0.02942026320628333, + "grad_norm": 4.406428813934326, + "learning_rate": 9.706073555166375e-05, + "loss": 1.0302, + "num_input_tokens_seen": 6766072, + "step": 420 + }, + { + "epoch": 0.029490311452012575, + "grad_norm": 4.17899227142334, + "learning_rate": 9.705373730297724e-05, + "loss": 1.1048, + "num_input_tokens_seen": 6782456, + "step": 421 + }, + { + "epoch": 0.02956035969774182, + "grad_norm": 4.034752368927002, + "learning_rate": 9.704673905429073e-05, + "loss": 1.2639, + "num_input_tokens_seen": 6798840, + "step": 422 + }, + { + "epoch": 0.029630407943471066, + "grad_norm": 4.795727729797363, + "learning_rate": 9.703974080560421e-05, + "loss": 1.2448, + "num_input_tokens_seen": 6814912, + "step": 423 + }, + { + "epoch": 0.029700456189200312, + "grad_norm": 4.509056568145752, + "learning_rate": 9.703274255691769e-05, + "loss": 1.2157, + "num_input_tokens_seen": 6830720, + "step": 424 + }, + { + "epoch": 0.029770504434929557, + "grad_norm": 4.064620494842529, + "learning_rate": 9.702574430823118e-05, + "loss": 1.2042, + "num_input_tokens_seen": 6847104, + "step": 425 + }, + { + "epoch": 0.029840552680658803, + "grad_norm": 3.9060182571411133, + "learning_rate": 9.701874605954465e-05, + "loss": 0.9116, + "num_input_tokens_seen": 6862952, + "step": 426 + }, + { + "epoch": 0.02991060092638805, + "grad_norm": 3.9900951385498047, + "learning_rate": 9.701174781085814e-05, + "loss": 1.1621, + "num_input_tokens_seen": 6879336, + "step": 427 + }, + { + "epoch": 0.029980649172117294, + "grad_norm": 4.371436595916748, + "learning_rate": 9.700474956217164e-05, + "loss": 1.2731, + "num_input_tokens_seen": 6895720, + "step": 428 + }, + { + "epoch": 0.030050697417846543, + "grad_norm": 3.9422085285186768, + "learning_rate": 9.699775131348512e-05, + "loss": 0.9636, + "num_input_tokens_seen": 6912104, + "step": 429 + }, + { + "epoch": 0.03012074566357579, + "grad_norm": 4.080913543701172, + "learning_rate": 9.699075306479861e-05, + "loss": 1.1507, + "num_input_tokens_seen": 6928488, + "step": 430 + }, + { + "epoch": 0.030190793909305035, + "grad_norm": 4.493942737579346, + "learning_rate": 9.698375481611208e-05, + "loss": 1.2274, + "num_input_tokens_seen": 6944664, + "step": 431 + }, + { + "epoch": 0.03026084215503428, + "grad_norm": 4.073723793029785, + "learning_rate": 9.697675656742557e-05, + "loss": 1.0498, + "num_input_tokens_seen": 6960344, + "step": 432 + }, + { + "epoch": 0.030330890400763526, + "grad_norm": 3.9672274589538574, + "learning_rate": 9.696975831873906e-05, + "loss": 1.007, + "num_input_tokens_seen": 6976720, + "step": 433 + }, + { + "epoch": 0.03040093864649277, + "grad_norm": 4.497872829437256, + "learning_rate": 9.696276007005255e-05, + "loss": 1.1339, + "num_input_tokens_seen": 6992552, + "step": 434 + }, + { + "epoch": 0.030470986892222017, + "grad_norm": 4.422168731689453, + "learning_rate": 9.695576182136604e-05, + "loss": 1.34, + "num_input_tokens_seen": 7008936, + "step": 435 + }, + { + "epoch": 0.030541035137951263, + "grad_norm": 4.3009138107299805, + "learning_rate": 9.694876357267951e-05, + "loss": 1.2479, + "num_input_tokens_seen": 7024512, + "step": 436 + }, + { + "epoch": 0.030611083383680508, + "grad_norm": 4.04030704498291, + "learning_rate": 9.6941765323993e-05, + "loss": 1.097, + "num_input_tokens_seen": 7040896, + "step": 437 + }, + { + "epoch": 0.030681131629409757, + "grad_norm": 3.877417802810669, + "learning_rate": 9.693476707530649e-05, + "loss": 1.1363, + "num_input_tokens_seen": 7057280, + "step": 438 + }, + { + "epoch": 0.030751179875139003, + "grad_norm": 3.8185505867004395, + "learning_rate": 9.692776882661996e-05, + "loss": 0.9067, + "num_input_tokens_seen": 7072544, + "step": 439 + }, + { + "epoch": 0.03082122812086825, + "grad_norm": 4.028950214385986, + "learning_rate": 9.692077057793345e-05, + "loss": 1.1195, + "num_input_tokens_seen": 7088928, + "step": 440 + }, + { + "epoch": 0.030891276366597494, + "grad_norm": 4.2786431312561035, + "learning_rate": 9.691377232924694e-05, + "loss": 1.1199, + "num_input_tokens_seen": 7105248, + "step": 441 + }, + { + "epoch": 0.03096132461232674, + "grad_norm": 4.193462371826172, + "learning_rate": 9.690677408056043e-05, + "loss": 1.1812, + "num_input_tokens_seen": 7121008, + "step": 442 + }, + { + "epoch": 0.031031372858055985, + "grad_norm": 3.93597412109375, + "learning_rate": 9.68997758318739e-05, + "loss": 1.0677, + "num_input_tokens_seen": 7136944, + "step": 443 + }, + { + "epoch": 0.03110142110378523, + "grad_norm": 4.3208537101745605, + "learning_rate": 9.68927775831874e-05, + "loss": 1.1358, + "num_input_tokens_seen": 7152928, + "step": 444 + }, + { + "epoch": 0.031171469349514477, + "grad_norm": 3.9743378162384033, + "learning_rate": 9.688577933450088e-05, + "loss": 1.094, + "num_input_tokens_seen": 7169312, + "step": 445 + }, + { + "epoch": 0.031241517595243726, + "grad_norm": 4.226114273071289, + "learning_rate": 9.687878108581436e-05, + "loss": 1.1752, + "num_input_tokens_seen": 7185696, + "step": 446 + }, + { + "epoch": 0.03131156584097297, + "grad_norm": 4.210222244262695, + "learning_rate": 9.687178283712785e-05, + "loss": 1.1262, + "num_input_tokens_seen": 7201784, + "step": 447 + }, + { + "epoch": 0.03138161408670222, + "grad_norm": 4.311635971069336, + "learning_rate": 9.686478458844133e-05, + "loss": 1.2491, + "num_input_tokens_seen": 7218168, + "step": 448 + }, + { + "epoch": 0.03145166233243146, + "grad_norm": 4.56603479385376, + "learning_rate": 9.685778633975482e-05, + "loss": 1.3512, + "num_input_tokens_seen": 7233360, + "step": 449 + }, + { + "epoch": 0.03152171057816071, + "grad_norm": 4.232856750488281, + "learning_rate": 9.685078809106831e-05, + "loss": 0.9387, + "num_input_tokens_seen": 7248280, + "step": 450 + }, + { + "epoch": 0.031591758823889954, + "grad_norm": 4.512947082519531, + "learning_rate": 9.684378984238179e-05, + "loss": 1.1988, + "num_input_tokens_seen": 7264664, + "step": 451 + }, + { + "epoch": 0.0316618070696192, + "grad_norm": 4.273897171020508, + "learning_rate": 9.683679159369528e-05, + "loss": 1.2523, + "num_input_tokens_seen": 7281048, + "step": 452 + }, + { + "epoch": 0.031731855315348445, + "grad_norm": 4.288438320159912, + "learning_rate": 9.682979334500875e-05, + "loss": 1.1692, + "num_input_tokens_seen": 7297424, + "step": 453 + }, + { + "epoch": 0.03180190356107769, + "grad_norm": 4.27367639541626, + "learning_rate": 9.682279509632225e-05, + "loss": 1.1868, + "num_input_tokens_seen": 7312792, + "step": 454 + }, + { + "epoch": 0.031871951806806936, + "grad_norm": 3.978926181793213, + "learning_rate": 9.681579684763574e-05, + "loss": 1.0382, + "num_input_tokens_seen": 7329176, + "step": 455 + }, + { + "epoch": 0.03194200005253618, + "grad_norm": 4.4399919509887695, + "learning_rate": 9.680879859894922e-05, + "loss": 1.2072, + "num_input_tokens_seen": 7345560, + "step": 456 + }, + { + "epoch": 0.03201204829826543, + "grad_norm": 3.9786529541015625, + "learning_rate": 9.68018003502627e-05, + "loss": 1.1704, + "num_input_tokens_seen": 7361944, + "step": 457 + }, + { + "epoch": 0.03208209654399467, + "grad_norm": 4.171195030212402, + "learning_rate": 9.679480210157618e-05, + "loss": 1.1307, + "num_input_tokens_seen": 7378328, + "step": 458 + }, + { + "epoch": 0.032152144789723926, + "grad_norm": 3.9415268898010254, + "learning_rate": 9.678780385288967e-05, + "loss": 0.9971, + "num_input_tokens_seen": 7394208, + "step": 459 + }, + { + "epoch": 0.03222219303545317, + "grad_norm": 4.066036224365234, + "learning_rate": 9.678080560420316e-05, + "loss": 1.1227, + "num_input_tokens_seen": 7410328, + "step": 460 + }, + { + "epoch": 0.03229224128118242, + "grad_norm": 4.22513484954834, + "learning_rate": 9.677380735551665e-05, + "loss": 1.0883, + "num_input_tokens_seen": 7426712, + "step": 461 + }, + { + "epoch": 0.03236228952691166, + "grad_norm": 4.310954570770264, + "learning_rate": 9.676680910683013e-05, + "loss": 1.1695, + "num_input_tokens_seen": 7442736, + "step": 462 + }, + { + "epoch": 0.03243233777264091, + "grad_norm": 4.2868828773498535, + "learning_rate": 9.675981085814361e-05, + "loss": 1.0594, + "num_input_tokens_seen": 7458560, + "step": 463 + }, + { + "epoch": 0.032502386018370154, + "grad_norm": 4.318186283111572, + "learning_rate": 9.67528126094571e-05, + "loss": 1.1791, + "num_input_tokens_seen": 7474944, + "step": 464 + }, + { + "epoch": 0.0325724342640994, + "grad_norm": 4.040421009063721, + "learning_rate": 9.674581436077059e-05, + "loss": 1.0649, + "num_input_tokens_seen": 7490344, + "step": 465 + }, + { + "epoch": 0.032642482509828645, + "grad_norm": 3.914815902709961, + "learning_rate": 9.673881611208406e-05, + "loss": 1.1381, + "num_input_tokens_seen": 7506728, + "step": 466 + }, + { + "epoch": 0.03271253075555789, + "grad_norm": 4.054527282714844, + "learning_rate": 9.673181786339755e-05, + "loss": 1.2264, + "num_input_tokens_seen": 7522912, + "step": 467 + }, + { + "epoch": 0.032782579001287136, + "grad_norm": 4.295147895812988, + "learning_rate": 9.672481961471104e-05, + "loss": 1.1369, + "num_input_tokens_seen": 7539040, + "step": 468 + }, + { + "epoch": 0.03285262724701638, + "grad_norm": 4.109183311462402, + "learning_rate": 9.671782136602453e-05, + "loss": 1.1676, + "num_input_tokens_seen": 7555424, + "step": 469 + }, + { + "epoch": 0.03292267549274563, + "grad_norm": 4.131369590759277, + "learning_rate": 9.6710823117338e-05, + "loss": 1.1188, + "num_input_tokens_seen": 7571808, + "step": 470 + }, + { + "epoch": 0.03299272373847487, + "grad_norm": 3.998414993286133, + "learning_rate": 9.670382486865149e-05, + "loss": 1.0201, + "num_input_tokens_seen": 7587528, + "step": 471 + }, + { + "epoch": 0.03306277198420412, + "grad_norm": 4.1235551834106445, + "learning_rate": 9.669682661996498e-05, + "loss": 1.1265, + "num_input_tokens_seen": 7603912, + "step": 472 + }, + { + "epoch": 0.033132820229933364, + "grad_norm": 4.800798416137695, + "learning_rate": 9.668982837127845e-05, + "loss": 1.3634, + "num_input_tokens_seen": 7617512, + "step": 473 + }, + { + "epoch": 0.03320286847566261, + "grad_norm": 4.068000316619873, + "learning_rate": 9.668283012259196e-05, + "loss": 1.1427, + "num_input_tokens_seen": 7633040, + "step": 474 + }, + { + "epoch": 0.033272916721391856, + "grad_norm": 4.0715484619140625, + "learning_rate": 9.667583187390543e-05, + "loss": 1.0633, + "num_input_tokens_seen": 7648416, + "step": 475 + }, + { + "epoch": 0.0333429649671211, + "grad_norm": 3.937807321548462, + "learning_rate": 9.666883362521892e-05, + "loss": 1.1393, + "num_input_tokens_seen": 7664624, + "step": 476 + }, + { + "epoch": 0.033413013212850354, + "grad_norm": 4.195656776428223, + "learning_rate": 9.666183537653241e-05, + "loss": 1.1801, + "num_input_tokens_seen": 7680480, + "step": 477 + }, + { + "epoch": 0.0334830614585796, + "grad_norm": 4.227575778961182, + "learning_rate": 9.665483712784588e-05, + "loss": 1.0453, + "num_input_tokens_seen": 7696632, + "step": 478 + }, + { + "epoch": 0.033553109704308845, + "grad_norm": 4.328822135925293, + "learning_rate": 9.664783887915937e-05, + "loss": 1.221, + "num_input_tokens_seen": 7713016, + "step": 479 + }, + { + "epoch": 0.03362315795003809, + "grad_norm": 4.086736679077148, + "learning_rate": 9.664084063047286e-05, + "loss": 1.2817, + "num_input_tokens_seen": 7729400, + "step": 480 + }, + { + "epoch": 0.033693206195767336, + "grad_norm": 4.555233955383301, + "learning_rate": 9.663384238178635e-05, + "loss": 1.483, + "num_input_tokens_seen": 7745784, + "step": 481 + }, + { + "epoch": 0.03376325444149658, + "grad_norm": 4.118983745574951, + "learning_rate": 9.662684413309984e-05, + "loss": 0.9139, + "num_input_tokens_seen": 7762168, + "step": 482 + }, + { + "epoch": 0.03383330268722583, + "grad_norm": 4.232059001922607, + "learning_rate": 9.661984588441331e-05, + "loss": 1.1269, + "num_input_tokens_seen": 7777920, + "step": 483 + }, + { + "epoch": 0.03390335093295507, + "grad_norm": 6.288865089416504, + "learning_rate": 9.66128476357268e-05, + "loss": 1.0642, + "num_input_tokens_seen": 7794304, + "step": 484 + }, + { + "epoch": 0.03397339917868432, + "grad_norm": 4.133046627044678, + "learning_rate": 9.660584938704028e-05, + "loss": 1.2067, + "num_input_tokens_seen": 7810200, + "step": 485 + }, + { + "epoch": 0.034043447424413564, + "grad_norm": 4.147965431213379, + "learning_rate": 9.659885113835377e-05, + "loss": 1.0367, + "num_input_tokens_seen": 7826384, + "step": 486 + }, + { + "epoch": 0.03411349567014281, + "grad_norm": 4.1191020011901855, + "learning_rate": 9.659185288966725e-05, + "loss": 1.0972, + "num_input_tokens_seen": 7841704, + "step": 487 + }, + { + "epoch": 0.034183543915872056, + "grad_norm": 4.518441677093506, + "learning_rate": 9.658485464098074e-05, + "loss": 1.263, + "num_input_tokens_seen": 7858088, + "step": 488 + }, + { + "epoch": 0.0342535921616013, + "grad_norm": 4.321181297302246, + "learning_rate": 9.657785639229423e-05, + "loss": 1.1378, + "num_input_tokens_seen": 7874472, + "step": 489 + }, + { + "epoch": 0.03432364040733055, + "grad_norm": 4.366185665130615, + "learning_rate": 9.65708581436077e-05, + "loss": 1.1636, + "num_input_tokens_seen": 7890856, + "step": 490 + }, + { + "epoch": 0.03439368865305979, + "grad_norm": 4.042731761932373, + "learning_rate": 9.65638598949212e-05, + "loss": 1.0601, + "num_input_tokens_seen": 7906776, + "step": 491 + }, + { + "epoch": 0.03446373689878904, + "grad_norm": 3.743668556213379, + "learning_rate": 9.655686164623468e-05, + "loss": 1.0441, + "num_input_tokens_seen": 7923160, + "step": 492 + }, + { + "epoch": 0.034533785144518284, + "grad_norm": 3.8547139167785645, + "learning_rate": 9.654986339754816e-05, + "loss": 1.0842, + "num_input_tokens_seen": 7939296, + "step": 493 + }, + { + "epoch": 0.034603833390247536, + "grad_norm": 4.238414287567139, + "learning_rate": 9.654286514886166e-05, + "loss": 1.2498, + "num_input_tokens_seen": 7955504, + "step": 494 + }, + { + "epoch": 0.03467388163597678, + "grad_norm": 4.134857177734375, + "learning_rate": 9.653586690017514e-05, + "loss": 1.1241, + "num_input_tokens_seen": 7971888, + "step": 495 + }, + { + "epoch": 0.03474392988170603, + "grad_norm": 4.2501983642578125, + "learning_rate": 9.652886865148862e-05, + "loss": 1.1829, + "num_input_tokens_seen": 7988272, + "step": 496 + }, + { + "epoch": 0.03481397812743527, + "grad_norm": 7.4397053718566895, + "learning_rate": 9.65218704028021e-05, + "loss": 0.9952, + "num_input_tokens_seen": 8003744, + "step": 497 + }, + { + "epoch": 0.03488402637316452, + "grad_norm": 4.2750959396362305, + "learning_rate": 9.651487215411559e-05, + "loss": 1.2387, + "num_input_tokens_seen": 8019184, + "step": 498 + }, + { + "epoch": 0.034954074618893764, + "grad_norm": 4.156162261962891, + "learning_rate": 9.650787390542908e-05, + "loss": 1.1201, + "num_input_tokens_seen": 8035176, + "step": 499 + }, + { + "epoch": 0.03502412286462301, + "grad_norm": 4.178225040435791, + "learning_rate": 9.650087565674257e-05, + "loss": 1.2026, + "num_input_tokens_seen": 8051560, + "step": 500 + }, + { + "epoch": 0.035094171110352256, + "grad_norm": 4.147096157073975, + "learning_rate": 9.649387740805605e-05, + "loss": 1.2465, + "num_input_tokens_seen": 8067944, + "step": 501 + }, + { + "epoch": 0.0351642193560815, + "grad_norm": 4.329249858856201, + "learning_rate": 9.648687915936953e-05, + "loss": 1.2742, + "num_input_tokens_seen": 8083824, + "step": 502 + }, + { + "epoch": 0.03523426760181075, + "grad_norm": 4.404232978820801, + "learning_rate": 9.647988091068302e-05, + "loss": 1.1511, + "num_input_tokens_seen": 8100208, + "step": 503 + }, + { + "epoch": 0.03530431584753999, + "grad_norm": 4.190586090087891, + "learning_rate": 9.64728826619965e-05, + "loss": 0.9884, + "num_input_tokens_seen": 8116048, + "step": 504 + }, + { + "epoch": 0.03537436409326924, + "grad_norm": 4.262845516204834, + "learning_rate": 9.646588441330998e-05, + "loss": 1.1321, + "num_input_tokens_seen": 8132432, + "step": 505 + }, + { + "epoch": 0.035444412338998484, + "grad_norm": 4.452746391296387, + "learning_rate": 9.645888616462347e-05, + "loss": 1.1667, + "num_input_tokens_seen": 8148816, + "step": 506 + }, + { + "epoch": 0.03551446058472773, + "grad_norm": 4.111443042755127, + "learning_rate": 9.645188791593696e-05, + "loss": 1.0049, + "num_input_tokens_seen": 8164856, + "step": 507 + }, + { + "epoch": 0.035584508830456975, + "grad_norm": 4.292227268218994, + "learning_rate": 9.644488966725045e-05, + "loss": 1.1535, + "num_input_tokens_seen": 8181240, + "step": 508 + }, + { + "epoch": 0.03565455707618622, + "grad_norm": 4.295238971710205, + "learning_rate": 9.643789141856394e-05, + "loss": 1.236, + "num_input_tokens_seen": 8197624, + "step": 509 + }, + { + "epoch": 0.035724605321915466, + "grad_norm": 3.930659294128418, + "learning_rate": 9.643089316987741e-05, + "loss": 0.9195, + "num_input_tokens_seen": 8213816, + "step": 510 + }, + { + "epoch": 0.03579465356764472, + "grad_norm": 4.092316150665283, + "learning_rate": 9.64238949211909e-05, + "loss": 1.0799, + "num_input_tokens_seen": 8229632, + "step": 511 + }, + { + "epoch": 0.035864701813373964, + "grad_norm": 4.2939252853393555, + "learning_rate": 9.641689667250437e-05, + "loss": 1.111, + "num_input_tokens_seen": 8245232, + "step": 512 + }, + { + "epoch": 0.03593475005910321, + "grad_norm": 4.191503524780273, + "learning_rate": 9.640989842381786e-05, + "loss": 0.9399, + "num_input_tokens_seen": 8260912, + "step": 513 + }, + { + "epoch": 0.036004798304832455, + "grad_norm": 4.141485214233398, + "learning_rate": 9.640290017513136e-05, + "loss": 1.1334, + "num_input_tokens_seen": 8276864, + "step": 514 + }, + { + "epoch": 0.0360748465505617, + "grad_norm": 3.890547752380371, + "learning_rate": 9.639590192644484e-05, + "loss": 1.0055, + "num_input_tokens_seen": 8292720, + "step": 515 + }, + { + "epoch": 0.03614489479629095, + "grad_norm": 4.405922889709473, + "learning_rate": 9.638890367775833e-05, + "loss": 1.2238, + "num_input_tokens_seen": 8309104, + "step": 516 + }, + { + "epoch": 0.03621494304202019, + "grad_norm": 4.207942485809326, + "learning_rate": 9.63819054290718e-05, + "loss": 1.0688, + "num_input_tokens_seen": 8325304, + "step": 517 + }, + { + "epoch": 0.03628499128774944, + "grad_norm": 4.174366474151611, + "learning_rate": 9.637490718038529e-05, + "loss": 1.2303, + "num_input_tokens_seen": 8341688, + "step": 518 + }, + { + "epoch": 0.036355039533478684, + "grad_norm": 3.9641714096069336, + "learning_rate": 9.636790893169878e-05, + "loss": 1.2244, + "num_input_tokens_seen": 8357760, + "step": 519 + }, + { + "epoch": 0.03642508777920793, + "grad_norm": 5.832678318023682, + "learning_rate": 9.636091068301227e-05, + "loss": 1.0645, + "num_input_tokens_seen": 8372712, + "step": 520 + }, + { + "epoch": 0.036495136024937175, + "grad_norm": 3.7905161380767822, + "learning_rate": 9.635391243432576e-05, + "loss": 1.0551, + "num_input_tokens_seen": 8389096, + "step": 521 + }, + { + "epoch": 0.03656518427066642, + "grad_norm": 3.6744072437286377, + "learning_rate": 9.634691418563923e-05, + "loss": 1.0687, + "num_input_tokens_seen": 8405216, + "step": 522 + }, + { + "epoch": 0.036635232516395666, + "grad_norm": 4.897486209869385, + "learning_rate": 9.633991593695272e-05, + "loss": 1.1968, + "num_input_tokens_seen": 8421600, + "step": 523 + }, + { + "epoch": 0.03670528076212491, + "grad_norm": 3.821457862854004, + "learning_rate": 9.63329176882662e-05, + "loss": 1.0473, + "num_input_tokens_seen": 8437984, + "step": 524 + }, + { + "epoch": 0.03677532900785416, + "grad_norm": 3.873832941055298, + "learning_rate": 9.632591943957969e-05, + "loss": 0.9656, + "num_input_tokens_seen": 8453760, + "step": 525 + }, + { + "epoch": 0.0368453772535834, + "grad_norm": 4.139901161193848, + "learning_rate": 9.631892119089317e-05, + "loss": 1.0881, + "num_input_tokens_seen": 8470144, + "step": 526 + }, + { + "epoch": 0.03691542549931265, + "grad_norm": 3.9512782096862793, + "learning_rate": 9.631192294220666e-05, + "loss": 1.1093, + "num_input_tokens_seen": 8486528, + "step": 527 + }, + { + "epoch": 0.0369854737450419, + "grad_norm": 3.8937103748321533, + "learning_rate": 9.630492469352015e-05, + "loss": 0.9722, + "num_input_tokens_seen": 8502912, + "step": 528 + }, + { + "epoch": 0.03705552199077115, + "grad_norm": 4.482640743255615, + "learning_rate": 9.629792644483363e-05, + "loss": 1.056, + "num_input_tokens_seen": 8519296, + "step": 529 + }, + { + "epoch": 0.03712557023650039, + "grad_norm": 4.127941131591797, + "learning_rate": 9.629092819614711e-05, + "loss": 1.0285, + "num_input_tokens_seen": 8535160, + "step": 530 + }, + { + "epoch": 0.03719561848222964, + "grad_norm": 3.973585844039917, + "learning_rate": 9.62839299474606e-05, + "loss": 1.0356, + "num_input_tokens_seen": 8551256, + "step": 531 + }, + { + "epoch": 0.037265666727958884, + "grad_norm": 4.22855281829834, + "learning_rate": 9.627693169877408e-05, + "loss": 1.134, + "num_input_tokens_seen": 8567640, + "step": 532 + }, + { + "epoch": 0.03733571497368813, + "grad_norm": 4.144021511077881, + "learning_rate": 9.626993345008757e-05, + "loss": 1.0963, + "num_input_tokens_seen": 8583504, + "step": 533 + }, + { + "epoch": 0.037405763219417375, + "grad_norm": 3.8666226863861084, + "learning_rate": 9.626293520140106e-05, + "loss": 0.912, + "num_input_tokens_seen": 8599888, + "step": 534 + }, + { + "epoch": 0.03747581146514662, + "grad_norm": 4.215412616729736, + "learning_rate": 9.625593695271454e-05, + "loss": 1.1055, + "num_input_tokens_seen": 8616256, + "step": 535 + }, + { + "epoch": 0.037545859710875866, + "grad_norm": 4.353022575378418, + "learning_rate": 9.624893870402803e-05, + "loss": 1.0379, + "num_input_tokens_seen": 8632640, + "step": 536 + }, + { + "epoch": 0.03761590795660511, + "grad_norm": 3.778947591781616, + "learning_rate": 9.624194045534151e-05, + "loss": 1.0547, + "num_input_tokens_seen": 8648624, + "step": 537 + }, + { + "epoch": 0.03768595620233436, + "grad_norm": 4.481568336486816, + "learning_rate": 9.6234942206655e-05, + "loss": 1.3407, + "num_input_tokens_seen": 8664200, + "step": 538 + }, + { + "epoch": 0.0377560044480636, + "grad_norm": 4.066302299499512, + "learning_rate": 9.622794395796847e-05, + "loss": 0.995, + "num_input_tokens_seen": 8680584, + "step": 539 + }, + { + "epoch": 0.03782605269379285, + "grad_norm": 4.262768268585205, + "learning_rate": 9.622094570928197e-05, + "loss": 1.3054, + "num_input_tokens_seen": 8696968, + "step": 540 + }, + { + "epoch": 0.037896100939522094, + "grad_norm": 3.777597665786743, + "learning_rate": 9.621394746059546e-05, + "loss": 0.9831, + "num_input_tokens_seen": 8713352, + "step": 541 + }, + { + "epoch": 0.03796614918525134, + "grad_norm": 3.9732742309570312, + "learning_rate": 9.620694921190894e-05, + "loss": 1.0699, + "num_input_tokens_seen": 8729048, + "step": 542 + }, + { + "epoch": 0.038036197430980585, + "grad_norm": 4.543329238891602, + "learning_rate": 9.619995096322243e-05, + "loss": 1.1546, + "num_input_tokens_seen": 8745432, + "step": 543 + }, + { + "epoch": 0.03810624567670983, + "grad_norm": 4.903865814208984, + "learning_rate": 9.61929527145359e-05, + "loss": 1.1548, + "num_input_tokens_seen": 8760296, + "step": 544 + }, + { + "epoch": 0.03817629392243908, + "grad_norm": 4.197691917419434, + "learning_rate": 9.618595446584939e-05, + "loss": 1.1616, + "num_input_tokens_seen": 8776680, + "step": 545 + }, + { + "epoch": 0.03824634216816833, + "grad_norm": 3.912689208984375, + "learning_rate": 9.617895621716288e-05, + "loss": 0.9926, + "num_input_tokens_seen": 8793064, + "step": 546 + }, + { + "epoch": 0.038316390413897575, + "grad_norm": 4.291840076446533, + "learning_rate": 9.617195796847637e-05, + "loss": 1.1943, + "num_input_tokens_seen": 8809448, + "step": 547 + }, + { + "epoch": 0.03838643865962682, + "grad_norm": 3.9053072929382324, + "learning_rate": 9.616495971978985e-05, + "loss": 1.2437, + "num_input_tokens_seen": 8825536, + "step": 548 + }, + { + "epoch": 0.038456486905356066, + "grad_norm": 4.860696315765381, + "learning_rate": 9.615796147110333e-05, + "loss": 1.3045, + "num_input_tokens_seen": 8841920, + "step": 549 + }, + { + "epoch": 0.03852653515108531, + "grad_norm": 3.9394373893737793, + "learning_rate": 9.615096322241682e-05, + "loss": 1.1367, + "num_input_tokens_seen": 8858304, + "step": 550 + }, + { + "epoch": 0.03859658339681456, + "grad_norm": 3.8160409927368164, + "learning_rate": 9.61439649737303e-05, + "loss": 1.0864, + "num_input_tokens_seen": 8874688, + "step": 551 + }, + { + "epoch": 0.0386666316425438, + "grad_norm": 4.3792805671691895, + "learning_rate": 9.613696672504378e-05, + "loss": 1.2516, + "num_input_tokens_seen": 8891072, + "step": 552 + }, + { + "epoch": 0.03873667988827305, + "grad_norm": 4.103452682495117, + "learning_rate": 9.612996847635727e-05, + "loss": 0.9737, + "num_input_tokens_seen": 8907456, + "step": 553 + }, + { + "epoch": 0.038806728134002294, + "grad_norm": 4.117603302001953, + "learning_rate": 9.612297022767076e-05, + "loss": 1.096, + "num_input_tokens_seen": 8923816, + "step": 554 + }, + { + "epoch": 0.03887677637973154, + "grad_norm": 4.272468566894531, + "learning_rate": 9.611597197898425e-05, + "loss": 1.161, + "num_input_tokens_seen": 8939344, + "step": 555 + }, + { + "epoch": 0.038946824625460785, + "grad_norm": 4.323635578155518, + "learning_rate": 9.610897373029772e-05, + "loss": 1.1922, + "num_input_tokens_seen": 8954920, + "step": 556 + }, + { + "epoch": 0.03901687287119003, + "grad_norm": 3.783510684967041, + "learning_rate": 9.610197548161121e-05, + "loss": 1.0658, + "num_input_tokens_seen": 8971304, + "step": 557 + }, + { + "epoch": 0.039086921116919277, + "grad_norm": 4.3757548332214355, + "learning_rate": 9.60949772329247e-05, + "loss": 1.3186, + "num_input_tokens_seen": 8987672, + "step": 558 + }, + { + "epoch": 0.03915696936264852, + "grad_norm": 4.048824787139893, + "learning_rate": 9.608797898423818e-05, + "loss": 1.1452, + "num_input_tokens_seen": 9003896, + "step": 559 + }, + { + "epoch": 0.03922701760837777, + "grad_norm": 4.06865930557251, + "learning_rate": 9.608098073555168e-05, + "loss": 0.9861, + "num_input_tokens_seen": 9020280, + "step": 560 + }, + { + "epoch": 0.03929706585410701, + "grad_norm": 3.966737747192383, + "learning_rate": 9.607398248686515e-05, + "loss": 1.0323, + "num_input_tokens_seen": 9036280, + "step": 561 + }, + { + "epoch": 0.03936711409983626, + "grad_norm": 4.466656684875488, + "learning_rate": 9.606698423817864e-05, + "loss": 1.2462, + "num_input_tokens_seen": 9052664, + "step": 562 + }, + { + "epoch": 0.03943716234556551, + "grad_norm": 4.312132358551025, + "learning_rate": 9.605998598949213e-05, + "loss": 1.2133, + "num_input_tokens_seen": 9068832, + "step": 563 + }, + { + "epoch": 0.03950721059129476, + "grad_norm": 3.9202895164489746, + "learning_rate": 9.60529877408056e-05, + "loss": 1.0723, + "num_input_tokens_seen": 9084680, + "step": 564 + }, + { + "epoch": 0.039577258837024, + "grad_norm": 5.139899730682373, + "learning_rate": 9.604598949211909e-05, + "loss": 1.1165, + "num_input_tokens_seen": 9099792, + "step": 565 + }, + { + "epoch": 0.03964730708275325, + "grad_norm": 4.398557186126709, + "learning_rate": 9.603899124343258e-05, + "loss": 1.1737, + "num_input_tokens_seen": 9116136, + "step": 566 + }, + { + "epoch": 0.039717355328482494, + "grad_norm": 4.350982666015625, + "learning_rate": 9.603199299474607e-05, + "loss": 1.2174, + "num_input_tokens_seen": 9132520, + "step": 567 + }, + { + "epoch": 0.03978740357421174, + "grad_norm": 3.787644386291504, + "learning_rate": 9.602499474605956e-05, + "loss": 0.9914, + "num_input_tokens_seen": 9148856, + "step": 568 + }, + { + "epoch": 0.039857451819940985, + "grad_norm": 4.630245685577393, + "learning_rate": 9.601799649737303e-05, + "loss": 1.4135, + "num_input_tokens_seen": 9164888, + "step": 569 + }, + { + "epoch": 0.03992750006567023, + "grad_norm": 4.063969135284424, + "learning_rate": 9.601099824868652e-05, + "loss": 1.1312, + "num_input_tokens_seen": 9181272, + "step": 570 + }, + { + "epoch": 0.039997548311399476, + "grad_norm": 4.2443413734436035, + "learning_rate": 9.6004e-05, + "loss": 1.1627, + "num_input_tokens_seen": 9197344, + "step": 571 + }, + { + "epoch": 0.04006759655712872, + "grad_norm": 4.396352767944336, + "learning_rate": 9.599700175131349e-05, + "loss": 1.1222, + "num_input_tokens_seen": 9212312, + "step": 572 + }, + { + "epoch": 0.04013764480285797, + "grad_norm": 4.364585876464844, + "learning_rate": 9.599000350262697e-05, + "loss": 1.0522, + "num_input_tokens_seen": 9228696, + "step": 573 + }, + { + "epoch": 0.04020769304858721, + "grad_norm": 3.9348409175872803, + "learning_rate": 9.598300525394046e-05, + "loss": 1.1375, + "num_input_tokens_seen": 9245080, + "step": 574 + }, + { + "epoch": 0.04027774129431646, + "grad_norm": 4.051416873931885, + "learning_rate": 9.597600700525395e-05, + "loss": 1.0265, + "num_input_tokens_seen": 9260752, + "step": 575 + }, + { + "epoch": 0.040347789540045705, + "grad_norm": 4.661770820617676, + "learning_rate": 9.596900875656743e-05, + "loss": 1.192, + "num_input_tokens_seen": 9276792, + "step": 576 + }, + { + "epoch": 0.04041783778577495, + "grad_norm": 4.378422260284424, + "learning_rate": 9.596201050788092e-05, + "loss": 1.0497, + "num_input_tokens_seen": 9292768, + "step": 577 + }, + { + "epoch": 0.040487886031504196, + "grad_norm": 4.4690399169921875, + "learning_rate": 9.595501225919439e-05, + "loss": 1.2398, + "num_input_tokens_seen": 9309152, + "step": 578 + }, + { + "epoch": 0.04055793427723344, + "grad_norm": 4.1711273193359375, + "learning_rate": 9.594801401050788e-05, + "loss": 1.097, + "num_input_tokens_seen": 9325536, + "step": 579 + }, + { + "epoch": 0.040627982522962694, + "grad_norm": 3.8115949630737305, + "learning_rate": 9.594101576182137e-05, + "loss": 1.0317, + "num_input_tokens_seen": 9341920, + "step": 580 + }, + { + "epoch": 0.04069803076869194, + "grad_norm": 4.072190284729004, + "learning_rate": 9.593401751313486e-05, + "loss": 1.0649, + "num_input_tokens_seen": 9357904, + "step": 581 + }, + { + "epoch": 0.040768079014421185, + "grad_norm": 3.895766258239746, + "learning_rate": 9.592701926444835e-05, + "loss": 1.1906, + "num_input_tokens_seen": 9373496, + "step": 582 + }, + { + "epoch": 0.04083812726015043, + "grad_norm": 4.026490688323975, + "learning_rate": 9.592002101576182e-05, + "loss": 0.9913, + "num_input_tokens_seen": 9389824, + "step": 583 + }, + { + "epoch": 0.040908175505879676, + "grad_norm": 3.612987518310547, + "learning_rate": 9.591302276707531e-05, + "loss": 0.9376, + "num_input_tokens_seen": 9406208, + "step": 584 + }, + { + "epoch": 0.04097822375160892, + "grad_norm": 4.4619646072387695, + "learning_rate": 9.59060245183888e-05, + "loss": 1.2198, + "num_input_tokens_seen": 9422592, + "step": 585 + }, + { + "epoch": 0.04104827199733817, + "grad_norm": 3.990372896194458, + "learning_rate": 9.589902626970229e-05, + "loss": 1.082, + "num_input_tokens_seen": 9438816, + "step": 586 + }, + { + "epoch": 0.04111832024306741, + "grad_norm": 3.7697947025299072, + "learning_rate": 9.589202802101577e-05, + "loss": 1.0173, + "num_input_tokens_seen": 9455200, + "step": 587 + }, + { + "epoch": 0.04118836848879666, + "grad_norm": 4.066056728363037, + "learning_rate": 9.588502977232925e-05, + "loss": 1.124, + "num_input_tokens_seen": 9471320, + "step": 588 + }, + { + "epoch": 0.041258416734525905, + "grad_norm": 3.913506507873535, + "learning_rate": 9.587803152364274e-05, + "loss": 1.0501, + "num_input_tokens_seen": 9487304, + "step": 589 + }, + { + "epoch": 0.04132846498025515, + "grad_norm": 3.9049429893493652, + "learning_rate": 9.587103327495623e-05, + "loss": 1.0563, + "num_input_tokens_seen": 9503688, + "step": 590 + }, + { + "epoch": 0.041398513225984396, + "grad_norm": 4.316978454589844, + "learning_rate": 9.58640350262697e-05, + "loss": 1.1333, + "num_input_tokens_seen": 9519488, + "step": 591 + }, + { + "epoch": 0.04146856147171364, + "grad_norm": 3.7818517684936523, + "learning_rate": 9.585703677758319e-05, + "loss": 1.0537, + "num_input_tokens_seen": 9535872, + "step": 592 + }, + { + "epoch": 0.04153860971744289, + "grad_norm": 3.8751401901245117, + "learning_rate": 9.585003852889668e-05, + "loss": 1.1745, + "num_input_tokens_seen": 9551928, + "step": 593 + }, + { + "epoch": 0.04160865796317213, + "grad_norm": 4.357265949249268, + "learning_rate": 9.584304028021017e-05, + "loss": 1.1154, + "num_input_tokens_seen": 9568312, + "step": 594 + }, + { + "epoch": 0.04167870620890138, + "grad_norm": 4.184159755706787, + "learning_rate": 9.583604203152366e-05, + "loss": 1.125, + "num_input_tokens_seen": 9583968, + "step": 595 + }, + { + "epoch": 0.041748754454630624, + "grad_norm": 3.9540369510650635, + "learning_rate": 9.582904378283713e-05, + "loss": 1.2032, + "num_input_tokens_seen": 9600152, + "step": 596 + }, + { + "epoch": 0.04181880270035987, + "grad_norm": 4.401122093200684, + "learning_rate": 9.582204553415062e-05, + "loss": 1.4808, + "num_input_tokens_seen": 9615632, + "step": 597 + }, + { + "epoch": 0.04188885094608912, + "grad_norm": 4.418131351470947, + "learning_rate": 9.58150472854641e-05, + "loss": 1.0077, + "num_input_tokens_seen": 9631712, + "step": 598 + }, + { + "epoch": 0.04195889919181837, + "grad_norm": 4.362226963043213, + "learning_rate": 9.580804903677758e-05, + "loss": 1.1614, + "num_input_tokens_seen": 9648096, + "step": 599 + }, + { + "epoch": 0.04202894743754761, + "grad_norm": 4.051177024841309, + "learning_rate": 9.580105078809107e-05, + "loss": 1.0718, + "num_input_tokens_seen": 9663792, + "step": 600 + }, + { + "epoch": 0.04202894743754761, + "eval_loss": 1.1809133291244507, + "eval_runtime": 0.2062, + "eval_samples_per_second": 4.849, + "eval_steps_per_second": 4.849, + "num_input_tokens_seen": 9663792, + "step": 600 + }, + { + "epoch": 0.04209899568327686, + "grad_norm": 4.478739261627197, + "learning_rate": 9.579405253940456e-05, + "loss": 1.1963, + "num_input_tokens_seen": 9680176, + "step": 601 + }, + { + "epoch": 0.042169043929006104, + "grad_norm": 4.05004358291626, + "learning_rate": 9.578705429071805e-05, + "loss": 1.1005, + "num_input_tokens_seen": 9696560, + "step": 602 + }, + { + "epoch": 0.04223909217473535, + "grad_norm": 4.092396259307861, + "learning_rate": 9.578005604203152e-05, + "loss": 1.1796, + "num_input_tokens_seen": 9712944, + "step": 603 + }, + { + "epoch": 0.042309140420464596, + "grad_norm": 4.428014278411865, + "learning_rate": 9.577305779334501e-05, + "loss": 0.9734, + "num_input_tokens_seen": 9729096, + "step": 604 + }, + { + "epoch": 0.04237918866619384, + "grad_norm": 4.202315807342529, + "learning_rate": 9.576605954465849e-05, + "loss": 1.0502, + "num_input_tokens_seen": 9745480, + "step": 605 + }, + { + "epoch": 0.04244923691192309, + "grad_norm": 3.7633514404296875, + "learning_rate": 9.575906129597198e-05, + "loss": 0.9218, + "num_input_tokens_seen": 9761272, + "step": 606 + }, + { + "epoch": 0.04251928515765233, + "grad_norm": 4.170671463012695, + "learning_rate": 9.575206304728548e-05, + "loss": 1.1196, + "num_input_tokens_seen": 9777656, + "step": 607 + }, + { + "epoch": 0.04258933340338158, + "grad_norm": 4.20021915435791, + "learning_rate": 9.574506479859895e-05, + "loss": 1.1146, + "num_input_tokens_seen": 9794032, + "step": 608 + }, + { + "epoch": 0.042659381649110824, + "grad_norm": 4.437755107879639, + "learning_rate": 9.573806654991244e-05, + "loss": 1.0911, + "num_input_tokens_seen": 9809936, + "step": 609 + }, + { + "epoch": 0.04272942989484007, + "grad_norm": 4.417452335357666, + "learning_rate": 9.573106830122592e-05, + "loss": 1.2079, + "num_input_tokens_seen": 9825232, + "step": 610 + }, + { + "epoch": 0.042799478140569315, + "grad_norm": 4.144030570983887, + "learning_rate": 9.57240700525394e-05, + "loss": 1.1229, + "num_input_tokens_seen": 9840648, + "step": 611 + }, + { + "epoch": 0.04286952638629856, + "grad_norm": 3.991605043411255, + "learning_rate": 9.57170718038529e-05, + "loss": 1.0762, + "num_input_tokens_seen": 9857032, + "step": 612 + }, + { + "epoch": 0.042939574632027806, + "grad_norm": 4.516556262969971, + "learning_rate": 9.571007355516638e-05, + "loss": 1.3056, + "num_input_tokens_seen": 9872328, + "step": 613 + }, + { + "epoch": 0.04300962287775705, + "grad_norm": 4.030200481414795, + "learning_rate": 9.570307530647987e-05, + "loss": 0.9493, + "num_input_tokens_seen": 9887832, + "step": 614 + }, + { + "epoch": 0.043079671123486304, + "grad_norm": 4.345893859863281, + "learning_rate": 9.569607705779335e-05, + "loss": 1.2707, + "num_input_tokens_seen": 9904216, + "step": 615 + }, + { + "epoch": 0.04314971936921555, + "grad_norm": 4.158145427703857, + "learning_rate": 9.568907880910684e-05, + "loss": 1.0377, + "num_input_tokens_seen": 9920072, + "step": 616 + }, + { + "epoch": 0.043219767614944796, + "grad_norm": 4.155702590942383, + "learning_rate": 9.568208056042032e-05, + "loss": 1.091, + "num_input_tokens_seen": 9936416, + "step": 617 + }, + { + "epoch": 0.04328981586067404, + "grad_norm": 3.76328444480896, + "learning_rate": 9.56750823117338e-05, + "loss": 1.1011, + "num_input_tokens_seen": 9952456, + "step": 618 + }, + { + "epoch": 0.04335986410640329, + "grad_norm": 4.252495765686035, + "learning_rate": 9.566808406304729e-05, + "loss": 1.0616, + "num_input_tokens_seen": 9968608, + "step": 619 + }, + { + "epoch": 0.04342991235213253, + "grad_norm": 9.254091262817383, + "learning_rate": 9.566108581436078e-05, + "loss": 1.0315, + "num_input_tokens_seen": 9983016, + "step": 620 + }, + { + "epoch": 0.04349996059786178, + "grad_norm": 4.028343200683594, + "learning_rate": 9.565408756567426e-05, + "loss": 1.0667, + "num_input_tokens_seen": 9999400, + "step": 621 + }, + { + "epoch": 0.043570008843591024, + "grad_norm": 4.051328659057617, + "learning_rate": 9.564708931698775e-05, + "loss": 1.1375, + "num_input_tokens_seen": 10015384, + "step": 622 + }, + { + "epoch": 0.04364005708932027, + "grad_norm": 4.495016098022461, + "learning_rate": 9.564009106830123e-05, + "loss": 1.0691, + "num_input_tokens_seen": 10031152, + "step": 623 + }, + { + "epoch": 0.043710105335049515, + "grad_norm": 4.876840114593506, + "learning_rate": 9.563309281961472e-05, + "loss": 1.17, + "num_input_tokens_seen": 10047536, + "step": 624 + }, + { + "epoch": 0.04378015358077876, + "grad_norm": 4.407329559326172, + "learning_rate": 9.562609457092819e-05, + "loss": 1.2381, + "num_input_tokens_seen": 10063920, + "step": 625 + }, + { + "epoch": 0.043850201826508006, + "grad_norm": 4.161394119262695, + "learning_rate": 9.561909632224168e-05, + "loss": 1.0903, + "num_input_tokens_seen": 10079024, + "step": 626 + }, + { + "epoch": 0.04392025007223725, + "grad_norm": 4.382974624633789, + "learning_rate": 9.561209807355518e-05, + "loss": 1.3156, + "num_input_tokens_seen": 10095408, + "step": 627 + }, + { + "epoch": 0.0439902983179665, + "grad_norm": 4.004157543182373, + "learning_rate": 9.560509982486866e-05, + "loss": 1.1333, + "num_input_tokens_seen": 10111792, + "step": 628 + }, + { + "epoch": 0.04406034656369574, + "grad_norm": 3.9019265174865723, + "learning_rate": 9.559810157618215e-05, + "loss": 1.0948, + "num_input_tokens_seen": 10128144, + "step": 629 + }, + { + "epoch": 0.04413039480942499, + "grad_norm": 4.410470485687256, + "learning_rate": 9.559110332749562e-05, + "loss": 1.3219, + "num_input_tokens_seen": 10144288, + "step": 630 + }, + { + "epoch": 0.044200443055154234, + "grad_norm": 4.233544826507568, + "learning_rate": 9.558410507880911e-05, + "loss": 0.999, + "num_input_tokens_seen": 10160296, + "step": 631 + }, + { + "epoch": 0.04427049130088349, + "grad_norm": 4.120091438293457, + "learning_rate": 9.557710683012258e-05, + "loss": 1.0166, + "num_input_tokens_seen": 10176680, + "step": 632 + }, + { + "epoch": 0.04434053954661273, + "grad_norm": 5.061972618103027, + "learning_rate": 9.557010858143609e-05, + "loss": 1.251, + "num_input_tokens_seen": 10192088, + "step": 633 + }, + { + "epoch": 0.04441058779234198, + "grad_norm": 4.3690948486328125, + "learning_rate": 9.556311033274958e-05, + "loss": 1.2113, + "num_input_tokens_seen": 10208472, + "step": 634 + }, + { + "epoch": 0.044480636038071224, + "grad_norm": 3.798710346221924, + "learning_rate": 9.555611208406305e-05, + "loss": 1.0286, + "num_input_tokens_seen": 10224856, + "step": 635 + }, + { + "epoch": 0.04455068428380047, + "grad_norm": 4.41818380355835, + "learning_rate": 9.554911383537654e-05, + "loss": 1.14, + "num_input_tokens_seen": 10241200, + "step": 636 + }, + { + "epoch": 0.044620732529529715, + "grad_norm": 4.256262302398682, + "learning_rate": 9.554211558669001e-05, + "loss": 1.3103, + "num_input_tokens_seen": 10257584, + "step": 637 + }, + { + "epoch": 0.04469078077525896, + "grad_norm": 4.176064968109131, + "learning_rate": 9.55351173380035e-05, + "loss": 1.1985, + "num_input_tokens_seen": 10273760, + "step": 638 + }, + { + "epoch": 0.044760829020988206, + "grad_norm": 3.9971530437469482, + "learning_rate": 9.552811908931699e-05, + "loss": 1.1579, + "num_input_tokens_seen": 10290144, + "step": 639 + }, + { + "epoch": 0.04483087726671745, + "grad_norm": 4.150514602661133, + "learning_rate": 9.552112084063048e-05, + "loss": 1.1144, + "num_input_tokens_seen": 10306528, + "step": 640 + }, + { + "epoch": 0.0449009255124467, + "grad_norm": 4.1868367195129395, + "learning_rate": 9.551412259194397e-05, + "loss": 1.0099, + "num_input_tokens_seen": 10322480, + "step": 641 + }, + { + "epoch": 0.04497097375817594, + "grad_norm": 4.409821510314941, + "learning_rate": 9.550712434325744e-05, + "loss": 1.2574, + "num_input_tokens_seen": 10338864, + "step": 642 + }, + { + "epoch": 0.04504102200390519, + "grad_norm": 4.500023365020752, + "learning_rate": 9.550012609457093e-05, + "loss": 1.35, + "num_input_tokens_seen": 10355072, + "step": 643 + }, + { + "epoch": 0.045111070249634434, + "grad_norm": 10.278129577636719, + "learning_rate": 9.549312784588442e-05, + "loss": 1.0618, + "num_input_tokens_seen": 10371456, + "step": 644 + }, + { + "epoch": 0.04518111849536368, + "grad_norm": 3.9800543785095215, + "learning_rate": 9.54861295971979e-05, + "loss": 1.0341, + "num_input_tokens_seen": 10387720, + "step": 645 + }, + { + "epoch": 0.045251166741092926, + "grad_norm": 3.855720281600952, + "learning_rate": 9.547913134851138e-05, + "loss": 1.1323, + "num_input_tokens_seen": 10403936, + "step": 646 + }, + { + "epoch": 0.04532121498682217, + "grad_norm": 4.719264984130859, + "learning_rate": 9.547213309982487e-05, + "loss": 1.1407, + "num_input_tokens_seen": 10420320, + "step": 647 + }, + { + "epoch": 0.04539126323255142, + "grad_norm": 4.6528167724609375, + "learning_rate": 9.546513485113836e-05, + "loss": 1.1014, + "num_input_tokens_seen": 10436704, + "step": 648 + }, + { + "epoch": 0.04546131147828066, + "grad_norm": 4.0597028732299805, + "learning_rate": 9.545813660245185e-05, + "loss": 1.116, + "num_input_tokens_seen": 10452592, + "step": 649 + }, + { + "epoch": 0.045531359724009915, + "grad_norm": 4.161896705627441, + "learning_rate": 9.545113835376533e-05, + "loss": 1.1373, + "num_input_tokens_seen": 10468976, + "step": 650 + }, + { + "epoch": 0.04560140796973916, + "grad_norm": 4.125041961669922, + "learning_rate": 9.544414010507881e-05, + "loss": 1.0947, + "num_input_tokens_seen": 10484584, + "step": 651 + }, + { + "epoch": 0.045671456215468406, + "grad_norm": 4.278462886810303, + "learning_rate": 9.543714185639229e-05, + "loss": 1.1369, + "num_input_tokens_seen": 10500504, + "step": 652 + }, + { + "epoch": 0.04574150446119765, + "grad_norm": 4.766538619995117, + "learning_rate": 9.543014360770579e-05, + "loss": 1.1876, + "num_input_tokens_seen": 10516472, + "step": 653 + }, + { + "epoch": 0.0458115527069269, + "grad_norm": 4.457921504974365, + "learning_rate": 9.542314535901928e-05, + "loss": 1.0788, + "num_input_tokens_seen": 10532272, + "step": 654 + }, + { + "epoch": 0.04588160095265614, + "grad_norm": 5.021823883056641, + "learning_rate": 9.541614711033275e-05, + "loss": 1.1152, + "num_input_tokens_seen": 10547696, + "step": 655 + }, + { + "epoch": 0.04595164919838539, + "grad_norm": 4.407228469848633, + "learning_rate": 9.540914886164624e-05, + "loss": 1.0863, + "num_input_tokens_seen": 10564080, + "step": 656 + }, + { + "epoch": 0.046021697444114634, + "grad_norm": 3.9986062049865723, + "learning_rate": 9.540215061295972e-05, + "loss": 1.1624, + "num_input_tokens_seen": 10580464, + "step": 657 + }, + { + "epoch": 0.04609174568984388, + "grad_norm": 7.9165191650390625, + "learning_rate": 9.539515236427321e-05, + "loss": 1.0809, + "num_input_tokens_seen": 10595336, + "step": 658 + }, + { + "epoch": 0.046161793935573125, + "grad_norm": 4.357856273651123, + "learning_rate": 9.53881541155867e-05, + "loss": 1.0324, + "num_input_tokens_seen": 10611720, + "step": 659 + }, + { + "epoch": 0.04623184218130237, + "grad_norm": 3.8115761280059814, + "learning_rate": 9.538115586690018e-05, + "loss": 1.1499, + "num_input_tokens_seen": 10628104, + "step": 660 + }, + { + "epoch": 0.04630189042703162, + "grad_norm": 3.879671096801758, + "learning_rate": 9.537415761821367e-05, + "loss": 1.0474, + "num_input_tokens_seen": 10644096, + "step": 661 + }, + { + "epoch": 0.04637193867276086, + "grad_norm": 4.324586391448975, + "learning_rate": 9.536715936952715e-05, + "loss": 1.1904, + "num_input_tokens_seen": 10659408, + "step": 662 + }, + { + "epoch": 0.04644198691849011, + "grad_norm": 4.020029067993164, + "learning_rate": 9.536016112084064e-05, + "loss": 1.0848, + "num_input_tokens_seen": 10675792, + "step": 663 + }, + { + "epoch": 0.046512035164219354, + "grad_norm": 4.563455581665039, + "learning_rate": 9.535316287215411e-05, + "loss": 1.1735, + "num_input_tokens_seen": 10691632, + "step": 664 + }, + { + "epoch": 0.0465820834099486, + "grad_norm": 4.444424629211426, + "learning_rate": 9.53461646234676e-05, + "loss": 1.258, + "num_input_tokens_seen": 10708016, + "step": 665 + }, + { + "epoch": 0.046652131655677845, + "grad_norm": 3.9864089488983154, + "learning_rate": 9.533916637478109e-05, + "loss": 1.1315, + "num_input_tokens_seen": 10724176, + "step": 666 + }, + { + "epoch": 0.0467221799014071, + "grad_norm": 4.860849857330322, + "learning_rate": 9.533216812609458e-05, + "loss": 1.2276, + "num_input_tokens_seen": 10740560, + "step": 667 + }, + { + "epoch": 0.04679222814713634, + "grad_norm": 3.9701120853424072, + "learning_rate": 9.532516987740807e-05, + "loss": 1.1406, + "num_input_tokens_seen": 10756864, + "step": 668 + }, + { + "epoch": 0.04686227639286559, + "grad_norm": 3.660257577896118, + "learning_rate": 9.531817162872154e-05, + "loss": 1.0182, + "num_input_tokens_seen": 10773248, + "step": 669 + }, + { + "epoch": 0.046932324638594834, + "grad_norm": 3.888510227203369, + "learning_rate": 9.531117338003503e-05, + "loss": 1.0223, + "num_input_tokens_seen": 10789632, + "step": 670 + }, + { + "epoch": 0.04700237288432408, + "grad_norm": 4.794105052947998, + "learning_rate": 9.530417513134852e-05, + "loss": 1.0565, + "num_input_tokens_seen": 10804496, + "step": 671 + }, + { + "epoch": 0.047072421130053325, + "grad_norm": 4.293116092681885, + "learning_rate": 9.529717688266199e-05, + "loss": 1.2509, + "num_input_tokens_seen": 10819976, + "step": 672 + }, + { + "epoch": 0.04714246937578257, + "grad_norm": 5.112069129943848, + "learning_rate": 9.52901786339755e-05, + "loss": 1.0964, + "num_input_tokens_seen": 10836360, + "step": 673 + }, + { + "epoch": 0.04721251762151182, + "grad_norm": 3.9091360569000244, + "learning_rate": 9.528318038528897e-05, + "loss": 1.0647, + "num_input_tokens_seen": 10852744, + "step": 674 + }, + { + "epoch": 0.04728256586724106, + "grad_norm": 4.032161235809326, + "learning_rate": 9.527618213660246e-05, + "loss": 1.2362, + "num_input_tokens_seen": 10868928, + "step": 675 + }, + { + "epoch": 0.04735261411297031, + "grad_norm": 3.931156635284424, + "learning_rate": 9.526918388791595e-05, + "loss": 1.0571, + "num_input_tokens_seen": 10884776, + "step": 676 + }, + { + "epoch": 0.047422662358699554, + "grad_norm": 3.9511048793792725, + "learning_rate": 9.526218563922942e-05, + "loss": 1.0249, + "num_input_tokens_seen": 10901160, + "step": 677 + }, + { + "epoch": 0.0474927106044288, + "grad_norm": 4.199029445648193, + "learning_rate": 9.525518739054291e-05, + "loss": 1.2813, + "num_input_tokens_seen": 10917544, + "step": 678 + }, + { + "epoch": 0.047562758850158045, + "grad_norm": 3.8590247631073, + "learning_rate": 9.52481891418564e-05, + "loss": 1.02, + "num_input_tokens_seen": 10933928, + "step": 679 + }, + { + "epoch": 0.04763280709588729, + "grad_norm": 5.530341625213623, + "learning_rate": 9.524119089316989e-05, + "loss": 1.2316, + "num_input_tokens_seen": 10949600, + "step": 680 + }, + { + "epoch": 0.047702855341616536, + "grad_norm": 4.17647123336792, + "learning_rate": 9.523419264448338e-05, + "loss": 1.2985, + "num_input_tokens_seen": 10965984, + "step": 681 + }, + { + "epoch": 0.04777290358734578, + "grad_norm": 4.250451564788818, + "learning_rate": 9.522719439579685e-05, + "loss": 1.1638, + "num_input_tokens_seen": 10982368, + "step": 682 + }, + { + "epoch": 0.04784295183307503, + "grad_norm": 4.132594108581543, + "learning_rate": 9.522019614711034e-05, + "loss": 0.9638, + "num_input_tokens_seen": 10998752, + "step": 683 + }, + { + "epoch": 0.04791300007880428, + "grad_norm": 5.863363265991211, + "learning_rate": 9.521319789842382e-05, + "loss": 1.0736, + "num_input_tokens_seen": 11014376, + "step": 684 + }, + { + "epoch": 0.047983048324533525, + "grad_norm": 3.740323543548584, + "learning_rate": 9.52061996497373e-05, + "loss": 0.9958, + "num_input_tokens_seen": 11030440, + "step": 685 + }, + { + "epoch": 0.04805309657026277, + "grad_norm": 4.927120685577393, + "learning_rate": 9.519920140105079e-05, + "loss": 1.156, + "num_input_tokens_seen": 11046824, + "step": 686 + }, + { + "epoch": 0.04812314481599202, + "grad_norm": 4.708818435668945, + "learning_rate": 9.519220315236428e-05, + "loss": 1.2139, + "num_input_tokens_seen": 11063208, + "step": 687 + }, + { + "epoch": 0.04819319306172126, + "grad_norm": 3.7547767162323, + "learning_rate": 9.518520490367777e-05, + "loss": 0.9557, + "num_input_tokens_seen": 11079592, + "step": 688 + }, + { + "epoch": 0.04826324130745051, + "grad_norm": 4.038534641265869, + "learning_rate": 9.517820665499124e-05, + "loss": 1.1124, + "num_input_tokens_seen": 11095976, + "step": 689 + }, + { + "epoch": 0.048333289553179754, + "grad_norm": 4.159554481506348, + "learning_rate": 9.517120840630473e-05, + "loss": 1.0043, + "num_input_tokens_seen": 11112360, + "step": 690 + }, + { + "epoch": 0.048403337798909, + "grad_norm": 7.104836463928223, + "learning_rate": 9.516421015761821e-05, + "loss": 0.9736, + "num_input_tokens_seen": 11127800, + "step": 691 + }, + { + "epoch": 0.048473386044638245, + "grad_norm": 4.073885917663574, + "learning_rate": 9.51572119089317e-05, + "loss": 1.1249, + "num_input_tokens_seen": 11144184, + "step": 692 + }, + { + "epoch": 0.04854343429036749, + "grad_norm": 3.7190351486206055, + "learning_rate": 9.51502136602452e-05, + "loss": 1.1035, + "num_input_tokens_seen": 11160568, + "step": 693 + }, + { + "epoch": 0.048613482536096736, + "grad_norm": 4.252142429351807, + "learning_rate": 9.514321541155867e-05, + "loss": 1.1588, + "num_input_tokens_seen": 11176952, + "step": 694 + }, + { + "epoch": 0.04868353078182598, + "grad_norm": 4.418105125427246, + "learning_rate": 9.513621716287216e-05, + "loss": 1.2496, + "num_input_tokens_seen": 11193336, + "step": 695 + }, + { + "epoch": 0.04875357902755523, + "grad_norm": 4.195918560028076, + "learning_rate": 9.512921891418564e-05, + "loss": 1.0193, + "num_input_tokens_seen": 11209720, + "step": 696 + }, + { + "epoch": 0.04882362727328447, + "grad_norm": 5.138080596923828, + "learning_rate": 9.512222066549913e-05, + "loss": 1.1861, + "num_input_tokens_seen": 11225888, + "step": 697 + }, + { + "epoch": 0.04889367551901372, + "grad_norm": 4.489223003387451, + "learning_rate": 9.511522241681261e-05, + "loss": 1.1497, + "num_input_tokens_seen": 11241744, + "step": 698 + }, + { + "epoch": 0.048963723764742964, + "grad_norm": 3.972590208053589, + "learning_rate": 9.51082241681261e-05, + "loss": 1.2765, + "num_input_tokens_seen": 11257768, + "step": 699 + }, + { + "epoch": 0.04903377201047221, + "grad_norm": 13.274886131286621, + "learning_rate": 9.510122591943959e-05, + "loss": 1.1124, + "num_input_tokens_seen": 11273216, + "step": 700 + }, + { + "epoch": 0.049103820256201455, + "grad_norm": 3.7899255752563477, + "learning_rate": 9.509422767075307e-05, + "loss": 1.0445, + "num_input_tokens_seen": 11289600, + "step": 701 + }, + { + "epoch": 0.04917386850193071, + "grad_norm": 4.226947784423828, + "learning_rate": 9.508722942206656e-05, + "loss": 1.4313, + "num_input_tokens_seen": 11305920, + "step": 702 + }, + { + "epoch": 0.049243916747659953, + "grad_norm": 4.098162651062012, + "learning_rate": 9.508023117338003e-05, + "loss": 0.952, + "num_input_tokens_seen": 11322304, + "step": 703 + }, + { + "epoch": 0.0493139649933892, + "grad_norm": 3.9205965995788574, + "learning_rate": 9.507323292469352e-05, + "loss": 1.1648, + "num_input_tokens_seen": 11338688, + "step": 704 + }, + { + "epoch": 0.049384013239118445, + "grad_norm": 4.06537389755249, + "learning_rate": 9.506623467600701e-05, + "loss": 1.1295, + "num_input_tokens_seen": 11353544, + "step": 705 + }, + { + "epoch": 0.04945406148484769, + "grad_norm": 4.309032440185547, + "learning_rate": 9.50592364273205e-05, + "loss": 1.1475, + "num_input_tokens_seen": 11369928, + "step": 706 + }, + { + "epoch": 0.049524109730576936, + "grad_norm": 4.320526599884033, + "learning_rate": 9.505223817863399e-05, + "loss": 1.0102, + "num_input_tokens_seen": 11386312, + "step": 707 + }, + { + "epoch": 0.04959415797630618, + "grad_norm": 5.025510787963867, + "learning_rate": 9.504523992994747e-05, + "loss": 1.1182, + "num_input_tokens_seen": 11402696, + "step": 708 + }, + { + "epoch": 0.04966420622203543, + "grad_norm": 3.9406464099884033, + "learning_rate": 9.503824168126095e-05, + "loss": 1.068, + "num_input_tokens_seen": 11419080, + "step": 709 + }, + { + "epoch": 0.04973425446776467, + "grad_norm": 3.9148502349853516, + "learning_rate": 9.503124343257444e-05, + "loss": 1.1062, + "num_input_tokens_seen": 11435464, + "step": 710 + }, + { + "epoch": 0.04980430271349392, + "grad_norm": 3.9386026859283447, + "learning_rate": 9.502424518388791e-05, + "loss": 0.9516, + "num_input_tokens_seen": 11451848, + "step": 711 + }, + { + "epoch": 0.049874350959223164, + "grad_norm": 3.9537665843963623, + "learning_rate": 9.50172469352014e-05, + "loss": 1.1372, + "num_input_tokens_seen": 11468216, + "step": 712 + }, + { + "epoch": 0.04994439920495241, + "grad_norm": 3.97929310798645, + "learning_rate": 9.501024868651489e-05, + "loss": 1.0705, + "num_input_tokens_seen": 11484192, + "step": 713 + }, + { + "epoch": 0.050014447450681655, + "grad_norm": 3.9326419830322266, + "learning_rate": 9.500325043782838e-05, + "loss": 1.0986, + "num_input_tokens_seen": 11500576, + "step": 714 + }, + { + "epoch": 0.0500844956964109, + "grad_norm": 3.769347667694092, + "learning_rate": 9.499625218914187e-05, + "loss": 0.9265, + "num_input_tokens_seen": 11516960, + "step": 715 + }, + { + "epoch": 0.050154543942140146, + "grad_norm": 4.264547348022461, + "learning_rate": 9.498925394045534e-05, + "loss": 1.3166, + "num_input_tokens_seen": 11532616, + "step": 716 + }, + { + "epoch": 0.05022459218786939, + "grad_norm": 4.885791778564453, + "learning_rate": 9.498225569176883e-05, + "loss": 1.0669, + "num_input_tokens_seen": 11548552, + "step": 717 + }, + { + "epoch": 0.05029464043359864, + "grad_norm": 5.4089741706848145, + "learning_rate": 9.49752574430823e-05, + "loss": 1.3986, + "num_input_tokens_seen": 11564936, + "step": 718 + }, + { + "epoch": 0.05036468867932789, + "grad_norm": 4.503393173217773, + "learning_rate": 9.496825919439581e-05, + "loss": 0.9947, + "num_input_tokens_seen": 11580720, + "step": 719 + }, + { + "epoch": 0.050434736925057136, + "grad_norm": 4.364518165588379, + "learning_rate": 9.49612609457093e-05, + "loss": 1.12, + "num_input_tokens_seen": 11597104, + "step": 720 + }, + { + "epoch": 0.05050478517078638, + "grad_norm": 4.229926109313965, + "learning_rate": 9.495426269702277e-05, + "loss": 1.098, + "num_input_tokens_seen": 11612120, + "step": 721 + }, + { + "epoch": 0.05057483341651563, + "grad_norm": 4.477171897888184, + "learning_rate": 9.494726444833626e-05, + "loss": 1.1565, + "num_input_tokens_seen": 11627000, + "step": 722 + }, + { + "epoch": 0.05064488166224487, + "grad_norm": 4.071736812591553, + "learning_rate": 9.494026619964973e-05, + "loss": 1.2951, + "num_input_tokens_seen": 11643256, + "step": 723 + }, + { + "epoch": 0.05071492990797412, + "grad_norm": 4.219758033752441, + "learning_rate": 9.493326795096322e-05, + "loss": 1.1408, + "num_input_tokens_seen": 11659424, + "step": 724 + }, + { + "epoch": 0.050784978153703364, + "grad_norm": 4.108195781707764, + "learning_rate": 9.492626970227671e-05, + "loss": 0.9847, + "num_input_tokens_seen": 11675808, + "step": 725 + }, + { + "epoch": 0.05085502639943261, + "grad_norm": 3.964359760284424, + "learning_rate": 9.49192714535902e-05, + "loss": 1.0935, + "num_input_tokens_seen": 11691760, + "step": 726 + }, + { + "epoch": 0.050925074645161855, + "grad_norm": 4.585779190063477, + "learning_rate": 9.491227320490369e-05, + "loss": 1.1561, + "num_input_tokens_seen": 11706600, + "step": 727 + }, + { + "epoch": 0.0509951228908911, + "grad_norm": 3.8540141582489014, + "learning_rate": 9.490527495621716e-05, + "loss": 1.0163, + "num_input_tokens_seen": 11722984, + "step": 728 + }, + { + "epoch": 0.051065171136620346, + "grad_norm": 4.138955593109131, + "learning_rate": 9.489827670753065e-05, + "loss": 1.2842, + "num_input_tokens_seen": 11738968, + "step": 729 + }, + { + "epoch": 0.05113521938234959, + "grad_norm": 4.138274192810059, + "learning_rate": 9.489127845884413e-05, + "loss": 1.1452, + "num_input_tokens_seen": 11754952, + "step": 730 + }, + { + "epoch": 0.05120526762807884, + "grad_norm": 4.374305248260498, + "learning_rate": 9.488428021015762e-05, + "loss": 1.3622, + "num_input_tokens_seen": 11770832, + "step": 731 + }, + { + "epoch": 0.05127531587380808, + "grad_norm": 4.242674350738525, + "learning_rate": 9.48772819614711e-05, + "loss": 1.1914, + "num_input_tokens_seen": 11786872, + "step": 732 + }, + { + "epoch": 0.05134536411953733, + "grad_norm": 4.173389911651611, + "learning_rate": 9.48702837127846e-05, + "loss": 1.1853, + "num_input_tokens_seen": 11803256, + "step": 733 + }, + { + "epoch": 0.051415412365266575, + "grad_norm": 4.014588356018066, + "learning_rate": 9.486328546409808e-05, + "loss": 1.0436, + "num_input_tokens_seen": 11819608, + "step": 734 + }, + { + "epoch": 0.05148546061099582, + "grad_norm": 4.759418964385986, + "learning_rate": 9.485628721541157e-05, + "loss": 1.1605, + "num_input_tokens_seen": 11834296, + "step": 735 + }, + { + "epoch": 0.05155550885672507, + "grad_norm": 4.258687973022461, + "learning_rate": 9.484928896672505e-05, + "loss": 1.2993, + "num_input_tokens_seen": 11849728, + "step": 736 + }, + { + "epoch": 0.05162555710245432, + "grad_norm": 4.690395832061768, + "learning_rate": 9.484229071803853e-05, + "loss": 1.0655, + "num_input_tokens_seen": 11866112, + "step": 737 + }, + { + "epoch": 0.051695605348183564, + "grad_norm": 4.373327255249023, + "learning_rate": 9.483529246935201e-05, + "loss": 1.1364, + "num_input_tokens_seen": 11881960, + "step": 738 + }, + { + "epoch": 0.05176565359391281, + "grad_norm": 4.008789539337158, + "learning_rate": 9.482829422066551e-05, + "loss": 1.1174, + "num_input_tokens_seen": 11897936, + "step": 739 + }, + { + "epoch": 0.051835701839642055, + "grad_norm": 4.391345977783203, + "learning_rate": 9.482129597197899e-05, + "loss": 1.2045, + "num_input_tokens_seen": 11914320, + "step": 740 + }, + { + "epoch": 0.0519057500853713, + "grad_norm": 4.119503021240234, + "learning_rate": 9.481429772329248e-05, + "loss": 0.927, + "num_input_tokens_seen": 11930440, + "step": 741 + }, + { + "epoch": 0.051975798331100546, + "grad_norm": 4.186014175415039, + "learning_rate": 9.480729947460596e-05, + "loss": 1.1583, + "num_input_tokens_seen": 11946720, + "step": 742 + }, + { + "epoch": 0.05204584657682979, + "grad_norm": 4.119131088256836, + "learning_rate": 9.480030122591944e-05, + "loss": 1.0792, + "num_input_tokens_seen": 11962360, + "step": 743 + }, + { + "epoch": 0.05211589482255904, + "grad_norm": 3.921030044555664, + "learning_rate": 9.479330297723293e-05, + "loss": 0.9966, + "num_input_tokens_seen": 11978744, + "step": 744 + }, + { + "epoch": 0.05218594306828828, + "grad_norm": 3.806251049041748, + "learning_rate": 9.478630472854642e-05, + "loss": 1.1207, + "num_input_tokens_seen": 11994912, + "step": 745 + }, + { + "epoch": 0.05225599131401753, + "grad_norm": 4.508687973022461, + "learning_rate": 9.47793064798599e-05, + "loss": 1.1038, + "num_input_tokens_seen": 12011296, + "step": 746 + }, + { + "epoch": 0.052326039559746775, + "grad_norm": 4.458346843719482, + "learning_rate": 9.47723082311734e-05, + "loss": 1.2878, + "num_input_tokens_seen": 12027408, + "step": 747 + }, + { + "epoch": 0.05239608780547602, + "grad_norm": 5.779678821563721, + "learning_rate": 9.476530998248687e-05, + "loss": 1.2722, + "num_input_tokens_seen": 12043792, + "step": 748 + }, + { + "epoch": 0.052466136051205266, + "grad_norm": 4.621145725250244, + "learning_rate": 9.475831173380036e-05, + "loss": 1.2636, + "num_input_tokens_seen": 12059856, + "step": 749 + }, + { + "epoch": 0.05253618429693451, + "grad_norm": 4.276626110076904, + "learning_rate": 9.475131348511383e-05, + "loss": 1.3378, + "num_input_tokens_seen": 12076240, + "step": 750 + }, + { + "epoch": 0.05260623254266376, + "grad_norm": 4.533468246459961, + "learning_rate": 9.474431523642732e-05, + "loss": 0.921, + "num_input_tokens_seen": 12092416, + "step": 751 + }, + { + "epoch": 0.052676280788393, + "grad_norm": 4.626596927642822, + "learning_rate": 9.473731698774081e-05, + "loss": 1.2807, + "num_input_tokens_seen": 12108664, + "step": 752 + }, + { + "epoch": 0.052746329034122255, + "grad_norm": 4.3372907638549805, + "learning_rate": 9.47303187390543e-05, + "loss": 1.2754, + "num_input_tokens_seen": 12125048, + "step": 753 + }, + { + "epoch": 0.0528163772798515, + "grad_norm": 3.6576266288757324, + "learning_rate": 9.472332049036779e-05, + "loss": 0.8487, + "num_input_tokens_seen": 12141296, + "step": 754 + }, + { + "epoch": 0.052886425525580746, + "grad_norm": 3.8973164558410645, + "learning_rate": 9.471632224168126e-05, + "loss": 1.1211, + "num_input_tokens_seen": 12157544, + "step": 755 + }, + { + "epoch": 0.05295647377130999, + "grad_norm": 3.9059019088745117, + "learning_rate": 9.470932399299475e-05, + "loss": 1.2484, + "num_input_tokens_seen": 12173928, + "step": 756 + }, + { + "epoch": 0.05302652201703924, + "grad_norm": 4.133029937744141, + "learning_rate": 9.470232574430822e-05, + "loss": 1.0762, + "num_input_tokens_seen": 12189864, + "step": 757 + }, + { + "epoch": 0.05309657026276848, + "grad_norm": 3.8380961418151855, + "learning_rate": 9.469532749562171e-05, + "loss": 0.9938, + "num_input_tokens_seen": 12206248, + "step": 758 + }, + { + "epoch": 0.05316661850849773, + "grad_norm": 4.753637790679932, + "learning_rate": 9.468832924693522e-05, + "loss": 1.1272, + "num_input_tokens_seen": 12222632, + "step": 759 + }, + { + "epoch": 0.053236666754226974, + "grad_norm": 4.704193592071533, + "learning_rate": 9.468133099824869e-05, + "loss": 1.2276, + "num_input_tokens_seen": 12239016, + "step": 760 + }, + { + "epoch": 0.05330671499995622, + "grad_norm": 3.870870351791382, + "learning_rate": 9.467433274956218e-05, + "loss": 0.916, + "num_input_tokens_seen": 12254784, + "step": 761 + }, + { + "epoch": 0.053376763245685466, + "grad_norm": 3.8597328662872314, + "learning_rate": 9.466733450087567e-05, + "loss": 0.9871, + "num_input_tokens_seen": 12271160, + "step": 762 + }, + { + "epoch": 0.05344681149141471, + "grad_norm": 3.7109553813934326, + "learning_rate": 9.466033625218914e-05, + "loss": 1.1248, + "num_input_tokens_seen": 12286944, + "step": 763 + }, + { + "epoch": 0.05351685973714396, + "grad_norm": 3.985595464706421, + "learning_rate": 9.465333800350263e-05, + "loss": 1.0524, + "num_input_tokens_seen": 12303312, + "step": 764 + }, + { + "epoch": 0.0535869079828732, + "grad_norm": 3.797247886657715, + "learning_rate": 9.464633975481612e-05, + "loss": 1.0799, + "num_input_tokens_seen": 12319696, + "step": 765 + }, + { + "epoch": 0.05365695622860245, + "grad_norm": 4.88303279876709, + "learning_rate": 9.463934150612961e-05, + "loss": 1.2865, + "num_input_tokens_seen": 12335448, + "step": 766 + }, + { + "epoch": 0.053727004474331694, + "grad_norm": 4.273831367492676, + "learning_rate": 9.463234325744308e-05, + "loss": 1.1724, + "num_input_tokens_seen": 12351720, + "step": 767 + }, + { + "epoch": 0.05379705272006094, + "grad_norm": 3.9505984783172607, + "learning_rate": 9.462534500875657e-05, + "loss": 1.1478, + "num_input_tokens_seen": 12368104, + "step": 768 + }, + { + "epoch": 0.053867100965790185, + "grad_norm": 4.20963191986084, + "learning_rate": 9.461834676007006e-05, + "loss": 1.1018, + "num_input_tokens_seen": 12384488, + "step": 769 + }, + { + "epoch": 0.05393714921151943, + "grad_norm": 4.106869220733643, + "learning_rate": 9.461134851138354e-05, + "loss": 1.1097, + "num_input_tokens_seen": 12400128, + "step": 770 + }, + { + "epoch": 0.05400719745724868, + "grad_norm": 4.28592586517334, + "learning_rate": 9.460435026269702e-05, + "loss": 1.036, + "num_input_tokens_seen": 12416512, + "step": 771 + }, + { + "epoch": 0.05407724570297793, + "grad_norm": 3.821927070617676, + "learning_rate": 9.459735201401051e-05, + "loss": 1.1215, + "num_input_tokens_seen": 12432896, + "step": 772 + }, + { + "epoch": 0.054147293948707174, + "grad_norm": 4.14424467086792, + "learning_rate": 9.4590353765324e-05, + "loss": 1.0092, + "num_input_tokens_seen": 12449208, + "step": 773 + }, + { + "epoch": 0.05421734219443642, + "grad_norm": 4.610694885253906, + "learning_rate": 9.458335551663749e-05, + "loss": 1.2265, + "num_input_tokens_seen": 12464128, + "step": 774 + }, + { + "epoch": 0.054287390440165666, + "grad_norm": 4.410182952880859, + "learning_rate": 9.457635726795097e-05, + "loss": 1.1904, + "num_input_tokens_seen": 12479728, + "step": 775 + }, + { + "epoch": 0.05435743868589491, + "grad_norm": 4.096780300140381, + "learning_rate": 9.456935901926445e-05, + "loss": 1.2317, + "num_input_tokens_seen": 12495720, + "step": 776 + }, + { + "epoch": 0.05442748693162416, + "grad_norm": 4.028350830078125, + "learning_rate": 9.456236077057793e-05, + "loss": 1.1825, + "num_input_tokens_seen": 12511480, + "step": 777 + }, + { + "epoch": 0.0544975351773534, + "grad_norm": 5.264276504516602, + "learning_rate": 9.455536252189142e-05, + "loss": 1.057, + "num_input_tokens_seen": 12527864, + "step": 778 + }, + { + "epoch": 0.05456758342308265, + "grad_norm": 4.371725082397461, + "learning_rate": 9.454836427320492e-05, + "loss": 1.1625, + "num_input_tokens_seen": 12544168, + "step": 779 + }, + { + "epoch": 0.054637631668811894, + "grad_norm": 4.692862510681152, + "learning_rate": 9.45413660245184e-05, + "loss": 1.2211, + "num_input_tokens_seen": 12560552, + "step": 780 + }, + { + "epoch": 0.05470767991454114, + "grad_norm": 3.7462823390960693, + "learning_rate": 9.453436777583188e-05, + "loss": 1.0815, + "num_input_tokens_seen": 12576936, + "step": 781 + }, + { + "epoch": 0.054777728160270385, + "grad_norm": 4.161571025848389, + "learning_rate": 9.452736952714536e-05, + "loss": 0.9788, + "num_input_tokens_seen": 12593040, + "step": 782 + }, + { + "epoch": 0.05484777640599963, + "grad_norm": 3.96793532371521, + "learning_rate": 9.452037127845885e-05, + "loss": 1.1396, + "num_input_tokens_seen": 12609424, + "step": 783 + }, + { + "epoch": 0.054917824651728876, + "grad_norm": 4.183755874633789, + "learning_rate": 9.451337302977232e-05, + "loss": 1.0868, + "num_input_tokens_seen": 12625312, + "step": 784 + }, + { + "epoch": 0.05498787289745812, + "grad_norm": 4.506673336029053, + "learning_rate": 9.450637478108582e-05, + "loss": 1.1112, + "num_input_tokens_seen": 12641696, + "step": 785 + }, + { + "epoch": 0.05505792114318737, + "grad_norm": 3.8601651191711426, + "learning_rate": 9.449937653239931e-05, + "loss": 1.2149, + "num_input_tokens_seen": 12658080, + "step": 786 + }, + { + "epoch": 0.05512796938891661, + "grad_norm": 5.190856456756592, + "learning_rate": 9.449237828371279e-05, + "loss": 1.2661, + "num_input_tokens_seen": 12673032, + "step": 787 + }, + { + "epoch": 0.055198017634645866, + "grad_norm": 4.323099136352539, + "learning_rate": 9.448538003502628e-05, + "loss": 1.139, + "num_input_tokens_seen": 12689064, + "step": 788 + }, + { + "epoch": 0.05526806588037511, + "grad_norm": 4.271193981170654, + "learning_rate": 9.447838178633976e-05, + "loss": 1.037, + "num_input_tokens_seen": 12705448, + "step": 789 + }, + { + "epoch": 0.05533811412610436, + "grad_norm": 3.793525218963623, + "learning_rate": 9.447138353765324e-05, + "loss": 1.0265, + "num_input_tokens_seen": 12721832, + "step": 790 + }, + { + "epoch": 0.0554081623718336, + "grad_norm": 3.747575283050537, + "learning_rate": 9.446438528896673e-05, + "loss": 0.9567, + "num_input_tokens_seen": 12738216, + "step": 791 + }, + { + "epoch": 0.05547821061756285, + "grad_norm": 4.222849369049072, + "learning_rate": 9.445738704028022e-05, + "loss": 1.1859, + "num_input_tokens_seen": 12754600, + "step": 792 + }, + { + "epoch": 0.055548258863292094, + "grad_norm": 9.102783203125, + "learning_rate": 9.44503887915937e-05, + "loss": 1.0361, + "num_input_tokens_seen": 12770568, + "step": 793 + }, + { + "epoch": 0.05561830710902134, + "grad_norm": 4.4447808265686035, + "learning_rate": 9.444339054290718e-05, + "loss": 1.2908, + "num_input_tokens_seen": 12785768, + "step": 794 + }, + { + "epoch": 0.055688355354750585, + "grad_norm": 4.038604259490967, + "learning_rate": 9.443639229422067e-05, + "loss": 0.9294, + "num_input_tokens_seen": 12801704, + "step": 795 + }, + { + "epoch": 0.05575840360047983, + "grad_norm": 4.492194652557373, + "learning_rate": 9.442939404553416e-05, + "loss": 1.0466, + "num_input_tokens_seen": 12818088, + "step": 796 + }, + { + "epoch": 0.055828451846209076, + "grad_norm": 3.978029489517212, + "learning_rate": 9.442239579684763e-05, + "loss": 1.1719, + "num_input_tokens_seen": 12834432, + "step": 797 + }, + { + "epoch": 0.05589850009193832, + "grad_norm": 4.014431476593018, + "learning_rate": 9.441539754816112e-05, + "loss": 1.1222, + "num_input_tokens_seen": 12850816, + "step": 798 + }, + { + "epoch": 0.05596854833766757, + "grad_norm": 4.0948638916015625, + "learning_rate": 9.440839929947461e-05, + "loss": 1.2013, + "num_input_tokens_seen": 12867200, + "step": 799 + }, + { + "epoch": 0.05603859658339681, + "grad_norm": 4.18120813369751, + "learning_rate": 9.44014010507881e-05, + "loss": 0.9403, + "num_input_tokens_seen": 12883072, + "step": 800 + }, + { + "epoch": 0.05603859658339681, + "eval_loss": 1.1718552112579346, + "eval_runtime": 0.2039, + "eval_samples_per_second": 4.905, + "eval_steps_per_second": 4.905, + "num_input_tokens_seen": 12883072, + "step": 800 + }, + { + "epoch": 0.05610864482912606, + "grad_norm": 4.425891399383545, + "learning_rate": 9.439440280210159e-05, + "loss": 1.0435, + "num_input_tokens_seen": 12899456, + "step": 801 + }, + { + "epoch": 0.056178693074855304, + "grad_norm": 4.319190979003906, + "learning_rate": 9.438740455341506e-05, + "loss": 1.2612, + "num_input_tokens_seen": 12915840, + "step": 802 + }, + { + "epoch": 0.05624874132058455, + "grad_norm": 4.28010892868042, + "learning_rate": 9.438040630472855e-05, + "loss": 1.0853, + "num_input_tokens_seen": 12932096, + "step": 803 + }, + { + "epoch": 0.056318789566313796, + "grad_norm": 3.9454870223999023, + "learning_rate": 9.437340805604203e-05, + "loss": 1.055, + "num_input_tokens_seen": 12948208, + "step": 804 + }, + { + "epoch": 0.05638883781204305, + "grad_norm": 4.009400367736816, + "learning_rate": 9.436640980735553e-05, + "loss": 1.0681, + "num_input_tokens_seen": 12964096, + "step": 805 + }, + { + "epoch": 0.056458886057772294, + "grad_norm": 3.7949161529541016, + "learning_rate": 9.435941155866902e-05, + "loss": 1.0787, + "num_input_tokens_seen": 12980480, + "step": 806 + }, + { + "epoch": 0.05652893430350154, + "grad_norm": 3.910456418991089, + "learning_rate": 9.435241330998249e-05, + "loss": 0.9212, + "num_input_tokens_seen": 12996864, + "step": 807 + }, + { + "epoch": 0.056598982549230785, + "grad_norm": 4.744706630706787, + "learning_rate": 9.434541506129598e-05, + "loss": 1.0582, + "num_input_tokens_seen": 13013248, + "step": 808 + }, + { + "epoch": 0.05666903079496003, + "grad_norm": 4.4282732009887695, + "learning_rate": 9.433841681260946e-05, + "loss": 1.1353, + "num_input_tokens_seen": 13029632, + "step": 809 + }, + { + "epoch": 0.056739079040689276, + "grad_norm": 3.8422467708587646, + "learning_rate": 9.433141856392294e-05, + "loss": 0.9881, + "num_input_tokens_seen": 13046016, + "step": 810 + }, + { + "epoch": 0.05680912728641852, + "grad_norm": 4.1764445304870605, + "learning_rate": 9.432442031523643e-05, + "loss": 1.183, + "num_input_tokens_seen": 13062400, + "step": 811 + }, + { + "epoch": 0.05687917553214777, + "grad_norm": 4.713895320892334, + "learning_rate": 9.431742206654992e-05, + "loss": 1.0752, + "num_input_tokens_seen": 13078584, + "step": 812 + }, + { + "epoch": 0.05694922377787701, + "grad_norm": 4.265610694885254, + "learning_rate": 9.431042381786341e-05, + "loss": 0.9469, + "num_input_tokens_seen": 13094968, + "step": 813 + }, + { + "epoch": 0.05701927202360626, + "grad_norm": 3.9274330139160156, + "learning_rate": 9.430342556917688e-05, + "loss": 1.1765, + "num_input_tokens_seen": 13111304, + "step": 814 + }, + { + "epoch": 0.057089320269335504, + "grad_norm": 4.44935941696167, + "learning_rate": 9.429642732049037e-05, + "loss": 1.1014, + "num_input_tokens_seen": 13127304, + "step": 815 + }, + { + "epoch": 0.05715936851506475, + "grad_norm": 5.019375801086426, + "learning_rate": 9.428942907180386e-05, + "loss": 1.0535, + "num_input_tokens_seen": 13143688, + "step": 816 + }, + { + "epoch": 0.057229416760793995, + "grad_norm": 4.743424892425537, + "learning_rate": 9.428243082311734e-05, + "loss": 1.3912, + "num_input_tokens_seen": 13160072, + "step": 817 + }, + { + "epoch": 0.05729946500652324, + "grad_norm": 3.921475887298584, + "learning_rate": 9.427543257443083e-05, + "loss": 1.1116, + "num_input_tokens_seen": 13176456, + "step": 818 + }, + { + "epoch": 0.05736951325225249, + "grad_norm": 4.106019020080566, + "learning_rate": 9.426843432574431e-05, + "loss": 0.9, + "num_input_tokens_seen": 13192840, + "step": 819 + }, + { + "epoch": 0.05743956149798173, + "grad_norm": 4.298704147338867, + "learning_rate": 9.42614360770578e-05, + "loss": 1.281, + "num_input_tokens_seen": 13209144, + "step": 820 + }, + { + "epoch": 0.05750960974371098, + "grad_norm": 4.29774284362793, + "learning_rate": 9.425443782837128e-05, + "loss": 1.2703, + "num_input_tokens_seen": 13224752, + "step": 821 + }, + { + "epoch": 0.057579657989440224, + "grad_norm": 4.6176838874816895, + "learning_rate": 9.424743957968477e-05, + "loss": 1.232, + "num_input_tokens_seen": 13240856, + "step": 822 + }, + { + "epoch": 0.057649706235169476, + "grad_norm": 4.450786590576172, + "learning_rate": 9.424044133099826e-05, + "loss": 1.1369, + "num_input_tokens_seen": 13256800, + "step": 823 + }, + { + "epoch": 0.05771975448089872, + "grad_norm": 3.8302414417266846, + "learning_rate": 9.423344308231173e-05, + "loss": 0.9985, + "num_input_tokens_seen": 13273032, + "step": 824 + }, + { + "epoch": 0.05778980272662797, + "grad_norm": 4.641941070556641, + "learning_rate": 9.422644483362523e-05, + "loss": 1.2238, + "num_input_tokens_seen": 13289104, + "step": 825 + }, + { + "epoch": 0.05785985097235721, + "grad_norm": 4.369805335998535, + "learning_rate": 9.421944658493871e-05, + "loss": 1.2047, + "num_input_tokens_seen": 13304752, + "step": 826 + }, + { + "epoch": 0.05792989921808646, + "grad_norm": 3.863507032394409, + "learning_rate": 9.42124483362522e-05, + "loss": 1.1098, + "num_input_tokens_seen": 13321088, + "step": 827 + }, + { + "epoch": 0.057999947463815704, + "grad_norm": 5.323369979858398, + "learning_rate": 9.420545008756568e-05, + "loss": 1.1722, + "num_input_tokens_seen": 13336912, + "step": 828 + }, + { + "epoch": 0.05806999570954495, + "grad_norm": 4.006597995758057, + "learning_rate": 9.419845183887916e-05, + "loss": 1.0382, + "num_input_tokens_seen": 13353280, + "step": 829 + }, + { + "epoch": 0.058140043955274195, + "grad_norm": 4.1039886474609375, + "learning_rate": 9.419145359019265e-05, + "loss": 1.2037, + "num_input_tokens_seen": 13369664, + "step": 830 + }, + { + "epoch": 0.05821009220100344, + "grad_norm": 3.903517007827759, + "learning_rate": 9.418445534150614e-05, + "loss": 1.2185, + "num_input_tokens_seen": 13386048, + "step": 831 + }, + { + "epoch": 0.05828014044673269, + "grad_norm": 4.434885025024414, + "learning_rate": 9.417745709281963e-05, + "loss": 1.2444, + "num_input_tokens_seen": 13402432, + "step": 832 + }, + { + "epoch": 0.05835018869246193, + "grad_norm": 4.6121296882629395, + "learning_rate": 9.417045884413311e-05, + "loss": 1.2831, + "num_input_tokens_seen": 13418816, + "step": 833 + }, + { + "epoch": 0.05842023693819118, + "grad_norm": 3.6966841220855713, + "learning_rate": 9.416346059544659e-05, + "loss": 1.0751, + "num_input_tokens_seen": 13435200, + "step": 834 + }, + { + "epoch": 0.058490285183920424, + "grad_norm": 4.292221546173096, + "learning_rate": 9.415646234676008e-05, + "loss": 1.2068, + "num_input_tokens_seen": 13451584, + "step": 835 + }, + { + "epoch": 0.05856033342964967, + "grad_norm": 4.053999900817871, + "learning_rate": 9.414946409807355e-05, + "loss": 1.1735, + "num_input_tokens_seen": 13467824, + "step": 836 + }, + { + "epoch": 0.058630381675378915, + "grad_norm": 4.4411234855651855, + "learning_rate": 9.414246584938704e-05, + "loss": 1.0647, + "num_input_tokens_seen": 13483200, + "step": 837 + }, + { + "epoch": 0.05870042992110816, + "grad_norm": 3.956787347793579, + "learning_rate": 9.413546760070053e-05, + "loss": 0.9813, + "num_input_tokens_seen": 13499584, + "step": 838 + }, + { + "epoch": 0.058770478166837406, + "grad_norm": 5.050291061401367, + "learning_rate": 9.412846935201402e-05, + "loss": 1.1193, + "num_input_tokens_seen": 13515448, + "step": 839 + }, + { + "epoch": 0.05884052641256666, + "grad_norm": 3.8736393451690674, + "learning_rate": 9.412147110332751e-05, + "loss": 1.0294, + "num_input_tokens_seen": 13531200, + "step": 840 + }, + { + "epoch": 0.058910574658295904, + "grad_norm": 6.07747745513916, + "learning_rate": 9.411447285464098e-05, + "loss": 0.9684, + "num_input_tokens_seen": 13547584, + "step": 841 + }, + { + "epoch": 0.05898062290402515, + "grad_norm": 4.606445789337158, + "learning_rate": 9.410747460595447e-05, + "loss": 1.2119, + "num_input_tokens_seen": 13563528, + "step": 842 + }, + { + "epoch": 0.059050671149754395, + "grad_norm": 4.3981709480285645, + "learning_rate": 9.410047635726796e-05, + "loss": 1.3313, + "num_input_tokens_seen": 13579912, + "step": 843 + }, + { + "epoch": 0.05912071939548364, + "grad_norm": 3.64546799659729, + "learning_rate": 9.409347810858143e-05, + "loss": 0.8892, + "num_input_tokens_seen": 13596296, + "step": 844 + }, + { + "epoch": 0.05919076764121289, + "grad_norm": 4.15845251083374, + "learning_rate": 9.408647985989494e-05, + "loss": 1.1464, + "num_input_tokens_seen": 13612680, + "step": 845 + }, + { + "epoch": 0.05926081588694213, + "grad_norm": 6.049203872680664, + "learning_rate": 9.407948161120841e-05, + "loss": 1.1907, + "num_input_tokens_seen": 13627832, + "step": 846 + }, + { + "epoch": 0.05933086413267138, + "grad_norm": 3.7192461490631104, + "learning_rate": 9.40724833625219e-05, + "loss": 1.165, + "num_input_tokens_seen": 13643824, + "step": 847 + }, + { + "epoch": 0.059400912378400623, + "grad_norm": 4.183239936828613, + "learning_rate": 9.406548511383537e-05, + "loss": 1.1697, + "num_input_tokens_seen": 13660208, + "step": 848 + }, + { + "epoch": 0.05947096062412987, + "grad_norm": 4.126212120056152, + "learning_rate": 9.405848686514886e-05, + "loss": 1.0532, + "num_input_tokens_seen": 13676592, + "step": 849 + }, + { + "epoch": 0.059541008869859115, + "grad_norm": 4.033525466918945, + "learning_rate": 9.405148861646235e-05, + "loss": 1.1497, + "num_input_tokens_seen": 13692600, + "step": 850 + }, + { + "epoch": 0.05961105711558836, + "grad_norm": 4.162797451019287, + "learning_rate": 9.404449036777584e-05, + "loss": 1.162, + "num_input_tokens_seen": 13708984, + "step": 851 + }, + { + "epoch": 0.059681105361317606, + "grad_norm": 4.057224750518799, + "learning_rate": 9.403749211908933e-05, + "loss": 1.2166, + "num_input_tokens_seen": 13724656, + "step": 852 + }, + { + "epoch": 0.05975115360704685, + "grad_norm": 4.201955318450928, + "learning_rate": 9.40304938704028e-05, + "loss": 1.2195, + "num_input_tokens_seen": 13741040, + "step": 853 + }, + { + "epoch": 0.0598212018527761, + "grad_norm": 3.8704352378845215, + "learning_rate": 9.402349562171629e-05, + "loss": 0.8946, + "num_input_tokens_seen": 13757424, + "step": 854 + }, + { + "epoch": 0.05989125009850534, + "grad_norm": 6.010958671569824, + "learning_rate": 9.401649737302978e-05, + "loss": 1.2095, + "num_input_tokens_seen": 13773808, + "step": 855 + }, + { + "epoch": 0.05996129834423459, + "grad_norm": 4.975742816925049, + "learning_rate": 9.400949912434326e-05, + "loss": 1.1064, + "num_input_tokens_seen": 13789704, + "step": 856 + }, + { + "epoch": 0.06003134658996384, + "grad_norm": 4.021739959716797, + "learning_rate": 9.400250087565675e-05, + "loss": 1.2036, + "num_input_tokens_seen": 13806088, + "step": 857 + }, + { + "epoch": 0.06010139483569309, + "grad_norm": 4.262394905090332, + "learning_rate": 9.399550262697023e-05, + "loss": 1.1053, + "num_input_tokens_seen": 13821928, + "step": 858 + }, + { + "epoch": 0.06017144308142233, + "grad_norm": 4.3033671379089355, + "learning_rate": 9.398850437828372e-05, + "loss": 1.0213, + "num_input_tokens_seen": 13838232, + "step": 859 + }, + { + "epoch": 0.06024149132715158, + "grad_norm": 4.066610336303711, + "learning_rate": 9.398150612959721e-05, + "loss": 1.0579, + "num_input_tokens_seen": 13853912, + "step": 860 + }, + { + "epoch": 0.06031153957288082, + "grad_norm": 4.308155059814453, + "learning_rate": 9.397450788091069e-05, + "loss": 1.3624, + "num_input_tokens_seen": 13870224, + "step": 861 + }, + { + "epoch": 0.06038158781861007, + "grad_norm": 4.307553291320801, + "learning_rate": 9.396750963222417e-05, + "loss": 1.0942, + "num_input_tokens_seen": 13886608, + "step": 862 + }, + { + "epoch": 0.060451636064339315, + "grad_norm": 3.8107142448425293, + "learning_rate": 9.396051138353765e-05, + "loss": 1.1285, + "num_input_tokens_seen": 13902992, + "step": 863 + }, + { + "epoch": 0.06052168431006856, + "grad_norm": 4.530765533447266, + "learning_rate": 9.395351313485114e-05, + "loss": 1.2028, + "num_input_tokens_seen": 13919376, + "step": 864 + }, + { + "epoch": 0.060591732555797806, + "grad_norm": 4.035069465637207, + "learning_rate": 9.394651488616463e-05, + "loss": 1.0291, + "num_input_tokens_seen": 13935664, + "step": 865 + }, + { + "epoch": 0.06066178080152705, + "grad_norm": 4.028316497802734, + "learning_rate": 9.393951663747812e-05, + "loss": 1.21, + "num_input_tokens_seen": 13951096, + "step": 866 + }, + { + "epoch": 0.0607318290472563, + "grad_norm": 4.039167881011963, + "learning_rate": 9.39325183887916e-05, + "loss": 0.929, + "num_input_tokens_seen": 13966272, + "step": 867 + }, + { + "epoch": 0.06080187729298554, + "grad_norm": 4.139703273773193, + "learning_rate": 9.392552014010508e-05, + "loss": 1.2575, + "num_input_tokens_seen": 13981848, + "step": 868 + }, + { + "epoch": 0.06087192553871479, + "grad_norm": 4.222180366516113, + "learning_rate": 9.391852189141857e-05, + "loss": 1.2067, + "num_input_tokens_seen": 13997920, + "step": 869 + }, + { + "epoch": 0.060941973784444034, + "grad_norm": 3.7993030548095703, + "learning_rate": 9.391152364273206e-05, + "loss": 1.0865, + "num_input_tokens_seen": 14014304, + "step": 870 + }, + { + "epoch": 0.06101202203017328, + "grad_norm": 4.811493396759033, + "learning_rate": 9.390452539404554e-05, + "loss": 1.1331, + "num_input_tokens_seen": 14030688, + "step": 871 + }, + { + "epoch": 0.061082070275902525, + "grad_norm": 13.88792610168457, + "learning_rate": 9.389752714535903e-05, + "loss": 1.1368, + "num_input_tokens_seen": 14045584, + "step": 872 + }, + { + "epoch": 0.06115211852163177, + "grad_norm": 3.7678709030151367, + "learning_rate": 9.389052889667251e-05, + "loss": 1.1012, + "num_input_tokens_seen": 14061968, + "step": 873 + }, + { + "epoch": 0.061222166767361016, + "grad_norm": 4.252075672149658, + "learning_rate": 9.3883530647986e-05, + "loss": 1.0472, + "num_input_tokens_seen": 14077584, + "step": 874 + }, + { + "epoch": 0.06129221501309027, + "grad_norm": 3.555629253387451, + "learning_rate": 9.387653239929947e-05, + "loss": 0.8653, + "num_input_tokens_seen": 14093704, + "step": 875 + }, + { + "epoch": 0.061362263258819515, + "grad_norm": 4.122331619262695, + "learning_rate": 9.386953415061296e-05, + "loss": 1.0395, + "num_input_tokens_seen": 14109624, + "step": 876 + }, + { + "epoch": 0.06143231150454876, + "grad_norm": 3.6772518157958984, + "learning_rate": 9.386253590192645e-05, + "loss": 0.8842, + "num_input_tokens_seen": 14126008, + "step": 877 + }, + { + "epoch": 0.061502359750278006, + "grad_norm": 3.791351079940796, + "learning_rate": 9.385553765323994e-05, + "loss": 1.1118, + "num_input_tokens_seen": 14142392, + "step": 878 + }, + { + "epoch": 0.06157240799600725, + "grad_norm": 3.781759738922119, + "learning_rate": 9.384853940455343e-05, + "loss": 1.0577, + "num_input_tokens_seen": 14158776, + "step": 879 + }, + { + "epoch": 0.0616424562417365, + "grad_norm": 4.2420830726623535, + "learning_rate": 9.38415411558669e-05, + "loss": 1.268, + "num_input_tokens_seen": 14173920, + "step": 880 + }, + { + "epoch": 0.06171250448746574, + "grad_norm": 4.000860214233398, + "learning_rate": 9.383454290718039e-05, + "loss": 1.1626, + "num_input_tokens_seen": 14190032, + "step": 881 + }, + { + "epoch": 0.06178255273319499, + "grad_norm": 3.760969877243042, + "learning_rate": 9.382754465849388e-05, + "loss": 0.9684, + "num_input_tokens_seen": 14206416, + "step": 882 + }, + { + "epoch": 0.061852600978924234, + "grad_norm": 4.81919002532959, + "learning_rate": 9.382054640980735e-05, + "loss": 1.1056, + "num_input_tokens_seen": 14222408, + "step": 883 + }, + { + "epoch": 0.06192264922465348, + "grad_norm": 4.951950550079346, + "learning_rate": 9.381354816112084e-05, + "loss": 1.0334, + "num_input_tokens_seen": 14238616, + "step": 884 + }, + { + "epoch": 0.061992697470382725, + "grad_norm": 4.15132999420166, + "learning_rate": 9.380654991243433e-05, + "loss": 1.3171, + "num_input_tokens_seen": 14254968, + "step": 885 + }, + { + "epoch": 0.06206274571611197, + "grad_norm": 5.100244998931885, + "learning_rate": 9.379955166374782e-05, + "loss": 1.1684, + "num_input_tokens_seen": 14271352, + "step": 886 + }, + { + "epoch": 0.062132793961841216, + "grad_norm": 5.999105453491211, + "learning_rate": 9.379255341506131e-05, + "loss": 0.9824, + "num_input_tokens_seen": 14287496, + "step": 887 + }, + { + "epoch": 0.06220284220757046, + "grad_norm": 3.8826348781585693, + "learning_rate": 9.378555516637478e-05, + "loss": 1.0829, + "num_input_tokens_seen": 14303880, + "step": 888 + }, + { + "epoch": 0.06227289045329971, + "grad_norm": 5.308819770812988, + "learning_rate": 9.377855691768827e-05, + "loss": 1.1377, + "num_input_tokens_seen": 14320264, + "step": 889 + }, + { + "epoch": 0.06234293869902895, + "grad_norm": 4.383331775665283, + "learning_rate": 9.377155866900175e-05, + "loss": 1.0147, + "num_input_tokens_seen": 14336232, + "step": 890 + }, + { + "epoch": 0.0624129869447582, + "grad_norm": 4.335045337677002, + "learning_rate": 9.376456042031524e-05, + "loss": 0.9807, + "num_input_tokens_seen": 14351704, + "step": 891 + }, + { + "epoch": 0.06248303519048745, + "grad_norm": 3.6901326179504395, + "learning_rate": 9.375756217162872e-05, + "loss": 1.0494, + "num_input_tokens_seen": 14368088, + "step": 892 + }, + { + "epoch": 0.0625530834362167, + "grad_norm": 3.912727117538452, + "learning_rate": 9.375056392294221e-05, + "loss": 1.1191, + "num_input_tokens_seen": 14383904, + "step": 893 + }, + { + "epoch": 0.06262313168194594, + "grad_norm": 3.5688252449035645, + "learning_rate": 9.37435656742557e-05, + "loss": 0.833, + "num_input_tokens_seen": 14399648, + "step": 894 + }, + { + "epoch": 0.06269317992767519, + "grad_norm": 4.6460137367248535, + "learning_rate": 9.373656742556918e-05, + "loss": 1.2523, + "num_input_tokens_seen": 14415640, + "step": 895 + }, + { + "epoch": 0.06276322817340443, + "grad_norm": 3.8113012313842773, + "learning_rate": 9.372956917688266e-05, + "loss": 1.1789, + "num_input_tokens_seen": 14432024, + "step": 896 + }, + { + "epoch": 0.06283327641913368, + "grad_norm": 3.8755953311920166, + "learning_rate": 9.372257092819615e-05, + "loss": 1.1506, + "num_input_tokens_seen": 14448152, + "step": 897 + }, + { + "epoch": 0.06290332466486293, + "grad_norm": 4.225901126861572, + "learning_rate": 9.371557267950964e-05, + "loss": 1.0754, + "num_input_tokens_seen": 14464536, + "step": 898 + }, + { + "epoch": 0.06297337291059217, + "grad_norm": 3.9437992572784424, + "learning_rate": 9.370857443082313e-05, + "loss": 1.049, + "num_input_tokens_seen": 14480072, + "step": 899 + }, + { + "epoch": 0.06304342115632142, + "grad_norm": 3.8961846828460693, + "learning_rate": 9.37015761821366e-05, + "loss": 1.1925, + "num_input_tokens_seen": 14496456, + "step": 900 + }, + { + "epoch": 0.06311346940205066, + "grad_norm": 4.844581604003906, + "learning_rate": 9.36945779334501e-05, + "loss": 1.0867, + "num_input_tokens_seen": 14512520, + "step": 901 + }, + { + "epoch": 0.06318351764777991, + "grad_norm": 4.89027214050293, + "learning_rate": 9.368757968476357e-05, + "loss": 1.0997, + "num_input_tokens_seen": 14528904, + "step": 902 + }, + { + "epoch": 0.06325356589350915, + "grad_norm": 4.303073883056641, + "learning_rate": 9.368058143607706e-05, + "loss": 1.0626, + "num_input_tokens_seen": 14545288, + "step": 903 + }, + { + "epoch": 0.0633236141392384, + "grad_norm": 5.145171165466309, + "learning_rate": 9.367358318739055e-05, + "loss": 1.3597, + "num_input_tokens_seen": 14561672, + "step": 904 + }, + { + "epoch": 0.06339366238496764, + "grad_norm": 5.7905964851379395, + "learning_rate": 9.366658493870403e-05, + "loss": 1.1075, + "num_input_tokens_seen": 14575896, + "step": 905 + }, + { + "epoch": 0.06346371063069689, + "grad_norm": 3.7394728660583496, + "learning_rate": 9.365958669001752e-05, + "loss": 0.9347, + "num_input_tokens_seen": 14592280, + "step": 906 + }, + { + "epoch": 0.06353375887642614, + "grad_norm": 3.916626453399658, + "learning_rate": 9.3652588441331e-05, + "loss": 1.0793, + "num_input_tokens_seen": 14608072, + "step": 907 + }, + { + "epoch": 0.06360380712215538, + "grad_norm": 5.088227272033691, + "learning_rate": 9.364559019264449e-05, + "loss": 1.158, + "num_input_tokens_seen": 14624360, + "step": 908 + }, + { + "epoch": 0.06367385536788463, + "grad_norm": 3.8519606590270996, + "learning_rate": 9.363859194395798e-05, + "loss": 1.1235, + "num_input_tokens_seen": 14640744, + "step": 909 + }, + { + "epoch": 0.06374390361361387, + "grad_norm": 4.450200080871582, + "learning_rate": 9.363159369527145e-05, + "loss": 1.0145, + "num_input_tokens_seen": 14657128, + "step": 910 + }, + { + "epoch": 0.06381395185934312, + "grad_norm": 4.188115119934082, + "learning_rate": 9.362459544658494e-05, + "loss": 1.1457, + "num_input_tokens_seen": 14673128, + "step": 911 + }, + { + "epoch": 0.06388400010507236, + "grad_norm": 4.67346715927124, + "learning_rate": 9.361759719789843e-05, + "loss": 1.2841, + "num_input_tokens_seen": 14689512, + "step": 912 + }, + { + "epoch": 0.06395404835080161, + "grad_norm": 3.737790822982788, + "learning_rate": 9.361059894921192e-05, + "loss": 1.0114, + "num_input_tokens_seen": 14705872, + "step": 913 + }, + { + "epoch": 0.06402409659653086, + "grad_norm": 4.2486653327941895, + "learning_rate": 9.36036007005254e-05, + "loss": 1.1526, + "num_input_tokens_seen": 14721816, + "step": 914 + }, + { + "epoch": 0.0640941448422601, + "grad_norm": 4.120566368103027, + "learning_rate": 9.359660245183888e-05, + "loss": 1.1045, + "num_input_tokens_seen": 14738200, + "step": 915 + }, + { + "epoch": 0.06416419308798935, + "grad_norm": 5.259902477264404, + "learning_rate": 9.358960420315237e-05, + "loss": 1.3544, + "num_input_tokens_seen": 14753920, + "step": 916 + }, + { + "epoch": 0.06423424133371859, + "grad_norm": 3.900827646255493, + "learning_rate": 9.358260595446584e-05, + "loss": 1.1079, + "num_input_tokens_seen": 14769640, + "step": 917 + }, + { + "epoch": 0.06430428957944785, + "grad_norm": 4.103065490722656, + "learning_rate": 9.357560770577935e-05, + "loss": 0.963, + "num_input_tokens_seen": 14786024, + "step": 918 + }, + { + "epoch": 0.0643743378251771, + "grad_norm": 3.9913623332977295, + "learning_rate": 9.356860945709282e-05, + "loss": 1.0959, + "num_input_tokens_seen": 14802408, + "step": 919 + }, + { + "epoch": 0.06444438607090634, + "grad_norm": 3.7369885444641113, + "learning_rate": 9.356161120840631e-05, + "loss": 1.131, + "num_input_tokens_seen": 14818792, + "step": 920 + }, + { + "epoch": 0.06451443431663559, + "grad_norm": 4.029351711273193, + "learning_rate": 9.35546129597198e-05, + "loss": 1.0378, + "num_input_tokens_seen": 14833792, + "step": 921 + }, + { + "epoch": 0.06458448256236483, + "grad_norm": 4.043665885925293, + "learning_rate": 9.354761471103327e-05, + "loss": 1.179, + "num_input_tokens_seen": 14850176, + "step": 922 + }, + { + "epoch": 0.06465453080809408, + "grad_norm": 3.7803280353546143, + "learning_rate": 9.354061646234676e-05, + "loss": 0.9886, + "num_input_tokens_seen": 14866096, + "step": 923 + }, + { + "epoch": 0.06472457905382333, + "grad_norm": 5.537375450134277, + "learning_rate": 9.353361821366025e-05, + "loss": 1.2519, + "num_input_tokens_seen": 14882480, + "step": 924 + }, + { + "epoch": 0.06479462729955257, + "grad_norm": 4.944652557373047, + "learning_rate": 9.352661996497374e-05, + "loss": 1.1963, + "num_input_tokens_seen": 14898864, + "step": 925 + }, + { + "epoch": 0.06486467554528182, + "grad_norm": 4.3231611251831055, + "learning_rate": 9.351962171628723e-05, + "loss": 1.1858, + "num_input_tokens_seen": 14913856, + "step": 926 + }, + { + "epoch": 0.06493472379101106, + "grad_norm": 4.386692523956299, + "learning_rate": 9.35126234676007e-05, + "loss": 1.0464, + "num_input_tokens_seen": 14929816, + "step": 927 + }, + { + "epoch": 0.06500477203674031, + "grad_norm": 4.607088088989258, + "learning_rate": 9.350562521891419e-05, + "loss": 1.2197, + "num_input_tokens_seen": 14946200, + "step": 928 + }, + { + "epoch": 0.06507482028246955, + "grad_norm": 4.7108001708984375, + "learning_rate": 9.349862697022767e-05, + "loss": 1.2335, + "num_input_tokens_seen": 14961816, + "step": 929 + }, + { + "epoch": 0.0651448685281988, + "grad_norm": 3.844571352005005, + "learning_rate": 9.349162872154115e-05, + "loss": 1.2745, + "num_input_tokens_seen": 14978200, + "step": 930 + }, + { + "epoch": 0.06521491677392804, + "grad_norm": 4.078561782836914, + "learning_rate": 9.348463047285464e-05, + "loss": 1.1737, + "num_input_tokens_seen": 14994440, + "step": 931 + }, + { + "epoch": 0.06528496501965729, + "grad_norm": 4.317986011505127, + "learning_rate": 9.347763222416813e-05, + "loss": 1.3046, + "num_input_tokens_seen": 15010824, + "step": 932 + }, + { + "epoch": 0.06535501326538654, + "grad_norm": 4.459141254425049, + "learning_rate": 9.347063397548162e-05, + "loss": 1.2893, + "num_input_tokens_seen": 15026608, + "step": 933 + }, + { + "epoch": 0.06542506151111578, + "grad_norm": 4.251399993896484, + "learning_rate": 9.34636357267951e-05, + "loss": 1.2346, + "num_input_tokens_seen": 15042328, + "step": 934 + }, + { + "epoch": 0.06549510975684503, + "grad_norm": 4.568341255187988, + "learning_rate": 9.345663747810858e-05, + "loss": 1.4343, + "num_input_tokens_seen": 15058712, + "step": 935 + }, + { + "epoch": 0.06556515800257427, + "grad_norm": 4.7616424560546875, + "learning_rate": 9.344963922942207e-05, + "loss": 1.0925, + "num_input_tokens_seen": 15075096, + "step": 936 + }, + { + "epoch": 0.06563520624830352, + "grad_norm": 3.8224191665649414, + "learning_rate": 9.344264098073555e-05, + "loss": 1.0958, + "num_input_tokens_seen": 15091480, + "step": 937 + }, + { + "epoch": 0.06570525449403276, + "grad_norm": 4.985624313354492, + "learning_rate": 9.343564273204905e-05, + "loss": 1.233, + "num_input_tokens_seen": 15107864, + "step": 938 + }, + { + "epoch": 0.06577530273976201, + "grad_norm": 4.3780975341796875, + "learning_rate": 9.342864448336252e-05, + "loss": 1.1819, + "num_input_tokens_seen": 15123656, + "step": 939 + }, + { + "epoch": 0.06584535098549125, + "grad_norm": 4.435183525085449, + "learning_rate": 9.342164623467601e-05, + "loss": 1.1107, + "num_input_tokens_seen": 15140040, + "step": 940 + }, + { + "epoch": 0.0659153992312205, + "grad_norm": 4.560804843902588, + "learning_rate": 9.34146479859895e-05, + "loss": 1.1274, + "num_input_tokens_seen": 15156424, + "step": 941 + }, + { + "epoch": 0.06598544747694975, + "grad_norm": 5.184841156005859, + "learning_rate": 9.340764973730298e-05, + "loss": 1.3124, + "num_input_tokens_seen": 15172504, + "step": 942 + }, + { + "epoch": 0.06605549572267899, + "grad_norm": 3.5243096351623535, + "learning_rate": 9.340065148861647e-05, + "loss": 0.8203, + "num_input_tokens_seen": 15188888, + "step": 943 + }, + { + "epoch": 0.06612554396840824, + "grad_norm": 4.041544437408447, + "learning_rate": 9.339365323992995e-05, + "loss": 1.0602, + "num_input_tokens_seen": 15204672, + "step": 944 + }, + { + "epoch": 0.06619559221413748, + "grad_norm": 3.720906972885132, + "learning_rate": 9.338665499124344e-05, + "loss": 1.0722, + "num_input_tokens_seen": 15220688, + "step": 945 + }, + { + "epoch": 0.06626564045986673, + "grad_norm": 3.9778380393981934, + "learning_rate": 9.337965674255692e-05, + "loss": 1.2653, + "num_input_tokens_seen": 15236856, + "step": 946 + }, + { + "epoch": 0.06633568870559597, + "grad_norm": 4.486488342285156, + "learning_rate": 9.33726584938704e-05, + "loss": 1.2408, + "num_input_tokens_seen": 15253240, + "step": 947 + }, + { + "epoch": 0.06640573695132522, + "grad_norm": 8.369994163513184, + "learning_rate": 9.33656602451839e-05, + "loss": 1.4841, + "num_input_tokens_seen": 15267728, + "step": 948 + }, + { + "epoch": 0.06647578519705447, + "grad_norm": 4.2056732177734375, + "learning_rate": 9.335866199649737e-05, + "loss": 1.4258, + "num_input_tokens_seen": 15284112, + "step": 949 + }, + { + "epoch": 0.06654583344278371, + "grad_norm": 4.396723747253418, + "learning_rate": 9.335166374781086e-05, + "loss": 1.1578, + "num_input_tokens_seen": 15300496, + "step": 950 + }, + { + "epoch": 0.06661588168851296, + "grad_norm": 3.7177491188049316, + "learning_rate": 9.334466549912435e-05, + "loss": 1.0664, + "num_input_tokens_seen": 15316608, + "step": 951 + }, + { + "epoch": 0.0666859299342422, + "grad_norm": 4.080933094024658, + "learning_rate": 9.333766725043784e-05, + "loss": 1.1282, + "num_input_tokens_seen": 15332976, + "step": 952 + }, + { + "epoch": 0.06675597817997146, + "grad_norm": 5.188856601715088, + "learning_rate": 9.333066900175132e-05, + "loss": 1.2079, + "num_input_tokens_seen": 15349080, + "step": 953 + }, + { + "epoch": 0.06682602642570071, + "grad_norm": 4.583539962768555, + "learning_rate": 9.33236707530648e-05, + "loss": 0.9047, + "num_input_tokens_seen": 15365256, + "step": 954 + }, + { + "epoch": 0.06689607467142995, + "grad_norm": 3.873830795288086, + "learning_rate": 9.331667250437829e-05, + "loss": 1.159, + "num_input_tokens_seen": 15381640, + "step": 955 + }, + { + "epoch": 0.0669661229171592, + "grad_norm": 3.9574460983276367, + "learning_rate": 9.330967425569176e-05, + "loss": 1.0696, + "num_input_tokens_seen": 15397800, + "step": 956 + }, + { + "epoch": 0.06703617116288844, + "grad_norm": 3.8933448791503906, + "learning_rate": 9.330267600700525e-05, + "loss": 0.9844, + "num_input_tokens_seen": 15414112, + "step": 957 + }, + { + "epoch": 0.06710621940861769, + "grad_norm": 4.748478412628174, + "learning_rate": 9.329567775831875e-05, + "loss": 1.1308, + "num_input_tokens_seen": 15430496, + "step": 958 + }, + { + "epoch": 0.06717626765434694, + "grad_norm": 6.755379676818848, + "learning_rate": 9.328867950963223e-05, + "loss": 1.206, + "num_input_tokens_seen": 15445072, + "step": 959 + }, + { + "epoch": 0.06724631590007618, + "grad_norm": 4.382065773010254, + "learning_rate": 9.328168126094572e-05, + "loss": 1.0753, + "num_input_tokens_seen": 15460336, + "step": 960 + }, + { + "epoch": 0.06731636414580543, + "grad_norm": 5.037116527557373, + "learning_rate": 9.327468301225919e-05, + "loss": 1.0562, + "num_input_tokens_seen": 15474752, + "step": 961 + }, + { + "epoch": 0.06738641239153467, + "grad_norm": 5.838945388793945, + "learning_rate": 9.326768476357268e-05, + "loss": 1.314, + "num_input_tokens_seen": 15491136, + "step": 962 + }, + { + "epoch": 0.06745646063726392, + "grad_norm": 3.690436840057373, + "learning_rate": 9.326068651488617e-05, + "loss": 0.996, + "num_input_tokens_seen": 15507520, + "step": 963 + }, + { + "epoch": 0.06752650888299316, + "grad_norm": 4.1123247146606445, + "learning_rate": 9.325368826619966e-05, + "loss": 1.2031, + "num_input_tokens_seen": 15523904, + "step": 964 + }, + { + "epoch": 0.06759655712872241, + "grad_norm": 4.120308876037598, + "learning_rate": 9.324669001751315e-05, + "loss": 0.9671, + "num_input_tokens_seen": 15540136, + "step": 965 + }, + { + "epoch": 0.06766660537445165, + "grad_norm": 3.9849514961242676, + "learning_rate": 9.323969176882662e-05, + "loss": 1.1669, + "num_input_tokens_seen": 15556312, + "step": 966 + }, + { + "epoch": 0.0677366536201809, + "grad_norm": 3.9164884090423584, + "learning_rate": 9.323269352014011e-05, + "loss": 1.0883, + "num_input_tokens_seen": 15571864, + "step": 967 + }, + { + "epoch": 0.06780670186591015, + "grad_norm": 4.282434940338135, + "learning_rate": 9.32256952714536e-05, + "loss": 1.241, + "num_input_tokens_seen": 15587800, + "step": 968 + }, + { + "epoch": 0.06787675011163939, + "grad_norm": 4.118724346160889, + "learning_rate": 9.321869702276707e-05, + "loss": 1.0905, + "num_input_tokens_seen": 15603128, + "step": 969 + }, + { + "epoch": 0.06794679835736864, + "grad_norm": 4.233770847320557, + "learning_rate": 9.321169877408056e-05, + "loss": 1.0618, + "num_input_tokens_seen": 15617864, + "step": 970 + }, + { + "epoch": 0.06801684660309788, + "grad_norm": 3.933587074279785, + "learning_rate": 9.320470052539405e-05, + "loss": 0.982, + "num_input_tokens_seen": 15634248, + "step": 971 + }, + { + "epoch": 0.06808689484882713, + "grad_norm": 4.641788482666016, + "learning_rate": 9.319770227670754e-05, + "loss": 0.9793, + "num_input_tokens_seen": 15650304, + "step": 972 + }, + { + "epoch": 0.06815694309455637, + "grad_norm": 4.138880729675293, + "learning_rate": 9.319070402802102e-05, + "loss": 1.1991, + "num_input_tokens_seen": 15666688, + "step": 973 + }, + { + "epoch": 0.06822699134028562, + "grad_norm": 4.823685169219971, + "learning_rate": 9.31837057793345e-05, + "loss": 0.9162, + "num_input_tokens_seen": 15682936, + "step": 974 + }, + { + "epoch": 0.06829703958601487, + "grad_norm": 4.432481288909912, + "learning_rate": 9.317670753064799e-05, + "loss": 0.9626, + "num_input_tokens_seen": 15699320, + "step": 975 + }, + { + "epoch": 0.06836708783174411, + "grad_norm": 4.115868091583252, + "learning_rate": 9.316970928196147e-05, + "loss": 1.105, + "num_input_tokens_seen": 15715296, + "step": 976 + }, + { + "epoch": 0.06843713607747336, + "grad_norm": 3.964905023574829, + "learning_rate": 9.316271103327496e-05, + "loss": 1.0064, + "num_input_tokens_seen": 15731680, + "step": 977 + }, + { + "epoch": 0.0685071843232026, + "grad_norm": 3.686522960662842, + "learning_rate": 9.315571278458846e-05, + "loss": 0.9924, + "num_input_tokens_seen": 15747808, + "step": 978 + }, + { + "epoch": 0.06857723256893185, + "grad_norm": 4.0614423751831055, + "learning_rate": 9.314871453590193e-05, + "loss": 1.0425, + "num_input_tokens_seen": 15764168, + "step": 979 + }, + { + "epoch": 0.0686472808146611, + "grad_norm": 3.756350517272949, + "learning_rate": 9.314171628721542e-05, + "loss": 1.0757, + "num_input_tokens_seen": 15780176, + "step": 980 + }, + { + "epoch": 0.06871732906039034, + "grad_norm": 4.30344820022583, + "learning_rate": 9.31347180385289e-05, + "loss": 0.9496, + "num_input_tokens_seen": 15795720, + "step": 981 + }, + { + "epoch": 0.06878737730611958, + "grad_norm": 4.055768013000488, + "learning_rate": 9.312771978984239e-05, + "loss": 1.0189, + "num_input_tokens_seen": 15811528, + "step": 982 + }, + { + "epoch": 0.06885742555184883, + "grad_norm": 3.8779115676879883, + "learning_rate": 9.312072154115586e-05, + "loss": 1.0516, + "num_input_tokens_seen": 15827392, + "step": 983 + }, + { + "epoch": 0.06892747379757808, + "grad_norm": 5.014206886291504, + "learning_rate": 9.311372329246936e-05, + "loss": 1.3421, + "num_input_tokens_seen": 15843776, + "step": 984 + }, + { + "epoch": 0.06899752204330732, + "grad_norm": 4.548489570617676, + "learning_rate": 9.310672504378285e-05, + "loss": 1.1652, + "num_input_tokens_seen": 15858880, + "step": 985 + }, + { + "epoch": 0.06906757028903657, + "grad_norm": 4.312918186187744, + "learning_rate": 9.309972679509633e-05, + "loss": 1.2728, + "num_input_tokens_seen": 15874840, + "step": 986 + }, + { + "epoch": 0.06913761853476583, + "grad_norm": 3.9783735275268555, + "learning_rate": 9.309272854640981e-05, + "loss": 0.9377, + "num_input_tokens_seen": 15890568, + "step": 987 + }, + { + "epoch": 0.06920766678049507, + "grad_norm": 4.155986309051514, + "learning_rate": 9.308573029772329e-05, + "loss": 1.0278, + "num_input_tokens_seen": 15906952, + "step": 988 + }, + { + "epoch": 0.06927771502622432, + "grad_norm": 3.633018732070923, + "learning_rate": 9.307873204903678e-05, + "loss": 1.1276, + "num_input_tokens_seen": 15923336, + "step": 989 + }, + { + "epoch": 0.06934776327195356, + "grad_norm": 3.9513449668884277, + "learning_rate": 9.307173380035027e-05, + "loss": 0.9076, + "num_input_tokens_seen": 15939720, + "step": 990 + }, + { + "epoch": 0.06941781151768281, + "grad_norm": 4.296191692352295, + "learning_rate": 9.306473555166376e-05, + "loss": 1.0375, + "num_input_tokens_seen": 15956104, + "step": 991 + }, + { + "epoch": 0.06948785976341205, + "grad_norm": 5.266847133636475, + "learning_rate": 9.305773730297724e-05, + "loss": 1.1645, + "num_input_tokens_seen": 15972488, + "step": 992 + }, + { + "epoch": 0.0695579080091413, + "grad_norm": 4.321287155151367, + "learning_rate": 9.305073905429072e-05, + "loss": 1.046, + "num_input_tokens_seen": 15988408, + "step": 993 + }, + { + "epoch": 0.06962795625487055, + "grad_norm": 4.1421613693237305, + "learning_rate": 9.304374080560421e-05, + "loss": 1.0639, + "num_input_tokens_seen": 16002904, + "step": 994 + }, + { + "epoch": 0.06969800450059979, + "grad_norm": 6.811270713806152, + "learning_rate": 9.30367425569177e-05, + "loss": 1.1012, + "num_input_tokens_seen": 16017424, + "step": 995 + }, + { + "epoch": 0.06976805274632904, + "grad_norm": 4.968684196472168, + "learning_rate": 9.302974430823117e-05, + "loss": 1.0935, + "num_input_tokens_seen": 16033808, + "step": 996 + }, + { + "epoch": 0.06983810099205828, + "grad_norm": 4.592737197875977, + "learning_rate": 9.302274605954466e-05, + "loss": 0.9698, + "num_input_tokens_seen": 16050192, + "step": 997 + }, + { + "epoch": 0.06990814923778753, + "grad_norm": 3.7984917163848877, + "learning_rate": 9.301574781085815e-05, + "loss": 1.0976, + "num_input_tokens_seen": 16066192, + "step": 998 + }, + { + "epoch": 0.06997819748351677, + "grad_norm": 4.594212055206299, + "learning_rate": 9.300874956217164e-05, + "loss": 1.3718, + "num_input_tokens_seen": 16082576, + "step": 999 + }, + { + "epoch": 0.07004824572924602, + "grad_norm": 5.062666893005371, + "learning_rate": 9.300175131348511e-05, + "loss": 1.3139, + "num_input_tokens_seen": 16098960, + "step": 1000 + }, + { + "epoch": 0.07004824572924602, + "eval_loss": 1.1650840044021606, + "eval_runtime": 0.192, + "eval_samples_per_second": 5.208, + "eval_steps_per_second": 5.208, + "num_input_tokens_seen": 16098960, + "step": 1000 + }, + { + "epoch": 0.07011829397497527, + "grad_norm": 4.100902557373047, + "learning_rate": 9.29947530647986e-05, + "loss": 1.2711, + "num_input_tokens_seen": 16115216, + "step": 1001 + }, + { + "epoch": 0.07018834222070451, + "grad_norm": 4.24728536605835, + "learning_rate": 9.298775481611209e-05, + "loss": 0.9946, + "num_input_tokens_seen": 16130080, + "step": 1002 + }, + { + "epoch": 0.07025839046643376, + "grad_norm": 3.4653356075286865, + "learning_rate": 9.298075656742556e-05, + "loss": 0.8736, + "num_input_tokens_seen": 16146400, + "step": 1003 + }, + { + "epoch": 0.070328438712163, + "grad_norm": 5.548775672912598, + "learning_rate": 9.297375831873907e-05, + "loss": 0.9841, + "num_input_tokens_seen": 16162784, + "step": 1004 + }, + { + "epoch": 0.07039848695789225, + "grad_norm": 4.11661958694458, + "learning_rate": 9.296676007005256e-05, + "loss": 0.9857, + "num_input_tokens_seen": 16179024, + "step": 1005 + }, + { + "epoch": 0.0704685352036215, + "grad_norm": 4.006300449371338, + "learning_rate": 9.295976182136603e-05, + "loss": 1.0587, + "num_input_tokens_seen": 16195408, + "step": 1006 + }, + { + "epoch": 0.07053858344935074, + "grad_norm": 4.418802261352539, + "learning_rate": 9.295276357267952e-05, + "loss": 1.3845, + "num_input_tokens_seen": 16211792, + "step": 1007 + }, + { + "epoch": 0.07060863169507998, + "grad_norm": 5.625720024108887, + "learning_rate": 9.2945765323993e-05, + "loss": 1.2198, + "num_input_tokens_seen": 16226584, + "step": 1008 + }, + { + "epoch": 0.07067867994080923, + "grad_norm": 4.209630489349365, + "learning_rate": 9.293876707530648e-05, + "loss": 0.9387, + "num_input_tokens_seen": 16242256, + "step": 1009 + }, + { + "epoch": 0.07074872818653848, + "grad_norm": 4.0324788093566895, + "learning_rate": 9.293176882661997e-05, + "loss": 1.0713, + "num_input_tokens_seen": 16258640, + "step": 1010 + }, + { + "epoch": 0.07081877643226772, + "grad_norm": 4.0557684898376465, + "learning_rate": 9.292477057793346e-05, + "loss": 1.2831, + "num_input_tokens_seen": 16275024, + "step": 1011 + }, + { + "epoch": 0.07088882467799697, + "grad_norm": 4.511384010314941, + "learning_rate": 9.291777232924695e-05, + "loss": 1.1949, + "num_input_tokens_seen": 16291112, + "step": 1012 + }, + { + "epoch": 0.07095887292372621, + "grad_norm": 3.8120172023773193, + "learning_rate": 9.291077408056042e-05, + "loss": 1.013, + "num_input_tokens_seen": 16307496, + "step": 1013 + }, + { + "epoch": 0.07102892116945546, + "grad_norm": 4.039558410644531, + "learning_rate": 9.290377583187391e-05, + "loss": 1.1575, + "num_input_tokens_seen": 16323880, + "step": 1014 + }, + { + "epoch": 0.0710989694151847, + "grad_norm": 3.9076366424560547, + "learning_rate": 9.289677758318739e-05, + "loss": 1.1776, + "num_input_tokens_seen": 16339624, + "step": 1015 + }, + { + "epoch": 0.07116901766091395, + "grad_norm": 3.8083527088165283, + "learning_rate": 9.288977933450088e-05, + "loss": 0.965, + "num_input_tokens_seen": 16356008, + "step": 1016 + }, + { + "epoch": 0.0712390659066432, + "grad_norm": 4.5387282371521, + "learning_rate": 9.288278108581436e-05, + "loss": 1.1113, + "num_input_tokens_seen": 16372392, + "step": 1017 + }, + { + "epoch": 0.07130911415237244, + "grad_norm": 3.9228522777557373, + "learning_rate": 9.287578283712785e-05, + "loss": 1.1609, + "num_input_tokens_seen": 16388776, + "step": 1018 + }, + { + "epoch": 0.07137916239810169, + "grad_norm": 4.170912742614746, + "learning_rate": 9.286878458844134e-05, + "loss": 1.1324, + "num_input_tokens_seen": 16405160, + "step": 1019 + }, + { + "epoch": 0.07144921064383093, + "grad_norm": 4.426759719848633, + "learning_rate": 9.286178633975482e-05, + "loss": 1.2825, + "num_input_tokens_seen": 16421544, + "step": 1020 + }, + { + "epoch": 0.07151925888956018, + "grad_norm": 3.8606133460998535, + "learning_rate": 9.28547880910683e-05, + "loss": 1.1734, + "num_input_tokens_seen": 16437736, + "step": 1021 + }, + { + "epoch": 0.07158930713528944, + "grad_norm": 4.040006637573242, + "learning_rate": 9.28477898423818e-05, + "loss": 1.0824, + "num_input_tokens_seen": 16453776, + "step": 1022 + }, + { + "epoch": 0.07165935538101868, + "grad_norm": 3.7698042392730713, + "learning_rate": 9.284079159369527e-05, + "loss": 1.0951, + "num_input_tokens_seen": 16470160, + "step": 1023 + }, + { + "epoch": 0.07172940362674793, + "grad_norm": 4.180328369140625, + "learning_rate": 9.283379334500877e-05, + "loss": 1.0087, + "num_input_tokens_seen": 16486280, + "step": 1024 + }, + { + "epoch": 0.07179945187247717, + "grad_norm": 6.02299690246582, + "learning_rate": 9.282679509632225e-05, + "loss": 0.9788, + "num_input_tokens_seen": 16501784, + "step": 1025 + }, + { + "epoch": 0.07186950011820642, + "grad_norm": 4.239454746246338, + "learning_rate": 9.281979684763573e-05, + "loss": 1.3031, + "num_input_tokens_seen": 16518096, + "step": 1026 + }, + { + "epoch": 0.07193954836393567, + "grad_norm": 3.446030616760254, + "learning_rate": 9.281279859894921e-05, + "loss": 0.9523, + "num_input_tokens_seen": 16534480, + "step": 1027 + }, + { + "epoch": 0.07200959660966491, + "grad_norm": 4.2813568115234375, + "learning_rate": 9.28058003502627e-05, + "loss": 1.1041, + "num_input_tokens_seen": 16550864, + "step": 1028 + }, + { + "epoch": 0.07207964485539416, + "grad_norm": 5.289443016052246, + "learning_rate": 9.279880210157619e-05, + "loss": 1.3036, + "num_input_tokens_seen": 16567248, + "step": 1029 + }, + { + "epoch": 0.0721496931011234, + "grad_norm": 3.680283308029175, + "learning_rate": 9.279180385288967e-05, + "loss": 1.1434, + "num_input_tokens_seen": 16583632, + "step": 1030 + }, + { + "epoch": 0.07221974134685265, + "grad_norm": 4.283925533294678, + "learning_rate": 9.278480560420316e-05, + "loss": 1.1569, + "num_input_tokens_seen": 16600016, + "step": 1031 + }, + { + "epoch": 0.0722897895925819, + "grad_norm": 4.913532733917236, + "learning_rate": 9.277780735551665e-05, + "loss": 1.218, + "num_input_tokens_seen": 16616400, + "step": 1032 + }, + { + "epoch": 0.07235983783831114, + "grad_norm": 4.344277381896973, + "learning_rate": 9.277080910683013e-05, + "loss": 1.1495, + "num_input_tokens_seen": 16632024, + "step": 1033 + }, + { + "epoch": 0.07242988608404038, + "grad_norm": 3.9231889247894287, + "learning_rate": 9.276381085814362e-05, + "loss": 1.0492, + "num_input_tokens_seen": 16648408, + "step": 1034 + }, + { + "epoch": 0.07249993432976963, + "grad_norm": 4.062288284301758, + "learning_rate": 9.275681260945709e-05, + "loss": 0.927, + "num_input_tokens_seen": 16664792, + "step": 1035 + }, + { + "epoch": 0.07256998257549888, + "grad_norm": 4.163131237030029, + "learning_rate": 9.274981436077058e-05, + "loss": 1.0782, + "num_input_tokens_seen": 16680216, + "step": 1036 + }, + { + "epoch": 0.07264003082122812, + "grad_norm": 5.220231056213379, + "learning_rate": 9.274281611208407e-05, + "loss": 1.125, + "num_input_tokens_seen": 16696160, + "step": 1037 + }, + { + "epoch": 0.07271007906695737, + "grad_norm": 3.63785457611084, + "learning_rate": 9.273581786339756e-05, + "loss": 1.0229, + "num_input_tokens_seen": 16712544, + "step": 1038 + }, + { + "epoch": 0.07278012731268661, + "grad_norm": 4.612295627593994, + "learning_rate": 9.272881961471105e-05, + "loss": 1.3076, + "num_input_tokens_seen": 16728928, + "step": 1039 + }, + { + "epoch": 0.07285017555841586, + "grad_norm": 5.278262615203857, + "learning_rate": 9.272182136602452e-05, + "loss": 1.2682, + "num_input_tokens_seen": 16744184, + "step": 1040 + }, + { + "epoch": 0.0729202238041451, + "grad_norm": 4.3274455070495605, + "learning_rate": 9.271482311733801e-05, + "loss": 1.3517, + "num_input_tokens_seen": 16760056, + "step": 1041 + }, + { + "epoch": 0.07299027204987435, + "grad_norm": 4.1077375411987305, + "learning_rate": 9.270782486865148e-05, + "loss": 1.175, + "num_input_tokens_seen": 16776280, + "step": 1042 + }, + { + "epoch": 0.0730603202956036, + "grad_norm": 3.954604148864746, + "learning_rate": 9.270082661996497e-05, + "loss": 1.189, + "num_input_tokens_seen": 16792456, + "step": 1043 + }, + { + "epoch": 0.07313036854133284, + "grad_norm": 4.111297607421875, + "learning_rate": 9.269382837127847e-05, + "loss": 1.0265, + "num_input_tokens_seen": 16808840, + "step": 1044 + }, + { + "epoch": 0.07320041678706209, + "grad_norm": 3.56953501701355, + "learning_rate": 9.268683012259195e-05, + "loss": 1.0114, + "num_input_tokens_seen": 16824720, + "step": 1045 + }, + { + "epoch": 0.07327046503279133, + "grad_norm": 4.962648868560791, + "learning_rate": 9.267983187390544e-05, + "loss": 1.1714, + "num_input_tokens_seen": 16841104, + "step": 1046 + }, + { + "epoch": 0.07334051327852058, + "grad_norm": 3.7930710315704346, + "learning_rate": 9.267283362521891e-05, + "loss": 1.0903, + "num_input_tokens_seen": 16857488, + "step": 1047 + }, + { + "epoch": 0.07341056152424982, + "grad_norm": 4.158027172088623, + "learning_rate": 9.26658353765324e-05, + "loss": 1.1823, + "num_input_tokens_seen": 16873856, + "step": 1048 + }, + { + "epoch": 0.07348060976997907, + "grad_norm": 4.1571197509765625, + "learning_rate": 9.265883712784589e-05, + "loss": 1.2572, + "num_input_tokens_seen": 16890240, + "step": 1049 + }, + { + "epoch": 0.07355065801570831, + "grad_norm": 4.330874443054199, + "learning_rate": 9.265183887915938e-05, + "loss": 1.194, + "num_input_tokens_seen": 16906624, + "step": 1050 + }, + { + "epoch": 0.07362070626143756, + "grad_norm": 6.105716705322266, + "learning_rate": 9.264484063047287e-05, + "loss": 1.0685, + "num_input_tokens_seen": 16922864, + "step": 1051 + }, + { + "epoch": 0.0736907545071668, + "grad_norm": 4.8344407081604, + "learning_rate": 9.263784238178634e-05, + "loss": 1.1992, + "num_input_tokens_seen": 16939200, + "step": 1052 + }, + { + "epoch": 0.07376080275289605, + "grad_norm": 3.553568124771118, + "learning_rate": 9.263084413309983e-05, + "loss": 0.7907, + "num_input_tokens_seen": 16955584, + "step": 1053 + }, + { + "epoch": 0.0738308509986253, + "grad_norm": 3.8178694248199463, + "learning_rate": 9.26238458844133e-05, + "loss": 1.2031, + "num_input_tokens_seen": 16971968, + "step": 1054 + }, + { + "epoch": 0.07390089924435454, + "grad_norm": 3.5509321689605713, + "learning_rate": 9.26168476357268e-05, + "loss": 1.1189, + "num_input_tokens_seen": 16988352, + "step": 1055 + }, + { + "epoch": 0.0739709474900838, + "grad_norm": 3.870811939239502, + "learning_rate": 9.260984938704028e-05, + "loss": 1.0205, + "num_input_tokens_seen": 17004736, + "step": 1056 + }, + { + "epoch": 0.07404099573581305, + "grad_norm": 11.86201286315918, + "learning_rate": 9.260285113835377e-05, + "loss": 1.037, + "num_input_tokens_seen": 17020544, + "step": 1057 + }, + { + "epoch": 0.0741110439815423, + "grad_norm": 5.2176127433776855, + "learning_rate": 9.259585288966726e-05, + "loss": 1.0797, + "num_input_tokens_seen": 17036472, + "step": 1058 + }, + { + "epoch": 0.07418109222727154, + "grad_norm": 3.72566819190979, + "learning_rate": 9.258885464098075e-05, + "loss": 0.9307, + "num_input_tokens_seen": 17052360, + "step": 1059 + }, + { + "epoch": 0.07425114047300078, + "grad_norm": 4.323361396789551, + "learning_rate": 9.258185639229422e-05, + "loss": 1.0783, + "num_input_tokens_seen": 17067672, + "step": 1060 + }, + { + "epoch": 0.07432118871873003, + "grad_norm": 4.01705265045166, + "learning_rate": 9.257485814360771e-05, + "loss": 1.0402, + "num_input_tokens_seen": 17084056, + "step": 1061 + }, + { + "epoch": 0.07439123696445928, + "grad_norm": 4.4460039138793945, + "learning_rate": 9.256785989492119e-05, + "loss": 1.2294, + "num_input_tokens_seen": 17100096, + "step": 1062 + }, + { + "epoch": 0.07446128521018852, + "grad_norm": 4.634500503540039, + "learning_rate": 9.256086164623468e-05, + "loss": 1.1479, + "num_input_tokens_seen": 17116440, + "step": 1063 + }, + { + "epoch": 0.07453133345591777, + "grad_norm": 4.146971702575684, + "learning_rate": 9.255386339754817e-05, + "loss": 0.9052, + "num_input_tokens_seen": 17132592, + "step": 1064 + }, + { + "epoch": 0.07460138170164701, + "grad_norm": 6.171874523162842, + "learning_rate": 9.254686514886165e-05, + "loss": 1.1135, + "num_input_tokens_seen": 17148704, + "step": 1065 + }, + { + "epoch": 0.07467142994737626, + "grad_norm": 6.25461483001709, + "learning_rate": 9.253986690017514e-05, + "loss": 1.0003, + "num_input_tokens_seen": 17164920, + "step": 1066 + }, + { + "epoch": 0.0747414781931055, + "grad_norm": 3.886582851409912, + "learning_rate": 9.253286865148862e-05, + "loss": 1.1917, + "num_input_tokens_seen": 17181304, + "step": 1067 + }, + { + "epoch": 0.07481152643883475, + "grad_norm": 5.067885398864746, + "learning_rate": 9.25258704028021e-05, + "loss": 1.4475, + "num_input_tokens_seen": 17197208, + "step": 1068 + }, + { + "epoch": 0.074881574684564, + "grad_norm": 4.186190128326416, + "learning_rate": 9.251887215411558e-05, + "loss": 1.1255, + "num_input_tokens_seen": 17212680, + "step": 1069 + }, + { + "epoch": 0.07495162293029324, + "grad_norm": 4.059047698974609, + "learning_rate": 9.251187390542908e-05, + "loss": 1.1467, + "num_input_tokens_seen": 17229064, + "step": 1070 + }, + { + "epoch": 0.07502167117602249, + "grad_norm": 4.154530048370361, + "learning_rate": 9.250487565674257e-05, + "loss": 1.0811, + "num_input_tokens_seen": 17245448, + "step": 1071 + }, + { + "epoch": 0.07509171942175173, + "grad_norm": 3.760453701019287, + "learning_rate": 9.249787740805605e-05, + "loss": 1.1493, + "num_input_tokens_seen": 17261832, + "step": 1072 + }, + { + "epoch": 0.07516176766748098, + "grad_norm": 3.8155417442321777, + "learning_rate": 9.249087915936954e-05, + "loss": 1.0934, + "num_input_tokens_seen": 17278216, + "step": 1073 + }, + { + "epoch": 0.07523181591321022, + "grad_norm": 4.807973384857178, + "learning_rate": 9.248388091068301e-05, + "loss": 1.0704, + "num_input_tokens_seen": 17294600, + "step": 1074 + }, + { + "epoch": 0.07530186415893947, + "grad_norm": 11.421661376953125, + "learning_rate": 9.24768826619965e-05, + "loss": 0.9472, + "num_input_tokens_seen": 17308960, + "step": 1075 + }, + { + "epoch": 0.07537191240466871, + "grad_norm": 3.7491819858551025, + "learning_rate": 9.246988441330999e-05, + "loss": 1.1395, + "num_input_tokens_seen": 17324536, + "step": 1076 + }, + { + "epoch": 0.07544196065039796, + "grad_norm": 3.6289992332458496, + "learning_rate": 9.246288616462348e-05, + "loss": 0.9375, + "num_input_tokens_seen": 17340920, + "step": 1077 + }, + { + "epoch": 0.0755120088961272, + "grad_norm": 5.741896629333496, + "learning_rate": 9.245588791593696e-05, + "loss": 1.1656, + "num_input_tokens_seen": 17357304, + "step": 1078 + }, + { + "epoch": 0.07558205714185645, + "grad_norm": 3.5879697799682617, + "learning_rate": 9.244888966725044e-05, + "loss": 0.9421, + "num_input_tokens_seen": 17373592, + "step": 1079 + }, + { + "epoch": 0.0756521053875857, + "grad_norm": 7.3384504318237305, + "learning_rate": 9.244189141856393e-05, + "loss": 1.1358, + "num_input_tokens_seen": 17387872, + "step": 1080 + }, + { + "epoch": 0.07572215363331494, + "grad_norm": 3.6677255630493164, + "learning_rate": 9.24348931698774e-05, + "loss": 0.892, + "num_input_tokens_seen": 17403088, + "step": 1081 + }, + { + "epoch": 0.07579220187904419, + "grad_norm": 3.953216075897217, + "learning_rate": 9.242789492119089e-05, + "loss": 0.9757, + "num_input_tokens_seen": 17419392, + "step": 1082 + }, + { + "epoch": 0.07586225012477343, + "grad_norm": 4.827987194061279, + "learning_rate": 9.242089667250438e-05, + "loss": 1.1493, + "num_input_tokens_seen": 17435776, + "step": 1083 + }, + { + "epoch": 0.07593229837050268, + "grad_norm": 4.416223526000977, + "learning_rate": 9.241389842381787e-05, + "loss": 0.9913, + "num_input_tokens_seen": 17452080, + "step": 1084 + }, + { + "epoch": 0.07600234661623193, + "grad_norm": 3.7776753902435303, + "learning_rate": 9.240690017513136e-05, + "loss": 1.0589, + "num_input_tokens_seen": 17468160, + "step": 1085 + }, + { + "epoch": 0.07607239486196117, + "grad_norm": 4.139477252960205, + "learning_rate": 9.239990192644485e-05, + "loss": 0.9475, + "num_input_tokens_seen": 17484544, + "step": 1086 + }, + { + "epoch": 0.07614244310769042, + "grad_norm": 5.218942642211914, + "learning_rate": 9.239290367775832e-05, + "loss": 1.1626, + "num_input_tokens_seen": 17500928, + "step": 1087 + }, + { + "epoch": 0.07621249135341966, + "grad_norm": 4.773080348968506, + "learning_rate": 9.238590542907181e-05, + "loss": 1.154, + "num_input_tokens_seen": 17517312, + "step": 1088 + }, + { + "epoch": 0.07628253959914891, + "grad_norm": 3.840151309967041, + "learning_rate": 9.237890718038528e-05, + "loss": 1.0862, + "num_input_tokens_seen": 17533696, + "step": 1089 + }, + { + "epoch": 0.07635258784487815, + "grad_norm": 4.201962471008301, + "learning_rate": 9.237190893169879e-05, + "loss": 1.0945, + "num_input_tokens_seen": 17549512, + "step": 1090 + }, + { + "epoch": 0.07642263609060741, + "grad_norm": 4.4583001136779785, + "learning_rate": 9.236491068301226e-05, + "loss": 1.074, + "num_input_tokens_seen": 17565896, + "step": 1091 + }, + { + "epoch": 0.07649268433633666, + "grad_norm": 4.013672351837158, + "learning_rate": 9.235791243432575e-05, + "loss": 1.2545, + "num_input_tokens_seen": 17582264, + "step": 1092 + }, + { + "epoch": 0.0765627325820659, + "grad_norm": 3.69555926322937, + "learning_rate": 9.235091418563924e-05, + "loss": 1.1615, + "num_input_tokens_seen": 17597888, + "step": 1093 + }, + { + "epoch": 0.07663278082779515, + "grad_norm": 4.341784954071045, + "learning_rate": 9.234391593695271e-05, + "loss": 1.0369, + "num_input_tokens_seen": 17613392, + "step": 1094 + }, + { + "epoch": 0.0767028290735244, + "grad_norm": 4.043522357940674, + "learning_rate": 9.23369176882662e-05, + "loss": 1.0509, + "num_input_tokens_seen": 17629216, + "step": 1095 + }, + { + "epoch": 0.07677287731925364, + "grad_norm": 4.330739498138428, + "learning_rate": 9.232991943957969e-05, + "loss": 1.2208, + "num_input_tokens_seen": 17645600, + "step": 1096 + }, + { + "epoch": 0.07684292556498289, + "grad_norm": 4.8433122634887695, + "learning_rate": 9.232292119089318e-05, + "loss": 0.9492, + "num_input_tokens_seen": 17660952, + "step": 1097 + }, + { + "epoch": 0.07691297381071213, + "grad_norm": 3.9039859771728516, + "learning_rate": 9.231592294220667e-05, + "loss": 1.0601, + "num_input_tokens_seen": 17677336, + "step": 1098 + }, + { + "epoch": 0.07698302205644138, + "grad_norm": 3.814103126525879, + "learning_rate": 9.230892469352014e-05, + "loss": 0.9902, + "num_input_tokens_seen": 17693720, + "step": 1099 + }, + { + "epoch": 0.07705307030217062, + "grad_norm": 3.9864039421081543, + "learning_rate": 9.230192644483363e-05, + "loss": 1.1622, + "num_input_tokens_seen": 17710104, + "step": 1100 + }, + { + "epoch": 0.07712311854789987, + "grad_norm": 4.469820499420166, + "learning_rate": 9.229492819614711e-05, + "loss": 1.044, + "num_input_tokens_seen": 17726488, + "step": 1101 + }, + { + "epoch": 0.07719316679362911, + "grad_norm": 3.8044216632843018, + "learning_rate": 9.22879299474606e-05, + "loss": 1.1283, + "num_input_tokens_seen": 17742648, + "step": 1102 + }, + { + "epoch": 0.07726321503935836, + "grad_norm": 4.859435558319092, + "learning_rate": 9.228093169877408e-05, + "loss": 1.0995, + "num_input_tokens_seen": 17759032, + "step": 1103 + }, + { + "epoch": 0.0773332632850876, + "grad_norm": 3.830214023590088, + "learning_rate": 9.227393345008757e-05, + "loss": 1.1731, + "num_input_tokens_seen": 17774872, + "step": 1104 + }, + { + "epoch": 0.07740331153081685, + "grad_norm": 4.196676254272461, + "learning_rate": 9.226693520140106e-05, + "loss": 1.2055, + "num_input_tokens_seen": 17790832, + "step": 1105 + }, + { + "epoch": 0.0774733597765461, + "grad_norm": 4.50007438659668, + "learning_rate": 9.225993695271454e-05, + "loss": 0.952, + "num_input_tokens_seen": 17805024, + "step": 1106 + }, + { + "epoch": 0.07754340802227534, + "grad_norm": 4.392070293426514, + "learning_rate": 9.225293870402803e-05, + "loss": 1.1548, + "num_input_tokens_seen": 17820008, + "step": 1107 + }, + { + "epoch": 0.07761345626800459, + "grad_norm": 4.09447717666626, + "learning_rate": 9.22459404553415e-05, + "loss": 1.1233, + "num_input_tokens_seen": 17836392, + "step": 1108 + }, + { + "epoch": 0.07768350451373383, + "grad_norm": 4.591554641723633, + "learning_rate": 9.223894220665499e-05, + "loss": 1.2772, + "num_input_tokens_seen": 17852776, + "step": 1109 + }, + { + "epoch": 0.07775355275946308, + "grad_norm": 5.629931926727295, + "learning_rate": 9.223194395796849e-05, + "loss": 1.1453, + "num_input_tokens_seen": 17869160, + "step": 1110 + }, + { + "epoch": 0.07782360100519232, + "grad_norm": 4.307553768157959, + "learning_rate": 9.222494570928197e-05, + "loss": 1.1479, + "num_input_tokens_seen": 17885544, + "step": 1111 + }, + { + "epoch": 0.07789364925092157, + "grad_norm": 4.599300384521484, + "learning_rate": 9.221794746059545e-05, + "loss": 1.1304, + "num_input_tokens_seen": 17901848, + "step": 1112 + }, + { + "epoch": 0.07796369749665082, + "grad_norm": 4.217408657073975, + "learning_rate": 9.221094921190894e-05, + "loss": 1.1611, + "num_input_tokens_seen": 17918232, + "step": 1113 + }, + { + "epoch": 0.07803374574238006, + "grad_norm": 3.885847568511963, + "learning_rate": 9.220395096322242e-05, + "loss": 0.968, + "num_input_tokens_seen": 17934504, + "step": 1114 + }, + { + "epoch": 0.07810379398810931, + "grad_norm": 4.280134677886963, + "learning_rate": 9.219695271453591e-05, + "loss": 1.0944, + "num_input_tokens_seen": 17950888, + "step": 1115 + }, + { + "epoch": 0.07817384223383855, + "grad_norm": 4.081259727478027, + "learning_rate": 9.21899544658494e-05, + "loss": 1.0872, + "num_input_tokens_seen": 17967088, + "step": 1116 + }, + { + "epoch": 0.0782438904795678, + "grad_norm": 4.206293106079102, + "learning_rate": 9.218295621716288e-05, + "loss": 1.2013, + "num_input_tokens_seen": 17983312, + "step": 1117 + }, + { + "epoch": 0.07831393872529704, + "grad_norm": 4.837226390838623, + "learning_rate": 9.217595796847636e-05, + "loss": 1.2628, + "num_input_tokens_seen": 17998768, + "step": 1118 + }, + { + "epoch": 0.07838398697102629, + "grad_norm": 4.344440460205078, + "learning_rate": 9.216895971978985e-05, + "loss": 1.0389, + "num_input_tokens_seen": 18014840, + "step": 1119 + }, + { + "epoch": 0.07845403521675554, + "grad_norm": 4.357896327972412, + "learning_rate": 9.216196147110334e-05, + "loss": 1.2444, + "num_input_tokens_seen": 18030696, + "step": 1120 + }, + { + "epoch": 0.07852408346248478, + "grad_norm": 3.6449878215789795, + "learning_rate": 9.215496322241681e-05, + "loss": 1.0622, + "num_input_tokens_seen": 18047024, + "step": 1121 + }, + { + "epoch": 0.07859413170821403, + "grad_norm": 4.154385566711426, + "learning_rate": 9.21479649737303e-05, + "loss": 1.1551, + "num_input_tokens_seen": 18063408, + "step": 1122 + }, + { + "epoch": 0.07866417995394327, + "grad_norm": 3.5929031372070312, + "learning_rate": 9.214096672504379e-05, + "loss": 0.9682, + "num_input_tokens_seen": 18079280, + "step": 1123 + }, + { + "epoch": 0.07873422819967252, + "grad_norm": 3.5724170207977295, + "learning_rate": 9.213396847635728e-05, + "loss": 0.8952, + "num_input_tokens_seen": 18094488, + "step": 1124 + }, + { + "epoch": 0.07880427644540176, + "grad_norm": 4.100067615509033, + "learning_rate": 9.212697022767077e-05, + "loss": 0.9066, + "num_input_tokens_seen": 18110872, + "step": 1125 + }, + { + "epoch": 0.07887432469113102, + "grad_norm": 4.431338787078857, + "learning_rate": 9.211997197898424e-05, + "loss": 1.0116, + "num_input_tokens_seen": 18127256, + "step": 1126 + }, + { + "epoch": 0.07894437293686027, + "grad_norm": 3.9577043056488037, + "learning_rate": 9.211297373029773e-05, + "loss": 1.1299, + "num_input_tokens_seen": 18143208, + "step": 1127 + }, + { + "epoch": 0.07901442118258951, + "grad_norm": 4.753921985626221, + "learning_rate": 9.21059754816112e-05, + "loss": 1.0686, + "num_input_tokens_seen": 18158888, + "step": 1128 + }, + { + "epoch": 0.07908446942831876, + "grad_norm": 3.763982057571411, + "learning_rate": 9.209897723292469e-05, + "loss": 1.0467, + "num_input_tokens_seen": 18175192, + "step": 1129 + }, + { + "epoch": 0.079154517674048, + "grad_norm": 3.729553699493408, + "learning_rate": 9.20919789842382e-05, + "loss": 1.1152, + "num_input_tokens_seen": 18191384, + "step": 1130 + }, + { + "epoch": 0.07922456591977725, + "grad_norm": 3.7760956287384033, + "learning_rate": 9.208498073555167e-05, + "loss": 1.0994, + "num_input_tokens_seen": 18207768, + "step": 1131 + }, + { + "epoch": 0.0792946141655065, + "grad_norm": 4.64035177230835, + "learning_rate": 9.207798248686516e-05, + "loss": 1.1037, + "num_input_tokens_seen": 18224152, + "step": 1132 + }, + { + "epoch": 0.07936466241123574, + "grad_norm": 4.1443352699279785, + "learning_rate": 9.207098423817863e-05, + "loss": 1.2329, + "num_input_tokens_seen": 18240536, + "step": 1133 + }, + { + "epoch": 0.07943471065696499, + "grad_norm": 5.332706451416016, + "learning_rate": 9.206398598949212e-05, + "loss": 1.1303, + "num_input_tokens_seen": 18255528, + "step": 1134 + }, + { + "epoch": 0.07950475890269423, + "grad_norm": 3.914705514907837, + "learning_rate": 9.20569877408056e-05, + "loss": 1.1182, + "num_input_tokens_seen": 18271768, + "step": 1135 + }, + { + "epoch": 0.07957480714842348, + "grad_norm": 4.994162559509277, + "learning_rate": 9.20499894921191e-05, + "loss": 1.175, + "num_input_tokens_seen": 18288152, + "step": 1136 + }, + { + "epoch": 0.07964485539415272, + "grad_norm": 4.132298946380615, + "learning_rate": 9.204299124343259e-05, + "loss": 0.9402, + "num_input_tokens_seen": 18303784, + "step": 1137 + }, + { + "epoch": 0.07971490363988197, + "grad_norm": 3.9048449993133545, + "learning_rate": 9.203599299474606e-05, + "loss": 1.1283, + "num_input_tokens_seen": 18319968, + "step": 1138 + }, + { + "epoch": 0.07978495188561122, + "grad_norm": 3.981844425201416, + "learning_rate": 9.202899474605955e-05, + "loss": 1.0472, + "num_input_tokens_seen": 18335976, + "step": 1139 + }, + { + "epoch": 0.07985500013134046, + "grad_norm": 4.491240501403809, + "learning_rate": 9.202199649737304e-05, + "loss": 1.1022, + "num_input_tokens_seen": 18352360, + "step": 1140 + }, + { + "epoch": 0.07992504837706971, + "grad_norm": 4.152430534362793, + "learning_rate": 9.201499824868652e-05, + "loss": 1.0688, + "num_input_tokens_seen": 18368736, + "step": 1141 + }, + { + "epoch": 0.07999509662279895, + "grad_norm": 4.337832450866699, + "learning_rate": 9.2008e-05, + "loss": 1.0397, + "num_input_tokens_seen": 18385120, + "step": 1142 + }, + { + "epoch": 0.0800651448685282, + "grad_norm": 4.865042209625244, + "learning_rate": 9.200100175131349e-05, + "loss": 0.9616, + "num_input_tokens_seen": 18401504, + "step": 1143 + }, + { + "epoch": 0.08013519311425744, + "grad_norm": 3.783113479614258, + "learning_rate": 9.199400350262698e-05, + "loss": 1.0001, + "num_input_tokens_seen": 18417176, + "step": 1144 + }, + { + "epoch": 0.08020524135998669, + "grad_norm": 4.98455286026001, + "learning_rate": 9.198700525394046e-05, + "loss": 1.2139, + "num_input_tokens_seen": 18432584, + "step": 1145 + }, + { + "epoch": 0.08027528960571594, + "grad_norm": 4.1859517097473145, + "learning_rate": 9.198000700525394e-05, + "loss": 1.1333, + "num_input_tokens_seen": 18448968, + "step": 1146 + }, + { + "epoch": 0.08034533785144518, + "grad_norm": 3.7193386554718018, + "learning_rate": 9.197300875656743e-05, + "loss": 1.0055, + "num_input_tokens_seen": 18465352, + "step": 1147 + }, + { + "epoch": 0.08041538609717443, + "grad_norm": 4.280893325805664, + "learning_rate": 9.196601050788091e-05, + "loss": 1.1261, + "num_input_tokens_seen": 18481736, + "step": 1148 + }, + { + "epoch": 0.08048543434290367, + "grad_norm": 3.9979352951049805, + "learning_rate": 9.19590122591944e-05, + "loss": 1.025, + "num_input_tokens_seen": 18498120, + "step": 1149 + }, + { + "epoch": 0.08055548258863292, + "grad_norm": 5.594225883483887, + "learning_rate": 9.195201401050789e-05, + "loss": 1.0527, + "num_input_tokens_seen": 18513944, + "step": 1150 + }, + { + "epoch": 0.08062553083436216, + "grad_norm": 4.758842468261719, + "learning_rate": 9.194501576182137e-05, + "loss": 1.0915, + "num_input_tokens_seen": 18530328, + "step": 1151 + }, + { + "epoch": 0.08069557908009141, + "grad_norm": 5.597489356994629, + "learning_rate": 9.193801751313486e-05, + "loss": 1.0673, + "num_input_tokens_seen": 18546632, + "step": 1152 + }, + { + "epoch": 0.08076562732582065, + "grad_norm": 5.279472827911377, + "learning_rate": 9.193101926444834e-05, + "loss": 1.2897, + "num_input_tokens_seen": 18561856, + "step": 1153 + }, + { + "epoch": 0.0808356755715499, + "grad_norm": 4.672069072723389, + "learning_rate": 9.192402101576183e-05, + "loss": 1.0298, + "num_input_tokens_seen": 18577944, + "step": 1154 + }, + { + "epoch": 0.08090572381727915, + "grad_norm": 3.65533447265625, + "learning_rate": 9.19170227670753e-05, + "loss": 0.933, + "num_input_tokens_seen": 18593720, + "step": 1155 + }, + { + "epoch": 0.08097577206300839, + "grad_norm": 4.212414741516113, + "learning_rate": 9.19100245183888e-05, + "loss": 1.0496, + "num_input_tokens_seen": 18609864, + "step": 1156 + }, + { + "epoch": 0.08104582030873764, + "grad_norm": 4.471503734588623, + "learning_rate": 9.190302626970229e-05, + "loss": 1.2261, + "num_input_tokens_seen": 18626248, + "step": 1157 + }, + { + "epoch": 0.08111586855446688, + "grad_norm": 4.952723979949951, + "learning_rate": 9.189602802101577e-05, + "loss": 1.056, + "num_input_tokens_seen": 18642632, + "step": 1158 + }, + { + "epoch": 0.08118591680019613, + "grad_norm": 3.921449661254883, + "learning_rate": 9.188902977232926e-05, + "loss": 1.1617, + "num_input_tokens_seen": 18659016, + "step": 1159 + }, + { + "epoch": 0.08125596504592539, + "grad_norm": 3.728752374649048, + "learning_rate": 9.188203152364273e-05, + "loss": 1.1217, + "num_input_tokens_seen": 18675400, + "step": 1160 + }, + { + "epoch": 0.08132601329165463, + "grad_norm": 3.8742613792419434, + "learning_rate": 9.187503327495622e-05, + "loss": 1.1538, + "num_input_tokens_seen": 18691232, + "step": 1161 + }, + { + "epoch": 0.08139606153738388, + "grad_norm": 3.827157735824585, + "learning_rate": 9.186803502626971e-05, + "loss": 1.1457, + "num_input_tokens_seen": 18707616, + "step": 1162 + }, + { + "epoch": 0.08146610978311312, + "grad_norm": 3.8507778644561768, + "learning_rate": 9.18610367775832e-05, + "loss": 1.0317, + "num_input_tokens_seen": 18724000, + "step": 1163 + }, + { + "epoch": 0.08153615802884237, + "grad_norm": 5.328095436096191, + "learning_rate": 9.185403852889669e-05, + "loss": 1.0921, + "num_input_tokens_seen": 18740384, + "step": 1164 + }, + { + "epoch": 0.08160620627457162, + "grad_norm": 4.8900322914123535, + "learning_rate": 9.184704028021016e-05, + "loss": 1.1308, + "num_input_tokens_seen": 18756768, + "step": 1165 + }, + { + "epoch": 0.08167625452030086, + "grad_norm": 3.810084104537964, + "learning_rate": 9.184004203152365e-05, + "loss": 1.1244, + "num_input_tokens_seen": 18772632, + "step": 1166 + }, + { + "epoch": 0.08174630276603011, + "grad_norm": 4.318419456481934, + "learning_rate": 9.183304378283714e-05, + "loss": 1.0372, + "num_input_tokens_seen": 18788272, + "step": 1167 + }, + { + "epoch": 0.08181635101175935, + "grad_norm": 4.093379020690918, + "learning_rate": 9.182604553415061e-05, + "loss": 1.18, + "num_input_tokens_seen": 18803672, + "step": 1168 + }, + { + "epoch": 0.0818863992574886, + "grad_norm": 4.630450248718262, + "learning_rate": 9.18190472854641e-05, + "loss": 1.1439, + "num_input_tokens_seen": 18820056, + "step": 1169 + }, + { + "epoch": 0.08195644750321784, + "grad_norm": 4.388457775115967, + "learning_rate": 9.181204903677759e-05, + "loss": 1.0971, + "num_input_tokens_seen": 18836440, + "step": 1170 + }, + { + "epoch": 0.08202649574894709, + "grad_norm": 3.6942262649536133, + "learning_rate": 9.180505078809108e-05, + "loss": 1.1594, + "num_input_tokens_seen": 18852824, + "step": 1171 + }, + { + "epoch": 0.08209654399467634, + "grad_norm": 3.937696933746338, + "learning_rate": 9.179805253940455e-05, + "loss": 1.1841, + "num_input_tokens_seen": 18869208, + "step": 1172 + }, + { + "epoch": 0.08216659224040558, + "grad_norm": 4.062703609466553, + "learning_rate": 9.179105429071804e-05, + "loss": 1.083, + "num_input_tokens_seen": 18885320, + "step": 1173 + }, + { + "epoch": 0.08223664048613483, + "grad_norm": 7.794081211090088, + "learning_rate": 9.178405604203153e-05, + "loss": 1.2287, + "num_input_tokens_seen": 18900224, + "step": 1174 + }, + { + "epoch": 0.08230668873186407, + "grad_norm": 4.429391860961914, + "learning_rate": 9.1777057793345e-05, + "loss": 1.0504, + "num_input_tokens_seen": 18916456, + "step": 1175 + }, + { + "epoch": 0.08237673697759332, + "grad_norm": 3.954869508743286, + "learning_rate": 9.17700595446585e-05, + "loss": 1.1558, + "num_input_tokens_seen": 18932840, + "step": 1176 + }, + { + "epoch": 0.08244678522332256, + "grad_norm": 5.555337429046631, + "learning_rate": 9.176306129597198e-05, + "loss": 1.3628, + "num_input_tokens_seen": 18949224, + "step": 1177 + }, + { + "epoch": 0.08251683346905181, + "grad_norm": 3.575295925140381, + "learning_rate": 9.175606304728547e-05, + "loss": 1.0651, + "num_input_tokens_seen": 18965552, + "step": 1178 + }, + { + "epoch": 0.08258688171478105, + "grad_norm": 5.927703380584717, + "learning_rate": 9.174906479859896e-05, + "loss": 1.0582, + "num_input_tokens_seen": 18981496, + "step": 1179 + }, + { + "epoch": 0.0826569299605103, + "grad_norm": 6.553986549377441, + "learning_rate": 9.174206654991243e-05, + "loss": 1.4058, + "num_input_tokens_seen": 18996808, + "step": 1180 + }, + { + "epoch": 0.08272697820623955, + "grad_norm": 4.315832138061523, + "learning_rate": 9.173506830122592e-05, + "loss": 1.1166, + "num_input_tokens_seen": 19013192, + "step": 1181 + }, + { + "epoch": 0.08279702645196879, + "grad_norm": 3.818033218383789, + "learning_rate": 9.172807005253941e-05, + "loss": 1.0744, + "num_input_tokens_seen": 19029464, + "step": 1182 + }, + { + "epoch": 0.08286707469769804, + "grad_norm": 3.4207711219787598, + "learning_rate": 9.17210718038529e-05, + "loss": 0.8952, + "num_input_tokens_seen": 19045592, + "step": 1183 + }, + { + "epoch": 0.08293712294342728, + "grad_norm": 4.3305864334106445, + "learning_rate": 9.171407355516639e-05, + "loss": 0.9617, + "num_input_tokens_seen": 19061864, + "step": 1184 + }, + { + "epoch": 0.08300717118915653, + "grad_norm": 5.365218162536621, + "learning_rate": 9.170707530647986e-05, + "loss": 1.1669, + "num_input_tokens_seen": 19075448, + "step": 1185 + }, + { + "epoch": 0.08307721943488577, + "grad_norm": 3.9939708709716797, + "learning_rate": 9.170007705779335e-05, + "loss": 1.1325, + "num_input_tokens_seen": 19091832, + "step": 1186 + }, + { + "epoch": 0.08314726768061502, + "grad_norm": 3.8088884353637695, + "learning_rate": 9.169307880910683e-05, + "loss": 1.0132, + "num_input_tokens_seen": 19107920, + "step": 1187 + }, + { + "epoch": 0.08321731592634427, + "grad_norm": 3.858799457550049, + "learning_rate": 9.168608056042032e-05, + "loss": 0.9805, + "num_input_tokens_seen": 19123776, + "step": 1188 + }, + { + "epoch": 0.08328736417207351, + "grad_norm": 4.042770862579346, + "learning_rate": 9.16790823117338e-05, + "loss": 1.1668, + "num_input_tokens_seen": 19139752, + "step": 1189 + }, + { + "epoch": 0.08335741241780276, + "grad_norm": 4.2054762840271, + "learning_rate": 9.16720840630473e-05, + "loss": 1.0702, + "num_input_tokens_seen": 19156136, + "step": 1190 + }, + { + "epoch": 0.083427460663532, + "grad_norm": 4.450238227844238, + "learning_rate": 9.166508581436078e-05, + "loss": 1.0751, + "num_input_tokens_seen": 19172240, + "step": 1191 + }, + { + "epoch": 0.08349750890926125, + "grad_norm": 4.126129627227783, + "learning_rate": 9.165808756567426e-05, + "loss": 0.9957, + "num_input_tokens_seen": 19188624, + "step": 1192 + }, + { + "epoch": 0.0835675571549905, + "grad_norm": 4.131893157958984, + "learning_rate": 9.165108931698775e-05, + "loss": 1.2004, + "num_input_tokens_seen": 19205008, + "step": 1193 + }, + { + "epoch": 0.08363760540071974, + "grad_norm": 4.25187873840332, + "learning_rate": 9.164409106830123e-05, + "loss": 1.3571, + "num_input_tokens_seen": 19220856, + "step": 1194 + }, + { + "epoch": 0.083707653646449, + "grad_norm": 3.842498302459717, + "learning_rate": 9.163709281961471e-05, + "loss": 1.0963, + "num_input_tokens_seen": 19237208, + "step": 1195 + }, + { + "epoch": 0.08377770189217824, + "grad_norm": 3.694279432296753, + "learning_rate": 9.16300945709282e-05, + "loss": 1.1177, + "num_input_tokens_seen": 19253592, + "step": 1196 + }, + { + "epoch": 0.08384775013790749, + "grad_norm": 4.382254123687744, + "learning_rate": 9.162309632224169e-05, + "loss": 1.0344, + "num_input_tokens_seen": 19269976, + "step": 1197 + }, + { + "epoch": 0.08391779838363674, + "grad_norm": 4.267289161682129, + "learning_rate": 9.161609807355518e-05, + "loss": 1.1211, + "num_input_tokens_seen": 19286360, + "step": 1198 + }, + { + "epoch": 0.08398784662936598, + "grad_norm": 5.554534435272217, + "learning_rate": 9.160909982486865e-05, + "loss": 0.9674, + "num_input_tokens_seen": 19301800, + "step": 1199 + }, + { + "epoch": 0.08405789487509523, + "grad_norm": 4.1479668617248535, + "learning_rate": 9.160210157618214e-05, + "loss": 1.2334, + "num_input_tokens_seen": 19317392, + "step": 1200 + }, + { + "epoch": 0.08405789487509523, + "eval_loss": 1.1600490808486938, + "eval_runtime": 0.2015, + "eval_samples_per_second": 4.962, + "eval_steps_per_second": 4.962, + "num_input_tokens_seen": 19317392, + "step": 1200 + }, + { + "epoch": 0.08412794312082447, + "grad_norm": 4.1876349449157715, + "learning_rate": 9.159510332749563e-05, + "loss": 1.2036, + "num_input_tokens_seen": 19333776, + "step": 1201 + }, + { + "epoch": 0.08419799136655372, + "grad_norm": 4.031203746795654, + "learning_rate": 9.15881050788091e-05, + "loss": 1.2127, + "num_input_tokens_seen": 19349616, + "step": 1202 + }, + { + "epoch": 0.08426803961228296, + "grad_norm": 4.013350963592529, + "learning_rate": 9.15811068301226e-05, + "loss": 1.2147, + "num_input_tokens_seen": 19366000, + "step": 1203 + }, + { + "epoch": 0.08433808785801221, + "grad_norm": 4.509790897369385, + "learning_rate": 9.157410858143608e-05, + "loss": 1.3484, + "num_input_tokens_seen": 19381904, + "step": 1204 + }, + { + "epoch": 0.08440813610374145, + "grad_norm": 4.630336761474609, + "learning_rate": 9.156711033274957e-05, + "loss": 1.0246, + "num_input_tokens_seen": 19398288, + "step": 1205 + }, + { + "epoch": 0.0844781843494707, + "grad_norm": 3.819884777069092, + "learning_rate": 9.156011208406304e-05, + "loss": 1.1242, + "num_input_tokens_seen": 19414248, + "step": 1206 + }, + { + "epoch": 0.08454823259519995, + "grad_norm": 3.7933132648468018, + "learning_rate": 9.155311383537653e-05, + "loss": 1.0766, + "num_input_tokens_seen": 19430632, + "step": 1207 + }, + { + "epoch": 0.08461828084092919, + "grad_norm": 5.7384934425354, + "learning_rate": 9.154611558669002e-05, + "loss": 1.0691, + "num_input_tokens_seen": 19446248, + "step": 1208 + }, + { + "epoch": 0.08468832908665844, + "grad_norm": 3.9594175815582275, + "learning_rate": 9.153911733800351e-05, + "loss": 1.2029, + "num_input_tokens_seen": 19462632, + "step": 1209 + }, + { + "epoch": 0.08475837733238768, + "grad_norm": 3.8251891136169434, + "learning_rate": 9.1532119089317e-05, + "loss": 0.9994, + "num_input_tokens_seen": 19479016, + "step": 1210 + }, + { + "epoch": 0.08482842557811693, + "grad_norm": 3.9750332832336426, + "learning_rate": 9.152512084063049e-05, + "loss": 1.1737, + "num_input_tokens_seen": 19495112, + "step": 1211 + }, + { + "epoch": 0.08489847382384617, + "grad_norm": 3.986170530319214, + "learning_rate": 9.151812259194396e-05, + "loss": 1.1441, + "num_input_tokens_seen": 19511216, + "step": 1212 + }, + { + "epoch": 0.08496852206957542, + "grad_norm": 3.914065361022949, + "learning_rate": 9.151112434325745e-05, + "loss": 1.2233, + "num_input_tokens_seen": 19527600, + "step": 1213 + }, + { + "epoch": 0.08503857031530467, + "grad_norm": 4.328094482421875, + "learning_rate": 9.150412609457093e-05, + "loss": 1.2076, + "num_input_tokens_seen": 19543984, + "step": 1214 + }, + { + "epoch": 0.08510861856103391, + "grad_norm": 4.112467288970947, + "learning_rate": 9.149712784588441e-05, + "loss": 1.1732, + "num_input_tokens_seen": 19560368, + "step": 1215 + }, + { + "epoch": 0.08517866680676316, + "grad_norm": 4.680009365081787, + "learning_rate": 9.14901295971979e-05, + "loss": 0.985, + "num_input_tokens_seen": 19575616, + "step": 1216 + }, + { + "epoch": 0.0852487150524924, + "grad_norm": 4.4872660636901855, + "learning_rate": 9.148313134851139e-05, + "loss": 1.1799, + "num_input_tokens_seen": 19592000, + "step": 1217 + }, + { + "epoch": 0.08531876329822165, + "grad_norm": 3.7546637058258057, + "learning_rate": 9.147613309982488e-05, + "loss": 1.1989, + "num_input_tokens_seen": 19608384, + "step": 1218 + }, + { + "epoch": 0.0853888115439509, + "grad_norm": 5.590888500213623, + "learning_rate": 9.146913485113835e-05, + "loss": 1.1411, + "num_input_tokens_seen": 19624768, + "step": 1219 + }, + { + "epoch": 0.08545885978968014, + "grad_norm": 3.958021640777588, + "learning_rate": 9.146213660245184e-05, + "loss": 0.9309, + "num_input_tokens_seen": 19641152, + "step": 1220 + }, + { + "epoch": 0.08552890803540938, + "grad_norm": 3.7641196250915527, + "learning_rate": 9.145513835376533e-05, + "loss": 1.0299, + "num_input_tokens_seen": 19657536, + "step": 1221 + }, + { + "epoch": 0.08559895628113863, + "grad_norm": 4.395461559295654, + "learning_rate": 9.14481401050788e-05, + "loss": 1.1404, + "num_input_tokens_seen": 19673712, + "step": 1222 + }, + { + "epoch": 0.08566900452686788, + "grad_norm": 3.8162319660186768, + "learning_rate": 9.144114185639231e-05, + "loss": 1.1638, + "num_input_tokens_seen": 19689336, + "step": 1223 + }, + { + "epoch": 0.08573905277259712, + "grad_norm": 3.7025444507598877, + "learning_rate": 9.143414360770578e-05, + "loss": 0.9995, + "num_input_tokens_seen": 19705464, + "step": 1224 + }, + { + "epoch": 0.08580910101832637, + "grad_norm": 3.8621439933776855, + "learning_rate": 9.142714535901927e-05, + "loss": 1.1639, + "num_input_tokens_seen": 19721848, + "step": 1225 + }, + { + "epoch": 0.08587914926405561, + "grad_norm": 4.243250846862793, + "learning_rate": 9.142014711033275e-05, + "loss": 1.0104, + "num_input_tokens_seen": 19738072, + "step": 1226 + }, + { + "epoch": 0.08594919750978486, + "grad_norm": 4.05800724029541, + "learning_rate": 9.141314886164624e-05, + "loss": 1.0257, + "num_input_tokens_seen": 19754456, + "step": 1227 + }, + { + "epoch": 0.0860192457555141, + "grad_norm": 4.0894455909729, + "learning_rate": 9.140615061295972e-05, + "loss": 1.254, + "num_input_tokens_seen": 19770840, + "step": 1228 + }, + { + "epoch": 0.08608929400124336, + "grad_norm": 4.296894073486328, + "learning_rate": 9.139915236427321e-05, + "loss": 1.1298, + "num_input_tokens_seen": 19786864, + "step": 1229 + }, + { + "epoch": 0.08615934224697261, + "grad_norm": 4.0352888107299805, + "learning_rate": 9.13921541155867e-05, + "loss": 1.0611, + "num_input_tokens_seen": 19801800, + "step": 1230 + }, + { + "epoch": 0.08622939049270185, + "grad_norm": 4.087375640869141, + "learning_rate": 9.138515586690018e-05, + "loss": 0.9686, + "num_input_tokens_seen": 19818184, + "step": 1231 + }, + { + "epoch": 0.0862994387384311, + "grad_norm": 4.045078754425049, + "learning_rate": 9.137815761821367e-05, + "loss": 1.0915, + "num_input_tokens_seen": 19833016, + "step": 1232 + }, + { + "epoch": 0.08636948698416035, + "grad_norm": 4.399363040924072, + "learning_rate": 9.137115936952714e-05, + "loss": 1.1875, + "num_input_tokens_seen": 19848912, + "step": 1233 + }, + { + "epoch": 0.08643953522988959, + "grad_norm": 4.420406818389893, + "learning_rate": 9.136416112084063e-05, + "loss": 1.0534, + "num_input_tokens_seen": 19865296, + "step": 1234 + }, + { + "epoch": 0.08650958347561884, + "grad_norm": 4.131808280944824, + "learning_rate": 9.135716287215412e-05, + "loss": 1.1865, + "num_input_tokens_seen": 19881376, + "step": 1235 + }, + { + "epoch": 0.08657963172134808, + "grad_norm": 3.8256850242614746, + "learning_rate": 9.13501646234676e-05, + "loss": 1.0539, + "num_input_tokens_seen": 19897704, + "step": 1236 + }, + { + "epoch": 0.08664967996707733, + "grad_norm": 4.3497233390808105, + "learning_rate": 9.13431663747811e-05, + "loss": 1.191, + "num_input_tokens_seen": 19914088, + "step": 1237 + }, + { + "epoch": 0.08671972821280657, + "grad_norm": 4.18136739730835, + "learning_rate": 9.133616812609458e-05, + "loss": 1.0539, + "num_input_tokens_seen": 19930128, + "step": 1238 + }, + { + "epoch": 0.08678977645853582, + "grad_norm": 4.782970905303955, + "learning_rate": 9.132916987740806e-05, + "loss": 1.1992, + "num_input_tokens_seen": 19946512, + "step": 1239 + }, + { + "epoch": 0.08685982470426507, + "grad_norm": 4.16589879989624, + "learning_rate": 9.132217162872155e-05, + "loss": 1.1463, + "num_input_tokens_seen": 19962488, + "step": 1240 + }, + { + "epoch": 0.08692987294999431, + "grad_norm": 3.73541522026062, + "learning_rate": 9.131517338003502e-05, + "loss": 1.0272, + "num_input_tokens_seen": 19978584, + "step": 1241 + }, + { + "epoch": 0.08699992119572356, + "grad_norm": 4.225815773010254, + "learning_rate": 9.130817513134851e-05, + "loss": 1.177, + "num_input_tokens_seen": 19994816, + "step": 1242 + }, + { + "epoch": 0.0870699694414528, + "grad_norm": 7.807470321655273, + "learning_rate": 9.1301176882662e-05, + "loss": 1.1635, + "num_input_tokens_seen": 20010576, + "step": 1243 + }, + { + "epoch": 0.08714001768718205, + "grad_norm": 4.818174839019775, + "learning_rate": 9.129417863397549e-05, + "loss": 1.1892, + "num_input_tokens_seen": 20025712, + "step": 1244 + }, + { + "epoch": 0.0872100659329113, + "grad_norm": 3.8367979526519775, + "learning_rate": 9.128718038528898e-05, + "loss": 1.0096, + "num_input_tokens_seen": 20041904, + "step": 1245 + }, + { + "epoch": 0.08728011417864054, + "grad_norm": 3.9912586212158203, + "learning_rate": 9.128018213660245e-05, + "loss": 1.097, + "num_input_tokens_seen": 20058288, + "step": 1246 + }, + { + "epoch": 0.08735016242436978, + "grad_norm": 4.842557907104492, + "learning_rate": 9.127318388791594e-05, + "loss": 1.2012, + "num_input_tokens_seen": 20074672, + "step": 1247 + }, + { + "epoch": 0.08742021067009903, + "grad_norm": 3.816938877105713, + "learning_rate": 9.126618563922943e-05, + "loss": 1.1683, + "num_input_tokens_seen": 20090664, + "step": 1248 + }, + { + "epoch": 0.08749025891582828, + "grad_norm": 3.712480306625366, + "learning_rate": 9.125918739054292e-05, + "loss": 1.1978, + "num_input_tokens_seen": 20107048, + "step": 1249 + }, + { + "epoch": 0.08756030716155752, + "grad_norm": 4.185492515563965, + "learning_rate": 9.12521891418564e-05, + "loss": 1.2042, + "num_input_tokens_seen": 20123432, + "step": 1250 + }, + { + "epoch": 0.08763035540728677, + "grad_norm": 5.510714530944824, + "learning_rate": 9.124519089316988e-05, + "loss": 0.9757, + "num_input_tokens_seen": 20139112, + "step": 1251 + }, + { + "epoch": 0.08770040365301601, + "grad_norm": 3.9170289039611816, + "learning_rate": 9.123819264448337e-05, + "loss": 1.0213, + "num_input_tokens_seen": 20155496, + "step": 1252 + }, + { + "epoch": 0.08777045189874526, + "grad_norm": 3.738008975982666, + "learning_rate": 9.123119439579684e-05, + "loss": 0.9446, + "num_input_tokens_seen": 20171760, + "step": 1253 + }, + { + "epoch": 0.0878405001444745, + "grad_norm": 4.845873832702637, + "learning_rate": 9.122419614711033e-05, + "loss": 1.2135, + "num_input_tokens_seen": 20188056, + "step": 1254 + }, + { + "epoch": 0.08791054839020375, + "grad_norm": 4.166906356811523, + "learning_rate": 9.121719789842382e-05, + "loss": 1.1558, + "num_input_tokens_seen": 20204440, + "step": 1255 + }, + { + "epoch": 0.087980596635933, + "grad_norm": 4.039194107055664, + "learning_rate": 9.121019964973731e-05, + "loss": 1.0297, + "num_input_tokens_seen": 20220824, + "step": 1256 + }, + { + "epoch": 0.08805064488166224, + "grad_norm": 3.545482635498047, + "learning_rate": 9.12032014010508e-05, + "loss": 0.9757, + "num_input_tokens_seen": 20236888, + "step": 1257 + }, + { + "epoch": 0.08812069312739149, + "grad_norm": 3.82114839553833, + "learning_rate": 9.119620315236427e-05, + "loss": 1.1637, + "num_input_tokens_seen": 20253272, + "step": 1258 + }, + { + "epoch": 0.08819074137312073, + "grad_norm": 4.770678997039795, + "learning_rate": 9.118920490367776e-05, + "loss": 1.1421, + "num_input_tokens_seen": 20269656, + "step": 1259 + }, + { + "epoch": 0.08826078961884998, + "grad_norm": 4.4319539070129395, + "learning_rate": 9.118220665499124e-05, + "loss": 1.1565, + "num_input_tokens_seen": 20285456, + "step": 1260 + }, + { + "epoch": 0.08833083786457922, + "grad_norm": 4.0923357009887695, + "learning_rate": 9.117520840630473e-05, + "loss": 1.2328, + "num_input_tokens_seen": 20301232, + "step": 1261 + }, + { + "epoch": 0.08840088611030847, + "grad_norm": 5.8347344398498535, + "learning_rate": 9.116821015761821e-05, + "loss": 0.8824, + "num_input_tokens_seen": 20317224, + "step": 1262 + }, + { + "epoch": 0.08847093435603771, + "grad_norm": 4.525367259979248, + "learning_rate": 9.11612119089317e-05, + "loss": 1.1554, + "num_input_tokens_seen": 20332616, + "step": 1263 + }, + { + "epoch": 0.08854098260176697, + "grad_norm": 3.9754436016082764, + "learning_rate": 9.115421366024519e-05, + "loss": 1.0423, + "num_input_tokens_seen": 20348336, + "step": 1264 + }, + { + "epoch": 0.08861103084749622, + "grad_norm": 4.40745735168457, + "learning_rate": 9.114721541155868e-05, + "loss": 1.0485, + "num_input_tokens_seen": 20364312, + "step": 1265 + }, + { + "epoch": 0.08868107909322547, + "grad_norm": 7.126221179962158, + "learning_rate": 9.114021716287216e-05, + "loss": 1.2035, + "num_input_tokens_seen": 20380696, + "step": 1266 + }, + { + "epoch": 0.08875112733895471, + "grad_norm": 4.306386947631836, + "learning_rate": 9.113321891418564e-05, + "loss": 1.0399, + "num_input_tokens_seen": 20397080, + "step": 1267 + }, + { + "epoch": 0.08882117558468396, + "grad_norm": 3.566943407058716, + "learning_rate": 9.112622066549912e-05, + "loss": 1.0463, + "num_input_tokens_seen": 20413464, + "step": 1268 + }, + { + "epoch": 0.0888912238304132, + "grad_norm": 3.975228786468506, + "learning_rate": 9.111922241681262e-05, + "loss": 1.2576, + "num_input_tokens_seen": 20429848, + "step": 1269 + }, + { + "epoch": 0.08896127207614245, + "grad_norm": 4.928854465484619, + "learning_rate": 9.11122241681261e-05, + "loss": 1.1555, + "num_input_tokens_seen": 20446192, + "step": 1270 + }, + { + "epoch": 0.0890313203218717, + "grad_norm": 4.288821697235107, + "learning_rate": 9.110522591943958e-05, + "loss": 1.2559, + "num_input_tokens_seen": 20462576, + "step": 1271 + }, + { + "epoch": 0.08910136856760094, + "grad_norm": 3.9346396923065186, + "learning_rate": 9.109822767075307e-05, + "loss": 1.1479, + "num_input_tokens_seen": 20478520, + "step": 1272 + }, + { + "epoch": 0.08917141681333018, + "grad_norm": 3.7976620197296143, + "learning_rate": 9.109122942206655e-05, + "loss": 0.9903, + "num_input_tokens_seen": 20494408, + "step": 1273 + }, + { + "epoch": 0.08924146505905943, + "grad_norm": 5.373577117919922, + "learning_rate": 9.108423117338004e-05, + "loss": 0.8863, + "num_input_tokens_seen": 20510792, + "step": 1274 + }, + { + "epoch": 0.08931151330478868, + "grad_norm": 4.248324394226074, + "learning_rate": 9.107723292469353e-05, + "loss": 1.3492, + "num_input_tokens_seen": 20527064, + "step": 1275 + }, + { + "epoch": 0.08938156155051792, + "grad_norm": 4.453672885894775, + "learning_rate": 9.107023467600701e-05, + "loss": 0.9763, + "num_input_tokens_seen": 20543448, + "step": 1276 + }, + { + "epoch": 0.08945160979624717, + "grad_norm": 4.8721184730529785, + "learning_rate": 9.10632364273205e-05, + "loss": 0.9455, + "num_input_tokens_seen": 20559832, + "step": 1277 + }, + { + "epoch": 0.08952165804197641, + "grad_norm": 5.0173540115356445, + "learning_rate": 9.105623817863398e-05, + "loss": 1.0303, + "num_input_tokens_seen": 20576216, + "step": 1278 + }, + { + "epoch": 0.08959170628770566, + "grad_norm": 5.00100040435791, + "learning_rate": 9.104923992994747e-05, + "loss": 1.0393, + "num_input_tokens_seen": 20592600, + "step": 1279 + }, + { + "epoch": 0.0896617545334349, + "grad_norm": 4.271099090576172, + "learning_rate": 9.104224168126094e-05, + "loss": 1.2307, + "num_input_tokens_seen": 20608632, + "step": 1280 + }, + { + "epoch": 0.08973180277916415, + "grad_norm": 4.246976852416992, + "learning_rate": 9.103524343257443e-05, + "loss": 1.1405, + "num_input_tokens_seen": 20625016, + "step": 1281 + }, + { + "epoch": 0.0898018510248934, + "grad_norm": 5.033923149108887, + "learning_rate": 9.102824518388792e-05, + "loss": 1.0849, + "num_input_tokens_seen": 20641400, + "step": 1282 + }, + { + "epoch": 0.08987189927062264, + "grad_norm": 4.4118571281433105, + "learning_rate": 9.102124693520141e-05, + "loss": 1.118, + "num_input_tokens_seen": 20657448, + "step": 1283 + }, + { + "epoch": 0.08994194751635189, + "grad_norm": 4.150144577026367, + "learning_rate": 9.10142486865149e-05, + "loss": 1.0676, + "num_input_tokens_seen": 20673080, + "step": 1284 + }, + { + "epoch": 0.09001199576208113, + "grad_norm": 3.767683744430542, + "learning_rate": 9.100725043782837e-05, + "loss": 0.8968, + "num_input_tokens_seen": 20689464, + "step": 1285 + }, + { + "epoch": 0.09008204400781038, + "grad_norm": 4.816582202911377, + "learning_rate": 9.100025218914186e-05, + "loss": 1.0039, + "num_input_tokens_seen": 20703896, + "step": 1286 + }, + { + "epoch": 0.09015209225353962, + "grad_norm": 3.8913414478302, + "learning_rate": 9.099325394045533e-05, + "loss": 1.0077, + "num_input_tokens_seen": 20720280, + "step": 1287 + }, + { + "epoch": 0.09022214049926887, + "grad_norm": 4.305298328399658, + "learning_rate": 9.098625569176882e-05, + "loss": 1.1555, + "num_input_tokens_seen": 20735944, + "step": 1288 + }, + { + "epoch": 0.09029218874499811, + "grad_norm": 3.3120992183685303, + "learning_rate": 9.097925744308233e-05, + "loss": 0.8591, + "num_input_tokens_seen": 20752128, + "step": 1289 + }, + { + "epoch": 0.09036223699072736, + "grad_norm": 4.705013751983643, + "learning_rate": 9.09722591943958e-05, + "loss": 1.4579, + "num_input_tokens_seen": 20768512, + "step": 1290 + }, + { + "epoch": 0.0904322852364566, + "grad_norm": 5.08630895614624, + "learning_rate": 9.096526094570929e-05, + "loss": 1.1049, + "num_input_tokens_seen": 20783976, + "step": 1291 + }, + { + "epoch": 0.09050233348218585, + "grad_norm": 3.634686231613159, + "learning_rate": 9.095826269702278e-05, + "loss": 1.0344, + "num_input_tokens_seen": 20800360, + "step": 1292 + }, + { + "epoch": 0.0905723817279151, + "grad_norm": 4.220744609832764, + "learning_rate": 9.095126444833625e-05, + "loss": 1.1843, + "num_input_tokens_seen": 20816744, + "step": 1293 + }, + { + "epoch": 0.09064242997364434, + "grad_norm": 4.724472522735596, + "learning_rate": 9.094426619964974e-05, + "loss": 1.1365, + "num_input_tokens_seen": 20833128, + "step": 1294 + }, + { + "epoch": 0.09071247821937359, + "grad_norm": 3.9398090839385986, + "learning_rate": 9.093726795096323e-05, + "loss": 1.0703, + "num_input_tokens_seen": 20849448, + "step": 1295 + }, + { + "epoch": 0.09078252646510283, + "grad_norm": 4.260062217712402, + "learning_rate": 9.093026970227672e-05, + "loss": 1.0968, + "num_input_tokens_seen": 20865832, + "step": 1296 + }, + { + "epoch": 0.09085257471083208, + "grad_norm": 4.383310317993164, + "learning_rate": 9.09232714535902e-05, + "loss": 1.2542, + "num_input_tokens_seen": 20881288, + "step": 1297 + }, + { + "epoch": 0.09092262295656132, + "grad_norm": 4.479433059692383, + "learning_rate": 9.091627320490368e-05, + "loss": 0.9533, + "num_input_tokens_seen": 20897328, + "step": 1298 + }, + { + "epoch": 0.09099267120229058, + "grad_norm": 4.911858081817627, + "learning_rate": 9.090927495621717e-05, + "loss": 1.3399, + "num_input_tokens_seen": 20913712, + "step": 1299 + }, + { + "epoch": 0.09106271944801983, + "grad_norm": 4.015485763549805, + "learning_rate": 9.090227670753065e-05, + "loss": 1.1156, + "num_input_tokens_seen": 20929984, + "step": 1300 + }, + { + "epoch": 0.09113276769374908, + "grad_norm": 3.8690338134765625, + "learning_rate": 9.089527845884413e-05, + "loss": 1.0634, + "num_input_tokens_seen": 20946368, + "step": 1301 + }, + { + "epoch": 0.09120281593947832, + "grad_norm": 5.142012596130371, + "learning_rate": 9.088828021015762e-05, + "loss": 1.0579, + "num_input_tokens_seen": 20962752, + "step": 1302 + }, + { + "epoch": 0.09127286418520757, + "grad_norm": 3.954049587249756, + "learning_rate": 9.088128196147111e-05, + "loss": 1.0862, + "num_input_tokens_seen": 20979136, + "step": 1303 + }, + { + "epoch": 0.09134291243093681, + "grad_norm": 4.13312292098999, + "learning_rate": 9.08742837127846e-05, + "loss": 1.0548, + "num_input_tokens_seen": 20995520, + "step": 1304 + }, + { + "epoch": 0.09141296067666606, + "grad_norm": 4.24699592590332, + "learning_rate": 9.086728546409808e-05, + "loss": 1.0126, + "num_input_tokens_seen": 21011904, + "step": 1305 + }, + { + "epoch": 0.0914830089223953, + "grad_norm": 4.847048759460449, + "learning_rate": 9.086028721541156e-05, + "loss": 0.9973, + "num_input_tokens_seen": 21027664, + "step": 1306 + }, + { + "epoch": 0.09155305716812455, + "grad_norm": 4.573661804199219, + "learning_rate": 9.085328896672504e-05, + "loss": 1.005, + "num_input_tokens_seen": 21044048, + "step": 1307 + }, + { + "epoch": 0.0916231054138538, + "grad_norm": 4.13530158996582, + "learning_rate": 9.084629071803853e-05, + "loss": 1.1033, + "num_input_tokens_seen": 21060432, + "step": 1308 + }, + { + "epoch": 0.09169315365958304, + "grad_norm": 4.017937183380127, + "learning_rate": 9.083929246935203e-05, + "loss": 1.1971, + "num_input_tokens_seen": 21076816, + "step": 1309 + }, + { + "epoch": 0.09176320190531229, + "grad_norm": 5.928586483001709, + "learning_rate": 9.08322942206655e-05, + "loss": 1.0547, + "num_input_tokens_seen": 21093200, + "step": 1310 + }, + { + "epoch": 0.09183325015104153, + "grad_norm": 4.2442169189453125, + "learning_rate": 9.082529597197899e-05, + "loss": 1.2794, + "num_input_tokens_seen": 21109256, + "step": 1311 + }, + { + "epoch": 0.09190329839677078, + "grad_norm": 4.891444683074951, + "learning_rate": 9.081829772329247e-05, + "loss": 1.1833, + "num_input_tokens_seen": 21124848, + "step": 1312 + }, + { + "epoch": 0.09197334664250002, + "grad_norm": 4.323850154876709, + "learning_rate": 9.081129947460596e-05, + "loss": 1.1683, + "num_input_tokens_seen": 21141176, + "step": 1313 + }, + { + "epoch": 0.09204339488822927, + "grad_norm": 4.239765644073486, + "learning_rate": 9.080430122591943e-05, + "loss": 1.1073, + "num_input_tokens_seen": 21157240, + "step": 1314 + }, + { + "epoch": 0.09211344313395851, + "grad_norm": 4.12881326675415, + "learning_rate": 9.079730297723293e-05, + "loss": 1.2522, + "num_input_tokens_seen": 21173624, + "step": 1315 + }, + { + "epoch": 0.09218349137968776, + "grad_norm": 4.238161087036133, + "learning_rate": 9.079030472854642e-05, + "loss": 1.1828, + "num_input_tokens_seen": 21190008, + "step": 1316 + }, + { + "epoch": 0.092253539625417, + "grad_norm": 4.124176502227783, + "learning_rate": 9.07833064798599e-05, + "loss": 1.1388, + "num_input_tokens_seen": 21206392, + "step": 1317 + }, + { + "epoch": 0.09232358787114625, + "grad_norm": 3.772136926651001, + "learning_rate": 9.077630823117339e-05, + "loss": 1.068, + "num_input_tokens_seen": 21222776, + "step": 1318 + }, + { + "epoch": 0.0923936361168755, + "grad_norm": 4.628321170806885, + "learning_rate": 9.076930998248687e-05, + "loss": 1.2363, + "num_input_tokens_seen": 21239160, + "step": 1319 + }, + { + "epoch": 0.09246368436260474, + "grad_norm": 5.3034348487854, + "learning_rate": 9.076231173380035e-05, + "loss": 1.0638, + "num_input_tokens_seen": 21255544, + "step": 1320 + }, + { + "epoch": 0.09253373260833399, + "grad_norm": 3.6543760299682617, + "learning_rate": 9.075531348511384e-05, + "loss": 1.0071, + "num_input_tokens_seen": 21271928, + "step": 1321 + }, + { + "epoch": 0.09260378085406323, + "grad_norm": 4.1335062980651855, + "learning_rate": 9.074831523642733e-05, + "loss": 1.084, + "num_input_tokens_seen": 21288312, + "step": 1322 + }, + { + "epoch": 0.09267382909979248, + "grad_norm": 3.6392204761505127, + "learning_rate": 9.074131698774082e-05, + "loss": 1.1146, + "num_input_tokens_seen": 21304696, + "step": 1323 + }, + { + "epoch": 0.09274387734552172, + "grad_norm": 4.035269737243652, + "learning_rate": 9.073431873905429e-05, + "loss": 0.9578, + "num_input_tokens_seen": 21321080, + "step": 1324 + }, + { + "epoch": 0.09281392559125097, + "grad_norm": 4.650269508361816, + "learning_rate": 9.072732049036778e-05, + "loss": 1.0242, + "num_input_tokens_seen": 21337464, + "step": 1325 + }, + { + "epoch": 0.09288397383698022, + "grad_norm": 5.850543022155762, + "learning_rate": 9.072032224168127e-05, + "loss": 1.1196, + "num_input_tokens_seen": 21352968, + "step": 1326 + }, + { + "epoch": 0.09295402208270946, + "grad_norm": 4.177901744842529, + "learning_rate": 9.071332399299474e-05, + "loss": 1.1351, + "num_input_tokens_seen": 21368968, + "step": 1327 + }, + { + "epoch": 0.09302407032843871, + "grad_norm": 4.582173824310303, + "learning_rate": 9.070632574430823e-05, + "loss": 0.9115, + "num_input_tokens_seen": 21385352, + "step": 1328 + }, + { + "epoch": 0.09309411857416795, + "grad_norm": 4.7911787033081055, + "learning_rate": 9.069932749562173e-05, + "loss": 1.0413, + "num_input_tokens_seen": 21401144, + "step": 1329 + }, + { + "epoch": 0.0931641668198972, + "grad_norm": 4.058457374572754, + "learning_rate": 9.069232924693521e-05, + "loss": 1.0611, + "num_input_tokens_seen": 21416640, + "step": 1330 + }, + { + "epoch": 0.09323421506562644, + "grad_norm": 4.972208499908447, + "learning_rate": 9.06853309982487e-05, + "loss": 1.016, + "num_input_tokens_seen": 21433024, + "step": 1331 + }, + { + "epoch": 0.09330426331135569, + "grad_norm": 4.0875091552734375, + "learning_rate": 9.067833274956217e-05, + "loss": 1.089, + "num_input_tokens_seen": 21448888, + "step": 1332 + }, + { + "epoch": 0.09337431155708495, + "grad_norm": 3.923112154006958, + "learning_rate": 9.067133450087566e-05, + "loss": 0.9824, + "num_input_tokens_seen": 21465272, + "step": 1333 + }, + { + "epoch": 0.0934443598028142, + "grad_norm": 4.067697525024414, + "learning_rate": 9.066433625218914e-05, + "loss": 1.0492, + "num_input_tokens_seen": 21481656, + "step": 1334 + }, + { + "epoch": 0.09351440804854344, + "grad_norm": 4.185417652130127, + "learning_rate": 9.065733800350264e-05, + "loss": 1.1073, + "num_input_tokens_seen": 21498040, + "step": 1335 + }, + { + "epoch": 0.09358445629427269, + "grad_norm": 7.31542444229126, + "learning_rate": 9.065033975481613e-05, + "loss": 1.4322, + "num_input_tokens_seen": 21514088, + "step": 1336 + }, + { + "epoch": 0.09365450454000193, + "grad_norm": 4.754745006561279, + "learning_rate": 9.06433415061296e-05, + "loss": 0.9953, + "num_input_tokens_seen": 21530472, + "step": 1337 + }, + { + "epoch": 0.09372455278573118, + "grad_norm": 5.81265926361084, + "learning_rate": 9.063634325744309e-05, + "loss": 1.1434, + "num_input_tokens_seen": 21545728, + "step": 1338 + }, + { + "epoch": 0.09379460103146042, + "grad_norm": 5.586238861083984, + "learning_rate": 9.062934500875657e-05, + "loss": 0.9818, + "num_input_tokens_seen": 21562112, + "step": 1339 + }, + { + "epoch": 0.09386464927718967, + "grad_norm": 4.096534729003906, + "learning_rate": 9.062234676007005e-05, + "loss": 1.1856, + "num_input_tokens_seen": 21578496, + "step": 1340 + }, + { + "epoch": 0.09393469752291891, + "grad_norm": 4.913814544677734, + "learning_rate": 9.061534851138354e-05, + "loss": 1.041, + "num_input_tokens_seen": 21594792, + "step": 1341 + }, + { + "epoch": 0.09400474576864816, + "grad_norm": 3.8853912353515625, + "learning_rate": 9.060835026269703e-05, + "loss": 1.1651, + "num_input_tokens_seen": 21611176, + "step": 1342 + }, + { + "epoch": 0.0940747940143774, + "grad_norm": 4.187959671020508, + "learning_rate": 9.060135201401052e-05, + "loss": 1.1757, + "num_input_tokens_seen": 21627560, + "step": 1343 + }, + { + "epoch": 0.09414484226010665, + "grad_norm": 4.128627777099609, + "learning_rate": 9.0594353765324e-05, + "loss": 0.9243, + "num_input_tokens_seen": 21643576, + "step": 1344 + }, + { + "epoch": 0.0942148905058359, + "grad_norm": 4.7016825675964355, + "learning_rate": 9.058735551663748e-05, + "loss": 1.2425, + "num_input_tokens_seen": 21658600, + "step": 1345 + }, + { + "epoch": 0.09428493875156514, + "grad_norm": 3.970548391342163, + "learning_rate": 9.058035726795097e-05, + "loss": 1.0495, + "num_input_tokens_seen": 21674264, + "step": 1346 + }, + { + "epoch": 0.09435498699729439, + "grad_norm": 3.812196731567383, + "learning_rate": 9.057335901926445e-05, + "loss": 0.9558, + "num_input_tokens_seen": 21690112, + "step": 1347 + }, + { + "epoch": 0.09442503524302363, + "grad_norm": 3.6845176219940186, + "learning_rate": 9.056636077057794e-05, + "loss": 0.9758, + "num_input_tokens_seen": 21705744, + "step": 1348 + }, + { + "epoch": 0.09449508348875288, + "grad_norm": 4.119202136993408, + "learning_rate": 9.055936252189142e-05, + "loss": 1.0948, + "num_input_tokens_seen": 21721776, + "step": 1349 + }, + { + "epoch": 0.09456513173448212, + "grad_norm": 4.176985740661621, + "learning_rate": 9.055236427320491e-05, + "loss": 0.9475, + "num_input_tokens_seen": 21737912, + "step": 1350 + }, + { + "epoch": 0.09463517998021137, + "grad_norm": 4.057264804840088, + "learning_rate": 9.054536602451839e-05, + "loss": 1.1746, + "num_input_tokens_seen": 21754296, + "step": 1351 + }, + { + "epoch": 0.09470522822594062, + "grad_norm": 4.5631914138793945, + "learning_rate": 9.053836777583188e-05, + "loss": 1.0894, + "num_input_tokens_seen": 21770680, + "step": 1352 + }, + { + "epoch": 0.09477527647166986, + "grad_norm": 4.854849815368652, + "learning_rate": 9.053136952714536e-05, + "loss": 1.0686, + "num_input_tokens_seen": 21787064, + "step": 1353 + }, + { + "epoch": 0.09484532471739911, + "grad_norm": 5.326946258544922, + "learning_rate": 9.052437127845884e-05, + "loss": 0.872, + "num_input_tokens_seen": 21803448, + "step": 1354 + }, + { + "epoch": 0.09491537296312835, + "grad_norm": 4.283742904663086, + "learning_rate": 9.051737302977234e-05, + "loss": 1.2683, + "num_input_tokens_seen": 21819832, + "step": 1355 + }, + { + "epoch": 0.0949854212088576, + "grad_norm": 4.165935039520264, + "learning_rate": 9.051037478108582e-05, + "loss": 0.977, + "num_input_tokens_seen": 21836216, + "step": 1356 + }, + { + "epoch": 0.09505546945458684, + "grad_norm": 4.502480983734131, + "learning_rate": 9.05033765323993e-05, + "loss": 1.2854, + "num_input_tokens_seen": 21852600, + "step": 1357 + }, + { + "epoch": 0.09512551770031609, + "grad_norm": 4.185445308685303, + "learning_rate": 9.04963782837128e-05, + "loss": 1.2225, + "num_input_tokens_seen": 21868984, + "step": 1358 + }, + { + "epoch": 0.09519556594604534, + "grad_norm": 7.288909435272217, + "learning_rate": 9.048938003502627e-05, + "loss": 1.154, + "num_input_tokens_seen": 21884648, + "step": 1359 + }, + { + "epoch": 0.09526561419177458, + "grad_norm": 4.038896560668945, + "learning_rate": 9.048238178633976e-05, + "loss": 1.1437, + "num_input_tokens_seen": 21900704, + "step": 1360 + }, + { + "epoch": 0.09533566243750383, + "grad_norm": 4.216241836547852, + "learning_rate": 9.047538353765325e-05, + "loss": 1.1379, + "num_input_tokens_seen": 21916520, + "step": 1361 + }, + { + "epoch": 0.09540571068323307, + "grad_norm": 4.2549147605896, + "learning_rate": 9.046838528896673e-05, + "loss": 1.2578, + "num_input_tokens_seen": 21932904, + "step": 1362 + }, + { + "epoch": 0.09547575892896232, + "grad_norm": 3.6919445991516113, + "learning_rate": 9.046138704028022e-05, + "loss": 0.9876, + "num_input_tokens_seen": 21949288, + "step": 1363 + }, + { + "epoch": 0.09554580717469156, + "grad_norm": 5.467876434326172, + "learning_rate": 9.04543887915937e-05, + "loss": 0.9735, + "num_input_tokens_seen": 21965672, + "step": 1364 + }, + { + "epoch": 0.09561585542042081, + "grad_norm": 4.036736011505127, + "learning_rate": 9.044739054290719e-05, + "loss": 1.0712, + "num_input_tokens_seen": 21980792, + "step": 1365 + }, + { + "epoch": 0.09568590366615005, + "grad_norm": 4.083346843719482, + "learning_rate": 9.044039229422066e-05, + "loss": 1.0883, + "num_input_tokens_seen": 21996888, + "step": 1366 + }, + { + "epoch": 0.0957559519118793, + "grad_norm": 3.553262948989868, + "learning_rate": 9.043339404553415e-05, + "loss": 1.0116, + "num_input_tokens_seen": 22013160, + "step": 1367 + }, + { + "epoch": 0.09582600015760856, + "grad_norm": 4.787721633911133, + "learning_rate": 9.042639579684764e-05, + "loss": 1.1444, + "num_input_tokens_seen": 22029544, + "step": 1368 + }, + { + "epoch": 0.0958960484033378, + "grad_norm": 3.8053700923919678, + "learning_rate": 9.041939754816113e-05, + "loss": 1.1654, + "num_input_tokens_seen": 22045888, + "step": 1369 + }, + { + "epoch": 0.09596609664906705, + "grad_norm": 3.7679660320281982, + "learning_rate": 9.041239929947462e-05, + "loss": 1.1753, + "num_input_tokens_seen": 22062272, + "step": 1370 + }, + { + "epoch": 0.0960361448947963, + "grad_norm": 5.086554527282715, + "learning_rate": 9.040540105078809e-05, + "loss": 0.9579, + "num_input_tokens_seen": 22078080, + "step": 1371 + }, + { + "epoch": 0.09610619314052554, + "grad_norm": 4.255527496337891, + "learning_rate": 9.039840280210158e-05, + "loss": 1.0953, + "num_input_tokens_seen": 22093808, + "step": 1372 + }, + { + "epoch": 0.09617624138625479, + "grad_norm": 6.081700325012207, + "learning_rate": 9.039140455341507e-05, + "loss": 1.0363, + "num_input_tokens_seen": 22110192, + "step": 1373 + }, + { + "epoch": 0.09624628963198403, + "grad_norm": 4.376565456390381, + "learning_rate": 9.038440630472854e-05, + "loss": 1.1737, + "num_input_tokens_seen": 22126576, + "step": 1374 + }, + { + "epoch": 0.09631633787771328, + "grad_norm": 4.051114559173584, + "learning_rate": 9.037740805604205e-05, + "loss": 1.1921, + "num_input_tokens_seen": 22142768, + "step": 1375 + }, + { + "epoch": 0.09638638612344252, + "grad_norm": 4.46164083480835, + "learning_rate": 9.037040980735552e-05, + "loss": 1.1541, + "num_input_tokens_seen": 22158600, + "step": 1376 + }, + { + "epoch": 0.09645643436917177, + "grad_norm": 4.242503643035889, + "learning_rate": 9.036341155866901e-05, + "loss": 1.1314, + "num_input_tokens_seen": 22174984, + "step": 1377 + }, + { + "epoch": 0.09652648261490102, + "grad_norm": 3.6338908672332764, + "learning_rate": 9.035641330998248e-05, + "loss": 0.9257, + "num_input_tokens_seen": 22190880, + "step": 1378 + }, + { + "epoch": 0.09659653086063026, + "grad_norm": 4.73402738571167, + "learning_rate": 9.034941506129597e-05, + "loss": 1.1981, + "num_input_tokens_seen": 22206632, + "step": 1379 + }, + { + "epoch": 0.09666657910635951, + "grad_norm": 4.450289726257324, + "learning_rate": 9.034241681260946e-05, + "loss": 1.0851, + "num_input_tokens_seen": 22222896, + "step": 1380 + }, + { + "epoch": 0.09673662735208875, + "grad_norm": 5.578179359436035, + "learning_rate": 9.033541856392295e-05, + "loss": 1.2856, + "num_input_tokens_seen": 22238280, + "step": 1381 + }, + { + "epoch": 0.096806675597818, + "grad_norm": 3.8745546340942383, + "learning_rate": 9.032842031523644e-05, + "loss": 0.9841, + "num_input_tokens_seen": 22254664, + "step": 1382 + }, + { + "epoch": 0.09687672384354724, + "grad_norm": 5.7268548011779785, + "learning_rate": 9.032142206654991e-05, + "loss": 1.2024, + "num_input_tokens_seen": 22270000, + "step": 1383 + }, + { + "epoch": 0.09694677208927649, + "grad_norm": 4.380898952484131, + "learning_rate": 9.03144238178634e-05, + "loss": 1.0589, + "num_input_tokens_seen": 22286384, + "step": 1384 + }, + { + "epoch": 0.09701682033500574, + "grad_norm": 5.762500762939453, + "learning_rate": 9.030742556917689e-05, + "loss": 1.2061, + "num_input_tokens_seen": 22302272, + "step": 1385 + }, + { + "epoch": 0.09708686858073498, + "grad_norm": 3.739488363265991, + "learning_rate": 9.030042732049037e-05, + "loss": 0.9867, + "num_input_tokens_seen": 22318656, + "step": 1386 + }, + { + "epoch": 0.09715691682646423, + "grad_norm": 4.584897994995117, + "learning_rate": 9.029342907180385e-05, + "loss": 1.1934, + "num_input_tokens_seen": 22334704, + "step": 1387 + }, + { + "epoch": 0.09722696507219347, + "grad_norm": 4.161139488220215, + "learning_rate": 9.028643082311734e-05, + "loss": 1.1638, + "num_input_tokens_seen": 22349800, + "step": 1388 + }, + { + "epoch": 0.09729701331792272, + "grad_norm": 4.115293979644775, + "learning_rate": 9.027943257443083e-05, + "loss": 1.0181, + "num_input_tokens_seen": 22366184, + "step": 1389 + }, + { + "epoch": 0.09736706156365196, + "grad_norm": 3.7355988025665283, + "learning_rate": 9.027243432574432e-05, + "loss": 1.1182, + "num_input_tokens_seen": 22382568, + "step": 1390 + }, + { + "epoch": 0.09743710980938121, + "grad_norm": 4.15507173538208, + "learning_rate": 9.02654360770578e-05, + "loss": 1.0272, + "num_input_tokens_seen": 22398480, + "step": 1391 + }, + { + "epoch": 0.09750715805511045, + "grad_norm": 3.770918607711792, + "learning_rate": 9.025843782837128e-05, + "loss": 0.9834, + "num_input_tokens_seen": 22414864, + "step": 1392 + }, + { + "epoch": 0.0975772063008397, + "grad_norm": 4.214321136474609, + "learning_rate": 9.025143957968476e-05, + "loss": 1.1738, + "num_input_tokens_seen": 22429752, + "step": 1393 + }, + { + "epoch": 0.09764725454656895, + "grad_norm": 3.9854986667633057, + "learning_rate": 9.024444133099825e-05, + "loss": 1.2832, + "num_input_tokens_seen": 22446136, + "step": 1394 + }, + { + "epoch": 0.09771730279229819, + "grad_norm": 4.996057510375977, + "learning_rate": 9.023744308231174e-05, + "loss": 1.1691, + "num_input_tokens_seen": 22461160, + "step": 1395 + }, + { + "epoch": 0.09778735103802744, + "grad_norm": 3.682765007019043, + "learning_rate": 9.023044483362523e-05, + "loss": 0.9548, + "num_input_tokens_seen": 22477336, + "step": 1396 + }, + { + "epoch": 0.09785739928375668, + "grad_norm": 4.367272853851318, + "learning_rate": 9.022344658493871e-05, + "loss": 1.0512, + "num_input_tokens_seen": 22492952, + "step": 1397 + }, + { + "epoch": 0.09792744752948593, + "grad_norm": 3.9716336727142334, + "learning_rate": 9.021644833625219e-05, + "loss": 1.103, + "num_input_tokens_seen": 22509336, + "step": 1398 + }, + { + "epoch": 0.09799749577521517, + "grad_norm": 4.043631553649902, + "learning_rate": 9.020945008756568e-05, + "loss": 1.1439, + "num_input_tokens_seen": 22525568, + "step": 1399 + }, + { + "epoch": 0.09806754402094442, + "grad_norm": 4.343166351318359, + "learning_rate": 9.020245183887917e-05, + "loss": 1.1948, + "num_input_tokens_seen": 22541328, + "step": 1400 + }, + { + "epoch": 0.09806754402094442, + "eval_loss": 1.1561514139175415, + "eval_runtime": 0.1977, + "eval_samples_per_second": 5.058, + "eval_steps_per_second": 5.058, + "num_input_tokens_seen": 22541328, + "step": 1400 + }, + { + "epoch": 0.09813759226667366, + "grad_norm": 4.709417819976807, + "learning_rate": 9.019545359019265e-05, + "loss": 1.1398, + "num_input_tokens_seen": 22557304, + "step": 1401 + }, + { + "epoch": 0.09820764051240291, + "grad_norm": 7.022638320922852, + "learning_rate": 9.018845534150614e-05, + "loss": 1.0342, + "num_input_tokens_seen": 22573688, + "step": 1402 + }, + { + "epoch": 0.09827768875813217, + "grad_norm": 3.7976694107055664, + "learning_rate": 9.018145709281962e-05, + "loss": 0.9829, + "num_input_tokens_seen": 22589848, + "step": 1403 + }, + { + "epoch": 0.09834773700386142, + "grad_norm": 3.70877742767334, + "learning_rate": 9.01744588441331e-05, + "loss": 0.9707, + "num_input_tokens_seen": 22606232, + "step": 1404 + }, + { + "epoch": 0.09841778524959066, + "grad_norm": 7.724960803985596, + "learning_rate": 9.016746059544658e-05, + "loss": 0.9602, + "num_input_tokens_seen": 22621912, + "step": 1405 + }, + { + "epoch": 0.09848783349531991, + "grad_norm": 3.9619522094726562, + "learning_rate": 9.016046234676007e-05, + "loss": 0.998, + "num_input_tokens_seen": 22638296, + "step": 1406 + }, + { + "epoch": 0.09855788174104915, + "grad_norm": 3.8303041458129883, + "learning_rate": 9.015346409807356e-05, + "loss": 1.0762, + "num_input_tokens_seen": 22654496, + "step": 1407 + }, + { + "epoch": 0.0986279299867784, + "grad_norm": 4.029507637023926, + "learning_rate": 9.014646584938705e-05, + "loss": 1.2072, + "num_input_tokens_seen": 22670544, + "step": 1408 + }, + { + "epoch": 0.09869797823250764, + "grad_norm": 3.8487346172332764, + "learning_rate": 9.013946760070054e-05, + "loss": 1.1834, + "num_input_tokens_seen": 22686592, + "step": 1409 + }, + { + "epoch": 0.09876802647823689, + "grad_norm": 3.700751543045044, + "learning_rate": 9.013246935201401e-05, + "loss": 0.8698, + "num_input_tokens_seen": 22702976, + "step": 1410 + }, + { + "epoch": 0.09883807472396614, + "grad_norm": 3.686884641647339, + "learning_rate": 9.01254711033275e-05, + "loss": 0.9591, + "num_input_tokens_seen": 22719360, + "step": 1411 + }, + { + "epoch": 0.09890812296969538, + "grad_norm": 4.176409721374512, + "learning_rate": 9.011847285464099e-05, + "loss": 1.1578, + "num_input_tokens_seen": 22735744, + "step": 1412 + }, + { + "epoch": 0.09897817121542463, + "grad_norm": 4.331852912902832, + "learning_rate": 9.011147460595446e-05, + "loss": 0.9769, + "num_input_tokens_seen": 22752128, + "step": 1413 + }, + { + "epoch": 0.09904821946115387, + "grad_norm": 3.8534255027770996, + "learning_rate": 9.010447635726795e-05, + "loss": 1.1536, + "num_input_tokens_seen": 22768512, + "step": 1414 + }, + { + "epoch": 0.09911826770688312, + "grad_norm": 4.066548824310303, + "learning_rate": 9.009747810858144e-05, + "loss": 1.1199, + "num_input_tokens_seen": 22784760, + "step": 1415 + }, + { + "epoch": 0.09918831595261236, + "grad_norm": 4.076517581939697, + "learning_rate": 9.009047985989493e-05, + "loss": 1.1132, + "num_input_tokens_seen": 22801144, + "step": 1416 + }, + { + "epoch": 0.09925836419834161, + "grad_norm": 3.8858346939086914, + "learning_rate": 9.008348161120842e-05, + "loss": 0.9509, + "num_input_tokens_seen": 22817320, + "step": 1417 + }, + { + "epoch": 0.09932841244407085, + "grad_norm": 6.4605584144592285, + "learning_rate": 9.007648336252189e-05, + "loss": 1.2701, + "num_input_tokens_seen": 22833704, + "step": 1418 + }, + { + "epoch": 0.0993984606898001, + "grad_norm": 4.157481670379639, + "learning_rate": 9.006948511383538e-05, + "loss": 1.0169, + "num_input_tokens_seen": 22850088, + "step": 1419 + }, + { + "epoch": 0.09946850893552935, + "grad_norm": 3.725755214691162, + "learning_rate": 9.006248686514886e-05, + "loss": 1.0183, + "num_input_tokens_seen": 22866472, + "step": 1420 + }, + { + "epoch": 0.09953855718125859, + "grad_norm": 4.012838363647461, + "learning_rate": 9.005548861646236e-05, + "loss": 0.8425, + "num_input_tokens_seen": 22882856, + "step": 1421 + }, + { + "epoch": 0.09960860542698784, + "grad_norm": 3.8754239082336426, + "learning_rate": 9.004849036777583e-05, + "loss": 1.1375, + "num_input_tokens_seen": 22899240, + "step": 1422 + }, + { + "epoch": 0.09967865367271708, + "grad_norm": 3.90873384475708, + "learning_rate": 9.004149211908932e-05, + "loss": 1.0574, + "num_input_tokens_seen": 22915160, + "step": 1423 + }, + { + "epoch": 0.09974870191844633, + "grad_norm": 5.698948860168457, + "learning_rate": 9.003449387040281e-05, + "loss": 1.1338, + "num_input_tokens_seen": 22930592, + "step": 1424 + }, + { + "epoch": 0.09981875016417557, + "grad_norm": 4.103662014007568, + "learning_rate": 9.002749562171629e-05, + "loss": 1.2384, + "num_input_tokens_seen": 22946976, + "step": 1425 + }, + { + "epoch": 0.09988879840990482, + "grad_norm": 4.404048442840576, + "learning_rate": 9.002049737302977e-05, + "loss": 1.3855, + "num_input_tokens_seen": 22963360, + "step": 1426 + }, + { + "epoch": 0.09995884665563406, + "grad_norm": 4.043710708618164, + "learning_rate": 9.001349912434326e-05, + "loss": 1.2713, + "num_input_tokens_seen": 22979544, + "step": 1427 + }, + { + "epoch": 0.10002889490136331, + "grad_norm": 4.169802188873291, + "learning_rate": 9.000650087565675e-05, + "loss": 1.0777, + "num_input_tokens_seen": 22995072, + "step": 1428 + }, + { + "epoch": 0.10009894314709256, + "grad_norm": 4.010350227355957, + "learning_rate": 8.999950262697024e-05, + "loss": 1.1245, + "num_input_tokens_seen": 23010904, + "step": 1429 + }, + { + "epoch": 0.1001689913928218, + "grad_norm": 4.496591567993164, + "learning_rate": 8.999250437828372e-05, + "loss": 1.3372, + "num_input_tokens_seen": 23027288, + "step": 1430 + }, + { + "epoch": 0.10023903963855105, + "grad_norm": 4.2428765296936035, + "learning_rate": 8.99855061295972e-05, + "loss": 1.0258, + "num_input_tokens_seen": 23043352, + "step": 1431 + }, + { + "epoch": 0.10030908788428029, + "grad_norm": 4.083342552185059, + "learning_rate": 8.997850788091068e-05, + "loss": 1.227, + "num_input_tokens_seen": 23059736, + "step": 1432 + }, + { + "epoch": 0.10037913613000954, + "grad_norm": 3.860734462738037, + "learning_rate": 8.997150963222417e-05, + "loss": 1.0791, + "num_input_tokens_seen": 23075400, + "step": 1433 + }, + { + "epoch": 0.10044918437573878, + "grad_norm": 3.985151767730713, + "learning_rate": 8.996451138353766e-05, + "loss": 1.0486, + "num_input_tokens_seen": 23091704, + "step": 1434 + }, + { + "epoch": 0.10051923262146803, + "grad_norm": 4.039731502532959, + "learning_rate": 8.995751313485114e-05, + "loss": 0.9793, + "num_input_tokens_seen": 23108088, + "step": 1435 + }, + { + "epoch": 0.10058928086719728, + "grad_norm": 6.1780619621276855, + "learning_rate": 8.995051488616463e-05, + "loss": 1.0645, + "num_input_tokens_seen": 23123128, + "step": 1436 + }, + { + "epoch": 0.10065932911292653, + "grad_norm": 4.5783586502075195, + "learning_rate": 8.994351663747811e-05, + "loss": 1.1634, + "num_input_tokens_seen": 23139168, + "step": 1437 + }, + { + "epoch": 0.10072937735865578, + "grad_norm": 3.889927864074707, + "learning_rate": 8.99365183887916e-05, + "loss": 0.97, + "num_input_tokens_seen": 23154952, + "step": 1438 + }, + { + "epoch": 0.10079942560438503, + "grad_norm": 3.927945852279663, + "learning_rate": 8.992952014010509e-05, + "loss": 1.2428, + "num_input_tokens_seen": 23170688, + "step": 1439 + }, + { + "epoch": 0.10086947385011427, + "grad_norm": 3.8991434574127197, + "learning_rate": 8.992252189141856e-05, + "loss": 0.9519, + "num_input_tokens_seen": 23186432, + "step": 1440 + }, + { + "epoch": 0.10093952209584352, + "grad_norm": 3.6479310989379883, + "learning_rate": 8.991552364273206e-05, + "loss": 0.9656, + "num_input_tokens_seen": 23202816, + "step": 1441 + }, + { + "epoch": 0.10100957034157276, + "grad_norm": 4.637960910797119, + "learning_rate": 8.990852539404554e-05, + "loss": 1.2853, + "num_input_tokens_seen": 23218304, + "step": 1442 + }, + { + "epoch": 0.10107961858730201, + "grad_norm": 4.000091552734375, + "learning_rate": 8.990152714535903e-05, + "loss": 1.0421, + "num_input_tokens_seen": 23234688, + "step": 1443 + }, + { + "epoch": 0.10114966683303125, + "grad_norm": 4.959738731384277, + "learning_rate": 8.989452889667251e-05, + "loss": 1.0904, + "num_input_tokens_seen": 23250656, + "step": 1444 + }, + { + "epoch": 0.1012197150787605, + "grad_norm": 3.9251675605773926, + "learning_rate": 8.988753064798599e-05, + "loss": 0.9219, + "num_input_tokens_seen": 23266984, + "step": 1445 + }, + { + "epoch": 0.10128976332448975, + "grad_norm": 4.28665828704834, + "learning_rate": 8.988053239929948e-05, + "loss": 1.1465, + "num_input_tokens_seen": 23283368, + "step": 1446 + }, + { + "epoch": 0.10135981157021899, + "grad_norm": 4.421731472015381, + "learning_rate": 8.987353415061297e-05, + "loss": 1.1098, + "num_input_tokens_seen": 23298728, + "step": 1447 + }, + { + "epoch": 0.10142985981594824, + "grad_norm": 5.080065727233887, + "learning_rate": 8.986653590192646e-05, + "loss": 1.1172, + "num_input_tokens_seen": 23315112, + "step": 1448 + }, + { + "epoch": 0.10149990806167748, + "grad_norm": 5.618803977966309, + "learning_rate": 8.985953765323993e-05, + "loss": 0.9551, + "num_input_tokens_seen": 23329864, + "step": 1449 + }, + { + "epoch": 0.10156995630740673, + "grad_norm": 3.756836414337158, + "learning_rate": 8.985253940455342e-05, + "loss": 1.0981, + "num_input_tokens_seen": 23345672, + "step": 1450 + }, + { + "epoch": 0.10164000455313597, + "grad_norm": 4.461424827575684, + "learning_rate": 8.984554115586691e-05, + "loss": 1.1914, + "num_input_tokens_seen": 23362056, + "step": 1451 + }, + { + "epoch": 0.10171005279886522, + "grad_norm": 5.267919063568115, + "learning_rate": 8.983854290718038e-05, + "loss": 1.1928, + "num_input_tokens_seen": 23378440, + "step": 1452 + }, + { + "epoch": 0.10178010104459446, + "grad_norm": 5.513551235198975, + "learning_rate": 8.983154465849387e-05, + "loss": 1.2405, + "num_input_tokens_seen": 23394824, + "step": 1453 + }, + { + "epoch": 0.10185014929032371, + "grad_norm": 4.46366548538208, + "learning_rate": 8.982454640980736e-05, + "loss": 1.1436, + "num_input_tokens_seen": 23410568, + "step": 1454 + }, + { + "epoch": 0.10192019753605296, + "grad_norm": 5.066692352294922, + "learning_rate": 8.981754816112085e-05, + "loss": 1.1389, + "num_input_tokens_seen": 23426952, + "step": 1455 + }, + { + "epoch": 0.1019902457817822, + "grad_norm": 3.980743169784546, + "learning_rate": 8.981054991243434e-05, + "loss": 1.0623, + "num_input_tokens_seen": 23443256, + "step": 1456 + }, + { + "epoch": 0.10206029402751145, + "grad_norm": 4.088611125946045, + "learning_rate": 8.980355166374781e-05, + "loss": 1.0388, + "num_input_tokens_seen": 23459640, + "step": 1457 + }, + { + "epoch": 0.10213034227324069, + "grad_norm": 3.9585626125335693, + "learning_rate": 8.97965534150613e-05, + "loss": 1.2051, + "num_input_tokens_seen": 23475176, + "step": 1458 + }, + { + "epoch": 0.10220039051896994, + "grad_norm": 3.7923290729522705, + "learning_rate": 8.978955516637478e-05, + "loss": 1.0001, + "num_input_tokens_seen": 23490704, + "step": 1459 + }, + { + "epoch": 0.10227043876469918, + "grad_norm": 3.9089629650115967, + "learning_rate": 8.978255691768826e-05, + "loss": 0.9786, + "num_input_tokens_seen": 23506168, + "step": 1460 + }, + { + "epoch": 0.10234048701042843, + "grad_norm": 6.2259039878845215, + "learning_rate": 8.977555866900175e-05, + "loss": 1.2854, + "num_input_tokens_seen": 23522552, + "step": 1461 + }, + { + "epoch": 0.10241053525615768, + "grad_norm": 4.071867942810059, + "learning_rate": 8.976856042031524e-05, + "loss": 1.0724, + "num_input_tokens_seen": 23538936, + "step": 1462 + }, + { + "epoch": 0.10248058350188692, + "grad_norm": 4.587897777557373, + "learning_rate": 8.976156217162873e-05, + "loss": 1.1307, + "num_input_tokens_seen": 23554536, + "step": 1463 + }, + { + "epoch": 0.10255063174761617, + "grad_norm": 3.944937229156494, + "learning_rate": 8.97545639229422e-05, + "loss": 1.1503, + "num_input_tokens_seen": 23570888, + "step": 1464 + }, + { + "epoch": 0.10262067999334541, + "grad_norm": 3.7418766021728516, + "learning_rate": 8.97475656742557e-05, + "loss": 1.0414, + "num_input_tokens_seen": 23587272, + "step": 1465 + }, + { + "epoch": 0.10269072823907466, + "grad_norm": 3.9055676460266113, + "learning_rate": 8.974056742556918e-05, + "loss": 1.2284, + "num_input_tokens_seen": 23603640, + "step": 1466 + }, + { + "epoch": 0.1027607764848039, + "grad_norm": 3.9338066577911377, + "learning_rate": 8.973356917688267e-05, + "loss": 1.2389, + "num_input_tokens_seen": 23620024, + "step": 1467 + }, + { + "epoch": 0.10283082473053315, + "grad_norm": 4.024602890014648, + "learning_rate": 8.972657092819616e-05, + "loss": 1.1358, + "num_input_tokens_seen": 23636408, + "step": 1468 + }, + { + "epoch": 0.1029008729762624, + "grad_norm": 4.09812068939209, + "learning_rate": 8.971957267950963e-05, + "loss": 1.0734, + "num_input_tokens_seen": 23652480, + "step": 1469 + }, + { + "epoch": 0.10297092122199164, + "grad_norm": 4.7382025718688965, + "learning_rate": 8.971257443082312e-05, + "loss": 1.4112, + "num_input_tokens_seen": 23668424, + "step": 1470 + }, + { + "epoch": 0.10304096946772089, + "grad_norm": 4.518669605255127, + "learning_rate": 8.970557618213661e-05, + "loss": 1.3466, + "num_input_tokens_seen": 23684808, + "step": 1471 + }, + { + "epoch": 0.10311101771345015, + "grad_norm": 4.023036003112793, + "learning_rate": 8.969857793345009e-05, + "loss": 1.0246, + "num_input_tokens_seen": 23701192, + "step": 1472 + }, + { + "epoch": 0.10318106595917939, + "grad_norm": 4.6244215965271, + "learning_rate": 8.969157968476358e-05, + "loss": 1.2391, + "num_input_tokens_seen": 23717576, + "step": 1473 + }, + { + "epoch": 0.10325111420490864, + "grad_norm": 4.517683506011963, + "learning_rate": 8.968458143607706e-05, + "loss": 1.3872, + "num_input_tokens_seen": 23733960, + "step": 1474 + }, + { + "epoch": 0.10332116245063788, + "grad_norm": 4.048764705657959, + "learning_rate": 8.967758318739055e-05, + "loss": 1.0453, + "num_input_tokens_seen": 23750344, + "step": 1475 + }, + { + "epoch": 0.10339121069636713, + "grad_norm": 4.248376369476318, + "learning_rate": 8.967058493870403e-05, + "loss": 1.176, + "num_input_tokens_seen": 23766160, + "step": 1476 + }, + { + "epoch": 0.10346125894209637, + "grad_norm": 3.780548095703125, + "learning_rate": 8.966358669001752e-05, + "loss": 0.9048, + "num_input_tokens_seen": 23782544, + "step": 1477 + }, + { + "epoch": 0.10353130718782562, + "grad_norm": 4.26375675201416, + "learning_rate": 8.9656588441331e-05, + "loss": 0.8651, + "num_input_tokens_seen": 23798928, + "step": 1478 + }, + { + "epoch": 0.10360135543355486, + "grad_norm": 3.9202687740325928, + "learning_rate": 8.964959019264448e-05, + "loss": 1.1058, + "num_input_tokens_seen": 23815312, + "step": 1479 + }, + { + "epoch": 0.10367140367928411, + "grad_norm": 3.983797788619995, + "learning_rate": 8.964259194395797e-05, + "loss": 1.0778, + "num_input_tokens_seen": 23831696, + "step": 1480 + }, + { + "epoch": 0.10374145192501336, + "grad_norm": 4.471195220947266, + "learning_rate": 8.963559369527146e-05, + "loss": 1.1858, + "num_input_tokens_seen": 23847768, + "step": 1481 + }, + { + "epoch": 0.1038115001707426, + "grad_norm": 3.560317039489746, + "learning_rate": 8.962859544658495e-05, + "loss": 1.0205, + "num_input_tokens_seen": 23864152, + "step": 1482 + }, + { + "epoch": 0.10388154841647185, + "grad_norm": 3.8699846267700195, + "learning_rate": 8.962159719789843e-05, + "loss": 1.1438, + "num_input_tokens_seen": 23880536, + "step": 1483 + }, + { + "epoch": 0.10395159666220109, + "grad_norm": 4.547862529754639, + "learning_rate": 8.961459894921191e-05, + "loss": 1.0303, + "num_input_tokens_seen": 23896704, + "step": 1484 + }, + { + "epoch": 0.10402164490793034, + "grad_norm": 4.669456481933594, + "learning_rate": 8.96076007005254e-05, + "loss": 1.1994, + "num_input_tokens_seen": 23913088, + "step": 1485 + }, + { + "epoch": 0.10409169315365958, + "grad_norm": 4.346285343170166, + "learning_rate": 8.960060245183887e-05, + "loss": 1.2677, + "num_input_tokens_seen": 23929472, + "step": 1486 + }, + { + "epoch": 0.10416174139938883, + "grad_norm": 6.5028581619262695, + "learning_rate": 8.959360420315236e-05, + "loss": 0.989, + "num_input_tokens_seen": 23945216, + "step": 1487 + }, + { + "epoch": 0.10423178964511808, + "grad_norm": 3.935488224029541, + "learning_rate": 8.958660595446586e-05, + "loss": 1.2657, + "num_input_tokens_seen": 23961600, + "step": 1488 + }, + { + "epoch": 0.10430183789084732, + "grad_norm": 3.772397518157959, + "learning_rate": 8.957960770577934e-05, + "loss": 1.1038, + "num_input_tokens_seen": 23977984, + "step": 1489 + }, + { + "epoch": 0.10437188613657657, + "grad_norm": 4.508286476135254, + "learning_rate": 8.957260945709283e-05, + "loss": 1.2694, + "num_input_tokens_seen": 23993752, + "step": 1490 + }, + { + "epoch": 0.10444193438230581, + "grad_norm": 4.667380332946777, + "learning_rate": 8.95656112084063e-05, + "loss": 1.2837, + "num_input_tokens_seen": 24009832, + "step": 1491 + }, + { + "epoch": 0.10451198262803506, + "grad_norm": 7.675503730773926, + "learning_rate": 8.955861295971979e-05, + "loss": 1.121, + "num_input_tokens_seen": 24025784, + "step": 1492 + }, + { + "epoch": 0.1045820308737643, + "grad_norm": 4.427548408508301, + "learning_rate": 8.955161471103328e-05, + "loss": 0.835, + "num_input_tokens_seen": 24041568, + "step": 1493 + }, + { + "epoch": 0.10465207911949355, + "grad_norm": 3.9065396785736084, + "learning_rate": 8.954461646234677e-05, + "loss": 1.1322, + "num_input_tokens_seen": 24057952, + "step": 1494 + }, + { + "epoch": 0.1047221273652228, + "grad_norm": 4.052605628967285, + "learning_rate": 8.953761821366026e-05, + "loss": 1.1133, + "num_input_tokens_seen": 24074336, + "step": 1495 + }, + { + "epoch": 0.10479217561095204, + "grad_norm": 3.758476734161377, + "learning_rate": 8.953061996497373e-05, + "loss": 1.1302, + "num_input_tokens_seen": 24090720, + "step": 1496 + }, + { + "epoch": 0.10486222385668129, + "grad_norm": 4.4470014572143555, + "learning_rate": 8.952362171628722e-05, + "loss": 1.0969, + "num_input_tokens_seen": 24107024, + "step": 1497 + }, + { + "epoch": 0.10493227210241053, + "grad_norm": 4.222001075744629, + "learning_rate": 8.951662346760071e-05, + "loss": 1.147, + "num_input_tokens_seen": 24123408, + "step": 1498 + }, + { + "epoch": 0.10500232034813978, + "grad_norm": 4.72997522354126, + "learning_rate": 8.950962521891418e-05, + "loss": 1.1086, + "num_input_tokens_seen": 24137672, + "step": 1499 + }, + { + "epoch": 0.10507236859386902, + "grad_norm": 4.342312812805176, + "learning_rate": 8.950262697022767e-05, + "loss": 1.2044, + "num_input_tokens_seen": 24153248, + "step": 1500 + }, + { + "epoch": 0.10514241683959827, + "grad_norm": 4.723706245422363, + "learning_rate": 8.949562872154116e-05, + "loss": 1.1075, + "num_input_tokens_seen": 24169240, + "step": 1501 + }, + { + "epoch": 0.10521246508532751, + "grad_norm": 4.244345188140869, + "learning_rate": 8.948863047285465e-05, + "loss": 1.1839, + "num_input_tokens_seen": 24184608, + "step": 1502 + }, + { + "epoch": 0.10528251333105676, + "grad_norm": 3.6271615028381348, + "learning_rate": 8.948163222416812e-05, + "loss": 1.0755, + "num_input_tokens_seen": 24200992, + "step": 1503 + }, + { + "epoch": 0.105352561576786, + "grad_norm": 3.858696937561035, + "learning_rate": 8.947463397548161e-05, + "loss": 1.0598, + "num_input_tokens_seen": 24217376, + "step": 1504 + }, + { + "epoch": 0.10542260982251525, + "grad_norm": 7.14077091217041, + "learning_rate": 8.94676357267951e-05, + "loss": 1.0362, + "num_input_tokens_seen": 24232368, + "step": 1505 + }, + { + "epoch": 0.10549265806824451, + "grad_norm": 4.203495979309082, + "learning_rate": 8.946063747810858e-05, + "loss": 1.2491, + "num_input_tokens_seen": 24248520, + "step": 1506 + }, + { + "epoch": 0.10556270631397376, + "grad_norm": 4.344188213348389, + "learning_rate": 8.945363922942207e-05, + "loss": 0.905, + "num_input_tokens_seen": 24264824, + "step": 1507 + }, + { + "epoch": 0.105632754559703, + "grad_norm": 6.156280517578125, + "learning_rate": 8.944664098073557e-05, + "loss": 1.3046, + "num_input_tokens_seen": 24281208, + "step": 1508 + }, + { + "epoch": 0.10570280280543225, + "grad_norm": 4.687212944030762, + "learning_rate": 8.943964273204904e-05, + "loss": 1.1898, + "num_input_tokens_seen": 24297592, + "step": 1509 + }, + { + "epoch": 0.10577285105116149, + "grad_norm": 3.9128546714782715, + "learning_rate": 8.943264448336253e-05, + "loss": 1.0506, + "num_input_tokens_seen": 24313976, + "step": 1510 + }, + { + "epoch": 0.10584289929689074, + "grad_norm": 5.766979694366455, + "learning_rate": 8.9425646234676e-05, + "loss": 1.119, + "num_input_tokens_seen": 24330296, + "step": 1511 + }, + { + "epoch": 0.10591294754261998, + "grad_norm": 3.9610238075256348, + "learning_rate": 8.94186479859895e-05, + "loss": 1.279, + "num_input_tokens_seen": 24346680, + "step": 1512 + }, + { + "epoch": 0.10598299578834923, + "grad_norm": 4.262688636779785, + "learning_rate": 8.941164973730297e-05, + "loss": 1.3217, + "num_input_tokens_seen": 24362408, + "step": 1513 + }, + { + "epoch": 0.10605304403407848, + "grad_norm": 5.190121173858643, + "learning_rate": 8.940465148861647e-05, + "loss": 1.0615, + "num_input_tokens_seen": 24378248, + "step": 1514 + }, + { + "epoch": 0.10612309227980772, + "grad_norm": 4.5859479904174805, + "learning_rate": 8.939765323992996e-05, + "loss": 1.1377, + "num_input_tokens_seen": 24394632, + "step": 1515 + }, + { + "epoch": 0.10619314052553697, + "grad_norm": 4.021294593811035, + "learning_rate": 8.939065499124344e-05, + "loss": 0.9913, + "num_input_tokens_seen": 24411016, + "step": 1516 + }, + { + "epoch": 0.10626318877126621, + "grad_norm": 4.296265602111816, + "learning_rate": 8.938365674255692e-05, + "loss": 1.1753, + "num_input_tokens_seen": 24426792, + "step": 1517 + }, + { + "epoch": 0.10633323701699546, + "grad_norm": 3.4397289752960205, + "learning_rate": 8.93766584938704e-05, + "loss": 0.8159, + "num_input_tokens_seen": 24443176, + "step": 1518 + }, + { + "epoch": 0.1064032852627247, + "grad_norm": 4.009952545166016, + "learning_rate": 8.936966024518389e-05, + "loss": 1.0728, + "num_input_tokens_seen": 24459384, + "step": 1519 + }, + { + "epoch": 0.10647333350845395, + "grad_norm": 4.786280632019043, + "learning_rate": 8.936266199649738e-05, + "loss": 1.1303, + "num_input_tokens_seen": 24474904, + "step": 1520 + }, + { + "epoch": 0.1065433817541832, + "grad_norm": 3.869297981262207, + "learning_rate": 8.935566374781087e-05, + "loss": 1.0829, + "num_input_tokens_seen": 24490456, + "step": 1521 + }, + { + "epoch": 0.10661342999991244, + "grad_norm": 3.995553731918335, + "learning_rate": 8.934866549912435e-05, + "loss": 1.0813, + "num_input_tokens_seen": 24506840, + "step": 1522 + }, + { + "epoch": 0.10668347824564169, + "grad_norm": 4.195018291473389, + "learning_rate": 8.934166725043783e-05, + "loss": 1.0585, + "num_input_tokens_seen": 24522440, + "step": 1523 + }, + { + "epoch": 0.10675352649137093, + "grad_norm": 4.0432515144348145, + "learning_rate": 8.933466900175132e-05, + "loss": 1.0757, + "num_input_tokens_seen": 24538824, + "step": 1524 + }, + { + "epoch": 0.10682357473710018, + "grad_norm": 5.120638847351074, + "learning_rate": 8.93276707530648e-05, + "loss": 1.1328, + "num_input_tokens_seen": 24555208, + "step": 1525 + }, + { + "epoch": 0.10689362298282942, + "grad_norm": 3.925096035003662, + "learning_rate": 8.932067250437828e-05, + "loss": 1.1569, + "num_input_tokens_seen": 24571544, + "step": 1526 + }, + { + "epoch": 0.10696367122855867, + "grad_norm": 3.930328130722046, + "learning_rate": 8.931367425569177e-05, + "loss": 0.9385, + "num_input_tokens_seen": 24587736, + "step": 1527 + }, + { + "epoch": 0.10703371947428791, + "grad_norm": 3.7056055068969727, + "learning_rate": 8.930667600700526e-05, + "loss": 0.8675, + "num_input_tokens_seen": 24604120, + "step": 1528 + }, + { + "epoch": 0.10710376772001716, + "grad_norm": 5.945568561553955, + "learning_rate": 8.929967775831875e-05, + "loss": 1.0395, + "num_input_tokens_seen": 24620504, + "step": 1529 + }, + { + "epoch": 0.1071738159657464, + "grad_norm": 3.7765939235687256, + "learning_rate": 8.929267950963222e-05, + "loss": 0.8796, + "num_input_tokens_seen": 24635440, + "step": 1530 + }, + { + "epoch": 0.10724386421147565, + "grad_norm": 4.229284286499023, + "learning_rate": 8.928568126094571e-05, + "loss": 1.0941, + "num_input_tokens_seen": 24651824, + "step": 1531 + }, + { + "epoch": 0.1073139124572049, + "grad_norm": 4.198834419250488, + "learning_rate": 8.92786830122592e-05, + "loss": 1.118, + "num_input_tokens_seen": 24668208, + "step": 1532 + }, + { + "epoch": 0.10738396070293414, + "grad_norm": 8.091620445251465, + "learning_rate": 8.927168476357267e-05, + "loss": 1.1515, + "num_input_tokens_seen": 24684248, + "step": 1533 + }, + { + "epoch": 0.10745400894866339, + "grad_norm": 4.091879844665527, + "learning_rate": 8.926468651488618e-05, + "loss": 1.1283, + "num_input_tokens_seen": 24700632, + "step": 1534 + }, + { + "epoch": 0.10752405719439263, + "grad_norm": 3.90326189994812, + "learning_rate": 8.925768826619966e-05, + "loss": 1.047, + "num_input_tokens_seen": 24717016, + "step": 1535 + }, + { + "epoch": 0.10759410544012188, + "grad_norm": 4.097111225128174, + "learning_rate": 8.925069001751314e-05, + "loss": 1.1623, + "num_input_tokens_seen": 24732776, + "step": 1536 + }, + { + "epoch": 0.10766415368585112, + "grad_norm": 3.5537095069885254, + "learning_rate": 8.924369176882663e-05, + "loss": 0.989, + "num_input_tokens_seen": 24749064, + "step": 1537 + }, + { + "epoch": 0.10773420193158037, + "grad_norm": 4.3086256980896, + "learning_rate": 8.92366935201401e-05, + "loss": 1.0864, + "num_input_tokens_seen": 24765448, + "step": 1538 + }, + { + "epoch": 0.10780425017730962, + "grad_norm": 4.177425861358643, + "learning_rate": 8.922969527145359e-05, + "loss": 1.0652, + "num_input_tokens_seen": 24780816, + "step": 1539 + }, + { + "epoch": 0.10787429842303886, + "grad_norm": 3.6013338565826416, + "learning_rate": 8.922269702276708e-05, + "loss": 1.1045, + "num_input_tokens_seen": 24796600, + "step": 1540 + }, + { + "epoch": 0.10794434666876812, + "grad_norm": 4.05686092376709, + "learning_rate": 8.921569877408057e-05, + "loss": 1.1408, + "num_input_tokens_seen": 24812984, + "step": 1541 + }, + { + "epoch": 0.10801439491449737, + "grad_norm": 4.245424747467041, + "learning_rate": 8.920870052539406e-05, + "loss": 1.2634, + "num_input_tokens_seen": 24829368, + "step": 1542 + }, + { + "epoch": 0.10808444316022661, + "grad_norm": 3.9563350677490234, + "learning_rate": 8.920170227670753e-05, + "loss": 1.1015, + "num_input_tokens_seen": 24845752, + "step": 1543 + }, + { + "epoch": 0.10815449140595586, + "grad_norm": 4.209373474121094, + "learning_rate": 8.919470402802102e-05, + "loss": 1.2394, + "num_input_tokens_seen": 24862136, + "step": 1544 + }, + { + "epoch": 0.1082245396516851, + "grad_norm": 3.6590163707733154, + "learning_rate": 8.91877057793345e-05, + "loss": 1.0168, + "num_input_tokens_seen": 24878520, + "step": 1545 + }, + { + "epoch": 0.10829458789741435, + "grad_norm": 3.937568187713623, + "learning_rate": 8.918070753064799e-05, + "loss": 1.0999, + "num_input_tokens_seen": 24894696, + "step": 1546 + }, + { + "epoch": 0.1083646361431436, + "grad_norm": 3.948453426361084, + "learning_rate": 8.917370928196147e-05, + "loss": 1.0565, + "num_input_tokens_seen": 24910208, + "step": 1547 + }, + { + "epoch": 0.10843468438887284, + "grad_norm": 3.61549711227417, + "learning_rate": 8.916671103327496e-05, + "loss": 1.0294, + "num_input_tokens_seen": 24926592, + "step": 1548 + }, + { + "epoch": 0.10850473263460209, + "grad_norm": 4.091664791107178, + "learning_rate": 8.915971278458845e-05, + "loss": 1.0596, + "num_input_tokens_seen": 24942976, + "step": 1549 + }, + { + "epoch": 0.10857478088033133, + "grad_norm": 5.494830131530762, + "learning_rate": 8.915271453590193e-05, + "loss": 1.1564, + "num_input_tokens_seen": 24957984, + "step": 1550 + }, + { + "epoch": 0.10864482912606058, + "grad_norm": 4.546476364135742, + "learning_rate": 8.914571628721541e-05, + "loss": 1.0753, + "num_input_tokens_seen": 24974368, + "step": 1551 + }, + { + "epoch": 0.10871487737178982, + "grad_norm": 3.775996446609497, + "learning_rate": 8.91387180385289e-05, + "loss": 1.11, + "num_input_tokens_seen": 24990200, + "step": 1552 + }, + { + "epoch": 0.10878492561751907, + "grad_norm": 3.9989728927612305, + "learning_rate": 8.913171978984238e-05, + "loss": 1.0121, + "num_input_tokens_seen": 25006584, + "step": 1553 + }, + { + "epoch": 0.10885497386324831, + "grad_norm": 4.417224884033203, + "learning_rate": 8.912472154115588e-05, + "loss": 1.1891, + "num_input_tokens_seen": 25022464, + "step": 1554 + }, + { + "epoch": 0.10892502210897756, + "grad_norm": 4.604903697967529, + "learning_rate": 8.911772329246936e-05, + "loss": 0.9414, + "num_input_tokens_seen": 25038848, + "step": 1555 + }, + { + "epoch": 0.1089950703547068, + "grad_norm": 4.823176860809326, + "learning_rate": 8.911072504378284e-05, + "loss": 1.1259, + "num_input_tokens_seen": 25053776, + "step": 1556 + }, + { + "epoch": 0.10906511860043605, + "grad_norm": 3.6778531074523926, + "learning_rate": 8.910372679509632e-05, + "loss": 0.9995, + "num_input_tokens_seen": 25069872, + "step": 1557 + }, + { + "epoch": 0.1091351668461653, + "grad_norm": 4.344213485717773, + "learning_rate": 8.909672854640981e-05, + "loss": 1.1984, + "num_input_tokens_seen": 25086256, + "step": 1558 + }, + { + "epoch": 0.10920521509189454, + "grad_norm": 4.592464923858643, + "learning_rate": 8.90897302977233e-05, + "loss": 1.502, + "num_input_tokens_seen": 25102640, + "step": 1559 + }, + { + "epoch": 0.10927526333762379, + "grad_norm": 4.103248119354248, + "learning_rate": 8.908273204903678e-05, + "loss": 0.9454, + "num_input_tokens_seen": 25118328, + "step": 1560 + }, + { + "epoch": 0.10934531158335303, + "grad_norm": 4.637456893920898, + "learning_rate": 8.907573380035027e-05, + "loss": 1.3611, + "num_input_tokens_seen": 25134712, + "step": 1561 + }, + { + "epoch": 0.10941535982908228, + "grad_norm": 4.4709930419921875, + "learning_rate": 8.906873555166376e-05, + "loss": 1.1147, + "num_input_tokens_seen": 25149304, + "step": 1562 + }, + { + "epoch": 0.10948540807481152, + "grad_norm": 4.154660701751709, + "learning_rate": 8.906173730297724e-05, + "loss": 1.2855, + "num_input_tokens_seen": 25165360, + "step": 1563 + }, + { + "epoch": 0.10955545632054077, + "grad_norm": 4.1212334632873535, + "learning_rate": 8.905473905429073e-05, + "loss": 1.2015, + "num_input_tokens_seen": 25181744, + "step": 1564 + }, + { + "epoch": 0.10962550456627002, + "grad_norm": 3.8060882091522217, + "learning_rate": 8.90477408056042e-05, + "loss": 1.0333, + "num_input_tokens_seen": 25197800, + "step": 1565 + }, + { + "epoch": 0.10969555281199926, + "grad_norm": 3.4948956966400146, + "learning_rate": 8.904074255691769e-05, + "loss": 0.941, + "num_input_tokens_seen": 25214008, + "step": 1566 + }, + { + "epoch": 0.1097656010577285, + "grad_norm": 4.181606292724609, + "learning_rate": 8.903374430823118e-05, + "loss": 1.1185, + "num_input_tokens_seen": 25229496, + "step": 1567 + }, + { + "epoch": 0.10983564930345775, + "grad_norm": 4.206098556518555, + "learning_rate": 8.902674605954467e-05, + "loss": 1.0363, + "num_input_tokens_seen": 25244864, + "step": 1568 + }, + { + "epoch": 0.109905697549187, + "grad_norm": 3.797475576400757, + "learning_rate": 8.901974781085815e-05, + "loss": 1.0443, + "num_input_tokens_seen": 25261248, + "step": 1569 + }, + { + "epoch": 0.10997574579491624, + "grad_norm": 4.131814479827881, + "learning_rate": 8.901274956217163e-05, + "loss": 0.9977, + "num_input_tokens_seen": 25277632, + "step": 1570 + }, + { + "epoch": 0.11004579404064549, + "grad_norm": 3.9447309970855713, + "learning_rate": 8.900575131348512e-05, + "loss": 1.0839, + "num_input_tokens_seen": 25294016, + "step": 1571 + }, + { + "epoch": 0.11011584228637473, + "grad_norm": 3.916949510574341, + "learning_rate": 8.89987530647986e-05, + "loss": 1.1793, + "num_input_tokens_seen": 25309912, + "step": 1572 + }, + { + "epoch": 0.11018589053210398, + "grad_norm": 3.7132885456085205, + "learning_rate": 8.899175481611208e-05, + "loss": 1.081, + "num_input_tokens_seen": 25326296, + "step": 1573 + }, + { + "epoch": 0.11025593877783323, + "grad_norm": 4.5842390060424805, + "learning_rate": 8.898475656742558e-05, + "loss": 0.926, + "num_input_tokens_seen": 25342328, + "step": 1574 + }, + { + "epoch": 0.11032598702356247, + "grad_norm": 3.578962802886963, + "learning_rate": 8.897775831873906e-05, + "loss": 1.0599, + "num_input_tokens_seen": 25357640, + "step": 1575 + }, + { + "epoch": 0.11039603526929173, + "grad_norm": 3.5823471546173096, + "learning_rate": 8.897076007005255e-05, + "loss": 0.9519, + "num_input_tokens_seen": 25373424, + "step": 1576 + }, + { + "epoch": 0.11046608351502098, + "grad_norm": 3.721482515335083, + "learning_rate": 8.896376182136602e-05, + "loss": 0.976, + "num_input_tokens_seen": 25389808, + "step": 1577 + }, + { + "epoch": 0.11053613176075022, + "grad_norm": 4.874295711517334, + "learning_rate": 8.895676357267951e-05, + "loss": 1.3507, + "num_input_tokens_seen": 25406192, + "step": 1578 + }, + { + "epoch": 0.11060618000647947, + "grad_norm": 3.8547258377075195, + "learning_rate": 8.8949765323993e-05, + "loss": 0.9444, + "num_input_tokens_seen": 25421632, + "step": 1579 + }, + { + "epoch": 0.11067622825220871, + "grad_norm": 4.847586631774902, + "learning_rate": 8.894276707530649e-05, + "loss": 1.0526, + "num_input_tokens_seen": 25438016, + "step": 1580 + }, + { + "epoch": 0.11074627649793796, + "grad_norm": 3.950594425201416, + "learning_rate": 8.893576882661998e-05, + "loss": 1.0688, + "num_input_tokens_seen": 25454400, + "step": 1581 + }, + { + "epoch": 0.1108163247436672, + "grad_norm": 3.7372758388519287, + "learning_rate": 8.892877057793345e-05, + "loss": 1.2211, + "num_input_tokens_seen": 25470304, + "step": 1582 + }, + { + "epoch": 0.11088637298939645, + "grad_norm": 3.8695788383483887, + "learning_rate": 8.892177232924694e-05, + "loss": 1.1006, + "num_input_tokens_seen": 25486688, + "step": 1583 + }, + { + "epoch": 0.1109564212351257, + "grad_norm": 4.623810768127441, + "learning_rate": 8.891477408056042e-05, + "loss": 1.034, + "num_input_tokens_seen": 25503072, + "step": 1584 + }, + { + "epoch": 0.11102646948085494, + "grad_norm": 4.03538179397583, + "learning_rate": 8.89077758318739e-05, + "loss": 1.0915, + "num_input_tokens_seen": 25519008, + "step": 1585 + }, + { + "epoch": 0.11109651772658419, + "grad_norm": 7.486603736877441, + "learning_rate": 8.890077758318739e-05, + "loss": 1.0137, + "num_input_tokens_seen": 25533808, + "step": 1586 + }, + { + "epoch": 0.11116656597231343, + "grad_norm": 4.660414218902588, + "learning_rate": 8.889377933450088e-05, + "loss": 1.0172, + "num_input_tokens_seen": 25549784, + "step": 1587 + }, + { + "epoch": 0.11123661421804268, + "grad_norm": 3.9375548362731934, + "learning_rate": 8.888678108581437e-05, + "loss": 0.9843, + "num_input_tokens_seen": 25566168, + "step": 1588 + }, + { + "epoch": 0.11130666246377192, + "grad_norm": 4.275035858154297, + "learning_rate": 8.887978283712786e-05, + "loss": 1.1802, + "num_input_tokens_seen": 25582552, + "step": 1589 + }, + { + "epoch": 0.11137671070950117, + "grad_norm": 4.799124240875244, + "learning_rate": 8.887278458844133e-05, + "loss": 1.2702, + "num_input_tokens_seen": 25598936, + "step": 1590 + }, + { + "epoch": 0.11144675895523042, + "grad_norm": 4.143614768981934, + "learning_rate": 8.886578633975482e-05, + "loss": 1.1797, + "num_input_tokens_seen": 25615320, + "step": 1591 + }, + { + "epoch": 0.11151680720095966, + "grad_norm": 4.490556716918945, + "learning_rate": 8.88587880910683e-05, + "loss": 1.1351, + "num_input_tokens_seen": 25630624, + "step": 1592 + }, + { + "epoch": 0.1115868554466889, + "grad_norm": 6.010688781738281, + "learning_rate": 8.885178984238179e-05, + "loss": 1.059, + "num_input_tokens_seen": 25646048, + "step": 1593 + }, + { + "epoch": 0.11165690369241815, + "grad_norm": 3.7447726726531982, + "learning_rate": 8.884479159369527e-05, + "loss": 0.9902, + "num_input_tokens_seen": 25661528, + "step": 1594 + }, + { + "epoch": 0.1117269519381474, + "grad_norm": 4.77920389175415, + "learning_rate": 8.883779334500876e-05, + "loss": 1.1158, + "num_input_tokens_seen": 25677912, + "step": 1595 + }, + { + "epoch": 0.11179700018387664, + "grad_norm": 3.9812231063842773, + "learning_rate": 8.883079509632225e-05, + "loss": 1.096, + "num_input_tokens_seen": 25694296, + "step": 1596 + }, + { + "epoch": 0.11186704842960589, + "grad_norm": 3.7404634952545166, + "learning_rate": 8.882379684763573e-05, + "loss": 0.9965, + "num_input_tokens_seen": 25710448, + "step": 1597 + }, + { + "epoch": 0.11193709667533513, + "grad_norm": 4.466211318969727, + "learning_rate": 8.881679859894922e-05, + "loss": 1.1495, + "num_input_tokens_seen": 25726624, + "step": 1598 + }, + { + "epoch": 0.11200714492106438, + "grad_norm": 3.6850225925445557, + "learning_rate": 8.880980035026269e-05, + "loss": 0.9685, + "num_input_tokens_seen": 25742456, + "step": 1599 + }, + { + "epoch": 0.11207719316679363, + "grad_norm": 4.128363609313965, + "learning_rate": 8.880280210157619e-05, + "loss": 1.1052, + "num_input_tokens_seen": 25758840, + "step": 1600 + }, + { + "epoch": 0.11207719316679363, + "eval_loss": 1.1512293815612793, + "eval_runtime": 0.1988, + "eval_samples_per_second": 5.031, + "eval_steps_per_second": 5.031, + "num_input_tokens_seen": 25758840, + "step": 1600 + }, + { + "epoch": 0.11214724141252287, + "grad_norm": 4.852661609649658, + "learning_rate": 8.879580385288968e-05, + "loss": 1.0778, + "num_input_tokens_seen": 25774312, + "step": 1601 + }, + { + "epoch": 0.11221728965825212, + "grad_norm": 4.501857280731201, + "learning_rate": 8.878880560420316e-05, + "loss": 1.302, + "num_input_tokens_seen": 25790696, + "step": 1602 + }, + { + "epoch": 0.11228733790398136, + "grad_norm": 4.142490863800049, + "learning_rate": 8.878180735551665e-05, + "loss": 1.0375, + "num_input_tokens_seen": 25807080, + "step": 1603 + }, + { + "epoch": 0.11235738614971061, + "grad_norm": 3.606905698776245, + "learning_rate": 8.877480910683012e-05, + "loss": 0.9254, + "num_input_tokens_seen": 25822552, + "step": 1604 + }, + { + "epoch": 0.11242743439543985, + "grad_norm": 3.837010145187378, + "learning_rate": 8.876781085814361e-05, + "loss": 1.1756, + "num_input_tokens_seen": 25838088, + "step": 1605 + }, + { + "epoch": 0.1124974826411691, + "grad_norm": 3.9082963466644287, + "learning_rate": 8.87608126094571e-05, + "loss": 1.0201, + "num_input_tokens_seen": 25854240, + "step": 1606 + }, + { + "epoch": 0.11256753088689835, + "grad_norm": 4.062923908233643, + "learning_rate": 8.875381436077059e-05, + "loss": 1.1034, + "num_input_tokens_seen": 25870624, + "step": 1607 + }, + { + "epoch": 0.11263757913262759, + "grad_norm": 4.331594944000244, + "learning_rate": 8.874681611208407e-05, + "loss": 1.2043, + "num_input_tokens_seen": 25886656, + "step": 1608 + }, + { + "epoch": 0.11270762737835684, + "grad_norm": 3.77466082572937, + "learning_rate": 8.873981786339755e-05, + "loss": 0.936, + "num_input_tokens_seen": 25902704, + "step": 1609 + }, + { + "epoch": 0.1127776756240861, + "grad_norm": 3.3747365474700928, + "learning_rate": 8.873281961471104e-05, + "loss": 0.9071, + "num_input_tokens_seen": 25919088, + "step": 1610 + }, + { + "epoch": 0.11284772386981534, + "grad_norm": 5.377493381500244, + "learning_rate": 8.872582136602451e-05, + "loss": 0.9246, + "num_input_tokens_seen": 25935472, + "step": 1611 + }, + { + "epoch": 0.11291777211554459, + "grad_norm": 5.506969451904297, + "learning_rate": 8.8718823117338e-05, + "loss": 0.9211, + "num_input_tokens_seen": 25951664, + "step": 1612 + }, + { + "epoch": 0.11298782036127383, + "grad_norm": 4.874104976654053, + "learning_rate": 8.871182486865149e-05, + "loss": 1.1654, + "num_input_tokens_seen": 25968048, + "step": 1613 + }, + { + "epoch": 0.11305786860700308, + "grad_norm": 4.666824817657471, + "learning_rate": 8.870482661996498e-05, + "loss": 1.2155, + "num_input_tokens_seen": 25983784, + "step": 1614 + }, + { + "epoch": 0.11312791685273232, + "grad_norm": 3.949862241744995, + "learning_rate": 8.869782837127847e-05, + "loss": 1.1243, + "num_input_tokens_seen": 26000168, + "step": 1615 + }, + { + "epoch": 0.11319796509846157, + "grad_norm": 3.866542339324951, + "learning_rate": 8.869083012259196e-05, + "loss": 1.1302, + "num_input_tokens_seen": 26015456, + "step": 1616 + }, + { + "epoch": 0.11326801334419082, + "grad_norm": 3.8679909706115723, + "learning_rate": 8.868383187390543e-05, + "loss": 1.0886, + "num_input_tokens_seen": 26031224, + "step": 1617 + }, + { + "epoch": 0.11333806158992006, + "grad_norm": 4.7508087158203125, + "learning_rate": 8.867683362521892e-05, + "loss": 1.2837, + "num_input_tokens_seen": 26046952, + "step": 1618 + }, + { + "epoch": 0.1134081098356493, + "grad_norm": 3.878549337387085, + "learning_rate": 8.86698353765324e-05, + "loss": 0.99, + "num_input_tokens_seen": 26063280, + "step": 1619 + }, + { + "epoch": 0.11347815808137855, + "grad_norm": 3.8016276359558105, + "learning_rate": 8.86628371278459e-05, + "loss": 1.1682, + "num_input_tokens_seen": 26079616, + "step": 1620 + }, + { + "epoch": 0.1135482063271078, + "grad_norm": 4.040102481842041, + "learning_rate": 8.865583887915937e-05, + "loss": 1.1008, + "num_input_tokens_seen": 26095232, + "step": 1621 + }, + { + "epoch": 0.11361825457283704, + "grad_norm": 3.932529926300049, + "learning_rate": 8.864884063047286e-05, + "loss": 1.1663, + "num_input_tokens_seen": 26111616, + "step": 1622 + }, + { + "epoch": 0.11368830281856629, + "grad_norm": 4.568112373352051, + "learning_rate": 8.864184238178635e-05, + "loss": 1.1932, + "num_input_tokens_seen": 26128000, + "step": 1623 + }, + { + "epoch": 0.11375835106429553, + "grad_norm": 4.23036527633667, + "learning_rate": 8.863484413309982e-05, + "loss": 1.0223, + "num_input_tokens_seen": 26144384, + "step": 1624 + }, + { + "epoch": 0.11382839931002478, + "grad_norm": 4.209012031555176, + "learning_rate": 8.862784588441331e-05, + "loss": 1.0992, + "num_input_tokens_seen": 26160768, + "step": 1625 + }, + { + "epoch": 0.11389844755575403, + "grad_norm": 3.865983724594116, + "learning_rate": 8.86208476357268e-05, + "loss": 1.1213, + "num_input_tokens_seen": 26177152, + "step": 1626 + }, + { + "epoch": 0.11396849580148327, + "grad_norm": 3.781083822250366, + "learning_rate": 8.861384938704029e-05, + "loss": 1.0132, + "num_input_tokens_seen": 26193536, + "step": 1627 + }, + { + "epoch": 0.11403854404721252, + "grad_norm": 4.330471038818359, + "learning_rate": 8.860685113835378e-05, + "loss": 0.9749, + "num_input_tokens_seen": 26208976, + "step": 1628 + }, + { + "epoch": 0.11410859229294176, + "grad_norm": 4.772238254547119, + "learning_rate": 8.859985288966725e-05, + "loss": 1.2796, + "num_input_tokens_seen": 26225360, + "step": 1629 + }, + { + "epoch": 0.11417864053867101, + "grad_norm": 4.0468668937683105, + "learning_rate": 8.859285464098074e-05, + "loss": 1.0056, + "num_input_tokens_seen": 26241744, + "step": 1630 + }, + { + "epoch": 0.11424868878440025, + "grad_norm": 3.9648735523223877, + "learning_rate": 8.858585639229422e-05, + "loss": 1.2185, + "num_input_tokens_seen": 26258128, + "step": 1631 + }, + { + "epoch": 0.1143187370301295, + "grad_norm": 4.7014079093933105, + "learning_rate": 8.85788581436077e-05, + "loss": 1.1795, + "num_input_tokens_seen": 26274512, + "step": 1632 + }, + { + "epoch": 0.11438878527585875, + "grad_norm": 4.6375627517700195, + "learning_rate": 8.85718598949212e-05, + "loss": 1.0074, + "num_input_tokens_seen": 26290008, + "step": 1633 + }, + { + "epoch": 0.11445883352158799, + "grad_norm": 4.427719593048096, + "learning_rate": 8.856486164623468e-05, + "loss": 1.2769, + "num_input_tokens_seen": 26305512, + "step": 1634 + }, + { + "epoch": 0.11452888176731724, + "grad_norm": 6.001821994781494, + "learning_rate": 8.855786339754817e-05, + "loss": 1.0606, + "num_input_tokens_seen": 26319504, + "step": 1635 + }, + { + "epoch": 0.11459893001304648, + "grad_norm": 3.970672369003296, + "learning_rate": 8.855086514886165e-05, + "loss": 1.1944, + "num_input_tokens_seen": 26335888, + "step": 1636 + }, + { + "epoch": 0.11466897825877573, + "grad_norm": 3.924450635910034, + "learning_rate": 8.854386690017514e-05, + "loss": 0.9607, + "num_input_tokens_seen": 26351536, + "step": 1637 + }, + { + "epoch": 0.11473902650450497, + "grad_norm": 4.400977611541748, + "learning_rate": 8.853686865148861e-05, + "loss": 1.0641, + "num_input_tokens_seen": 26367808, + "step": 1638 + }, + { + "epoch": 0.11480907475023422, + "grad_norm": 3.9734365940093994, + "learning_rate": 8.85298704028021e-05, + "loss": 1.2258, + "num_input_tokens_seen": 26383864, + "step": 1639 + }, + { + "epoch": 0.11487912299596346, + "grad_norm": 3.792949914932251, + "learning_rate": 8.85228721541156e-05, + "loss": 1.0401, + "num_input_tokens_seen": 26400248, + "step": 1640 + }, + { + "epoch": 0.11494917124169271, + "grad_norm": 5.14591121673584, + "learning_rate": 8.851587390542908e-05, + "loss": 1.0484, + "num_input_tokens_seen": 26416056, + "step": 1641 + }, + { + "epoch": 0.11501921948742196, + "grad_norm": 5.0158162117004395, + "learning_rate": 8.850887565674256e-05, + "loss": 1.2823, + "num_input_tokens_seen": 26431400, + "step": 1642 + }, + { + "epoch": 0.1150892677331512, + "grad_norm": 4.459201812744141, + "learning_rate": 8.850187740805605e-05, + "loss": 1.2371, + "num_input_tokens_seen": 26446920, + "step": 1643 + }, + { + "epoch": 0.11515931597888045, + "grad_norm": 3.717949867248535, + "learning_rate": 8.849487915936953e-05, + "loss": 1.1299, + "num_input_tokens_seen": 26463304, + "step": 1644 + }, + { + "epoch": 0.1152293642246097, + "grad_norm": 3.7555253505706787, + "learning_rate": 8.848788091068302e-05, + "loss": 1.0835, + "num_input_tokens_seen": 26479296, + "step": 1645 + }, + { + "epoch": 0.11529941247033895, + "grad_norm": 4.3726325035095215, + "learning_rate": 8.84808826619965e-05, + "loss": 0.9606, + "num_input_tokens_seen": 26495024, + "step": 1646 + }, + { + "epoch": 0.1153694607160682, + "grad_norm": 3.728700876235962, + "learning_rate": 8.847388441331e-05, + "loss": 1.0486, + "num_input_tokens_seen": 26511408, + "step": 1647 + }, + { + "epoch": 0.11543950896179744, + "grad_norm": 4.276855945587158, + "learning_rate": 8.846688616462347e-05, + "loss": 0.9869, + "num_input_tokens_seen": 26527688, + "step": 1648 + }, + { + "epoch": 0.11550955720752669, + "grad_norm": 5.386009693145752, + "learning_rate": 8.845988791593696e-05, + "loss": 1.0021, + "num_input_tokens_seen": 26544072, + "step": 1649 + }, + { + "epoch": 0.11557960545325593, + "grad_norm": 4.978610992431641, + "learning_rate": 8.845288966725045e-05, + "loss": 1.2531, + "num_input_tokens_seen": 26560456, + "step": 1650 + }, + { + "epoch": 0.11564965369898518, + "grad_norm": 5.325594425201416, + "learning_rate": 8.844589141856392e-05, + "loss": 0.9983, + "num_input_tokens_seen": 26576840, + "step": 1651 + }, + { + "epoch": 0.11571970194471443, + "grad_norm": 4.359868049621582, + "learning_rate": 8.843889316987741e-05, + "loss": 0.9652, + "num_input_tokens_seen": 26593224, + "step": 1652 + }, + { + "epoch": 0.11578975019044367, + "grad_norm": 7.921500205993652, + "learning_rate": 8.84318949211909e-05, + "loss": 1.0767, + "num_input_tokens_seen": 26607352, + "step": 1653 + }, + { + "epoch": 0.11585979843617292, + "grad_norm": 3.51788330078125, + "learning_rate": 8.842489667250439e-05, + "loss": 1.0677, + "num_input_tokens_seen": 26623696, + "step": 1654 + }, + { + "epoch": 0.11592984668190216, + "grad_norm": 4.120747089385986, + "learning_rate": 8.841789842381788e-05, + "loss": 1.2139, + "num_input_tokens_seen": 26639832, + "step": 1655 + }, + { + "epoch": 0.11599989492763141, + "grad_norm": 4.077361106872559, + "learning_rate": 8.841090017513135e-05, + "loss": 1.0639, + "num_input_tokens_seen": 26655432, + "step": 1656 + }, + { + "epoch": 0.11606994317336065, + "grad_norm": 3.9629955291748047, + "learning_rate": 8.840390192644484e-05, + "loss": 1.0846, + "num_input_tokens_seen": 26671816, + "step": 1657 + }, + { + "epoch": 0.1161399914190899, + "grad_norm": 3.933544635772705, + "learning_rate": 8.839690367775831e-05, + "loss": 1.1543, + "num_input_tokens_seen": 26688096, + "step": 1658 + }, + { + "epoch": 0.11621003966481915, + "grad_norm": 4.702983379364014, + "learning_rate": 8.83899054290718e-05, + "loss": 1.0699, + "num_input_tokens_seen": 26704480, + "step": 1659 + }, + { + "epoch": 0.11628008791054839, + "grad_norm": 4.536739826202393, + "learning_rate": 8.83829071803853e-05, + "loss": 1.149, + "num_input_tokens_seen": 26720864, + "step": 1660 + }, + { + "epoch": 0.11635013615627764, + "grad_norm": 4.419711589813232, + "learning_rate": 8.837590893169878e-05, + "loss": 1.1994, + "num_input_tokens_seen": 26737248, + "step": 1661 + }, + { + "epoch": 0.11642018440200688, + "grad_norm": 4.106175899505615, + "learning_rate": 8.836891068301227e-05, + "loss": 1.0682, + "num_input_tokens_seen": 26753632, + "step": 1662 + }, + { + "epoch": 0.11649023264773613, + "grad_norm": 3.469658374786377, + "learning_rate": 8.836191243432574e-05, + "loss": 1.0356, + "num_input_tokens_seen": 26769944, + "step": 1663 + }, + { + "epoch": 0.11656028089346537, + "grad_norm": 7.273227691650391, + "learning_rate": 8.835491418563923e-05, + "loss": 1.1699, + "num_input_tokens_seen": 26784520, + "step": 1664 + }, + { + "epoch": 0.11663032913919462, + "grad_norm": 3.611165761947632, + "learning_rate": 8.834791593695271e-05, + "loss": 0.8595, + "num_input_tokens_seen": 26800360, + "step": 1665 + }, + { + "epoch": 0.11670037738492386, + "grad_norm": 4.405304908752441, + "learning_rate": 8.834091768826621e-05, + "loss": 1.2055, + "num_input_tokens_seen": 26816744, + "step": 1666 + }, + { + "epoch": 0.11677042563065311, + "grad_norm": 3.897247791290283, + "learning_rate": 8.83339194395797e-05, + "loss": 0.9599, + "num_input_tokens_seen": 26832520, + "step": 1667 + }, + { + "epoch": 0.11684047387638236, + "grad_norm": 3.898019313812256, + "learning_rate": 8.832692119089317e-05, + "loss": 1.0838, + "num_input_tokens_seen": 26848080, + "step": 1668 + }, + { + "epoch": 0.1169105221221116, + "grad_norm": 4.6351542472839355, + "learning_rate": 8.831992294220666e-05, + "loss": 1.2776, + "num_input_tokens_seen": 26864464, + "step": 1669 + }, + { + "epoch": 0.11698057036784085, + "grad_norm": 4.020237922668457, + "learning_rate": 8.831292469352015e-05, + "loss": 0.9955, + "num_input_tokens_seen": 26880848, + "step": 1670 + }, + { + "epoch": 0.11705061861357009, + "grad_norm": 5.813192367553711, + "learning_rate": 8.830592644483363e-05, + "loss": 1.2867, + "num_input_tokens_seen": 26897232, + "step": 1671 + }, + { + "epoch": 0.11712066685929934, + "grad_norm": 4.058423042297363, + "learning_rate": 8.829892819614711e-05, + "loss": 1.0697, + "num_input_tokens_seen": 26912872, + "step": 1672 + }, + { + "epoch": 0.11719071510502858, + "grad_norm": 4.76987361907959, + "learning_rate": 8.82919299474606e-05, + "loss": 0.9226, + "num_input_tokens_seen": 26929256, + "step": 1673 + }, + { + "epoch": 0.11726076335075783, + "grad_norm": 3.8400967121124268, + "learning_rate": 8.828493169877409e-05, + "loss": 1.0089, + "num_input_tokens_seen": 26945624, + "step": 1674 + }, + { + "epoch": 0.11733081159648708, + "grad_norm": 4.49709415435791, + "learning_rate": 8.827793345008757e-05, + "loss": 1.0898, + "num_input_tokens_seen": 26961464, + "step": 1675 + }, + { + "epoch": 0.11740085984221632, + "grad_norm": 4.143093109130859, + "learning_rate": 8.827093520140105e-05, + "loss": 1.0493, + "num_input_tokens_seen": 26976720, + "step": 1676 + }, + { + "epoch": 0.11747090808794557, + "grad_norm": 4.138030529022217, + "learning_rate": 8.826393695271454e-05, + "loss": 1.1555, + "num_input_tokens_seen": 26993056, + "step": 1677 + }, + { + "epoch": 0.11754095633367481, + "grad_norm": 3.8191847801208496, + "learning_rate": 8.825693870402802e-05, + "loss": 1.0993, + "num_input_tokens_seen": 27009440, + "step": 1678 + }, + { + "epoch": 0.11761100457940406, + "grad_norm": 3.8392176628112793, + "learning_rate": 8.824994045534151e-05, + "loss": 1.1067, + "num_input_tokens_seen": 27024880, + "step": 1679 + }, + { + "epoch": 0.11768105282513332, + "grad_norm": 4.468568801879883, + "learning_rate": 8.8242942206655e-05, + "loss": 1.1424, + "num_input_tokens_seen": 27040672, + "step": 1680 + }, + { + "epoch": 0.11775110107086256, + "grad_norm": 3.6515510082244873, + "learning_rate": 8.823594395796848e-05, + "loss": 1.0659, + "num_input_tokens_seen": 27057056, + "step": 1681 + }, + { + "epoch": 0.11782114931659181, + "grad_norm": 4.479739189147949, + "learning_rate": 8.822894570928197e-05, + "loss": 1.0399, + "num_input_tokens_seen": 27073440, + "step": 1682 + }, + { + "epoch": 0.11789119756232105, + "grad_norm": 3.762479782104492, + "learning_rate": 8.822194746059545e-05, + "loss": 1.1041, + "num_input_tokens_seen": 27089824, + "step": 1683 + }, + { + "epoch": 0.1179612458080503, + "grad_norm": 4.694389343261719, + "learning_rate": 8.821494921190894e-05, + "loss": 1.2785, + "num_input_tokens_seen": 27106208, + "step": 1684 + }, + { + "epoch": 0.11803129405377955, + "grad_norm": 3.738931179046631, + "learning_rate": 8.820795096322241e-05, + "loss": 0.9039, + "num_input_tokens_seen": 27122352, + "step": 1685 + }, + { + "epoch": 0.11810134229950879, + "grad_norm": 4.065624237060547, + "learning_rate": 8.820095271453591e-05, + "loss": 1.0048, + "num_input_tokens_seen": 27138160, + "step": 1686 + }, + { + "epoch": 0.11817139054523804, + "grad_norm": 3.5373826026916504, + "learning_rate": 8.81939544658494e-05, + "loss": 0.8786, + "num_input_tokens_seen": 27154544, + "step": 1687 + }, + { + "epoch": 0.11824143879096728, + "grad_norm": 3.773066282272339, + "learning_rate": 8.818695621716288e-05, + "loss": 1.0043, + "num_input_tokens_seen": 27170928, + "step": 1688 + }, + { + "epoch": 0.11831148703669653, + "grad_norm": 3.3876242637634277, + "learning_rate": 8.817995796847637e-05, + "loss": 0.9909, + "num_input_tokens_seen": 27187312, + "step": 1689 + }, + { + "epoch": 0.11838153528242577, + "grad_norm": 4.526343822479248, + "learning_rate": 8.817295971978984e-05, + "loss": 1.0899, + "num_input_tokens_seen": 27202208, + "step": 1690 + }, + { + "epoch": 0.11845158352815502, + "grad_norm": 4.691114902496338, + "learning_rate": 8.816596147110333e-05, + "loss": 1.0823, + "num_input_tokens_seen": 27218592, + "step": 1691 + }, + { + "epoch": 0.11852163177388426, + "grad_norm": 3.90531849861145, + "learning_rate": 8.815896322241682e-05, + "loss": 1.1438, + "num_input_tokens_seen": 27234976, + "step": 1692 + }, + { + "epoch": 0.11859168001961351, + "grad_norm": 3.5546317100524902, + "learning_rate": 8.81519649737303e-05, + "loss": 1.0326, + "num_input_tokens_seen": 27251360, + "step": 1693 + }, + { + "epoch": 0.11866172826534276, + "grad_norm": 5.117360591888428, + "learning_rate": 8.81449667250438e-05, + "loss": 1.1921, + "num_input_tokens_seen": 27267744, + "step": 1694 + }, + { + "epoch": 0.118731776511072, + "grad_norm": 4.055267810821533, + "learning_rate": 8.813796847635727e-05, + "loss": 1.0607, + "num_input_tokens_seen": 27283688, + "step": 1695 + }, + { + "epoch": 0.11880182475680125, + "grad_norm": 4.04268741607666, + "learning_rate": 8.813097022767076e-05, + "loss": 1.1862, + "num_input_tokens_seen": 27300072, + "step": 1696 + }, + { + "epoch": 0.11887187300253049, + "grad_norm": 4.048800945281982, + "learning_rate": 8.812397197898425e-05, + "loss": 0.9231, + "num_input_tokens_seen": 27316456, + "step": 1697 + }, + { + "epoch": 0.11894192124825974, + "grad_norm": 4.445494174957275, + "learning_rate": 8.811697373029772e-05, + "loss": 1.241, + "num_input_tokens_seen": 27332464, + "step": 1698 + }, + { + "epoch": 0.11901196949398898, + "grad_norm": 4.522054672241211, + "learning_rate": 8.810997548161121e-05, + "loss": 1.3945, + "num_input_tokens_seen": 27348848, + "step": 1699 + }, + { + "epoch": 0.11908201773971823, + "grad_norm": 4.106349468231201, + "learning_rate": 8.81029772329247e-05, + "loss": 1.1457, + "num_input_tokens_seen": 27365232, + "step": 1700 + }, + { + "epoch": 0.11915206598544748, + "grad_norm": 6.059356689453125, + "learning_rate": 8.809597898423819e-05, + "loss": 1.3381, + "num_input_tokens_seen": 27380448, + "step": 1701 + }, + { + "epoch": 0.11922211423117672, + "grad_norm": 3.8089959621429443, + "learning_rate": 8.808898073555166e-05, + "loss": 1.0699, + "num_input_tokens_seen": 27396832, + "step": 1702 + }, + { + "epoch": 0.11929216247690597, + "grad_norm": 4.21024227142334, + "learning_rate": 8.808198248686515e-05, + "loss": 1.306, + "num_input_tokens_seen": 27413096, + "step": 1703 + }, + { + "epoch": 0.11936221072263521, + "grad_norm": 4.286004066467285, + "learning_rate": 8.807498423817864e-05, + "loss": 1.2325, + "num_input_tokens_seen": 27429480, + "step": 1704 + }, + { + "epoch": 0.11943225896836446, + "grad_norm": 3.512561559677124, + "learning_rate": 8.806798598949212e-05, + "loss": 0.8804, + "num_input_tokens_seen": 27445864, + "step": 1705 + }, + { + "epoch": 0.1195023072140937, + "grad_norm": 4.096526145935059, + "learning_rate": 8.806098774080562e-05, + "loss": 1.0591, + "num_input_tokens_seen": 27462248, + "step": 1706 + }, + { + "epoch": 0.11957235545982295, + "grad_norm": 5.032350063323975, + "learning_rate": 8.805398949211909e-05, + "loss": 0.8948, + "num_input_tokens_seen": 27478312, + "step": 1707 + }, + { + "epoch": 0.1196424037055522, + "grad_norm": 4.756420612335205, + "learning_rate": 8.804699124343258e-05, + "loss": 1.0584, + "num_input_tokens_seen": 27494696, + "step": 1708 + }, + { + "epoch": 0.11971245195128144, + "grad_norm": 4.869518756866455, + "learning_rate": 8.803999299474607e-05, + "loss": 0.9394, + "num_input_tokens_seen": 27511080, + "step": 1709 + }, + { + "epoch": 0.11978250019701069, + "grad_norm": 3.451759099960327, + "learning_rate": 8.803299474605954e-05, + "loss": 0.9171, + "num_input_tokens_seen": 27527328, + "step": 1710 + }, + { + "epoch": 0.11985254844273993, + "grad_norm": 4.247021675109863, + "learning_rate": 8.802599649737303e-05, + "loss": 1.1204, + "num_input_tokens_seen": 27543712, + "step": 1711 + }, + { + "epoch": 0.11992259668846918, + "grad_norm": 4.597024917602539, + "learning_rate": 8.801899824868652e-05, + "loss": 1.196, + "num_input_tokens_seen": 27560096, + "step": 1712 + }, + { + "epoch": 0.11999264493419842, + "grad_norm": 4.242952823638916, + "learning_rate": 8.801200000000001e-05, + "loss": 1.1747, + "num_input_tokens_seen": 27576320, + "step": 1713 + }, + { + "epoch": 0.12006269317992768, + "grad_norm": 5.1166486740112305, + "learning_rate": 8.80050017513135e-05, + "loss": 1.4222, + "num_input_tokens_seen": 27591024, + "step": 1714 + }, + { + "epoch": 0.12013274142565693, + "grad_norm": 4.6713714599609375, + "learning_rate": 8.799800350262697e-05, + "loss": 1.1869, + "num_input_tokens_seen": 27606352, + "step": 1715 + }, + { + "epoch": 0.12020278967138617, + "grad_norm": 4.62678861618042, + "learning_rate": 8.799100525394046e-05, + "loss": 1.1524, + "num_input_tokens_seen": 27622736, + "step": 1716 + }, + { + "epoch": 0.12027283791711542, + "grad_norm": 3.611985206604004, + "learning_rate": 8.798400700525394e-05, + "loss": 1.1179, + "num_input_tokens_seen": 27639120, + "step": 1717 + }, + { + "epoch": 0.12034288616284466, + "grad_norm": 4.165099143981934, + "learning_rate": 8.797700875656743e-05, + "loss": 1.0104, + "num_input_tokens_seen": 27654024, + "step": 1718 + }, + { + "epoch": 0.12041293440857391, + "grad_norm": 4.532061576843262, + "learning_rate": 8.797001050788091e-05, + "loss": 1.05, + "num_input_tokens_seen": 27670408, + "step": 1719 + }, + { + "epoch": 0.12048298265430316, + "grad_norm": 4.880197048187256, + "learning_rate": 8.79630122591944e-05, + "loss": 1.0321, + "num_input_tokens_seen": 27686792, + "step": 1720 + }, + { + "epoch": 0.1205530309000324, + "grad_norm": 3.521052360534668, + "learning_rate": 8.795601401050789e-05, + "loss": 0.9048, + "num_input_tokens_seen": 27703176, + "step": 1721 + }, + { + "epoch": 0.12062307914576165, + "grad_norm": 3.965725898742676, + "learning_rate": 8.794901576182137e-05, + "loss": 1.1348, + "num_input_tokens_seen": 27719024, + "step": 1722 + }, + { + "epoch": 0.12069312739149089, + "grad_norm": 3.936962842941284, + "learning_rate": 8.794201751313486e-05, + "loss": 1.1531, + "num_input_tokens_seen": 27734736, + "step": 1723 + }, + { + "epoch": 0.12076317563722014, + "grad_norm": 5.225526332855225, + "learning_rate": 8.793501926444834e-05, + "loss": 1.2784, + "num_input_tokens_seen": 27751120, + "step": 1724 + }, + { + "epoch": 0.12083322388294938, + "grad_norm": 4.125289440155029, + "learning_rate": 8.792802101576182e-05, + "loss": 1.1893, + "num_input_tokens_seen": 27767288, + "step": 1725 + }, + { + "epoch": 0.12090327212867863, + "grad_norm": 3.9352405071258545, + "learning_rate": 8.792102276707532e-05, + "loss": 1.1867, + "num_input_tokens_seen": 27783672, + "step": 1726 + }, + { + "epoch": 0.12097332037440787, + "grad_norm": 3.908578634262085, + "learning_rate": 8.79140245183888e-05, + "loss": 1.0024, + "num_input_tokens_seen": 27799640, + "step": 1727 + }, + { + "epoch": 0.12104336862013712, + "grad_norm": 3.694387435913086, + "learning_rate": 8.790702626970229e-05, + "loss": 1.0652, + "num_input_tokens_seen": 27816024, + "step": 1728 + }, + { + "epoch": 0.12111341686586637, + "grad_norm": 4.0100016593933105, + "learning_rate": 8.790002802101576e-05, + "loss": 1.0511, + "num_input_tokens_seen": 27832408, + "step": 1729 + }, + { + "epoch": 0.12118346511159561, + "grad_norm": 5.454882621765137, + "learning_rate": 8.789302977232925e-05, + "loss": 1.1096, + "num_input_tokens_seen": 27848792, + "step": 1730 + }, + { + "epoch": 0.12125351335732486, + "grad_norm": 5.065526485443115, + "learning_rate": 8.788603152364274e-05, + "loss": 1.0354, + "num_input_tokens_seen": 27864688, + "step": 1731 + }, + { + "epoch": 0.1213235616030541, + "grad_norm": 3.73103666305542, + "learning_rate": 8.787903327495623e-05, + "loss": 1.0328, + "num_input_tokens_seen": 27881072, + "step": 1732 + }, + { + "epoch": 0.12139360984878335, + "grad_norm": 3.971198081970215, + "learning_rate": 8.787203502626971e-05, + "loss": 1.1908, + "num_input_tokens_seen": 27896912, + "step": 1733 + }, + { + "epoch": 0.1214636580945126, + "grad_norm": 3.933809518814087, + "learning_rate": 8.786503677758319e-05, + "loss": 1.1125, + "num_input_tokens_seen": 27913104, + "step": 1734 + }, + { + "epoch": 0.12153370634024184, + "grad_norm": 3.92167329788208, + "learning_rate": 8.785803852889668e-05, + "loss": 1.0007, + "num_input_tokens_seen": 27929488, + "step": 1735 + }, + { + "epoch": 0.12160375458597109, + "grad_norm": 4.441089630126953, + "learning_rate": 8.785104028021017e-05, + "loss": 0.9748, + "num_input_tokens_seen": 27945504, + "step": 1736 + }, + { + "epoch": 0.12167380283170033, + "grad_norm": 4.023623466491699, + "learning_rate": 8.784404203152364e-05, + "loss": 0.8826, + "num_input_tokens_seen": 27961888, + "step": 1737 + }, + { + "epoch": 0.12174385107742958, + "grad_norm": 4.0328826904296875, + "learning_rate": 8.783704378283713e-05, + "loss": 1.2769, + "num_input_tokens_seen": 27978024, + "step": 1738 + }, + { + "epoch": 0.12181389932315882, + "grad_norm": 4.5445733070373535, + "learning_rate": 8.783004553415062e-05, + "loss": 1.3745, + "num_input_tokens_seen": 27993840, + "step": 1739 + }, + { + "epoch": 0.12188394756888807, + "grad_norm": 3.609834671020508, + "learning_rate": 8.782304728546411e-05, + "loss": 0.916, + "num_input_tokens_seen": 28010224, + "step": 1740 + }, + { + "epoch": 0.12195399581461731, + "grad_norm": 3.849306344985962, + "learning_rate": 8.78160490367776e-05, + "loss": 1.1135, + "num_input_tokens_seen": 28026232, + "step": 1741 + }, + { + "epoch": 0.12202404406034656, + "grad_norm": 4.11102294921875, + "learning_rate": 8.780905078809107e-05, + "loss": 1.2269, + "num_input_tokens_seen": 28041880, + "step": 1742 + }, + { + "epoch": 0.1220940923060758, + "grad_norm": 4.156986713409424, + "learning_rate": 8.780205253940456e-05, + "loss": 1.0321, + "num_input_tokens_seen": 28058264, + "step": 1743 + }, + { + "epoch": 0.12216414055180505, + "grad_norm": 3.9670159816741943, + "learning_rate": 8.779505429071803e-05, + "loss": 0.9752, + "num_input_tokens_seen": 28073168, + "step": 1744 + }, + { + "epoch": 0.1222341887975343, + "grad_norm": 5.342650890350342, + "learning_rate": 8.778805604203152e-05, + "loss": 1.1416, + "num_input_tokens_seen": 28089552, + "step": 1745 + }, + { + "epoch": 0.12230423704326354, + "grad_norm": 4.031285285949707, + "learning_rate": 8.778105779334501e-05, + "loss": 1.1134, + "num_input_tokens_seen": 28105264, + "step": 1746 + }, + { + "epoch": 0.12237428528899279, + "grad_norm": 3.5976450443267822, + "learning_rate": 8.77740595446585e-05, + "loss": 1.0342, + "num_input_tokens_seen": 28121648, + "step": 1747 + }, + { + "epoch": 0.12244433353472203, + "grad_norm": 4.947859764099121, + "learning_rate": 8.776706129597199e-05, + "loss": 1.0809, + "num_input_tokens_seen": 28137640, + "step": 1748 + }, + { + "epoch": 0.12251438178045129, + "grad_norm": 4.004949569702148, + "learning_rate": 8.776006304728546e-05, + "loss": 1.0921, + "num_input_tokens_seen": 28154024, + "step": 1749 + }, + { + "epoch": 0.12258443002618054, + "grad_norm": 3.9022445678710938, + "learning_rate": 8.775306479859895e-05, + "loss": 1.0844, + "num_input_tokens_seen": 28170408, + "step": 1750 + }, + { + "epoch": 0.12265447827190978, + "grad_norm": 4.171925067901611, + "learning_rate": 8.774606654991244e-05, + "loss": 1.1894, + "num_input_tokens_seen": 28186792, + "step": 1751 + }, + { + "epoch": 0.12272452651763903, + "grad_norm": 3.9387433528900146, + "learning_rate": 8.773906830122592e-05, + "loss": 1.0303, + "num_input_tokens_seen": 28203176, + "step": 1752 + }, + { + "epoch": 0.12279457476336827, + "grad_norm": 5.067278861999512, + "learning_rate": 8.773207005253942e-05, + "loss": 1.1924, + "num_input_tokens_seen": 28219192, + "step": 1753 + }, + { + "epoch": 0.12286462300909752, + "grad_norm": 3.673807144165039, + "learning_rate": 8.77250718038529e-05, + "loss": 1.0438, + "num_input_tokens_seen": 28235576, + "step": 1754 + }, + { + "epoch": 0.12293467125482677, + "grad_norm": 5.303588390350342, + "learning_rate": 8.771807355516638e-05, + "loss": 1.2601, + "num_input_tokens_seen": 28251960, + "step": 1755 + }, + { + "epoch": 0.12300471950055601, + "grad_norm": 5.343825340270996, + "learning_rate": 8.771107530647986e-05, + "loss": 1.1126, + "num_input_tokens_seen": 28268344, + "step": 1756 + }, + { + "epoch": 0.12307476774628526, + "grad_norm": 4.125874996185303, + "learning_rate": 8.770407705779335e-05, + "loss": 1.1497, + "num_input_tokens_seen": 28284144, + "step": 1757 + }, + { + "epoch": 0.1231448159920145, + "grad_norm": 4.628546714782715, + "learning_rate": 8.769707880910683e-05, + "loss": 1.1757, + "num_input_tokens_seen": 28299896, + "step": 1758 + }, + { + "epoch": 0.12321486423774375, + "grad_norm": 3.946603775024414, + "learning_rate": 8.769008056042032e-05, + "loss": 1.2739, + "num_input_tokens_seen": 28316280, + "step": 1759 + }, + { + "epoch": 0.123284912483473, + "grad_norm": 3.4837770462036133, + "learning_rate": 8.768308231173381e-05, + "loss": 0.9682, + "num_input_tokens_seen": 28332128, + "step": 1760 + }, + { + "epoch": 0.12335496072920224, + "grad_norm": 3.9601573944091797, + "learning_rate": 8.767608406304729e-05, + "loss": 1.2647, + "num_input_tokens_seen": 28347488, + "step": 1761 + }, + { + "epoch": 0.12342500897493149, + "grad_norm": 4.178001403808594, + "learning_rate": 8.766908581436078e-05, + "loss": 1.0055, + "num_input_tokens_seen": 28363872, + "step": 1762 + }, + { + "epoch": 0.12349505722066073, + "grad_norm": 3.9182498455047607, + "learning_rate": 8.766208756567426e-05, + "loss": 1.1407, + "num_input_tokens_seen": 28380208, + "step": 1763 + }, + { + "epoch": 0.12356510546638998, + "grad_norm": 4.071939468383789, + "learning_rate": 8.765508931698774e-05, + "loss": 1.3196, + "num_input_tokens_seen": 28396592, + "step": 1764 + }, + { + "epoch": 0.12363515371211922, + "grad_norm": 4.657908916473389, + "learning_rate": 8.764809106830123e-05, + "loss": 1.0739, + "num_input_tokens_seen": 28412976, + "step": 1765 + }, + { + "epoch": 0.12370520195784847, + "grad_norm": 3.9706201553344727, + "learning_rate": 8.764109281961472e-05, + "loss": 1.0904, + "num_input_tokens_seen": 28429088, + "step": 1766 + }, + { + "epoch": 0.12377525020357771, + "grad_norm": 4.571341514587402, + "learning_rate": 8.76340945709282e-05, + "loss": 1.1314, + "num_input_tokens_seen": 28445472, + "step": 1767 + }, + { + "epoch": 0.12384529844930696, + "grad_norm": 4.197002410888672, + "learning_rate": 8.762709632224169e-05, + "loss": 0.8251, + "num_input_tokens_seen": 28461656, + "step": 1768 + }, + { + "epoch": 0.1239153466950362, + "grad_norm": 5.376040935516357, + "learning_rate": 8.762009807355517e-05, + "loss": 1.1626, + "num_input_tokens_seen": 28477088, + "step": 1769 + }, + { + "epoch": 0.12398539494076545, + "grad_norm": 3.987495183944702, + "learning_rate": 8.761309982486866e-05, + "loss": 1.2449, + "num_input_tokens_seen": 28493472, + "step": 1770 + }, + { + "epoch": 0.1240554431864947, + "grad_norm": 4.379208564758301, + "learning_rate": 8.760610157618213e-05, + "loss": 1.2834, + "num_input_tokens_seen": 28509856, + "step": 1771 + }, + { + "epoch": 0.12412549143222394, + "grad_norm": 3.7258729934692383, + "learning_rate": 8.759910332749562e-05, + "loss": 1.1115, + "num_input_tokens_seen": 28525664, + "step": 1772 + }, + { + "epoch": 0.12419553967795319, + "grad_norm": 4.0574774742126465, + "learning_rate": 8.759210507880911e-05, + "loss": 1.1005, + "num_input_tokens_seen": 28541920, + "step": 1773 + }, + { + "epoch": 0.12426558792368243, + "grad_norm": 3.8423895835876465, + "learning_rate": 8.75851068301226e-05, + "loss": 1.1067, + "num_input_tokens_seen": 28558216, + "step": 1774 + }, + { + "epoch": 0.12433563616941168, + "grad_norm": 3.8898398876190186, + "learning_rate": 8.757810858143609e-05, + "loss": 1.1963, + "num_input_tokens_seen": 28574536, + "step": 1775 + }, + { + "epoch": 0.12440568441514092, + "grad_norm": 3.286412000656128, + "learning_rate": 8.757111033274956e-05, + "loss": 0.9159, + "num_input_tokens_seen": 28590920, + "step": 1776 + }, + { + "epoch": 0.12447573266087017, + "grad_norm": 3.7219464778900146, + "learning_rate": 8.756411208406305e-05, + "loss": 1.0883, + "num_input_tokens_seen": 28607192, + "step": 1777 + }, + { + "epoch": 0.12454578090659942, + "grad_norm": 3.8907012939453125, + "learning_rate": 8.755711383537654e-05, + "loss": 1.0226, + "num_input_tokens_seen": 28623176, + "step": 1778 + }, + { + "epoch": 0.12461582915232866, + "grad_norm": 3.8087925910949707, + "learning_rate": 8.755011558669003e-05, + "loss": 1.0115, + "num_input_tokens_seen": 28639528, + "step": 1779 + }, + { + "epoch": 0.1246858773980579, + "grad_norm": 4.8956217765808105, + "learning_rate": 8.754311733800352e-05, + "loss": 1.0108, + "num_input_tokens_seen": 28654976, + "step": 1780 + }, + { + "epoch": 0.12475592564378715, + "grad_norm": 3.7400572299957275, + "learning_rate": 8.753611908931699e-05, + "loss": 0.8787, + "num_input_tokens_seen": 28671064, + "step": 1781 + }, + { + "epoch": 0.1248259738895164, + "grad_norm": 4.689199924468994, + "learning_rate": 8.752912084063048e-05, + "loss": 1.2326, + "num_input_tokens_seen": 28686664, + "step": 1782 + }, + { + "epoch": 0.12489602213524566, + "grad_norm": 3.6594929695129395, + "learning_rate": 8.752212259194395e-05, + "loss": 1.1626, + "num_input_tokens_seen": 28703048, + "step": 1783 + }, + { + "epoch": 0.1249660703809749, + "grad_norm": 4.6070356369018555, + "learning_rate": 8.751512434325744e-05, + "loss": 1.358, + "num_input_tokens_seen": 28719000, + "step": 1784 + }, + { + "epoch": 0.12503611862670413, + "grad_norm": 4.658362865447998, + "learning_rate": 8.750812609457093e-05, + "loss": 1.2852, + "num_input_tokens_seen": 28735384, + "step": 1785 + }, + { + "epoch": 0.1251061668724334, + "grad_norm": 3.6963465213775635, + "learning_rate": 8.750112784588442e-05, + "loss": 1.1068, + "num_input_tokens_seen": 28750856, + "step": 1786 + }, + { + "epoch": 0.12517621511816263, + "grad_norm": 4.419562816619873, + "learning_rate": 8.749412959719791e-05, + "loss": 1.1559, + "num_input_tokens_seen": 28766824, + "step": 1787 + }, + { + "epoch": 0.12524626336389189, + "grad_norm": 4.601676940917969, + "learning_rate": 8.74871313485114e-05, + "loss": 1.0642, + "num_input_tokens_seen": 28783208, + "step": 1788 + }, + { + "epoch": 0.12531631160962112, + "grad_norm": 3.8597445487976074, + "learning_rate": 8.748013309982487e-05, + "loss": 1.1149, + "num_input_tokens_seen": 28799160, + "step": 1789 + }, + { + "epoch": 0.12538635985535038, + "grad_norm": 3.654649257659912, + "learning_rate": 8.747313485113835e-05, + "loss": 1.3127, + "num_input_tokens_seen": 28815440, + "step": 1790 + }, + { + "epoch": 0.1254564081010796, + "grad_norm": 4.043321132659912, + "learning_rate": 8.746613660245184e-05, + "loss": 1.0844, + "num_input_tokens_seen": 28831824, + "step": 1791 + }, + { + "epoch": 0.12552645634680887, + "grad_norm": 4.5223894119262695, + "learning_rate": 8.745913835376532e-05, + "loss": 1.0627, + "num_input_tokens_seen": 28846984, + "step": 1792 + }, + { + "epoch": 0.1255965045925381, + "grad_norm": 4.074361801147461, + "learning_rate": 8.745214010507881e-05, + "loss": 0.9772, + "num_input_tokens_seen": 28863368, + "step": 1793 + }, + { + "epoch": 0.12566655283826736, + "grad_norm": 4.661183834075928, + "learning_rate": 8.74451418563923e-05, + "loss": 1.152, + "num_input_tokens_seen": 28879752, + "step": 1794 + }, + { + "epoch": 0.1257366010839966, + "grad_norm": 3.95831561088562, + "learning_rate": 8.743814360770579e-05, + "loss": 1.117, + "num_input_tokens_seen": 28895728, + "step": 1795 + }, + { + "epoch": 0.12580664932972585, + "grad_norm": 4.271726131439209, + "learning_rate": 8.743114535901927e-05, + "loss": 1.0935, + "num_input_tokens_seen": 28912112, + "step": 1796 + }, + { + "epoch": 0.12587669757545508, + "grad_norm": 4.079075336456299, + "learning_rate": 8.742414711033275e-05, + "loss": 1.1397, + "num_input_tokens_seen": 28928496, + "step": 1797 + }, + { + "epoch": 0.12594674582118434, + "grad_norm": 4.030980587005615, + "learning_rate": 8.741714886164623e-05, + "loss": 0.9405, + "num_input_tokens_seen": 28943968, + "step": 1798 + }, + { + "epoch": 0.12601679406691357, + "grad_norm": 3.7285454273223877, + "learning_rate": 8.741015061295973e-05, + "loss": 0.8448, + "num_input_tokens_seen": 28959800, + "step": 1799 + }, + { + "epoch": 0.12608684231264283, + "grad_norm": 3.964663028717041, + "learning_rate": 8.74031523642732e-05, + "loss": 1.1614, + "num_input_tokens_seen": 28976184, + "step": 1800 + }, + { + "epoch": 0.12608684231264283, + "eval_loss": 1.1493111848831177, + "eval_runtime": 0.196, + "eval_samples_per_second": 5.102, + "eval_steps_per_second": 5.102, + "num_input_tokens_seen": 28976184, + "step": 1800 + }, + { + "epoch": 0.1261568905583721, + "grad_norm": 4.2887396812438965, + "learning_rate": 8.73961541155867e-05, + "loss": 0.9047, + "num_input_tokens_seen": 28992552, + "step": 1801 + }, + { + "epoch": 0.12622693880410132, + "grad_norm": 5.139194011688232, + "learning_rate": 8.738915586690018e-05, + "loss": 1.1656, + "num_input_tokens_seen": 29007480, + "step": 1802 + }, + { + "epoch": 0.12629698704983058, + "grad_norm": 4.023421287536621, + "learning_rate": 8.738215761821366e-05, + "loss": 1.0585, + "num_input_tokens_seen": 29023864, + "step": 1803 + }, + { + "epoch": 0.12636703529555982, + "grad_norm": 3.6131162643432617, + "learning_rate": 8.737515936952715e-05, + "loss": 1.0964, + "num_input_tokens_seen": 29039640, + "step": 1804 + }, + { + "epoch": 0.12643708354128907, + "grad_norm": 4.477705478668213, + "learning_rate": 8.736816112084064e-05, + "loss": 0.8054, + "num_input_tokens_seen": 29055816, + "step": 1805 + }, + { + "epoch": 0.1265071317870183, + "grad_norm": 3.7637252807617188, + "learning_rate": 8.736116287215412e-05, + "loss": 1.0389, + "num_input_tokens_seen": 29071456, + "step": 1806 + }, + { + "epoch": 0.12657718003274757, + "grad_norm": 3.9611611366271973, + "learning_rate": 8.735416462346761e-05, + "loss": 1.1907, + "num_input_tokens_seen": 29087840, + "step": 1807 + }, + { + "epoch": 0.1266472282784768, + "grad_norm": 3.6022791862487793, + "learning_rate": 8.734716637478109e-05, + "loss": 0.9538, + "num_input_tokens_seen": 29104224, + "step": 1808 + }, + { + "epoch": 0.12671727652420606, + "grad_norm": 3.7403485774993896, + "learning_rate": 8.734016812609458e-05, + "loss": 1.12, + "num_input_tokens_seen": 29120608, + "step": 1809 + }, + { + "epoch": 0.1267873247699353, + "grad_norm": 3.5624709129333496, + "learning_rate": 8.733316987740805e-05, + "loss": 1.0931, + "num_input_tokens_seen": 29136840, + "step": 1810 + }, + { + "epoch": 0.12685737301566455, + "grad_norm": 3.961516857147217, + "learning_rate": 8.732617162872154e-05, + "loss": 0.9529, + "num_input_tokens_seen": 29153224, + "step": 1811 + }, + { + "epoch": 0.12692742126139378, + "grad_norm": 4.895046234130859, + "learning_rate": 8.731917338003503e-05, + "loss": 1.0697, + "num_input_tokens_seen": 29168336, + "step": 1812 + }, + { + "epoch": 0.12699746950712304, + "grad_norm": 4.290217876434326, + "learning_rate": 8.731217513134852e-05, + "loss": 0.8945, + "num_input_tokens_seen": 29184720, + "step": 1813 + }, + { + "epoch": 0.12706751775285227, + "grad_norm": 3.6602399349212646, + "learning_rate": 8.7305176882662e-05, + "loss": 1.0465, + "num_input_tokens_seen": 29200920, + "step": 1814 + }, + { + "epoch": 0.12713756599858153, + "grad_norm": 3.7980921268463135, + "learning_rate": 8.72981786339755e-05, + "loss": 0.8915, + "num_input_tokens_seen": 29217304, + "step": 1815 + }, + { + "epoch": 0.12720761424431076, + "grad_norm": 3.646242141723633, + "learning_rate": 8.729118038528897e-05, + "loss": 1.0058, + "num_input_tokens_seen": 29233688, + "step": 1816 + }, + { + "epoch": 0.12727766249004002, + "grad_norm": 5.226564884185791, + "learning_rate": 8.728418213660244e-05, + "loss": 0.9569, + "num_input_tokens_seen": 29247896, + "step": 1817 + }, + { + "epoch": 0.12734771073576925, + "grad_norm": 3.8191912174224854, + "learning_rate": 8.727718388791593e-05, + "loss": 1.1548, + "num_input_tokens_seen": 29263896, + "step": 1818 + }, + { + "epoch": 0.1274177589814985, + "grad_norm": 4.349045276641846, + "learning_rate": 8.727018563922944e-05, + "loss": 1.1368, + "num_input_tokens_seen": 29280224, + "step": 1819 + }, + { + "epoch": 0.12748780722722775, + "grad_norm": 3.842888116836548, + "learning_rate": 8.726318739054291e-05, + "loss": 1.0052, + "num_input_tokens_seen": 29296608, + "step": 1820 + }, + { + "epoch": 0.127557855472957, + "grad_norm": 3.8854012489318848, + "learning_rate": 8.72561891418564e-05, + "loss": 1.0584, + "num_input_tokens_seen": 29312992, + "step": 1821 + }, + { + "epoch": 0.12762790371868624, + "grad_norm": 4.102949619293213, + "learning_rate": 8.724919089316989e-05, + "loss": 0.9004, + "num_input_tokens_seen": 29328416, + "step": 1822 + }, + { + "epoch": 0.1276979519644155, + "grad_norm": 5.0174336433410645, + "learning_rate": 8.724219264448336e-05, + "loss": 1.0837, + "num_input_tokens_seen": 29344800, + "step": 1823 + }, + { + "epoch": 0.12776800021014473, + "grad_norm": 3.6122186183929443, + "learning_rate": 8.723519439579685e-05, + "loss": 0.924, + "num_input_tokens_seen": 29361184, + "step": 1824 + }, + { + "epoch": 0.127838048455874, + "grad_norm": 4.086683750152588, + "learning_rate": 8.722819614711034e-05, + "loss": 1.0945, + "num_input_tokens_seen": 29376840, + "step": 1825 + }, + { + "epoch": 0.12790809670160322, + "grad_norm": 4.279770851135254, + "learning_rate": 8.722119789842383e-05, + "loss": 0.9831, + "num_input_tokens_seen": 29393016, + "step": 1826 + }, + { + "epoch": 0.12797814494733248, + "grad_norm": 5.032819747924805, + "learning_rate": 8.72141996497373e-05, + "loss": 1.1691, + "num_input_tokens_seen": 29409400, + "step": 1827 + }, + { + "epoch": 0.1280481931930617, + "grad_norm": 4.480144023895264, + "learning_rate": 8.720720140105079e-05, + "loss": 1.1481, + "num_input_tokens_seen": 29425472, + "step": 1828 + }, + { + "epoch": 0.12811824143879097, + "grad_norm": 3.6843478679656982, + "learning_rate": 8.720020315236428e-05, + "loss": 1.1302, + "num_input_tokens_seen": 29441472, + "step": 1829 + }, + { + "epoch": 0.1281882896845202, + "grad_norm": 3.7091941833496094, + "learning_rate": 8.719320490367776e-05, + "loss": 1.0351, + "num_input_tokens_seen": 29457600, + "step": 1830 + }, + { + "epoch": 0.12825833793024946, + "grad_norm": 4.122303009033203, + "learning_rate": 8.718620665499124e-05, + "loss": 1.0791, + "num_input_tokens_seen": 29473984, + "step": 1831 + }, + { + "epoch": 0.1283283861759787, + "grad_norm": 5.282047748565674, + "learning_rate": 8.717920840630473e-05, + "loss": 1.4479, + "num_input_tokens_seen": 29490336, + "step": 1832 + }, + { + "epoch": 0.12839843442170795, + "grad_norm": 4.0706586837768555, + "learning_rate": 8.717221015761822e-05, + "loss": 1.0026, + "num_input_tokens_seen": 29506432, + "step": 1833 + }, + { + "epoch": 0.12846848266743718, + "grad_norm": 3.856018543243408, + "learning_rate": 8.716521190893171e-05, + "loss": 1.0545, + "num_input_tokens_seen": 29521744, + "step": 1834 + }, + { + "epoch": 0.12853853091316644, + "grad_norm": 3.7059905529022217, + "learning_rate": 8.715821366024518e-05, + "loss": 0.9876, + "num_input_tokens_seen": 29537104, + "step": 1835 + }, + { + "epoch": 0.1286085791588957, + "grad_norm": 3.915038585662842, + "learning_rate": 8.715121541155867e-05, + "loss": 1.2072, + "num_input_tokens_seen": 29552928, + "step": 1836 + }, + { + "epoch": 0.12867862740462493, + "grad_norm": 3.6828839778900146, + "learning_rate": 8.714421716287215e-05, + "loss": 0.9849, + "num_input_tokens_seen": 29569312, + "step": 1837 + }, + { + "epoch": 0.1287486756503542, + "grad_norm": 4.3285441398620605, + "learning_rate": 8.713721891418564e-05, + "loss": 1.2812, + "num_input_tokens_seen": 29584376, + "step": 1838 + }, + { + "epoch": 0.12881872389608343, + "grad_norm": 4.646363258361816, + "learning_rate": 8.713022066549914e-05, + "loss": 1.1107, + "num_input_tokens_seen": 29599856, + "step": 1839 + }, + { + "epoch": 0.12888877214181269, + "grad_norm": 4.180859088897705, + "learning_rate": 8.712322241681261e-05, + "loss": 1.0751, + "num_input_tokens_seen": 29616224, + "step": 1840 + }, + { + "epoch": 0.12895882038754192, + "grad_norm": 3.666090250015259, + "learning_rate": 8.71162241681261e-05, + "loss": 1.0568, + "num_input_tokens_seen": 29632608, + "step": 1841 + }, + { + "epoch": 0.12902886863327118, + "grad_norm": 3.4623513221740723, + "learning_rate": 8.710922591943959e-05, + "loss": 0.9662, + "num_input_tokens_seen": 29648992, + "step": 1842 + }, + { + "epoch": 0.1290989168790004, + "grad_norm": 4.720603942871094, + "learning_rate": 8.710222767075307e-05, + "loss": 1.0566, + "num_input_tokens_seen": 29665136, + "step": 1843 + }, + { + "epoch": 0.12916896512472967, + "grad_norm": 4.208099365234375, + "learning_rate": 8.709522942206654e-05, + "loss": 1.1878, + "num_input_tokens_seen": 29681520, + "step": 1844 + }, + { + "epoch": 0.1292390133704589, + "grad_norm": 4.145462989807129, + "learning_rate": 8.708823117338004e-05, + "loss": 1.0159, + "num_input_tokens_seen": 29697480, + "step": 1845 + }, + { + "epoch": 0.12930906161618816, + "grad_norm": 3.9043843746185303, + "learning_rate": 8.708123292469353e-05, + "loss": 1.0809, + "num_input_tokens_seen": 29713560, + "step": 1846 + }, + { + "epoch": 0.1293791098619174, + "grad_norm": 4.092489242553711, + "learning_rate": 8.707423467600701e-05, + "loss": 1.0432, + "num_input_tokens_seen": 29729944, + "step": 1847 + }, + { + "epoch": 0.12944915810764665, + "grad_norm": 4.73677396774292, + "learning_rate": 8.70672364273205e-05, + "loss": 1.0276, + "num_input_tokens_seen": 29746328, + "step": 1848 + }, + { + "epoch": 0.12951920635337588, + "grad_norm": 6.134850025177002, + "learning_rate": 8.706023817863398e-05, + "loss": 0.9749, + "num_input_tokens_seen": 29762584, + "step": 1849 + }, + { + "epoch": 0.12958925459910514, + "grad_norm": 3.4841954708099365, + "learning_rate": 8.705323992994746e-05, + "loss": 0.9534, + "num_input_tokens_seen": 29778968, + "step": 1850 + }, + { + "epoch": 0.12965930284483437, + "grad_norm": 3.8816237449645996, + "learning_rate": 8.704624168126095e-05, + "loss": 0.7471, + "num_input_tokens_seen": 29795352, + "step": 1851 + }, + { + "epoch": 0.12972935109056363, + "grad_norm": 3.596538543701172, + "learning_rate": 8.703924343257444e-05, + "loss": 0.9753, + "num_input_tokens_seen": 29811608, + "step": 1852 + }, + { + "epoch": 0.12979939933629286, + "grad_norm": 3.9403269290924072, + "learning_rate": 8.703224518388793e-05, + "loss": 1.0667, + "num_input_tokens_seen": 29827608, + "step": 1853 + }, + { + "epoch": 0.12986944758202212, + "grad_norm": 4.586714744567871, + "learning_rate": 8.70252469352014e-05, + "loss": 0.9335, + "num_input_tokens_seen": 29843992, + "step": 1854 + }, + { + "epoch": 0.12993949582775136, + "grad_norm": 3.905280590057373, + "learning_rate": 8.701824868651489e-05, + "loss": 0.9115, + "num_input_tokens_seen": 29860376, + "step": 1855 + }, + { + "epoch": 0.13000954407348062, + "grad_norm": 4.974122524261475, + "learning_rate": 8.701125043782838e-05, + "loss": 0.9887, + "num_input_tokens_seen": 29875880, + "step": 1856 + }, + { + "epoch": 0.13007959231920985, + "grad_norm": 4.33966064453125, + "learning_rate": 8.700425218914185e-05, + "loss": 1.1955, + "num_input_tokens_seen": 29891088, + "step": 1857 + }, + { + "epoch": 0.1301496405649391, + "grad_norm": 4.593107223510742, + "learning_rate": 8.699725394045534e-05, + "loss": 0.9012, + "num_input_tokens_seen": 29907472, + "step": 1858 + }, + { + "epoch": 0.13021968881066834, + "grad_norm": 4.036941051483154, + "learning_rate": 8.699025569176884e-05, + "loss": 1.048, + "num_input_tokens_seen": 29923856, + "step": 1859 + }, + { + "epoch": 0.1302897370563976, + "grad_norm": 3.887981653213501, + "learning_rate": 8.698325744308232e-05, + "loss": 1.2116, + "num_input_tokens_seen": 29939872, + "step": 1860 + }, + { + "epoch": 0.13035978530212683, + "grad_norm": 3.796053886413574, + "learning_rate": 8.697625919439581e-05, + "loss": 1.1678, + "num_input_tokens_seen": 29955928, + "step": 1861 + }, + { + "epoch": 0.1304298335478561, + "grad_norm": 4.5357184410095215, + "learning_rate": 8.696926094570928e-05, + "loss": 0.9246, + "num_input_tokens_seen": 29970760, + "step": 1862 + }, + { + "epoch": 0.13049988179358532, + "grad_norm": 5.54911994934082, + "learning_rate": 8.696226269702277e-05, + "loss": 1.1874, + "num_input_tokens_seen": 29986408, + "step": 1863 + }, + { + "epoch": 0.13056993003931458, + "grad_norm": 3.6517300605773926, + "learning_rate": 8.695526444833625e-05, + "loss": 1.0949, + "num_input_tokens_seen": 30002792, + "step": 1864 + }, + { + "epoch": 0.1306399782850438, + "grad_norm": 3.6885063648223877, + "learning_rate": 8.694826619964975e-05, + "loss": 1.0027, + "num_input_tokens_seen": 30019176, + "step": 1865 + }, + { + "epoch": 0.13071002653077307, + "grad_norm": 4.417117118835449, + "learning_rate": 8.694126795096324e-05, + "loss": 1.1017, + "num_input_tokens_seen": 30034856, + "step": 1866 + }, + { + "epoch": 0.1307800747765023, + "grad_norm": 4.070515155792236, + "learning_rate": 8.693426970227671e-05, + "loss": 1.0393, + "num_input_tokens_seen": 30051240, + "step": 1867 + }, + { + "epoch": 0.13085012302223156, + "grad_norm": 4.135226726531982, + "learning_rate": 8.69272714535902e-05, + "loss": 1.0886, + "num_input_tokens_seen": 30067392, + "step": 1868 + }, + { + "epoch": 0.1309201712679608, + "grad_norm": 4.304529666900635, + "learning_rate": 8.692027320490369e-05, + "loss": 1.0851, + "num_input_tokens_seen": 30083640, + "step": 1869 + }, + { + "epoch": 0.13099021951369005, + "grad_norm": 4.633643627166748, + "learning_rate": 8.691327495621716e-05, + "loss": 1.1934, + "num_input_tokens_seen": 30099968, + "step": 1870 + }, + { + "epoch": 0.1310602677594193, + "grad_norm": 3.6481478214263916, + "learning_rate": 8.690627670753065e-05, + "loss": 1.0661, + "num_input_tokens_seen": 30116352, + "step": 1871 + }, + { + "epoch": 0.13113031600514855, + "grad_norm": 4.15482234954834, + "learning_rate": 8.689927845884414e-05, + "loss": 1.1083, + "num_input_tokens_seen": 30132256, + "step": 1872 + }, + { + "epoch": 0.1312003642508778, + "grad_norm": 3.6562340259552, + "learning_rate": 8.689228021015763e-05, + "loss": 0.9322, + "num_input_tokens_seen": 30147520, + "step": 1873 + }, + { + "epoch": 0.13127041249660704, + "grad_norm": 5.323586463928223, + "learning_rate": 8.68852819614711e-05, + "loss": 1.4077, + "num_input_tokens_seen": 30163880, + "step": 1874 + }, + { + "epoch": 0.1313404607423363, + "grad_norm": 4.068235397338867, + "learning_rate": 8.687828371278459e-05, + "loss": 1.144, + "num_input_tokens_seen": 30180264, + "step": 1875 + }, + { + "epoch": 0.13141050898806553, + "grad_norm": 3.743837594985962, + "learning_rate": 8.687128546409808e-05, + "loss": 0.9754, + "num_input_tokens_seen": 30196520, + "step": 1876 + }, + { + "epoch": 0.1314805572337948, + "grad_norm": 4.344557285308838, + "learning_rate": 8.686428721541156e-05, + "loss": 1.2745, + "num_input_tokens_seen": 30212904, + "step": 1877 + }, + { + "epoch": 0.13155060547952402, + "grad_norm": 4.048375129699707, + "learning_rate": 8.685728896672505e-05, + "loss": 1.1916, + "num_input_tokens_seen": 30228464, + "step": 1878 + }, + { + "epoch": 0.13162065372525328, + "grad_norm": 3.893768548965454, + "learning_rate": 8.685029071803853e-05, + "loss": 1.1462, + "num_input_tokens_seen": 30244848, + "step": 1879 + }, + { + "epoch": 0.1316907019709825, + "grad_norm": 4.469354629516602, + "learning_rate": 8.684329246935202e-05, + "loss": 1.0267, + "num_input_tokens_seen": 30260744, + "step": 1880 + }, + { + "epoch": 0.13176075021671177, + "grad_norm": 3.8471877574920654, + "learning_rate": 8.68362942206655e-05, + "loss": 0.8467, + "num_input_tokens_seen": 30277128, + "step": 1881 + }, + { + "epoch": 0.131830798462441, + "grad_norm": 4.37143611907959, + "learning_rate": 8.682929597197899e-05, + "loss": 0.9103, + "num_input_tokens_seen": 30293184, + "step": 1882 + }, + { + "epoch": 0.13190084670817026, + "grad_norm": 4.4709601402282715, + "learning_rate": 8.682229772329247e-05, + "loss": 0.9975, + "num_input_tokens_seen": 30309568, + "step": 1883 + }, + { + "epoch": 0.1319708949538995, + "grad_norm": 4.016445159912109, + "learning_rate": 8.681529947460595e-05, + "loss": 1.1499, + "num_input_tokens_seen": 30325952, + "step": 1884 + }, + { + "epoch": 0.13204094319962875, + "grad_norm": 3.6610453128814697, + "learning_rate": 8.680830122591945e-05, + "loss": 1.1407, + "num_input_tokens_seen": 30341608, + "step": 1885 + }, + { + "epoch": 0.13211099144535798, + "grad_norm": 4.226510524749756, + "learning_rate": 8.680130297723294e-05, + "loss": 0.8327, + "num_input_tokens_seen": 30357992, + "step": 1886 + }, + { + "epoch": 0.13218103969108724, + "grad_norm": 4.135020732879639, + "learning_rate": 8.679430472854642e-05, + "loss": 1.0807, + "num_input_tokens_seen": 30373464, + "step": 1887 + }, + { + "epoch": 0.13225108793681647, + "grad_norm": 3.858785629272461, + "learning_rate": 8.67873064798599e-05, + "loss": 0.9305, + "num_input_tokens_seen": 30389336, + "step": 1888 + }, + { + "epoch": 0.13232113618254573, + "grad_norm": 3.5424365997314453, + "learning_rate": 8.678030823117338e-05, + "loss": 1.0885, + "num_input_tokens_seen": 30405720, + "step": 1889 + }, + { + "epoch": 0.13239118442827497, + "grad_norm": 4.177000522613525, + "learning_rate": 8.677330998248687e-05, + "loss": 1.2172, + "num_input_tokens_seen": 30422104, + "step": 1890 + }, + { + "epoch": 0.13246123267400423, + "grad_norm": 4.08710241317749, + "learning_rate": 8.676631173380036e-05, + "loss": 1.0063, + "num_input_tokens_seen": 30437560, + "step": 1891 + }, + { + "epoch": 0.13253128091973346, + "grad_norm": 3.889277219772339, + "learning_rate": 8.675931348511384e-05, + "loss": 1.0227, + "num_input_tokens_seen": 30453944, + "step": 1892 + }, + { + "epoch": 0.13260132916546272, + "grad_norm": 3.7967042922973633, + "learning_rate": 8.675231523642733e-05, + "loss": 0.8988, + "num_input_tokens_seen": 30469480, + "step": 1893 + }, + { + "epoch": 0.13267137741119195, + "grad_norm": 4.2189202308654785, + "learning_rate": 8.674531698774081e-05, + "loss": 1.0591, + "num_input_tokens_seen": 30485536, + "step": 1894 + }, + { + "epoch": 0.1327414256569212, + "grad_norm": 4.682656764984131, + "learning_rate": 8.67383187390543e-05, + "loss": 1.2001, + "num_input_tokens_seen": 30501720, + "step": 1895 + }, + { + "epoch": 0.13281147390265044, + "grad_norm": 4.151151657104492, + "learning_rate": 8.673132049036779e-05, + "loss": 1.027, + "num_input_tokens_seen": 30518104, + "step": 1896 + }, + { + "epoch": 0.1328815221483797, + "grad_norm": 3.700916290283203, + "learning_rate": 8.672432224168126e-05, + "loss": 1.0545, + "num_input_tokens_seen": 30534488, + "step": 1897 + }, + { + "epoch": 0.13295157039410893, + "grad_norm": 3.512343406677246, + "learning_rate": 8.671732399299475e-05, + "loss": 1.0569, + "num_input_tokens_seen": 30550872, + "step": 1898 + }, + { + "epoch": 0.1330216186398382, + "grad_norm": 3.5579488277435303, + "learning_rate": 8.671032574430824e-05, + "loss": 0.9725, + "num_input_tokens_seen": 30567256, + "step": 1899 + }, + { + "epoch": 0.13309166688556742, + "grad_norm": 3.7006070613861084, + "learning_rate": 8.670332749562173e-05, + "loss": 0.9628, + "num_input_tokens_seen": 30582784, + "step": 1900 + }, + { + "epoch": 0.13316171513129668, + "grad_norm": 4.373071670532227, + "learning_rate": 8.66963292469352e-05, + "loss": 1.2223, + "num_input_tokens_seen": 30599168, + "step": 1901 + }, + { + "epoch": 0.1332317633770259, + "grad_norm": 4.459958076477051, + "learning_rate": 8.668933099824869e-05, + "loss": 1.2149, + "num_input_tokens_seen": 30615552, + "step": 1902 + }, + { + "epoch": 0.13330181162275517, + "grad_norm": 4.919619560241699, + "learning_rate": 8.668233274956218e-05, + "loss": 1.069, + "num_input_tokens_seen": 30631936, + "step": 1903 + }, + { + "epoch": 0.1333718598684844, + "grad_norm": 3.709568977355957, + "learning_rate": 8.667533450087565e-05, + "loss": 0.9867, + "num_input_tokens_seen": 30648320, + "step": 1904 + }, + { + "epoch": 0.13344190811421366, + "grad_norm": 4.097365379333496, + "learning_rate": 8.666833625218916e-05, + "loss": 1.2128, + "num_input_tokens_seen": 30664704, + "step": 1905 + }, + { + "epoch": 0.13351195635994292, + "grad_norm": 4.702358722686768, + "learning_rate": 8.666133800350263e-05, + "loss": 1.2809, + "num_input_tokens_seen": 30681088, + "step": 1906 + }, + { + "epoch": 0.13358200460567216, + "grad_norm": 3.7732086181640625, + "learning_rate": 8.665433975481612e-05, + "loss": 1.1529, + "num_input_tokens_seen": 30697472, + "step": 1907 + }, + { + "epoch": 0.13365205285140142, + "grad_norm": 5.318485260009766, + "learning_rate": 8.66473415061296e-05, + "loss": 1.0414, + "num_input_tokens_seen": 30712336, + "step": 1908 + }, + { + "epoch": 0.13372210109713065, + "grad_norm": 4.364311695098877, + "learning_rate": 8.664034325744308e-05, + "loss": 1.0634, + "num_input_tokens_seen": 30728600, + "step": 1909 + }, + { + "epoch": 0.1337921493428599, + "grad_norm": 4.860876083374023, + "learning_rate": 8.663334500875657e-05, + "loss": 1.0945, + "num_input_tokens_seen": 30744832, + "step": 1910 + }, + { + "epoch": 0.13386219758858914, + "grad_norm": 4.455454349517822, + "learning_rate": 8.662634676007006e-05, + "loss": 1.1765, + "num_input_tokens_seen": 30761216, + "step": 1911 + }, + { + "epoch": 0.1339322458343184, + "grad_norm": 4.70845365524292, + "learning_rate": 8.661934851138355e-05, + "loss": 1.2774, + "num_input_tokens_seen": 30776600, + "step": 1912 + }, + { + "epoch": 0.13400229408004763, + "grad_norm": 3.9769747257232666, + "learning_rate": 8.661235026269704e-05, + "loss": 1.006, + "num_input_tokens_seen": 30792632, + "step": 1913 + }, + { + "epoch": 0.1340723423257769, + "grad_norm": 4.387015342712402, + "learning_rate": 8.660535201401051e-05, + "loss": 1.1839, + "num_input_tokens_seen": 30809016, + "step": 1914 + }, + { + "epoch": 0.13414239057150612, + "grad_norm": 4.786890506744385, + "learning_rate": 8.6598353765324e-05, + "loss": 1.2352, + "num_input_tokens_seen": 30825136, + "step": 1915 + }, + { + "epoch": 0.13421243881723538, + "grad_norm": 3.502570629119873, + "learning_rate": 8.659135551663748e-05, + "loss": 1.0175, + "num_input_tokens_seen": 30841472, + "step": 1916 + }, + { + "epoch": 0.1342824870629646, + "grad_norm": 4.2404913902282715, + "learning_rate": 8.658435726795096e-05, + "loss": 1.1882, + "num_input_tokens_seen": 30857856, + "step": 1917 + }, + { + "epoch": 0.13435253530869387, + "grad_norm": 4.230425834655762, + "learning_rate": 8.657735901926445e-05, + "loss": 1.098, + "num_input_tokens_seen": 30874240, + "step": 1918 + }, + { + "epoch": 0.1344225835544231, + "grad_norm": 3.9034597873687744, + "learning_rate": 8.657036077057794e-05, + "loss": 1.0441, + "num_input_tokens_seen": 30890560, + "step": 1919 + }, + { + "epoch": 0.13449263180015236, + "grad_norm": 3.829190492630005, + "learning_rate": 8.656336252189143e-05, + "loss": 1.0675, + "num_input_tokens_seen": 30906480, + "step": 1920 + }, + { + "epoch": 0.1345626800458816, + "grad_norm": 3.9801993370056152, + "learning_rate": 8.65563642732049e-05, + "loss": 1.0407, + "num_input_tokens_seen": 30922160, + "step": 1921 + }, + { + "epoch": 0.13463272829161085, + "grad_norm": 5.018815994262695, + "learning_rate": 8.65493660245184e-05, + "loss": 1.1155, + "num_input_tokens_seen": 30938544, + "step": 1922 + }, + { + "epoch": 0.13470277653734009, + "grad_norm": 3.6515283584594727, + "learning_rate": 8.654236777583188e-05, + "loss": 1.0436, + "num_input_tokens_seen": 30954088, + "step": 1923 + }, + { + "epoch": 0.13477282478306934, + "grad_norm": 4.440131664276123, + "learning_rate": 8.653536952714536e-05, + "loss": 1.002, + "num_input_tokens_seen": 30970472, + "step": 1924 + }, + { + "epoch": 0.13484287302879858, + "grad_norm": 5.27577543258667, + "learning_rate": 8.652837127845885e-05, + "loss": 1.0783, + "num_input_tokens_seen": 30985544, + "step": 1925 + }, + { + "epoch": 0.13491292127452784, + "grad_norm": 4.632978916168213, + "learning_rate": 8.652137302977233e-05, + "loss": 1.1539, + "num_input_tokens_seen": 31001928, + "step": 1926 + }, + { + "epoch": 0.13498296952025707, + "grad_norm": 3.9239861965179443, + "learning_rate": 8.651437478108582e-05, + "loss": 1.0231, + "num_input_tokens_seen": 31018312, + "step": 1927 + }, + { + "epoch": 0.13505301776598633, + "grad_norm": 4.819107532501221, + "learning_rate": 8.65073765323993e-05, + "loss": 1.1631, + "num_input_tokens_seen": 31033568, + "step": 1928 + }, + { + "epoch": 0.13512306601171556, + "grad_norm": 3.5287766456604004, + "learning_rate": 8.650037828371279e-05, + "loss": 1.0172, + "num_input_tokens_seen": 31049952, + "step": 1929 + }, + { + "epoch": 0.13519311425744482, + "grad_norm": 3.536736488342285, + "learning_rate": 8.649338003502628e-05, + "loss": 0.9576, + "num_input_tokens_seen": 31066336, + "step": 1930 + }, + { + "epoch": 0.13526316250317405, + "grad_norm": 5.148278713226318, + "learning_rate": 8.648638178633976e-05, + "loss": 1.2137, + "num_input_tokens_seen": 31082136, + "step": 1931 + }, + { + "epoch": 0.1353332107489033, + "grad_norm": 4.076564788818359, + "learning_rate": 8.647938353765325e-05, + "loss": 1.081, + "num_input_tokens_seen": 31098520, + "step": 1932 + }, + { + "epoch": 0.13540325899463254, + "grad_norm": 4.747740745544434, + "learning_rate": 8.647238528896673e-05, + "loss": 1.1989, + "num_input_tokens_seen": 31114560, + "step": 1933 + }, + { + "epoch": 0.1354733072403618, + "grad_norm": 3.662280797958374, + "learning_rate": 8.646538704028022e-05, + "loss": 1.0797, + "num_input_tokens_seen": 31130944, + "step": 1934 + }, + { + "epoch": 0.13554335548609103, + "grad_norm": 3.8747782707214355, + "learning_rate": 8.645838879159369e-05, + "loss": 0.9258, + "num_input_tokens_seen": 31146544, + "step": 1935 + }, + { + "epoch": 0.1356134037318203, + "grad_norm": 3.465095281600952, + "learning_rate": 8.645139054290718e-05, + "loss": 1.0582, + "num_input_tokens_seen": 31162928, + "step": 1936 + }, + { + "epoch": 0.13568345197754952, + "grad_norm": 4.640190124511719, + "learning_rate": 8.644439229422067e-05, + "loss": 1.1265, + "num_input_tokens_seen": 31177712, + "step": 1937 + }, + { + "epoch": 0.13575350022327878, + "grad_norm": 3.88620924949646, + "learning_rate": 8.643739404553416e-05, + "loss": 1.0244, + "num_input_tokens_seen": 31193640, + "step": 1938 + }, + { + "epoch": 0.13582354846900804, + "grad_norm": 3.657331705093384, + "learning_rate": 8.643039579684765e-05, + "loss": 0.9715, + "num_input_tokens_seen": 31209112, + "step": 1939 + }, + { + "epoch": 0.13589359671473727, + "grad_norm": 6.8866448402404785, + "learning_rate": 8.642339754816113e-05, + "loss": 0.9734, + "num_input_tokens_seen": 31223968, + "step": 1940 + }, + { + "epoch": 0.13596364496046653, + "grad_norm": 5.0794172286987305, + "learning_rate": 8.641639929947461e-05, + "loss": 1.1988, + "num_input_tokens_seen": 31240352, + "step": 1941 + }, + { + "epoch": 0.13603369320619577, + "grad_norm": 4.631995677947998, + "learning_rate": 8.64094010507881e-05, + "loss": 1.1814, + "num_input_tokens_seen": 31256736, + "step": 1942 + }, + { + "epoch": 0.13610374145192503, + "grad_norm": 5.566014766693115, + "learning_rate": 8.640240280210157e-05, + "loss": 1.1769, + "num_input_tokens_seen": 31273120, + "step": 1943 + }, + { + "epoch": 0.13617378969765426, + "grad_norm": 3.940988302230835, + "learning_rate": 8.639540455341506e-05, + "loss": 1.0196, + "num_input_tokens_seen": 31289504, + "step": 1944 + }, + { + "epoch": 0.13624383794338352, + "grad_norm": 3.9979453086853027, + "learning_rate": 8.638840630472855e-05, + "loss": 1.0467, + "num_input_tokens_seen": 31305888, + "step": 1945 + }, + { + "epoch": 0.13631388618911275, + "grad_norm": 5.303500175476074, + "learning_rate": 8.638140805604204e-05, + "loss": 1.0938, + "num_input_tokens_seen": 31321512, + "step": 1946 + }, + { + "epoch": 0.136383934434842, + "grad_norm": 4.6745429039001465, + "learning_rate": 8.637440980735553e-05, + "loss": 1.3665, + "num_input_tokens_seen": 31337896, + "step": 1947 + }, + { + "epoch": 0.13645398268057124, + "grad_norm": 4.203839302062988, + "learning_rate": 8.6367411558669e-05, + "loss": 0.8949, + "num_input_tokens_seen": 31354176, + "step": 1948 + }, + { + "epoch": 0.1365240309263005, + "grad_norm": 4.802511215209961, + "learning_rate": 8.636041330998249e-05, + "loss": 1.2427, + "num_input_tokens_seen": 31369976, + "step": 1949 + }, + { + "epoch": 0.13659407917202973, + "grad_norm": 4.077885627746582, + "learning_rate": 8.635341506129598e-05, + "loss": 1.1259, + "num_input_tokens_seen": 31386360, + "step": 1950 + }, + { + "epoch": 0.136664127417759, + "grad_norm": 5.009285926818848, + "learning_rate": 8.634641681260947e-05, + "loss": 1.0278, + "num_input_tokens_seen": 31402744, + "step": 1951 + }, + { + "epoch": 0.13673417566348822, + "grad_norm": 3.539872646331787, + "learning_rate": 8.633941856392294e-05, + "loss": 1.0522, + "num_input_tokens_seen": 31419128, + "step": 1952 + }, + { + "epoch": 0.13680422390921748, + "grad_norm": 4.664520740509033, + "learning_rate": 8.633242031523643e-05, + "loss": 1.1559, + "num_input_tokens_seen": 31435400, + "step": 1953 + }, + { + "epoch": 0.1368742721549467, + "grad_norm": 3.8469269275665283, + "learning_rate": 8.632542206654992e-05, + "loss": 1.1237, + "num_input_tokens_seen": 31451408, + "step": 1954 + }, + { + "epoch": 0.13694432040067597, + "grad_norm": 4.064670085906982, + "learning_rate": 8.63184238178634e-05, + "loss": 0.8825, + "num_input_tokens_seen": 31467504, + "step": 1955 + }, + { + "epoch": 0.1370143686464052, + "grad_norm": 3.9931817054748535, + "learning_rate": 8.631142556917688e-05, + "loss": 1.17, + "num_input_tokens_seen": 31483528, + "step": 1956 + }, + { + "epoch": 0.13708441689213446, + "grad_norm": 4.136581897735596, + "learning_rate": 8.630442732049037e-05, + "loss": 1.069, + "num_input_tokens_seen": 31499912, + "step": 1957 + }, + { + "epoch": 0.1371544651378637, + "grad_norm": 3.7189536094665527, + "learning_rate": 8.629742907180386e-05, + "loss": 1.0509, + "num_input_tokens_seen": 31515560, + "step": 1958 + }, + { + "epoch": 0.13722451338359296, + "grad_norm": 3.7821719646453857, + "learning_rate": 8.629043082311735e-05, + "loss": 1.0583, + "num_input_tokens_seen": 31531944, + "step": 1959 + }, + { + "epoch": 0.1372945616293222, + "grad_norm": 6.815886497497559, + "learning_rate": 8.628343257443082e-05, + "loss": 0.9118, + "num_input_tokens_seen": 31548248, + "step": 1960 + }, + { + "epoch": 0.13736460987505145, + "grad_norm": 7.490451812744141, + "learning_rate": 8.627643432574431e-05, + "loss": 1.1145, + "num_input_tokens_seen": 31562560, + "step": 1961 + }, + { + "epoch": 0.13743465812078068, + "grad_norm": 4.918768405914307, + "learning_rate": 8.626943607705779e-05, + "loss": 1.2198, + "num_input_tokens_seen": 31578944, + "step": 1962 + }, + { + "epoch": 0.13750470636650994, + "grad_norm": 5.567696571350098, + "learning_rate": 8.626243782837128e-05, + "loss": 1.1083, + "num_input_tokens_seen": 31594312, + "step": 1963 + }, + { + "epoch": 0.13757475461223917, + "grad_norm": 4.24015474319458, + "learning_rate": 8.625543957968477e-05, + "loss": 1.1807, + "num_input_tokens_seen": 31609656, + "step": 1964 + }, + { + "epoch": 0.13764480285796843, + "grad_norm": 5.664759635925293, + "learning_rate": 8.624844133099825e-05, + "loss": 1.1775, + "num_input_tokens_seen": 31626040, + "step": 1965 + }, + { + "epoch": 0.13771485110369766, + "grad_norm": 3.7281267642974854, + "learning_rate": 8.624144308231174e-05, + "loss": 1.0994, + "num_input_tokens_seen": 31642424, + "step": 1966 + }, + { + "epoch": 0.13778489934942692, + "grad_norm": 4.112753391265869, + "learning_rate": 8.623444483362523e-05, + "loss": 1.2113, + "num_input_tokens_seen": 31658808, + "step": 1967 + }, + { + "epoch": 0.13785494759515615, + "grad_norm": 3.8851754665374756, + "learning_rate": 8.62274465849387e-05, + "loss": 1.0596, + "num_input_tokens_seen": 31675192, + "step": 1968 + }, + { + "epoch": 0.1379249958408854, + "grad_norm": 4.161825656890869, + "learning_rate": 8.62204483362522e-05, + "loss": 1.03, + "num_input_tokens_seen": 31691576, + "step": 1969 + }, + { + "epoch": 0.13799504408661464, + "grad_norm": 4.802804470062256, + "learning_rate": 8.621345008756567e-05, + "loss": 1.4374, + "num_input_tokens_seen": 31707960, + "step": 1970 + }, + { + "epoch": 0.1380650923323439, + "grad_norm": 3.752012252807617, + "learning_rate": 8.620645183887917e-05, + "loss": 1.124, + "num_input_tokens_seen": 31724344, + "step": 1971 + }, + { + "epoch": 0.13813514057807313, + "grad_norm": 3.8039815425872803, + "learning_rate": 8.619945359019265e-05, + "loss": 1.0051, + "num_input_tokens_seen": 31740456, + "step": 1972 + }, + { + "epoch": 0.1382051888238024, + "grad_norm": 4.029634952545166, + "learning_rate": 8.619245534150614e-05, + "loss": 1.2221, + "num_input_tokens_seen": 31756776, + "step": 1973 + }, + { + "epoch": 0.13827523706953165, + "grad_norm": 5.531665802001953, + "learning_rate": 8.618545709281962e-05, + "loss": 1.1534, + "num_input_tokens_seen": 31772480, + "step": 1974 + }, + { + "epoch": 0.13834528531526089, + "grad_norm": 4.6494646072387695, + "learning_rate": 8.61784588441331e-05, + "loss": 0.9723, + "num_input_tokens_seen": 31788504, + "step": 1975 + }, + { + "epoch": 0.13841533356099014, + "grad_norm": 4.201340675354004, + "learning_rate": 8.617146059544659e-05, + "loss": 1.0648, + "num_input_tokens_seen": 31804888, + "step": 1976 + }, + { + "epoch": 0.13848538180671938, + "grad_norm": 4.272038459777832, + "learning_rate": 8.616446234676008e-05, + "loss": 1.2557, + "num_input_tokens_seen": 31821272, + "step": 1977 + }, + { + "epoch": 0.13855543005244864, + "grad_norm": 3.729841947555542, + "learning_rate": 8.615746409807357e-05, + "loss": 1.0346, + "num_input_tokens_seen": 31837656, + "step": 1978 + }, + { + "epoch": 0.13862547829817787, + "grad_norm": 3.5615944862365723, + "learning_rate": 8.615046584938704e-05, + "loss": 0.9986, + "num_input_tokens_seen": 31854040, + "step": 1979 + }, + { + "epoch": 0.13869552654390713, + "grad_norm": 3.7658376693725586, + "learning_rate": 8.614346760070053e-05, + "loss": 1.3268, + "num_input_tokens_seen": 31870424, + "step": 1980 + }, + { + "epoch": 0.13876557478963636, + "grad_norm": 4.124275207519531, + "learning_rate": 8.613646935201402e-05, + "loss": 1.2736, + "num_input_tokens_seen": 31886808, + "step": 1981 + }, + { + "epoch": 0.13883562303536562, + "grad_norm": 5.348685264587402, + "learning_rate": 8.612947110332749e-05, + "loss": 1.0492, + "num_input_tokens_seen": 31902880, + "step": 1982 + }, + { + "epoch": 0.13890567128109485, + "grad_norm": 5.311651706695557, + "learning_rate": 8.612247285464098e-05, + "loss": 1.2034, + "num_input_tokens_seen": 31918704, + "step": 1983 + }, + { + "epoch": 0.1389757195268241, + "grad_norm": 4.194555759429932, + "learning_rate": 8.611547460595447e-05, + "loss": 1.2802, + "num_input_tokens_seen": 31935088, + "step": 1984 + }, + { + "epoch": 0.13904576777255334, + "grad_norm": 3.6576390266418457, + "learning_rate": 8.610847635726796e-05, + "loss": 1.0618, + "num_input_tokens_seen": 31951472, + "step": 1985 + }, + { + "epoch": 0.1391158160182826, + "grad_norm": 4.169801235198975, + "learning_rate": 8.610147810858145e-05, + "loss": 1.1668, + "num_input_tokens_seen": 31967856, + "step": 1986 + }, + { + "epoch": 0.13918586426401183, + "grad_norm": 3.79791259765625, + "learning_rate": 8.609447985989492e-05, + "loss": 1.2546, + "num_input_tokens_seen": 31984232, + "step": 1987 + }, + { + "epoch": 0.1392559125097411, + "grad_norm": 3.726701021194458, + "learning_rate": 8.608748161120841e-05, + "loss": 1.177, + "num_input_tokens_seen": 31999720, + "step": 1988 + }, + { + "epoch": 0.13932596075547032, + "grad_norm": 3.7376129627227783, + "learning_rate": 8.608048336252189e-05, + "loss": 1.0174, + "num_input_tokens_seen": 32016104, + "step": 1989 + }, + { + "epoch": 0.13939600900119958, + "grad_norm": 4.290423393249512, + "learning_rate": 8.607348511383537e-05, + "loss": 1.1556, + "num_input_tokens_seen": 32031992, + "step": 1990 + }, + { + "epoch": 0.13946605724692882, + "grad_norm": 3.592384099960327, + "learning_rate": 8.606648686514888e-05, + "loss": 1.0629, + "num_input_tokens_seen": 32047904, + "step": 1991 + }, + { + "epoch": 0.13953610549265807, + "grad_norm": 3.753692626953125, + "learning_rate": 8.605948861646235e-05, + "loss": 1.0111, + "num_input_tokens_seen": 32063720, + "step": 1992 + }, + { + "epoch": 0.1396061537383873, + "grad_norm": 4.698465347290039, + "learning_rate": 8.605249036777584e-05, + "loss": 1.0255, + "num_input_tokens_seen": 32079768, + "step": 1993 + }, + { + "epoch": 0.13967620198411657, + "grad_norm": 4.187407970428467, + "learning_rate": 8.604549211908933e-05, + "loss": 0.9006, + "num_input_tokens_seen": 32095120, + "step": 1994 + }, + { + "epoch": 0.1397462502298458, + "grad_norm": 4.256275653839111, + "learning_rate": 8.60384938704028e-05, + "loss": 1.1607, + "num_input_tokens_seen": 32111408, + "step": 1995 + }, + { + "epoch": 0.13981629847557506, + "grad_norm": 6.693331241607666, + "learning_rate": 8.603149562171629e-05, + "loss": 1.2317, + "num_input_tokens_seen": 32127792, + "step": 1996 + }, + { + "epoch": 0.1398863467213043, + "grad_norm": 3.69393253326416, + "learning_rate": 8.602449737302978e-05, + "loss": 0.9747, + "num_input_tokens_seen": 32143792, + "step": 1997 + }, + { + "epoch": 0.13995639496703355, + "grad_norm": 4.117836952209473, + "learning_rate": 8.601749912434327e-05, + "loss": 1.0732, + "num_input_tokens_seen": 32158624, + "step": 1998 + }, + { + "epoch": 0.14002644321276278, + "grad_norm": 5.14541482925415, + "learning_rate": 8.601050087565674e-05, + "loss": 1.1787, + "num_input_tokens_seen": 32175008, + "step": 1999 + }, + { + "epoch": 0.14009649145849204, + "grad_norm": 4.0103535652160645, + "learning_rate": 8.600350262697023e-05, + "loss": 1.1308, + "num_input_tokens_seen": 32191392, + "step": 2000 + }, + { + "epoch": 0.14009649145849204, + "eval_loss": 1.1461617946624756, + "eval_runtime": 0.1945, + "eval_samples_per_second": 5.141, + "eval_steps_per_second": 5.141, + "num_input_tokens_seen": 32191392, + "step": 2000 + }, + { + "epoch": 0.14016653970422127, + "grad_norm": 3.8072049617767334, + "learning_rate": 8.599650437828372e-05, + "loss": 0.8617, + "num_input_tokens_seen": 32207712, + "step": 2001 + }, + { + "epoch": 0.14023658794995053, + "grad_norm": 4.034494400024414, + "learning_rate": 8.59895061295972e-05, + "loss": 1.1719, + "num_input_tokens_seen": 32223440, + "step": 2002 + }, + { + "epoch": 0.14030663619567976, + "grad_norm": 3.9485251903533936, + "learning_rate": 8.598250788091069e-05, + "loss": 1.2242, + "num_input_tokens_seen": 32239824, + "step": 2003 + }, + { + "epoch": 0.14037668444140902, + "grad_norm": 5.427109241485596, + "learning_rate": 8.597550963222417e-05, + "loss": 1.1922, + "num_input_tokens_seen": 32255976, + "step": 2004 + }, + { + "epoch": 0.14044673268713825, + "grad_norm": 4.4832000732421875, + "learning_rate": 8.596851138353766e-05, + "loss": 1.2791, + "num_input_tokens_seen": 32272304, + "step": 2005 + }, + { + "epoch": 0.1405167809328675, + "grad_norm": 4.4699859619140625, + "learning_rate": 8.596151313485114e-05, + "loss": 1.0175, + "num_input_tokens_seen": 32288688, + "step": 2006 + }, + { + "epoch": 0.14058682917859674, + "grad_norm": 6.007316589355469, + "learning_rate": 8.595451488616463e-05, + "loss": 1.2402, + "num_input_tokens_seen": 32304992, + "step": 2007 + }, + { + "epoch": 0.140656877424326, + "grad_norm": 5.460748195648193, + "learning_rate": 8.594751663747811e-05, + "loss": 1.2683, + "num_input_tokens_seen": 32320104, + "step": 2008 + }, + { + "epoch": 0.14072692567005526, + "grad_norm": 4.430675029754639, + "learning_rate": 8.594051838879159e-05, + "loss": 0.9664, + "num_input_tokens_seen": 32336040, + "step": 2009 + }, + { + "epoch": 0.1407969739157845, + "grad_norm": 4.469089508056641, + "learning_rate": 8.593352014010508e-05, + "loss": 1.0335, + "num_input_tokens_seen": 32352424, + "step": 2010 + }, + { + "epoch": 0.14086702216151376, + "grad_norm": 4.94099760055542, + "learning_rate": 8.592652189141858e-05, + "loss": 1.1091, + "num_input_tokens_seen": 32367944, + "step": 2011 + }, + { + "epoch": 0.140937070407243, + "grad_norm": 5.430322170257568, + "learning_rate": 8.591952364273206e-05, + "loss": 1.2256, + "num_input_tokens_seen": 32384328, + "step": 2012 + }, + { + "epoch": 0.14100711865297225, + "grad_norm": 3.847569704055786, + "learning_rate": 8.591252539404554e-05, + "loss": 1.019, + "num_input_tokens_seen": 32400712, + "step": 2013 + }, + { + "epoch": 0.14107716689870148, + "grad_norm": 3.7531189918518066, + "learning_rate": 8.590552714535902e-05, + "loss": 0.9409, + "num_input_tokens_seen": 32417096, + "step": 2014 + }, + { + "epoch": 0.14114721514443074, + "grad_norm": 4.070606708526611, + "learning_rate": 8.589852889667251e-05, + "loss": 1.0857, + "num_input_tokens_seen": 32432504, + "step": 2015 + }, + { + "epoch": 0.14121726339015997, + "grad_norm": 4.791952610015869, + "learning_rate": 8.589153064798598e-05, + "loss": 0.8467, + "num_input_tokens_seen": 32448008, + "step": 2016 + }, + { + "epoch": 0.14128731163588923, + "grad_norm": 4.672977924346924, + "learning_rate": 8.588453239929948e-05, + "loss": 1.081, + "num_input_tokens_seen": 32463792, + "step": 2017 + }, + { + "epoch": 0.14135735988161846, + "grad_norm": 6.187239170074463, + "learning_rate": 8.587753415061297e-05, + "loss": 1.0416, + "num_input_tokens_seen": 32480104, + "step": 2018 + }, + { + "epoch": 0.14142740812734772, + "grad_norm": 4.058189392089844, + "learning_rate": 8.587053590192645e-05, + "loss": 1.0598, + "num_input_tokens_seen": 32495824, + "step": 2019 + }, + { + "epoch": 0.14149745637307695, + "grad_norm": 3.862661838531494, + "learning_rate": 8.586353765323994e-05, + "loss": 0.9371, + "num_input_tokens_seen": 32512208, + "step": 2020 + }, + { + "epoch": 0.1415675046188062, + "grad_norm": 3.7348716259002686, + "learning_rate": 8.585653940455343e-05, + "loss": 1.1021, + "num_input_tokens_seen": 32528592, + "step": 2021 + }, + { + "epoch": 0.14163755286453544, + "grad_norm": 4.405923843383789, + "learning_rate": 8.58495411558669e-05, + "loss": 1.1405, + "num_input_tokens_seen": 32544120, + "step": 2022 + }, + { + "epoch": 0.1417076011102647, + "grad_norm": 3.73984694480896, + "learning_rate": 8.584254290718039e-05, + "loss": 1.0797, + "num_input_tokens_seen": 32560504, + "step": 2023 + }, + { + "epoch": 0.14177764935599393, + "grad_norm": 5.73613166809082, + "learning_rate": 8.583554465849388e-05, + "loss": 1.2119, + "num_input_tokens_seen": 32576888, + "step": 2024 + }, + { + "epoch": 0.1418476976017232, + "grad_norm": 6.435116291046143, + "learning_rate": 8.582854640980737e-05, + "loss": 1.3408, + "num_input_tokens_seen": 32591592, + "step": 2025 + }, + { + "epoch": 0.14191774584745243, + "grad_norm": 4.520002365112305, + "learning_rate": 8.582154816112084e-05, + "loss": 1.1654, + "num_input_tokens_seen": 32607448, + "step": 2026 + }, + { + "epoch": 0.14198779409318169, + "grad_norm": 4.01891565322876, + "learning_rate": 8.581454991243433e-05, + "loss": 1.1203, + "num_input_tokens_seen": 32623104, + "step": 2027 + }, + { + "epoch": 0.14205784233891092, + "grad_norm": 3.8237030506134033, + "learning_rate": 8.580755166374782e-05, + "loss": 1.0548, + "num_input_tokens_seen": 32639376, + "step": 2028 + }, + { + "epoch": 0.14212789058464018, + "grad_norm": 4.893499851226807, + "learning_rate": 8.58005534150613e-05, + "loss": 1.1629, + "num_input_tokens_seen": 32654800, + "step": 2029 + }, + { + "epoch": 0.1421979388303694, + "grad_norm": 3.6075315475463867, + "learning_rate": 8.579355516637478e-05, + "loss": 1.0889, + "num_input_tokens_seen": 32671184, + "step": 2030 + }, + { + "epoch": 0.14226798707609867, + "grad_norm": 4.696410179138184, + "learning_rate": 8.578655691768827e-05, + "loss": 1.1777, + "num_input_tokens_seen": 32687360, + "step": 2031 + }, + { + "epoch": 0.1423380353218279, + "grad_norm": 3.9465558528900146, + "learning_rate": 8.577955866900176e-05, + "loss": 1.1378, + "num_input_tokens_seen": 32703744, + "step": 2032 + }, + { + "epoch": 0.14240808356755716, + "grad_norm": 3.933898448944092, + "learning_rate": 8.577256042031523e-05, + "loss": 0.8353, + "num_input_tokens_seen": 32720128, + "step": 2033 + }, + { + "epoch": 0.1424781318132864, + "grad_norm": 3.865894317626953, + "learning_rate": 8.576556217162872e-05, + "loss": 0.9827, + "num_input_tokens_seen": 32735976, + "step": 2034 + }, + { + "epoch": 0.14254818005901565, + "grad_norm": 3.9533474445343018, + "learning_rate": 8.575856392294221e-05, + "loss": 1.1028, + "num_input_tokens_seen": 32752240, + "step": 2035 + }, + { + "epoch": 0.14261822830474488, + "grad_norm": 3.5534164905548096, + "learning_rate": 8.575156567425569e-05, + "loss": 1.0887, + "num_input_tokens_seen": 32768624, + "step": 2036 + }, + { + "epoch": 0.14268827655047414, + "grad_norm": 5.689724922180176, + "learning_rate": 8.574456742556918e-05, + "loss": 1.0588, + "num_input_tokens_seen": 32784600, + "step": 2037 + }, + { + "epoch": 0.14275832479620337, + "grad_norm": 4.010136604309082, + "learning_rate": 8.573756917688268e-05, + "loss": 0.8989, + "num_input_tokens_seen": 32799824, + "step": 2038 + }, + { + "epoch": 0.14282837304193263, + "grad_norm": 4.153547763824463, + "learning_rate": 8.573057092819615e-05, + "loss": 1.162, + "num_input_tokens_seen": 32815744, + "step": 2039 + }, + { + "epoch": 0.14289842128766186, + "grad_norm": 3.976120948791504, + "learning_rate": 8.572357267950964e-05, + "loss": 1.2234, + "num_input_tokens_seen": 32831664, + "step": 2040 + }, + { + "epoch": 0.14296846953339112, + "grad_norm": 3.9593231678009033, + "learning_rate": 8.571657443082312e-05, + "loss": 1.0482, + "num_input_tokens_seen": 32848048, + "step": 2041 + }, + { + "epoch": 0.14303851777912036, + "grad_norm": 3.920823097229004, + "learning_rate": 8.57095761821366e-05, + "loss": 1.1891, + "num_input_tokens_seen": 32863168, + "step": 2042 + }, + { + "epoch": 0.14310856602484961, + "grad_norm": 4.754055976867676, + "learning_rate": 8.57025779334501e-05, + "loss": 1.1123, + "num_input_tokens_seen": 32879552, + "step": 2043 + }, + { + "epoch": 0.14317861427057887, + "grad_norm": 3.6835105419158936, + "learning_rate": 8.569557968476358e-05, + "loss": 1.0919, + "num_input_tokens_seen": 32895864, + "step": 2044 + }, + { + "epoch": 0.1432486625163081, + "grad_norm": 4.115698337554932, + "learning_rate": 8.568858143607707e-05, + "loss": 1.166, + "num_input_tokens_seen": 32912232, + "step": 2045 + }, + { + "epoch": 0.14331871076203737, + "grad_norm": 6.536626815795898, + "learning_rate": 8.568158318739055e-05, + "loss": 1.1534, + "num_input_tokens_seen": 32928616, + "step": 2046 + }, + { + "epoch": 0.1433887590077666, + "grad_norm": 5.43113899230957, + "learning_rate": 8.567458493870403e-05, + "loss": 0.9645, + "num_input_tokens_seen": 32945000, + "step": 2047 + }, + { + "epoch": 0.14345880725349586, + "grad_norm": 3.8677239418029785, + "learning_rate": 8.566758669001752e-05, + "loss": 1.2213, + "num_input_tokens_seen": 32961384, + "step": 2048 + }, + { + "epoch": 0.1435288554992251, + "grad_norm": 6.913444995880127, + "learning_rate": 8.5660588441331e-05, + "loss": 1.2204, + "num_input_tokens_seen": 32977768, + "step": 2049 + }, + { + "epoch": 0.14359890374495435, + "grad_norm": 4.870579719543457, + "learning_rate": 8.565359019264449e-05, + "loss": 1.1022, + "num_input_tokens_seen": 32994152, + "step": 2050 + }, + { + "epoch": 0.14366895199068358, + "grad_norm": 4.057044982910156, + "learning_rate": 8.564659194395797e-05, + "loss": 1.0599, + "num_input_tokens_seen": 33010536, + "step": 2051 + }, + { + "epoch": 0.14373900023641284, + "grad_norm": 8.405828475952148, + "learning_rate": 8.563959369527146e-05, + "loss": 1.0928, + "num_input_tokens_seen": 33025192, + "step": 2052 + }, + { + "epoch": 0.14380904848214207, + "grad_norm": 4.188510894775391, + "learning_rate": 8.563259544658494e-05, + "loss": 1.1207, + "num_input_tokens_seen": 33041576, + "step": 2053 + }, + { + "epoch": 0.14387909672787133, + "grad_norm": 6.505815505981445, + "learning_rate": 8.562559719789843e-05, + "loss": 1.1196, + "num_input_tokens_seen": 33057800, + "step": 2054 + }, + { + "epoch": 0.14394914497360056, + "grad_norm": 4.021209716796875, + "learning_rate": 8.561859894921192e-05, + "loss": 0.953, + "num_input_tokens_seen": 33073872, + "step": 2055 + }, + { + "epoch": 0.14401919321932982, + "grad_norm": 3.924671173095703, + "learning_rate": 8.561160070052539e-05, + "loss": 1.022, + "num_input_tokens_seen": 33090256, + "step": 2056 + }, + { + "epoch": 0.14408924146505905, + "grad_norm": 4.1323418617248535, + "learning_rate": 8.560460245183888e-05, + "loss": 1.0995, + "num_input_tokens_seen": 33106256, + "step": 2057 + }, + { + "epoch": 0.1441592897107883, + "grad_norm": 4.236043930053711, + "learning_rate": 8.559760420315237e-05, + "loss": 1.0842, + "num_input_tokens_seen": 33122352, + "step": 2058 + }, + { + "epoch": 0.14422933795651754, + "grad_norm": 3.4836020469665527, + "learning_rate": 8.559060595446586e-05, + "loss": 1.0136, + "num_input_tokens_seen": 33138736, + "step": 2059 + }, + { + "epoch": 0.1442993862022468, + "grad_norm": 4.363439083099365, + "learning_rate": 8.558360770577933e-05, + "loss": 1.1382, + "num_input_tokens_seen": 33153936, + "step": 2060 + }, + { + "epoch": 0.14436943444797604, + "grad_norm": 5.099925994873047, + "learning_rate": 8.557660945709282e-05, + "loss": 1.0027, + "num_input_tokens_seen": 33170320, + "step": 2061 + }, + { + "epoch": 0.1444394826937053, + "grad_norm": 4.438295364379883, + "learning_rate": 8.556961120840631e-05, + "loss": 1.11, + "num_input_tokens_seen": 33186704, + "step": 2062 + }, + { + "epoch": 0.14450953093943453, + "grad_norm": 3.7912747859954834, + "learning_rate": 8.556261295971978e-05, + "loss": 1.0708, + "num_input_tokens_seen": 33203088, + "step": 2063 + }, + { + "epoch": 0.1445795791851638, + "grad_norm": 4.679794788360596, + "learning_rate": 8.555561471103329e-05, + "loss": 0.9707, + "num_input_tokens_seen": 33218936, + "step": 2064 + }, + { + "epoch": 0.14464962743089302, + "grad_norm": 4.092919826507568, + "learning_rate": 8.554861646234677e-05, + "loss": 1.2103, + "num_input_tokens_seen": 33235320, + "step": 2065 + }, + { + "epoch": 0.14471967567662228, + "grad_norm": 4.13189172744751, + "learning_rate": 8.554161821366025e-05, + "loss": 0.9919, + "num_input_tokens_seen": 33251704, + "step": 2066 + }, + { + "epoch": 0.1447897239223515, + "grad_norm": 3.618739366531372, + "learning_rate": 8.553461996497374e-05, + "loss": 1.0026, + "num_input_tokens_seen": 33268088, + "step": 2067 + }, + { + "epoch": 0.14485977216808077, + "grad_norm": 4.197813034057617, + "learning_rate": 8.552762171628721e-05, + "loss": 1.3134, + "num_input_tokens_seen": 33284472, + "step": 2068 + }, + { + "epoch": 0.14492982041381, + "grad_norm": 4.159245491027832, + "learning_rate": 8.55206234676007e-05, + "loss": 0.9579, + "num_input_tokens_seen": 33300560, + "step": 2069 + }, + { + "epoch": 0.14499986865953926, + "grad_norm": 3.970898389816284, + "learning_rate": 8.551362521891419e-05, + "loss": 1.0587, + "num_input_tokens_seen": 33316744, + "step": 2070 + }, + { + "epoch": 0.1450699169052685, + "grad_norm": 5.635775089263916, + "learning_rate": 8.550662697022768e-05, + "loss": 1.2284, + "num_input_tokens_seen": 33333128, + "step": 2071 + }, + { + "epoch": 0.14513996515099775, + "grad_norm": 5.239542007446289, + "learning_rate": 8.549962872154117e-05, + "loss": 1.166, + "num_input_tokens_seen": 33348392, + "step": 2072 + }, + { + "epoch": 0.14521001339672698, + "grad_norm": 3.8646957874298096, + "learning_rate": 8.549263047285464e-05, + "loss": 1.0343, + "num_input_tokens_seen": 33364504, + "step": 2073 + }, + { + "epoch": 0.14528006164245624, + "grad_norm": 4.49400520324707, + "learning_rate": 8.548563222416813e-05, + "loss": 0.9953, + "num_input_tokens_seen": 33379680, + "step": 2074 + }, + { + "epoch": 0.14535010988818547, + "grad_norm": 3.782107353210449, + "learning_rate": 8.547863397548162e-05, + "loss": 1.1396, + "num_input_tokens_seen": 33396064, + "step": 2075 + }, + { + "epoch": 0.14542015813391473, + "grad_norm": 3.8171703815460205, + "learning_rate": 8.54716357267951e-05, + "loss": 1.1364, + "num_input_tokens_seen": 33411640, + "step": 2076 + }, + { + "epoch": 0.14549020637964397, + "grad_norm": 3.56487774848938, + "learning_rate": 8.546463747810858e-05, + "loss": 1.0396, + "num_input_tokens_seen": 33428024, + "step": 2077 + }, + { + "epoch": 0.14556025462537323, + "grad_norm": 5.169209003448486, + "learning_rate": 8.545763922942207e-05, + "loss": 1.1262, + "num_input_tokens_seen": 33444408, + "step": 2078 + }, + { + "epoch": 0.14563030287110248, + "grad_norm": 3.718086004257202, + "learning_rate": 8.545064098073556e-05, + "loss": 1.0769, + "num_input_tokens_seen": 33460416, + "step": 2079 + }, + { + "epoch": 0.14570035111683172, + "grad_norm": 4.2451372146606445, + "learning_rate": 8.544364273204904e-05, + "loss": 1.0298, + "num_input_tokens_seen": 33476800, + "step": 2080 + }, + { + "epoch": 0.14577039936256098, + "grad_norm": 3.7441632747650146, + "learning_rate": 8.543664448336252e-05, + "loss": 0.9785, + "num_input_tokens_seen": 33492536, + "step": 2081 + }, + { + "epoch": 0.1458404476082902, + "grad_norm": 3.8453383445739746, + "learning_rate": 8.542964623467601e-05, + "loss": 1.2527, + "num_input_tokens_seen": 33508920, + "step": 2082 + }, + { + "epoch": 0.14591049585401947, + "grad_norm": 3.6744494438171387, + "learning_rate": 8.542264798598949e-05, + "loss": 1.0739, + "num_input_tokens_seen": 33525304, + "step": 2083 + }, + { + "epoch": 0.1459805440997487, + "grad_norm": 4.209956645965576, + "learning_rate": 8.541564973730299e-05, + "loss": 1.1538, + "num_input_tokens_seen": 33541544, + "step": 2084 + }, + { + "epoch": 0.14605059234547796, + "grad_norm": 4.347019672393799, + "learning_rate": 8.540865148861647e-05, + "loss": 1.1078, + "num_input_tokens_seen": 33557928, + "step": 2085 + }, + { + "epoch": 0.1461206405912072, + "grad_norm": 5.323390483856201, + "learning_rate": 8.540165323992995e-05, + "loss": 1.0776, + "num_input_tokens_seen": 33573152, + "step": 2086 + }, + { + "epoch": 0.14619068883693645, + "grad_norm": 3.632425308227539, + "learning_rate": 8.539465499124343e-05, + "loss": 1.0595, + "num_input_tokens_seen": 33588848, + "step": 2087 + }, + { + "epoch": 0.14626073708266568, + "grad_norm": 4.460893154144287, + "learning_rate": 8.538765674255692e-05, + "loss": 1.1131, + "num_input_tokens_seen": 33604984, + "step": 2088 + }, + { + "epoch": 0.14633078532839494, + "grad_norm": 4.059104919433594, + "learning_rate": 8.53806584938704e-05, + "loss": 1.1818, + "num_input_tokens_seen": 33620384, + "step": 2089 + }, + { + "epoch": 0.14640083357412417, + "grad_norm": 6.023964881896973, + "learning_rate": 8.53736602451839e-05, + "loss": 1.1589, + "num_input_tokens_seen": 33636416, + "step": 2090 + }, + { + "epoch": 0.14647088181985343, + "grad_norm": 4.462921619415283, + "learning_rate": 8.536666199649738e-05, + "loss": 0.9362, + "num_input_tokens_seen": 33652504, + "step": 2091 + }, + { + "epoch": 0.14654093006558266, + "grad_norm": 4.003902435302734, + "learning_rate": 8.535966374781087e-05, + "loss": 1.1062, + "num_input_tokens_seen": 33668888, + "step": 2092 + }, + { + "epoch": 0.14661097831131192, + "grad_norm": 4.161351680755615, + "learning_rate": 8.535266549912435e-05, + "loss": 1.0252, + "num_input_tokens_seen": 33685272, + "step": 2093 + }, + { + "epoch": 0.14668102655704116, + "grad_norm": 4.424163341522217, + "learning_rate": 8.534566725043784e-05, + "loss": 1.0225, + "num_input_tokens_seen": 33700872, + "step": 2094 + }, + { + "epoch": 0.14675107480277041, + "grad_norm": 4.2255072593688965, + "learning_rate": 8.533866900175131e-05, + "loss": 1.2044, + "num_input_tokens_seen": 33717256, + "step": 2095 + }, + { + "epoch": 0.14682112304849965, + "grad_norm": 4.204975605010986, + "learning_rate": 8.53316707530648e-05, + "loss": 1.1861, + "num_input_tokens_seen": 33732544, + "step": 2096 + }, + { + "epoch": 0.1468911712942289, + "grad_norm": 3.7058298587799072, + "learning_rate": 8.532467250437829e-05, + "loss": 1.1568, + "num_input_tokens_seen": 33748928, + "step": 2097 + }, + { + "epoch": 0.14696121953995814, + "grad_norm": 6.157133102416992, + "learning_rate": 8.531767425569178e-05, + "loss": 1.0704, + "num_input_tokens_seen": 33765312, + "step": 2098 + }, + { + "epoch": 0.1470312677856874, + "grad_norm": 4.3684210777282715, + "learning_rate": 8.531067600700526e-05, + "loss": 1.0977, + "num_input_tokens_seen": 33781552, + "step": 2099 + }, + { + "epoch": 0.14710131603141663, + "grad_norm": 3.957848310470581, + "learning_rate": 8.530367775831874e-05, + "loss": 1.1412, + "num_input_tokens_seen": 33797464, + "step": 2100 + }, + { + "epoch": 0.1471713642771459, + "grad_norm": 4.9368486404418945, + "learning_rate": 8.529667950963223e-05, + "loss": 0.9986, + "num_input_tokens_seen": 33812672, + "step": 2101 + }, + { + "epoch": 0.14724141252287512, + "grad_norm": 3.8515660762786865, + "learning_rate": 8.528968126094572e-05, + "loss": 0.8715, + "num_input_tokens_seen": 33829024, + "step": 2102 + }, + { + "epoch": 0.14731146076860438, + "grad_norm": 3.961448907852173, + "learning_rate": 8.528268301225919e-05, + "loss": 1.1416, + "num_input_tokens_seen": 33845408, + "step": 2103 + }, + { + "epoch": 0.1473815090143336, + "grad_norm": 4.101677894592285, + "learning_rate": 8.52756847635727e-05, + "loss": 1.001, + "num_input_tokens_seen": 33861240, + "step": 2104 + }, + { + "epoch": 0.14745155726006287, + "grad_norm": 3.886634349822998, + "learning_rate": 8.526868651488617e-05, + "loss": 1.1546, + "num_input_tokens_seen": 33876832, + "step": 2105 + }, + { + "epoch": 0.1475216055057921, + "grad_norm": 3.7241156101226807, + "learning_rate": 8.526168826619966e-05, + "loss": 1.2116, + "num_input_tokens_seen": 33893216, + "step": 2106 + }, + { + "epoch": 0.14759165375152136, + "grad_norm": 3.829458236694336, + "learning_rate": 8.525469001751313e-05, + "loss": 0.8324, + "num_input_tokens_seen": 33909224, + "step": 2107 + }, + { + "epoch": 0.1476617019972506, + "grad_norm": 5.275660514831543, + "learning_rate": 8.524769176882662e-05, + "loss": 1.0253, + "num_input_tokens_seen": 33924768, + "step": 2108 + }, + { + "epoch": 0.14773175024297985, + "grad_norm": 4.207718372344971, + "learning_rate": 8.524069352014011e-05, + "loss": 1.1871, + "num_input_tokens_seen": 33940288, + "step": 2109 + }, + { + "epoch": 0.14780179848870909, + "grad_norm": 4.205242156982422, + "learning_rate": 8.52336952714536e-05, + "loss": 1.0834, + "num_input_tokens_seen": 33956512, + "step": 2110 + }, + { + "epoch": 0.14787184673443834, + "grad_norm": 4.365423202514648, + "learning_rate": 8.522669702276709e-05, + "loss": 1.2043, + "num_input_tokens_seen": 33972896, + "step": 2111 + }, + { + "epoch": 0.1479418949801676, + "grad_norm": 4.416136741638184, + "learning_rate": 8.521969877408056e-05, + "loss": 1.001, + "num_input_tokens_seen": 33989280, + "step": 2112 + }, + { + "epoch": 0.14801194322589684, + "grad_norm": 4.375226974487305, + "learning_rate": 8.521270052539405e-05, + "loss": 1.1186, + "num_input_tokens_seen": 34005664, + "step": 2113 + }, + { + "epoch": 0.1480819914716261, + "grad_norm": 5.2603840827941895, + "learning_rate": 8.520570227670753e-05, + "loss": 1.0723, + "num_input_tokens_seen": 34021576, + "step": 2114 + }, + { + "epoch": 0.14815203971735533, + "grad_norm": 4.02445125579834, + "learning_rate": 8.519870402802101e-05, + "loss": 1.11, + "num_input_tokens_seen": 34037960, + "step": 2115 + }, + { + "epoch": 0.1482220879630846, + "grad_norm": 3.6527910232543945, + "learning_rate": 8.51917057793345e-05, + "loss": 1.0293, + "num_input_tokens_seen": 34053240, + "step": 2116 + }, + { + "epoch": 0.14829213620881382, + "grad_norm": 4.170680999755859, + "learning_rate": 8.518470753064799e-05, + "loss": 1.2068, + "num_input_tokens_seen": 34068896, + "step": 2117 + }, + { + "epoch": 0.14836218445454308, + "grad_norm": 4.366664886474609, + "learning_rate": 8.517770928196148e-05, + "loss": 0.9541, + "num_input_tokens_seen": 34085280, + "step": 2118 + }, + { + "epoch": 0.1484322327002723, + "grad_norm": 3.50757098197937, + "learning_rate": 8.517071103327497e-05, + "loss": 0.9992, + "num_input_tokens_seen": 34101664, + "step": 2119 + }, + { + "epoch": 0.14850228094600157, + "grad_norm": 4.607417106628418, + "learning_rate": 8.516371278458844e-05, + "loss": 1.1974, + "num_input_tokens_seen": 34117752, + "step": 2120 + }, + { + "epoch": 0.1485723291917308, + "grad_norm": 3.959874391555786, + "learning_rate": 8.515671453590193e-05, + "loss": 0.9902, + "num_input_tokens_seen": 34133576, + "step": 2121 + }, + { + "epoch": 0.14864237743746006, + "grad_norm": 4.708366870880127, + "learning_rate": 8.514971628721541e-05, + "loss": 1.1201, + "num_input_tokens_seen": 34149952, + "step": 2122 + }, + { + "epoch": 0.1487124256831893, + "grad_norm": 3.6237339973449707, + "learning_rate": 8.51427180385289e-05, + "loss": 1.1091, + "num_input_tokens_seen": 34166336, + "step": 2123 + }, + { + "epoch": 0.14878247392891855, + "grad_norm": 4.606329917907715, + "learning_rate": 8.513571978984238e-05, + "loss": 1.0986, + "num_input_tokens_seen": 34181128, + "step": 2124 + }, + { + "epoch": 0.14885252217464778, + "grad_norm": 4.559760093688965, + "learning_rate": 8.512872154115587e-05, + "loss": 1.1022, + "num_input_tokens_seen": 34197512, + "step": 2125 + }, + { + "epoch": 0.14892257042037704, + "grad_norm": 3.870089292526245, + "learning_rate": 8.512172329246936e-05, + "loss": 1.0378, + "num_input_tokens_seen": 34213896, + "step": 2126 + }, + { + "epoch": 0.14899261866610627, + "grad_norm": 4.441296100616455, + "learning_rate": 8.511472504378284e-05, + "loss": 1.1473, + "num_input_tokens_seen": 34229472, + "step": 2127 + }, + { + "epoch": 0.14906266691183553, + "grad_norm": 3.8565545082092285, + "learning_rate": 8.510772679509633e-05, + "loss": 1.1465, + "num_input_tokens_seen": 34245856, + "step": 2128 + }, + { + "epoch": 0.14913271515756477, + "grad_norm": 3.563889741897583, + "learning_rate": 8.510072854640981e-05, + "loss": 0.8612, + "num_input_tokens_seen": 34262240, + "step": 2129 + }, + { + "epoch": 0.14920276340329403, + "grad_norm": 4.2634429931640625, + "learning_rate": 8.50937302977233e-05, + "loss": 1.1825, + "num_input_tokens_seen": 34278624, + "step": 2130 + }, + { + "epoch": 0.14927281164902326, + "grad_norm": 5.418450355529785, + "learning_rate": 8.508673204903679e-05, + "loss": 0.9869, + "num_input_tokens_seen": 34294216, + "step": 2131 + }, + { + "epoch": 0.14934285989475252, + "grad_norm": 3.511815309524536, + "learning_rate": 8.507973380035027e-05, + "loss": 0.9725, + "num_input_tokens_seen": 34310592, + "step": 2132 + }, + { + "epoch": 0.14941290814048175, + "grad_norm": 4.088070392608643, + "learning_rate": 8.507273555166375e-05, + "loss": 1.1299, + "num_input_tokens_seen": 34326352, + "step": 2133 + }, + { + "epoch": 0.149482956386211, + "grad_norm": 3.8594932556152344, + "learning_rate": 8.506573730297723e-05, + "loss": 1.0281, + "num_input_tokens_seen": 34342512, + "step": 2134 + }, + { + "epoch": 0.14955300463194024, + "grad_norm": 5.410063743591309, + "learning_rate": 8.505873905429072e-05, + "loss": 1.1376, + "num_input_tokens_seen": 34358896, + "step": 2135 + }, + { + "epoch": 0.1496230528776695, + "grad_norm": 4.02821159362793, + "learning_rate": 8.505174080560421e-05, + "loss": 0.9707, + "num_input_tokens_seen": 34375280, + "step": 2136 + }, + { + "epoch": 0.14969310112339873, + "grad_norm": 3.866480827331543, + "learning_rate": 8.50447425569177e-05, + "loss": 1.0727, + "num_input_tokens_seen": 34391584, + "step": 2137 + }, + { + "epoch": 0.149763149369128, + "grad_norm": 3.667064905166626, + "learning_rate": 8.503774430823118e-05, + "loss": 1.0609, + "num_input_tokens_seen": 34407264, + "step": 2138 + }, + { + "epoch": 0.14983319761485722, + "grad_norm": 5.41308069229126, + "learning_rate": 8.503074605954466e-05, + "loss": 1.0779, + "num_input_tokens_seen": 34423648, + "step": 2139 + }, + { + "epoch": 0.14990324586058648, + "grad_norm": 4.1716485023498535, + "learning_rate": 8.502374781085815e-05, + "loss": 1.2112, + "num_input_tokens_seen": 34439512, + "step": 2140 + }, + { + "epoch": 0.1499732941063157, + "grad_norm": 4.1403913497924805, + "learning_rate": 8.501674956217162e-05, + "loss": 1.0773, + "num_input_tokens_seen": 34455896, + "step": 2141 + }, + { + "epoch": 0.15004334235204497, + "grad_norm": 3.75219988822937, + "learning_rate": 8.500975131348511e-05, + "loss": 1.0685, + "num_input_tokens_seen": 34472280, + "step": 2142 + }, + { + "epoch": 0.1501133905977742, + "grad_norm": 4.339532852172852, + "learning_rate": 8.50027530647986e-05, + "loss": 1.0439, + "num_input_tokens_seen": 34488664, + "step": 2143 + }, + { + "epoch": 0.15018343884350346, + "grad_norm": 4.259124755859375, + "learning_rate": 8.499575481611209e-05, + "loss": 1.0576, + "num_input_tokens_seen": 34505048, + "step": 2144 + }, + { + "epoch": 0.1502534870892327, + "grad_norm": 5.031396865844727, + "learning_rate": 8.498875656742558e-05, + "loss": 0.9932, + "num_input_tokens_seen": 34521432, + "step": 2145 + }, + { + "epoch": 0.15032353533496196, + "grad_norm": 5.313172340393066, + "learning_rate": 8.498175831873907e-05, + "loss": 1.1737, + "num_input_tokens_seen": 34536344, + "step": 2146 + }, + { + "epoch": 0.15039358358069121, + "grad_norm": 4.844740390777588, + "learning_rate": 8.497476007005254e-05, + "loss": 1.4095, + "num_input_tokens_seen": 34552728, + "step": 2147 + }, + { + "epoch": 0.15046363182642045, + "grad_norm": 4.231154441833496, + "learning_rate": 8.496776182136603e-05, + "loss": 1.1196, + "num_input_tokens_seen": 34569016, + "step": 2148 + }, + { + "epoch": 0.1505336800721497, + "grad_norm": 4.176802635192871, + "learning_rate": 8.49607635726795e-05, + "loss": 1.0856, + "num_input_tokens_seen": 34585376, + "step": 2149 + }, + { + "epoch": 0.15060372831787894, + "grad_norm": 4.710334777832031, + "learning_rate": 8.4953765323993e-05, + "loss": 1.0085, + "num_input_tokens_seen": 34600400, + "step": 2150 + }, + { + "epoch": 0.1506737765636082, + "grad_norm": 3.9053258895874023, + "learning_rate": 8.494676707530648e-05, + "loss": 1.2191, + "num_input_tokens_seen": 34616688, + "step": 2151 + }, + { + "epoch": 0.15074382480933743, + "grad_norm": 4.043003559112549, + "learning_rate": 8.493976882661997e-05, + "loss": 1.0541, + "num_input_tokens_seen": 34631920, + "step": 2152 + }, + { + "epoch": 0.1508138730550667, + "grad_norm": 5.230721473693848, + "learning_rate": 8.493277057793346e-05, + "loss": 1.1491, + "num_input_tokens_seen": 34648128, + "step": 2153 + }, + { + "epoch": 0.15088392130079592, + "grad_norm": 4.098349094390869, + "learning_rate": 8.492577232924693e-05, + "loss": 1.1302, + "num_input_tokens_seen": 34664512, + "step": 2154 + }, + { + "epoch": 0.15095396954652518, + "grad_norm": 4.803813457489014, + "learning_rate": 8.491877408056042e-05, + "loss": 0.9653, + "num_input_tokens_seen": 34680560, + "step": 2155 + }, + { + "epoch": 0.1510240177922544, + "grad_norm": 4.25751256942749, + "learning_rate": 8.491177583187391e-05, + "loss": 1.2481, + "num_input_tokens_seen": 34696944, + "step": 2156 + }, + { + "epoch": 0.15109406603798367, + "grad_norm": 6.600613117218018, + "learning_rate": 8.49047775831874e-05, + "loss": 1.1786, + "num_input_tokens_seen": 34712416, + "step": 2157 + }, + { + "epoch": 0.1511641142837129, + "grad_norm": 5.649744987487793, + "learning_rate": 8.489777933450087e-05, + "loss": 1.3045, + "num_input_tokens_seen": 34728520, + "step": 2158 + }, + { + "epoch": 0.15123416252944216, + "grad_norm": 5.778639316558838, + "learning_rate": 8.489078108581436e-05, + "loss": 1.1224, + "num_input_tokens_seen": 34744776, + "step": 2159 + }, + { + "epoch": 0.1513042107751714, + "grad_norm": 5.944733619689941, + "learning_rate": 8.488378283712785e-05, + "loss": 1.3293, + "num_input_tokens_seen": 34761160, + "step": 2160 + }, + { + "epoch": 0.15137425902090065, + "grad_norm": 3.7783594131469727, + "learning_rate": 8.487678458844133e-05, + "loss": 1.0975, + "num_input_tokens_seen": 34777544, + "step": 2161 + }, + { + "epoch": 0.15144430726662989, + "grad_norm": 5.126344680786133, + "learning_rate": 8.486978633975482e-05, + "loss": 1.0509, + "num_input_tokens_seen": 34793072, + "step": 2162 + }, + { + "epoch": 0.15151435551235914, + "grad_norm": 4.689150333404541, + "learning_rate": 8.48627880910683e-05, + "loss": 1.1454, + "num_input_tokens_seen": 34809456, + "step": 2163 + }, + { + "epoch": 0.15158440375808838, + "grad_norm": 3.7559547424316406, + "learning_rate": 8.485578984238179e-05, + "loss": 1.1414, + "num_input_tokens_seen": 34825208, + "step": 2164 + }, + { + "epoch": 0.15165445200381764, + "grad_norm": 3.9225172996520996, + "learning_rate": 8.484879159369528e-05, + "loss": 1.1771, + "num_input_tokens_seen": 34841592, + "step": 2165 + }, + { + "epoch": 0.15172450024954687, + "grad_norm": 4.264125347137451, + "learning_rate": 8.484179334500876e-05, + "loss": 1.0046, + "num_input_tokens_seen": 34857928, + "step": 2166 + }, + { + "epoch": 0.15179454849527613, + "grad_norm": 4.0784382820129395, + "learning_rate": 8.483479509632224e-05, + "loss": 1.0638, + "num_input_tokens_seen": 34873224, + "step": 2167 + }, + { + "epoch": 0.15186459674100536, + "grad_norm": 4.371130466461182, + "learning_rate": 8.482779684763572e-05, + "loss": 1.3854, + "num_input_tokens_seen": 34889608, + "step": 2168 + }, + { + "epoch": 0.15193464498673462, + "grad_norm": 3.7022883892059326, + "learning_rate": 8.482079859894921e-05, + "loss": 0.9892, + "num_input_tokens_seen": 34905984, + "step": 2169 + }, + { + "epoch": 0.15200469323246385, + "grad_norm": 4.196985721588135, + "learning_rate": 8.481380035026271e-05, + "loss": 0.9674, + "num_input_tokens_seen": 34922368, + "step": 2170 + }, + { + "epoch": 0.1520747414781931, + "grad_norm": 4.0252580642700195, + "learning_rate": 8.480680210157619e-05, + "loss": 1.0478, + "num_input_tokens_seen": 34938752, + "step": 2171 + }, + { + "epoch": 0.15214478972392234, + "grad_norm": 4.03692626953125, + "learning_rate": 8.479980385288967e-05, + "loss": 1.1801, + "num_input_tokens_seen": 34954176, + "step": 2172 + }, + { + "epoch": 0.1522148379696516, + "grad_norm": 4.183175563812256, + "learning_rate": 8.479280560420316e-05, + "loss": 1.1117, + "num_input_tokens_seen": 34969880, + "step": 2173 + }, + { + "epoch": 0.15228488621538083, + "grad_norm": 3.757636070251465, + "learning_rate": 8.478580735551664e-05, + "loss": 1.1507, + "num_input_tokens_seen": 34985576, + "step": 2174 + }, + { + "epoch": 0.1523549344611101, + "grad_norm": 3.9442903995513916, + "learning_rate": 8.477880910683013e-05, + "loss": 1.034, + "num_input_tokens_seen": 35001896, + "step": 2175 + }, + { + "epoch": 0.15242498270683932, + "grad_norm": 4.092566013336182, + "learning_rate": 8.477181085814362e-05, + "loss": 1.0542, + "num_input_tokens_seen": 35018280, + "step": 2176 + }, + { + "epoch": 0.15249503095256858, + "grad_norm": 5.494921684265137, + "learning_rate": 8.47648126094571e-05, + "loss": 1.0988, + "num_input_tokens_seen": 35034544, + "step": 2177 + }, + { + "epoch": 0.15256507919829781, + "grad_norm": 7.327289581298828, + "learning_rate": 8.475781436077058e-05, + "loss": 1.1879, + "num_input_tokens_seen": 35050928, + "step": 2178 + }, + { + "epoch": 0.15263512744402707, + "grad_norm": 4.048150539398193, + "learning_rate": 8.475081611208407e-05, + "loss": 1.1071, + "num_input_tokens_seen": 35067000, + "step": 2179 + }, + { + "epoch": 0.1527051756897563, + "grad_norm": 6.388006210327148, + "learning_rate": 8.474381786339756e-05, + "loss": 0.9821, + "num_input_tokens_seen": 35082064, + "step": 2180 + }, + { + "epoch": 0.15277522393548557, + "grad_norm": 4.289052963256836, + "learning_rate": 8.473681961471103e-05, + "loss": 1.077, + "num_input_tokens_seen": 35098448, + "step": 2181 + }, + { + "epoch": 0.15284527218121483, + "grad_norm": 4.288560390472412, + "learning_rate": 8.472982136602452e-05, + "loss": 1.2723, + "num_input_tokens_seen": 35114832, + "step": 2182 + }, + { + "epoch": 0.15291532042694406, + "grad_norm": 4.17701530456543, + "learning_rate": 8.472282311733801e-05, + "loss": 1.1691, + "num_input_tokens_seen": 35131216, + "step": 2183 + }, + { + "epoch": 0.15298536867267332, + "grad_norm": 4.975949764251709, + "learning_rate": 8.47158248686515e-05, + "loss": 1.057, + "num_input_tokens_seen": 35147600, + "step": 2184 + }, + { + "epoch": 0.15305541691840255, + "grad_norm": 5.465437889099121, + "learning_rate": 8.470882661996497e-05, + "loss": 1.0328, + "num_input_tokens_seen": 35162464, + "step": 2185 + }, + { + "epoch": 0.1531254651641318, + "grad_norm": 3.329401731491089, + "learning_rate": 8.470182837127846e-05, + "loss": 1.0596, + "num_input_tokens_seen": 35178744, + "step": 2186 + }, + { + "epoch": 0.15319551340986104, + "grad_norm": 5.962124824523926, + "learning_rate": 8.469483012259195e-05, + "loss": 1.2799, + "num_input_tokens_seen": 35194736, + "step": 2187 + }, + { + "epoch": 0.1532655616555903, + "grad_norm": 3.897841691970825, + "learning_rate": 8.468783187390542e-05, + "loss": 1.1701, + "num_input_tokens_seen": 35211120, + "step": 2188 + }, + { + "epoch": 0.15333560990131953, + "grad_norm": 3.9668943881988525, + "learning_rate": 8.468083362521891e-05, + "loss": 1.1302, + "num_input_tokens_seen": 35227504, + "step": 2189 + }, + { + "epoch": 0.1534056581470488, + "grad_norm": 3.8960444927215576, + "learning_rate": 8.467383537653241e-05, + "loss": 0.891, + "num_input_tokens_seen": 35243584, + "step": 2190 + }, + { + "epoch": 0.15347570639277802, + "grad_norm": 3.7700982093811035, + "learning_rate": 8.466683712784589e-05, + "loss": 1.1744, + "num_input_tokens_seen": 35259968, + "step": 2191 + }, + { + "epoch": 0.15354575463850728, + "grad_norm": 4.65008020401001, + "learning_rate": 8.465983887915938e-05, + "loss": 1.2807, + "num_input_tokens_seen": 35276352, + "step": 2192 + }, + { + "epoch": 0.1536158028842365, + "grad_norm": 3.5371146202087402, + "learning_rate": 8.465284063047285e-05, + "loss": 0.9699, + "num_input_tokens_seen": 35292736, + "step": 2193 + }, + { + "epoch": 0.15368585112996577, + "grad_norm": 4.395732879638672, + "learning_rate": 8.464584238178634e-05, + "loss": 0.9862, + "num_input_tokens_seen": 35309120, + "step": 2194 + }, + { + "epoch": 0.153755899375695, + "grad_norm": 5.01919412612915, + "learning_rate": 8.463884413309982e-05, + "loss": 1.0143, + "num_input_tokens_seen": 35325504, + "step": 2195 + }, + { + "epoch": 0.15382594762142426, + "grad_norm": 3.7417054176330566, + "learning_rate": 8.463184588441332e-05, + "loss": 1.0712, + "num_input_tokens_seen": 35341376, + "step": 2196 + }, + { + "epoch": 0.1538959958671535, + "grad_norm": 4.119459629058838, + "learning_rate": 8.462484763572681e-05, + "loss": 1.0919, + "num_input_tokens_seen": 35357520, + "step": 2197 + }, + { + "epoch": 0.15396604411288276, + "grad_norm": 6.938751220703125, + "learning_rate": 8.461784938704028e-05, + "loss": 1.1272, + "num_input_tokens_seen": 35372920, + "step": 2198 + }, + { + "epoch": 0.154036092358612, + "grad_norm": 5.000339984893799, + "learning_rate": 8.461085113835377e-05, + "loss": 1.1508, + "num_input_tokens_seen": 35389304, + "step": 2199 + }, + { + "epoch": 0.15410614060434125, + "grad_norm": 3.6554362773895264, + "learning_rate": 8.460385288966726e-05, + "loss": 1.0765, + "num_input_tokens_seen": 35405688, + "step": 2200 + }, + { + "epoch": 0.15410614060434125, + "eval_loss": 1.145054578781128, + "eval_runtime": 0.1886, + "eval_samples_per_second": 5.303, + "eval_steps_per_second": 5.303, + "num_input_tokens_seen": 35405688, + "step": 2200 + }, + { + "epoch": 0.15417618885007048, + "grad_norm": 3.718207836151123, + "learning_rate": 8.459685464098073e-05, + "loss": 0.8814, + "num_input_tokens_seen": 35422072, + "step": 2201 + }, + { + "epoch": 0.15424623709579974, + "grad_norm": 4.98813533782959, + "learning_rate": 8.458985639229422e-05, + "loss": 1.1814, + "num_input_tokens_seen": 35438456, + "step": 2202 + }, + { + "epoch": 0.15431628534152897, + "grad_norm": 3.550008535385132, + "learning_rate": 8.458285814360771e-05, + "loss": 1.1281, + "num_input_tokens_seen": 35454840, + "step": 2203 + }, + { + "epoch": 0.15438633358725823, + "grad_norm": 3.8408641815185547, + "learning_rate": 8.45758598949212e-05, + "loss": 0.9759, + "num_input_tokens_seen": 35471080, + "step": 2204 + }, + { + "epoch": 0.15445638183298746, + "grad_norm": 4.515852451324463, + "learning_rate": 8.456886164623468e-05, + "loss": 0.9394, + "num_input_tokens_seen": 35486904, + "step": 2205 + }, + { + "epoch": 0.15452643007871672, + "grad_norm": 3.6536715030670166, + "learning_rate": 8.456186339754816e-05, + "loss": 0.9649, + "num_input_tokens_seen": 35503064, + "step": 2206 + }, + { + "epoch": 0.15459647832444595, + "grad_norm": 4.071808338165283, + "learning_rate": 8.455486514886165e-05, + "loss": 1.0972, + "num_input_tokens_seen": 35518880, + "step": 2207 + }, + { + "epoch": 0.1546665265701752, + "grad_norm": 4.329566955566406, + "learning_rate": 8.454786690017513e-05, + "loss": 1.0843, + "num_input_tokens_seen": 35535256, + "step": 2208 + }, + { + "epoch": 0.15473657481590444, + "grad_norm": 4.243298053741455, + "learning_rate": 8.454086865148862e-05, + "loss": 1.1688, + "num_input_tokens_seen": 35551376, + "step": 2209 + }, + { + "epoch": 0.1548066230616337, + "grad_norm": 4.154253959655762, + "learning_rate": 8.453387040280212e-05, + "loss": 1.0458, + "num_input_tokens_seen": 35567696, + "step": 2210 + }, + { + "epoch": 0.15487667130736293, + "grad_norm": 4.0564494132995605, + "learning_rate": 8.45268721541156e-05, + "loss": 1.0585, + "num_input_tokens_seen": 35583576, + "step": 2211 + }, + { + "epoch": 0.1549467195530922, + "grad_norm": 3.735724687576294, + "learning_rate": 8.451987390542907e-05, + "loss": 0.92, + "num_input_tokens_seen": 35599536, + "step": 2212 + }, + { + "epoch": 0.15501676779882143, + "grad_norm": 4.651454925537109, + "learning_rate": 8.451287565674256e-05, + "loss": 1.2097, + "num_input_tokens_seen": 35615920, + "step": 2213 + }, + { + "epoch": 0.15508681604455068, + "grad_norm": 5.01883602142334, + "learning_rate": 8.450587740805605e-05, + "loss": 0.9275, + "num_input_tokens_seen": 35631208, + "step": 2214 + }, + { + "epoch": 0.15515686429027992, + "grad_norm": 4.435250282287598, + "learning_rate": 8.449887915936952e-05, + "loss": 1.003, + "num_input_tokens_seen": 35647328, + "step": 2215 + }, + { + "epoch": 0.15522691253600918, + "grad_norm": 3.495476245880127, + "learning_rate": 8.449188091068302e-05, + "loss": 0.9968, + "num_input_tokens_seen": 35663472, + "step": 2216 + }, + { + "epoch": 0.15529696078173844, + "grad_norm": 4.461013317108154, + "learning_rate": 8.448488266199651e-05, + "loss": 1.1098, + "num_input_tokens_seen": 35679856, + "step": 2217 + }, + { + "epoch": 0.15536700902746767, + "grad_norm": 5.4857683181762695, + "learning_rate": 8.447788441330999e-05, + "loss": 1.143, + "num_input_tokens_seen": 35695616, + "step": 2218 + }, + { + "epoch": 0.15543705727319693, + "grad_norm": 4.20158052444458, + "learning_rate": 8.447088616462348e-05, + "loss": 1.1643, + "num_input_tokens_seen": 35711432, + "step": 2219 + }, + { + "epoch": 0.15550710551892616, + "grad_norm": 4.289988040924072, + "learning_rate": 8.446388791593695e-05, + "loss": 1.1582, + "num_input_tokens_seen": 35727552, + "step": 2220 + }, + { + "epoch": 0.15557715376465542, + "grad_norm": 3.7897555828094482, + "learning_rate": 8.445688966725044e-05, + "loss": 1.255, + "num_input_tokens_seen": 35743800, + "step": 2221 + }, + { + "epoch": 0.15564720201038465, + "grad_norm": 4.405816078186035, + "learning_rate": 8.444989141856393e-05, + "loss": 1.1057, + "num_input_tokens_seen": 35760184, + "step": 2222 + }, + { + "epoch": 0.1557172502561139, + "grad_norm": 4.2683610916137695, + "learning_rate": 8.444289316987742e-05, + "loss": 1.1042, + "num_input_tokens_seen": 35776568, + "step": 2223 + }, + { + "epoch": 0.15578729850184314, + "grad_norm": 3.9999659061431885, + "learning_rate": 8.44358949211909e-05, + "loss": 1.0504, + "num_input_tokens_seen": 35792952, + "step": 2224 + }, + { + "epoch": 0.1558573467475724, + "grad_norm": 3.6252965927124023, + "learning_rate": 8.442889667250438e-05, + "loss": 0.9755, + "num_input_tokens_seen": 35809176, + "step": 2225 + }, + { + "epoch": 0.15592739499330163, + "grad_norm": 3.9726274013519287, + "learning_rate": 8.442189842381787e-05, + "loss": 1.1104, + "num_input_tokens_seen": 35825560, + "step": 2226 + }, + { + "epoch": 0.1559974432390309, + "grad_norm": 5.004739761352539, + "learning_rate": 8.441490017513136e-05, + "loss": 1.2484, + "num_input_tokens_seen": 35841936, + "step": 2227 + }, + { + "epoch": 0.15606749148476012, + "grad_norm": 5.432271480560303, + "learning_rate": 8.440790192644483e-05, + "loss": 0.9799, + "num_input_tokens_seen": 35857944, + "step": 2228 + }, + { + "epoch": 0.15613753973048938, + "grad_norm": 4.553518295288086, + "learning_rate": 8.440090367775832e-05, + "loss": 1.1077, + "num_input_tokens_seen": 35873920, + "step": 2229 + }, + { + "epoch": 0.15620758797621861, + "grad_norm": 5.924668312072754, + "learning_rate": 8.439390542907181e-05, + "loss": 1.2937, + "num_input_tokens_seen": 35888872, + "step": 2230 + }, + { + "epoch": 0.15627763622194787, + "grad_norm": 4.276167392730713, + "learning_rate": 8.43869071803853e-05, + "loss": 1.1883, + "num_input_tokens_seen": 35905256, + "step": 2231 + }, + { + "epoch": 0.1563476844676771, + "grad_norm": 3.719632863998413, + "learning_rate": 8.437990893169877e-05, + "loss": 1.0713, + "num_input_tokens_seen": 35921640, + "step": 2232 + }, + { + "epoch": 0.15641773271340637, + "grad_norm": 4.769368648529053, + "learning_rate": 8.437291068301226e-05, + "loss": 1.079, + "num_input_tokens_seen": 35936256, + "step": 2233 + }, + { + "epoch": 0.1564877809591356, + "grad_norm": 4.957282543182373, + "learning_rate": 8.436591243432575e-05, + "loss": 1.0535, + "num_input_tokens_seen": 35952640, + "step": 2234 + }, + { + "epoch": 0.15655782920486486, + "grad_norm": 4.782018661499023, + "learning_rate": 8.435891418563923e-05, + "loss": 1.0799, + "num_input_tokens_seen": 35967880, + "step": 2235 + }, + { + "epoch": 0.1566278774505941, + "grad_norm": 4.716582775115967, + "learning_rate": 8.435191593695273e-05, + "loss": 1.1388, + "num_input_tokens_seen": 35984016, + "step": 2236 + }, + { + "epoch": 0.15669792569632335, + "grad_norm": 4.36606502532959, + "learning_rate": 8.434491768826622e-05, + "loss": 0.954, + "num_input_tokens_seen": 35999904, + "step": 2237 + }, + { + "epoch": 0.15676797394205258, + "grad_norm": 3.8300321102142334, + "learning_rate": 8.433791943957969e-05, + "loss": 1.0903, + "num_input_tokens_seen": 36016216, + "step": 2238 + }, + { + "epoch": 0.15683802218778184, + "grad_norm": 3.7595677375793457, + "learning_rate": 8.433092119089317e-05, + "loss": 1.0214, + "num_input_tokens_seen": 36032600, + "step": 2239 + }, + { + "epoch": 0.15690807043351107, + "grad_norm": 4.783555030822754, + "learning_rate": 8.432392294220665e-05, + "loss": 1.1621, + "num_input_tokens_seen": 36048984, + "step": 2240 + }, + { + "epoch": 0.15697811867924033, + "grad_norm": 4.393221855163574, + "learning_rate": 8.431692469352014e-05, + "loss": 1.2196, + "num_input_tokens_seen": 36065368, + "step": 2241 + }, + { + "epoch": 0.15704816692496956, + "grad_norm": 3.8634722232818604, + "learning_rate": 8.430992644483363e-05, + "loss": 1.0227, + "num_input_tokens_seen": 36081752, + "step": 2242 + }, + { + "epoch": 0.15711821517069882, + "grad_norm": 4.5091233253479, + "learning_rate": 8.430292819614712e-05, + "loss": 0.9261, + "num_input_tokens_seen": 36097672, + "step": 2243 + }, + { + "epoch": 0.15718826341642805, + "grad_norm": 3.89699387550354, + "learning_rate": 8.429592994746061e-05, + "loss": 1.0023, + "num_input_tokens_seen": 36114048, + "step": 2244 + }, + { + "epoch": 0.1572583116621573, + "grad_norm": 3.8859546184539795, + "learning_rate": 8.428893169877408e-05, + "loss": 0.9597, + "num_input_tokens_seen": 36130024, + "step": 2245 + }, + { + "epoch": 0.15732835990788654, + "grad_norm": 4.236848831176758, + "learning_rate": 8.428193345008757e-05, + "loss": 1.1777, + "num_input_tokens_seen": 36146408, + "step": 2246 + }, + { + "epoch": 0.1573984081536158, + "grad_norm": 6.742307662963867, + "learning_rate": 8.427493520140105e-05, + "loss": 0.9674, + "num_input_tokens_seen": 36161440, + "step": 2247 + }, + { + "epoch": 0.15746845639934504, + "grad_norm": 3.332416534423828, + "learning_rate": 8.426793695271454e-05, + "loss": 0.7694, + "num_input_tokens_seen": 36177824, + "step": 2248 + }, + { + "epoch": 0.1575385046450743, + "grad_norm": 4.672734260559082, + "learning_rate": 8.426093870402802e-05, + "loss": 0.9228, + "num_input_tokens_seen": 36193320, + "step": 2249 + }, + { + "epoch": 0.15760855289080353, + "grad_norm": 4.437155246734619, + "learning_rate": 8.425394045534151e-05, + "loss": 1.2712, + "num_input_tokens_seen": 36209704, + "step": 2250 + }, + { + "epoch": 0.1576786011365328, + "grad_norm": 4.112512111663818, + "learning_rate": 8.4246942206655e-05, + "loss": 1.3494, + "num_input_tokens_seen": 36226088, + "step": 2251 + }, + { + "epoch": 0.15774864938226205, + "grad_norm": 4.432194709777832, + "learning_rate": 8.423994395796848e-05, + "loss": 1.1303, + "num_input_tokens_seen": 36242472, + "step": 2252 + }, + { + "epoch": 0.15781869762799128, + "grad_norm": 4.322375297546387, + "learning_rate": 8.423294570928197e-05, + "loss": 1.084, + "num_input_tokens_seen": 36258680, + "step": 2253 + }, + { + "epoch": 0.15788874587372054, + "grad_norm": 3.848836660385132, + "learning_rate": 8.422594746059545e-05, + "loss": 1.2057, + "num_input_tokens_seen": 36274512, + "step": 2254 + }, + { + "epoch": 0.15795879411944977, + "grad_norm": 4.022729396820068, + "learning_rate": 8.421894921190893e-05, + "loss": 1.0584, + "num_input_tokens_seen": 36289568, + "step": 2255 + }, + { + "epoch": 0.15802884236517903, + "grad_norm": 3.8060622215270996, + "learning_rate": 8.421195096322243e-05, + "loss": 1.1144, + "num_input_tokens_seen": 36305256, + "step": 2256 + }, + { + "epoch": 0.15809889061090826, + "grad_norm": 4.685004234313965, + "learning_rate": 8.42049527145359e-05, + "loss": 1.1341, + "num_input_tokens_seen": 36321008, + "step": 2257 + }, + { + "epoch": 0.15816893885663752, + "grad_norm": 3.4483463764190674, + "learning_rate": 8.41979544658494e-05, + "loss": 0.9563, + "num_input_tokens_seen": 36337000, + "step": 2258 + }, + { + "epoch": 0.15823898710236675, + "grad_norm": 3.7172203063964844, + "learning_rate": 8.419095621716287e-05, + "loss": 1.1463, + "num_input_tokens_seen": 36353160, + "step": 2259 + }, + { + "epoch": 0.158309035348096, + "grad_norm": 5.734589099884033, + "learning_rate": 8.418395796847636e-05, + "loss": 0.9321, + "num_input_tokens_seen": 36369248, + "step": 2260 + }, + { + "epoch": 0.15837908359382524, + "grad_norm": 4.060257911682129, + "learning_rate": 8.417695971978985e-05, + "loss": 1.2162, + "num_input_tokens_seen": 36384736, + "step": 2261 + }, + { + "epoch": 0.1584491318395545, + "grad_norm": 5.240515232086182, + "learning_rate": 8.416996147110334e-05, + "loss": 0.9652, + "num_input_tokens_seen": 36401120, + "step": 2262 + }, + { + "epoch": 0.15851918008528373, + "grad_norm": 5.482649803161621, + "learning_rate": 8.416296322241682e-05, + "loss": 1.207, + "num_input_tokens_seen": 36417504, + "step": 2263 + }, + { + "epoch": 0.158589228331013, + "grad_norm": 3.9862253665924072, + "learning_rate": 8.415596497373031e-05, + "loss": 1.1354, + "num_input_tokens_seen": 36433888, + "step": 2264 + }, + { + "epoch": 0.15865927657674223, + "grad_norm": 6.322808742523193, + "learning_rate": 8.414896672504379e-05, + "loss": 1.1144, + "num_input_tokens_seen": 36449552, + "step": 2265 + }, + { + "epoch": 0.15872932482247148, + "grad_norm": 4.312921524047852, + "learning_rate": 8.414196847635726e-05, + "loss": 1.1254, + "num_input_tokens_seen": 36465936, + "step": 2266 + }, + { + "epoch": 0.15879937306820072, + "grad_norm": 4.178677082061768, + "learning_rate": 8.413497022767075e-05, + "loss": 1.2539, + "num_input_tokens_seen": 36482184, + "step": 2267 + }, + { + "epoch": 0.15886942131392998, + "grad_norm": 4.304810523986816, + "learning_rate": 8.412797197898424e-05, + "loss": 1.199, + "num_input_tokens_seen": 36498320, + "step": 2268 + }, + { + "epoch": 0.1589394695596592, + "grad_norm": 3.723483085632324, + "learning_rate": 8.412097373029773e-05, + "loss": 1.0335, + "num_input_tokens_seen": 36514704, + "step": 2269 + }, + { + "epoch": 0.15900951780538847, + "grad_norm": 4.285789489746094, + "learning_rate": 8.411397548161122e-05, + "loss": 1.2463, + "num_input_tokens_seen": 36531032, + "step": 2270 + }, + { + "epoch": 0.1590795660511177, + "grad_norm": 3.5788466930389404, + "learning_rate": 8.41069772329247e-05, + "loss": 0.7809, + "num_input_tokens_seen": 36547416, + "step": 2271 + }, + { + "epoch": 0.15914961429684696, + "grad_norm": 5.785874366760254, + "learning_rate": 8.409997898423818e-05, + "loss": 1.2832, + "num_input_tokens_seen": 36563800, + "step": 2272 + }, + { + "epoch": 0.1592196625425762, + "grad_norm": 3.914402723312378, + "learning_rate": 8.409298073555167e-05, + "loss": 1.2065, + "num_input_tokens_seen": 36580184, + "step": 2273 + }, + { + "epoch": 0.15928971078830545, + "grad_norm": 3.878512144088745, + "learning_rate": 8.408598248686514e-05, + "loss": 1.1457, + "num_input_tokens_seen": 36596568, + "step": 2274 + }, + { + "epoch": 0.15935975903403468, + "grad_norm": 4.195454120635986, + "learning_rate": 8.407898423817863e-05, + "loss": 1.2628, + "num_input_tokens_seen": 36612952, + "step": 2275 + }, + { + "epoch": 0.15942980727976394, + "grad_norm": 3.847649097442627, + "learning_rate": 8.407198598949212e-05, + "loss": 1.0678, + "num_input_tokens_seen": 36628752, + "step": 2276 + }, + { + "epoch": 0.15949985552549317, + "grad_norm": 5.284397125244141, + "learning_rate": 8.406498774080561e-05, + "loss": 1.0508, + "num_input_tokens_seen": 36645136, + "step": 2277 + }, + { + "epoch": 0.15956990377122243, + "grad_norm": 4.10982084274292, + "learning_rate": 8.40579894921191e-05, + "loss": 1.0558, + "num_input_tokens_seen": 36661392, + "step": 2278 + }, + { + "epoch": 0.15963995201695166, + "grad_norm": 3.8282828330993652, + "learning_rate": 8.405099124343257e-05, + "loss": 1.1064, + "num_input_tokens_seen": 36676856, + "step": 2279 + }, + { + "epoch": 0.15971000026268092, + "grad_norm": 4.115365028381348, + "learning_rate": 8.404399299474606e-05, + "loss": 1.0081, + "num_input_tokens_seen": 36693080, + "step": 2280 + }, + { + "epoch": 0.15978004850841016, + "grad_norm": 3.6131088733673096, + "learning_rate": 8.403699474605955e-05, + "loss": 0.8565, + "num_input_tokens_seen": 36709440, + "step": 2281 + }, + { + "epoch": 0.15985009675413941, + "grad_norm": 3.83146071434021, + "learning_rate": 8.402999649737304e-05, + "loss": 1.0762, + "num_input_tokens_seen": 36725496, + "step": 2282 + }, + { + "epoch": 0.15992014499986865, + "grad_norm": 3.8456339836120605, + "learning_rate": 8.402299824868653e-05, + "loss": 1.053, + "num_input_tokens_seen": 36741544, + "step": 2283 + }, + { + "epoch": 0.1599901932455979, + "grad_norm": 3.717014789581299, + "learning_rate": 8.4016e-05, + "loss": 1.0053, + "num_input_tokens_seen": 36757928, + "step": 2284 + }, + { + "epoch": 0.16006024149132717, + "grad_norm": 4.3730854988098145, + "learning_rate": 8.400900175131349e-05, + "loss": 1.1639, + "num_input_tokens_seen": 36774144, + "step": 2285 + }, + { + "epoch": 0.1601302897370564, + "grad_norm": 3.6635241508483887, + "learning_rate": 8.400200350262697e-05, + "loss": 0.9721, + "num_input_tokens_seen": 36790248, + "step": 2286 + }, + { + "epoch": 0.16020033798278566, + "grad_norm": 3.9058330059051514, + "learning_rate": 8.399500525394046e-05, + "loss": 1.0814, + "num_input_tokens_seen": 36806632, + "step": 2287 + }, + { + "epoch": 0.1602703862285149, + "grad_norm": 3.60127854347229, + "learning_rate": 8.398800700525394e-05, + "loss": 1.1541, + "num_input_tokens_seen": 36823016, + "step": 2288 + }, + { + "epoch": 0.16034043447424415, + "grad_norm": 5.762889385223389, + "learning_rate": 8.398100875656743e-05, + "loss": 0.9572, + "num_input_tokens_seen": 36838576, + "step": 2289 + }, + { + "epoch": 0.16041048271997338, + "grad_norm": 3.495436191558838, + "learning_rate": 8.397401050788092e-05, + "loss": 1.0156, + "num_input_tokens_seen": 36854960, + "step": 2290 + }, + { + "epoch": 0.16048053096570264, + "grad_norm": 4.083384037017822, + "learning_rate": 8.396701225919441e-05, + "loss": 1.1724, + "num_input_tokens_seen": 36870672, + "step": 2291 + }, + { + "epoch": 0.16055057921143187, + "grad_norm": 3.7010245323181152, + "learning_rate": 8.396001401050788e-05, + "loss": 0.8871, + "num_input_tokens_seen": 36887056, + "step": 2292 + }, + { + "epoch": 0.16062062745716113, + "grad_norm": 3.419485330581665, + "learning_rate": 8.395301576182136e-05, + "loss": 0.9586, + "num_input_tokens_seen": 36903144, + "step": 2293 + }, + { + "epoch": 0.16069067570289036, + "grad_norm": 3.593970537185669, + "learning_rate": 8.394601751313485e-05, + "loss": 1.0109, + "num_input_tokens_seen": 36919192, + "step": 2294 + }, + { + "epoch": 0.16076072394861962, + "grad_norm": 3.729038953781128, + "learning_rate": 8.393901926444834e-05, + "loss": 1.288, + "num_input_tokens_seen": 36935576, + "step": 2295 + }, + { + "epoch": 0.16083077219434885, + "grad_norm": 3.60687255859375, + "learning_rate": 8.393202101576183e-05, + "loss": 0.9423, + "num_input_tokens_seen": 36951960, + "step": 2296 + }, + { + "epoch": 0.1609008204400781, + "grad_norm": 3.4520435333251953, + "learning_rate": 8.392502276707531e-05, + "loss": 0.9515, + "num_input_tokens_seen": 36968344, + "step": 2297 + }, + { + "epoch": 0.16097086868580734, + "grad_norm": 3.71907639503479, + "learning_rate": 8.39180245183888e-05, + "loss": 1.1141, + "num_input_tokens_seen": 36984440, + "step": 2298 + }, + { + "epoch": 0.1610409169315366, + "grad_norm": 3.8897864818573, + "learning_rate": 8.391102626970228e-05, + "loss": 1.1124, + "num_input_tokens_seen": 37000824, + "step": 2299 + }, + { + "epoch": 0.16111096517726584, + "grad_norm": 3.579921245574951, + "learning_rate": 8.390402802101577e-05, + "loss": 1.0998, + "num_input_tokens_seen": 37017088, + "step": 2300 + }, + { + "epoch": 0.1611810134229951, + "grad_norm": 3.9658427238464355, + "learning_rate": 8.389702977232924e-05, + "loss": 1.034, + "num_input_tokens_seen": 37033232, + "step": 2301 + }, + { + "epoch": 0.16125106166872433, + "grad_norm": 4.2862725257873535, + "learning_rate": 8.389003152364274e-05, + "loss": 0.9662, + "num_input_tokens_seen": 37049616, + "step": 2302 + }, + { + "epoch": 0.1613211099144536, + "grad_norm": 3.7523694038391113, + "learning_rate": 8.388303327495622e-05, + "loss": 1.0806, + "num_input_tokens_seen": 37065784, + "step": 2303 + }, + { + "epoch": 0.16139115816018282, + "grad_norm": 3.9068679809570312, + "learning_rate": 8.387603502626971e-05, + "loss": 1.0985, + "num_input_tokens_seen": 37082168, + "step": 2304 + }, + { + "epoch": 0.16146120640591208, + "grad_norm": 5.876891613006592, + "learning_rate": 8.38690367775832e-05, + "loss": 1.2938, + "num_input_tokens_seen": 37097072, + "step": 2305 + }, + { + "epoch": 0.1615312546516413, + "grad_norm": 4.040335655212402, + "learning_rate": 8.386203852889667e-05, + "loss": 1.1376, + "num_input_tokens_seen": 37112936, + "step": 2306 + }, + { + "epoch": 0.16160130289737057, + "grad_norm": 3.566763401031494, + "learning_rate": 8.385504028021016e-05, + "loss": 0.9164, + "num_input_tokens_seen": 37129320, + "step": 2307 + }, + { + "epoch": 0.1616713511430998, + "grad_norm": 3.7780325412750244, + "learning_rate": 8.384804203152365e-05, + "loss": 0.9541, + "num_input_tokens_seen": 37144832, + "step": 2308 + }, + { + "epoch": 0.16174139938882906, + "grad_norm": 4.291510105133057, + "learning_rate": 8.384104378283714e-05, + "loss": 1.2579, + "num_input_tokens_seen": 37160312, + "step": 2309 + }, + { + "epoch": 0.1618114476345583, + "grad_norm": 3.721531629562378, + "learning_rate": 8.383404553415063e-05, + "loss": 1.0108, + "num_input_tokens_seen": 37176696, + "step": 2310 + }, + { + "epoch": 0.16188149588028755, + "grad_norm": 3.883301258087158, + "learning_rate": 8.38270472854641e-05, + "loss": 1.17, + "num_input_tokens_seen": 37192632, + "step": 2311 + }, + { + "epoch": 0.16195154412601678, + "grad_norm": 4.240591049194336, + "learning_rate": 8.382004903677759e-05, + "loss": 1.2263, + "num_input_tokens_seen": 37208448, + "step": 2312 + }, + { + "epoch": 0.16202159237174604, + "grad_norm": 4.711728572845459, + "learning_rate": 8.381305078809106e-05, + "loss": 1.1743, + "num_input_tokens_seen": 37223176, + "step": 2313 + }, + { + "epoch": 0.16209164061747527, + "grad_norm": 4.733399391174316, + "learning_rate": 8.380605253940455e-05, + "loss": 1.2512, + "num_input_tokens_seen": 37239560, + "step": 2314 + }, + { + "epoch": 0.16216168886320453, + "grad_norm": 5.842257976531982, + "learning_rate": 8.379905429071804e-05, + "loss": 1.029, + "num_input_tokens_seen": 37255536, + "step": 2315 + }, + { + "epoch": 0.16223173710893377, + "grad_norm": 3.9891135692596436, + "learning_rate": 8.379205604203153e-05, + "loss": 1.0767, + "num_input_tokens_seen": 37271920, + "step": 2316 + }, + { + "epoch": 0.16230178535466303, + "grad_norm": 3.3596630096435547, + "learning_rate": 8.378505779334502e-05, + "loss": 0.8963, + "num_input_tokens_seen": 37288024, + "step": 2317 + }, + { + "epoch": 0.16237183360039226, + "grad_norm": 4.346104621887207, + "learning_rate": 8.377805954465851e-05, + "loss": 1.0947, + "num_input_tokens_seen": 37304264, + "step": 2318 + }, + { + "epoch": 0.16244188184612152, + "grad_norm": 3.5524039268493652, + "learning_rate": 8.377106129597198e-05, + "loss": 0.9435, + "num_input_tokens_seen": 37320648, + "step": 2319 + }, + { + "epoch": 0.16251193009185078, + "grad_norm": 4.335781574249268, + "learning_rate": 8.376406304728546e-05, + "loss": 0.9151, + "num_input_tokens_seen": 37336104, + "step": 2320 + }, + { + "epoch": 0.16258197833758, + "grad_norm": 3.7356534004211426, + "learning_rate": 8.375706479859895e-05, + "loss": 1.0195, + "num_input_tokens_seen": 37352488, + "step": 2321 + }, + { + "epoch": 0.16265202658330927, + "grad_norm": 3.842710494995117, + "learning_rate": 8.375006654991243e-05, + "loss": 1.0543, + "num_input_tokens_seen": 37368872, + "step": 2322 + }, + { + "epoch": 0.1627220748290385, + "grad_norm": 3.9485390186309814, + "learning_rate": 8.374306830122592e-05, + "loss": 1.2149, + "num_input_tokens_seen": 37385256, + "step": 2323 + }, + { + "epoch": 0.16279212307476776, + "grad_norm": 3.9196622371673584, + "learning_rate": 8.373607005253941e-05, + "loss": 1.0907, + "num_input_tokens_seen": 37401224, + "step": 2324 + }, + { + "epoch": 0.162862171320497, + "grad_norm": 4.2444844245910645, + "learning_rate": 8.37290718038529e-05, + "loss": 1.0201, + "num_input_tokens_seen": 37417016, + "step": 2325 + }, + { + "epoch": 0.16293221956622625, + "grad_norm": 3.974438190460205, + "learning_rate": 8.372207355516638e-05, + "loss": 1.0733, + "num_input_tokens_seen": 37433400, + "step": 2326 + }, + { + "epoch": 0.16300226781195548, + "grad_norm": 3.833350658416748, + "learning_rate": 8.371507530647986e-05, + "loss": 1.1536, + "num_input_tokens_seen": 37449784, + "step": 2327 + }, + { + "epoch": 0.16307231605768474, + "grad_norm": 4.566055774688721, + "learning_rate": 8.370807705779335e-05, + "loss": 1.093, + "num_input_tokens_seen": 37465720, + "step": 2328 + }, + { + "epoch": 0.16314236430341397, + "grad_norm": 3.455068588256836, + "learning_rate": 8.370107880910684e-05, + "loss": 0.9396, + "num_input_tokens_seen": 37482104, + "step": 2329 + }, + { + "epoch": 0.16321241254914323, + "grad_norm": 4.584096908569336, + "learning_rate": 8.369408056042032e-05, + "loss": 1.0109, + "num_input_tokens_seen": 37498488, + "step": 2330 + }, + { + "epoch": 0.16328246079487246, + "grad_norm": 4.0225958824157715, + "learning_rate": 8.36870823117338e-05, + "loss": 1.1507, + "num_input_tokens_seen": 37514264, + "step": 2331 + }, + { + "epoch": 0.16335250904060172, + "grad_norm": 5.311272144317627, + "learning_rate": 8.368008406304729e-05, + "loss": 1.2248, + "num_input_tokens_seen": 37529280, + "step": 2332 + }, + { + "epoch": 0.16342255728633096, + "grad_norm": 3.752720594406128, + "learning_rate": 8.367308581436077e-05, + "loss": 0.964, + "num_input_tokens_seen": 37545664, + "step": 2333 + }, + { + "epoch": 0.16349260553206021, + "grad_norm": 3.8337442874908447, + "learning_rate": 8.366608756567426e-05, + "loss": 1.1928, + "num_input_tokens_seen": 37562048, + "step": 2334 + }, + { + "epoch": 0.16356265377778945, + "grad_norm": 3.818251132965088, + "learning_rate": 8.365908931698775e-05, + "loss": 1.0032, + "num_input_tokens_seen": 37577848, + "step": 2335 + }, + { + "epoch": 0.1636327020235187, + "grad_norm": 3.7170960903167725, + "learning_rate": 8.365209106830123e-05, + "loss": 1.2297, + "num_input_tokens_seen": 37594232, + "step": 2336 + }, + { + "epoch": 0.16370275026924794, + "grad_norm": 3.984950304031372, + "learning_rate": 8.364509281961472e-05, + "loss": 1.0744, + "num_input_tokens_seen": 37610248, + "step": 2337 + }, + { + "epoch": 0.1637727985149772, + "grad_norm": 3.4384636878967285, + "learning_rate": 8.36380945709282e-05, + "loss": 1.0015, + "num_input_tokens_seen": 37626632, + "step": 2338 + }, + { + "epoch": 0.16384284676070643, + "grad_norm": 3.952625274658203, + "learning_rate": 8.363109632224169e-05, + "loss": 1.1604, + "num_input_tokens_seen": 37643016, + "step": 2339 + }, + { + "epoch": 0.1639128950064357, + "grad_norm": 3.7193119525909424, + "learning_rate": 8.362409807355516e-05, + "loss": 0.9054, + "num_input_tokens_seen": 37658216, + "step": 2340 + }, + { + "epoch": 0.16398294325216492, + "grad_norm": 3.977997303009033, + "learning_rate": 8.361709982486865e-05, + "loss": 1.2904, + "num_input_tokens_seen": 37674600, + "step": 2341 + }, + { + "epoch": 0.16405299149789418, + "grad_norm": 5.108094215393066, + "learning_rate": 8.361010157618214e-05, + "loss": 1.0664, + "num_input_tokens_seen": 37690184, + "step": 2342 + }, + { + "epoch": 0.1641230397436234, + "grad_norm": 4.881065845489502, + "learning_rate": 8.360310332749563e-05, + "loss": 1.0787, + "num_input_tokens_seen": 37705352, + "step": 2343 + }, + { + "epoch": 0.16419308798935267, + "grad_norm": 4.128891468048096, + "learning_rate": 8.359610507880912e-05, + "loss": 0.8745, + "num_input_tokens_seen": 37721736, + "step": 2344 + }, + { + "epoch": 0.1642631362350819, + "grad_norm": 4.006495475769043, + "learning_rate": 8.35891068301226e-05, + "loss": 0.9992, + "num_input_tokens_seen": 37738120, + "step": 2345 + }, + { + "epoch": 0.16433318448081116, + "grad_norm": 3.877427101135254, + "learning_rate": 8.358210858143608e-05, + "loss": 0.9334, + "num_input_tokens_seen": 37754504, + "step": 2346 + }, + { + "epoch": 0.1644032327265404, + "grad_norm": 3.7013916969299316, + "learning_rate": 8.357511033274955e-05, + "loss": 1.069, + "num_input_tokens_seen": 37770792, + "step": 2347 + }, + { + "epoch": 0.16447328097226965, + "grad_norm": 3.675049066543579, + "learning_rate": 8.356811208406304e-05, + "loss": 0.9863, + "num_input_tokens_seen": 37786800, + "step": 2348 + }, + { + "epoch": 0.16454332921799888, + "grad_norm": 4.831826210021973, + "learning_rate": 8.356111383537654e-05, + "loss": 0.9077, + "num_input_tokens_seen": 37801760, + "step": 2349 + }, + { + "epoch": 0.16461337746372814, + "grad_norm": 4.207952499389648, + "learning_rate": 8.355411558669002e-05, + "loss": 1.0585, + "num_input_tokens_seen": 37818144, + "step": 2350 + }, + { + "epoch": 0.16468342570945738, + "grad_norm": 3.9083497524261475, + "learning_rate": 8.354711733800351e-05, + "loss": 1.1437, + "num_input_tokens_seen": 37833896, + "step": 2351 + }, + { + "epoch": 0.16475347395518664, + "grad_norm": 4.307275295257568, + "learning_rate": 8.3540119089317e-05, + "loss": 1.0692, + "num_input_tokens_seen": 37850280, + "step": 2352 + }, + { + "epoch": 0.16482352220091587, + "grad_norm": 3.9434409141540527, + "learning_rate": 8.353312084063047e-05, + "loss": 0.9842, + "num_input_tokens_seen": 37866664, + "step": 2353 + }, + { + "epoch": 0.16489357044664513, + "grad_norm": 4.162476539611816, + "learning_rate": 8.352612259194396e-05, + "loss": 1.2, + "num_input_tokens_seen": 37883048, + "step": 2354 + }, + { + "epoch": 0.1649636186923744, + "grad_norm": 4.3073506355285645, + "learning_rate": 8.351912434325745e-05, + "loss": 1.2625, + "num_input_tokens_seen": 37899264, + "step": 2355 + }, + { + "epoch": 0.16503366693810362, + "grad_norm": 3.9900870323181152, + "learning_rate": 8.351212609457094e-05, + "loss": 1.079, + "num_input_tokens_seen": 37915648, + "step": 2356 + }, + { + "epoch": 0.16510371518383288, + "grad_norm": 3.599282741546631, + "learning_rate": 8.350512784588441e-05, + "loss": 0.9226, + "num_input_tokens_seen": 37932032, + "step": 2357 + }, + { + "epoch": 0.1651737634295621, + "grad_norm": 3.796546697616577, + "learning_rate": 8.34981295971979e-05, + "loss": 0.9095, + "num_input_tokens_seen": 37948416, + "step": 2358 + }, + { + "epoch": 0.16524381167529137, + "grad_norm": 4.0810017585754395, + "learning_rate": 8.349113134851139e-05, + "loss": 0.9083, + "num_input_tokens_seen": 37964072, + "step": 2359 + }, + { + "epoch": 0.1653138599210206, + "grad_norm": 4.155765533447266, + "learning_rate": 8.348413309982487e-05, + "loss": 1.1827, + "num_input_tokens_seen": 37980320, + "step": 2360 + }, + { + "epoch": 0.16538390816674986, + "grad_norm": 4.131893634796143, + "learning_rate": 8.347713485113835e-05, + "loss": 1.1245, + "num_input_tokens_seen": 37995872, + "step": 2361 + }, + { + "epoch": 0.1654539564124791, + "grad_norm": 4.266848564147949, + "learning_rate": 8.347013660245184e-05, + "loss": 1.1084, + "num_input_tokens_seen": 38011856, + "step": 2362 + }, + { + "epoch": 0.16552400465820835, + "grad_norm": 3.8229875564575195, + "learning_rate": 8.346313835376533e-05, + "loss": 1.0592, + "num_input_tokens_seen": 38028080, + "step": 2363 + }, + { + "epoch": 0.16559405290393758, + "grad_norm": 4.0808234214782715, + "learning_rate": 8.345614010507882e-05, + "loss": 0.9667, + "num_input_tokens_seen": 38043992, + "step": 2364 + }, + { + "epoch": 0.16566410114966684, + "grad_norm": 4.470417022705078, + "learning_rate": 8.34491418563923e-05, + "loss": 1.2859, + "num_input_tokens_seen": 38059848, + "step": 2365 + }, + { + "epoch": 0.16573414939539607, + "grad_norm": 3.459963798522949, + "learning_rate": 8.344214360770578e-05, + "loss": 1.0801, + "num_input_tokens_seen": 38076232, + "step": 2366 + }, + { + "epoch": 0.16580419764112533, + "grad_norm": 3.6845312118530273, + "learning_rate": 8.343514535901926e-05, + "loss": 1.1277, + "num_input_tokens_seen": 38092616, + "step": 2367 + }, + { + "epoch": 0.16587424588685457, + "grad_norm": 3.683866500854492, + "learning_rate": 8.342814711033275e-05, + "loss": 1.0821, + "num_input_tokens_seen": 38108880, + "step": 2368 + }, + { + "epoch": 0.16594429413258382, + "grad_norm": 4.3266191482543945, + "learning_rate": 8.342114886164625e-05, + "loss": 1.1432, + "num_input_tokens_seen": 38125264, + "step": 2369 + }, + { + "epoch": 0.16601434237831306, + "grad_norm": 3.9031660556793213, + "learning_rate": 8.341415061295972e-05, + "loss": 1.0378, + "num_input_tokens_seen": 38141648, + "step": 2370 + }, + { + "epoch": 0.16608439062404232, + "grad_norm": 5.415440082550049, + "learning_rate": 8.340715236427321e-05, + "loss": 1.2011, + "num_input_tokens_seen": 38157328, + "step": 2371 + }, + { + "epoch": 0.16615443886977155, + "grad_norm": 4.017500877380371, + "learning_rate": 8.34001541155867e-05, + "loss": 1.0771, + "num_input_tokens_seen": 38173096, + "step": 2372 + }, + { + "epoch": 0.1662244871155008, + "grad_norm": 3.855212926864624, + "learning_rate": 8.339315586690018e-05, + "loss": 1.173, + "num_input_tokens_seen": 38189480, + "step": 2373 + }, + { + "epoch": 0.16629453536123004, + "grad_norm": 3.8502743244171143, + "learning_rate": 8.338615761821365e-05, + "loss": 1.0241, + "num_input_tokens_seen": 38205416, + "step": 2374 + }, + { + "epoch": 0.1663645836069593, + "grad_norm": 6.8746867179870605, + "learning_rate": 8.337915936952715e-05, + "loss": 1.0459, + "num_input_tokens_seen": 38221800, + "step": 2375 + }, + { + "epoch": 0.16643463185268853, + "grad_norm": 3.9708571434020996, + "learning_rate": 8.337216112084064e-05, + "loss": 0.9832, + "num_input_tokens_seen": 38237208, + "step": 2376 + }, + { + "epoch": 0.1665046800984178, + "grad_norm": 4.927229404449463, + "learning_rate": 8.336516287215412e-05, + "loss": 1.1103, + "num_input_tokens_seen": 38253592, + "step": 2377 + }, + { + "epoch": 0.16657472834414702, + "grad_norm": 3.9976963996887207, + "learning_rate": 8.33581646234676e-05, + "loss": 1.1451, + "num_input_tokens_seen": 38269184, + "step": 2378 + }, + { + "epoch": 0.16664477658987628, + "grad_norm": 3.680177927017212, + "learning_rate": 8.33511663747811e-05, + "loss": 1.0602, + "num_input_tokens_seen": 38285568, + "step": 2379 + }, + { + "epoch": 0.1667148248356055, + "grad_norm": 3.768069267272949, + "learning_rate": 8.334416812609457e-05, + "loss": 1.0822, + "num_input_tokens_seen": 38301952, + "step": 2380 + }, + { + "epoch": 0.16678487308133477, + "grad_norm": 4.554010391235352, + "learning_rate": 8.333716987740806e-05, + "loss": 1.3037, + "num_input_tokens_seen": 38318336, + "step": 2381 + }, + { + "epoch": 0.166854921327064, + "grad_norm": 3.6799368858337402, + "learning_rate": 8.333017162872155e-05, + "loss": 1.0152, + "num_input_tokens_seen": 38333544, + "step": 2382 + }, + { + "epoch": 0.16692496957279326, + "grad_norm": 3.5584356784820557, + "learning_rate": 8.332317338003503e-05, + "loss": 0.9617, + "num_input_tokens_seen": 38349632, + "step": 2383 + }, + { + "epoch": 0.1669950178185225, + "grad_norm": 5.978849411010742, + "learning_rate": 8.331617513134851e-05, + "loss": 0.9975, + "num_input_tokens_seen": 38364872, + "step": 2384 + }, + { + "epoch": 0.16706506606425175, + "grad_norm": 4.641121864318848, + "learning_rate": 8.3309176882662e-05, + "loss": 1.0021, + "num_input_tokens_seen": 38379800, + "step": 2385 + }, + { + "epoch": 0.167135114309981, + "grad_norm": 3.895772695541382, + "learning_rate": 8.330217863397549e-05, + "loss": 1.1187, + "num_input_tokens_seen": 38395744, + "step": 2386 + }, + { + "epoch": 0.16720516255571025, + "grad_norm": 3.48437762260437, + "learning_rate": 8.329518038528896e-05, + "loss": 1.0527, + "num_input_tokens_seen": 38412056, + "step": 2387 + }, + { + "epoch": 0.16727521080143948, + "grad_norm": 4.2831549644470215, + "learning_rate": 8.328818213660245e-05, + "loss": 0.8967, + "num_input_tokens_seen": 38426768, + "step": 2388 + }, + { + "epoch": 0.16734525904716874, + "grad_norm": 3.7090001106262207, + "learning_rate": 8.328118388791595e-05, + "loss": 0.9903, + "num_input_tokens_seen": 38442296, + "step": 2389 + }, + { + "epoch": 0.167415307292898, + "grad_norm": 4.253223896026611, + "learning_rate": 8.327418563922943e-05, + "loss": 1.0169, + "num_input_tokens_seen": 38458664, + "step": 2390 + }, + { + "epoch": 0.16748535553862723, + "grad_norm": 4.919910907745361, + "learning_rate": 8.326718739054292e-05, + "loss": 1.233, + "num_input_tokens_seen": 38475048, + "step": 2391 + }, + { + "epoch": 0.1675554037843565, + "grad_norm": 7.881314277648926, + "learning_rate": 8.326018914185639e-05, + "loss": 1.1, + "num_input_tokens_seen": 38491432, + "step": 2392 + }, + { + "epoch": 0.16762545203008572, + "grad_norm": 6.979029655456543, + "learning_rate": 8.325319089316988e-05, + "loss": 0.9189, + "num_input_tokens_seen": 38506312, + "step": 2393 + }, + { + "epoch": 0.16769550027581498, + "grad_norm": 4.141571044921875, + "learning_rate": 8.324619264448336e-05, + "loss": 1.0821, + "num_input_tokens_seen": 38522696, + "step": 2394 + }, + { + "epoch": 0.1677655485215442, + "grad_norm": 4.306760311126709, + "learning_rate": 8.323919439579686e-05, + "loss": 1.1857, + "num_input_tokens_seen": 38539080, + "step": 2395 + }, + { + "epoch": 0.16783559676727347, + "grad_norm": 4.089770793914795, + "learning_rate": 8.323219614711035e-05, + "loss": 1.0994, + "num_input_tokens_seen": 38555464, + "step": 2396 + }, + { + "epoch": 0.1679056450130027, + "grad_norm": 3.648800849914551, + "learning_rate": 8.322519789842382e-05, + "loss": 1.1015, + "num_input_tokens_seen": 38571848, + "step": 2397 + }, + { + "epoch": 0.16797569325873196, + "grad_norm": 4.310317516326904, + "learning_rate": 8.321819964973731e-05, + "loss": 1.171, + "num_input_tokens_seen": 38587616, + "step": 2398 + }, + { + "epoch": 0.1680457415044612, + "grad_norm": 5.373032093048096, + "learning_rate": 8.32112014010508e-05, + "loss": 0.9952, + "num_input_tokens_seen": 38604000, + "step": 2399 + }, + { + "epoch": 0.16811578975019045, + "grad_norm": 3.7830634117126465, + "learning_rate": 8.320420315236427e-05, + "loss": 0.9953, + "num_input_tokens_seen": 38620384, + "step": 2400 + }, + { + "epoch": 0.16811578975019045, + "eval_loss": 1.1429402828216553, + "eval_runtime": 0.205, + "eval_samples_per_second": 4.878, + "eval_steps_per_second": 4.878, + "num_input_tokens_seen": 38620384, + "step": 2400 + }, + { + "epoch": 0.16818583799591968, + "grad_norm": 6.3896684646606445, + "learning_rate": 8.319720490367776e-05, + "loss": 1.184, + "num_input_tokens_seen": 38636288, + "step": 2401 + }, + { + "epoch": 0.16825588624164894, + "grad_norm": 4.178726673126221, + "learning_rate": 8.319020665499125e-05, + "loss": 1.0362, + "num_input_tokens_seen": 38652352, + "step": 2402 + }, + { + "epoch": 0.16832593448737818, + "grad_norm": 3.7572708129882812, + "learning_rate": 8.318320840630474e-05, + "loss": 0.9756, + "num_input_tokens_seen": 38668712, + "step": 2403 + }, + { + "epoch": 0.16839598273310744, + "grad_norm": 3.688552141189575, + "learning_rate": 8.317621015761821e-05, + "loss": 1.0644, + "num_input_tokens_seen": 38685096, + "step": 2404 + }, + { + "epoch": 0.16846603097883667, + "grad_norm": 4.2040510177612305, + "learning_rate": 8.31692119089317e-05, + "loss": 1.1251, + "num_input_tokens_seen": 38701480, + "step": 2405 + }, + { + "epoch": 0.16853607922456593, + "grad_norm": 3.9412119388580322, + "learning_rate": 8.316221366024519e-05, + "loss": 1.0243, + "num_input_tokens_seen": 38716904, + "step": 2406 + }, + { + "epoch": 0.16860612747029516, + "grad_norm": 3.9538826942443848, + "learning_rate": 8.315521541155867e-05, + "loss": 1.0361, + "num_input_tokens_seen": 38733288, + "step": 2407 + }, + { + "epoch": 0.16867617571602442, + "grad_norm": 3.803135871887207, + "learning_rate": 8.314821716287215e-05, + "loss": 1.0684, + "num_input_tokens_seen": 38749672, + "step": 2408 + }, + { + "epoch": 0.16874622396175365, + "grad_norm": 4.323539733886719, + "learning_rate": 8.314121891418564e-05, + "loss": 1.1091, + "num_input_tokens_seen": 38766056, + "step": 2409 + }, + { + "epoch": 0.1688162722074829, + "grad_norm": 3.84000825881958, + "learning_rate": 8.313422066549913e-05, + "loss": 1.0052, + "num_input_tokens_seen": 38782440, + "step": 2410 + }, + { + "epoch": 0.16888632045321214, + "grad_norm": 6.76428747177124, + "learning_rate": 8.312722241681261e-05, + "loss": 1.1773, + "num_input_tokens_seen": 38798824, + "step": 2411 + }, + { + "epoch": 0.1689563686989414, + "grad_norm": 5.8638224601745605, + "learning_rate": 8.31202241681261e-05, + "loss": 0.9515, + "num_input_tokens_seen": 38815112, + "step": 2412 + }, + { + "epoch": 0.16902641694467063, + "grad_norm": 4.254051685333252, + "learning_rate": 8.311322591943958e-05, + "loss": 1.1365, + "num_input_tokens_seen": 38831192, + "step": 2413 + }, + { + "epoch": 0.1690964651903999, + "grad_norm": 3.641663074493408, + "learning_rate": 8.310622767075306e-05, + "loss": 0.9888, + "num_input_tokens_seen": 38847360, + "step": 2414 + }, + { + "epoch": 0.16916651343612912, + "grad_norm": 3.594768762588501, + "learning_rate": 8.309922942206656e-05, + "loss": 1.1156, + "num_input_tokens_seen": 38863744, + "step": 2415 + }, + { + "epoch": 0.16923656168185838, + "grad_norm": 3.6955742835998535, + "learning_rate": 8.309223117338005e-05, + "loss": 0.9514, + "num_input_tokens_seen": 38879880, + "step": 2416 + }, + { + "epoch": 0.16930660992758761, + "grad_norm": 3.64803409576416, + "learning_rate": 8.308523292469353e-05, + "loss": 1.0045, + "num_input_tokens_seen": 38896264, + "step": 2417 + }, + { + "epoch": 0.16937665817331687, + "grad_norm": 3.7921512126922607, + "learning_rate": 8.307823467600701e-05, + "loss": 1.0838, + "num_input_tokens_seen": 38912648, + "step": 2418 + }, + { + "epoch": 0.1694467064190461, + "grad_norm": 4.777346611022949, + "learning_rate": 8.307123642732049e-05, + "loss": 1.239, + "num_input_tokens_seen": 38929032, + "step": 2419 + }, + { + "epoch": 0.16951675466477537, + "grad_norm": 4.417767524719238, + "learning_rate": 8.306423817863398e-05, + "loss": 1.0101, + "num_input_tokens_seen": 38945416, + "step": 2420 + }, + { + "epoch": 0.1695868029105046, + "grad_norm": 4.257672309875488, + "learning_rate": 8.305723992994747e-05, + "loss": 0.8461, + "num_input_tokens_seen": 38961800, + "step": 2421 + }, + { + "epoch": 0.16965685115623386, + "grad_norm": 4.098975658416748, + "learning_rate": 8.305024168126095e-05, + "loss": 1.158, + "num_input_tokens_seen": 38978184, + "step": 2422 + }, + { + "epoch": 0.1697268994019631, + "grad_norm": 5.206361293792725, + "learning_rate": 8.304324343257444e-05, + "loss": 1.041, + "num_input_tokens_seen": 38994568, + "step": 2423 + }, + { + "epoch": 0.16979694764769235, + "grad_norm": 3.638395309448242, + "learning_rate": 8.303624518388792e-05, + "loss": 0.8883, + "num_input_tokens_seen": 39010136, + "step": 2424 + }, + { + "epoch": 0.1698669958934216, + "grad_norm": 3.4154045581817627, + "learning_rate": 8.30292469352014e-05, + "loss": 1.0024, + "num_input_tokens_seen": 39026520, + "step": 2425 + }, + { + "epoch": 0.16993704413915084, + "grad_norm": 3.923617362976074, + "learning_rate": 8.30222486865149e-05, + "loss": 1.1696, + "num_input_tokens_seen": 39042816, + "step": 2426 + }, + { + "epoch": 0.1700070923848801, + "grad_norm": 4.469310760498047, + "learning_rate": 8.301525043782837e-05, + "loss": 1.3424, + "num_input_tokens_seen": 39059040, + "step": 2427 + }, + { + "epoch": 0.17007714063060933, + "grad_norm": 4.111564636230469, + "learning_rate": 8.300825218914186e-05, + "loss": 0.9867, + "num_input_tokens_seen": 39074992, + "step": 2428 + }, + { + "epoch": 0.1701471888763386, + "grad_norm": 3.7809438705444336, + "learning_rate": 8.300125394045535e-05, + "loss": 0.965, + "num_input_tokens_seen": 39090840, + "step": 2429 + }, + { + "epoch": 0.17021723712206782, + "grad_norm": 3.704542875289917, + "learning_rate": 8.299425569176884e-05, + "loss": 1.1784, + "num_input_tokens_seen": 39107136, + "step": 2430 + }, + { + "epoch": 0.17028728536779708, + "grad_norm": 4.356417179107666, + "learning_rate": 8.298725744308231e-05, + "loss": 1.149, + "num_input_tokens_seen": 39123520, + "step": 2431 + }, + { + "epoch": 0.1703573336135263, + "grad_norm": 3.400228500366211, + "learning_rate": 8.29802591943958e-05, + "loss": 0.867, + "num_input_tokens_seen": 39139904, + "step": 2432 + }, + { + "epoch": 0.17042738185925557, + "grad_norm": 4.777987480163574, + "learning_rate": 8.297326094570929e-05, + "loss": 1.1159, + "num_input_tokens_seen": 39156288, + "step": 2433 + }, + { + "epoch": 0.1704974301049848, + "grad_norm": 5.600007057189941, + "learning_rate": 8.296626269702276e-05, + "loss": 0.8863, + "num_input_tokens_seen": 39171928, + "step": 2434 + }, + { + "epoch": 0.17056747835071406, + "grad_norm": 3.72717022895813, + "learning_rate": 8.295926444833627e-05, + "loss": 1.079, + "num_input_tokens_seen": 39188032, + "step": 2435 + }, + { + "epoch": 0.1706375265964433, + "grad_norm": 7.264038562774658, + "learning_rate": 8.295226619964974e-05, + "loss": 0.8546, + "num_input_tokens_seen": 39203816, + "step": 2436 + }, + { + "epoch": 0.17070757484217255, + "grad_norm": 4.103509426116943, + "learning_rate": 8.294526795096323e-05, + "loss": 1.0138, + "num_input_tokens_seen": 39220200, + "step": 2437 + }, + { + "epoch": 0.1707776230879018, + "grad_norm": 3.6456661224365234, + "learning_rate": 8.29382697022767e-05, + "loss": 0.9107, + "num_input_tokens_seen": 39236584, + "step": 2438 + }, + { + "epoch": 0.17084767133363105, + "grad_norm": 3.750075340270996, + "learning_rate": 8.293127145359019e-05, + "loss": 1.0773, + "num_input_tokens_seen": 39252968, + "step": 2439 + }, + { + "epoch": 0.17091771957936028, + "grad_norm": 4.5003581047058105, + "learning_rate": 8.292427320490368e-05, + "loss": 1.1834, + "num_input_tokens_seen": 39269192, + "step": 2440 + }, + { + "epoch": 0.17098776782508954, + "grad_norm": 4.513885498046875, + "learning_rate": 8.291727495621717e-05, + "loss": 1.0337, + "num_input_tokens_seen": 39285576, + "step": 2441 + }, + { + "epoch": 0.17105781607081877, + "grad_norm": 4.220343589782715, + "learning_rate": 8.291027670753066e-05, + "loss": 1.3044, + "num_input_tokens_seen": 39300864, + "step": 2442 + }, + { + "epoch": 0.17112786431654803, + "grad_norm": 4.986631393432617, + "learning_rate": 8.290327845884415e-05, + "loss": 1.0377, + "num_input_tokens_seen": 39317208, + "step": 2443 + }, + { + "epoch": 0.17119791256227726, + "grad_norm": 7.632670879364014, + "learning_rate": 8.289628021015762e-05, + "loss": 1.1749, + "num_input_tokens_seen": 39332392, + "step": 2444 + }, + { + "epoch": 0.17126796080800652, + "grad_norm": 3.588841199874878, + "learning_rate": 8.288928196147111e-05, + "loss": 0.8124, + "num_input_tokens_seen": 39348600, + "step": 2445 + }, + { + "epoch": 0.17133800905373575, + "grad_norm": 4.311728477478027, + "learning_rate": 8.288228371278459e-05, + "loss": 1.035, + "num_input_tokens_seen": 39364456, + "step": 2446 + }, + { + "epoch": 0.171408057299465, + "grad_norm": 6.236140251159668, + "learning_rate": 8.287528546409807e-05, + "loss": 1.1243, + "num_input_tokens_seen": 39379496, + "step": 2447 + }, + { + "epoch": 0.17147810554519424, + "grad_norm": 4.228808403015137, + "learning_rate": 8.286828721541156e-05, + "loss": 1.0185, + "num_input_tokens_seen": 39395880, + "step": 2448 + }, + { + "epoch": 0.1715481537909235, + "grad_norm": 3.873366117477417, + "learning_rate": 8.286128896672505e-05, + "loss": 0.9684, + "num_input_tokens_seen": 39412264, + "step": 2449 + }, + { + "epoch": 0.17161820203665273, + "grad_norm": 3.797846794128418, + "learning_rate": 8.285429071803854e-05, + "loss": 1.0562, + "num_input_tokens_seen": 39428648, + "step": 2450 + }, + { + "epoch": 0.171688250282382, + "grad_norm": 3.798875093460083, + "learning_rate": 8.284729246935202e-05, + "loss": 1.0409, + "num_input_tokens_seen": 39445032, + "step": 2451 + }, + { + "epoch": 0.17175829852811123, + "grad_norm": 5.118900299072266, + "learning_rate": 8.28402942206655e-05, + "loss": 1.14, + "num_input_tokens_seen": 39460168, + "step": 2452 + }, + { + "epoch": 0.17182834677384048, + "grad_norm": 4.157371520996094, + "learning_rate": 8.283329597197899e-05, + "loss": 1.1676, + "num_input_tokens_seen": 39476544, + "step": 2453 + }, + { + "epoch": 0.17189839501956972, + "grad_norm": 3.760786771774292, + "learning_rate": 8.282629772329247e-05, + "loss": 1.0482, + "num_input_tokens_seen": 39492928, + "step": 2454 + }, + { + "epoch": 0.17196844326529898, + "grad_norm": 4.252779960632324, + "learning_rate": 8.281929947460596e-05, + "loss": 1.1538, + "num_input_tokens_seen": 39509312, + "step": 2455 + }, + { + "epoch": 0.1720384915110282, + "grad_norm": 4.374740123748779, + "learning_rate": 8.281230122591944e-05, + "loss": 1.0132, + "num_input_tokens_seen": 39525696, + "step": 2456 + }, + { + "epoch": 0.17210853975675747, + "grad_norm": 4.460380554199219, + "learning_rate": 8.280530297723293e-05, + "loss": 1.1876, + "num_input_tokens_seen": 39541864, + "step": 2457 + }, + { + "epoch": 0.17217858800248673, + "grad_norm": 4.22148323059082, + "learning_rate": 8.279830472854641e-05, + "loss": 1.11, + "num_input_tokens_seen": 39557944, + "step": 2458 + }, + { + "epoch": 0.17224863624821596, + "grad_norm": 4.310081481933594, + "learning_rate": 8.27913064798599e-05, + "loss": 1.0506, + "num_input_tokens_seen": 39574328, + "step": 2459 + }, + { + "epoch": 0.17231868449394522, + "grad_norm": 4.15192174911499, + "learning_rate": 8.278430823117339e-05, + "loss": 0.7793, + "num_input_tokens_seen": 39589312, + "step": 2460 + }, + { + "epoch": 0.17238873273967445, + "grad_norm": 4.6561455726623535, + "learning_rate": 8.277730998248687e-05, + "loss": 1.2239, + "num_input_tokens_seen": 39605456, + "step": 2461 + }, + { + "epoch": 0.1724587809854037, + "grad_norm": 4.273087978363037, + "learning_rate": 8.277031173380036e-05, + "loss": 1.1436, + "num_input_tokens_seen": 39621840, + "step": 2462 + }, + { + "epoch": 0.17252882923113294, + "grad_norm": 4.575830459594727, + "learning_rate": 8.276331348511384e-05, + "loss": 1.2589, + "num_input_tokens_seen": 39638224, + "step": 2463 + }, + { + "epoch": 0.1725988774768622, + "grad_norm": 3.9122824668884277, + "learning_rate": 8.275631523642733e-05, + "loss": 1.0634, + "num_input_tokens_seen": 39654608, + "step": 2464 + }, + { + "epoch": 0.17266892572259143, + "grad_norm": 4.991362571716309, + "learning_rate": 8.27493169877408e-05, + "loss": 1.2077, + "num_input_tokens_seen": 39669824, + "step": 2465 + }, + { + "epoch": 0.1727389739683207, + "grad_norm": 4.688175678253174, + "learning_rate": 8.274231873905429e-05, + "loss": 1.0955, + "num_input_tokens_seen": 39686208, + "step": 2466 + }, + { + "epoch": 0.17280902221404992, + "grad_norm": 3.779524087905884, + "learning_rate": 8.273532049036778e-05, + "loss": 1.004, + "num_input_tokens_seen": 39702336, + "step": 2467 + }, + { + "epoch": 0.17287907045977918, + "grad_norm": 4.117679595947266, + "learning_rate": 8.272832224168127e-05, + "loss": 1.0321, + "num_input_tokens_seen": 39718232, + "step": 2468 + }, + { + "epoch": 0.17294911870550841, + "grad_norm": 3.810084819793701, + "learning_rate": 8.272132399299476e-05, + "loss": 1.0325, + "num_input_tokens_seen": 39733584, + "step": 2469 + }, + { + "epoch": 0.17301916695123767, + "grad_norm": 3.7730038166046143, + "learning_rate": 8.271432574430824e-05, + "loss": 0.9207, + "num_input_tokens_seen": 39749968, + "step": 2470 + }, + { + "epoch": 0.1730892151969669, + "grad_norm": 7.299304008483887, + "learning_rate": 8.270732749562172e-05, + "loss": 1.3425, + "num_input_tokens_seen": 39765552, + "step": 2471 + }, + { + "epoch": 0.17315926344269617, + "grad_norm": 4.079380512237549, + "learning_rate": 8.270032924693521e-05, + "loss": 1.0336, + "num_input_tokens_seen": 39781936, + "step": 2472 + }, + { + "epoch": 0.1732293116884254, + "grad_norm": 3.736607789993286, + "learning_rate": 8.269333099824868e-05, + "loss": 1.0126, + "num_input_tokens_seen": 39797688, + "step": 2473 + }, + { + "epoch": 0.17329935993415466, + "grad_norm": 5.587291240692139, + "learning_rate": 8.268633274956217e-05, + "loss": 1.1422, + "num_input_tokens_seen": 39814072, + "step": 2474 + }, + { + "epoch": 0.1733694081798839, + "grad_norm": 3.5963592529296875, + "learning_rate": 8.267933450087566e-05, + "loss": 0.9947, + "num_input_tokens_seen": 39830456, + "step": 2475 + }, + { + "epoch": 0.17343945642561315, + "grad_norm": 5.241317272186279, + "learning_rate": 8.267233625218915e-05, + "loss": 1.0661, + "num_input_tokens_seen": 39846728, + "step": 2476 + }, + { + "epoch": 0.17350950467134238, + "grad_norm": 4.194108009338379, + "learning_rate": 8.266533800350264e-05, + "loss": 1.1659, + "num_input_tokens_seen": 39863112, + "step": 2477 + }, + { + "epoch": 0.17357955291707164, + "grad_norm": 4.698538780212402, + "learning_rate": 8.265833975481611e-05, + "loss": 1.3673, + "num_input_tokens_seen": 39878624, + "step": 2478 + }, + { + "epoch": 0.17364960116280087, + "grad_norm": 5.960018634796143, + "learning_rate": 8.26513415061296e-05, + "loss": 1.104, + "num_input_tokens_seen": 39894944, + "step": 2479 + }, + { + "epoch": 0.17371964940853013, + "grad_norm": 4.386090278625488, + "learning_rate": 8.264434325744309e-05, + "loss": 1.1284, + "num_input_tokens_seen": 39911040, + "step": 2480 + }, + { + "epoch": 0.17378969765425936, + "grad_norm": 3.7272467613220215, + "learning_rate": 8.263734500875658e-05, + "loss": 1.1066, + "num_input_tokens_seen": 39927408, + "step": 2481 + }, + { + "epoch": 0.17385974589998862, + "grad_norm": 4.296888828277588, + "learning_rate": 8.263034676007005e-05, + "loss": 1.1014, + "num_input_tokens_seen": 39943792, + "step": 2482 + }, + { + "epoch": 0.17392979414571785, + "grad_norm": 5.469056606292725, + "learning_rate": 8.262334851138354e-05, + "loss": 1.1672, + "num_input_tokens_seen": 39958176, + "step": 2483 + }, + { + "epoch": 0.1739998423914471, + "grad_norm": 5.6080498695373535, + "learning_rate": 8.261635026269703e-05, + "loss": 1.2713, + "num_input_tokens_seen": 39973592, + "step": 2484 + }, + { + "epoch": 0.17406989063717634, + "grad_norm": 3.6164181232452393, + "learning_rate": 8.26093520140105e-05, + "loss": 0.9019, + "num_input_tokens_seen": 39989792, + "step": 2485 + }, + { + "epoch": 0.1741399388829056, + "grad_norm": 3.757291078567505, + "learning_rate": 8.2602353765324e-05, + "loss": 1.1038, + "num_input_tokens_seen": 40005672, + "step": 2486 + }, + { + "epoch": 0.17420998712863484, + "grad_norm": 5.1490559577941895, + "learning_rate": 8.259535551663748e-05, + "loss": 1.1524, + "num_input_tokens_seen": 40021816, + "step": 2487 + }, + { + "epoch": 0.1742800353743641, + "grad_norm": 3.9055886268615723, + "learning_rate": 8.258835726795097e-05, + "loss": 1.1, + "num_input_tokens_seen": 40038200, + "step": 2488 + }, + { + "epoch": 0.17435008362009333, + "grad_norm": 5.496553897857666, + "learning_rate": 8.258135901926446e-05, + "loss": 1.3214, + "num_input_tokens_seen": 40054584, + "step": 2489 + }, + { + "epoch": 0.1744201318658226, + "grad_norm": 4.069197177886963, + "learning_rate": 8.257436077057793e-05, + "loss": 0.888, + "num_input_tokens_seen": 40070968, + "step": 2490 + }, + { + "epoch": 0.17449018011155182, + "grad_norm": 5.098565101623535, + "learning_rate": 8.256736252189142e-05, + "loss": 0.9918, + "num_input_tokens_seen": 40087352, + "step": 2491 + }, + { + "epoch": 0.17456022835728108, + "grad_norm": 4.083621025085449, + "learning_rate": 8.25603642732049e-05, + "loss": 0.9506, + "num_input_tokens_seen": 40103736, + "step": 2492 + }, + { + "epoch": 0.17463027660301034, + "grad_norm": 3.8676462173461914, + "learning_rate": 8.255336602451839e-05, + "loss": 1.0746, + "num_input_tokens_seen": 40120120, + "step": 2493 + }, + { + "epoch": 0.17470032484873957, + "grad_norm": 3.8799197673797607, + "learning_rate": 8.254636777583188e-05, + "loss": 1.0207, + "num_input_tokens_seen": 40136504, + "step": 2494 + }, + { + "epoch": 0.17477037309446883, + "grad_norm": 5.469006538391113, + "learning_rate": 8.253936952714536e-05, + "loss": 1.0081, + "num_input_tokens_seen": 40152888, + "step": 2495 + }, + { + "epoch": 0.17484042134019806, + "grad_norm": 4.163306713104248, + "learning_rate": 8.253237127845885e-05, + "loss": 1.2059, + "num_input_tokens_seen": 40169272, + "step": 2496 + }, + { + "epoch": 0.17491046958592732, + "grad_norm": 3.792062282562256, + "learning_rate": 8.252537302977234e-05, + "loss": 1.0806, + "num_input_tokens_seen": 40185656, + "step": 2497 + }, + { + "epoch": 0.17498051783165655, + "grad_norm": 3.6881046295166016, + "learning_rate": 8.251837478108582e-05, + "loss": 1.1557, + "num_input_tokens_seen": 40202040, + "step": 2498 + }, + { + "epoch": 0.1750505660773858, + "grad_norm": 3.818491220474243, + "learning_rate": 8.25113765323993e-05, + "loss": 1.2193, + "num_input_tokens_seen": 40218424, + "step": 2499 + }, + { + "epoch": 0.17512061432311504, + "grad_norm": 3.77933931350708, + "learning_rate": 8.250437828371278e-05, + "loss": 1.0508, + "num_input_tokens_seen": 40234216, + "step": 2500 + }, + { + "epoch": 0.1751906625688443, + "grad_norm": 4.106552600860596, + "learning_rate": 8.249738003502628e-05, + "loss": 0.8558, + "num_input_tokens_seen": 40250368, + "step": 2501 + }, + { + "epoch": 0.17526071081457353, + "grad_norm": 4.9382710456848145, + "learning_rate": 8.249038178633976e-05, + "loss": 1.3082, + "num_input_tokens_seen": 40266600, + "step": 2502 + }, + { + "epoch": 0.1753307590603028, + "grad_norm": 3.8894200325012207, + "learning_rate": 8.248338353765325e-05, + "loss": 1.035, + "num_input_tokens_seen": 40282984, + "step": 2503 + }, + { + "epoch": 0.17540080730603202, + "grad_norm": 3.793044328689575, + "learning_rate": 8.247638528896673e-05, + "loss": 1.0376, + "num_input_tokens_seen": 40299368, + "step": 2504 + }, + { + "epoch": 0.17547085555176128, + "grad_norm": 4.874731540679932, + "learning_rate": 8.246938704028021e-05, + "loss": 1.2598, + "num_input_tokens_seen": 40315752, + "step": 2505 + }, + { + "epoch": 0.17554090379749052, + "grad_norm": 3.908191680908203, + "learning_rate": 8.24623887915937e-05, + "loss": 0.9739, + "num_input_tokens_seen": 40332136, + "step": 2506 + }, + { + "epoch": 0.17561095204321978, + "grad_norm": 3.585002899169922, + "learning_rate": 8.245539054290719e-05, + "loss": 0.9736, + "num_input_tokens_seen": 40348520, + "step": 2507 + }, + { + "epoch": 0.175681000288949, + "grad_norm": 3.9742348194122314, + "learning_rate": 8.244839229422068e-05, + "loss": 1.0278, + "num_input_tokens_seen": 40364760, + "step": 2508 + }, + { + "epoch": 0.17575104853467827, + "grad_norm": 5.1725921630859375, + "learning_rate": 8.244139404553415e-05, + "loss": 1.1488, + "num_input_tokens_seen": 40380072, + "step": 2509 + }, + { + "epoch": 0.1758210967804075, + "grad_norm": 4.038326263427734, + "learning_rate": 8.243439579684764e-05, + "loss": 1.2252, + "num_input_tokens_seen": 40395472, + "step": 2510 + }, + { + "epoch": 0.17589114502613676, + "grad_norm": 3.7381017208099365, + "learning_rate": 8.242739754816113e-05, + "loss": 1.041, + "num_input_tokens_seen": 40411280, + "step": 2511 + }, + { + "epoch": 0.175961193271866, + "grad_norm": 4.327959060668945, + "learning_rate": 8.24203992994746e-05, + "loss": 1.0272, + "num_input_tokens_seen": 40427664, + "step": 2512 + }, + { + "epoch": 0.17603124151759525, + "grad_norm": 3.720078706741333, + "learning_rate": 8.241340105078809e-05, + "loss": 1.2306, + "num_input_tokens_seen": 40443760, + "step": 2513 + }, + { + "epoch": 0.17610128976332448, + "grad_norm": 4.0901618003845215, + "learning_rate": 8.240640280210158e-05, + "loss": 1.0098, + "num_input_tokens_seen": 40460144, + "step": 2514 + }, + { + "epoch": 0.17617133800905374, + "grad_norm": 4.013705730438232, + "learning_rate": 8.239940455341507e-05, + "loss": 1.0817, + "num_input_tokens_seen": 40476528, + "step": 2515 + }, + { + "epoch": 0.17624138625478297, + "grad_norm": 3.8833489418029785, + "learning_rate": 8.239240630472856e-05, + "loss": 1.119, + "num_input_tokens_seen": 40492768, + "step": 2516 + }, + { + "epoch": 0.17631143450051223, + "grad_norm": 7.381611347198486, + "learning_rate": 8.238540805604203e-05, + "loss": 1.3033, + "num_input_tokens_seen": 40507344, + "step": 2517 + }, + { + "epoch": 0.17638148274624146, + "grad_norm": 3.8792364597320557, + "learning_rate": 8.237840980735552e-05, + "loss": 1.1113, + "num_input_tokens_seen": 40523552, + "step": 2518 + }, + { + "epoch": 0.17645153099197072, + "grad_norm": 5.19634485244751, + "learning_rate": 8.2371411558669e-05, + "loss": 1.2186, + "num_input_tokens_seen": 40538640, + "step": 2519 + }, + { + "epoch": 0.17652157923769995, + "grad_norm": 4.081907749176025, + "learning_rate": 8.236441330998248e-05, + "loss": 1.1075, + "num_input_tokens_seen": 40555024, + "step": 2520 + }, + { + "epoch": 0.17659162748342921, + "grad_norm": 4.296377182006836, + "learning_rate": 8.235741506129599e-05, + "loss": 0.9319, + "num_input_tokens_seen": 40570480, + "step": 2521 + }, + { + "epoch": 0.17666167572915845, + "grad_norm": 4.143492221832275, + "learning_rate": 8.235041681260946e-05, + "loss": 0.964, + "num_input_tokens_seen": 40586664, + "step": 2522 + }, + { + "epoch": 0.1767317239748877, + "grad_norm": 3.9894590377807617, + "learning_rate": 8.234341856392295e-05, + "loss": 0.913, + "num_input_tokens_seen": 40603048, + "step": 2523 + }, + { + "epoch": 0.17680177222061694, + "grad_norm": 4.283662796020508, + "learning_rate": 8.233642031523644e-05, + "loss": 1.0709, + "num_input_tokens_seen": 40618440, + "step": 2524 + }, + { + "epoch": 0.1768718204663462, + "grad_norm": 4.126082420349121, + "learning_rate": 8.232942206654991e-05, + "loss": 1.1371, + "num_input_tokens_seen": 40634824, + "step": 2525 + }, + { + "epoch": 0.17694186871207543, + "grad_norm": 4.252981662750244, + "learning_rate": 8.23224238178634e-05, + "loss": 1.0351, + "num_input_tokens_seen": 40650640, + "step": 2526 + }, + { + "epoch": 0.1770119169578047, + "grad_norm": 3.768542528152466, + "learning_rate": 8.231542556917689e-05, + "loss": 0.8221, + "num_input_tokens_seen": 40667000, + "step": 2527 + }, + { + "epoch": 0.17708196520353395, + "grad_norm": 4.067849636077881, + "learning_rate": 8.230842732049038e-05, + "loss": 1.2117, + "num_input_tokens_seen": 40683288, + "step": 2528 + }, + { + "epoch": 0.17715201344926318, + "grad_norm": 4.7552995681762695, + "learning_rate": 8.230142907180385e-05, + "loss": 1.0001, + "num_input_tokens_seen": 40699304, + "step": 2529 + }, + { + "epoch": 0.17722206169499244, + "grad_norm": 4.099888324737549, + "learning_rate": 8.229443082311734e-05, + "loss": 1.3335, + "num_input_tokens_seen": 40715688, + "step": 2530 + }, + { + "epoch": 0.17729210994072167, + "grad_norm": 4.219737529754639, + "learning_rate": 8.228743257443083e-05, + "loss": 1.3004, + "num_input_tokens_seen": 40731640, + "step": 2531 + }, + { + "epoch": 0.17736215818645093, + "grad_norm": 4.125600337982178, + "learning_rate": 8.22804343257443e-05, + "loss": 1.1828, + "num_input_tokens_seen": 40747664, + "step": 2532 + }, + { + "epoch": 0.17743220643218016, + "grad_norm": 3.7761423587799072, + "learning_rate": 8.22734360770578e-05, + "loss": 1.1082, + "num_input_tokens_seen": 40764048, + "step": 2533 + }, + { + "epoch": 0.17750225467790942, + "grad_norm": 5.0669026374816895, + "learning_rate": 8.226643782837128e-05, + "loss": 1.1434, + "num_input_tokens_seen": 40779160, + "step": 2534 + }, + { + "epoch": 0.17757230292363865, + "grad_norm": 4.688200950622559, + "learning_rate": 8.225943957968477e-05, + "loss": 1.2135, + "num_input_tokens_seen": 40795416, + "step": 2535 + }, + { + "epoch": 0.1776423511693679, + "grad_norm": 3.62204647064209, + "learning_rate": 8.225244133099825e-05, + "loss": 1.0816, + "num_input_tokens_seen": 40811800, + "step": 2536 + }, + { + "epoch": 0.17771239941509714, + "grad_norm": 4.086390495300293, + "learning_rate": 8.224544308231174e-05, + "loss": 1.067, + "num_input_tokens_seen": 40826960, + "step": 2537 + }, + { + "epoch": 0.1777824476608264, + "grad_norm": 5.574249744415283, + "learning_rate": 8.223844483362522e-05, + "loss": 1.2678, + "num_input_tokens_seen": 40843344, + "step": 2538 + }, + { + "epoch": 0.17785249590655564, + "grad_norm": 3.9721264839172363, + "learning_rate": 8.22314465849387e-05, + "loss": 1.0381, + "num_input_tokens_seen": 40859448, + "step": 2539 + }, + { + "epoch": 0.1779225441522849, + "grad_norm": 4.220152854919434, + "learning_rate": 8.222444833625219e-05, + "loss": 1.1014, + "num_input_tokens_seen": 40875128, + "step": 2540 + }, + { + "epoch": 0.17799259239801413, + "grad_norm": 3.905205011367798, + "learning_rate": 8.221745008756569e-05, + "loss": 0.9568, + "num_input_tokens_seen": 40890624, + "step": 2541 + }, + { + "epoch": 0.1780626406437434, + "grad_norm": 4.114316463470459, + "learning_rate": 8.221045183887917e-05, + "loss": 0.9885, + "num_input_tokens_seen": 40905624, + "step": 2542 + }, + { + "epoch": 0.17813268888947262, + "grad_norm": 4.810879230499268, + "learning_rate": 8.220345359019265e-05, + "loss": 0.9447, + "num_input_tokens_seen": 40922008, + "step": 2543 + }, + { + "epoch": 0.17820273713520188, + "grad_norm": 4.224065780639648, + "learning_rate": 8.219645534150613e-05, + "loss": 1.1176, + "num_input_tokens_seen": 40938392, + "step": 2544 + }, + { + "epoch": 0.1782727853809311, + "grad_norm": 3.7086703777313232, + "learning_rate": 8.218945709281962e-05, + "loss": 0.8931, + "num_input_tokens_seen": 40954776, + "step": 2545 + }, + { + "epoch": 0.17834283362666037, + "grad_norm": 4.346426963806152, + "learning_rate": 8.218245884413309e-05, + "loss": 0.9808, + "num_input_tokens_seen": 40971160, + "step": 2546 + }, + { + "epoch": 0.1784128818723896, + "grad_norm": 3.9295589923858643, + "learning_rate": 8.21754605954466e-05, + "loss": 1.1054, + "num_input_tokens_seen": 40987544, + "step": 2547 + }, + { + "epoch": 0.17848293011811886, + "grad_norm": 4.224534034729004, + "learning_rate": 8.216846234676008e-05, + "loss": 1.1131, + "num_input_tokens_seen": 41002816, + "step": 2548 + }, + { + "epoch": 0.1785529783638481, + "grad_norm": 3.940401315689087, + "learning_rate": 8.216146409807356e-05, + "loss": 1.1551, + "num_input_tokens_seen": 41018560, + "step": 2549 + }, + { + "epoch": 0.17862302660957735, + "grad_norm": 4.010072231292725, + "learning_rate": 8.215446584938705e-05, + "loss": 1.0915, + "num_input_tokens_seen": 41033976, + "step": 2550 + }, + { + "epoch": 0.17869307485530658, + "grad_norm": 4.192416191101074, + "learning_rate": 8.214746760070054e-05, + "loss": 1.0954, + "num_input_tokens_seen": 41049384, + "step": 2551 + }, + { + "epoch": 0.17876312310103584, + "grad_norm": 3.765962600708008, + "learning_rate": 8.214046935201401e-05, + "loss": 1.1029, + "num_input_tokens_seen": 41065528, + "step": 2552 + }, + { + "epoch": 0.17883317134676507, + "grad_norm": 3.7856082916259766, + "learning_rate": 8.21334711033275e-05, + "loss": 1.1063, + "num_input_tokens_seen": 41081912, + "step": 2553 + }, + { + "epoch": 0.17890321959249433, + "grad_norm": 4.845935821533203, + "learning_rate": 8.212647285464099e-05, + "loss": 1.2907, + "num_input_tokens_seen": 41098056, + "step": 2554 + }, + { + "epoch": 0.17897326783822357, + "grad_norm": 4.835206985473633, + "learning_rate": 8.211947460595448e-05, + "loss": 1.0591, + "num_input_tokens_seen": 41114376, + "step": 2555 + }, + { + "epoch": 0.17904331608395282, + "grad_norm": 3.9637155532836914, + "learning_rate": 8.211247635726795e-05, + "loss": 1.1689, + "num_input_tokens_seen": 41130760, + "step": 2556 + }, + { + "epoch": 0.17911336432968206, + "grad_norm": 3.5001652240753174, + "learning_rate": 8.210547810858144e-05, + "loss": 0.9798, + "num_input_tokens_seen": 41147040, + "step": 2557 + }, + { + "epoch": 0.17918341257541132, + "grad_norm": 5.54505729675293, + "learning_rate": 8.209847985989493e-05, + "loss": 1.3004, + "num_input_tokens_seen": 41163312, + "step": 2558 + }, + { + "epoch": 0.17925346082114055, + "grad_norm": 4.122933387756348, + "learning_rate": 8.20914816112084e-05, + "loss": 1.1754, + "num_input_tokens_seen": 41179632, + "step": 2559 + }, + { + "epoch": 0.1793235090668698, + "grad_norm": 4.166035175323486, + "learning_rate": 8.208448336252189e-05, + "loss": 1.0022, + "num_input_tokens_seen": 41196000, + "step": 2560 + }, + { + "epoch": 0.17939355731259904, + "grad_norm": 4.129281520843506, + "learning_rate": 8.20774851138354e-05, + "loss": 1.2342, + "num_input_tokens_seen": 41211944, + "step": 2561 + }, + { + "epoch": 0.1794636055583283, + "grad_norm": 3.9011406898498535, + "learning_rate": 8.207048686514887e-05, + "loss": 1.0238, + "num_input_tokens_seen": 41227680, + "step": 2562 + }, + { + "epoch": 0.17953365380405756, + "grad_norm": 3.717945098876953, + "learning_rate": 8.206348861646234e-05, + "loss": 0.9601, + "num_input_tokens_seen": 41244064, + "step": 2563 + }, + { + "epoch": 0.1796037020497868, + "grad_norm": 5.05475378036499, + "learning_rate": 8.205649036777583e-05, + "loss": 1.1192, + "num_input_tokens_seen": 41260448, + "step": 2564 + }, + { + "epoch": 0.17967375029551605, + "grad_norm": 4.52910041809082, + "learning_rate": 8.204949211908932e-05, + "loss": 0.9443, + "num_input_tokens_seen": 41276832, + "step": 2565 + }, + { + "epoch": 0.17974379854124528, + "grad_norm": 4.6492695808410645, + "learning_rate": 8.20424938704028e-05, + "loss": 1.0729, + "num_input_tokens_seen": 41293216, + "step": 2566 + }, + { + "epoch": 0.17981384678697454, + "grad_norm": 4.7587456703186035, + "learning_rate": 8.20354956217163e-05, + "loss": 0.9702, + "num_input_tokens_seen": 41309600, + "step": 2567 + }, + { + "epoch": 0.17988389503270377, + "grad_norm": 6.8467817306518555, + "learning_rate": 8.202849737302979e-05, + "loss": 1.1385, + "num_input_tokens_seen": 41325984, + "step": 2568 + }, + { + "epoch": 0.17995394327843303, + "grad_norm": 3.7771074771881104, + "learning_rate": 8.202149912434326e-05, + "loss": 1.1603, + "num_input_tokens_seen": 41342368, + "step": 2569 + }, + { + "epoch": 0.18002399152416226, + "grad_norm": 3.8494906425476074, + "learning_rate": 8.201450087565675e-05, + "loss": 1.056, + "num_input_tokens_seen": 41357992, + "step": 2570 + }, + { + "epoch": 0.18009403976989152, + "grad_norm": 4.079790115356445, + "learning_rate": 8.200750262697023e-05, + "loss": 1.1159, + "num_input_tokens_seen": 41374256, + "step": 2571 + }, + { + "epoch": 0.18016408801562075, + "grad_norm": 7.093918800354004, + "learning_rate": 8.200050437828371e-05, + "loss": 1.1756, + "num_input_tokens_seen": 41388728, + "step": 2572 + }, + { + "epoch": 0.18023413626135001, + "grad_norm": 4.636250972747803, + "learning_rate": 8.19935061295972e-05, + "loss": 1.1599, + "num_input_tokens_seen": 41404488, + "step": 2573 + }, + { + "epoch": 0.18030418450707925, + "grad_norm": 3.789625644683838, + "learning_rate": 8.198650788091069e-05, + "loss": 1.162, + "num_input_tokens_seen": 41420200, + "step": 2574 + }, + { + "epoch": 0.1803742327528085, + "grad_norm": 3.849637508392334, + "learning_rate": 8.197950963222418e-05, + "loss": 1.1399, + "num_input_tokens_seen": 41436496, + "step": 2575 + }, + { + "epoch": 0.18044428099853774, + "grad_norm": 3.6819775104522705, + "learning_rate": 8.197251138353766e-05, + "loss": 1.1467, + "num_input_tokens_seen": 41452736, + "step": 2576 + }, + { + "epoch": 0.180514329244267, + "grad_norm": 4.505229473114014, + "learning_rate": 8.196551313485114e-05, + "loss": 1.0336, + "num_input_tokens_seen": 41468976, + "step": 2577 + }, + { + "epoch": 0.18058437748999623, + "grad_norm": 5.465007781982422, + "learning_rate": 8.195851488616463e-05, + "loss": 0.983, + "num_input_tokens_seen": 41485064, + "step": 2578 + }, + { + "epoch": 0.1806544257357255, + "grad_norm": 3.993953227996826, + "learning_rate": 8.195151663747811e-05, + "loss": 1.3406, + "num_input_tokens_seen": 41501448, + "step": 2579 + }, + { + "epoch": 0.18072447398145472, + "grad_norm": 5.29327392578125, + "learning_rate": 8.19445183887916e-05, + "loss": 1.2397, + "num_input_tokens_seen": 41517832, + "step": 2580 + }, + { + "epoch": 0.18079452222718398, + "grad_norm": 4.132434844970703, + "learning_rate": 8.193752014010508e-05, + "loss": 1.2522, + "num_input_tokens_seen": 41532976, + "step": 2581 + }, + { + "epoch": 0.1808645704729132, + "grad_norm": 5.620279788970947, + "learning_rate": 8.193052189141857e-05, + "loss": 1.06, + "num_input_tokens_seen": 41548784, + "step": 2582 + }, + { + "epoch": 0.18093461871864247, + "grad_norm": 3.9721081256866455, + "learning_rate": 8.192352364273205e-05, + "loss": 1.0458, + "num_input_tokens_seen": 41565168, + "step": 2583 + }, + { + "epoch": 0.1810046669643717, + "grad_norm": 5.015312194824219, + "learning_rate": 8.191652539404554e-05, + "loss": 0.9813, + "num_input_tokens_seen": 41580584, + "step": 2584 + }, + { + "epoch": 0.18107471521010096, + "grad_norm": 5.385783672332764, + "learning_rate": 8.190952714535903e-05, + "loss": 1.0853, + "num_input_tokens_seen": 41596656, + "step": 2585 + }, + { + "epoch": 0.1811447634558302, + "grad_norm": 4.1005120277404785, + "learning_rate": 8.19025288966725e-05, + "loss": 1.0509, + "num_input_tokens_seen": 41611752, + "step": 2586 + }, + { + "epoch": 0.18121481170155945, + "grad_norm": 3.6853153705596924, + "learning_rate": 8.1895530647986e-05, + "loss": 1.0736, + "num_input_tokens_seen": 41627408, + "step": 2587 + }, + { + "epoch": 0.18128485994728868, + "grad_norm": 3.7818400859832764, + "learning_rate": 8.188853239929949e-05, + "loss": 1.1182, + "num_input_tokens_seen": 41643792, + "step": 2588 + }, + { + "epoch": 0.18135490819301794, + "grad_norm": 4.564868450164795, + "learning_rate": 8.188153415061297e-05, + "loss": 1.1408, + "num_input_tokens_seen": 41658768, + "step": 2589 + }, + { + "epoch": 0.18142495643874718, + "grad_norm": 4.092021465301514, + "learning_rate": 8.187453590192644e-05, + "loss": 1.0978, + "num_input_tokens_seen": 41675088, + "step": 2590 + }, + { + "epoch": 0.18149500468447644, + "grad_norm": 5.051564693450928, + "learning_rate": 8.186753765323993e-05, + "loss": 0.9746, + "num_input_tokens_seen": 41690376, + "step": 2591 + }, + { + "epoch": 0.18156505293020567, + "grad_norm": 3.5786261558532715, + "learning_rate": 8.186053940455342e-05, + "loss": 0.9638, + "num_input_tokens_seen": 41706760, + "step": 2592 + }, + { + "epoch": 0.18163510117593493, + "grad_norm": 4.11420202255249, + "learning_rate": 8.185354115586691e-05, + "loss": 1.1234, + "num_input_tokens_seen": 41721760, + "step": 2593 + }, + { + "epoch": 0.18170514942166416, + "grad_norm": 4.445348262786865, + "learning_rate": 8.18465429071804e-05, + "loss": 1.0846, + "num_input_tokens_seen": 41737640, + "step": 2594 + }, + { + "epoch": 0.18177519766739342, + "grad_norm": 5.705301284790039, + "learning_rate": 8.183954465849388e-05, + "loss": 1.2254, + "num_input_tokens_seen": 41753784, + "step": 2595 + }, + { + "epoch": 0.18184524591312265, + "grad_norm": 3.7948646545410156, + "learning_rate": 8.183254640980736e-05, + "loss": 0.9929, + "num_input_tokens_seen": 41770120, + "step": 2596 + }, + { + "epoch": 0.1819152941588519, + "grad_norm": 4.296072959899902, + "learning_rate": 8.182554816112085e-05, + "loss": 1.1365, + "num_input_tokens_seen": 41786504, + "step": 2597 + }, + { + "epoch": 0.18198534240458117, + "grad_norm": 4.750889778137207, + "learning_rate": 8.181854991243432e-05, + "loss": 1.1295, + "num_input_tokens_seen": 41802888, + "step": 2598 + }, + { + "epoch": 0.1820553906503104, + "grad_norm": 4.031731128692627, + "learning_rate": 8.181155166374781e-05, + "loss": 1.1096, + "num_input_tokens_seen": 41819264, + "step": 2599 + }, + { + "epoch": 0.18212543889603966, + "grad_norm": 4.620563507080078, + "learning_rate": 8.18045534150613e-05, + "loss": 1.1862, + "num_input_tokens_seen": 41835016, + "step": 2600 + }, + { + "epoch": 0.18212543889603966, + "eval_loss": 1.1404880285263062, + "eval_runtime": 0.189, + "eval_samples_per_second": 5.291, + "eval_steps_per_second": 5.291, + "num_input_tokens_seen": 41835016, + "step": 2600 + }, + { + "epoch": 0.1821954871417689, + "grad_norm": 3.8487257957458496, + "learning_rate": 8.179755516637479e-05, + "loss": 1.0103, + "num_input_tokens_seen": 41851400, + "step": 2601 + }, + { + "epoch": 0.18226553538749815, + "grad_norm": 4.221493244171143, + "learning_rate": 8.179055691768828e-05, + "loss": 1.1346, + "num_input_tokens_seen": 41867784, + "step": 2602 + }, + { + "epoch": 0.18233558363322738, + "grad_norm": 3.88747239112854, + "learning_rate": 8.178355866900175e-05, + "loss": 1.0679, + "num_input_tokens_seen": 41884024, + "step": 2603 + }, + { + "epoch": 0.18240563187895664, + "grad_norm": 3.845551013946533, + "learning_rate": 8.177656042031524e-05, + "loss": 0.9442, + "num_input_tokens_seen": 41899936, + "step": 2604 + }, + { + "epoch": 0.18247568012468587, + "grad_norm": 3.6964564323425293, + "learning_rate": 8.176956217162873e-05, + "loss": 0.815, + "num_input_tokens_seen": 41915512, + "step": 2605 + }, + { + "epoch": 0.18254572837041513, + "grad_norm": 4.105105400085449, + "learning_rate": 8.17625639229422e-05, + "loss": 1.0891, + "num_input_tokens_seen": 41931728, + "step": 2606 + }, + { + "epoch": 0.18261577661614437, + "grad_norm": 5.0245842933654785, + "learning_rate": 8.175556567425569e-05, + "loss": 0.9657, + "num_input_tokens_seen": 41947528, + "step": 2607 + }, + { + "epoch": 0.18268582486187362, + "grad_norm": 3.4683709144592285, + "learning_rate": 8.174856742556918e-05, + "loss": 0.8183, + "num_input_tokens_seen": 41963912, + "step": 2608 + }, + { + "epoch": 0.18275587310760286, + "grad_norm": 4.603201866149902, + "learning_rate": 8.174156917688267e-05, + "loss": 1.1339, + "num_input_tokens_seen": 41979976, + "step": 2609 + }, + { + "epoch": 0.18282592135333212, + "grad_norm": 3.9904422760009766, + "learning_rate": 8.173457092819615e-05, + "loss": 1.0661, + "num_input_tokens_seen": 41996360, + "step": 2610 + }, + { + "epoch": 0.18289596959906135, + "grad_norm": 6.363785743713379, + "learning_rate": 8.172757267950963e-05, + "loss": 0.9569, + "num_input_tokens_seen": 42011712, + "step": 2611 + }, + { + "epoch": 0.1829660178447906, + "grad_norm": 3.7257959842681885, + "learning_rate": 8.172057443082312e-05, + "loss": 1.0227, + "num_input_tokens_seen": 42028096, + "step": 2612 + }, + { + "epoch": 0.18303606609051984, + "grad_norm": 3.8486809730529785, + "learning_rate": 8.171357618213661e-05, + "loss": 1.0442, + "num_input_tokens_seen": 42044480, + "step": 2613 + }, + { + "epoch": 0.1831061143362491, + "grad_norm": 4.620292663574219, + "learning_rate": 8.17065779334501e-05, + "loss": 0.9917, + "num_input_tokens_seen": 42060864, + "step": 2614 + }, + { + "epoch": 0.18317616258197833, + "grad_norm": 3.52644944190979, + "learning_rate": 8.169957968476359e-05, + "loss": 1.1402, + "num_input_tokens_seen": 42077072, + "step": 2615 + }, + { + "epoch": 0.1832462108277076, + "grad_norm": 3.800718069076538, + "learning_rate": 8.169258143607706e-05, + "loss": 0.9864, + "num_input_tokens_seen": 42093128, + "step": 2616 + }, + { + "epoch": 0.18331625907343682, + "grad_norm": 3.9447405338287354, + "learning_rate": 8.168558318739054e-05, + "loss": 0.9923, + "num_input_tokens_seen": 42109512, + "step": 2617 + }, + { + "epoch": 0.18338630731916608, + "grad_norm": 4.109864234924316, + "learning_rate": 8.167858493870403e-05, + "loss": 0.9583, + "num_input_tokens_seen": 42125776, + "step": 2618 + }, + { + "epoch": 0.1834563555648953, + "grad_norm": 3.6538870334625244, + "learning_rate": 8.167158669001752e-05, + "loss": 1.0731, + "num_input_tokens_seen": 42141760, + "step": 2619 + }, + { + "epoch": 0.18352640381062457, + "grad_norm": 5.139223098754883, + "learning_rate": 8.1664588441331e-05, + "loss": 1.2108, + "num_input_tokens_seen": 42157096, + "step": 2620 + }, + { + "epoch": 0.1835964520563538, + "grad_norm": 4.420098781585693, + "learning_rate": 8.165759019264449e-05, + "loss": 1.0652, + "num_input_tokens_seen": 42173480, + "step": 2621 + }, + { + "epoch": 0.18366650030208306, + "grad_norm": 5.559954643249512, + "learning_rate": 8.165059194395798e-05, + "loss": 0.9224, + "num_input_tokens_seen": 42188944, + "step": 2622 + }, + { + "epoch": 0.1837365485478123, + "grad_norm": 3.827627420425415, + "learning_rate": 8.164359369527146e-05, + "loss": 0.9185, + "num_input_tokens_seen": 42204952, + "step": 2623 + }, + { + "epoch": 0.18380659679354155, + "grad_norm": 7.454338550567627, + "learning_rate": 8.163659544658494e-05, + "loss": 1.2441, + "num_input_tokens_seen": 42221336, + "step": 2624 + }, + { + "epoch": 0.1838766450392708, + "grad_norm": 4.34182071685791, + "learning_rate": 8.162959719789842e-05, + "loss": 1.122, + "num_input_tokens_seen": 42237720, + "step": 2625 + }, + { + "epoch": 0.18394669328500005, + "grad_norm": 3.9157843589782715, + "learning_rate": 8.162259894921191e-05, + "loss": 1.1206, + "num_input_tokens_seen": 42253584, + "step": 2626 + }, + { + "epoch": 0.18401674153072928, + "grad_norm": 3.952451467514038, + "learning_rate": 8.16156007005254e-05, + "loss": 1.2001, + "num_input_tokens_seen": 42269968, + "step": 2627 + }, + { + "epoch": 0.18408678977645854, + "grad_norm": 5.0545148849487305, + "learning_rate": 8.160860245183889e-05, + "loss": 1.0629, + "num_input_tokens_seen": 42286232, + "step": 2628 + }, + { + "epoch": 0.18415683802218777, + "grad_norm": 7.176907062530518, + "learning_rate": 8.160160420315237e-05, + "loss": 1.1248, + "num_input_tokens_seen": 42302616, + "step": 2629 + }, + { + "epoch": 0.18422688626791703, + "grad_norm": 3.994748830795288, + "learning_rate": 8.159460595446585e-05, + "loss": 0.9938, + "num_input_tokens_seen": 42319000, + "step": 2630 + }, + { + "epoch": 0.18429693451364626, + "grad_norm": 3.5744547843933105, + "learning_rate": 8.158760770577934e-05, + "loss": 1.0644, + "num_input_tokens_seen": 42335384, + "step": 2631 + }, + { + "epoch": 0.18436698275937552, + "grad_norm": 3.528723955154419, + "learning_rate": 8.158060945709283e-05, + "loss": 0.9955, + "num_input_tokens_seen": 42351768, + "step": 2632 + }, + { + "epoch": 0.18443703100510478, + "grad_norm": 3.9958291053771973, + "learning_rate": 8.15736112084063e-05, + "loss": 1.076, + "num_input_tokens_seen": 42368152, + "step": 2633 + }, + { + "epoch": 0.184507079250834, + "grad_norm": 4.1659650802612305, + "learning_rate": 8.15666129597198e-05, + "loss": 1.1427, + "num_input_tokens_seen": 42384536, + "step": 2634 + }, + { + "epoch": 0.18457712749656327, + "grad_norm": 5.116000652313232, + "learning_rate": 8.155961471103328e-05, + "loss": 1.1418, + "num_input_tokens_seen": 42399704, + "step": 2635 + }, + { + "epoch": 0.1846471757422925, + "grad_norm": 4.01514196395874, + "learning_rate": 8.155261646234677e-05, + "loss": 0.9521, + "num_input_tokens_seen": 42416056, + "step": 2636 + }, + { + "epoch": 0.18471722398802176, + "grad_norm": 4.290152072906494, + "learning_rate": 8.154561821366024e-05, + "loss": 1.06, + "num_input_tokens_seen": 42431544, + "step": 2637 + }, + { + "epoch": 0.184787272233751, + "grad_norm": 4.267684459686279, + "learning_rate": 8.153861996497373e-05, + "loss": 1.0247, + "num_input_tokens_seen": 42447928, + "step": 2638 + }, + { + "epoch": 0.18485732047948025, + "grad_norm": 3.593191385269165, + "learning_rate": 8.153162171628722e-05, + "loss": 0.9917, + "num_input_tokens_seen": 42464312, + "step": 2639 + }, + { + "epoch": 0.18492736872520948, + "grad_norm": 4.322700023651123, + "learning_rate": 8.152462346760071e-05, + "loss": 1.1686, + "num_input_tokens_seen": 42480696, + "step": 2640 + }, + { + "epoch": 0.18499741697093874, + "grad_norm": 4.176753520965576, + "learning_rate": 8.15176252189142e-05, + "loss": 1.046, + "num_input_tokens_seen": 42496520, + "step": 2641 + }, + { + "epoch": 0.18506746521666798, + "grad_norm": 4.405294895172119, + "learning_rate": 8.151062697022769e-05, + "loss": 1.0884, + "num_input_tokens_seen": 42512904, + "step": 2642 + }, + { + "epoch": 0.18513751346239724, + "grad_norm": 3.8770217895507812, + "learning_rate": 8.150362872154116e-05, + "loss": 1.124, + "num_input_tokens_seen": 42529288, + "step": 2643 + }, + { + "epoch": 0.18520756170812647, + "grad_norm": 3.5909271240234375, + "learning_rate": 8.149663047285464e-05, + "loss": 1.107, + "num_input_tokens_seen": 42545672, + "step": 2644 + }, + { + "epoch": 0.18527760995385573, + "grad_norm": 3.73958420753479, + "learning_rate": 8.148963222416812e-05, + "loss": 0.9943, + "num_input_tokens_seen": 42562056, + "step": 2645 + }, + { + "epoch": 0.18534765819958496, + "grad_norm": 3.6813879013061523, + "learning_rate": 8.148263397548161e-05, + "loss": 0.9861, + "num_input_tokens_seen": 42577720, + "step": 2646 + }, + { + "epoch": 0.18541770644531422, + "grad_norm": 4.13958215713501, + "learning_rate": 8.14756357267951e-05, + "loss": 1.0882, + "num_input_tokens_seen": 42594104, + "step": 2647 + }, + { + "epoch": 0.18548775469104345, + "grad_norm": 3.757805109024048, + "learning_rate": 8.146863747810859e-05, + "loss": 1.0872, + "num_input_tokens_seen": 42610256, + "step": 2648 + }, + { + "epoch": 0.1855578029367727, + "grad_norm": 4.57798957824707, + "learning_rate": 8.146163922942208e-05, + "loss": 0.9471, + "num_input_tokens_seen": 42626424, + "step": 2649 + }, + { + "epoch": 0.18562785118250194, + "grad_norm": 3.797257423400879, + "learning_rate": 8.145464098073555e-05, + "loss": 0.9336, + "num_input_tokens_seen": 42642200, + "step": 2650 + }, + { + "epoch": 0.1856978994282312, + "grad_norm": 4.258513450622559, + "learning_rate": 8.144764273204904e-05, + "loss": 1.1557, + "num_input_tokens_seen": 42657416, + "step": 2651 + }, + { + "epoch": 0.18576794767396043, + "grad_norm": 4.369161605834961, + "learning_rate": 8.144064448336252e-05, + "loss": 1.0013, + "num_input_tokens_seen": 42673752, + "step": 2652 + }, + { + "epoch": 0.1858379959196897, + "grad_norm": 4.159987926483154, + "learning_rate": 8.1433646234676e-05, + "loss": 1.151, + "num_input_tokens_seen": 42690136, + "step": 2653 + }, + { + "epoch": 0.18590804416541892, + "grad_norm": 7.164428234100342, + "learning_rate": 8.14266479859895e-05, + "loss": 1.1637, + "num_input_tokens_seen": 42706520, + "step": 2654 + }, + { + "epoch": 0.18597809241114818, + "grad_norm": 3.4230172634124756, + "learning_rate": 8.141964973730298e-05, + "loss": 0.9291, + "num_input_tokens_seen": 42722904, + "step": 2655 + }, + { + "epoch": 0.18604814065687741, + "grad_norm": 4.316817283630371, + "learning_rate": 8.141265148861647e-05, + "loss": 1.2645, + "num_input_tokens_seen": 42738640, + "step": 2656 + }, + { + "epoch": 0.18611818890260667, + "grad_norm": 3.7894318103790283, + "learning_rate": 8.140565323992995e-05, + "loss": 1.1287, + "num_input_tokens_seen": 42754848, + "step": 2657 + }, + { + "epoch": 0.1861882371483359, + "grad_norm": 4.198835849761963, + "learning_rate": 8.139865499124344e-05, + "loss": 1.1525, + "num_input_tokens_seen": 42771232, + "step": 2658 + }, + { + "epoch": 0.18625828539406517, + "grad_norm": 3.796414852142334, + "learning_rate": 8.139165674255692e-05, + "loss": 1.0313, + "num_input_tokens_seen": 42787344, + "step": 2659 + }, + { + "epoch": 0.1863283336397944, + "grad_norm": 3.6421244144439697, + "learning_rate": 8.138465849387041e-05, + "loss": 1.0497, + "num_input_tokens_seen": 42803728, + "step": 2660 + }, + { + "epoch": 0.18639838188552366, + "grad_norm": 4.391780376434326, + "learning_rate": 8.13776602451839e-05, + "loss": 1.0564, + "num_input_tokens_seen": 42820112, + "step": 2661 + }, + { + "epoch": 0.1864684301312529, + "grad_norm": 4.187370777130127, + "learning_rate": 8.137066199649738e-05, + "loss": 1.0289, + "num_input_tokens_seen": 42836496, + "step": 2662 + }, + { + "epoch": 0.18653847837698215, + "grad_norm": 3.794281244277954, + "learning_rate": 8.136366374781086e-05, + "loss": 1.172, + "num_input_tokens_seen": 42852880, + "step": 2663 + }, + { + "epoch": 0.18660852662271138, + "grad_norm": 4.386116981506348, + "learning_rate": 8.135666549912434e-05, + "loss": 1.1443, + "num_input_tokens_seen": 42869264, + "step": 2664 + }, + { + "epoch": 0.18667857486844064, + "grad_norm": 4.223747253417969, + "learning_rate": 8.134966725043783e-05, + "loss": 1.074, + "num_input_tokens_seen": 42885528, + "step": 2665 + }, + { + "epoch": 0.1867486231141699, + "grad_norm": 5.020680904388428, + "learning_rate": 8.134266900175132e-05, + "loss": 1.1927, + "num_input_tokens_seen": 42901912, + "step": 2666 + }, + { + "epoch": 0.18681867135989913, + "grad_norm": 8.58757495880127, + "learning_rate": 8.13356707530648e-05, + "loss": 1.1377, + "num_input_tokens_seen": 42917072, + "step": 2667 + }, + { + "epoch": 0.1868887196056284, + "grad_norm": 3.6986234188079834, + "learning_rate": 8.13286725043783e-05, + "loss": 1.0536, + "num_input_tokens_seen": 42933296, + "step": 2668 + }, + { + "epoch": 0.18695876785135762, + "grad_norm": 4.196423053741455, + "learning_rate": 8.132167425569178e-05, + "loss": 1.0484, + "num_input_tokens_seen": 42948968, + "step": 2669 + }, + { + "epoch": 0.18702881609708688, + "grad_norm": 4.019235610961914, + "learning_rate": 8.131467600700526e-05, + "loss": 1.1241, + "num_input_tokens_seen": 42965352, + "step": 2670 + }, + { + "epoch": 0.1870988643428161, + "grad_norm": 4.035778045654297, + "learning_rate": 8.130767775831873e-05, + "loss": 0.8962, + "num_input_tokens_seen": 42980872, + "step": 2671 + }, + { + "epoch": 0.18716891258854537, + "grad_norm": 4.193873882293701, + "learning_rate": 8.130067950963222e-05, + "loss": 1.0494, + "num_input_tokens_seen": 42996848, + "step": 2672 + }, + { + "epoch": 0.1872389608342746, + "grad_norm": 4.011183738708496, + "learning_rate": 8.129368126094571e-05, + "loss": 1.1151, + "num_input_tokens_seen": 43012728, + "step": 2673 + }, + { + "epoch": 0.18730900908000386, + "grad_norm": 5.662332534790039, + "learning_rate": 8.12866830122592e-05, + "loss": 1.1238, + "num_input_tokens_seen": 43028728, + "step": 2674 + }, + { + "epoch": 0.1873790573257331, + "grad_norm": 4.4699387550354, + "learning_rate": 8.127968476357269e-05, + "loss": 1.0712, + "num_input_tokens_seen": 43044504, + "step": 2675 + }, + { + "epoch": 0.18744910557146235, + "grad_norm": 3.857011556625366, + "learning_rate": 8.127268651488618e-05, + "loss": 0.9866, + "num_input_tokens_seen": 43060496, + "step": 2676 + }, + { + "epoch": 0.1875191538171916, + "grad_norm": 3.5458414554595947, + "learning_rate": 8.126568826619965e-05, + "loss": 0.9317, + "num_input_tokens_seen": 43076880, + "step": 2677 + }, + { + "epoch": 0.18758920206292085, + "grad_norm": 5.4007744789123535, + "learning_rate": 8.125869001751314e-05, + "loss": 1.2016, + "num_input_tokens_seen": 43091368, + "step": 2678 + }, + { + "epoch": 0.18765925030865008, + "grad_norm": 5.15717077255249, + "learning_rate": 8.125169176882661e-05, + "loss": 1.0662, + "num_input_tokens_seen": 43107752, + "step": 2679 + }, + { + "epoch": 0.18772929855437934, + "grad_norm": 4.891427516937256, + "learning_rate": 8.124469352014012e-05, + "loss": 1.1684, + "num_input_tokens_seen": 43122808, + "step": 2680 + }, + { + "epoch": 0.18779934680010857, + "grad_norm": 4.651966571807861, + "learning_rate": 8.123769527145359e-05, + "loss": 1.1457, + "num_input_tokens_seen": 43139056, + "step": 2681 + }, + { + "epoch": 0.18786939504583783, + "grad_norm": 3.844129800796509, + "learning_rate": 8.123069702276708e-05, + "loss": 0.9282, + "num_input_tokens_seen": 43155440, + "step": 2682 + }, + { + "epoch": 0.18793944329156706, + "grad_norm": 3.669360876083374, + "learning_rate": 8.122369877408057e-05, + "loss": 1.1418, + "num_input_tokens_seen": 43171824, + "step": 2683 + }, + { + "epoch": 0.18800949153729632, + "grad_norm": 3.6102185249328613, + "learning_rate": 8.121670052539404e-05, + "loss": 1.0786, + "num_input_tokens_seen": 43188208, + "step": 2684 + }, + { + "epoch": 0.18807953978302555, + "grad_norm": 3.593414783477783, + "learning_rate": 8.120970227670753e-05, + "loss": 0.9982, + "num_input_tokens_seen": 43204248, + "step": 2685 + }, + { + "epoch": 0.1881495880287548, + "grad_norm": 5.017848491668701, + "learning_rate": 8.120270402802102e-05, + "loss": 0.9573, + "num_input_tokens_seen": 43219808, + "step": 2686 + }, + { + "epoch": 0.18821963627448404, + "grad_norm": 4.083794593811035, + "learning_rate": 8.119570577933451e-05, + "loss": 1.0678, + "num_input_tokens_seen": 43235712, + "step": 2687 + }, + { + "epoch": 0.1882896845202133, + "grad_norm": 4.265167713165283, + "learning_rate": 8.1188707530648e-05, + "loss": 1.2967, + "num_input_tokens_seen": 43252048, + "step": 2688 + }, + { + "epoch": 0.18835973276594253, + "grad_norm": 4.24991512298584, + "learning_rate": 8.118170928196147e-05, + "loss": 1.0267, + "num_input_tokens_seen": 43268152, + "step": 2689 + }, + { + "epoch": 0.1884297810116718, + "grad_norm": 4.059658050537109, + "learning_rate": 8.117471103327496e-05, + "loss": 1.1356, + "num_input_tokens_seen": 43284240, + "step": 2690 + }, + { + "epoch": 0.18849982925740102, + "grad_norm": 4.807305812835693, + "learning_rate": 8.116771278458844e-05, + "loss": 1.0424, + "num_input_tokens_seen": 43299368, + "step": 2691 + }, + { + "epoch": 0.18856987750313028, + "grad_norm": 5.590726852416992, + "learning_rate": 8.116071453590193e-05, + "loss": 1.1008, + "num_input_tokens_seen": 43315648, + "step": 2692 + }, + { + "epoch": 0.18863992574885952, + "grad_norm": 5.114964485168457, + "learning_rate": 8.115371628721541e-05, + "loss": 0.8916, + "num_input_tokens_seen": 43331688, + "step": 2693 + }, + { + "epoch": 0.18870997399458878, + "grad_norm": 4.323836803436279, + "learning_rate": 8.11467180385289e-05, + "loss": 1.1858, + "num_input_tokens_seen": 43346672, + "step": 2694 + }, + { + "epoch": 0.188780022240318, + "grad_norm": 4.290014743804932, + "learning_rate": 8.113971978984239e-05, + "loss": 1.2498, + "num_input_tokens_seen": 43362872, + "step": 2695 + }, + { + "epoch": 0.18885007048604727, + "grad_norm": 3.5292484760284424, + "learning_rate": 8.113272154115588e-05, + "loss": 1.0045, + "num_input_tokens_seen": 43379256, + "step": 2696 + }, + { + "epoch": 0.1889201187317765, + "grad_norm": 4.21523380279541, + "learning_rate": 8.112572329246935e-05, + "loss": 1.0515, + "num_input_tokens_seen": 43395152, + "step": 2697 + }, + { + "epoch": 0.18899016697750576, + "grad_norm": 4.900782108306885, + "learning_rate": 8.111872504378283e-05, + "loss": 1.1038, + "num_input_tokens_seen": 43411536, + "step": 2698 + }, + { + "epoch": 0.189060215223235, + "grad_norm": 3.613231658935547, + "learning_rate": 8.111172679509632e-05, + "loss": 1.017, + "num_input_tokens_seen": 43427920, + "step": 2699 + }, + { + "epoch": 0.18913026346896425, + "grad_norm": 3.681725263595581, + "learning_rate": 8.110472854640982e-05, + "loss": 1.1396, + "num_input_tokens_seen": 43444304, + "step": 2700 + }, + { + "epoch": 0.1892003117146935, + "grad_norm": 3.801785707473755, + "learning_rate": 8.10977302977233e-05, + "loss": 0.9856, + "num_input_tokens_seen": 43459960, + "step": 2701 + }, + { + "epoch": 0.18927035996042274, + "grad_norm": 3.4208626747131348, + "learning_rate": 8.109073204903678e-05, + "loss": 1.0048, + "num_input_tokens_seen": 43476344, + "step": 2702 + }, + { + "epoch": 0.189340408206152, + "grad_norm": 4.169189453125, + "learning_rate": 8.108373380035027e-05, + "loss": 1.0014, + "num_input_tokens_seen": 43492728, + "step": 2703 + }, + { + "epoch": 0.18941045645188123, + "grad_norm": 3.7125117778778076, + "learning_rate": 8.107673555166375e-05, + "loss": 0.9707, + "num_input_tokens_seen": 43508168, + "step": 2704 + }, + { + "epoch": 0.1894805046976105, + "grad_norm": 4.550642490386963, + "learning_rate": 8.106973730297724e-05, + "loss": 1.0832, + "num_input_tokens_seen": 43524480, + "step": 2705 + }, + { + "epoch": 0.18955055294333972, + "grad_norm": 4.219499588012695, + "learning_rate": 8.106273905429072e-05, + "loss": 1.148, + "num_input_tokens_seen": 43540864, + "step": 2706 + }, + { + "epoch": 0.18962060118906898, + "grad_norm": 4.605996131896973, + "learning_rate": 8.105574080560421e-05, + "loss": 1.0564, + "num_input_tokens_seen": 43557248, + "step": 2707 + }, + { + "epoch": 0.18969064943479821, + "grad_norm": 3.740314245223999, + "learning_rate": 8.104874255691769e-05, + "loss": 1.0194, + "num_input_tokens_seen": 43573632, + "step": 2708 + }, + { + "epoch": 0.18976069768052747, + "grad_norm": 3.92555832862854, + "learning_rate": 8.104174430823118e-05, + "loss": 1.1663, + "num_input_tokens_seen": 43589728, + "step": 2709 + }, + { + "epoch": 0.1898307459262567, + "grad_norm": 3.5653927326202393, + "learning_rate": 8.103474605954467e-05, + "loss": 1.1165, + "num_input_tokens_seen": 43606112, + "step": 2710 + }, + { + "epoch": 0.18990079417198596, + "grad_norm": 5.943650245666504, + "learning_rate": 8.102774781085814e-05, + "loss": 1.309, + "num_input_tokens_seen": 43621072, + "step": 2711 + }, + { + "epoch": 0.1899708424177152, + "grad_norm": 3.7632322311401367, + "learning_rate": 8.102074956217163e-05, + "loss": 1.0963, + "num_input_tokens_seen": 43636976, + "step": 2712 + }, + { + "epoch": 0.19004089066344446, + "grad_norm": 3.605536699295044, + "learning_rate": 8.101375131348512e-05, + "loss": 0.9509, + "num_input_tokens_seen": 43653360, + "step": 2713 + }, + { + "epoch": 0.1901109389091737, + "grad_norm": 3.7717363834381104, + "learning_rate": 8.10067530647986e-05, + "loss": 0.9407, + "num_input_tokens_seen": 43669488, + "step": 2714 + }, + { + "epoch": 0.19018098715490295, + "grad_norm": 4.55484676361084, + "learning_rate": 8.09997548161121e-05, + "loss": 0.8501, + "num_input_tokens_seen": 43684704, + "step": 2715 + }, + { + "epoch": 0.19025103540063218, + "grad_norm": 4.155830383300781, + "learning_rate": 8.099275656742557e-05, + "loss": 0.9936, + "num_input_tokens_seen": 43700112, + "step": 2716 + }, + { + "epoch": 0.19032108364636144, + "grad_norm": 5.615505695343018, + "learning_rate": 8.098575831873906e-05, + "loss": 1.2055, + "num_input_tokens_seen": 43716136, + "step": 2717 + }, + { + "epoch": 0.19039113189209067, + "grad_norm": 4.60966157913208, + "learning_rate": 8.097876007005253e-05, + "loss": 1.0531, + "num_input_tokens_seen": 43731576, + "step": 2718 + }, + { + "epoch": 0.19046118013781993, + "grad_norm": 5.698062896728516, + "learning_rate": 8.097176182136602e-05, + "loss": 0.9692, + "num_input_tokens_seen": 43747960, + "step": 2719 + }, + { + "epoch": 0.19053122838354916, + "grad_norm": 3.760756492614746, + "learning_rate": 8.096476357267952e-05, + "loss": 0.9638, + "num_input_tokens_seen": 43764304, + "step": 2720 + }, + { + "epoch": 0.19060127662927842, + "grad_norm": 4.084067344665527, + "learning_rate": 8.0957765323993e-05, + "loss": 1.083, + "num_input_tokens_seen": 43780688, + "step": 2721 + }, + { + "epoch": 0.19067132487500765, + "grad_norm": 3.9934301376342773, + "learning_rate": 8.095076707530649e-05, + "loss": 0.9757, + "num_input_tokens_seen": 43797072, + "step": 2722 + }, + { + "epoch": 0.1907413731207369, + "grad_norm": 3.915512800216675, + "learning_rate": 8.094376882661998e-05, + "loss": 1.1031, + "num_input_tokens_seen": 43813456, + "step": 2723 + }, + { + "epoch": 0.19081142136646614, + "grad_norm": 3.967040777206421, + "learning_rate": 8.093677057793345e-05, + "loss": 0.9821, + "num_input_tokens_seen": 43829656, + "step": 2724 + }, + { + "epoch": 0.1908814696121954, + "grad_norm": 3.707667827606201, + "learning_rate": 8.092977232924693e-05, + "loss": 1.1489, + "num_input_tokens_seen": 43846040, + "step": 2725 + }, + { + "epoch": 0.19095151785792464, + "grad_norm": 3.3822734355926514, + "learning_rate": 8.092277408056043e-05, + "loss": 1.0051, + "num_input_tokens_seen": 43862144, + "step": 2726 + }, + { + "epoch": 0.1910215661036539, + "grad_norm": 3.7703781127929688, + "learning_rate": 8.091577583187392e-05, + "loss": 1.0363, + "num_input_tokens_seen": 43878328, + "step": 2727 + }, + { + "epoch": 0.19109161434938313, + "grad_norm": 3.902003049850464, + "learning_rate": 8.090877758318739e-05, + "loss": 1.0051, + "num_input_tokens_seen": 43893480, + "step": 2728 + }, + { + "epoch": 0.19116166259511239, + "grad_norm": 3.971395969390869, + "learning_rate": 8.090177933450088e-05, + "loss": 1.0469, + "num_input_tokens_seen": 43909752, + "step": 2729 + }, + { + "epoch": 0.19123171084084162, + "grad_norm": 3.4233641624450684, + "learning_rate": 8.089478108581437e-05, + "loss": 0.8821, + "num_input_tokens_seen": 43926136, + "step": 2730 + }, + { + "epoch": 0.19130175908657088, + "grad_norm": 5.967614650726318, + "learning_rate": 8.088778283712784e-05, + "loss": 1.1995, + "num_input_tokens_seen": 43941592, + "step": 2731 + }, + { + "epoch": 0.1913718073323001, + "grad_norm": 4.431912899017334, + "learning_rate": 8.088078458844133e-05, + "loss": 1.2471, + "num_input_tokens_seen": 43957784, + "step": 2732 + }, + { + "epoch": 0.19144185557802937, + "grad_norm": 3.659182071685791, + "learning_rate": 8.087378633975482e-05, + "loss": 0.9701, + "num_input_tokens_seen": 43973648, + "step": 2733 + }, + { + "epoch": 0.1915119038237586, + "grad_norm": 4.983634948730469, + "learning_rate": 8.086678809106831e-05, + "loss": 1.1023, + "num_input_tokens_seen": 43990032, + "step": 2734 + }, + { + "epoch": 0.19158195206948786, + "grad_norm": 4.236748695373535, + "learning_rate": 8.085978984238179e-05, + "loss": 1.0724, + "num_input_tokens_seen": 44005064, + "step": 2735 + }, + { + "epoch": 0.19165200031521712, + "grad_norm": 3.3617727756500244, + "learning_rate": 8.085279159369527e-05, + "loss": 0.9986, + "num_input_tokens_seen": 44021448, + "step": 2736 + }, + { + "epoch": 0.19172204856094635, + "grad_norm": 3.4514083862304688, + "learning_rate": 8.084579334500876e-05, + "loss": 0.8738, + "num_input_tokens_seen": 44037832, + "step": 2737 + }, + { + "epoch": 0.1917920968066756, + "grad_norm": 4.126194000244141, + "learning_rate": 8.083879509632224e-05, + "loss": 1.1142, + "num_input_tokens_seen": 44053384, + "step": 2738 + }, + { + "epoch": 0.19186214505240484, + "grad_norm": 5.12385368347168, + "learning_rate": 8.083179684763573e-05, + "loss": 1.251, + "num_input_tokens_seen": 44068728, + "step": 2739 + }, + { + "epoch": 0.1919321932981341, + "grad_norm": 3.457253932952881, + "learning_rate": 8.082479859894923e-05, + "loss": 0.8251, + "num_input_tokens_seen": 44085112, + "step": 2740 + }, + { + "epoch": 0.19200224154386333, + "grad_norm": 3.8708858489990234, + "learning_rate": 8.08178003502627e-05, + "loss": 1.1838, + "num_input_tokens_seen": 44101456, + "step": 2741 + }, + { + "epoch": 0.1920722897895926, + "grad_norm": 4.175468921661377, + "learning_rate": 8.081080210157619e-05, + "loss": 1.0062, + "num_input_tokens_seen": 44116640, + "step": 2742 + }, + { + "epoch": 0.19214233803532182, + "grad_norm": 4.141748428344727, + "learning_rate": 8.080380385288967e-05, + "loss": 1.1609, + "num_input_tokens_seen": 44132328, + "step": 2743 + }, + { + "epoch": 0.19221238628105108, + "grad_norm": 5.1061692237854, + "learning_rate": 8.079680560420316e-05, + "loss": 1.172, + "num_input_tokens_seen": 44148712, + "step": 2744 + }, + { + "epoch": 0.19228243452678032, + "grad_norm": 3.990196704864502, + "learning_rate": 8.078980735551663e-05, + "loss": 0.9997, + "num_input_tokens_seen": 44164600, + "step": 2745 + }, + { + "epoch": 0.19235248277250958, + "grad_norm": 4.365367412567139, + "learning_rate": 8.078280910683013e-05, + "loss": 1.0672, + "num_input_tokens_seen": 44180984, + "step": 2746 + }, + { + "epoch": 0.1924225310182388, + "grad_norm": 4.092031002044678, + "learning_rate": 8.077581085814362e-05, + "loss": 1.1405, + "num_input_tokens_seen": 44196400, + "step": 2747 + }, + { + "epoch": 0.19249257926396807, + "grad_norm": 3.4052438735961914, + "learning_rate": 8.07688126094571e-05, + "loss": 1.0128, + "num_input_tokens_seen": 44212736, + "step": 2748 + }, + { + "epoch": 0.1925626275096973, + "grad_norm": 4.703436374664307, + "learning_rate": 8.076181436077059e-05, + "loss": 1.2058, + "num_input_tokens_seen": 44229120, + "step": 2749 + }, + { + "epoch": 0.19263267575542656, + "grad_norm": 3.7579853534698486, + "learning_rate": 8.075481611208407e-05, + "loss": 0.9081, + "num_input_tokens_seen": 44245144, + "step": 2750 + }, + { + "epoch": 0.1927027240011558, + "grad_norm": 3.6251869201660156, + "learning_rate": 8.074781786339755e-05, + "loss": 0.9854, + "num_input_tokens_seen": 44260920, + "step": 2751 + }, + { + "epoch": 0.19277277224688505, + "grad_norm": 3.4949889183044434, + "learning_rate": 8.074081961471104e-05, + "loss": 1.1115, + "num_input_tokens_seen": 44277280, + "step": 2752 + }, + { + "epoch": 0.19284282049261428, + "grad_norm": 4.28520393371582, + "learning_rate": 8.073382136602453e-05, + "loss": 1.2536, + "num_input_tokens_seen": 44293664, + "step": 2753 + }, + { + "epoch": 0.19291286873834354, + "grad_norm": 3.9574859142303467, + "learning_rate": 8.072682311733801e-05, + "loss": 1.1584, + "num_input_tokens_seen": 44309328, + "step": 2754 + }, + { + "epoch": 0.19298291698407277, + "grad_norm": 3.6340646743774414, + "learning_rate": 8.071982486865149e-05, + "loss": 1.0116, + "num_input_tokens_seen": 44325336, + "step": 2755 + }, + { + "epoch": 0.19305296522980203, + "grad_norm": 5.131178855895996, + "learning_rate": 8.071282661996498e-05, + "loss": 1.1226, + "num_input_tokens_seen": 44341264, + "step": 2756 + }, + { + "epoch": 0.19312301347553126, + "grad_norm": 4.273870944976807, + "learning_rate": 8.070582837127847e-05, + "loss": 1.0953, + "num_input_tokens_seen": 44357648, + "step": 2757 + }, + { + "epoch": 0.19319306172126052, + "grad_norm": 3.883690118789673, + "learning_rate": 8.069883012259194e-05, + "loss": 1.2978, + "num_input_tokens_seen": 44373984, + "step": 2758 + }, + { + "epoch": 0.19326310996698975, + "grad_norm": 4.284129619598389, + "learning_rate": 8.069183187390543e-05, + "loss": 1.0356, + "num_input_tokens_seen": 44389160, + "step": 2759 + }, + { + "epoch": 0.19333315821271901, + "grad_norm": 4.517998695373535, + "learning_rate": 8.068483362521892e-05, + "loss": 1.0378, + "num_input_tokens_seen": 44405544, + "step": 2760 + }, + { + "epoch": 0.19340320645844825, + "grad_norm": 4.098707675933838, + "learning_rate": 8.067783537653241e-05, + "loss": 1.2235, + "num_input_tokens_seen": 44421560, + "step": 2761 + }, + { + "epoch": 0.1934732547041775, + "grad_norm": 3.656461477279663, + "learning_rate": 8.067083712784588e-05, + "loss": 0.8462, + "num_input_tokens_seen": 44437944, + "step": 2762 + }, + { + "epoch": 0.19354330294990674, + "grad_norm": 3.8305914402008057, + "learning_rate": 8.066383887915937e-05, + "loss": 1.1084, + "num_input_tokens_seen": 44454208, + "step": 2763 + }, + { + "epoch": 0.193613351195636, + "grad_norm": 4.0582990646362305, + "learning_rate": 8.065684063047286e-05, + "loss": 1.2152, + "num_input_tokens_seen": 44470592, + "step": 2764 + }, + { + "epoch": 0.19368339944136523, + "grad_norm": 4.159184455871582, + "learning_rate": 8.064984238178633e-05, + "loss": 1.0183, + "num_input_tokens_seen": 44486976, + "step": 2765 + }, + { + "epoch": 0.1937534476870945, + "grad_norm": 3.7490620613098145, + "learning_rate": 8.064284413309984e-05, + "loss": 1.0883, + "num_input_tokens_seen": 44503360, + "step": 2766 + }, + { + "epoch": 0.19382349593282372, + "grad_norm": 4.3000288009643555, + "learning_rate": 8.063584588441333e-05, + "loss": 1.2323, + "num_input_tokens_seen": 44519744, + "step": 2767 + }, + { + "epoch": 0.19389354417855298, + "grad_norm": 3.9175477027893066, + "learning_rate": 8.06288476357268e-05, + "loss": 0.8758, + "num_input_tokens_seen": 44535664, + "step": 2768 + }, + { + "epoch": 0.1939635924242822, + "grad_norm": 4.4328293800354, + "learning_rate": 8.062184938704029e-05, + "loss": 1.0173, + "num_input_tokens_seen": 44550984, + "step": 2769 + }, + { + "epoch": 0.19403364067001147, + "grad_norm": 4.556321620941162, + "learning_rate": 8.061485113835376e-05, + "loss": 1.1389, + "num_input_tokens_seen": 44566808, + "step": 2770 + }, + { + "epoch": 0.19410368891574073, + "grad_norm": 4.382159233093262, + "learning_rate": 8.060785288966725e-05, + "loss": 1.1211, + "num_input_tokens_seen": 44583192, + "step": 2771 + }, + { + "epoch": 0.19417373716146996, + "grad_norm": 3.920137405395508, + "learning_rate": 8.060085464098074e-05, + "loss": 0.9815, + "num_input_tokens_seen": 44599480, + "step": 2772 + }, + { + "epoch": 0.19424378540719922, + "grad_norm": 4.23013162612915, + "learning_rate": 8.059385639229423e-05, + "loss": 1.2268, + "num_input_tokens_seen": 44615240, + "step": 2773 + }, + { + "epoch": 0.19431383365292845, + "grad_norm": 3.7917346954345703, + "learning_rate": 8.058685814360772e-05, + "loss": 1.0935, + "num_input_tokens_seen": 44630952, + "step": 2774 + }, + { + "epoch": 0.1943838818986577, + "grad_norm": 4.798681259155273, + "learning_rate": 8.05798598949212e-05, + "loss": 1.1321, + "num_input_tokens_seen": 44647336, + "step": 2775 + }, + { + "epoch": 0.19445393014438694, + "grad_norm": 3.563124418258667, + "learning_rate": 8.057286164623468e-05, + "loss": 1.1231, + "num_input_tokens_seen": 44663720, + "step": 2776 + }, + { + "epoch": 0.1945239783901162, + "grad_norm": 6.6064019203186035, + "learning_rate": 8.056586339754817e-05, + "loss": 0.8685, + "num_input_tokens_seen": 44679616, + "step": 2777 + }, + { + "epoch": 0.19459402663584544, + "grad_norm": 4.1651291847229, + "learning_rate": 8.055886514886165e-05, + "loss": 1.1634, + "num_input_tokens_seen": 44695800, + "step": 2778 + }, + { + "epoch": 0.1946640748815747, + "grad_norm": 3.929474353790283, + "learning_rate": 8.055186690017513e-05, + "loss": 1.1127, + "num_input_tokens_seen": 44711744, + "step": 2779 + }, + { + "epoch": 0.19473412312730393, + "grad_norm": 3.758721351623535, + "learning_rate": 8.054486865148862e-05, + "loss": 0.9218, + "num_input_tokens_seen": 44728128, + "step": 2780 + }, + { + "epoch": 0.19480417137303319, + "grad_norm": 4.988550662994385, + "learning_rate": 8.053787040280211e-05, + "loss": 1.222, + "num_input_tokens_seen": 44744512, + "step": 2781 + }, + { + "epoch": 0.19487421961876242, + "grad_norm": 3.7875940799713135, + "learning_rate": 8.053087215411559e-05, + "loss": 1.0393, + "num_input_tokens_seen": 44760896, + "step": 2782 + }, + { + "epoch": 0.19494426786449168, + "grad_norm": 3.877729654312134, + "learning_rate": 8.052387390542908e-05, + "loss": 1.1748, + "num_input_tokens_seen": 44777280, + "step": 2783 + }, + { + "epoch": 0.1950143161102209, + "grad_norm": 4.979894161224365, + "learning_rate": 8.051687565674256e-05, + "loss": 1.1506, + "num_input_tokens_seen": 44793664, + "step": 2784 + }, + { + "epoch": 0.19508436435595017, + "grad_norm": 4.3148579597473145, + "learning_rate": 8.050987740805604e-05, + "loss": 1.1587, + "num_input_tokens_seen": 44809688, + "step": 2785 + }, + { + "epoch": 0.1951544126016794, + "grad_norm": 4.082404136657715, + "learning_rate": 8.050287915936954e-05, + "loss": 1.1488, + "num_input_tokens_seen": 44825600, + "step": 2786 + }, + { + "epoch": 0.19522446084740866, + "grad_norm": 3.6951189041137695, + "learning_rate": 8.049588091068302e-05, + "loss": 1.1542, + "num_input_tokens_seen": 44841984, + "step": 2787 + }, + { + "epoch": 0.1952945090931379, + "grad_norm": 3.797136068344116, + "learning_rate": 8.04888826619965e-05, + "loss": 0.964, + "num_input_tokens_seen": 44858368, + "step": 2788 + }, + { + "epoch": 0.19536455733886715, + "grad_norm": 3.8912811279296875, + "learning_rate": 8.048188441330998e-05, + "loss": 0.8985, + "num_input_tokens_seen": 44873752, + "step": 2789 + }, + { + "epoch": 0.19543460558459638, + "grad_norm": 4.355793476104736, + "learning_rate": 8.047488616462347e-05, + "loss": 1.1546, + "num_input_tokens_seen": 44889336, + "step": 2790 + }, + { + "epoch": 0.19550465383032564, + "grad_norm": 4.216153144836426, + "learning_rate": 8.046788791593696e-05, + "loss": 1.0922, + "num_input_tokens_seen": 44905720, + "step": 2791 + }, + { + "epoch": 0.19557470207605487, + "grad_norm": 3.5346696376800537, + "learning_rate": 8.046088966725045e-05, + "loss": 1.1628, + "num_input_tokens_seen": 44921864, + "step": 2792 + }, + { + "epoch": 0.19564475032178413, + "grad_norm": 4.2197489738464355, + "learning_rate": 8.045389141856393e-05, + "loss": 1.0177, + "num_input_tokens_seen": 44938248, + "step": 2793 + }, + { + "epoch": 0.19571479856751337, + "grad_norm": 3.66995906829834, + "learning_rate": 8.044689316987742e-05, + "loss": 1.0401, + "num_input_tokens_seen": 44954632, + "step": 2794 + }, + { + "epoch": 0.19578484681324262, + "grad_norm": 5.062297821044922, + "learning_rate": 8.04398949211909e-05, + "loss": 1.2106, + "num_input_tokens_seen": 44971016, + "step": 2795 + }, + { + "epoch": 0.19585489505897186, + "grad_norm": 4.473872661590576, + "learning_rate": 8.043289667250439e-05, + "loss": 1.153, + "num_input_tokens_seen": 44987400, + "step": 2796 + }, + { + "epoch": 0.19592494330470112, + "grad_norm": 4.724556922912598, + "learning_rate": 8.042589842381786e-05, + "loss": 1.23, + "num_input_tokens_seen": 45002968, + "step": 2797 + }, + { + "epoch": 0.19599499155043035, + "grad_norm": 4.324196815490723, + "learning_rate": 8.041890017513135e-05, + "loss": 0.8708, + "num_input_tokens_seen": 45019352, + "step": 2798 + }, + { + "epoch": 0.1960650397961596, + "grad_norm": 4.309204578399658, + "learning_rate": 8.041190192644484e-05, + "loss": 1.0769, + "num_input_tokens_seen": 45034960, + "step": 2799 + }, + { + "epoch": 0.19613508804188884, + "grad_norm": 3.4928808212280273, + "learning_rate": 8.040490367775833e-05, + "loss": 0.9394, + "num_input_tokens_seen": 45051344, + "step": 2800 + }, + { + "epoch": 0.19613508804188884, + "eval_loss": 1.1401225328445435, + "eval_runtime": 0.185, + "eval_samples_per_second": 5.405, + "eval_steps_per_second": 5.405, + "num_input_tokens_seen": 45051344, + "step": 2800 + }, + { + "epoch": 0.1962051362876181, + "grad_norm": 6.388762474060059, + "learning_rate": 8.039790542907182e-05, + "loss": 1.047, + "num_input_tokens_seen": 45066712, + "step": 2801 + }, + { + "epoch": 0.19627518453334733, + "grad_norm": 3.8386781215667725, + "learning_rate": 8.039090718038529e-05, + "loss": 1.0248, + "num_input_tokens_seen": 45082472, + "step": 2802 + }, + { + "epoch": 0.1963452327790766, + "grad_norm": 3.540064573287964, + "learning_rate": 8.038390893169878e-05, + "loss": 0.846, + "num_input_tokens_seen": 45098072, + "step": 2803 + }, + { + "epoch": 0.19641528102480582, + "grad_norm": 3.9858322143554688, + "learning_rate": 8.037691068301227e-05, + "loss": 1.1443, + "num_input_tokens_seen": 45114456, + "step": 2804 + }, + { + "epoch": 0.19648532927053508, + "grad_norm": 4.418299674987793, + "learning_rate": 8.036991243432574e-05, + "loss": 1.0391, + "num_input_tokens_seen": 45130416, + "step": 2805 + }, + { + "epoch": 0.19655537751626434, + "grad_norm": 4.6108880043029785, + "learning_rate": 8.036291418563923e-05, + "loss": 0.9911, + "num_input_tokens_seen": 45146800, + "step": 2806 + }, + { + "epoch": 0.19662542576199357, + "grad_norm": 3.686781883239746, + "learning_rate": 8.035591593695272e-05, + "loss": 0.904, + "num_input_tokens_seen": 45163016, + "step": 2807 + }, + { + "epoch": 0.19669547400772283, + "grad_norm": 3.7459771633148193, + "learning_rate": 8.034891768826621e-05, + "loss": 1.0635, + "num_input_tokens_seen": 45178912, + "step": 2808 + }, + { + "epoch": 0.19676552225345206, + "grad_norm": 4.955589771270752, + "learning_rate": 8.034191943957968e-05, + "loss": 0.951, + "num_input_tokens_seen": 45193928, + "step": 2809 + }, + { + "epoch": 0.19683557049918132, + "grad_norm": 4.901642322540283, + "learning_rate": 8.033492119089317e-05, + "loss": 1.0751, + "num_input_tokens_seen": 45209080, + "step": 2810 + }, + { + "epoch": 0.19690561874491055, + "grad_norm": 3.685493230819702, + "learning_rate": 8.032792294220666e-05, + "loss": 1.0408, + "num_input_tokens_seen": 45225400, + "step": 2811 + }, + { + "epoch": 0.19697566699063981, + "grad_norm": 4.731873512268066, + "learning_rate": 8.032092469352015e-05, + "loss": 0.9684, + "num_input_tokens_seen": 45241152, + "step": 2812 + }, + { + "epoch": 0.19704571523636905, + "grad_norm": 4.52595853805542, + "learning_rate": 8.031392644483364e-05, + "loss": 1.142, + "num_input_tokens_seen": 45256976, + "step": 2813 + }, + { + "epoch": 0.1971157634820983, + "grad_norm": 4.4693074226379395, + "learning_rate": 8.030692819614711e-05, + "loss": 1.0846, + "num_input_tokens_seen": 45273360, + "step": 2814 + }, + { + "epoch": 0.19718581172782754, + "grad_norm": 5.599058151245117, + "learning_rate": 8.02999299474606e-05, + "loss": 1.1544, + "num_input_tokens_seen": 45289744, + "step": 2815 + }, + { + "epoch": 0.1972558599735568, + "grad_norm": 3.758751153945923, + "learning_rate": 8.029293169877408e-05, + "loss": 1.1877, + "num_input_tokens_seen": 45305960, + "step": 2816 + }, + { + "epoch": 0.19732590821928603, + "grad_norm": 4.059335231781006, + "learning_rate": 8.028593345008757e-05, + "loss": 1.0294, + "num_input_tokens_seen": 45321536, + "step": 2817 + }, + { + "epoch": 0.1973959564650153, + "grad_norm": 3.8090553283691406, + "learning_rate": 8.027893520140105e-05, + "loss": 1.1264, + "num_input_tokens_seen": 45337920, + "step": 2818 + }, + { + "epoch": 0.19746600471074452, + "grad_norm": 3.7900006771087646, + "learning_rate": 8.027193695271454e-05, + "loss": 1.2042, + "num_input_tokens_seen": 45353632, + "step": 2819 + }, + { + "epoch": 0.19753605295647378, + "grad_norm": 4.279977321624756, + "learning_rate": 8.026493870402803e-05, + "loss": 1.0786, + "num_input_tokens_seen": 45369712, + "step": 2820 + }, + { + "epoch": 0.197606101202203, + "grad_norm": 3.7417356967926025, + "learning_rate": 8.025794045534152e-05, + "loss": 1.0756, + "num_input_tokens_seen": 45384816, + "step": 2821 + }, + { + "epoch": 0.19767614944793227, + "grad_norm": 4.084759712219238, + "learning_rate": 8.0250942206655e-05, + "loss": 0.9187, + "num_input_tokens_seen": 45401200, + "step": 2822 + }, + { + "epoch": 0.1977461976936615, + "grad_norm": 4.963731288909912, + "learning_rate": 8.024394395796848e-05, + "loss": 1.2548, + "num_input_tokens_seen": 45417096, + "step": 2823 + }, + { + "epoch": 0.19781624593939076, + "grad_norm": 4.115303993225098, + "learning_rate": 8.023694570928196e-05, + "loss": 1.2127, + "num_input_tokens_seen": 45433480, + "step": 2824 + }, + { + "epoch": 0.19788629418512, + "grad_norm": 3.908439874649048, + "learning_rate": 8.022994746059545e-05, + "loss": 1.0171, + "num_input_tokens_seen": 45448984, + "step": 2825 + }, + { + "epoch": 0.19795634243084925, + "grad_norm": 4.0723090171813965, + "learning_rate": 8.022294921190894e-05, + "loss": 0.9883, + "num_input_tokens_seen": 45465192, + "step": 2826 + }, + { + "epoch": 0.19802639067657848, + "grad_norm": 4.219478607177734, + "learning_rate": 8.021595096322242e-05, + "loss": 1.109, + "num_input_tokens_seen": 45480904, + "step": 2827 + }, + { + "epoch": 0.19809643892230774, + "grad_norm": 4.246188163757324, + "learning_rate": 8.020895271453591e-05, + "loss": 1.3058, + "num_input_tokens_seen": 45497288, + "step": 2828 + }, + { + "epoch": 0.19816648716803698, + "grad_norm": 4.898525714874268, + "learning_rate": 8.020195446584939e-05, + "loss": 1.1058, + "num_input_tokens_seen": 45513456, + "step": 2829 + }, + { + "epoch": 0.19823653541376623, + "grad_norm": 4.1247239112854, + "learning_rate": 8.019495621716288e-05, + "loss": 1.031, + "num_input_tokens_seen": 45528752, + "step": 2830 + }, + { + "epoch": 0.19830658365949547, + "grad_norm": 4.352110385894775, + "learning_rate": 8.018795796847636e-05, + "loss": 1.3602, + "num_input_tokens_seen": 45545136, + "step": 2831 + }, + { + "epoch": 0.19837663190522473, + "grad_norm": 3.731719732284546, + "learning_rate": 8.018095971978985e-05, + "loss": 0.9833, + "num_input_tokens_seen": 45561160, + "step": 2832 + }, + { + "epoch": 0.19844668015095396, + "grad_norm": 4.234768867492676, + "learning_rate": 8.017396147110333e-05, + "loss": 1.2279, + "num_input_tokens_seen": 45577288, + "step": 2833 + }, + { + "epoch": 0.19851672839668322, + "grad_norm": 4.682285308837891, + "learning_rate": 8.016696322241682e-05, + "loss": 1.0376, + "num_input_tokens_seen": 45593152, + "step": 2834 + }, + { + "epoch": 0.19858677664241245, + "grad_norm": 4.576408863067627, + "learning_rate": 8.01599649737303e-05, + "loss": 1.225, + "num_input_tokens_seen": 45609408, + "step": 2835 + }, + { + "epoch": 0.1986568248881417, + "grad_norm": 4.209808826446533, + "learning_rate": 8.015296672504378e-05, + "loss": 1.0308, + "num_input_tokens_seen": 45625792, + "step": 2836 + }, + { + "epoch": 0.19872687313387094, + "grad_norm": 4.383143901824951, + "learning_rate": 8.014596847635727e-05, + "loss": 1.2079, + "num_input_tokens_seen": 45642176, + "step": 2837 + }, + { + "epoch": 0.1987969213796002, + "grad_norm": 4.105413913726807, + "learning_rate": 8.013897022767076e-05, + "loss": 1.0623, + "num_input_tokens_seen": 45657480, + "step": 2838 + }, + { + "epoch": 0.19886696962532946, + "grad_norm": 5.339532852172852, + "learning_rate": 8.013197197898425e-05, + "loss": 1.1131, + "num_input_tokens_seen": 45673168, + "step": 2839 + }, + { + "epoch": 0.1989370178710587, + "grad_norm": 3.8160016536712646, + "learning_rate": 8.012497373029774e-05, + "loss": 1.1392, + "num_input_tokens_seen": 45689088, + "step": 2840 + }, + { + "epoch": 0.19900706611678795, + "grad_norm": 3.763986587524414, + "learning_rate": 8.011797548161121e-05, + "loss": 1.1852, + "num_input_tokens_seen": 45705472, + "step": 2841 + }, + { + "epoch": 0.19907711436251718, + "grad_norm": 4.034756183624268, + "learning_rate": 8.01109772329247e-05, + "loss": 1.1856, + "num_input_tokens_seen": 45721168, + "step": 2842 + }, + { + "epoch": 0.19914716260824644, + "grad_norm": 3.971479654312134, + "learning_rate": 8.010397898423817e-05, + "loss": 1.1443, + "num_input_tokens_seen": 45737312, + "step": 2843 + }, + { + "epoch": 0.19921721085397567, + "grad_norm": 4.118296146392822, + "learning_rate": 8.009698073555166e-05, + "loss": 0.9964, + "num_input_tokens_seen": 45752792, + "step": 2844 + }, + { + "epoch": 0.19928725909970493, + "grad_norm": 3.628143310546875, + "learning_rate": 8.008998248686515e-05, + "loss": 1.1102, + "num_input_tokens_seen": 45769008, + "step": 2845 + }, + { + "epoch": 0.19935730734543416, + "grad_norm": 3.9946494102478027, + "learning_rate": 8.008298423817864e-05, + "loss": 1.1199, + "num_input_tokens_seen": 45785392, + "step": 2846 + }, + { + "epoch": 0.19942735559116342, + "grad_norm": 3.7445459365844727, + "learning_rate": 8.007598598949213e-05, + "loss": 1.1245, + "num_input_tokens_seen": 45801320, + "step": 2847 + }, + { + "epoch": 0.19949740383689266, + "grad_norm": 3.745481491088867, + "learning_rate": 8.006898774080562e-05, + "loss": 1.0969, + "num_input_tokens_seen": 45817504, + "step": 2848 + }, + { + "epoch": 0.19956745208262192, + "grad_norm": 4.1305766105651855, + "learning_rate": 8.006198949211909e-05, + "loss": 1.0953, + "num_input_tokens_seen": 45833888, + "step": 2849 + }, + { + "epoch": 0.19963750032835115, + "grad_norm": 3.7843470573425293, + "learning_rate": 8.005499124343258e-05, + "loss": 1.111, + "num_input_tokens_seen": 45850272, + "step": 2850 + }, + { + "epoch": 0.1997075485740804, + "grad_norm": 3.9884989261627197, + "learning_rate": 8.004799299474606e-05, + "loss": 1.083, + "num_input_tokens_seen": 45866656, + "step": 2851 + }, + { + "epoch": 0.19977759681980964, + "grad_norm": 3.7280545234680176, + "learning_rate": 8.004099474605956e-05, + "loss": 1.0036, + "num_input_tokens_seen": 45882776, + "step": 2852 + }, + { + "epoch": 0.1998476450655389, + "grad_norm": 5.151428699493408, + "learning_rate": 8.003399649737303e-05, + "loss": 1.2988, + "num_input_tokens_seen": 45898520, + "step": 2853 + }, + { + "epoch": 0.19991769331126813, + "grad_norm": 6.738519191741943, + "learning_rate": 8.002699824868652e-05, + "loss": 1.1934, + "num_input_tokens_seen": 45914904, + "step": 2854 + }, + { + "epoch": 0.1999877415569974, + "grad_norm": 4.689775466918945, + "learning_rate": 8.002000000000001e-05, + "loss": 1.3534, + "num_input_tokens_seen": 45931288, + "step": 2855 + }, + { + "epoch": 0.20005778980272662, + "grad_norm": 4.047792911529541, + "learning_rate": 8.001300175131348e-05, + "loss": 1.2926, + "num_input_tokens_seen": 45947672, + "step": 2856 + }, + { + "epoch": 0.20012783804845588, + "grad_norm": 4.609661102294922, + "learning_rate": 8.000600350262697e-05, + "loss": 1.0717, + "num_input_tokens_seen": 45964056, + "step": 2857 + }, + { + "epoch": 0.2001978862941851, + "grad_norm": 4.188840389251709, + "learning_rate": 7.999900525394046e-05, + "loss": 1.0872, + "num_input_tokens_seen": 45980152, + "step": 2858 + }, + { + "epoch": 0.20026793453991437, + "grad_norm": 3.558335781097412, + "learning_rate": 7.999200700525395e-05, + "loss": 1.1207, + "num_input_tokens_seen": 45996536, + "step": 2859 + }, + { + "epoch": 0.2003379827856436, + "grad_norm": 10.145834922790527, + "learning_rate": 7.998500875656743e-05, + "loss": 1.0649, + "num_input_tokens_seen": 46011616, + "step": 2860 + }, + { + "epoch": 0.20040803103137286, + "grad_norm": 5.534536838531494, + "learning_rate": 7.997801050788091e-05, + "loss": 1.3019, + "num_input_tokens_seen": 46027016, + "step": 2861 + }, + { + "epoch": 0.2004780792771021, + "grad_norm": 4.258336544036865, + "learning_rate": 7.99710122591944e-05, + "loss": 1.1192, + "num_input_tokens_seen": 46043400, + "step": 2862 + }, + { + "epoch": 0.20054812752283135, + "grad_norm": 5.266301155090332, + "learning_rate": 7.996401401050788e-05, + "loss": 1.0048, + "num_input_tokens_seen": 46059784, + "step": 2863 + }, + { + "epoch": 0.20061817576856059, + "grad_norm": 4.502764701843262, + "learning_rate": 7.995701576182137e-05, + "loss": 0.9435, + "num_input_tokens_seen": 46075584, + "step": 2864 + }, + { + "epoch": 0.20068822401428985, + "grad_norm": 4.39752197265625, + "learning_rate": 7.995001751313485e-05, + "loss": 0.9992, + "num_input_tokens_seen": 46091520, + "step": 2865 + }, + { + "epoch": 0.20075827226001908, + "grad_norm": 3.9562480449676514, + "learning_rate": 7.994301926444834e-05, + "loss": 0.9935, + "num_input_tokens_seen": 46107568, + "step": 2866 + }, + { + "epoch": 0.20082832050574834, + "grad_norm": 4.466681957244873, + "learning_rate": 7.993602101576183e-05, + "loss": 1.0067, + "num_input_tokens_seen": 46123952, + "step": 2867 + }, + { + "epoch": 0.20089836875147757, + "grad_norm": 3.9317095279693604, + "learning_rate": 7.992902276707531e-05, + "loss": 1.0353, + "num_input_tokens_seen": 46140336, + "step": 2868 + }, + { + "epoch": 0.20096841699720683, + "grad_norm": 5.025266170501709, + "learning_rate": 7.99220245183888e-05, + "loss": 1.1297, + "num_input_tokens_seen": 46155504, + "step": 2869 + }, + { + "epoch": 0.20103846524293606, + "grad_norm": 3.82340931892395, + "learning_rate": 7.991502626970227e-05, + "loss": 1.1677, + "num_input_tokens_seen": 46171888, + "step": 2870 + }, + { + "epoch": 0.20110851348866532, + "grad_norm": 4.017914295196533, + "learning_rate": 7.990802802101576e-05, + "loss": 1.0779, + "num_input_tokens_seen": 46187712, + "step": 2871 + }, + { + "epoch": 0.20117856173439455, + "grad_norm": 4.053089618682861, + "learning_rate": 7.990102977232926e-05, + "loss": 0.9687, + "num_input_tokens_seen": 46202912, + "step": 2872 + }, + { + "epoch": 0.2012486099801238, + "grad_norm": 3.5664076805114746, + "learning_rate": 7.989403152364274e-05, + "loss": 1.0047, + "num_input_tokens_seen": 46219296, + "step": 2873 + }, + { + "epoch": 0.20131865822585307, + "grad_norm": 4.039318084716797, + "learning_rate": 7.988703327495623e-05, + "loss": 1.107, + "num_input_tokens_seen": 46235680, + "step": 2874 + }, + { + "epoch": 0.2013887064715823, + "grad_norm": 3.8851678371429443, + "learning_rate": 7.988003502626971e-05, + "loss": 1.0268, + "num_input_tokens_seen": 46251408, + "step": 2875 + }, + { + "epoch": 0.20145875471731156, + "grad_norm": 3.581632137298584, + "learning_rate": 7.987303677758319e-05, + "loss": 0.9255, + "num_input_tokens_seen": 46267696, + "step": 2876 + }, + { + "epoch": 0.2015288029630408, + "grad_norm": 4.135960102081299, + "learning_rate": 7.986603852889668e-05, + "loss": 1.1763, + "num_input_tokens_seen": 46284080, + "step": 2877 + }, + { + "epoch": 0.20159885120877005, + "grad_norm": 3.649959087371826, + "learning_rate": 7.985904028021017e-05, + "loss": 0.8932, + "num_input_tokens_seen": 46300456, + "step": 2878 + }, + { + "epoch": 0.20166889945449928, + "grad_norm": 4.564159393310547, + "learning_rate": 7.985204203152365e-05, + "loss": 0.9239, + "num_input_tokens_seen": 46315928, + "step": 2879 + }, + { + "epoch": 0.20173894770022854, + "grad_norm": 3.806626796722412, + "learning_rate": 7.984504378283713e-05, + "loss": 1.0011, + "num_input_tokens_seen": 46331520, + "step": 2880 + }, + { + "epoch": 0.20180899594595778, + "grad_norm": 6.621458530426025, + "learning_rate": 7.983804553415062e-05, + "loss": 1.045, + "num_input_tokens_seen": 46347904, + "step": 2881 + }, + { + "epoch": 0.20187904419168703, + "grad_norm": 4.554089546203613, + "learning_rate": 7.983104728546411e-05, + "loss": 0.9472, + "num_input_tokens_seen": 46364288, + "step": 2882 + }, + { + "epoch": 0.20194909243741627, + "grad_norm": 4.206694602966309, + "learning_rate": 7.982404903677758e-05, + "loss": 1.1913, + "num_input_tokens_seen": 46380672, + "step": 2883 + }, + { + "epoch": 0.20201914068314553, + "grad_norm": 6.333064079284668, + "learning_rate": 7.981705078809107e-05, + "loss": 1.1189, + "num_input_tokens_seen": 46396384, + "step": 2884 + }, + { + "epoch": 0.20208918892887476, + "grad_norm": 3.6293835639953613, + "learning_rate": 7.981005253940456e-05, + "loss": 0.9825, + "num_input_tokens_seen": 46412712, + "step": 2885 + }, + { + "epoch": 0.20215923717460402, + "grad_norm": 6.282841682434082, + "learning_rate": 7.980305429071805e-05, + "loss": 1.0498, + "num_input_tokens_seen": 46429096, + "step": 2886 + }, + { + "epoch": 0.20222928542033325, + "grad_norm": 3.661564350128174, + "learning_rate": 7.979605604203152e-05, + "loss": 0.9022, + "num_input_tokens_seen": 46445480, + "step": 2887 + }, + { + "epoch": 0.2022993336660625, + "grad_norm": 4.232359409332275, + "learning_rate": 7.978905779334501e-05, + "loss": 1.3196, + "num_input_tokens_seen": 46461344, + "step": 2888 + }, + { + "epoch": 0.20236938191179174, + "grad_norm": 3.9777348041534424, + "learning_rate": 7.97820595446585e-05, + "loss": 1.1121, + "num_input_tokens_seen": 46477728, + "step": 2889 + }, + { + "epoch": 0.202439430157521, + "grad_norm": 4.221210479736328, + "learning_rate": 7.977506129597197e-05, + "loss": 1.1899, + "num_input_tokens_seen": 46493680, + "step": 2890 + }, + { + "epoch": 0.20250947840325023, + "grad_norm": 4.210818767547607, + "learning_rate": 7.976806304728546e-05, + "loss": 1.1003, + "num_input_tokens_seen": 46510064, + "step": 2891 + }, + { + "epoch": 0.2025795266489795, + "grad_norm": 5.012551307678223, + "learning_rate": 7.976106479859895e-05, + "loss": 0.9933, + "num_input_tokens_seen": 46526448, + "step": 2892 + }, + { + "epoch": 0.20264957489470872, + "grad_norm": 3.4867520332336426, + "learning_rate": 7.975406654991244e-05, + "loss": 0.8495, + "num_input_tokens_seen": 46542832, + "step": 2893 + }, + { + "epoch": 0.20271962314043798, + "grad_norm": 4.74222993850708, + "learning_rate": 7.974706830122593e-05, + "loss": 1.1398, + "num_input_tokens_seen": 46559048, + "step": 2894 + }, + { + "epoch": 0.20278967138616721, + "grad_norm": 5.358060359954834, + "learning_rate": 7.97400700525394e-05, + "loss": 1.0004, + "num_input_tokens_seen": 46575400, + "step": 2895 + }, + { + "epoch": 0.20285971963189647, + "grad_norm": 4.2599053382873535, + "learning_rate": 7.973307180385289e-05, + "loss": 1.0021, + "num_input_tokens_seen": 46591064, + "step": 2896 + }, + { + "epoch": 0.2029297678776257, + "grad_norm": 5.993118762969971, + "learning_rate": 7.972607355516637e-05, + "loss": 1.2017, + "num_input_tokens_seen": 46606504, + "step": 2897 + }, + { + "epoch": 0.20299981612335496, + "grad_norm": 4.129568576812744, + "learning_rate": 7.971907530647987e-05, + "loss": 1.2929, + "num_input_tokens_seen": 46622400, + "step": 2898 + }, + { + "epoch": 0.2030698643690842, + "grad_norm": 3.8486111164093018, + "learning_rate": 7.971207705779336e-05, + "loss": 1.0113, + "num_input_tokens_seen": 46638752, + "step": 2899 + }, + { + "epoch": 0.20313991261481346, + "grad_norm": 4.262311935424805, + "learning_rate": 7.970507880910683e-05, + "loss": 1.1222, + "num_input_tokens_seen": 46655136, + "step": 2900 + }, + { + "epoch": 0.2032099608605427, + "grad_norm": 4.065335750579834, + "learning_rate": 7.969808056042032e-05, + "loss": 1.2965, + "num_input_tokens_seen": 46671520, + "step": 2901 + }, + { + "epoch": 0.20328000910627195, + "grad_norm": 3.8313064575195312, + "learning_rate": 7.969108231173381e-05, + "loss": 1.1245, + "num_input_tokens_seen": 46687904, + "step": 2902 + }, + { + "epoch": 0.20335005735200118, + "grad_norm": 3.711580276489258, + "learning_rate": 7.968408406304729e-05, + "loss": 1.1688, + "num_input_tokens_seen": 46704088, + "step": 2903 + }, + { + "epoch": 0.20342010559773044, + "grad_norm": 4.172581672668457, + "learning_rate": 7.967708581436077e-05, + "loss": 1.1609, + "num_input_tokens_seen": 46720360, + "step": 2904 + }, + { + "epoch": 0.20349015384345967, + "grad_norm": 4.7567267417907715, + "learning_rate": 7.967008756567426e-05, + "loss": 1.169, + "num_input_tokens_seen": 46735560, + "step": 2905 + }, + { + "epoch": 0.20356020208918893, + "grad_norm": 4.304897308349609, + "learning_rate": 7.966308931698775e-05, + "loss": 0.9359, + "num_input_tokens_seen": 46751720, + "step": 2906 + }, + { + "epoch": 0.20363025033491816, + "grad_norm": 4.0556864738464355, + "learning_rate": 7.965609106830123e-05, + "loss": 1.0763, + "num_input_tokens_seen": 46767432, + "step": 2907 + }, + { + "epoch": 0.20370029858064742, + "grad_norm": 3.7381911277770996, + "learning_rate": 7.964909281961472e-05, + "loss": 1.0158, + "num_input_tokens_seen": 46783488, + "step": 2908 + }, + { + "epoch": 0.20377034682637668, + "grad_norm": 4.363048553466797, + "learning_rate": 7.96420945709282e-05, + "loss": 0.9627, + "num_input_tokens_seen": 46799016, + "step": 2909 + }, + { + "epoch": 0.2038403950721059, + "grad_norm": 4.04617166519165, + "learning_rate": 7.963509632224168e-05, + "loss": 1.1312, + "num_input_tokens_seen": 46815400, + "step": 2910 + }, + { + "epoch": 0.20391044331783517, + "grad_norm": 3.8854830265045166, + "learning_rate": 7.962809807355517e-05, + "loss": 1.0525, + "num_input_tokens_seen": 46831784, + "step": 2911 + }, + { + "epoch": 0.2039804915635644, + "grad_norm": 4.197749614715576, + "learning_rate": 7.962109982486866e-05, + "loss": 1.0839, + "num_input_tokens_seen": 46848168, + "step": 2912 + }, + { + "epoch": 0.20405053980929366, + "grad_norm": 4.414098739624023, + "learning_rate": 7.961410157618214e-05, + "loss": 1.1576, + "num_input_tokens_seen": 46864552, + "step": 2913 + }, + { + "epoch": 0.2041205880550229, + "grad_norm": 3.7771573066711426, + "learning_rate": 7.960710332749562e-05, + "loss": 0.9597, + "num_input_tokens_seen": 46880936, + "step": 2914 + }, + { + "epoch": 0.20419063630075215, + "grad_norm": 4.179026126861572, + "learning_rate": 7.960010507880911e-05, + "loss": 1.0754, + "num_input_tokens_seen": 46897192, + "step": 2915 + }, + { + "epoch": 0.20426068454648139, + "grad_norm": 4.017509460449219, + "learning_rate": 7.95931068301226e-05, + "loss": 1.0476, + "num_input_tokens_seen": 46913576, + "step": 2916 + }, + { + "epoch": 0.20433073279221065, + "grad_norm": 5.863056182861328, + "learning_rate": 7.958610858143607e-05, + "loss": 1.235, + "num_input_tokens_seen": 46929960, + "step": 2917 + }, + { + "epoch": 0.20440078103793988, + "grad_norm": 5.267307281494141, + "learning_rate": 7.957911033274956e-05, + "loss": 1.2414, + "num_input_tokens_seen": 46946344, + "step": 2918 + }, + { + "epoch": 0.20447082928366914, + "grad_norm": 5.20788049697876, + "learning_rate": 7.957211208406306e-05, + "loss": 1.1215, + "num_input_tokens_seen": 46961712, + "step": 2919 + }, + { + "epoch": 0.20454087752939837, + "grad_norm": 4.609791278839111, + "learning_rate": 7.956511383537654e-05, + "loss": 1.0219, + "num_input_tokens_seen": 46977752, + "step": 2920 + }, + { + "epoch": 0.20461092577512763, + "grad_norm": 3.9752824306488037, + "learning_rate": 7.955811558669003e-05, + "loss": 1.1427, + "num_input_tokens_seen": 46994136, + "step": 2921 + }, + { + "epoch": 0.20468097402085686, + "grad_norm": 3.8456339836120605, + "learning_rate": 7.95511173380035e-05, + "loss": 1.1006, + "num_input_tokens_seen": 47010520, + "step": 2922 + }, + { + "epoch": 0.20475102226658612, + "grad_norm": 4.087759494781494, + "learning_rate": 7.954411908931699e-05, + "loss": 1.0535, + "num_input_tokens_seen": 47026904, + "step": 2923 + }, + { + "epoch": 0.20482107051231535, + "grad_norm": 3.9754104614257812, + "learning_rate": 7.953712084063048e-05, + "loss": 1.0334, + "num_input_tokens_seen": 47043288, + "step": 2924 + }, + { + "epoch": 0.2048911187580446, + "grad_norm": 3.61798357963562, + "learning_rate": 7.953012259194397e-05, + "loss": 1.1471, + "num_input_tokens_seen": 47059672, + "step": 2925 + }, + { + "epoch": 0.20496116700377384, + "grad_norm": 4.015439510345459, + "learning_rate": 7.952312434325746e-05, + "loss": 1.0836, + "num_input_tokens_seen": 47074232, + "step": 2926 + }, + { + "epoch": 0.2050312152495031, + "grad_norm": 5.869642734527588, + "learning_rate": 7.951612609457093e-05, + "loss": 1.275, + "num_input_tokens_seen": 47090616, + "step": 2927 + }, + { + "epoch": 0.20510126349523233, + "grad_norm": 4.0500922203063965, + "learning_rate": 7.950912784588442e-05, + "loss": 1.1142, + "num_input_tokens_seen": 47106656, + "step": 2928 + }, + { + "epoch": 0.2051713117409616, + "grad_norm": 5.468737602233887, + "learning_rate": 7.950212959719791e-05, + "loss": 1.2679, + "num_input_tokens_seen": 47122648, + "step": 2929 + }, + { + "epoch": 0.20524135998669082, + "grad_norm": 3.842905282974243, + "learning_rate": 7.949513134851138e-05, + "loss": 1.0889, + "num_input_tokens_seen": 47139032, + "step": 2930 + }, + { + "epoch": 0.20531140823242008, + "grad_norm": 4.24273681640625, + "learning_rate": 7.948813309982487e-05, + "loss": 1.0533, + "num_input_tokens_seen": 47154344, + "step": 2931 + }, + { + "epoch": 0.20538145647814932, + "grad_norm": 3.977433443069458, + "learning_rate": 7.948113485113836e-05, + "loss": 0.9184, + "num_input_tokens_seen": 47170728, + "step": 2932 + }, + { + "epoch": 0.20545150472387858, + "grad_norm": 3.8441646099090576, + "learning_rate": 7.947413660245185e-05, + "loss": 1.1266, + "num_input_tokens_seen": 47187112, + "step": 2933 + }, + { + "epoch": 0.2055215529696078, + "grad_norm": 3.3789381980895996, + "learning_rate": 7.946713835376532e-05, + "loss": 0.9244, + "num_input_tokens_seen": 47203400, + "step": 2934 + }, + { + "epoch": 0.20559160121533707, + "grad_norm": 3.817631483078003, + "learning_rate": 7.946014010507881e-05, + "loss": 1.198, + "num_input_tokens_seen": 47219784, + "step": 2935 + }, + { + "epoch": 0.2056616494610663, + "grad_norm": 3.788300037384033, + "learning_rate": 7.94531418563923e-05, + "loss": 1.1565, + "num_input_tokens_seen": 47236168, + "step": 2936 + }, + { + "epoch": 0.20573169770679556, + "grad_norm": 3.852132558822632, + "learning_rate": 7.944614360770578e-05, + "loss": 1.1259, + "num_input_tokens_seen": 47252288, + "step": 2937 + }, + { + "epoch": 0.2058017459525248, + "grad_norm": 3.8631093502044678, + "learning_rate": 7.943914535901926e-05, + "loss": 1.091, + "num_input_tokens_seen": 47267000, + "step": 2938 + }, + { + "epoch": 0.20587179419825405, + "grad_norm": 3.72165846824646, + "learning_rate": 7.943214711033275e-05, + "loss": 0.7975, + "num_input_tokens_seen": 47282832, + "step": 2939 + }, + { + "epoch": 0.20594184244398328, + "grad_norm": 4.04188871383667, + "learning_rate": 7.942514886164624e-05, + "loss": 1.0953, + "num_input_tokens_seen": 47298320, + "step": 2940 + }, + { + "epoch": 0.20601189068971254, + "grad_norm": 3.5907206535339355, + "learning_rate": 7.941815061295972e-05, + "loss": 0.9766, + "num_input_tokens_seen": 47314704, + "step": 2941 + }, + { + "epoch": 0.20608193893544177, + "grad_norm": 5.023667335510254, + "learning_rate": 7.94111523642732e-05, + "loss": 1.2083, + "num_input_tokens_seen": 47331088, + "step": 2942 + }, + { + "epoch": 0.20615198718117103, + "grad_norm": 3.8885724544525146, + "learning_rate": 7.94041541155867e-05, + "loss": 0.9374, + "num_input_tokens_seen": 47347424, + "step": 2943 + }, + { + "epoch": 0.2062220354269003, + "grad_norm": 4.289493560791016, + "learning_rate": 7.939715586690017e-05, + "loss": 1.0399, + "num_input_tokens_seen": 47363808, + "step": 2944 + }, + { + "epoch": 0.20629208367262952, + "grad_norm": 4.976572513580322, + "learning_rate": 7.939015761821367e-05, + "loss": 0.8901, + "num_input_tokens_seen": 47379152, + "step": 2945 + }, + { + "epoch": 0.20636213191835878, + "grad_norm": 4.0893425941467285, + "learning_rate": 7.938315936952716e-05, + "loss": 1.0622, + "num_input_tokens_seen": 47395536, + "step": 2946 + }, + { + "epoch": 0.206432180164088, + "grad_norm": 3.799873113632202, + "learning_rate": 7.937616112084063e-05, + "loss": 1.1433, + "num_input_tokens_seen": 47410968, + "step": 2947 + }, + { + "epoch": 0.20650222840981727, + "grad_norm": 4.688945293426514, + "learning_rate": 7.936916287215412e-05, + "loss": 1.1424, + "num_input_tokens_seen": 47427352, + "step": 2948 + }, + { + "epoch": 0.2065722766555465, + "grad_norm": 3.6503846645355225, + "learning_rate": 7.93621646234676e-05, + "loss": 0.9236, + "num_input_tokens_seen": 47443736, + "step": 2949 + }, + { + "epoch": 0.20664232490127576, + "grad_norm": 4.2314324378967285, + "learning_rate": 7.935516637478109e-05, + "loss": 1.2795, + "num_input_tokens_seen": 47460120, + "step": 2950 + }, + { + "epoch": 0.206712373147005, + "grad_norm": 5.159674644470215, + "learning_rate": 7.934816812609458e-05, + "loss": 0.8852, + "num_input_tokens_seen": 47476256, + "step": 2951 + }, + { + "epoch": 0.20678242139273426, + "grad_norm": 3.798804759979248, + "learning_rate": 7.934116987740806e-05, + "loss": 1.1161, + "num_input_tokens_seen": 47492208, + "step": 2952 + }, + { + "epoch": 0.2068524696384635, + "grad_norm": 4.233975887298584, + "learning_rate": 7.933417162872155e-05, + "loss": 1.0927, + "num_input_tokens_seen": 47507728, + "step": 2953 + }, + { + "epoch": 0.20692251788419275, + "grad_norm": 3.38350772857666, + "learning_rate": 7.932717338003503e-05, + "loss": 1.0429, + "num_input_tokens_seen": 47523992, + "step": 2954 + }, + { + "epoch": 0.20699256612992198, + "grad_norm": 3.94380521774292, + "learning_rate": 7.932017513134852e-05, + "loss": 0.9227, + "num_input_tokens_seen": 47540376, + "step": 2955 + }, + { + "epoch": 0.20706261437565124, + "grad_norm": 3.887354612350464, + "learning_rate": 7.9313176882662e-05, + "loss": 0.9709, + "num_input_tokens_seen": 47555336, + "step": 2956 + }, + { + "epoch": 0.20713266262138047, + "grad_norm": 4.271602153778076, + "learning_rate": 7.930617863397548e-05, + "loss": 1.3089, + "num_input_tokens_seen": 47570520, + "step": 2957 + }, + { + "epoch": 0.20720271086710973, + "grad_norm": 4.119933605194092, + "learning_rate": 7.929918038528897e-05, + "loss": 1.0162, + "num_input_tokens_seen": 47586904, + "step": 2958 + }, + { + "epoch": 0.20727275911283896, + "grad_norm": 6.137136936187744, + "learning_rate": 7.929218213660246e-05, + "loss": 0.7847, + "num_input_tokens_seen": 47602424, + "step": 2959 + }, + { + "epoch": 0.20734280735856822, + "grad_norm": 3.5264923572540283, + "learning_rate": 7.928518388791595e-05, + "loss": 1.0751, + "num_input_tokens_seen": 47618808, + "step": 2960 + }, + { + "epoch": 0.20741285560429745, + "grad_norm": 4.183988094329834, + "learning_rate": 7.927818563922942e-05, + "loss": 1.1901, + "num_input_tokens_seen": 47634576, + "step": 2961 + }, + { + "epoch": 0.2074829038500267, + "grad_norm": 3.486311197280884, + "learning_rate": 7.927118739054291e-05, + "loss": 0.8559, + "num_input_tokens_seen": 47649920, + "step": 2962 + }, + { + "epoch": 0.20755295209575594, + "grad_norm": 4.561336994171143, + "learning_rate": 7.92641891418564e-05, + "loss": 0.9521, + "num_input_tokens_seen": 47666304, + "step": 2963 + }, + { + "epoch": 0.2076230003414852, + "grad_norm": 4.002289295196533, + "learning_rate": 7.925719089316987e-05, + "loss": 1.1708, + "num_input_tokens_seen": 47682688, + "step": 2964 + }, + { + "epoch": 0.20769304858721443, + "grad_norm": 3.694175958633423, + "learning_rate": 7.925019264448338e-05, + "loss": 0.9635, + "num_input_tokens_seen": 47699072, + "step": 2965 + }, + { + "epoch": 0.2077630968329437, + "grad_norm": 3.7827298641204834, + "learning_rate": 7.924319439579685e-05, + "loss": 1.0921, + "num_input_tokens_seen": 47714720, + "step": 2966 + }, + { + "epoch": 0.20783314507867293, + "grad_norm": 3.8371527194976807, + "learning_rate": 7.923619614711034e-05, + "loss": 1.12, + "num_input_tokens_seen": 47730904, + "step": 2967 + }, + { + "epoch": 0.20790319332440219, + "grad_norm": 4.20089054107666, + "learning_rate": 7.922919789842381e-05, + "loss": 1.0999, + "num_input_tokens_seen": 47747288, + "step": 2968 + }, + { + "epoch": 0.20797324157013142, + "grad_norm": 3.978065252304077, + "learning_rate": 7.92221996497373e-05, + "loss": 1.0472, + "num_input_tokens_seen": 47763672, + "step": 2969 + }, + { + "epoch": 0.20804328981586068, + "grad_norm": 4.882012844085693, + "learning_rate": 7.921520140105079e-05, + "loss": 1.0838, + "num_input_tokens_seen": 47778888, + "step": 2970 + }, + { + "epoch": 0.2081133380615899, + "grad_norm": 4.202088356018066, + "learning_rate": 7.920820315236428e-05, + "loss": 1.178, + "num_input_tokens_seen": 47795272, + "step": 2971 + }, + { + "epoch": 0.20818338630731917, + "grad_norm": 3.623647928237915, + "learning_rate": 7.920120490367777e-05, + "loss": 0.9782, + "num_input_tokens_seen": 47811656, + "step": 2972 + }, + { + "epoch": 0.2082534345530484, + "grad_norm": 4.158148765563965, + "learning_rate": 7.919420665499126e-05, + "loss": 1.0585, + "num_input_tokens_seen": 47827520, + "step": 2973 + }, + { + "epoch": 0.20832348279877766, + "grad_norm": 4.016353130340576, + "learning_rate": 7.918720840630473e-05, + "loss": 1.0176, + "num_input_tokens_seen": 47843904, + "step": 2974 + }, + { + "epoch": 0.2083935310445069, + "grad_norm": 5.862729072570801, + "learning_rate": 7.918021015761822e-05, + "loss": 1.0233, + "num_input_tokens_seen": 47860288, + "step": 2975 + }, + { + "epoch": 0.20846357929023615, + "grad_norm": 4.194519519805908, + "learning_rate": 7.91732119089317e-05, + "loss": 1.13, + "num_input_tokens_seen": 47876536, + "step": 2976 + }, + { + "epoch": 0.20853362753596538, + "grad_norm": 3.925144672393799, + "learning_rate": 7.916621366024518e-05, + "loss": 1.0069, + "num_input_tokens_seen": 47892216, + "step": 2977 + }, + { + "epoch": 0.20860367578169464, + "grad_norm": 4.005881309509277, + "learning_rate": 7.915921541155867e-05, + "loss": 1.1126, + "num_input_tokens_seen": 47907840, + "step": 2978 + }, + { + "epoch": 0.2086737240274239, + "grad_norm": 3.6061627864837646, + "learning_rate": 7.915221716287216e-05, + "loss": 0.8235, + "num_input_tokens_seen": 47923832, + "step": 2979 + }, + { + "epoch": 0.20874377227315313, + "grad_norm": 4.407896041870117, + "learning_rate": 7.914521891418565e-05, + "loss": 0.962, + "num_input_tokens_seen": 47940216, + "step": 2980 + }, + { + "epoch": 0.2088138205188824, + "grad_norm": 4.089472770690918, + "learning_rate": 7.913822066549912e-05, + "loss": 1.0691, + "num_input_tokens_seen": 47956600, + "step": 2981 + }, + { + "epoch": 0.20888386876461162, + "grad_norm": 4.384250640869141, + "learning_rate": 7.913122241681261e-05, + "loss": 1.1681, + "num_input_tokens_seen": 47972984, + "step": 2982 + }, + { + "epoch": 0.20895391701034088, + "grad_norm": 3.881756544113159, + "learning_rate": 7.91242241681261e-05, + "loss": 1.1473, + "num_input_tokens_seen": 47989368, + "step": 2983 + }, + { + "epoch": 0.20902396525607012, + "grad_norm": 3.9435884952545166, + "learning_rate": 7.911722591943958e-05, + "loss": 1.0328, + "num_input_tokens_seen": 48005608, + "step": 2984 + }, + { + "epoch": 0.20909401350179938, + "grad_norm": 4.1196794509887695, + "learning_rate": 7.911022767075308e-05, + "loss": 1.0287, + "num_input_tokens_seen": 48021992, + "step": 2985 + }, + { + "epoch": 0.2091640617475286, + "grad_norm": 4.482571125030518, + "learning_rate": 7.910322942206655e-05, + "loss": 1.0663, + "num_input_tokens_seen": 48037816, + "step": 2986 + }, + { + "epoch": 0.20923410999325787, + "grad_norm": 5.359109401702881, + "learning_rate": 7.909623117338004e-05, + "loss": 1.2157, + "num_input_tokens_seen": 48054200, + "step": 2987 + }, + { + "epoch": 0.2093041582389871, + "grad_norm": 5.712708950042725, + "learning_rate": 7.908923292469352e-05, + "loss": 1.1454, + "num_input_tokens_seen": 48070008, + "step": 2988 + }, + { + "epoch": 0.20937420648471636, + "grad_norm": 3.980526924133301, + "learning_rate": 7.9082234676007e-05, + "loss": 1.1933, + "num_input_tokens_seen": 48084864, + "step": 2989 + }, + { + "epoch": 0.2094442547304456, + "grad_norm": 4.963679790496826, + "learning_rate": 7.90752364273205e-05, + "loss": 1.1465, + "num_input_tokens_seen": 48101248, + "step": 2990 + }, + { + "epoch": 0.20951430297617485, + "grad_norm": 6.20939302444458, + "learning_rate": 7.906823817863398e-05, + "loss": 1.1187, + "num_input_tokens_seen": 48114984, + "step": 2991 + }, + { + "epoch": 0.20958435122190408, + "grad_norm": 13.218465805053711, + "learning_rate": 7.906123992994747e-05, + "loss": 1.0589, + "num_input_tokens_seen": 48129704, + "step": 2992 + }, + { + "epoch": 0.20965439946763334, + "grad_norm": 6.285522937774658, + "learning_rate": 7.905424168126095e-05, + "loss": 1.0993, + "num_input_tokens_seen": 48144280, + "step": 2993 + }, + { + "epoch": 0.20972444771336257, + "grad_norm": 5.113750457763672, + "learning_rate": 7.904724343257444e-05, + "loss": 1.0187, + "num_input_tokens_seen": 48160664, + "step": 2994 + }, + { + "epoch": 0.20979449595909183, + "grad_norm": 3.5571322441101074, + "learning_rate": 7.904024518388791e-05, + "loss": 0.9789, + "num_input_tokens_seen": 48177048, + "step": 2995 + }, + { + "epoch": 0.20986454420482106, + "grad_norm": 4.965229511260986, + "learning_rate": 7.90332469352014e-05, + "loss": 1.0934, + "num_input_tokens_seen": 48193400, + "step": 2996 + }, + { + "epoch": 0.20993459245055032, + "grad_norm": 4.466450214385986, + "learning_rate": 7.902624868651489e-05, + "loss": 1.2786, + "num_input_tokens_seen": 48209784, + "step": 2997 + }, + { + "epoch": 0.21000464069627955, + "grad_norm": 3.556642770767212, + "learning_rate": 7.901925043782838e-05, + "loss": 1.0579, + "num_input_tokens_seen": 48226096, + "step": 2998 + }, + { + "epoch": 0.2100746889420088, + "grad_norm": 5.175073146820068, + "learning_rate": 7.901225218914187e-05, + "loss": 1.0822, + "num_input_tokens_seen": 48242384, + "step": 2999 + }, + { + "epoch": 0.21014473718773805, + "grad_norm": 4.901797771453857, + "learning_rate": 7.900525394045535e-05, + "loss": 0.9413, + "num_input_tokens_seen": 48257944, + "step": 3000 + }, + { + "epoch": 0.21014473718773805, + "eval_loss": 1.137844204902649, + "eval_runtime": 0.2151, + "eval_samples_per_second": 4.649, + "eval_steps_per_second": 4.649, + "num_input_tokens_seen": 48257944, + "step": 3000 + }, + { + "epoch": 0.2102147854334673, + "grad_norm": 3.8474860191345215, + "learning_rate": 7.899825569176883e-05, + "loss": 0.9454, + "num_input_tokens_seen": 48273144, + "step": 3001 + }, + { + "epoch": 0.21028483367919654, + "grad_norm": 4.4164347648620605, + "learning_rate": 7.899125744308232e-05, + "loss": 1.2554, + "num_input_tokens_seen": 48288896, + "step": 3002 + }, + { + "epoch": 0.2103548819249258, + "grad_norm": 4.560143947601318, + "learning_rate": 7.898425919439579e-05, + "loss": 1.1129, + "num_input_tokens_seen": 48305168, + "step": 3003 + }, + { + "epoch": 0.21042493017065503, + "grad_norm": 4.310809135437012, + "learning_rate": 7.897726094570928e-05, + "loss": 1.1215, + "num_input_tokens_seen": 48320936, + "step": 3004 + }, + { + "epoch": 0.2104949784163843, + "grad_norm": 5.8606367111206055, + "learning_rate": 7.897026269702277e-05, + "loss": 0.7859, + "num_input_tokens_seen": 48334752, + "step": 3005 + }, + { + "epoch": 0.21056502666211352, + "grad_norm": 4.533644676208496, + "learning_rate": 7.896326444833626e-05, + "loss": 1.3134, + "num_input_tokens_seen": 48351136, + "step": 3006 + }, + { + "epoch": 0.21063507490784278, + "grad_norm": 3.955151081085205, + "learning_rate": 7.895626619964975e-05, + "loss": 1.3093, + "num_input_tokens_seen": 48367520, + "step": 3007 + }, + { + "epoch": 0.210705123153572, + "grad_norm": 4.857527732849121, + "learning_rate": 7.894926795096322e-05, + "loss": 0.9838, + "num_input_tokens_seen": 48383584, + "step": 3008 + }, + { + "epoch": 0.21077517139930127, + "grad_norm": 4.2091593742370605, + "learning_rate": 7.894226970227671e-05, + "loss": 0.9278, + "num_input_tokens_seen": 48399968, + "step": 3009 + }, + { + "epoch": 0.2108452196450305, + "grad_norm": 4.02255916595459, + "learning_rate": 7.89352714535902e-05, + "loss": 1.086, + "num_input_tokens_seen": 48416016, + "step": 3010 + }, + { + "epoch": 0.21091526789075976, + "grad_norm": 4.021467208862305, + "learning_rate": 7.892827320490369e-05, + "loss": 1.1088, + "num_input_tokens_seen": 48432400, + "step": 3011 + }, + { + "epoch": 0.21098531613648902, + "grad_norm": 4.211849212646484, + "learning_rate": 7.892127495621716e-05, + "loss": 1.1698, + "num_input_tokens_seen": 48448784, + "step": 3012 + }, + { + "epoch": 0.21105536438221825, + "grad_norm": 3.890512704849243, + "learning_rate": 7.891427670753065e-05, + "loss": 1.1048, + "num_input_tokens_seen": 48465168, + "step": 3013 + }, + { + "epoch": 0.2111254126279475, + "grad_norm": 3.9605376720428467, + "learning_rate": 7.890727845884414e-05, + "loss": 0.9904, + "num_input_tokens_seen": 48481024, + "step": 3014 + }, + { + "epoch": 0.21119546087367674, + "grad_norm": 3.6985483169555664, + "learning_rate": 7.890028021015761e-05, + "loss": 1.1033, + "num_input_tokens_seen": 48497408, + "step": 3015 + }, + { + "epoch": 0.211265509119406, + "grad_norm": 4.245354652404785, + "learning_rate": 7.88932819614711e-05, + "loss": 1.0609, + "num_input_tokens_seen": 48513640, + "step": 3016 + }, + { + "epoch": 0.21133555736513523, + "grad_norm": 4.163609027862549, + "learning_rate": 7.888628371278459e-05, + "loss": 1.2399, + "num_input_tokens_seen": 48529704, + "step": 3017 + }, + { + "epoch": 0.2114056056108645, + "grad_norm": 4.139742374420166, + "learning_rate": 7.887928546409808e-05, + "loss": 1.1029, + "num_input_tokens_seen": 48545808, + "step": 3018 + }, + { + "epoch": 0.21147565385659373, + "grad_norm": 4.119020938873291, + "learning_rate": 7.887228721541157e-05, + "loss": 1.233, + "num_input_tokens_seen": 48561584, + "step": 3019 + }, + { + "epoch": 0.21154570210232299, + "grad_norm": 3.467578172683716, + "learning_rate": 7.886528896672504e-05, + "loss": 0.9757, + "num_input_tokens_seen": 48577912, + "step": 3020 + }, + { + "epoch": 0.21161575034805222, + "grad_norm": 4.891791820526123, + "learning_rate": 7.885829071803853e-05, + "loss": 1.0507, + "num_input_tokens_seen": 48591792, + "step": 3021 + }, + { + "epoch": 0.21168579859378148, + "grad_norm": 3.8184545040130615, + "learning_rate": 7.885129246935201e-05, + "loss": 0.9845, + "num_input_tokens_seen": 48606656, + "step": 3022 + }, + { + "epoch": 0.2117558468395107, + "grad_norm": 3.909607410430908, + "learning_rate": 7.88442942206655e-05, + "loss": 1.2735, + "num_input_tokens_seen": 48622608, + "step": 3023 + }, + { + "epoch": 0.21182589508523997, + "grad_norm": 3.780740737915039, + "learning_rate": 7.883729597197899e-05, + "loss": 0.9796, + "num_input_tokens_seen": 48638992, + "step": 3024 + }, + { + "epoch": 0.2118959433309692, + "grad_norm": 3.95491099357605, + "learning_rate": 7.883029772329247e-05, + "loss": 1.0265, + "num_input_tokens_seen": 48654344, + "step": 3025 + }, + { + "epoch": 0.21196599157669846, + "grad_norm": 3.724346876144409, + "learning_rate": 7.882329947460596e-05, + "loss": 0.9352, + "num_input_tokens_seen": 48670728, + "step": 3026 + }, + { + "epoch": 0.2120360398224277, + "grad_norm": 4.314544200897217, + "learning_rate": 7.881630122591945e-05, + "loss": 1.145, + "num_input_tokens_seen": 48685424, + "step": 3027 + }, + { + "epoch": 0.21210608806815695, + "grad_norm": 3.9340150356292725, + "learning_rate": 7.880930297723293e-05, + "loss": 1.0337, + "num_input_tokens_seen": 48700416, + "step": 3028 + }, + { + "epoch": 0.21217613631388618, + "grad_norm": 4.978084087371826, + "learning_rate": 7.880230472854641e-05, + "loss": 1.2418, + "num_input_tokens_seen": 48716800, + "step": 3029 + }, + { + "epoch": 0.21224618455961544, + "grad_norm": 3.7038094997406006, + "learning_rate": 7.879530647985989e-05, + "loss": 1.0618, + "num_input_tokens_seen": 48732616, + "step": 3030 + }, + { + "epoch": 0.21231623280534467, + "grad_norm": 5.743021011352539, + "learning_rate": 7.878830823117339e-05, + "loss": 1.033, + "num_input_tokens_seen": 48748656, + "step": 3031 + }, + { + "epoch": 0.21238628105107393, + "grad_norm": 5.655540943145752, + "learning_rate": 7.878130998248687e-05, + "loss": 1.3541, + "num_input_tokens_seen": 48765040, + "step": 3032 + }, + { + "epoch": 0.21245632929680316, + "grad_norm": 4.291803359985352, + "learning_rate": 7.877431173380036e-05, + "loss": 1.1966, + "num_input_tokens_seen": 48781424, + "step": 3033 + }, + { + "epoch": 0.21252637754253242, + "grad_norm": 5.103096961975098, + "learning_rate": 7.876731348511384e-05, + "loss": 1.0543, + "num_input_tokens_seen": 48797808, + "step": 3034 + }, + { + "epoch": 0.21259642578826166, + "grad_norm": 5.048161029815674, + "learning_rate": 7.876031523642732e-05, + "loss": 0.9595, + "num_input_tokens_seen": 48814192, + "step": 3035 + }, + { + "epoch": 0.21266647403399092, + "grad_norm": 4.086791038513184, + "learning_rate": 7.875331698774081e-05, + "loss": 1.1128, + "num_input_tokens_seen": 48829816, + "step": 3036 + }, + { + "epoch": 0.21273652227972015, + "grad_norm": 3.8422605991363525, + "learning_rate": 7.87463187390543e-05, + "loss": 1.175, + "num_input_tokens_seen": 48846200, + "step": 3037 + }, + { + "epoch": 0.2128065705254494, + "grad_norm": 3.7120776176452637, + "learning_rate": 7.873932049036778e-05, + "loss": 1.0748, + "num_input_tokens_seen": 48862584, + "step": 3038 + }, + { + "epoch": 0.21287661877117864, + "grad_norm": 5.051353454589844, + "learning_rate": 7.873232224168126e-05, + "loss": 1.0278, + "num_input_tokens_seen": 48878368, + "step": 3039 + }, + { + "epoch": 0.2129466670169079, + "grad_norm": 3.9874653816223145, + "learning_rate": 7.872532399299475e-05, + "loss": 1.256, + "num_input_tokens_seen": 48894696, + "step": 3040 + }, + { + "epoch": 0.21301671526263713, + "grad_norm": 4.455258369445801, + "learning_rate": 7.871832574430824e-05, + "loss": 1.2226, + "num_input_tokens_seen": 48911080, + "step": 3041 + }, + { + "epoch": 0.2130867635083664, + "grad_norm": 5.521103382110596, + "learning_rate": 7.871132749562171e-05, + "loss": 1.2116, + "num_input_tokens_seen": 48927464, + "step": 3042 + }, + { + "epoch": 0.21315681175409562, + "grad_norm": 3.80818510055542, + "learning_rate": 7.87043292469352e-05, + "loss": 1.2213, + "num_input_tokens_seen": 48943848, + "step": 3043 + }, + { + "epoch": 0.21322685999982488, + "grad_norm": 4.319914817810059, + "learning_rate": 7.869733099824869e-05, + "loss": 0.9786, + "num_input_tokens_seen": 48960232, + "step": 3044 + }, + { + "epoch": 0.2132969082455541, + "grad_norm": 4.196371078491211, + "learning_rate": 7.869033274956218e-05, + "loss": 0.9782, + "num_input_tokens_seen": 48976616, + "step": 3045 + }, + { + "epoch": 0.21336695649128337, + "grad_norm": 3.988114595413208, + "learning_rate": 7.868333450087567e-05, + "loss": 1.0923, + "num_input_tokens_seen": 48992248, + "step": 3046 + }, + { + "epoch": 0.21343700473701263, + "grad_norm": 3.887589454650879, + "learning_rate": 7.867633625218914e-05, + "loss": 1.068, + "num_input_tokens_seen": 49008632, + "step": 3047 + }, + { + "epoch": 0.21350705298274186, + "grad_norm": 3.7942206859588623, + "learning_rate": 7.866933800350263e-05, + "loss": 1.1917, + "num_input_tokens_seen": 49024560, + "step": 3048 + }, + { + "epoch": 0.21357710122847112, + "grad_norm": 4.464767932891846, + "learning_rate": 7.86623397548161e-05, + "loss": 0.9137, + "num_input_tokens_seen": 49040200, + "step": 3049 + }, + { + "epoch": 0.21364714947420035, + "grad_norm": 4.411591529846191, + "learning_rate": 7.86553415061296e-05, + "loss": 1.2315, + "num_input_tokens_seen": 49056328, + "step": 3050 + }, + { + "epoch": 0.2137171977199296, + "grad_norm": 4.895592212677002, + "learning_rate": 7.86483432574431e-05, + "loss": 1.0756, + "num_input_tokens_seen": 49072696, + "step": 3051 + }, + { + "epoch": 0.21378724596565885, + "grad_norm": 4.46630859375, + "learning_rate": 7.864134500875657e-05, + "loss": 0.837, + "num_input_tokens_seen": 49087256, + "step": 3052 + }, + { + "epoch": 0.2138572942113881, + "grad_norm": 4.975766658782959, + "learning_rate": 7.863434676007006e-05, + "loss": 1.0508, + "num_input_tokens_seen": 49103640, + "step": 3053 + }, + { + "epoch": 0.21392734245711734, + "grad_norm": 4.441544532775879, + "learning_rate": 7.862734851138355e-05, + "loss": 0.9917, + "num_input_tokens_seen": 49119032, + "step": 3054 + }, + { + "epoch": 0.2139973907028466, + "grad_norm": 3.797757148742676, + "learning_rate": 7.862035026269702e-05, + "loss": 0.8701, + "num_input_tokens_seen": 49134960, + "step": 3055 + }, + { + "epoch": 0.21406743894857583, + "grad_norm": 4.021834373474121, + "learning_rate": 7.861335201401051e-05, + "loss": 1.0355, + "num_input_tokens_seen": 49151344, + "step": 3056 + }, + { + "epoch": 0.2141374871943051, + "grad_norm": 3.772587537765503, + "learning_rate": 7.8606353765324e-05, + "loss": 0.9717, + "num_input_tokens_seen": 49167424, + "step": 3057 + }, + { + "epoch": 0.21420753544003432, + "grad_norm": 5.356356143951416, + "learning_rate": 7.859935551663749e-05, + "loss": 1.027, + "num_input_tokens_seen": 49183504, + "step": 3058 + }, + { + "epoch": 0.21427758368576358, + "grad_norm": 4.314568042755127, + "learning_rate": 7.859235726795096e-05, + "loss": 1.0233, + "num_input_tokens_seen": 49199320, + "step": 3059 + }, + { + "epoch": 0.2143476319314928, + "grad_norm": 3.777794122695923, + "learning_rate": 7.858535901926445e-05, + "loss": 1.1218, + "num_input_tokens_seen": 49215032, + "step": 3060 + }, + { + "epoch": 0.21441768017722207, + "grad_norm": 3.788496732711792, + "learning_rate": 7.857836077057794e-05, + "loss": 0.9121, + "num_input_tokens_seen": 49230248, + "step": 3061 + }, + { + "epoch": 0.2144877284229513, + "grad_norm": 3.776698589324951, + "learning_rate": 7.857136252189142e-05, + "loss": 1.0687, + "num_input_tokens_seen": 49246264, + "step": 3062 + }, + { + "epoch": 0.21455777666868056, + "grad_norm": 3.8229172229766846, + "learning_rate": 7.85643642732049e-05, + "loss": 0.9773, + "num_input_tokens_seen": 49262648, + "step": 3063 + }, + { + "epoch": 0.2146278249144098, + "grad_norm": 3.7620902061462402, + "learning_rate": 7.85573660245184e-05, + "loss": 1.0162, + "num_input_tokens_seen": 49278640, + "step": 3064 + }, + { + "epoch": 0.21469787316013905, + "grad_norm": 3.953148126602173, + "learning_rate": 7.855036777583188e-05, + "loss": 1.1277, + "num_input_tokens_seen": 49295024, + "step": 3065 + }, + { + "epoch": 0.21476792140586828, + "grad_norm": 4.1923441886901855, + "learning_rate": 7.854336952714536e-05, + "loss": 0.9317, + "num_input_tokens_seen": 49311408, + "step": 3066 + }, + { + "epoch": 0.21483796965159754, + "grad_norm": 4.922461986541748, + "learning_rate": 7.853637127845885e-05, + "loss": 1.2234, + "num_input_tokens_seen": 49327120, + "step": 3067 + }, + { + "epoch": 0.21490801789732678, + "grad_norm": 3.7414777278900146, + "learning_rate": 7.852937302977233e-05, + "loss": 0.8628, + "num_input_tokens_seen": 49343504, + "step": 3068 + }, + { + "epoch": 0.21497806614305603, + "grad_norm": 6.1490912437438965, + "learning_rate": 7.852237478108581e-05, + "loss": 0.9836, + "num_input_tokens_seen": 49359336, + "step": 3069 + }, + { + "epoch": 0.21504811438878527, + "grad_norm": 4.232786178588867, + "learning_rate": 7.85153765323993e-05, + "loss": 1.1071, + "num_input_tokens_seen": 49374888, + "step": 3070 + }, + { + "epoch": 0.21511816263451453, + "grad_norm": 4.170281887054443, + "learning_rate": 7.85083782837128e-05, + "loss": 1.1863, + "num_input_tokens_seen": 49391272, + "step": 3071 + }, + { + "epoch": 0.21518821088024376, + "grad_norm": 4.096348285675049, + "learning_rate": 7.850138003502627e-05, + "loss": 1.1574, + "num_input_tokens_seen": 49407656, + "step": 3072 + }, + { + "epoch": 0.21525825912597302, + "grad_norm": 4.523014068603516, + "learning_rate": 7.849438178633976e-05, + "loss": 0.9481, + "num_input_tokens_seen": 49424040, + "step": 3073 + }, + { + "epoch": 0.21532830737170225, + "grad_norm": 5.029306888580322, + "learning_rate": 7.848738353765324e-05, + "loss": 1.2744, + "num_input_tokens_seen": 49440208, + "step": 3074 + }, + { + "epoch": 0.2153983556174315, + "grad_norm": 3.5349771976470947, + "learning_rate": 7.848038528896673e-05, + "loss": 0.8675, + "num_input_tokens_seen": 49456520, + "step": 3075 + }, + { + "epoch": 0.21546840386316074, + "grad_norm": 3.544787645339966, + "learning_rate": 7.84733870402802e-05, + "loss": 1.0082, + "num_input_tokens_seen": 49472904, + "step": 3076 + }, + { + "epoch": 0.21553845210889, + "grad_norm": 4.602756500244141, + "learning_rate": 7.84663887915937e-05, + "loss": 1.0747, + "num_input_tokens_seen": 49489264, + "step": 3077 + }, + { + "epoch": 0.21560850035461923, + "grad_norm": 6.479659080505371, + "learning_rate": 7.845939054290719e-05, + "loss": 1.0437, + "num_input_tokens_seen": 49505232, + "step": 3078 + }, + { + "epoch": 0.2156785486003485, + "grad_norm": 4.584348201751709, + "learning_rate": 7.845239229422067e-05, + "loss": 1.1054, + "num_input_tokens_seen": 49521616, + "step": 3079 + }, + { + "epoch": 0.21574859684607772, + "grad_norm": 4.339470386505127, + "learning_rate": 7.844539404553416e-05, + "loss": 1.2386, + "num_input_tokens_seen": 49537376, + "step": 3080 + }, + { + "epoch": 0.21581864509180698, + "grad_norm": 4.098686218261719, + "learning_rate": 7.843839579684765e-05, + "loss": 0.9376, + "num_input_tokens_seen": 49552256, + "step": 3081 + }, + { + "epoch": 0.21588869333753624, + "grad_norm": 4.619485855102539, + "learning_rate": 7.843139754816112e-05, + "loss": 1.0066, + "num_input_tokens_seen": 49568640, + "step": 3082 + }, + { + "epoch": 0.21595874158326547, + "grad_norm": 4.018712997436523, + "learning_rate": 7.842439929947461e-05, + "loss": 1.0062, + "num_input_tokens_seen": 49584816, + "step": 3083 + }, + { + "epoch": 0.21602878982899473, + "grad_norm": 5.898901462554932, + "learning_rate": 7.84174010507881e-05, + "loss": 1.1956, + "num_input_tokens_seen": 49600872, + "step": 3084 + }, + { + "epoch": 0.21609883807472396, + "grad_norm": 4.794529438018799, + "learning_rate": 7.841040280210159e-05, + "loss": 1.0035, + "num_input_tokens_seen": 49616840, + "step": 3085 + }, + { + "epoch": 0.21616888632045322, + "grad_norm": 4.934964656829834, + "learning_rate": 7.840340455341506e-05, + "loss": 1.0033, + "num_input_tokens_seen": 49633224, + "step": 3086 + }, + { + "epoch": 0.21623893456618246, + "grad_norm": 3.6171560287475586, + "learning_rate": 7.839640630472855e-05, + "loss": 1.1165, + "num_input_tokens_seen": 49649056, + "step": 3087 + }, + { + "epoch": 0.21630898281191172, + "grad_norm": 4.032123565673828, + "learning_rate": 7.838940805604204e-05, + "loss": 1.2411, + "num_input_tokens_seen": 49665440, + "step": 3088 + }, + { + "epoch": 0.21637903105764095, + "grad_norm": 3.4669382572174072, + "learning_rate": 7.838240980735551e-05, + "loss": 0.9666, + "num_input_tokens_seen": 49681824, + "step": 3089 + }, + { + "epoch": 0.2164490793033702, + "grad_norm": 3.6899688243865967, + "learning_rate": 7.8375411558669e-05, + "loss": 0.9657, + "num_input_tokens_seen": 49698208, + "step": 3090 + }, + { + "epoch": 0.21651912754909944, + "grad_norm": 4.231171131134033, + "learning_rate": 7.83684133099825e-05, + "loss": 1.1459, + "num_input_tokens_seen": 49713664, + "step": 3091 + }, + { + "epoch": 0.2165891757948287, + "grad_norm": 4.792253017425537, + "learning_rate": 7.836141506129598e-05, + "loss": 0.9982, + "num_input_tokens_seen": 49730048, + "step": 3092 + }, + { + "epoch": 0.21665922404055793, + "grad_norm": 5.7171478271484375, + "learning_rate": 7.835441681260945e-05, + "loss": 1.189, + "num_input_tokens_seen": 49746432, + "step": 3093 + }, + { + "epoch": 0.2167292722862872, + "grad_norm": 4.393872261047363, + "learning_rate": 7.834741856392294e-05, + "loss": 0.9969, + "num_input_tokens_seen": 49762816, + "step": 3094 + }, + { + "epoch": 0.21679932053201642, + "grad_norm": 6.388276100158691, + "learning_rate": 7.834042031523643e-05, + "loss": 1.2192, + "num_input_tokens_seen": 49778680, + "step": 3095 + }, + { + "epoch": 0.21686936877774568, + "grad_norm": 3.8204843997955322, + "learning_rate": 7.83334220665499e-05, + "loss": 1.0601, + "num_input_tokens_seen": 49794344, + "step": 3096 + }, + { + "epoch": 0.2169394170234749, + "grad_norm": 10.573785781860352, + "learning_rate": 7.832642381786341e-05, + "loss": 0.9257, + "num_input_tokens_seen": 49810208, + "step": 3097 + }, + { + "epoch": 0.21700946526920417, + "grad_norm": 3.437734603881836, + "learning_rate": 7.83194255691769e-05, + "loss": 0.8757, + "num_input_tokens_seen": 49826448, + "step": 3098 + }, + { + "epoch": 0.2170795135149334, + "grad_norm": 3.476918935775757, + "learning_rate": 7.831242732049037e-05, + "loss": 0.908, + "num_input_tokens_seen": 49842832, + "step": 3099 + }, + { + "epoch": 0.21714956176066266, + "grad_norm": 4.037630558013916, + "learning_rate": 7.830542907180386e-05, + "loss": 1.1305, + "num_input_tokens_seen": 49859216, + "step": 3100 + }, + { + "epoch": 0.2172196100063919, + "grad_norm": 3.7424814701080322, + "learning_rate": 7.829843082311734e-05, + "loss": 1.1701, + "num_input_tokens_seen": 49875528, + "step": 3101 + }, + { + "epoch": 0.21728965825212115, + "grad_norm": 4.222198486328125, + "learning_rate": 7.829143257443082e-05, + "loss": 1.0539, + "num_input_tokens_seen": 49891912, + "step": 3102 + }, + { + "epoch": 0.21735970649785039, + "grad_norm": 4.064510822296143, + "learning_rate": 7.828443432574431e-05, + "loss": 1.0524, + "num_input_tokens_seen": 49908064, + "step": 3103 + }, + { + "epoch": 0.21742975474357965, + "grad_norm": 3.822498083114624, + "learning_rate": 7.82774360770578e-05, + "loss": 0.9085, + "num_input_tokens_seen": 49923776, + "step": 3104 + }, + { + "epoch": 0.21749980298930888, + "grad_norm": 4.368459224700928, + "learning_rate": 7.827043782837129e-05, + "loss": 0.9599, + "num_input_tokens_seen": 49940104, + "step": 3105 + }, + { + "epoch": 0.21756985123503814, + "grad_norm": 3.722587823867798, + "learning_rate": 7.826343957968477e-05, + "loss": 1.0286, + "num_input_tokens_seen": 49955624, + "step": 3106 + }, + { + "epoch": 0.21763989948076737, + "grad_norm": 4.277473449707031, + "learning_rate": 7.825644133099825e-05, + "loss": 1.1797, + "num_input_tokens_seen": 49971784, + "step": 3107 + }, + { + "epoch": 0.21770994772649663, + "grad_norm": 4.586781024932861, + "learning_rate": 7.824944308231174e-05, + "loss": 1.0395, + "num_input_tokens_seen": 49988168, + "step": 3108 + }, + { + "epoch": 0.21777999597222586, + "grad_norm": 4.456960201263428, + "learning_rate": 7.824244483362522e-05, + "loss": 0.9449, + "num_input_tokens_seen": 50003392, + "step": 3109 + }, + { + "epoch": 0.21785004421795512, + "grad_norm": 4.115220069885254, + "learning_rate": 7.82354465849387e-05, + "loss": 1.1393, + "num_input_tokens_seen": 50019776, + "step": 3110 + }, + { + "epoch": 0.21792009246368435, + "grad_norm": 3.5760059356689453, + "learning_rate": 7.82284483362522e-05, + "loss": 1.0421, + "num_input_tokens_seen": 50036160, + "step": 3111 + }, + { + "epoch": 0.2179901407094136, + "grad_norm": 5.757627487182617, + "learning_rate": 7.822145008756568e-05, + "loss": 1.1382, + "num_input_tokens_seen": 50052544, + "step": 3112 + }, + { + "epoch": 0.21806018895514284, + "grad_norm": 3.4349796772003174, + "learning_rate": 7.821445183887916e-05, + "loss": 0.8474, + "num_input_tokens_seen": 50068872, + "step": 3113 + }, + { + "epoch": 0.2181302372008721, + "grad_norm": 5.546512603759766, + "learning_rate": 7.820745359019265e-05, + "loss": 1.4564, + "num_input_tokens_seen": 50085256, + "step": 3114 + }, + { + "epoch": 0.21820028544660133, + "grad_norm": 3.4954123497009277, + "learning_rate": 7.820045534150614e-05, + "loss": 1.0479, + "num_input_tokens_seen": 50101640, + "step": 3115 + }, + { + "epoch": 0.2182703336923306, + "grad_norm": 5.396134376525879, + "learning_rate": 7.819345709281961e-05, + "loss": 1.0834, + "num_input_tokens_seen": 50117040, + "step": 3116 + }, + { + "epoch": 0.21834038193805985, + "grad_norm": 3.7895803451538086, + "learning_rate": 7.818645884413311e-05, + "loss": 0.9567, + "num_input_tokens_seen": 50133424, + "step": 3117 + }, + { + "epoch": 0.21841043018378908, + "grad_norm": 5.321155548095703, + "learning_rate": 7.81794605954466e-05, + "loss": 1.2246, + "num_input_tokens_seen": 50148520, + "step": 3118 + }, + { + "epoch": 0.21848047842951834, + "grad_norm": 4.993834495544434, + "learning_rate": 7.817246234676008e-05, + "loss": 1.0944, + "num_input_tokens_seen": 50164904, + "step": 3119 + }, + { + "epoch": 0.21855052667524758, + "grad_norm": 3.69236159324646, + "learning_rate": 7.816546409807355e-05, + "loss": 0.9759, + "num_input_tokens_seen": 50181288, + "step": 3120 + }, + { + "epoch": 0.21862057492097683, + "grad_norm": 3.689748764038086, + "learning_rate": 7.815846584938704e-05, + "loss": 1.0594, + "num_input_tokens_seen": 50197672, + "step": 3121 + }, + { + "epoch": 0.21869062316670607, + "grad_norm": 5.904501914978027, + "learning_rate": 7.815146760070053e-05, + "loss": 1.2348, + "num_input_tokens_seen": 50214016, + "step": 3122 + }, + { + "epoch": 0.21876067141243533, + "grad_norm": 4.018721103668213, + "learning_rate": 7.814446935201402e-05, + "loss": 1.2688, + "num_input_tokens_seen": 50229984, + "step": 3123 + }, + { + "epoch": 0.21883071965816456, + "grad_norm": 3.6527509689331055, + "learning_rate": 7.81374711033275e-05, + "loss": 0.8508, + "num_input_tokens_seen": 50246368, + "step": 3124 + }, + { + "epoch": 0.21890076790389382, + "grad_norm": 3.6600260734558105, + "learning_rate": 7.8130472854641e-05, + "loss": 1.0952, + "num_input_tokens_seen": 50262208, + "step": 3125 + }, + { + "epoch": 0.21897081614962305, + "grad_norm": 3.415969133377075, + "learning_rate": 7.812347460595447e-05, + "loss": 1.1055, + "num_input_tokens_seen": 50278592, + "step": 3126 + }, + { + "epoch": 0.2190408643953523, + "grad_norm": 5.027013301849365, + "learning_rate": 7.811647635726796e-05, + "loss": 1.2189, + "num_input_tokens_seen": 50294976, + "step": 3127 + }, + { + "epoch": 0.21911091264108154, + "grad_norm": 3.806324005126953, + "learning_rate": 7.810947810858143e-05, + "loss": 1.094, + "num_input_tokens_seen": 50311360, + "step": 3128 + }, + { + "epoch": 0.2191809608868108, + "grad_norm": 5.208338260650635, + "learning_rate": 7.810247985989492e-05, + "loss": 1.0917, + "num_input_tokens_seen": 50327744, + "step": 3129 + }, + { + "epoch": 0.21925100913254003, + "grad_norm": 3.5902316570281982, + "learning_rate": 7.809548161120841e-05, + "loss": 1.0894, + "num_input_tokens_seen": 50343616, + "step": 3130 + }, + { + "epoch": 0.2193210573782693, + "grad_norm": 3.7159717082977295, + "learning_rate": 7.80884833625219e-05, + "loss": 1.1168, + "num_input_tokens_seen": 50360000, + "step": 3131 + }, + { + "epoch": 0.21939110562399852, + "grad_norm": 4.03640079498291, + "learning_rate": 7.808148511383539e-05, + "loss": 0.8906, + "num_input_tokens_seen": 50376384, + "step": 3132 + }, + { + "epoch": 0.21946115386972778, + "grad_norm": 3.763805627822876, + "learning_rate": 7.807448686514886e-05, + "loss": 1.0922, + "num_input_tokens_seen": 50392328, + "step": 3133 + }, + { + "epoch": 0.219531202115457, + "grad_norm": 4.242026329040527, + "learning_rate": 7.806748861646235e-05, + "loss": 1.1286, + "num_input_tokens_seen": 50408712, + "step": 3134 + }, + { + "epoch": 0.21960125036118627, + "grad_norm": 3.5783863067626953, + "learning_rate": 7.806049036777584e-05, + "loss": 0.946, + "num_input_tokens_seen": 50424816, + "step": 3135 + }, + { + "epoch": 0.2196712986069155, + "grad_norm": 3.8409011363983154, + "learning_rate": 7.805349211908931e-05, + "loss": 1.0901, + "num_input_tokens_seen": 50440464, + "step": 3136 + }, + { + "epoch": 0.21974134685264476, + "grad_norm": 3.642411231994629, + "learning_rate": 7.804649387040282e-05, + "loss": 1.097, + "num_input_tokens_seen": 50456552, + "step": 3137 + }, + { + "epoch": 0.219811395098374, + "grad_norm": 3.702481985092163, + "learning_rate": 7.803949562171629e-05, + "loss": 1.0843, + "num_input_tokens_seen": 50472936, + "step": 3138 + }, + { + "epoch": 0.21988144334410326, + "grad_norm": 3.776094913482666, + "learning_rate": 7.803249737302978e-05, + "loss": 1.0058, + "num_input_tokens_seen": 50488760, + "step": 3139 + }, + { + "epoch": 0.2199514915898325, + "grad_norm": 4.29668664932251, + "learning_rate": 7.802549912434326e-05, + "loss": 1.3095, + "num_input_tokens_seen": 50505144, + "step": 3140 + }, + { + "epoch": 0.22002153983556175, + "grad_norm": 3.8290088176727295, + "learning_rate": 7.801850087565674e-05, + "loss": 0.9331, + "num_input_tokens_seen": 50521520, + "step": 3141 + }, + { + "epoch": 0.22009158808129098, + "grad_norm": 3.9471163749694824, + "learning_rate": 7.801150262697023e-05, + "loss": 1.1064, + "num_input_tokens_seen": 50537688, + "step": 3142 + }, + { + "epoch": 0.22016163632702024, + "grad_norm": 4.3500657081604, + "learning_rate": 7.800450437828372e-05, + "loss": 1.0843, + "num_input_tokens_seen": 50554072, + "step": 3143 + }, + { + "epoch": 0.22023168457274947, + "grad_norm": 4.257317066192627, + "learning_rate": 7.799750612959721e-05, + "loss": 1.2822, + "num_input_tokens_seen": 50570456, + "step": 3144 + }, + { + "epoch": 0.22030173281847873, + "grad_norm": 3.881340265274048, + "learning_rate": 7.79905078809107e-05, + "loss": 1.2797, + "num_input_tokens_seen": 50586840, + "step": 3145 + }, + { + "epoch": 0.22037178106420796, + "grad_norm": 4.07082986831665, + "learning_rate": 7.798350963222417e-05, + "loss": 1.1659, + "num_input_tokens_seen": 50603224, + "step": 3146 + }, + { + "epoch": 0.22044182930993722, + "grad_norm": 3.740081310272217, + "learning_rate": 7.797651138353765e-05, + "loss": 1.1638, + "num_input_tokens_seen": 50619608, + "step": 3147 + }, + { + "epoch": 0.22051187755566645, + "grad_norm": 3.9368820190429688, + "learning_rate": 7.796951313485114e-05, + "loss": 1.2122, + "num_input_tokens_seen": 50635240, + "step": 3148 + }, + { + "epoch": 0.2205819258013957, + "grad_norm": 4.027481555938721, + "learning_rate": 7.796251488616463e-05, + "loss": 1.1479, + "num_input_tokens_seen": 50651144, + "step": 3149 + }, + { + "epoch": 0.22065197404712494, + "grad_norm": 3.53271222114563, + "learning_rate": 7.795551663747811e-05, + "loss": 0.8352, + "num_input_tokens_seen": 50667320, + "step": 3150 + }, + { + "epoch": 0.2207220222928542, + "grad_norm": 3.9494400024414062, + "learning_rate": 7.79485183887916e-05, + "loss": 1.1353, + "num_input_tokens_seen": 50683704, + "step": 3151 + }, + { + "epoch": 0.22079207053858346, + "grad_norm": 4.944929122924805, + "learning_rate": 7.794152014010509e-05, + "loss": 1.0833, + "num_input_tokens_seen": 50699544, + "step": 3152 + }, + { + "epoch": 0.2208621187843127, + "grad_norm": 4.625114440917969, + "learning_rate": 7.793452189141857e-05, + "loss": 1.0123, + "num_input_tokens_seen": 50715096, + "step": 3153 + }, + { + "epoch": 0.22093216703004195, + "grad_norm": 4.543829441070557, + "learning_rate": 7.792752364273205e-05, + "loss": 0.9669, + "num_input_tokens_seen": 50731480, + "step": 3154 + }, + { + "epoch": 0.22100221527577119, + "grad_norm": 4.038646221160889, + "learning_rate": 7.792052539404553e-05, + "loss": 1.1399, + "num_input_tokens_seen": 50747864, + "step": 3155 + }, + { + "epoch": 0.22107226352150045, + "grad_norm": 5.269920825958252, + "learning_rate": 7.791352714535902e-05, + "loss": 1.1412, + "num_input_tokens_seen": 50764248, + "step": 3156 + }, + { + "epoch": 0.22114231176722968, + "grad_norm": 3.661792278289795, + "learning_rate": 7.790652889667251e-05, + "loss": 0.8881, + "num_input_tokens_seen": 50780296, + "step": 3157 + }, + { + "epoch": 0.22121236001295894, + "grad_norm": 5.119567394256592, + "learning_rate": 7.7899530647986e-05, + "loss": 1.2316, + "num_input_tokens_seen": 50796680, + "step": 3158 + }, + { + "epoch": 0.22128240825868817, + "grad_norm": 4.011631965637207, + "learning_rate": 7.789253239929948e-05, + "loss": 1.0854, + "num_input_tokens_seen": 50812648, + "step": 3159 + }, + { + "epoch": 0.22135245650441743, + "grad_norm": 4.292233467102051, + "learning_rate": 7.788553415061296e-05, + "loss": 0.8441, + "num_input_tokens_seen": 50829032, + "step": 3160 + }, + { + "epoch": 0.22142250475014666, + "grad_norm": 3.9228122234344482, + "learning_rate": 7.787853590192645e-05, + "loss": 1.0963, + "num_input_tokens_seen": 50844776, + "step": 3161 + }, + { + "epoch": 0.22149255299587592, + "grad_norm": 4.396078109741211, + "learning_rate": 7.787153765323994e-05, + "loss": 1.2647, + "num_input_tokens_seen": 50860792, + "step": 3162 + }, + { + "epoch": 0.22156260124160515, + "grad_norm": 3.6809213161468506, + "learning_rate": 7.786453940455342e-05, + "loss": 1.0172, + "num_input_tokens_seen": 50877176, + "step": 3163 + }, + { + "epoch": 0.2216326494873344, + "grad_norm": 3.7879207134246826, + "learning_rate": 7.785754115586691e-05, + "loss": 0.9708, + "num_input_tokens_seen": 50893560, + "step": 3164 + }, + { + "epoch": 0.22170269773306364, + "grad_norm": 5.248175621032715, + "learning_rate": 7.785054290718039e-05, + "loss": 0.9575, + "num_input_tokens_seen": 50909944, + "step": 3165 + }, + { + "epoch": 0.2217727459787929, + "grad_norm": 5.437406539916992, + "learning_rate": 7.784354465849388e-05, + "loss": 1.1873, + "num_input_tokens_seen": 50925848, + "step": 3166 + }, + { + "epoch": 0.22184279422452213, + "grad_norm": 4.390413761138916, + "learning_rate": 7.783654640980735e-05, + "loss": 1.143, + "num_input_tokens_seen": 50941488, + "step": 3167 + }, + { + "epoch": 0.2219128424702514, + "grad_norm": 3.6923863887786865, + "learning_rate": 7.782954816112084e-05, + "loss": 1.012, + "num_input_tokens_seen": 50956984, + "step": 3168 + }, + { + "epoch": 0.22198289071598062, + "grad_norm": 4.338325023651123, + "learning_rate": 7.782254991243433e-05, + "loss": 1.0984, + "num_input_tokens_seen": 50973096, + "step": 3169 + }, + { + "epoch": 0.22205293896170988, + "grad_norm": 5.631222248077393, + "learning_rate": 7.781555166374782e-05, + "loss": 1.0325, + "num_input_tokens_seen": 50989480, + "step": 3170 + }, + { + "epoch": 0.22212298720743912, + "grad_norm": 3.852337598800659, + "learning_rate": 7.78085534150613e-05, + "loss": 1.0279, + "num_input_tokens_seen": 51005864, + "step": 3171 + }, + { + "epoch": 0.22219303545316837, + "grad_norm": 3.6684298515319824, + "learning_rate": 7.78015551663748e-05, + "loss": 0.9906, + "num_input_tokens_seen": 51022248, + "step": 3172 + }, + { + "epoch": 0.2222630836988976, + "grad_norm": 3.7521257400512695, + "learning_rate": 7.779455691768827e-05, + "loss": 0.9641, + "num_input_tokens_seen": 51038632, + "step": 3173 + }, + { + "epoch": 0.22233313194462687, + "grad_norm": 4.571293354034424, + "learning_rate": 7.778755866900175e-05, + "loss": 1.1655, + "num_input_tokens_seen": 51055016, + "step": 3174 + }, + { + "epoch": 0.2224031801903561, + "grad_norm": 3.921743154525757, + "learning_rate": 7.778056042031523e-05, + "loss": 1.0815, + "num_input_tokens_seen": 51071288, + "step": 3175 + }, + { + "epoch": 0.22247322843608536, + "grad_norm": 6.1666083335876465, + "learning_rate": 7.777356217162872e-05, + "loss": 1.1025, + "num_input_tokens_seen": 51086712, + "step": 3176 + }, + { + "epoch": 0.2225432766818146, + "grad_norm": 4.170863151550293, + "learning_rate": 7.776656392294221e-05, + "loss": 1.0547, + "num_input_tokens_seen": 51102904, + "step": 3177 + }, + { + "epoch": 0.22261332492754385, + "grad_norm": 4.218405246734619, + "learning_rate": 7.77595656742557e-05, + "loss": 1.0685, + "num_input_tokens_seen": 51119288, + "step": 3178 + }, + { + "epoch": 0.22268337317327308, + "grad_norm": 4.158823490142822, + "learning_rate": 7.775256742556919e-05, + "loss": 1.0053, + "num_input_tokens_seen": 51135672, + "step": 3179 + }, + { + "epoch": 0.22275342141900234, + "grad_norm": 3.900827407836914, + "learning_rate": 7.774556917688266e-05, + "loss": 0.9212, + "num_input_tokens_seen": 51151880, + "step": 3180 + }, + { + "epoch": 0.22282346966473157, + "grad_norm": 3.6363813877105713, + "learning_rate": 7.773857092819615e-05, + "loss": 1.0602, + "num_input_tokens_seen": 51167712, + "step": 3181 + }, + { + "epoch": 0.22289351791046083, + "grad_norm": 6.452186584472656, + "learning_rate": 7.773157267950963e-05, + "loss": 1.3543, + "num_input_tokens_seen": 51184096, + "step": 3182 + }, + { + "epoch": 0.22296356615619006, + "grad_norm": 4.324470043182373, + "learning_rate": 7.772457443082313e-05, + "loss": 1.3328, + "num_input_tokens_seen": 51200480, + "step": 3183 + }, + { + "epoch": 0.22303361440191932, + "grad_norm": 4.093019485473633, + "learning_rate": 7.77175761821366e-05, + "loss": 1.2647, + "num_input_tokens_seen": 51216864, + "step": 3184 + }, + { + "epoch": 0.22310366264764858, + "grad_norm": 3.923771619796753, + "learning_rate": 7.771057793345009e-05, + "loss": 1.0121, + "num_input_tokens_seen": 51233248, + "step": 3185 + }, + { + "epoch": 0.2231737108933778, + "grad_norm": 3.3340275287628174, + "learning_rate": 7.770357968476358e-05, + "loss": 0.8954, + "num_input_tokens_seen": 51249400, + "step": 3186 + }, + { + "epoch": 0.22324375913910707, + "grad_norm": 5.360925197601318, + "learning_rate": 7.769658143607706e-05, + "loss": 1.0391, + "num_input_tokens_seen": 51264920, + "step": 3187 + }, + { + "epoch": 0.2233138073848363, + "grad_norm": 4.377450466156006, + "learning_rate": 7.768958318739054e-05, + "loss": 1.2148, + "num_input_tokens_seen": 51280528, + "step": 3188 + }, + { + "epoch": 0.22338385563056556, + "grad_norm": 4.01370906829834, + "learning_rate": 7.768258493870403e-05, + "loss": 1.0084, + "num_input_tokens_seen": 51296912, + "step": 3189 + }, + { + "epoch": 0.2234539038762948, + "grad_norm": 5.112427711486816, + "learning_rate": 7.767558669001752e-05, + "loss": 1.0388, + "num_input_tokens_seen": 51313296, + "step": 3190 + }, + { + "epoch": 0.22352395212202406, + "grad_norm": 3.5889225006103516, + "learning_rate": 7.766858844133101e-05, + "loss": 1.0018, + "num_input_tokens_seen": 51329680, + "step": 3191 + }, + { + "epoch": 0.2235940003677533, + "grad_norm": 3.6924920082092285, + "learning_rate": 7.766159019264449e-05, + "loss": 1.1056, + "num_input_tokens_seen": 51346064, + "step": 3192 + }, + { + "epoch": 0.22366404861348255, + "grad_norm": 3.9349400997161865, + "learning_rate": 7.765459194395797e-05, + "loss": 0.9785, + "num_input_tokens_seen": 51361200, + "step": 3193 + }, + { + "epoch": 0.22373409685921178, + "grad_norm": 3.6980738639831543, + "learning_rate": 7.764759369527145e-05, + "loss": 0.9112, + "num_input_tokens_seen": 51377584, + "step": 3194 + }, + { + "epoch": 0.22380414510494104, + "grad_norm": 4.400575637817383, + "learning_rate": 7.764059544658494e-05, + "loss": 1.2927, + "num_input_tokens_seen": 51393968, + "step": 3195 + }, + { + "epoch": 0.22387419335067027, + "grad_norm": 3.758664846420288, + "learning_rate": 7.763359719789843e-05, + "loss": 0.8743, + "num_input_tokens_seen": 51410160, + "step": 3196 + }, + { + "epoch": 0.22394424159639953, + "grad_norm": 4.376255512237549, + "learning_rate": 7.762659894921192e-05, + "loss": 1.1239, + "num_input_tokens_seen": 51426192, + "step": 3197 + }, + { + "epoch": 0.22401428984212876, + "grad_norm": 4.371212959289551, + "learning_rate": 7.76196007005254e-05, + "loss": 1.4918, + "num_input_tokens_seen": 51442576, + "step": 3198 + }, + { + "epoch": 0.22408433808785802, + "grad_norm": 3.5152950286865234, + "learning_rate": 7.761260245183889e-05, + "loss": 1.0344, + "num_input_tokens_seen": 51458648, + "step": 3199 + }, + { + "epoch": 0.22415438633358725, + "grad_norm": 4.100535869598389, + "learning_rate": 7.760560420315237e-05, + "loss": 0.9969, + "num_input_tokens_seen": 51475032, + "step": 3200 + }, + { + "epoch": 0.22415438633358725, + "eval_loss": 1.1358542442321777, + "eval_runtime": 0.2073, + "eval_samples_per_second": 4.825, + "eval_steps_per_second": 4.825, + "num_input_tokens_seen": 51475032, + "step": 3200 + }, + { + "epoch": 0.2242244345793165, + "grad_norm": 4.394073486328125, + "learning_rate": 7.759860595446584e-05, + "loss": 1.0951, + "num_input_tokens_seen": 51490544, + "step": 3201 + }, + { + "epoch": 0.22429448282504574, + "grad_norm": 4.041582107543945, + "learning_rate": 7.759160770577933e-05, + "loss": 1.1615, + "num_input_tokens_seen": 51506928, + "step": 3202 + }, + { + "epoch": 0.224364531070775, + "grad_norm": 4.268798351287842, + "learning_rate": 7.758460945709282e-05, + "loss": 1.0975, + "num_input_tokens_seen": 51523232, + "step": 3203 + }, + { + "epoch": 0.22443457931650423, + "grad_norm": 4.080141067504883, + "learning_rate": 7.757761120840631e-05, + "loss": 0.9809, + "num_input_tokens_seen": 51539616, + "step": 3204 + }, + { + "epoch": 0.2245046275622335, + "grad_norm": 7.690321445465088, + "learning_rate": 7.75706129597198e-05, + "loss": 1.1217, + "num_input_tokens_seen": 51556000, + "step": 3205 + }, + { + "epoch": 0.22457467580796273, + "grad_norm": 4.161118507385254, + "learning_rate": 7.756361471103329e-05, + "loss": 0.9672, + "num_input_tokens_seen": 51572384, + "step": 3206 + }, + { + "epoch": 0.22464472405369199, + "grad_norm": 3.922683000564575, + "learning_rate": 7.755661646234676e-05, + "loss": 1.0665, + "num_input_tokens_seen": 51588768, + "step": 3207 + }, + { + "epoch": 0.22471477229942122, + "grad_norm": 3.7474617958068848, + "learning_rate": 7.754961821366025e-05, + "loss": 1.1283, + "num_input_tokens_seen": 51604792, + "step": 3208 + }, + { + "epoch": 0.22478482054515048, + "grad_norm": 3.856959819793701, + "learning_rate": 7.754261996497374e-05, + "loss": 0.963, + "num_input_tokens_seen": 51621176, + "step": 3209 + }, + { + "epoch": 0.2248548687908797, + "grad_norm": 4.130929470062256, + "learning_rate": 7.753562171628723e-05, + "loss": 1.0563, + "num_input_tokens_seen": 51636864, + "step": 3210 + }, + { + "epoch": 0.22492491703660897, + "grad_norm": 3.5023388862609863, + "learning_rate": 7.75286234676007e-05, + "loss": 0.8926, + "num_input_tokens_seen": 51653248, + "step": 3211 + }, + { + "epoch": 0.2249949652823382, + "grad_norm": 3.736415386199951, + "learning_rate": 7.752162521891419e-05, + "loss": 1.08, + "num_input_tokens_seen": 51669632, + "step": 3212 + }, + { + "epoch": 0.22506501352806746, + "grad_norm": 4.355846881866455, + "learning_rate": 7.751462697022768e-05, + "loss": 1.0265, + "num_input_tokens_seen": 51684632, + "step": 3213 + }, + { + "epoch": 0.2251350617737967, + "grad_norm": 4.165436744689941, + "learning_rate": 7.750762872154115e-05, + "loss": 1.1594, + "num_input_tokens_seen": 51701016, + "step": 3214 + }, + { + "epoch": 0.22520511001952595, + "grad_norm": 4.4387946128845215, + "learning_rate": 7.750063047285464e-05, + "loss": 0.911, + "num_input_tokens_seen": 51716176, + "step": 3215 + }, + { + "epoch": 0.22527515826525518, + "grad_norm": 4.749145030975342, + "learning_rate": 7.749363222416813e-05, + "loss": 0.952, + "num_input_tokens_seen": 51732560, + "step": 3216 + }, + { + "epoch": 0.22534520651098444, + "grad_norm": 4.321863651275635, + "learning_rate": 7.748663397548162e-05, + "loss": 1.0974, + "num_input_tokens_seen": 51748944, + "step": 3217 + }, + { + "epoch": 0.22541525475671367, + "grad_norm": 5.319899082183838, + "learning_rate": 7.747963572679511e-05, + "loss": 0.9506, + "num_input_tokens_seen": 51765328, + "step": 3218 + }, + { + "epoch": 0.22548530300244293, + "grad_norm": 3.5695643424987793, + "learning_rate": 7.747263747810858e-05, + "loss": 1.1482, + "num_input_tokens_seen": 51781712, + "step": 3219 + }, + { + "epoch": 0.2255553512481722, + "grad_norm": 3.725698947906494, + "learning_rate": 7.746563922942207e-05, + "loss": 0.9205, + "num_input_tokens_seen": 51798096, + "step": 3220 + }, + { + "epoch": 0.22562539949390142, + "grad_norm": 3.795003652572632, + "learning_rate": 7.745864098073555e-05, + "loss": 1.0314, + "num_input_tokens_seen": 51814480, + "step": 3221 + }, + { + "epoch": 0.22569544773963068, + "grad_norm": 3.817578077316284, + "learning_rate": 7.745164273204903e-05, + "loss": 1.1218, + "num_input_tokens_seen": 51830864, + "step": 3222 + }, + { + "epoch": 0.22576549598535992, + "grad_norm": 5.982937812805176, + "learning_rate": 7.744464448336252e-05, + "loss": 0.9544, + "num_input_tokens_seen": 51846104, + "step": 3223 + }, + { + "epoch": 0.22583554423108917, + "grad_norm": 5.063079833984375, + "learning_rate": 7.743764623467601e-05, + "loss": 0.9191, + "num_input_tokens_seen": 51862488, + "step": 3224 + }, + { + "epoch": 0.2259055924768184, + "grad_norm": 3.620837450027466, + "learning_rate": 7.74306479859895e-05, + "loss": 1.0484, + "num_input_tokens_seen": 51878784, + "step": 3225 + }, + { + "epoch": 0.22597564072254767, + "grad_norm": 3.578369617462158, + "learning_rate": 7.742364973730299e-05, + "loss": 1.0146, + "num_input_tokens_seen": 51894832, + "step": 3226 + }, + { + "epoch": 0.2260456889682769, + "grad_norm": 4.0356974601745605, + "learning_rate": 7.741665148861646e-05, + "loss": 1.0664, + "num_input_tokens_seen": 51911216, + "step": 3227 + }, + { + "epoch": 0.22611573721400616, + "grad_norm": 4.133927822113037, + "learning_rate": 7.740965323992994e-05, + "loss": 1.1579, + "num_input_tokens_seen": 51927600, + "step": 3228 + }, + { + "epoch": 0.2261857854597354, + "grad_norm": 4.2958879470825195, + "learning_rate": 7.740265499124343e-05, + "loss": 1.0519, + "num_input_tokens_seen": 51943688, + "step": 3229 + }, + { + "epoch": 0.22625583370546465, + "grad_norm": 6.211035251617432, + "learning_rate": 7.739565674255693e-05, + "loss": 1.0097, + "num_input_tokens_seen": 51960072, + "step": 3230 + }, + { + "epoch": 0.22632588195119388, + "grad_norm": 4.073126316070557, + "learning_rate": 7.73886584938704e-05, + "loss": 1.0226, + "num_input_tokens_seen": 51976456, + "step": 3231 + }, + { + "epoch": 0.22639593019692314, + "grad_norm": 3.605041980743408, + "learning_rate": 7.73816602451839e-05, + "loss": 0.817, + "num_input_tokens_seen": 51992840, + "step": 3232 + }, + { + "epoch": 0.22646597844265237, + "grad_norm": 4.341184139251709, + "learning_rate": 7.737466199649738e-05, + "loss": 1.1391, + "num_input_tokens_seen": 52008696, + "step": 3233 + }, + { + "epoch": 0.22653602668838163, + "grad_norm": 4.676966667175293, + "learning_rate": 7.736766374781086e-05, + "loss": 1.0163, + "num_input_tokens_seen": 52024944, + "step": 3234 + }, + { + "epoch": 0.22660607493411086, + "grad_norm": 4.6688032150268555, + "learning_rate": 7.736066549912435e-05, + "loss": 0.972, + "num_input_tokens_seen": 52041104, + "step": 3235 + }, + { + "epoch": 0.22667612317984012, + "grad_norm": 4.6416916847229, + "learning_rate": 7.735366725043783e-05, + "loss": 1.1197, + "num_input_tokens_seen": 52055864, + "step": 3236 + }, + { + "epoch": 0.22674617142556935, + "grad_norm": 3.713846206665039, + "learning_rate": 7.734666900175132e-05, + "loss": 1.0498, + "num_input_tokens_seen": 52071992, + "step": 3237 + }, + { + "epoch": 0.2268162196712986, + "grad_norm": 3.694094657897949, + "learning_rate": 7.73396707530648e-05, + "loss": 1.083, + "num_input_tokens_seen": 52088376, + "step": 3238 + }, + { + "epoch": 0.22688626791702785, + "grad_norm": 4.250162601470947, + "learning_rate": 7.733267250437829e-05, + "loss": 0.9421, + "num_input_tokens_seen": 52104320, + "step": 3239 + }, + { + "epoch": 0.2269563161627571, + "grad_norm": 3.8184008598327637, + "learning_rate": 7.732567425569178e-05, + "loss": 1.0033, + "num_input_tokens_seen": 52120416, + "step": 3240 + }, + { + "epoch": 0.22702636440848634, + "grad_norm": 3.9957122802734375, + "learning_rate": 7.731867600700525e-05, + "loss": 0.9594, + "num_input_tokens_seen": 52136704, + "step": 3241 + }, + { + "epoch": 0.2270964126542156, + "grad_norm": 4.153292655944824, + "learning_rate": 7.731167775831874e-05, + "loss": 1.2315, + "num_input_tokens_seen": 52153088, + "step": 3242 + }, + { + "epoch": 0.22716646089994483, + "grad_norm": 3.628377914428711, + "learning_rate": 7.730467950963223e-05, + "loss": 0.9826, + "num_input_tokens_seen": 52169032, + "step": 3243 + }, + { + "epoch": 0.2272365091456741, + "grad_norm": 3.45796275138855, + "learning_rate": 7.729768126094572e-05, + "loss": 1.0942, + "num_input_tokens_seen": 52185416, + "step": 3244 + }, + { + "epoch": 0.22730655739140332, + "grad_norm": 3.9128968715667725, + "learning_rate": 7.72906830122592e-05, + "loss": 1.2954, + "num_input_tokens_seen": 52201504, + "step": 3245 + }, + { + "epoch": 0.22737660563713258, + "grad_norm": 4.4097394943237305, + "learning_rate": 7.728368476357268e-05, + "loss": 1.0171, + "num_input_tokens_seen": 52217184, + "step": 3246 + }, + { + "epoch": 0.2274466538828618, + "grad_norm": 4.110626220703125, + "learning_rate": 7.727668651488617e-05, + "loss": 1.0412, + "num_input_tokens_seen": 52233432, + "step": 3247 + }, + { + "epoch": 0.22751670212859107, + "grad_norm": 4.161354064941406, + "learning_rate": 7.726968826619964e-05, + "loss": 0.9371, + "num_input_tokens_seen": 52249816, + "step": 3248 + }, + { + "epoch": 0.2275867503743203, + "grad_norm": 5.910977363586426, + "learning_rate": 7.726269001751313e-05, + "loss": 0.8993, + "num_input_tokens_seen": 52266200, + "step": 3249 + }, + { + "epoch": 0.22765679862004956, + "grad_norm": 3.8264660835266113, + "learning_rate": 7.725569176882663e-05, + "loss": 1.0927, + "num_input_tokens_seen": 52282136, + "step": 3250 + }, + { + "epoch": 0.2277268468657788, + "grad_norm": 3.9992623329162598, + "learning_rate": 7.724869352014011e-05, + "loss": 0.9256, + "num_input_tokens_seen": 52297368, + "step": 3251 + }, + { + "epoch": 0.22779689511150805, + "grad_norm": 4.263967990875244, + "learning_rate": 7.72416952714536e-05, + "loss": 1.1708, + "num_input_tokens_seen": 52313200, + "step": 3252 + }, + { + "epoch": 0.22786694335723728, + "grad_norm": 3.8846871852874756, + "learning_rate": 7.723469702276709e-05, + "loss": 1.1445, + "num_input_tokens_seen": 52329584, + "step": 3253 + }, + { + "epoch": 0.22793699160296654, + "grad_norm": 4.3504533767700195, + "learning_rate": 7.722769877408056e-05, + "loss": 1.0332, + "num_input_tokens_seen": 52345968, + "step": 3254 + }, + { + "epoch": 0.2280070398486958, + "grad_norm": 3.9775991439819336, + "learning_rate": 7.722070052539404e-05, + "loss": 1.2149, + "num_input_tokens_seen": 52362352, + "step": 3255 + }, + { + "epoch": 0.22807708809442503, + "grad_norm": 4.098363399505615, + "learning_rate": 7.721370227670754e-05, + "loss": 1.1278, + "num_input_tokens_seen": 52378736, + "step": 3256 + }, + { + "epoch": 0.2281471363401543, + "grad_norm": 3.7094836235046387, + "learning_rate": 7.720670402802103e-05, + "loss": 1.0221, + "num_input_tokens_seen": 52394896, + "step": 3257 + }, + { + "epoch": 0.22821718458588353, + "grad_norm": 4.042232036590576, + "learning_rate": 7.71997057793345e-05, + "loss": 1.2902, + "num_input_tokens_seen": 52410952, + "step": 3258 + }, + { + "epoch": 0.22828723283161279, + "grad_norm": 3.725853443145752, + "learning_rate": 7.719270753064799e-05, + "loss": 1.0135, + "num_input_tokens_seen": 52427200, + "step": 3259 + }, + { + "epoch": 0.22835728107734202, + "grad_norm": 5.186229705810547, + "learning_rate": 7.718570928196148e-05, + "loss": 1.0539, + "num_input_tokens_seen": 52443584, + "step": 3260 + }, + { + "epoch": 0.22842732932307128, + "grad_norm": 3.8725364208221436, + "learning_rate": 7.717871103327495e-05, + "loss": 1.0782, + "num_input_tokens_seen": 52458272, + "step": 3261 + }, + { + "epoch": 0.2284973775688005, + "grad_norm": 5.006584644317627, + "learning_rate": 7.717171278458844e-05, + "loss": 1.0313, + "num_input_tokens_seen": 52474456, + "step": 3262 + }, + { + "epoch": 0.22856742581452977, + "grad_norm": 5.102536201477051, + "learning_rate": 7.716471453590193e-05, + "loss": 1.2077, + "num_input_tokens_seen": 52490464, + "step": 3263 + }, + { + "epoch": 0.228637474060259, + "grad_norm": 3.741029977798462, + "learning_rate": 7.715771628721542e-05, + "loss": 0.8978, + "num_input_tokens_seen": 52506112, + "step": 3264 + }, + { + "epoch": 0.22870752230598826, + "grad_norm": 5.656842231750488, + "learning_rate": 7.71507180385289e-05, + "loss": 1.1569, + "num_input_tokens_seen": 52522496, + "step": 3265 + }, + { + "epoch": 0.2287775705517175, + "grad_norm": 3.882403612136841, + "learning_rate": 7.714371978984238e-05, + "loss": 1.163, + "num_input_tokens_seen": 52538240, + "step": 3266 + }, + { + "epoch": 0.22884761879744675, + "grad_norm": 4.812796592712402, + "learning_rate": 7.713672154115587e-05, + "loss": 1.0478, + "num_input_tokens_seen": 52554024, + "step": 3267 + }, + { + "epoch": 0.22891766704317598, + "grad_norm": 3.9040687084198, + "learning_rate": 7.712972329246935e-05, + "loss": 1.0123, + "num_input_tokens_seen": 52570408, + "step": 3268 + }, + { + "epoch": 0.22898771528890524, + "grad_norm": 3.8387644290924072, + "learning_rate": 7.712272504378284e-05, + "loss": 0.9401, + "num_input_tokens_seen": 52586512, + "step": 3269 + }, + { + "epoch": 0.22905776353463447, + "grad_norm": 4.602542877197266, + "learning_rate": 7.711572679509634e-05, + "loss": 1.0196, + "num_input_tokens_seen": 52602896, + "step": 3270 + }, + { + "epoch": 0.22912781178036373, + "grad_norm": 4.209007263183594, + "learning_rate": 7.710872854640981e-05, + "loss": 1.1401, + "num_input_tokens_seen": 52619080, + "step": 3271 + }, + { + "epoch": 0.22919786002609296, + "grad_norm": 3.5082032680511475, + "learning_rate": 7.71017302977233e-05, + "loss": 0.9979, + "num_input_tokens_seen": 52635464, + "step": 3272 + }, + { + "epoch": 0.22926790827182222, + "grad_norm": 4.123980522155762, + "learning_rate": 7.709473204903678e-05, + "loss": 1.0201, + "num_input_tokens_seen": 52651848, + "step": 3273 + }, + { + "epoch": 0.22933795651755146, + "grad_norm": 4.267751216888428, + "learning_rate": 7.708773380035027e-05, + "loss": 1.1338, + "num_input_tokens_seen": 52668232, + "step": 3274 + }, + { + "epoch": 0.22940800476328072, + "grad_norm": 4.1165666580200195, + "learning_rate": 7.708073555166374e-05, + "loss": 1.1146, + "num_input_tokens_seen": 52684616, + "step": 3275 + }, + { + "epoch": 0.22947805300900995, + "grad_norm": 4.810427665710449, + "learning_rate": 7.707373730297724e-05, + "loss": 1.1785, + "num_input_tokens_seen": 52701000, + "step": 3276 + }, + { + "epoch": 0.2295481012547392, + "grad_norm": 6.566617488861084, + "learning_rate": 7.706673905429073e-05, + "loss": 0.8192, + "num_input_tokens_seen": 52715920, + "step": 3277 + }, + { + "epoch": 0.22961814950046844, + "grad_norm": 4.456092834472656, + "learning_rate": 7.70597408056042e-05, + "loss": 0.992, + "num_input_tokens_seen": 52732304, + "step": 3278 + }, + { + "epoch": 0.2296881977461977, + "grad_norm": 4.063642501831055, + "learning_rate": 7.70527425569177e-05, + "loss": 0.9306, + "num_input_tokens_seen": 52748688, + "step": 3279 + }, + { + "epoch": 0.22975824599192693, + "grad_norm": 3.337742567062378, + "learning_rate": 7.704574430823118e-05, + "loss": 0.8497, + "num_input_tokens_seen": 52764800, + "step": 3280 + }, + { + "epoch": 0.2298282942376562, + "grad_norm": 4.36488151550293, + "learning_rate": 7.703874605954466e-05, + "loss": 1.0851, + "num_input_tokens_seen": 52780952, + "step": 3281 + }, + { + "epoch": 0.22989834248338542, + "grad_norm": 4.948200702667236, + "learning_rate": 7.703174781085815e-05, + "loss": 0.9591, + "num_input_tokens_seen": 52795728, + "step": 3282 + }, + { + "epoch": 0.22996839072911468, + "grad_norm": 4.977625370025635, + "learning_rate": 7.702474956217164e-05, + "loss": 1.2094, + "num_input_tokens_seen": 52812112, + "step": 3283 + }, + { + "epoch": 0.2300384389748439, + "grad_norm": 3.7551944255828857, + "learning_rate": 7.701775131348512e-05, + "loss": 1.1018, + "num_input_tokens_seen": 52828184, + "step": 3284 + }, + { + "epoch": 0.23010848722057317, + "grad_norm": 3.700916051864624, + "learning_rate": 7.70107530647986e-05, + "loss": 1.0159, + "num_input_tokens_seen": 52844568, + "step": 3285 + }, + { + "epoch": 0.2301785354663024, + "grad_norm": 4.135788917541504, + "learning_rate": 7.700375481611209e-05, + "loss": 1.047, + "num_input_tokens_seen": 52860952, + "step": 3286 + }, + { + "epoch": 0.23024858371203166, + "grad_norm": 4.018477916717529, + "learning_rate": 7.699675656742558e-05, + "loss": 1.1124, + "num_input_tokens_seen": 52876808, + "step": 3287 + }, + { + "epoch": 0.2303186319577609, + "grad_norm": 5.230745315551758, + "learning_rate": 7.698975831873905e-05, + "loss": 1.0805, + "num_input_tokens_seen": 52893192, + "step": 3288 + }, + { + "epoch": 0.23038868020349015, + "grad_norm": 4.192041873931885, + "learning_rate": 7.698276007005254e-05, + "loss": 1.1476, + "num_input_tokens_seen": 52909576, + "step": 3289 + }, + { + "epoch": 0.2304587284492194, + "grad_norm": 4.28109073638916, + "learning_rate": 7.697576182136603e-05, + "loss": 0.9795, + "num_input_tokens_seen": 52925592, + "step": 3290 + }, + { + "epoch": 0.23052877669494864, + "grad_norm": 4.673538684844971, + "learning_rate": 7.696876357267952e-05, + "loss": 1.2104, + "num_input_tokens_seen": 52941784, + "step": 3291 + }, + { + "epoch": 0.2305988249406779, + "grad_norm": 3.791339159011841, + "learning_rate": 7.696176532399299e-05, + "loss": 1.0098, + "num_input_tokens_seen": 52958168, + "step": 3292 + }, + { + "epoch": 0.23066887318640714, + "grad_norm": 5.353015899658203, + "learning_rate": 7.695476707530648e-05, + "loss": 1.346, + "num_input_tokens_seen": 52974552, + "step": 3293 + }, + { + "epoch": 0.2307389214321364, + "grad_norm": 6.66793966293335, + "learning_rate": 7.694776882661997e-05, + "loss": 1.127, + "num_input_tokens_seen": 52990512, + "step": 3294 + }, + { + "epoch": 0.23080896967786563, + "grad_norm": 5.462240695953369, + "learning_rate": 7.694077057793344e-05, + "loss": 1.2397, + "num_input_tokens_seen": 53006768, + "step": 3295 + }, + { + "epoch": 0.2308790179235949, + "grad_norm": 4.212863445281982, + "learning_rate": 7.693377232924695e-05, + "loss": 0.9377, + "num_input_tokens_seen": 53023152, + "step": 3296 + }, + { + "epoch": 0.23094906616932412, + "grad_norm": 3.623929977416992, + "learning_rate": 7.692677408056044e-05, + "loss": 0.9086, + "num_input_tokens_seen": 53039536, + "step": 3297 + }, + { + "epoch": 0.23101911441505338, + "grad_norm": 4.791571617126465, + "learning_rate": 7.691977583187391e-05, + "loss": 1.0059, + "num_input_tokens_seen": 53055920, + "step": 3298 + }, + { + "epoch": 0.2310891626607826, + "grad_norm": 3.733243465423584, + "learning_rate": 7.69127775831874e-05, + "loss": 1.1729, + "num_input_tokens_seen": 53072304, + "step": 3299 + }, + { + "epoch": 0.23115921090651187, + "grad_norm": 3.916738986968994, + "learning_rate": 7.690577933450087e-05, + "loss": 1.2479, + "num_input_tokens_seen": 53088568, + "step": 3300 + }, + { + "epoch": 0.2312292591522411, + "grad_norm": 4.0346856117248535, + "learning_rate": 7.689878108581436e-05, + "loss": 1.0858, + "num_input_tokens_seen": 53103656, + "step": 3301 + }, + { + "epoch": 0.23129930739797036, + "grad_norm": 4.834316730499268, + "learning_rate": 7.689178283712785e-05, + "loss": 0.9328, + "num_input_tokens_seen": 53120040, + "step": 3302 + }, + { + "epoch": 0.2313693556436996, + "grad_norm": 4.5966291427612305, + "learning_rate": 7.688478458844134e-05, + "loss": 1.0108, + "num_input_tokens_seen": 53136424, + "step": 3303 + }, + { + "epoch": 0.23143940388942885, + "grad_norm": 5.17268705368042, + "learning_rate": 7.687778633975483e-05, + "loss": 1.1559, + "num_input_tokens_seen": 53152080, + "step": 3304 + }, + { + "epoch": 0.23150945213515808, + "grad_norm": 3.6322672367095947, + "learning_rate": 7.68707880910683e-05, + "loss": 1.0666, + "num_input_tokens_seen": 53168464, + "step": 3305 + }, + { + "epoch": 0.23157950038088734, + "grad_norm": 4.761613368988037, + "learning_rate": 7.686378984238179e-05, + "loss": 1.032, + "num_input_tokens_seen": 53184848, + "step": 3306 + }, + { + "epoch": 0.23164954862661657, + "grad_norm": 3.4870493412017822, + "learning_rate": 7.685679159369528e-05, + "loss": 1.026, + "num_input_tokens_seen": 53201232, + "step": 3307 + }, + { + "epoch": 0.23171959687234583, + "grad_norm": 4.122028827667236, + "learning_rate": 7.684979334500876e-05, + "loss": 1.2103, + "num_input_tokens_seen": 53217616, + "step": 3308 + }, + { + "epoch": 0.23178964511807507, + "grad_norm": 3.4486751556396484, + "learning_rate": 7.684279509632224e-05, + "loss": 0.6654, + "num_input_tokens_seen": 53233936, + "step": 3309 + }, + { + "epoch": 0.23185969336380433, + "grad_norm": 4.321650981903076, + "learning_rate": 7.683579684763573e-05, + "loss": 1.106, + "num_input_tokens_seen": 53250320, + "step": 3310 + }, + { + "epoch": 0.23192974160953356, + "grad_norm": 5.820108413696289, + "learning_rate": 7.682879859894922e-05, + "loss": 1.0225, + "num_input_tokens_seen": 53266592, + "step": 3311 + }, + { + "epoch": 0.23199978985526282, + "grad_norm": 5.5514912605285645, + "learning_rate": 7.68218003502627e-05, + "loss": 1.1083, + "num_input_tokens_seen": 53282976, + "step": 3312 + }, + { + "epoch": 0.23206983810099205, + "grad_norm": 4.108302116394043, + "learning_rate": 7.681480210157618e-05, + "loss": 1.1507, + "num_input_tokens_seen": 53299184, + "step": 3313 + }, + { + "epoch": 0.2321398863467213, + "grad_norm": 4.037779331207275, + "learning_rate": 7.680780385288967e-05, + "loss": 1.2858, + "num_input_tokens_seen": 53315000, + "step": 3314 + }, + { + "epoch": 0.23220993459245054, + "grad_norm": 4.5398383140563965, + "learning_rate": 7.680080560420315e-05, + "loss": 1.0374, + "num_input_tokens_seen": 53331104, + "step": 3315 + }, + { + "epoch": 0.2322799828381798, + "grad_norm": 4.2399067878723145, + "learning_rate": 7.679380735551665e-05, + "loss": 1.098, + "num_input_tokens_seen": 53347488, + "step": 3316 + }, + { + "epoch": 0.23235003108390903, + "grad_norm": 5.6600775718688965, + "learning_rate": 7.678680910683013e-05, + "loss": 0.9446, + "num_input_tokens_seen": 53363872, + "step": 3317 + }, + { + "epoch": 0.2324200793296383, + "grad_norm": 4.462069511413574, + "learning_rate": 7.677981085814361e-05, + "loss": 0.9313, + "num_input_tokens_seen": 53379424, + "step": 3318 + }, + { + "epoch": 0.23249012757536752, + "grad_norm": 4.644591808319092, + "learning_rate": 7.677281260945709e-05, + "loss": 1.3155, + "num_input_tokens_seen": 53395728, + "step": 3319 + }, + { + "epoch": 0.23256017582109678, + "grad_norm": 3.860954523086548, + "learning_rate": 7.676581436077058e-05, + "loss": 1.0917, + "num_input_tokens_seen": 53412112, + "step": 3320 + }, + { + "epoch": 0.232630224066826, + "grad_norm": 4.625146389007568, + "learning_rate": 7.675881611208407e-05, + "loss": 0.9253, + "num_input_tokens_seen": 53427992, + "step": 3321 + }, + { + "epoch": 0.23270027231255527, + "grad_norm": 6.473335266113281, + "learning_rate": 7.675181786339756e-05, + "loss": 0.9892, + "num_input_tokens_seen": 53444376, + "step": 3322 + }, + { + "epoch": 0.2327703205582845, + "grad_norm": 3.6846091747283936, + "learning_rate": 7.674481961471104e-05, + "loss": 0.9976, + "num_input_tokens_seen": 53460760, + "step": 3323 + }, + { + "epoch": 0.23284036880401376, + "grad_norm": 3.784900188446045, + "learning_rate": 7.673782136602453e-05, + "loss": 0.8865, + "num_input_tokens_seen": 53477144, + "step": 3324 + }, + { + "epoch": 0.23291041704974302, + "grad_norm": 4.175132751464844, + "learning_rate": 7.673082311733801e-05, + "loss": 1.1741, + "num_input_tokens_seen": 53493496, + "step": 3325 + }, + { + "epoch": 0.23298046529547226, + "grad_norm": 4.355600833892822, + "learning_rate": 7.67238248686515e-05, + "loss": 0.8686, + "num_input_tokens_seen": 53509560, + "step": 3326 + }, + { + "epoch": 0.23305051354120151, + "grad_norm": 4.32242488861084, + "learning_rate": 7.671682661996497e-05, + "loss": 0.9493, + "num_input_tokens_seen": 53525944, + "step": 3327 + }, + { + "epoch": 0.23312056178693075, + "grad_norm": 4.937814235687256, + "learning_rate": 7.670982837127846e-05, + "loss": 1.1617, + "num_input_tokens_seen": 53541312, + "step": 3328 + }, + { + "epoch": 0.23319061003266, + "grad_norm": 3.1939101219177246, + "learning_rate": 7.670283012259195e-05, + "loss": 0.8866, + "num_input_tokens_seen": 53557696, + "step": 3329 + }, + { + "epoch": 0.23326065827838924, + "grad_norm": 5.137113094329834, + "learning_rate": 7.669583187390544e-05, + "loss": 0.9911, + "num_input_tokens_seen": 53573600, + "step": 3330 + }, + { + "epoch": 0.2333307065241185, + "grad_norm": 3.777954578399658, + "learning_rate": 7.668883362521893e-05, + "loss": 1.0047, + "num_input_tokens_seen": 53588808, + "step": 3331 + }, + { + "epoch": 0.23340075476984773, + "grad_norm": 4.229750633239746, + "learning_rate": 7.66818353765324e-05, + "loss": 1.3247, + "num_input_tokens_seen": 53603416, + "step": 3332 + }, + { + "epoch": 0.233470803015577, + "grad_norm": 4.248676776885986, + "learning_rate": 7.667483712784589e-05, + "loss": 1.2149, + "num_input_tokens_seen": 53618896, + "step": 3333 + }, + { + "epoch": 0.23354085126130622, + "grad_norm": 3.7393991947174072, + "learning_rate": 7.666783887915938e-05, + "loss": 1.0339, + "num_input_tokens_seen": 53635280, + "step": 3334 + }, + { + "epoch": 0.23361089950703548, + "grad_norm": 3.6224875450134277, + "learning_rate": 7.666084063047285e-05, + "loss": 0.8727, + "num_input_tokens_seen": 53651664, + "step": 3335 + }, + { + "epoch": 0.2336809477527647, + "grad_norm": 4.2722063064575195, + "learning_rate": 7.665384238178634e-05, + "loss": 1.1982, + "num_input_tokens_seen": 53668048, + "step": 3336 + }, + { + "epoch": 0.23375099599849397, + "grad_norm": 3.4717535972595215, + "learning_rate": 7.664684413309983e-05, + "loss": 0.9695, + "num_input_tokens_seen": 53684432, + "step": 3337 + }, + { + "epoch": 0.2338210442442232, + "grad_norm": 3.6640021800994873, + "learning_rate": 7.663984588441332e-05, + "loss": 0.8621, + "num_input_tokens_seen": 53700816, + "step": 3338 + }, + { + "epoch": 0.23389109248995246, + "grad_norm": 5.14633321762085, + "learning_rate": 7.66328476357268e-05, + "loss": 1.1954, + "num_input_tokens_seen": 53717200, + "step": 3339 + }, + { + "epoch": 0.2339611407356817, + "grad_norm": 4.479960918426514, + "learning_rate": 7.662584938704028e-05, + "loss": 1.1001, + "num_input_tokens_seen": 53733584, + "step": 3340 + }, + { + "epoch": 0.23403118898141095, + "grad_norm": 5.33896017074585, + "learning_rate": 7.661885113835377e-05, + "loss": 0.8984, + "num_input_tokens_seen": 53749072, + "step": 3341 + }, + { + "epoch": 0.23410123722714019, + "grad_norm": 4.407443046569824, + "learning_rate": 7.661185288966726e-05, + "loss": 1.2437, + "num_input_tokens_seen": 53765088, + "step": 3342 + }, + { + "epoch": 0.23417128547286944, + "grad_norm": 3.8250956535339355, + "learning_rate": 7.660485464098075e-05, + "loss": 0.9243, + "num_input_tokens_seen": 53781000, + "step": 3343 + }, + { + "epoch": 0.23424133371859868, + "grad_norm": 4.316215515136719, + "learning_rate": 7.659785639229422e-05, + "loss": 1.0972, + "num_input_tokens_seen": 53796744, + "step": 3344 + }, + { + "epoch": 0.23431138196432794, + "grad_norm": 4.291647434234619, + "learning_rate": 7.659085814360771e-05, + "loss": 1.1376, + "num_input_tokens_seen": 53813128, + "step": 3345 + }, + { + "epoch": 0.23438143021005717, + "grad_norm": 3.704899787902832, + "learning_rate": 7.658385989492119e-05, + "loss": 1.2117, + "num_input_tokens_seen": 53829512, + "step": 3346 + }, + { + "epoch": 0.23445147845578643, + "grad_norm": 3.5979909896850586, + "learning_rate": 7.657686164623468e-05, + "loss": 0.9604, + "num_input_tokens_seen": 53845536, + "step": 3347 + }, + { + "epoch": 0.23452152670151566, + "grad_norm": 3.8820247650146484, + "learning_rate": 7.656986339754816e-05, + "loss": 1.2439, + "num_input_tokens_seen": 53861920, + "step": 3348 + }, + { + "epoch": 0.23459157494724492, + "grad_norm": 4.226894855499268, + "learning_rate": 7.656286514886165e-05, + "loss": 1.0884, + "num_input_tokens_seen": 53878304, + "step": 3349 + }, + { + "epoch": 0.23466162319297415, + "grad_norm": 4.507336616516113, + "learning_rate": 7.655586690017514e-05, + "loss": 1.0184, + "num_input_tokens_seen": 53894688, + "step": 3350 + }, + { + "epoch": 0.2347316714387034, + "grad_norm": 3.86645245552063, + "learning_rate": 7.654886865148863e-05, + "loss": 1.0895, + "num_input_tokens_seen": 53910736, + "step": 3351 + }, + { + "epoch": 0.23480171968443264, + "grad_norm": 3.8789820671081543, + "learning_rate": 7.65418704028021e-05, + "loss": 1.0078, + "num_input_tokens_seen": 53926688, + "step": 3352 + }, + { + "epoch": 0.2348717679301619, + "grad_norm": 3.893564462661743, + "learning_rate": 7.653487215411559e-05, + "loss": 1.0701, + "num_input_tokens_seen": 53942904, + "step": 3353 + }, + { + "epoch": 0.23494181617589113, + "grad_norm": 4.6554412841796875, + "learning_rate": 7.652787390542907e-05, + "loss": 1.1396, + "num_input_tokens_seen": 53957976, + "step": 3354 + }, + { + "epoch": 0.2350118644216204, + "grad_norm": 4.118137359619141, + "learning_rate": 7.652087565674256e-05, + "loss": 1.2019, + "num_input_tokens_seen": 53973520, + "step": 3355 + }, + { + "epoch": 0.23508191266734962, + "grad_norm": 5.099210262298584, + "learning_rate": 7.651387740805605e-05, + "loss": 0.892, + "num_input_tokens_seen": 53989280, + "step": 3356 + }, + { + "epoch": 0.23515196091307888, + "grad_norm": 3.868797779083252, + "learning_rate": 7.650687915936953e-05, + "loss": 1.0992, + "num_input_tokens_seen": 54005664, + "step": 3357 + }, + { + "epoch": 0.23522200915880812, + "grad_norm": 4.032477378845215, + "learning_rate": 7.649988091068302e-05, + "loss": 1.0356, + "num_input_tokens_seen": 54022048, + "step": 3358 + }, + { + "epoch": 0.23529205740453737, + "grad_norm": 3.907238483428955, + "learning_rate": 7.64928826619965e-05, + "loss": 1.0925, + "num_input_tokens_seen": 54038432, + "step": 3359 + }, + { + "epoch": 0.23536210565026663, + "grad_norm": 3.6504223346710205, + "learning_rate": 7.648588441330999e-05, + "loss": 0.9708, + "num_input_tokens_seen": 54054272, + "step": 3360 + }, + { + "epoch": 0.23543215389599587, + "grad_norm": 4.614812850952148, + "learning_rate": 7.647888616462347e-05, + "loss": 1.136, + "num_input_tokens_seen": 54070656, + "step": 3361 + }, + { + "epoch": 0.23550220214172513, + "grad_norm": 4.812591552734375, + "learning_rate": 7.647188791593696e-05, + "loss": 1.0714, + "num_input_tokens_seen": 54086416, + "step": 3362 + }, + { + "epoch": 0.23557225038745436, + "grad_norm": 3.709543466567993, + "learning_rate": 7.646488966725044e-05, + "loss": 1.106, + "num_input_tokens_seen": 54102800, + "step": 3363 + }, + { + "epoch": 0.23564229863318362, + "grad_norm": 3.9850802421569824, + "learning_rate": 7.645789141856393e-05, + "loss": 1.1509, + "num_input_tokens_seen": 54119184, + "step": 3364 + }, + { + "epoch": 0.23571234687891285, + "grad_norm": 4.59740686416626, + "learning_rate": 7.645089316987742e-05, + "loss": 1.1974, + "num_input_tokens_seen": 54135568, + "step": 3365 + }, + { + "epoch": 0.2357823951246421, + "grad_norm": 4.118459224700928, + "learning_rate": 7.644389492119089e-05, + "loss": 1.2196, + "num_input_tokens_seen": 54151952, + "step": 3366 + }, + { + "epoch": 0.23585244337037134, + "grad_norm": 4.172552108764648, + "learning_rate": 7.643689667250438e-05, + "loss": 1.0178, + "num_input_tokens_seen": 54167776, + "step": 3367 + }, + { + "epoch": 0.2359224916161006, + "grad_norm": 3.9671120643615723, + "learning_rate": 7.642989842381787e-05, + "loss": 1.0589, + "num_input_tokens_seen": 54184160, + "step": 3368 + }, + { + "epoch": 0.23599253986182983, + "grad_norm": 3.7376415729522705, + "learning_rate": 7.642290017513136e-05, + "loss": 1.1445, + "num_input_tokens_seen": 54200280, + "step": 3369 + }, + { + "epoch": 0.2360625881075591, + "grad_norm": 4.665002346038818, + "learning_rate": 7.641590192644484e-05, + "loss": 1.3347, + "num_input_tokens_seen": 54216664, + "step": 3370 + }, + { + "epoch": 0.23613263635328832, + "grad_norm": 3.669015884399414, + "learning_rate": 7.640890367775832e-05, + "loss": 0.8359, + "num_input_tokens_seen": 54232320, + "step": 3371 + }, + { + "epoch": 0.23620268459901758, + "grad_norm": 3.993393659591675, + "learning_rate": 7.640190542907181e-05, + "loss": 1.0298, + "num_input_tokens_seen": 54248704, + "step": 3372 + }, + { + "epoch": 0.2362727328447468, + "grad_norm": 3.808516263961792, + "learning_rate": 7.639490718038528e-05, + "loss": 1.1315, + "num_input_tokens_seen": 54265088, + "step": 3373 + }, + { + "epoch": 0.23634278109047607, + "grad_norm": 5.25230073928833, + "learning_rate": 7.638790893169877e-05, + "loss": 1.1273, + "num_input_tokens_seen": 54281256, + "step": 3374 + }, + { + "epoch": 0.2364128293362053, + "grad_norm": 5.724976062774658, + "learning_rate": 7.638091068301226e-05, + "loss": 1.3176, + "num_input_tokens_seen": 54296832, + "step": 3375 + }, + { + "epoch": 0.23648287758193456, + "grad_norm": 3.553737163543701, + "learning_rate": 7.637391243432575e-05, + "loss": 1.0288, + "num_input_tokens_seen": 54313120, + "step": 3376 + }, + { + "epoch": 0.2365529258276638, + "grad_norm": 6.614949703216553, + "learning_rate": 7.636691418563924e-05, + "loss": 1.0649, + "num_input_tokens_seen": 54328184, + "step": 3377 + }, + { + "epoch": 0.23662297407339306, + "grad_norm": 3.76234769821167, + "learning_rate": 7.635991593695273e-05, + "loss": 1.149, + "num_input_tokens_seen": 54344568, + "step": 3378 + }, + { + "epoch": 0.2366930223191223, + "grad_norm": 3.4564521312713623, + "learning_rate": 7.63529176882662e-05, + "loss": 0.9227, + "num_input_tokens_seen": 54360952, + "step": 3379 + }, + { + "epoch": 0.23676307056485155, + "grad_norm": 3.735978841781616, + "learning_rate": 7.634591943957969e-05, + "loss": 1.2159, + "num_input_tokens_seen": 54377336, + "step": 3380 + }, + { + "epoch": 0.23683311881058078, + "grad_norm": 4.106653690338135, + "learning_rate": 7.633892119089317e-05, + "loss": 1.0997, + "num_input_tokens_seen": 54393232, + "step": 3381 + }, + { + "epoch": 0.23690316705631004, + "grad_norm": 3.9169600009918213, + "learning_rate": 7.633192294220667e-05, + "loss": 1.247, + "num_input_tokens_seen": 54409616, + "step": 3382 + }, + { + "epoch": 0.23697321530203927, + "grad_norm": 3.8265388011932373, + "learning_rate": 7.632492469352014e-05, + "loss": 1.1391, + "num_input_tokens_seen": 54425312, + "step": 3383 + }, + { + "epoch": 0.23704326354776853, + "grad_norm": 3.6288204193115234, + "learning_rate": 7.631792644483363e-05, + "loss": 1.0445, + "num_input_tokens_seen": 54441696, + "step": 3384 + }, + { + "epoch": 0.23711331179349776, + "grad_norm": 4.207483291625977, + "learning_rate": 7.631092819614712e-05, + "loss": 1.2068, + "num_input_tokens_seen": 54457720, + "step": 3385 + }, + { + "epoch": 0.23718336003922702, + "grad_norm": 3.880786895751953, + "learning_rate": 7.63039299474606e-05, + "loss": 1.0471, + "num_input_tokens_seen": 54474104, + "step": 3386 + }, + { + "epoch": 0.23725340828495625, + "grad_norm": 4.493243217468262, + "learning_rate": 7.629693169877408e-05, + "loss": 1.1107, + "num_input_tokens_seen": 54490080, + "step": 3387 + }, + { + "epoch": 0.2373234565306855, + "grad_norm": 4.432561874389648, + "learning_rate": 7.628993345008757e-05, + "loss": 1.1474, + "num_input_tokens_seen": 54506464, + "step": 3388 + }, + { + "epoch": 0.23739350477641474, + "grad_norm": 4.210158824920654, + "learning_rate": 7.628293520140106e-05, + "loss": 1.1567, + "num_input_tokens_seen": 54522848, + "step": 3389 + }, + { + "epoch": 0.237463553022144, + "grad_norm": 4.561443328857422, + "learning_rate": 7.627593695271454e-05, + "loss": 1.2793, + "num_input_tokens_seen": 54538192, + "step": 3390 + }, + { + "epoch": 0.23753360126787323, + "grad_norm": 3.6792140007019043, + "learning_rate": 7.626893870402802e-05, + "loss": 0.9692, + "num_input_tokens_seen": 54554576, + "step": 3391 + }, + { + "epoch": 0.2376036495136025, + "grad_norm": 4.3415141105651855, + "learning_rate": 7.626194045534151e-05, + "loss": 1.1777, + "num_input_tokens_seen": 54570960, + "step": 3392 + }, + { + "epoch": 0.23767369775933175, + "grad_norm": 3.770224094390869, + "learning_rate": 7.625494220665499e-05, + "loss": 1.1923, + "num_input_tokens_seen": 54587344, + "step": 3393 + }, + { + "epoch": 0.23774374600506099, + "grad_norm": 3.7803759574890137, + "learning_rate": 7.624794395796848e-05, + "loss": 1.1631, + "num_input_tokens_seen": 54603728, + "step": 3394 + }, + { + "epoch": 0.23781379425079024, + "grad_norm": 4.559312343597412, + "learning_rate": 7.624094570928196e-05, + "loss": 1.0235, + "num_input_tokens_seen": 54619760, + "step": 3395 + }, + { + "epoch": 0.23788384249651948, + "grad_norm": 4.215981483459473, + "learning_rate": 7.623394746059545e-05, + "loss": 1.2803, + "num_input_tokens_seen": 54636144, + "step": 3396 + }, + { + "epoch": 0.23795389074224874, + "grad_norm": 4.108291149139404, + "learning_rate": 7.622694921190894e-05, + "loss": 1.0486, + "num_input_tokens_seen": 54652136, + "step": 3397 + }, + { + "epoch": 0.23802393898797797, + "grad_norm": 4.4075093269348145, + "learning_rate": 7.621995096322242e-05, + "loss": 1.0766, + "num_input_tokens_seen": 54668520, + "step": 3398 + }, + { + "epoch": 0.23809398723370723, + "grad_norm": 4.002575874328613, + "learning_rate": 7.62129527145359e-05, + "loss": 1.1793, + "num_input_tokens_seen": 54684544, + "step": 3399 + }, + { + "epoch": 0.23816403547943646, + "grad_norm": 3.5264174938201904, + "learning_rate": 7.620595446584938e-05, + "loss": 0.928, + "num_input_tokens_seen": 54700680, + "step": 3400 + }, + { + "epoch": 0.23816403547943646, + "eval_loss": 1.1361509561538696, + "eval_runtime": 0.1856, + "eval_samples_per_second": 5.389, + "eval_steps_per_second": 5.389, + "num_input_tokens_seen": 54700680, + "step": 3400 + }, + { + "epoch": 0.23823408372516572, + "grad_norm": 3.585204839706421, + "learning_rate": 7.619895621716287e-05, + "loss": 1.0865, + "num_input_tokens_seen": 54717064, + "step": 3401 + }, + { + "epoch": 0.23830413197089495, + "grad_norm": 4.442777633666992, + "learning_rate": 7.619195796847637e-05, + "loss": 0.9445, + "num_input_tokens_seen": 54732648, + "step": 3402 + }, + { + "epoch": 0.2383741802166242, + "grad_norm": 3.807063102722168, + "learning_rate": 7.618495971978985e-05, + "loss": 1.0127, + "num_input_tokens_seen": 54749032, + "step": 3403 + }, + { + "epoch": 0.23844422846235344, + "grad_norm": 4.984583854675293, + "learning_rate": 7.617796147110333e-05, + "loss": 1.349, + "num_input_tokens_seen": 54764192, + "step": 3404 + }, + { + "epoch": 0.2385142767080827, + "grad_norm": 4.326750755310059, + "learning_rate": 7.617096322241682e-05, + "loss": 1.0875, + "num_input_tokens_seen": 54780120, + "step": 3405 + }, + { + "epoch": 0.23858432495381193, + "grad_norm": 5.707291126251221, + "learning_rate": 7.61639649737303e-05, + "loss": 1.0816, + "num_input_tokens_seen": 54796168, + "step": 3406 + }, + { + "epoch": 0.2386543731995412, + "grad_norm": 4.450499534606934, + "learning_rate": 7.615696672504379e-05, + "loss": 1.139, + "num_input_tokens_seen": 54812056, + "step": 3407 + }, + { + "epoch": 0.23872442144527042, + "grad_norm": 4.253554821014404, + "learning_rate": 7.614996847635728e-05, + "loss": 1.1798, + "num_input_tokens_seen": 54828248, + "step": 3408 + }, + { + "epoch": 0.23879446969099968, + "grad_norm": 5.04890251159668, + "learning_rate": 7.614297022767076e-05, + "loss": 0.9968, + "num_input_tokens_seen": 54844632, + "step": 3409 + }, + { + "epoch": 0.23886451793672892, + "grad_norm": 3.24513578414917, + "learning_rate": 7.613597197898424e-05, + "loss": 0.8901, + "num_input_tokens_seen": 54861016, + "step": 3410 + }, + { + "epoch": 0.23893456618245817, + "grad_norm": 4.008625507354736, + "learning_rate": 7.612897373029773e-05, + "loss": 1.1048, + "num_input_tokens_seen": 54877168, + "step": 3411 + }, + { + "epoch": 0.2390046144281874, + "grad_norm": 5.393536567687988, + "learning_rate": 7.612197548161122e-05, + "loss": 1.1554, + "num_input_tokens_seen": 54892720, + "step": 3412 + }, + { + "epoch": 0.23907466267391667, + "grad_norm": 4.388333797454834, + "learning_rate": 7.611497723292469e-05, + "loss": 1.0478, + "num_input_tokens_seen": 54909104, + "step": 3413 + }, + { + "epoch": 0.2391447109196459, + "grad_norm": 3.8056883811950684, + "learning_rate": 7.610797898423818e-05, + "loss": 0.9235, + "num_input_tokens_seen": 54925280, + "step": 3414 + }, + { + "epoch": 0.23921475916537516, + "grad_norm": 6.9983062744140625, + "learning_rate": 7.610098073555167e-05, + "loss": 1.0766, + "num_input_tokens_seen": 54941384, + "step": 3415 + }, + { + "epoch": 0.2392848074111044, + "grad_norm": 3.485119581222534, + "learning_rate": 7.609398248686516e-05, + "loss": 1.0811, + "num_input_tokens_seen": 54957592, + "step": 3416 + }, + { + "epoch": 0.23935485565683365, + "grad_norm": 4.450938701629639, + "learning_rate": 7.608698423817863e-05, + "loss": 0.9354, + "num_input_tokens_seen": 54973976, + "step": 3417 + }, + { + "epoch": 0.23942490390256288, + "grad_norm": 4.142702579498291, + "learning_rate": 7.607998598949212e-05, + "loss": 1.0336, + "num_input_tokens_seen": 54990360, + "step": 3418 + }, + { + "epoch": 0.23949495214829214, + "grad_norm": 4.341495513916016, + "learning_rate": 7.607298774080561e-05, + "loss": 0.9722, + "num_input_tokens_seen": 55006744, + "step": 3419 + }, + { + "epoch": 0.23956500039402137, + "grad_norm": 4.355419158935547, + "learning_rate": 7.606598949211908e-05, + "loss": 0.9972, + "num_input_tokens_seen": 55022816, + "step": 3420 + }, + { + "epoch": 0.23963504863975063, + "grad_norm": 4.295046806335449, + "learning_rate": 7.605899124343257e-05, + "loss": 1.1881, + "num_input_tokens_seen": 55039200, + "step": 3421 + }, + { + "epoch": 0.23970509688547986, + "grad_norm": 3.9299042224884033, + "learning_rate": 7.605199299474608e-05, + "loss": 1.0959, + "num_input_tokens_seen": 55055552, + "step": 3422 + }, + { + "epoch": 0.23977514513120912, + "grad_norm": 3.7252607345581055, + "learning_rate": 7.604499474605955e-05, + "loss": 0.9151, + "num_input_tokens_seen": 55071936, + "step": 3423 + }, + { + "epoch": 0.23984519337693835, + "grad_norm": 4.723415851593018, + "learning_rate": 7.603799649737304e-05, + "loss": 0.9568, + "num_input_tokens_seen": 55088320, + "step": 3424 + }, + { + "epoch": 0.2399152416226676, + "grad_norm": 3.9923605918884277, + "learning_rate": 7.603099824868651e-05, + "loss": 1.1124, + "num_input_tokens_seen": 55104416, + "step": 3425 + }, + { + "epoch": 0.23998528986839684, + "grad_norm": 4.510697364807129, + "learning_rate": 7.6024e-05, + "loss": 1.1397, + "num_input_tokens_seen": 55120800, + "step": 3426 + }, + { + "epoch": 0.2400553381141261, + "grad_norm": 4.161818027496338, + "learning_rate": 7.601700175131348e-05, + "loss": 1.0915, + "num_input_tokens_seen": 55137184, + "step": 3427 + }, + { + "epoch": 0.24012538635985536, + "grad_norm": 5.871128082275391, + "learning_rate": 7.601000350262698e-05, + "loss": 0.9465, + "num_input_tokens_seen": 55152528, + "step": 3428 + }, + { + "epoch": 0.2401954346055846, + "grad_norm": 4.180598258972168, + "learning_rate": 7.600300525394047e-05, + "loss": 1.0132, + "num_input_tokens_seen": 55168552, + "step": 3429 + }, + { + "epoch": 0.24026548285131386, + "grad_norm": 5.575338363647461, + "learning_rate": 7.599600700525394e-05, + "loss": 1.2578, + "num_input_tokens_seen": 55184104, + "step": 3430 + }, + { + "epoch": 0.2403355310970431, + "grad_norm": 4.503122329711914, + "learning_rate": 7.598900875656743e-05, + "loss": 1.1367, + "num_input_tokens_seen": 55199768, + "step": 3431 + }, + { + "epoch": 0.24040557934277235, + "grad_norm": 3.6931769847869873, + "learning_rate": 7.598201050788092e-05, + "loss": 1.0977, + "num_input_tokens_seen": 55216016, + "step": 3432 + }, + { + "epoch": 0.24047562758850158, + "grad_norm": 4.138489723205566, + "learning_rate": 7.59750122591944e-05, + "loss": 1.1163, + "num_input_tokens_seen": 55232400, + "step": 3433 + }, + { + "epoch": 0.24054567583423084, + "grad_norm": 3.603297710418701, + "learning_rate": 7.596801401050788e-05, + "loss": 1.1277, + "num_input_tokens_seen": 55248784, + "step": 3434 + }, + { + "epoch": 0.24061572407996007, + "grad_norm": 4.072240352630615, + "learning_rate": 7.596101576182137e-05, + "loss": 1.3073, + "num_input_tokens_seen": 55264320, + "step": 3435 + }, + { + "epoch": 0.24068577232568933, + "grad_norm": 5.015305519104004, + "learning_rate": 7.595401751313486e-05, + "loss": 1.3236, + "num_input_tokens_seen": 55280528, + "step": 3436 + }, + { + "epoch": 0.24075582057141856, + "grad_norm": 5.135364055633545, + "learning_rate": 7.594701926444834e-05, + "loss": 1.0322, + "num_input_tokens_seen": 55296912, + "step": 3437 + }, + { + "epoch": 0.24082586881714782, + "grad_norm": 4.737668991088867, + "learning_rate": 7.594002101576183e-05, + "loss": 1.0069, + "num_input_tokens_seen": 55313296, + "step": 3438 + }, + { + "epoch": 0.24089591706287705, + "grad_norm": 4.380087375640869, + "learning_rate": 7.593302276707531e-05, + "loss": 1.267, + "num_input_tokens_seen": 55329152, + "step": 3439 + }, + { + "epoch": 0.2409659653086063, + "grad_norm": 4.472866535186768, + "learning_rate": 7.592602451838879e-05, + "loss": 1.1577, + "num_input_tokens_seen": 55345536, + "step": 3440 + }, + { + "epoch": 0.24103601355433554, + "grad_norm": 4.323402404785156, + "learning_rate": 7.591902626970228e-05, + "loss": 1.1872, + "num_input_tokens_seen": 55361920, + "step": 3441 + }, + { + "epoch": 0.2411060618000648, + "grad_norm": 3.7247276306152344, + "learning_rate": 7.591202802101578e-05, + "loss": 1.0906, + "num_input_tokens_seen": 55377344, + "step": 3442 + }, + { + "epoch": 0.24117611004579403, + "grad_norm": 6.503116607666016, + "learning_rate": 7.590502977232925e-05, + "loss": 1.2304, + "num_input_tokens_seen": 55393728, + "step": 3443 + }, + { + "epoch": 0.2412461582915233, + "grad_norm": 4.590184688568115, + "learning_rate": 7.589803152364273e-05, + "loss": 1.1369, + "num_input_tokens_seen": 55410112, + "step": 3444 + }, + { + "epoch": 0.24131620653725253, + "grad_norm": 3.718323230743408, + "learning_rate": 7.589103327495622e-05, + "loss": 1.09, + "num_input_tokens_seen": 55426392, + "step": 3445 + }, + { + "epoch": 0.24138625478298179, + "grad_norm": 4.8696465492248535, + "learning_rate": 7.58840350262697e-05, + "loss": 1.2361, + "num_input_tokens_seen": 55442632, + "step": 3446 + }, + { + "epoch": 0.24145630302871102, + "grad_norm": 3.7620716094970703, + "learning_rate": 7.587703677758318e-05, + "loss": 0.9411, + "num_input_tokens_seen": 55459016, + "step": 3447 + }, + { + "epoch": 0.24152635127444028, + "grad_norm": 3.8696882724761963, + "learning_rate": 7.587003852889668e-05, + "loss": 0.992, + "num_input_tokens_seen": 55474944, + "step": 3448 + }, + { + "epoch": 0.2415963995201695, + "grad_norm": 4.628901481628418, + "learning_rate": 7.586304028021017e-05, + "loss": 1.1376, + "num_input_tokens_seen": 55490416, + "step": 3449 + }, + { + "epoch": 0.24166644776589877, + "grad_norm": 4.1568121910095215, + "learning_rate": 7.585604203152365e-05, + "loss": 1.1596, + "num_input_tokens_seen": 55505760, + "step": 3450 + }, + { + "epoch": 0.241736496011628, + "grad_norm": 4.089991569519043, + "learning_rate": 7.584904378283714e-05, + "loss": 1.1707, + "num_input_tokens_seen": 55521528, + "step": 3451 + }, + { + "epoch": 0.24180654425735726, + "grad_norm": 7.870755195617676, + "learning_rate": 7.584204553415061e-05, + "loss": 1.3069, + "num_input_tokens_seen": 55536256, + "step": 3452 + }, + { + "epoch": 0.2418765925030865, + "grad_norm": 4.898053169250488, + "learning_rate": 7.58350472854641e-05, + "loss": 1.0912, + "num_input_tokens_seen": 55551904, + "step": 3453 + }, + { + "epoch": 0.24194664074881575, + "grad_norm": 4.515797138214111, + "learning_rate": 7.582804903677759e-05, + "loss": 1.2266, + "num_input_tokens_seen": 55567240, + "step": 3454 + }, + { + "epoch": 0.24201668899454498, + "grad_norm": 3.7202370166778564, + "learning_rate": 7.582105078809108e-05, + "loss": 1.0118, + "num_input_tokens_seen": 55583176, + "step": 3455 + }, + { + "epoch": 0.24208673724027424, + "grad_norm": 5.834963321685791, + "learning_rate": 7.581405253940457e-05, + "loss": 1.3757, + "num_input_tokens_seen": 55599144, + "step": 3456 + }, + { + "epoch": 0.24215678548600347, + "grad_norm": 4.450705528259277, + "learning_rate": 7.580705429071804e-05, + "loss": 0.9523, + "num_input_tokens_seen": 55615528, + "step": 3457 + }, + { + "epoch": 0.24222683373173273, + "grad_norm": 3.796229839324951, + "learning_rate": 7.580005604203153e-05, + "loss": 1.0415, + "num_input_tokens_seen": 55631912, + "step": 3458 + }, + { + "epoch": 0.24229688197746196, + "grad_norm": 4.004448413848877, + "learning_rate": 7.579305779334502e-05, + "loss": 1.1538, + "num_input_tokens_seen": 55647896, + "step": 3459 + }, + { + "epoch": 0.24236693022319122, + "grad_norm": 4.511063098907471, + "learning_rate": 7.578605954465849e-05, + "loss": 1.0616, + "num_input_tokens_seen": 55664280, + "step": 3460 + }, + { + "epoch": 0.24243697846892046, + "grad_norm": 6.866496562957764, + "learning_rate": 7.577906129597198e-05, + "loss": 1.1126, + "num_input_tokens_seen": 55679720, + "step": 3461 + }, + { + "epoch": 0.24250702671464971, + "grad_norm": 5.447164058685303, + "learning_rate": 7.577206304728547e-05, + "loss": 1.0812, + "num_input_tokens_seen": 55696104, + "step": 3462 + }, + { + "epoch": 0.24257707496037897, + "grad_norm": 6.401725769042969, + "learning_rate": 7.576506479859896e-05, + "loss": 1.0097, + "num_input_tokens_seen": 55712488, + "step": 3463 + }, + { + "epoch": 0.2426471232061082, + "grad_norm": 3.7833733558654785, + "learning_rate": 7.575806654991243e-05, + "loss": 1.0064, + "num_input_tokens_seen": 55728504, + "step": 3464 + }, + { + "epoch": 0.24271717145183747, + "grad_norm": 4.157958984375, + "learning_rate": 7.575106830122592e-05, + "loss": 1.109, + "num_input_tokens_seen": 55743904, + "step": 3465 + }, + { + "epoch": 0.2427872196975667, + "grad_norm": 4.657470703125, + "learning_rate": 7.574407005253941e-05, + "loss": 1.2033, + "num_input_tokens_seen": 55759920, + "step": 3466 + }, + { + "epoch": 0.24285726794329596, + "grad_norm": 5.129040718078613, + "learning_rate": 7.573707180385289e-05, + "loss": 1.2959, + "num_input_tokens_seen": 55776304, + "step": 3467 + }, + { + "epoch": 0.2429273161890252, + "grad_norm": 4.887351036071777, + "learning_rate": 7.573007355516639e-05, + "loss": 1.0568, + "num_input_tokens_seen": 55792688, + "step": 3468 + }, + { + "epoch": 0.24299736443475445, + "grad_norm": 4.042501926422119, + "learning_rate": 7.572307530647988e-05, + "loss": 1.0509, + "num_input_tokens_seen": 55809072, + "step": 3469 + }, + { + "epoch": 0.24306741268048368, + "grad_norm": 4.162355422973633, + "learning_rate": 7.571607705779335e-05, + "loss": 1.0168, + "num_input_tokens_seen": 55825456, + "step": 3470 + }, + { + "epoch": 0.24313746092621294, + "grad_norm": 3.8478844165802, + "learning_rate": 7.570907880910683e-05, + "loss": 1.2378, + "num_input_tokens_seen": 55841840, + "step": 3471 + }, + { + "epoch": 0.24320750917194217, + "grad_norm": 6.2065815925598145, + "learning_rate": 7.570208056042032e-05, + "loss": 1.3644, + "num_input_tokens_seen": 55858224, + "step": 3472 + }, + { + "epoch": 0.24327755741767143, + "grad_norm": 4.8233642578125, + "learning_rate": 7.56950823117338e-05, + "loss": 1.1363, + "num_input_tokens_seen": 55874608, + "step": 3473 + }, + { + "epoch": 0.24334760566340066, + "grad_norm": 3.534205198287964, + "learning_rate": 7.568808406304729e-05, + "loss": 1.0236, + "num_input_tokens_seen": 55890992, + "step": 3474 + }, + { + "epoch": 0.24341765390912992, + "grad_norm": 4.218345642089844, + "learning_rate": 7.568108581436078e-05, + "loss": 1.0921, + "num_input_tokens_seen": 55907376, + "step": 3475 + }, + { + "epoch": 0.24348770215485915, + "grad_norm": 3.7262325286865234, + "learning_rate": 7.567408756567427e-05, + "loss": 1.1182, + "num_input_tokens_seen": 55923752, + "step": 3476 + }, + { + "epoch": 0.2435577504005884, + "grad_norm": 3.7530906200408936, + "learning_rate": 7.566708931698774e-05, + "loss": 1.0766, + "num_input_tokens_seen": 55939176, + "step": 3477 + }, + { + "epoch": 0.24362779864631764, + "grad_norm": 4.452608585357666, + "learning_rate": 7.566009106830123e-05, + "loss": 0.9421, + "num_input_tokens_seen": 55955200, + "step": 3478 + }, + { + "epoch": 0.2436978468920469, + "grad_norm": 4.049906253814697, + "learning_rate": 7.565309281961471e-05, + "loss": 1.1022, + "num_input_tokens_seen": 55971584, + "step": 3479 + }, + { + "epoch": 0.24376789513777614, + "grad_norm": 4.956455230712891, + "learning_rate": 7.56460945709282e-05, + "loss": 1.2684, + "num_input_tokens_seen": 55987968, + "step": 3480 + }, + { + "epoch": 0.2438379433835054, + "grad_norm": 4.846863746643066, + "learning_rate": 7.563909632224169e-05, + "loss": 1.0492, + "num_input_tokens_seen": 56003000, + "step": 3481 + }, + { + "epoch": 0.24390799162923463, + "grad_norm": 4.678101539611816, + "learning_rate": 7.563209807355517e-05, + "loss": 0.8577, + "num_input_tokens_seen": 56019384, + "step": 3482 + }, + { + "epoch": 0.2439780398749639, + "grad_norm": 4.13012170791626, + "learning_rate": 7.562509982486866e-05, + "loss": 0.9508, + "num_input_tokens_seen": 56035768, + "step": 3483 + }, + { + "epoch": 0.24404808812069312, + "grad_norm": 3.7858669757843018, + "learning_rate": 7.561810157618214e-05, + "loss": 1.1034, + "num_input_tokens_seen": 56052152, + "step": 3484 + }, + { + "epoch": 0.24411813636642238, + "grad_norm": 3.7132198810577393, + "learning_rate": 7.561110332749563e-05, + "loss": 1.0665, + "num_input_tokens_seen": 56068536, + "step": 3485 + }, + { + "epoch": 0.2441881846121516, + "grad_norm": 4.093675136566162, + "learning_rate": 7.560410507880911e-05, + "loss": 1.0994, + "num_input_tokens_seen": 56084888, + "step": 3486 + }, + { + "epoch": 0.24425823285788087, + "grad_norm": 3.8601670265197754, + "learning_rate": 7.559710683012259e-05, + "loss": 0.9648, + "num_input_tokens_seen": 56101272, + "step": 3487 + }, + { + "epoch": 0.2443282811036101, + "grad_norm": 3.9332170486450195, + "learning_rate": 7.559010858143608e-05, + "loss": 0.9559, + "num_input_tokens_seen": 56117352, + "step": 3488 + }, + { + "epoch": 0.24439832934933936, + "grad_norm": 3.7619807720184326, + "learning_rate": 7.558311033274957e-05, + "loss": 1.0948, + "num_input_tokens_seen": 56133736, + "step": 3489 + }, + { + "epoch": 0.2444683775950686, + "grad_norm": 3.9035656452178955, + "learning_rate": 7.557611208406306e-05, + "loss": 1.1255, + "num_input_tokens_seen": 56149624, + "step": 3490 + }, + { + "epoch": 0.24453842584079785, + "grad_norm": 5.9505486488342285, + "learning_rate": 7.556911383537653e-05, + "loss": 1.0192, + "num_input_tokens_seen": 56163752, + "step": 3491 + }, + { + "epoch": 0.24460847408652708, + "grad_norm": 4.006525039672852, + "learning_rate": 7.556211558669002e-05, + "loss": 1.0859, + "num_input_tokens_seen": 56180136, + "step": 3492 + }, + { + "epoch": 0.24467852233225634, + "grad_norm": 5.28178071975708, + "learning_rate": 7.555511733800351e-05, + "loss": 1.3704, + "num_input_tokens_seen": 56196152, + "step": 3493 + }, + { + "epoch": 0.24474857057798557, + "grad_norm": 3.577709674835205, + "learning_rate": 7.5548119089317e-05, + "loss": 1.0015, + "num_input_tokens_seen": 56212528, + "step": 3494 + }, + { + "epoch": 0.24481861882371483, + "grad_norm": 4.6045002937316895, + "learning_rate": 7.554112084063048e-05, + "loss": 1.1895, + "num_input_tokens_seen": 56228912, + "step": 3495 + }, + { + "epoch": 0.24488866706944407, + "grad_norm": 4.160959720611572, + "learning_rate": 7.553412259194397e-05, + "loss": 1.1253, + "num_input_tokens_seen": 56244944, + "step": 3496 + }, + { + "epoch": 0.24495871531517333, + "grad_norm": 4.380669593811035, + "learning_rate": 7.552712434325745e-05, + "loss": 1.0171, + "num_input_tokens_seen": 56261072, + "step": 3497 + }, + { + "epoch": 0.24502876356090258, + "grad_norm": 3.568679094314575, + "learning_rate": 7.552012609457092e-05, + "loss": 1.0005, + "num_input_tokens_seen": 56277456, + "step": 3498 + }, + { + "epoch": 0.24509881180663182, + "grad_norm": 4.006386756896973, + "learning_rate": 7.551312784588441e-05, + "loss": 1.0756, + "num_input_tokens_seen": 56293840, + "step": 3499 + }, + { + "epoch": 0.24516886005236108, + "grad_norm": 4.180081844329834, + "learning_rate": 7.55061295971979e-05, + "loss": 1.303, + "num_input_tokens_seen": 56310224, + "step": 3500 + }, + { + "epoch": 0.2452389082980903, + "grad_norm": 5.228555202484131, + "learning_rate": 7.549913134851139e-05, + "loss": 1.0267, + "num_input_tokens_seen": 56326608, + "step": 3501 + }, + { + "epoch": 0.24530895654381957, + "grad_norm": 3.60235595703125, + "learning_rate": 7.549213309982488e-05, + "loss": 0.9258, + "num_input_tokens_seen": 56342752, + "step": 3502 + }, + { + "epoch": 0.2453790047895488, + "grad_norm": 5.305295467376709, + "learning_rate": 7.548513485113837e-05, + "loss": 1.0366, + "num_input_tokens_seen": 56358208, + "step": 3503 + }, + { + "epoch": 0.24544905303527806, + "grad_norm": 4.5955939292907715, + "learning_rate": 7.547813660245184e-05, + "loss": 1.1016, + "num_input_tokens_seen": 56374592, + "step": 3504 + }, + { + "epoch": 0.2455191012810073, + "grad_norm": 4.503798961639404, + "learning_rate": 7.547113835376533e-05, + "loss": 0.9045, + "num_input_tokens_seen": 56390320, + "step": 3505 + }, + { + "epoch": 0.24558914952673655, + "grad_norm": 3.79156231880188, + "learning_rate": 7.54641401050788e-05, + "loss": 0.9796, + "num_input_tokens_seen": 56406176, + "step": 3506 + }, + { + "epoch": 0.24565919777246578, + "grad_norm": 4.054116249084473, + "learning_rate": 7.54571418563923e-05, + "loss": 1.0002, + "num_input_tokens_seen": 56421120, + "step": 3507 + }, + { + "epoch": 0.24572924601819504, + "grad_norm": 3.57210636138916, + "learning_rate": 7.545014360770578e-05, + "loss": 0.982, + "num_input_tokens_seen": 56437504, + "step": 3508 + }, + { + "epoch": 0.24579929426392427, + "grad_norm": 4.02463960647583, + "learning_rate": 7.544314535901927e-05, + "loss": 1.0651, + "num_input_tokens_seen": 56452680, + "step": 3509 + }, + { + "epoch": 0.24586934250965353, + "grad_norm": 3.712689161300659, + "learning_rate": 7.543614711033276e-05, + "loss": 1.1449, + "num_input_tokens_seen": 56468424, + "step": 3510 + }, + { + "epoch": 0.24593939075538276, + "grad_norm": 4.943066596984863, + "learning_rate": 7.542914886164623e-05, + "loss": 1.2289, + "num_input_tokens_seen": 56484784, + "step": 3511 + }, + { + "epoch": 0.24600943900111202, + "grad_norm": 4.94294548034668, + "learning_rate": 7.542215061295972e-05, + "loss": 1.0088, + "num_input_tokens_seen": 56500992, + "step": 3512 + }, + { + "epoch": 0.24607948724684126, + "grad_norm": 4.003958225250244, + "learning_rate": 7.541515236427321e-05, + "loss": 0.9778, + "num_input_tokens_seen": 56516392, + "step": 3513 + }, + { + "epoch": 0.24614953549257051, + "grad_norm": 4.173887252807617, + "learning_rate": 7.540815411558669e-05, + "loss": 1.0291, + "num_input_tokens_seen": 56532776, + "step": 3514 + }, + { + "epoch": 0.24621958373829975, + "grad_norm": 5.028538227081299, + "learning_rate": 7.540115586690019e-05, + "loss": 1.0489, + "num_input_tokens_seen": 56549056, + "step": 3515 + }, + { + "epoch": 0.246289631984029, + "grad_norm": 3.781219959259033, + "learning_rate": 7.539415761821366e-05, + "loss": 0.9056, + "num_input_tokens_seen": 56564768, + "step": 3516 + }, + { + "epoch": 0.24635968022975824, + "grad_norm": 4.070143222808838, + "learning_rate": 7.538715936952715e-05, + "loss": 1.0561, + "num_input_tokens_seen": 56580856, + "step": 3517 + }, + { + "epoch": 0.2464297284754875, + "grad_norm": 4.322885036468506, + "learning_rate": 7.538016112084063e-05, + "loss": 0.9834, + "num_input_tokens_seen": 56596664, + "step": 3518 + }, + { + "epoch": 0.24649977672121673, + "grad_norm": 7.525569438934326, + "learning_rate": 7.537316287215412e-05, + "loss": 1.278, + "num_input_tokens_seen": 56611608, + "step": 3519 + }, + { + "epoch": 0.246569824966946, + "grad_norm": 3.6520745754241943, + "learning_rate": 7.53661646234676e-05, + "loss": 0.9472, + "num_input_tokens_seen": 56627992, + "step": 3520 + }, + { + "epoch": 0.24663987321267522, + "grad_norm": 6.346038341522217, + "learning_rate": 7.53591663747811e-05, + "loss": 1.0585, + "num_input_tokens_seen": 56644224, + "step": 3521 + }, + { + "epoch": 0.24670992145840448, + "grad_norm": 5.049849987030029, + "learning_rate": 7.535216812609458e-05, + "loss": 1.1843, + "num_input_tokens_seen": 56660464, + "step": 3522 + }, + { + "epoch": 0.2467799697041337, + "grad_norm": 5.948208332061768, + "learning_rate": 7.534516987740807e-05, + "loss": 1.1426, + "num_input_tokens_seen": 56676848, + "step": 3523 + }, + { + "epoch": 0.24685001794986297, + "grad_norm": 4.2648210525512695, + "learning_rate": 7.533817162872155e-05, + "loss": 1.0703, + "num_input_tokens_seen": 56692800, + "step": 3524 + }, + { + "epoch": 0.2469200661955922, + "grad_norm": 4.268098831176758, + "learning_rate": 7.533117338003502e-05, + "loss": 1.123, + "num_input_tokens_seen": 56709184, + "step": 3525 + }, + { + "epoch": 0.24699011444132146, + "grad_norm": 3.987408399581909, + "learning_rate": 7.532417513134851e-05, + "loss": 1.1312, + "num_input_tokens_seen": 56724744, + "step": 3526 + }, + { + "epoch": 0.2470601626870507, + "grad_norm": 4.087530612945557, + "learning_rate": 7.5317176882662e-05, + "loss": 0.9904, + "num_input_tokens_seen": 56741128, + "step": 3527 + }, + { + "epoch": 0.24713021093277995, + "grad_norm": 3.484837770462036, + "learning_rate": 7.531017863397549e-05, + "loss": 1.0385, + "num_input_tokens_seen": 56757512, + "step": 3528 + }, + { + "epoch": 0.24720025917850919, + "grad_norm": 4.382214546203613, + "learning_rate": 7.530318038528898e-05, + "loss": 1.1713, + "num_input_tokens_seen": 56773896, + "step": 3529 + }, + { + "epoch": 0.24727030742423844, + "grad_norm": 4.361959457397461, + "learning_rate": 7.529618213660246e-05, + "loss": 1.2548, + "num_input_tokens_seen": 56789184, + "step": 3530 + }, + { + "epoch": 0.24734035566996768, + "grad_norm": 4.029329776763916, + "learning_rate": 7.528918388791594e-05, + "loss": 1.0637, + "num_input_tokens_seen": 56804912, + "step": 3531 + }, + { + "epoch": 0.24741040391569694, + "grad_norm": 4.577064514160156, + "learning_rate": 7.528218563922943e-05, + "loss": 0.9591, + "num_input_tokens_seen": 56821296, + "step": 3532 + }, + { + "epoch": 0.2474804521614262, + "grad_norm": 3.6799368858337402, + "learning_rate": 7.52751873905429e-05, + "loss": 1.0508, + "num_input_tokens_seen": 56837680, + "step": 3533 + }, + { + "epoch": 0.24755050040715543, + "grad_norm": 3.962989568710327, + "learning_rate": 7.526818914185639e-05, + "loss": 1.1347, + "num_input_tokens_seen": 56853984, + "step": 3534 + }, + { + "epoch": 0.2476205486528847, + "grad_norm": 3.610877275466919, + "learning_rate": 7.526119089316988e-05, + "loss": 1.075, + "num_input_tokens_seen": 56870368, + "step": 3535 + }, + { + "epoch": 0.24769059689861392, + "grad_norm": 4.16568660736084, + "learning_rate": 7.525419264448337e-05, + "loss": 0.9326, + "num_input_tokens_seen": 56886032, + "step": 3536 + }, + { + "epoch": 0.24776064514434318, + "grad_norm": 4.645627021789551, + "learning_rate": 7.524719439579686e-05, + "loss": 1.0221, + "num_input_tokens_seen": 56900928, + "step": 3537 + }, + { + "epoch": 0.2478306933900724, + "grad_norm": 3.652317523956299, + "learning_rate": 7.524019614711033e-05, + "loss": 1.1641, + "num_input_tokens_seen": 56917048, + "step": 3538 + }, + { + "epoch": 0.24790074163580167, + "grad_norm": 5.583502769470215, + "learning_rate": 7.523319789842382e-05, + "loss": 1.0303, + "num_input_tokens_seen": 56933432, + "step": 3539 + }, + { + "epoch": 0.2479707898815309, + "grad_norm": 3.6924219131469727, + "learning_rate": 7.522619964973731e-05, + "loss": 0.9386, + "num_input_tokens_seen": 56949816, + "step": 3540 + }, + { + "epoch": 0.24804083812726016, + "grad_norm": 6.476202487945557, + "learning_rate": 7.52192014010508e-05, + "loss": 1.1841, + "num_input_tokens_seen": 56966064, + "step": 3541 + }, + { + "epoch": 0.2481108863729894, + "grad_norm": 4.052863121032715, + "learning_rate": 7.521220315236429e-05, + "loss": 1.1133, + "num_input_tokens_seen": 56982448, + "step": 3542 + }, + { + "epoch": 0.24818093461871865, + "grad_norm": 6.569397926330566, + "learning_rate": 7.520520490367776e-05, + "loss": 1.1061, + "num_input_tokens_seen": 56998832, + "step": 3543 + }, + { + "epoch": 0.24825098286444788, + "grad_norm": 4.026796817779541, + "learning_rate": 7.519820665499125e-05, + "loss": 1.0121, + "num_input_tokens_seen": 57014744, + "step": 3544 + }, + { + "epoch": 0.24832103111017714, + "grad_norm": 3.705080032348633, + "learning_rate": 7.519120840630472e-05, + "loss": 1.0041, + "num_input_tokens_seen": 57031128, + "step": 3545 + }, + { + "epoch": 0.24839107935590637, + "grad_norm": 4.828441143035889, + "learning_rate": 7.518421015761821e-05, + "loss": 1.1551, + "num_input_tokens_seen": 57047512, + "step": 3546 + }, + { + "epoch": 0.24846112760163563, + "grad_norm": 5.6117777824401855, + "learning_rate": 7.51772119089317e-05, + "loss": 1.1555, + "num_input_tokens_seen": 57063840, + "step": 3547 + }, + { + "epoch": 0.24853117584736487, + "grad_norm": 3.955193281173706, + "learning_rate": 7.517021366024519e-05, + "loss": 1.0514, + "num_input_tokens_seen": 57079936, + "step": 3548 + }, + { + "epoch": 0.24860122409309413, + "grad_norm": 3.8878116607666016, + "learning_rate": 7.516321541155868e-05, + "loss": 1.0335, + "num_input_tokens_seen": 57096320, + "step": 3549 + }, + { + "epoch": 0.24867127233882336, + "grad_norm": 6.119873046875, + "learning_rate": 7.515621716287217e-05, + "loss": 1.0798, + "num_input_tokens_seen": 57111632, + "step": 3550 + }, + { + "epoch": 0.24874132058455262, + "grad_norm": 3.757984161376953, + "learning_rate": 7.514921891418564e-05, + "loss": 0.9911, + "num_input_tokens_seen": 57128016, + "step": 3551 + }, + { + "epoch": 0.24881136883028185, + "grad_norm": 4.173069953918457, + "learning_rate": 7.514222066549912e-05, + "loss": 1.1443, + "num_input_tokens_seen": 57144400, + "step": 3552 + }, + { + "epoch": 0.2488814170760111, + "grad_norm": 3.6985576152801514, + "learning_rate": 7.51352224168126e-05, + "loss": 1.1084, + "num_input_tokens_seen": 57160784, + "step": 3553 + }, + { + "epoch": 0.24895146532174034, + "grad_norm": 4.464880466461182, + "learning_rate": 7.51282241681261e-05, + "loss": 1.2571, + "num_input_tokens_seen": 57177168, + "step": 3554 + }, + { + "epoch": 0.2490215135674696, + "grad_norm": 3.699873447418213, + "learning_rate": 7.512122591943958e-05, + "loss": 0.9722, + "num_input_tokens_seen": 57193208, + "step": 3555 + }, + { + "epoch": 0.24909156181319883, + "grad_norm": 5.011424541473389, + "learning_rate": 7.511422767075307e-05, + "loss": 0.962, + "num_input_tokens_seen": 57209592, + "step": 3556 + }, + { + "epoch": 0.2491616100589281, + "grad_norm": 4.302685260772705, + "learning_rate": 7.510722942206656e-05, + "loss": 1.0718, + "num_input_tokens_seen": 57225976, + "step": 3557 + }, + { + "epoch": 0.24923165830465732, + "grad_norm": 3.991840362548828, + "learning_rate": 7.510023117338004e-05, + "loss": 1.0826, + "num_input_tokens_seen": 57242168, + "step": 3558 + }, + { + "epoch": 0.24930170655038658, + "grad_norm": 3.9910435676574707, + "learning_rate": 7.509323292469352e-05, + "loss": 1.2494, + "num_input_tokens_seen": 57258552, + "step": 3559 + }, + { + "epoch": 0.2493717547961158, + "grad_norm": 4.170960426330566, + "learning_rate": 7.5086234676007e-05, + "loss": 1.0068, + "num_input_tokens_seen": 57274936, + "step": 3560 + }, + { + "epoch": 0.24944180304184507, + "grad_norm": 4.317671298980713, + "learning_rate": 7.50792364273205e-05, + "loss": 1.0835, + "num_input_tokens_seen": 57291320, + "step": 3561 + }, + { + "epoch": 0.2495118512875743, + "grad_norm": 3.871293783187866, + "learning_rate": 7.507223817863398e-05, + "loss": 0.9352, + "num_input_tokens_seen": 57307056, + "step": 3562 + }, + { + "epoch": 0.24958189953330356, + "grad_norm": 4.014804840087891, + "learning_rate": 7.506523992994747e-05, + "loss": 1.2893, + "num_input_tokens_seen": 57322376, + "step": 3563 + }, + { + "epoch": 0.2496519477790328, + "grad_norm": 5.13847017288208, + "learning_rate": 7.505824168126095e-05, + "loss": 1.1943, + "num_input_tokens_seen": 57338760, + "step": 3564 + }, + { + "epoch": 0.24972199602476206, + "grad_norm": 3.7801826000213623, + "learning_rate": 7.505124343257443e-05, + "loss": 1.021, + "num_input_tokens_seen": 57355144, + "step": 3565 + }, + { + "epoch": 0.24979204427049131, + "grad_norm": 3.662065029144287, + "learning_rate": 7.504424518388792e-05, + "loss": 1.1461, + "num_input_tokens_seen": 57371528, + "step": 3566 + }, + { + "epoch": 0.24986209251622055, + "grad_norm": 4.548840522766113, + "learning_rate": 7.50372469352014e-05, + "loss": 1.058, + "num_input_tokens_seen": 57387912, + "step": 3567 + }, + { + "epoch": 0.2499321407619498, + "grad_norm": 4.732056140899658, + "learning_rate": 7.50302486865149e-05, + "loss": 1.0513, + "num_input_tokens_seen": 57403120, + "step": 3568 + }, + { + "epoch": 0.25000218900767907, + "grad_norm": 3.7986674308776855, + "learning_rate": 7.502325043782837e-05, + "loss": 0.9574, + "num_input_tokens_seen": 57418800, + "step": 3569 + }, + { + "epoch": 0.25007223725340827, + "grad_norm": 6.760079860687256, + "learning_rate": 7.501625218914186e-05, + "loss": 0.9101, + "num_input_tokens_seen": 57432608, + "step": 3570 + }, + { + "epoch": 0.25014228549913753, + "grad_norm": 4.0666985511779785, + "learning_rate": 7.500925394045535e-05, + "loss": 1.0564, + "num_input_tokens_seen": 57448296, + "step": 3571 + }, + { + "epoch": 0.2502123337448668, + "grad_norm": 3.7505650520324707, + "learning_rate": 7.500225569176882e-05, + "loss": 1.1593, + "num_input_tokens_seen": 57464680, + "step": 3572 + }, + { + "epoch": 0.25028238199059605, + "grad_norm": 5.1084675788879395, + "learning_rate": 7.499525744308231e-05, + "loss": 1.317, + "num_input_tokens_seen": 57481032, + "step": 3573 + }, + { + "epoch": 0.25035243023632525, + "grad_norm": 6.083080768585205, + "learning_rate": 7.49882591943958e-05, + "loss": 0.9305, + "num_input_tokens_seen": 57497416, + "step": 3574 + }, + { + "epoch": 0.2504224784820545, + "grad_norm": 4.197649955749512, + "learning_rate": 7.498126094570929e-05, + "loss": 1.0191, + "num_input_tokens_seen": 57513800, + "step": 3575 + }, + { + "epoch": 0.25049252672778377, + "grad_norm": 4.637972831726074, + "learning_rate": 7.497426269702278e-05, + "loss": 0.9914, + "num_input_tokens_seen": 57529832, + "step": 3576 + }, + { + "epoch": 0.25056257497351303, + "grad_norm": 4.096358776092529, + "learning_rate": 7.496726444833626e-05, + "loss": 1.1909, + "num_input_tokens_seen": 57545432, + "step": 3577 + }, + { + "epoch": 0.25063262321924223, + "grad_norm": 3.9253315925598145, + "learning_rate": 7.496026619964974e-05, + "loss": 1.1383, + "num_input_tokens_seen": 57561816, + "step": 3578 + }, + { + "epoch": 0.2507026714649715, + "grad_norm": 5.603836536407471, + "learning_rate": 7.495326795096321e-05, + "loss": 1.0744, + "num_input_tokens_seen": 57577336, + "step": 3579 + }, + { + "epoch": 0.25077271971070075, + "grad_norm": 4.588653564453125, + "learning_rate": 7.49462697022767e-05, + "loss": 1.0896, + "num_input_tokens_seen": 57593720, + "step": 3580 + }, + { + "epoch": 0.25084276795643, + "grad_norm": 3.989229917526245, + "learning_rate": 7.49392714535902e-05, + "loss": 0.9605, + "num_input_tokens_seen": 57609656, + "step": 3581 + }, + { + "epoch": 0.2509128162021592, + "grad_norm": 4.728183269500732, + "learning_rate": 7.493227320490368e-05, + "loss": 1.2626, + "num_input_tokens_seen": 57626040, + "step": 3582 + }, + { + "epoch": 0.2509828644478885, + "grad_norm": 4.269988059997559, + "learning_rate": 7.492527495621717e-05, + "loss": 1.0987, + "num_input_tokens_seen": 57641280, + "step": 3583 + }, + { + "epoch": 0.25105291269361774, + "grad_norm": 6.506377696990967, + "learning_rate": 7.491827670753066e-05, + "loss": 0.9327, + "num_input_tokens_seen": 57657664, + "step": 3584 + }, + { + "epoch": 0.251122960939347, + "grad_norm": 6.415282726287842, + "learning_rate": 7.491127845884413e-05, + "loss": 0.9515, + "num_input_tokens_seen": 57672704, + "step": 3585 + }, + { + "epoch": 0.2511930091850762, + "grad_norm": 3.969257116317749, + "learning_rate": 7.490428021015761e-05, + "loss": 1.1255, + "num_input_tokens_seen": 57687504, + "step": 3586 + }, + { + "epoch": 0.25126305743080546, + "grad_norm": 3.493469476699829, + "learning_rate": 7.489728196147111e-05, + "loss": 0.95, + "num_input_tokens_seen": 57703512, + "step": 3587 + }, + { + "epoch": 0.2513331056765347, + "grad_norm": 5.777353763580322, + "learning_rate": 7.48902837127846e-05, + "loss": 1.0089, + "num_input_tokens_seen": 57719344, + "step": 3588 + }, + { + "epoch": 0.251403153922264, + "grad_norm": 3.6840991973876953, + "learning_rate": 7.488328546409807e-05, + "loss": 1.0351, + "num_input_tokens_seen": 57734848, + "step": 3589 + }, + { + "epoch": 0.2514732021679932, + "grad_norm": 6.526551246643066, + "learning_rate": 7.487628721541156e-05, + "loss": 1.1651, + "num_input_tokens_seen": 57751232, + "step": 3590 + }, + { + "epoch": 0.25154325041372244, + "grad_norm": 3.7879719734191895, + "learning_rate": 7.486928896672505e-05, + "loss": 1.0128, + "num_input_tokens_seen": 57767616, + "step": 3591 + }, + { + "epoch": 0.2516132986594517, + "grad_norm": 6.891875267028809, + "learning_rate": 7.486229071803853e-05, + "loss": 1.2037, + "num_input_tokens_seen": 57783592, + "step": 3592 + }, + { + "epoch": 0.25168334690518096, + "grad_norm": 4.700318336486816, + "learning_rate": 7.485529246935201e-05, + "loss": 1.0291, + "num_input_tokens_seen": 57799976, + "step": 3593 + }, + { + "epoch": 0.25175339515091016, + "grad_norm": 6.47390604019165, + "learning_rate": 7.48482942206655e-05, + "loss": 0.9828, + "num_input_tokens_seen": 57816360, + "step": 3594 + }, + { + "epoch": 0.2518234433966394, + "grad_norm": 5.045449733734131, + "learning_rate": 7.484129597197899e-05, + "loss": 0.9569, + "num_input_tokens_seen": 57832016, + "step": 3595 + }, + { + "epoch": 0.2518934916423687, + "grad_norm": 4.258456230163574, + "learning_rate": 7.483429772329247e-05, + "loss": 0.9804, + "num_input_tokens_seen": 57848400, + "step": 3596 + }, + { + "epoch": 0.25196353988809794, + "grad_norm": 3.948582649230957, + "learning_rate": 7.482729947460596e-05, + "loss": 0.9898, + "num_input_tokens_seen": 57864784, + "step": 3597 + }, + { + "epoch": 0.25203358813382715, + "grad_norm": 4.017141342163086, + "learning_rate": 7.482030122591944e-05, + "loss": 0.8644, + "num_input_tokens_seen": 57879696, + "step": 3598 + }, + { + "epoch": 0.2521036363795564, + "grad_norm": 3.7428297996520996, + "learning_rate": 7.481330297723292e-05, + "loss": 0.9318, + "num_input_tokens_seen": 57896080, + "step": 3599 + }, + { + "epoch": 0.25217368462528567, + "grad_norm": 4.883368968963623, + "learning_rate": 7.480630472854641e-05, + "loss": 0.9771, + "num_input_tokens_seen": 57911976, + "step": 3600 + }, + { + "epoch": 0.25217368462528567, + "eval_loss": 1.136000633239746, + "eval_runtime": 0.2016, + "eval_samples_per_second": 4.959, + "eval_steps_per_second": 4.959, + "num_input_tokens_seen": 57911976, + "step": 3600 + }, + { + "epoch": 0.2522437328710149, + "grad_norm": 4.399716377258301, + "learning_rate": 7.479930647985991e-05, + "loss": 0.9965, + "num_input_tokens_seen": 57927440, + "step": 3601 + }, + { + "epoch": 0.2523137811167442, + "grad_norm": 6.019199371337891, + "learning_rate": 7.479230823117338e-05, + "loss": 1.1172, + "num_input_tokens_seen": 57943824, + "step": 3602 + }, + { + "epoch": 0.2523838293624734, + "grad_norm": 4.42507266998291, + "learning_rate": 7.478530998248687e-05, + "loss": 1.1294, + "num_input_tokens_seen": 57960208, + "step": 3603 + }, + { + "epoch": 0.25245387760820265, + "grad_norm": 4.0232062339782715, + "learning_rate": 7.477831173380036e-05, + "loss": 1.031, + "num_input_tokens_seen": 57976560, + "step": 3604 + }, + { + "epoch": 0.2525239258539319, + "grad_norm": 3.6392862796783447, + "learning_rate": 7.477131348511384e-05, + "loss": 0.8717, + "num_input_tokens_seen": 57992944, + "step": 3605 + }, + { + "epoch": 0.25259397409966117, + "grad_norm": 3.849912643432617, + "learning_rate": 7.476431523642731e-05, + "loss": 0.994, + "num_input_tokens_seen": 58009328, + "step": 3606 + }, + { + "epoch": 0.25266402234539037, + "grad_norm": 3.5331156253814697, + "learning_rate": 7.475731698774081e-05, + "loss": 0.8999, + "num_input_tokens_seen": 58025152, + "step": 3607 + }, + { + "epoch": 0.25273407059111963, + "grad_norm": 4.343970775604248, + "learning_rate": 7.47503187390543e-05, + "loss": 1.0231, + "num_input_tokens_seen": 58041536, + "step": 3608 + }, + { + "epoch": 0.2528041188368489, + "grad_norm": 3.6736862659454346, + "learning_rate": 7.474332049036778e-05, + "loss": 1.161, + "num_input_tokens_seen": 58057920, + "step": 3609 + }, + { + "epoch": 0.25287416708257815, + "grad_norm": 6.599121570587158, + "learning_rate": 7.473632224168127e-05, + "loss": 1.2235, + "num_input_tokens_seen": 58073784, + "step": 3610 + }, + { + "epoch": 0.25294421532830735, + "grad_norm": 4.2448930740356445, + "learning_rate": 7.472932399299475e-05, + "loss": 1.0207, + "num_input_tokens_seen": 58088776, + "step": 3611 + }, + { + "epoch": 0.2530142635740366, + "grad_norm": 3.416584014892578, + "learning_rate": 7.472232574430823e-05, + "loss": 0.984, + "num_input_tokens_seen": 58105160, + "step": 3612 + }, + { + "epoch": 0.2530843118197659, + "grad_norm": 3.9348700046539307, + "learning_rate": 7.471532749562172e-05, + "loss": 1.0883, + "num_input_tokens_seen": 58121528, + "step": 3613 + }, + { + "epoch": 0.25315436006549513, + "grad_norm": 6.208236217498779, + "learning_rate": 7.470832924693521e-05, + "loss": 1.1842, + "num_input_tokens_seen": 58137912, + "step": 3614 + }, + { + "epoch": 0.25322440831122434, + "grad_norm": 3.9069888591766357, + "learning_rate": 7.47013309982487e-05, + "loss": 0.9958, + "num_input_tokens_seen": 58154056, + "step": 3615 + }, + { + "epoch": 0.2532944565569536, + "grad_norm": 4.482925891876221, + "learning_rate": 7.469433274956217e-05, + "loss": 1.0365, + "num_input_tokens_seen": 58168904, + "step": 3616 + }, + { + "epoch": 0.25336450480268286, + "grad_norm": 4.082488536834717, + "learning_rate": 7.468733450087566e-05, + "loss": 0.9116, + "num_input_tokens_seen": 58185288, + "step": 3617 + }, + { + "epoch": 0.2534345530484121, + "grad_norm": 5.994426250457764, + "learning_rate": 7.468033625218915e-05, + "loss": 1.1286, + "num_input_tokens_seen": 58201600, + "step": 3618 + }, + { + "epoch": 0.2535046012941413, + "grad_norm": 3.966487169265747, + "learning_rate": 7.467333800350262e-05, + "loss": 1.061, + "num_input_tokens_seen": 58217752, + "step": 3619 + }, + { + "epoch": 0.2535746495398706, + "grad_norm": 4.3370537757873535, + "learning_rate": 7.466633975481611e-05, + "loss": 0.9495, + "num_input_tokens_seen": 58233672, + "step": 3620 + }, + { + "epoch": 0.25364469778559984, + "grad_norm": 4.638936519622803, + "learning_rate": 7.465934150612961e-05, + "loss": 1.1593, + "num_input_tokens_seen": 58249904, + "step": 3621 + }, + { + "epoch": 0.2537147460313291, + "grad_norm": 3.42993426322937, + "learning_rate": 7.465234325744309e-05, + "loss": 0.9112, + "num_input_tokens_seen": 58265272, + "step": 3622 + }, + { + "epoch": 0.2537847942770583, + "grad_norm": 4.637670516967773, + "learning_rate": 7.464534500875656e-05, + "loss": 1.1578, + "num_input_tokens_seen": 58281656, + "step": 3623 + }, + { + "epoch": 0.25385484252278756, + "grad_norm": 4.470972061157227, + "learning_rate": 7.463834676007005e-05, + "loss": 0.9973, + "num_input_tokens_seen": 58297696, + "step": 3624 + }, + { + "epoch": 0.2539248907685168, + "grad_norm": 4.158536434173584, + "learning_rate": 7.463134851138354e-05, + "loss": 1.2625, + "num_input_tokens_seen": 58313960, + "step": 3625 + }, + { + "epoch": 0.2539949390142461, + "grad_norm": 5.2940850257873535, + "learning_rate": 7.462435026269702e-05, + "loss": 1.1649, + "num_input_tokens_seen": 58329928, + "step": 3626 + }, + { + "epoch": 0.2540649872599753, + "grad_norm": 4.270470142364502, + "learning_rate": 7.461735201401052e-05, + "loss": 0.9042, + "num_input_tokens_seen": 58345544, + "step": 3627 + }, + { + "epoch": 0.25413503550570454, + "grad_norm": 4.488008975982666, + "learning_rate": 7.4610353765324e-05, + "loss": 1.2652, + "num_input_tokens_seen": 58361736, + "step": 3628 + }, + { + "epoch": 0.2542050837514338, + "grad_norm": 3.9760642051696777, + "learning_rate": 7.460335551663748e-05, + "loss": 0.9522, + "num_input_tokens_seen": 58377888, + "step": 3629 + }, + { + "epoch": 0.25427513199716306, + "grad_norm": 4.022678852081299, + "learning_rate": 7.459635726795097e-05, + "loss": 1.0673, + "num_input_tokens_seen": 58393744, + "step": 3630 + }, + { + "epoch": 0.25434518024289227, + "grad_norm": 6.345690727233887, + "learning_rate": 7.458935901926446e-05, + "loss": 1.052, + "num_input_tokens_seen": 58410064, + "step": 3631 + }, + { + "epoch": 0.2544152284886215, + "grad_norm": 4.0159101486206055, + "learning_rate": 7.458236077057793e-05, + "loss": 1.1164, + "num_input_tokens_seen": 58426352, + "step": 3632 + }, + { + "epoch": 0.2544852767343508, + "grad_norm": 4.125208854675293, + "learning_rate": 7.457536252189142e-05, + "loss": 1.0113, + "num_input_tokens_seen": 58441936, + "step": 3633 + }, + { + "epoch": 0.25455532498008004, + "grad_norm": 4.429535865783691, + "learning_rate": 7.456836427320491e-05, + "loss": 1.158, + "num_input_tokens_seen": 58457136, + "step": 3634 + }, + { + "epoch": 0.2546253732258093, + "grad_norm": 3.655606269836426, + "learning_rate": 7.45613660245184e-05, + "loss": 1.0467, + "num_input_tokens_seen": 58473520, + "step": 3635 + }, + { + "epoch": 0.2546954214715385, + "grad_norm": 3.688188314437866, + "learning_rate": 7.455436777583187e-05, + "loss": 0.9309, + "num_input_tokens_seen": 58489904, + "step": 3636 + }, + { + "epoch": 0.25476546971726777, + "grad_norm": 3.95440411567688, + "learning_rate": 7.454736952714536e-05, + "loss": 1.2586, + "num_input_tokens_seen": 58506032, + "step": 3637 + }, + { + "epoch": 0.254835517962997, + "grad_norm": 3.950641632080078, + "learning_rate": 7.454037127845885e-05, + "loss": 0.9397, + "num_input_tokens_seen": 58521464, + "step": 3638 + }, + { + "epoch": 0.2549055662087263, + "grad_norm": 4.9607038497924805, + "learning_rate": 7.453337302977233e-05, + "loss": 1.0498, + "num_input_tokens_seen": 58537848, + "step": 3639 + }, + { + "epoch": 0.2549756144544555, + "grad_norm": 3.4168713092803955, + "learning_rate": 7.452637478108582e-05, + "loss": 0.8983, + "num_input_tokens_seen": 58554232, + "step": 3640 + }, + { + "epoch": 0.25504566270018475, + "grad_norm": 6.897549152374268, + "learning_rate": 7.451937653239932e-05, + "loss": 1.2782, + "num_input_tokens_seen": 58570616, + "step": 3641 + }, + { + "epoch": 0.255115710945914, + "grad_norm": 4.009060859680176, + "learning_rate": 7.451237828371279e-05, + "loss": 1.0205, + "num_input_tokens_seen": 58587000, + "step": 3642 + }, + { + "epoch": 0.25518575919164327, + "grad_norm": 4.245255470275879, + "learning_rate": 7.450538003502627e-05, + "loss": 0.98, + "num_input_tokens_seen": 58602768, + "step": 3643 + }, + { + "epoch": 0.2552558074373725, + "grad_norm": 3.7547385692596436, + "learning_rate": 7.449838178633976e-05, + "loss": 1.0763, + "num_input_tokens_seen": 58619024, + "step": 3644 + }, + { + "epoch": 0.25532585568310173, + "grad_norm": 5.7543745040893555, + "learning_rate": 7.449138353765324e-05, + "loss": 1.1535, + "num_input_tokens_seen": 58635408, + "step": 3645 + }, + { + "epoch": 0.255395903928831, + "grad_norm": 3.8786420822143555, + "learning_rate": 7.448438528896672e-05, + "loss": 1.0385, + "num_input_tokens_seen": 58651392, + "step": 3646 + }, + { + "epoch": 0.25546595217456025, + "grad_norm": 4.290858745574951, + "learning_rate": 7.447738704028022e-05, + "loss": 0.9459, + "num_input_tokens_seen": 58667712, + "step": 3647 + }, + { + "epoch": 0.25553600042028946, + "grad_norm": 3.8005576133728027, + "learning_rate": 7.447038879159371e-05, + "loss": 1.1709, + "num_input_tokens_seen": 58683512, + "step": 3648 + }, + { + "epoch": 0.2556060486660187, + "grad_norm": 3.574735403060913, + "learning_rate": 7.446339054290719e-05, + "loss": 1.0276, + "num_input_tokens_seen": 58699296, + "step": 3649 + }, + { + "epoch": 0.255676096911748, + "grad_norm": 4.487549304962158, + "learning_rate": 7.445639229422066e-05, + "loss": 1.0608, + "num_input_tokens_seen": 58715680, + "step": 3650 + }, + { + "epoch": 0.25574614515747723, + "grad_norm": 3.80549955368042, + "learning_rate": 7.444939404553415e-05, + "loss": 1.0916, + "num_input_tokens_seen": 58732064, + "step": 3651 + }, + { + "epoch": 0.25581619340320644, + "grad_norm": 6.745276927947998, + "learning_rate": 7.444239579684764e-05, + "loss": 0.9649, + "num_input_tokens_seen": 58748416, + "step": 3652 + }, + { + "epoch": 0.2558862416489357, + "grad_norm": 5.366410732269287, + "learning_rate": 7.443539754816113e-05, + "loss": 1.1205, + "num_input_tokens_seen": 58764800, + "step": 3653 + }, + { + "epoch": 0.25595628989466496, + "grad_norm": 4.889951705932617, + "learning_rate": 7.442839929947462e-05, + "loss": 1.0447, + "num_input_tokens_seen": 58779776, + "step": 3654 + }, + { + "epoch": 0.2560263381403942, + "grad_norm": 3.776078462600708, + "learning_rate": 7.44214010507881e-05, + "loss": 0.9146, + "num_input_tokens_seen": 58796160, + "step": 3655 + }, + { + "epoch": 0.2560963863861234, + "grad_norm": 4.999850749969482, + "learning_rate": 7.441440280210158e-05, + "loss": 1.053, + "num_input_tokens_seen": 58812544, + "step": 3656 + }, + { + "epoch": 0.2561664346318527, + "grad_norm": 4.111214637756348, + "learning_rate": 7.440740455341507e-05, + "loss": 1.02, + "num_input_tokens_seen": 58828696, + "step": 3657 + }, + { + "epoch": 0.25623648287758194, + "grad_norm": 4.49043083190918, + "learning_rate": 7.440040630472856e-05, + "loss": 0.8889, + "num_input_tokens_seen": 58845080, + "step": 3658 + }, + { + "epoch": 0.2563065311233112, + "grad_norm": 4.440788745880127, + "learning_rate": 7.439340805604203e-05, + "loss": 1.0635, + "num_input_tokens_seen": 58861464, + "step": 3659 + }, + { + "epoch": 0.2563765793690404, + "grad_norm": 5.642586708068848, + "learning_rate": 7.438640980735552e-05, + "loss": 1.3676, + "num_input_tokens_seen": 58877624, + "step": 3660 + }, + { + "epoch": 0.25644662761476966, + "grad_norm": 3.8768467903137207, + "learning_rate": 7.437941155866901e-05, + "loss": 0.9737, + "num_input_tokens_seen": 58894008, + "step": 3661 + }, + { + "epoch": 0.2565166758604989, + "grad_norm": 3.9855473041534424, + "learning_rate": 7.43724133099825e-05, + "loss": 1.0987, + "num_input_tokens_seen": 58909600, + "step": 3662 + }, + { + "epoch": 0.2565867241062282, + "grad_norm": 3.6692938804626465, + "learning_rate": 7.436541506129597e-05, + "loss": 1.0541, + "num_input_tokens_seen": 58925776, + "step": 3663 + }, + { + "epoch": 0.2566567723519574, + "grad_norm": 3.87776517868042, + "learning_rate": 7.435841681260946e-05, + "loss": 1.0616, + "num_input_tokens_seen": 58941048, + "step": 3664 + }, + { + "epoch": 0.25672682059768664, + "grad_norm": 3.5173263549804688, + "learning_rate": 7.435141856392295e-05, + "loss": 0.9046, + "num_input_tokens_seen": 58957432, + "step": 3665 + }, + { + "epoch": 0.2567968688434159, + "grad_norm": 4.312611103057861, + "learning_rate": 7.434442031523642e-05, + "loss": 0.8224, + "num_input_tokens_seen": 58973816, + "step": 3666 + }, + { + "epoch": 0.25686691708914516, + "grad_norm": 3.7889907360076904, + "learning_rate": 7.433742206654991e-05, + "loss": 1.1431, + "num_input_tokens_seen": 58989472, + "step": 3667 + }, + { + "epoch": 0.25693696533487437, + "grad_norm": 4.997755527496338, + "learning_rate": 7.433042381786341e-05, + "loss": 1.2147, + "num_input_tokens_seen": 59005856, + "step": 3668 + }, + { + "epoch": 0.2570070135806036, + "grad_norm": 5.839511871337891, + "learning_rate": 7.432342556917689e-05, + "loss": 1.0974, + "num_input_tokens_seen": 59022176, + "step": 3669 + }, + { + "epoch": 0.2570770618263329, + "grad_norm": 4.185897350311279, + "learning_rate": 7.431642732049036e-05, + "loss": 1.0769, + "num_input_tokens_seen": 59038296, + "step": 3670 + }, + { + "epoch": 0.25714711007206215, + "grad_norm": 3.6666383743286133, + "learning_rate": 7.430942907180385e-05, + "loss": 1.0051, + "num_input_tokens_seen": 59054680, + "step": 3671 + }, + { + "epoch": 0.2572171583177914, + "grad_norm": 3.8587453365325928, + "learning_rate": 7.430243082311734e-05, + "loss": 1.1001, + "num_input_tokens_seen": 59070912, + "step": 3672 + }, + { + "epoch": 0.2572872065635206, + "grad_norm": 3.6518352031707764, + "learning_rate": 7.429543257443083e-05, + "loss": 1.042, + "num_input_tokens_seen": 59087296, + "step": 3673 + }, + { + "epoch": 0.25735725480924987, + "grad_norm": 4.629798412322998, + "learning_rate": 7.428843432574432e-05, + "loss": 1.2649, + "num_input_tokens_seen": 59103632, + "step": 3674 + }, + { + "epoch": 0.25742730305497913, + "grad_norm": 6.353034496307373, + "learning_rate": 7.428143607705781e-05, + "loss": 1.3823, + "num_input_tokens_seen": 59120016, + "step": 3675 + }, + { + "epoch": 0.2574973513007084, + "grad_norm": 6.1848273277282715, + "learning_rate": 7.427443782837128e-05, + "loss": 1.2275, + "num_input_tokens_seen": 59136232, + "step": 3676 + }, + { + "epoch": 0.2575673995464376, + "grad_norm": 3.6022186279296875, + "learning_rate": 7.426743957968476e-05, + "loss": 0.9513, + "num_input_tokens_seen": 59152616, + "step": 3677 + }, + { + "epoch": 0.25763744779216685, + "grad_norm": 3.6495468616485596, + "learning_rate": 7.426044133099825e-05, + "loss": 1.0282, + "num_input_tokens_seen": 59167792, + "step": 3678 + }, + { + "epoch": 0.2577074960378961, + "grad_norm": 4.675189018249512, + "learning_rate": 7.425344308231174e-05, + "loss": 1.1248, + "num_input_tokens_seen": 59184176, + "step": 3679 + }, + { + "epoch": 0.25777754428362537, + "grad_norm": 3.657700538635254, + "learning_rate": 7.424644483362522e-05, + "loss": 1.0445, + "num_input_tokens_seen": 59199632, + "step": 3680 + }, + { + "epoch": 0.2578475925293546, + "grad_norm": 3.9934394359588623, + "learning_rate": 7.423944658493871e-05, + "loss": 1.0598, + "num_input_tokens_seen": 59215720, + "step": 3681 + }, + { + "epoch": 0.25791764077508383, + "grad_norm": 3.777191400527954, + "learning_rate": 7.42324483362522e-05, + "loss": 1.2255, + "num_input_tokens_seen": 59231248, + "step": 3682 + }, + { + "epoch": 0.2579876890208131, + "grad_norm": 3.9812276363372803, + "learning_rate": 7.422545008756568e-05, + "loss": 1.1253, + "num_input_tokens_seen": 59247280, + "step": 3683 + }, + { + "epoch": 0.25805773726654235, + "grad_norm": 3.631455183029175, + "learning_rate": 7.421845183887916e-05, + "loss": 1.0559, + "num_input_tokens_seen": 59263664, + "step": 3684 + }, + { + "epoch": 0.25812778551227156, + "grad_norm": 3.803898334503174, + "learning_rate": 7.421145359019265e-05, + "loss": 0.9847, + "num_input_tokens_seen": 59279880, + "step": 3685 + }, + { + "epoch": 0.2581978337580008, + "grad_norm": 3.649956703186035, + "learning_rate": 7.420445534150613e-05, + "loss": 0.951, + "num_input_tokens_seen": 59296216, + "step": 3686 + }, + { + "epoch": 0.2582678820037301, + "grad_norm": 4.010924339294434, + "learning_rate": 7.419745709281962e-05, + "loss": 1.1987, + "num_input_tokens_seen": 59312448, + "step": 3687 + }, + { + "epoch": 0.25833793024945934, + "grad_norm": 4.2410759925842285, + "learning_rate": 7.41904588441331e-05, + "loss": 0.9677, + "num_input_tokens_seen": 59328456, + "step": 3688 + }, + { + "epoch": 0.25840797849518854, + "grad_norm": 3.9170684814453125, + "learning_rate": 7.41834605954466e-05, + "loss": 1.0795, + "num_input_tokens_seen": 59344840, + "step": 3689 + }, + { + "epoch": 0.2584780267409178, + "grad_norm": 3.935624837875366, + "learning_rate": 7.417646234676007e-05, + "loss": 1.1149, + "num_input_tokens_seen": 59360040, + "step": 3690 + }, + { + "epoch": 0.25854807498664706, + "grad_norm": 4.3747782707214355, + "learning_rate": 7.416946409807356e-05, + "loss": 1.2462, + "num_input_tokens_seen": 59375896, + "step": 3691 + }, + { + "epoch": 0.2586181232323763, + "grad_norm": 7.553433418273926, + "learning_rate": 7.416246584938705e-05, + "loss": 1.4753, + "num_input_tokens_seen": 59391144, + "step": 3692 + }, + { + "epoch": 0.2586881714781055, + "grad_norm": 3.4443981647491455, + "learning_rate": 7.415546760070053e-05, + "loss": 1.0629, + "num_input_tokens_seen": 59407528, + "step": 3693 + }, + { + "epoch": 0.2587582197238348, + "grad_norm": 4.02165412902832, + "learning_rate": 7.414846935201401e-05, + "loss": 0.9951, + "num_input_tokens_seen": 59422824, + "step": 3694 + }, + { + "epoch": 0.25882826796956404, + "grad_norm": 3.8880200386047363, + "learning_rate": 7.414147110332751e-05, + "loss": 0.941, + "num_input_tokens_seen": 59439208, + "step": 3695 + }, + { + "epoch": 0.2588983162152933, + "grad_norm": 5.463441371917725, + "learning_rate": 7.413447285464099e-05, + "loss": 0.9333, + "num_input_tokens_seen": 59455592, + "step": 3696 + }, + { + "epoch": 0.2589683644610225, + "grad_norm": 7.555225372314453, + "learning_rate": 7.412747460595446e-05, + "loss": 1.2278, + "num_input_tokens_seen": 59471976, + "step": 3697 + }, + { + "epoch": 0.25903841270675176, + "grad_norm": 5.7154436111450195, + "learning_rate": 7.412047635726795e-05, + "loss": 1.151, + "num_input_tokens_seen": 59488360, + "step": 3698 + }, + { + "epoch": 0.259108460952481, + "grad_norm": 5.09559965133667, + "learning_rate": 7.411347810858144e-05, + "loss": 1.0998, + "num_input_tokens_seen": 59504536, + "step": 3699 + }, + { + "epoch": 0.2591785091982103, + "grad_norm": 4.7749738693237305, + "learning_rate": 7.410647985989493e-05, + "loss": 1.2971, + "num_input_tokens_seen": 59520488, + "step": 3700 + }, + { + "epoch": 0.2592485574439395, + "grad_norm": 4.323631763458252, + "learning_rate": 7.409948161120842e-05, + "loss": 1.1687, + "num_input_tokens_seen": 59535384, + "step": 3701 + }, + { + "epoch": 0.25931860568966875, + "grad_norm": 3.511822462081909, + "learning_rate": 7.40924833625219e-05, + "loss": 1.0547, + "num_input_tokens_seen": 59550888, + "step": 3702 + }, + { + "epoch": 0.259388653935398, + "grad_norm": 4.039402008056641, + "learning_rate": 7.408548511383538e-05, + "loss": 0.8453, + "num_input_tokens_seen": 59567184, + "step": 3703 + }, + { + "epoch": 0.25945870218112727, + "grad_norm": 3.6692605018615723, + "learning_rate": 7.407848686514885e-05, + "loss": 0.9705, + "num_input_tokens_seen": 59583568, + "step": 3704 + }, + { + "epoch": 0.2595287504268565, + "grad_norm": 4.414707660675049, + "learning_rate": 7.407148861646234e-05, + "loss": 0.8734, + "num_input_tokens_seen": 59599088, + "step": 3705 + }, + { + "epoch": 0.25959879867258573, + "grad_norm": 4.073670387268066, + "learning_rate": 7.406449036777583e-05, + "loss": 1.2958, + "num_input_tokens_seen": 59615432, + "step": 3706 + }, + { + "epoch": 0.259668846918315, + "grad_norm": 4.436419486999512, + "learning_rate": 7.405749211908932e-05, + "loss": 1.0019, + "num_input_tokens_seen": 59631816, + "step": 3707 + }, + { + "epoch": 0.25973889516404425, + "grad_norm": 5.866218090057373, + "learning_rate": 7.405049387040281e-05, + "loss": 1.043, + "num_input_tokens_seen": 59648200, + "step": 3708 + }, + { + "epoch": 0.2598089434097735, + "grad_norm": 4.133188247680664, + "learning_rate": 7.40434956217163e-05, + "loss": 1.1168, + "num_input_tokens_seen": 59664584, + "step": 3709 + }, + { + "epoch": 0.2598789916555027, + "grad_norm": 4.1976213455200195, + "learning_rate": 7.403649737302977e-05, + "loss": 1.1118, + "num_input_tokens_seen": 59680288, + "step": 3710 + }, + { + "epoch": 0.25994903990123197, + "grad_norm": 3.990983009338379, + "learning_rate": 7.402949912434326e-05, + "loss": 0.9963, + "num_input_tokens_seen": 59696408, + "step": 3711 + }, + { + "epoch": 0.26001908814696123, + "grad_norm": 4.427793025970459, + "learning_rate": 7.402250087565675e-05, + "loss": 1.1771, + "num_input_tokens_seen": 59712792, + "step": 3712 + }, + { + "epoch": 0.2600891363926905, + "grad_norm": 5.360867023468018, + "learning_rate": 7.401550262697024e-05, + "loss": 1.1428, + "num_input_tokens_seen": 59728968, + "step": 3713 + }, + { + "epoch": 0.2601591846384197, + "grad_norm": 3.8442916870117188, + "learning_rate": 7.400850437828371e-05, + "loss": 0.9544, + "num_input_tokens_seen": 59745352, + "step": 3714 + }, + { + "epoch": 0.26022923288414895, + "grad_norm": 3.7610833644866943, + "learning_rate": 7.40015061295972e-05, + "loss": 0.9969, + "num_input_tokens_seen": 59761736, + "step": 3715 + }, + { + "epoch": 0.2602992811298782, + "grad_norm": 3.9050705432891846, + "learning_rate": 7.399450788091069e-05, + "loss": 1.2099, + "num_input_tokens_seen": 59778000, + "step": 3716 + }, + { + "epoch": 0.2603693293756075, + "grad_norm": 4.293839454650879, + "learning_rate": 7.398750963222417e-05, + "loss": 1.0274, + "num_input_tokens_seen": 59794216, + "step": 3717 + }, + { + "epoch": 0.2604393776213367, + "grad_norm": 3.7403993606567383, + "learning_rate": 7.398051138353765e-05, + "loss": 1.0172, + "num_input_tokens_seen": 59810600, + "step": 3718 + }, + { + "epoch": 0.26050942586706594, + "grad_norm": 5.266970157623291, + "learning_rate": 7.397351313485114e-05, + "loss": 0.8695, + "num_input_tokens_seen": 59826984, + "step": 3719 + }, + { + "epoch": 0.2605794741127952, + "grad_norm": 4.385645866394043, + "learning_rate": 7.396651488616463e-05, + "loss": 1.0625, + "num_input_tokens_seen": 59843368, + "step": 3720 + }, + { + "epoch": 0.26064952235852445, + "grad_norm": 4.349147796630859, + "learning_rate": 7.39595166374781e-05, + "loss": 1.2092, + "num_input_tokens_seen": 59859136, + "step": 3721 + }, + { + "epoch": 0.26071957060425366, + "grad_norm": 4.69277286529541, + "learning_rate": 7.395251838879161e-05, + "loss": 1.1171, + "num_input_tokens_seen": 59875024, + "step": 3722 + }, + { + "epoch": 0.2607896188499829, + "grad_norm": 3.602949857711792, + "learning_rate": 7.394552014010508e-05, + "loss": 1.0994, + "num_input_tokens_seen": 59891408, + "step": 3723 + }, + { + "epoch": 0.2608596670957122, + "grad_norm": 4.137026786804199, + "learning_rate": 7.393852189141856e-05, + "loss": 1.0414, + "num_input_tokens_seen": 59906360, + "step": 3724 + }, + { + "epoch": 0.26092971534144144, + "grad_norm": 4.558672904968262, + "learning_rate": 7.393152364273205e-05, + "loss": 1.2051, + "num_input_tokens_seen": 59922744, + "step": 3725 + }, + { + "epoch": 0.26099976358717064, + "grad_norm": 3.977217197418213, + "learning_rate": 7.392452539404554e-05, + "loss": 0.9036, + "num_input_tokens_seen": 59938448, + "step": 3726 + }, + { + "epoch": 0.2610698118328999, + "grad_norm": 6.573578834533691, + "learning_rate": 7.391752714535902e-05, + "loss": 0.9693, + "num_input_tokens_seen": 59954832, + "step": 3727 + }, + { + "epoch": 0.26113986007862916, + "grad_norm": 4.253365516662598, + "learning_rate": 7.391052889667251e-05, + "loss": 1.1001, + "num_input_tokens_seen": 59971216, + "step": 3728 + }, + { + "epoch": 0.2612099083243584, + "grad_norm": 4.279355525970459, + "learning_rate": 7.3903530647986e-05, + "loss": 1.0456, + "num_input_tokens_seen": 59987384, + "step": 3729 + }, + { + "epoch": 0.2612799565700876, + "grad_norm": 5.5035505294799805, + "learning_rate": 7.389653239929948e-05, + "loss": 1.236, + "num_input_tokens_seen": 60003720, + "step": 3730 + }, + { + "epoch": 0.2613500048158169, + "grad_norm": 5.064812660217285, + "learning_rate": 7.388953415061295e-05, + "loss": 0.8739, + "num_input_tokens_seen": 60020104, + "step": 3731 + }, + { + "epoch": 0.26142005306154614, + "grad_norm": 4.716748237609863, + "learning_rate": 7.388253590192644e-05, + "loss": 1.2417, + "num_input_tokens_seen": 60036488, + "step": 3732 + }, + { + "epoch": 0.2614901013072754, + "grad_norm": 4.0947489738464355, + "learning_rate": 7.387553765323994e-05, + "loss": 1.1332, + "num_input_tokens_seen": 60052384, + "step": 3733 + }, + { + "epoch": 0.2615601495530046, + "grad_norm": 3.757126808166504, + "learning_rate": 7.386853940455342e-05, + "loss": 1.0442, + "num_input_tokens_seen": 60068624, + "step": 3734 + }, + { + "epoch": 0.26163019779873387, + "grad_norm": 7.364987850189209, + "learning_rate": 7.38615411558669e-05, + "loss": 1.0285, + "num_input_tokens_seen": 60084248, + "step": 3735 + }, + { + "epoch": 0.2617002460444631, + "grad_norm": 4.630516052246094, + "learning_rate": 7.38545429071804e-05, + "loss": 1.1585, + "num_input_tokens_seen": 60100632, + "step": 3736 + }, + { + "epoch": 0.2617702942901924, + "grad_norm": 5.3436760902404785, + "learning_rate": 7.384754465849387e-05, + "loss": 0.9723, + "num_input_tokens_seen": 60116672, + "step": 3737 + }, + { + "epoch": 0.2618403425359216, + "grad_norm": 3.843344211578369, + "learning_rate": 7.384054640980736e-05, + "loss": 0.8992, + "num_input_tokens_seen": 60133056, + "step": 3738 + }, + { + "epoch": 0.26191039078165085, + "grad_norm": 4.561652183532715, + "learning_rate": 7.383354816112085e-05, + "loss": 1.2304, + "num_input_tokens_seen": 60149440, + "step": 3739 + }, + { + "epoch": 0.2619804390273801, + "grad_norm": 3.951719045639038, + "learning_rate": 7.382654991243434e-05, + "loss": 0.8449, + "num_input_tokens_seen": 60165824, + "step": 3740 + }, + { + "epoch": 0.26205048727310937, + "grad_norm": 3.702449321746826, + "learning_rate": 7.381955166374781e-05, + "loss": 1.1251, + "num_input_tokens_seen": 60181496, + "step": 3741 + }, + { + "epoch": 0.2621205355188386, + "grad_norm": 5.43525505065918, + "learning_rate": 7.38125534150613e-05, + "loss": 1.1107, + "num_input_tokens_seen": 60197040, + "step": 3742 + }, + { + "epoch": 0.26219058376456783, + "grad_norm": 3.9709503650665283, + "learning_rate": 7.380555516637479e-05, + "loss": 1.1172, + "num_input_tokens_seen": 60213424, + "step": 3743 + }, + { + "epoch": 0.2622606320102971, + "grad_norm": 3.7183797359466553, + "learning_rate": 7.379855691768826e-05, + "loss": 1.0234, + "num_input_tokens_seen": 60229696, + "step": 3744 + }, + { + "epoch": 0.26233068025602635, + "grad_norm": 3.933479070663452, + "learning_rate": 7.379155866900175e-05, + "loss": 1.0702, + "num_input_tokens_seen": 60246080, + "step": 3745 + }, + { + "epoch": 0.2624007285017556, + "grad_norm": 4.837695598602295, + "learning_rate": 7.378456042031524e-05, + "loss": 1.1017, + "num_input_tokens_seen": 60262464, + "step": 3746 + }, + { + "epoch": 0.2624707767474848, + "grad_norm": 4.791194438934326, + "learning_rate": 7.377756217162873e-05, + "loss": 1.2467, + "num_input_tokens_seen": 60278600, + "step": 3747 + }, + { + "epoch": 0.2625408249932141, + "grad_norm": 4.53259801864624, + "learning_rate": 7.37705639229422e-05, + "loss": 1.1742, + "num_input_tokens_seen": 60293856, + "step": 3748 + }, + { + "epoch": 0.26261087323894333, + "grad_norm": 3.87522554397583, + "learning_rate": 7.37635656742557e-05, + "loss": 1.1901, + "num_input_tokens_seen": 60309888, + "step": 3749 + }, + { + "epoch": 0.2626809214846726, + "grad_norm": 4.46868896484375, + "learning_rate": 7.375656742556918e-05, + "loss": 0.9265, + "num_input_tokens_seen": 60325784, + "step": 3750 + }, + { + "epoch": 0.2627509697304018, + "grad_norm": 3.938703775405884, + "learning_rate": 7.374956917688266e-05, + "loss": 0.9785, + "num_input_tokens_seen": 60340696, + "step": 3751 + }, + { + "epoch": 0.26282101797613105, + "grad_norm": 3.5147759914398193, + "learning_rate": 7.374257092819614e-05, + "loss": 1.0984, + "num_input_tokens_seen": 60357080, + "step": 3752 + }, + { + "epoch": 0.2628910662218603, + "grad_norm": 4.008304119110107, + "learning_rate": 7.373557267950965e-05, + "loss": 1.046, + "num_input_tokens_seen": 60373464, + "step": 3753 + }, + { + "epoch": 0.2629611144675896, + "grad_norm": 3.9318859577178955, + "learning_rate": 7.372857443082312e-05, + "loss": 1.001, + "num_input_tokens_seen": 60389848, + "step": 3754 + }, + { + "epoch": 0.2630311627133188, + "grad_norm": 4.046808242797852, + "learning_rate": 7.372157618213661e-05, + "loss": 1.0768, + "num_input_tokens_seen": 60406232, + "step": 3755 + }, + { + "epoch": 0.26310121095904804, + "grad_norm": 5.451204299926758, + "learning_rate": 7.37145779334501e-05, + "loss": 0.9567, + "num_input_tokens_seen": 60422544, + "step": 3756 + }, + { + "epoch": 0.2631712592047773, + "grad_norm": 4.395990371704102, + "learning_rate": 7.370757968476357e-05, + "loss": 0.9173, + "num_input_tokens_seen": 60438312, + "step": 3757 + }, + { + "epoch": 0.26324130745050656, + "grad_norm": 5.997600078582764, + "learning_rate": 7.370058143607705e-05, + "loss": 1.049, + "num_input_tokens_seen": 60454696, + "step": 3758 + }, + { + "epoch": 0.26331135569623576, + "grad_norm": 5.588560104370117, + "learning_rate": 7.369358318739055e-05, + "loss": 0.9015, + "num_input_tokens_seen": 60470232, + "step": 3759 + }, + { + "epoch": 0.263381403941965, + "grad_norm": 3.2995078563690186, + "learning_rate": 7.368658493870404e-05, + "loss": 0.9814, + "num_input_tokens_seen": 60486224, + "step": 3760 + }, + { + "epoch": 0.2634514521876943, + "grad_norm": 4.141932964324951, + "learning_rate": 7.367958669001751e-05, + "loss": 1.0069, + "num_input_tokens_seen": 60502608, + "step": 3761 + }, + { + "epoch": 0.26352150043342354, + "grad_norm": 5.010983943939209, + "learning_rate": 7.3672588441331e-05, + "loss": 1.1533, + "num_input_tokens_seen": 60518672, + "step": 3762 + }, + { + "epoch": 0.26359154867915274, + "grad_norm": 3.555612802505493, + "learning_rate": 7.366559019264449e-05, + "loss": 1.1037, + "num_input_tokens_seen": 60534408, + "step": 3763 + }, + { + "epoch": 0.263661596924882, + "grad_norm": 4.006901264190674, + "learning_rate": 7.365859194395797e-05, + "loss": 1.0086, + "num_input_tokens_seen": 60550760, + "step": 3764 + }, + { + "epoch": 0.26373164517061126, + "grad_norm": 5.055272579193115, + "learning_rate": 7.365159369527146e-05, + "loss": 0.9645, + "num_input_tokens_seen": 60567144, + "step": 3765 + }, + { + "epoch": 0.2638016934163405, + "grad_norm": 3.860630989074707, + "learning_rate": 7.364459544658494e-05, + "loss": 1.0371, + "num_input_tokens_seen": 60583528, + "step": 3766 + }, + { + "epoch": 0.2638717416620697, + "grad_norm": 4.644535541534424, + "learning_rate": 7.363759719789843e-05, + "loss": 1.1461, + "num_input_tokens_seen": 60599912, + "step": 3767 + }, + { + "epoch": 0.263941789907799, + "grad_norm": 3.7196872234344482, + "learning_rate": 7.363059894921191e-05, + "loss": 1.1025, + "num_input_tokens_seen": 60616296, + "step": 3768 + }, + { + "epoch": 0.26401183815352824, + "grad_norm": 4.477166175842285, + "learning_rate": 7.36236007005254e-05, + "loss": 1.2221, + "num_input_tokens_seen": 60631760, + "step": 3769 + }, + { + "epoch": 0.2640818863992575, + "grad_norm": 4.906933784484863, + "learning_rate": 7.361660245183889e-05, + "loss": 0.9398, + "num_input_tokens_seen": 60648144, + "step": 3770 + }, + { + "epoch": 0.2641519346449867, + "grad_norm": 3.784450054168701, + "learning_rate": 7.360960420315236e-05, + "loss": 0.9521, + "num_input_tokens_seen": 60664528, + "step": 3771 + }, + { + "epoch": 0.26422198289071597, + "grad_norm": 4.5654191970825195, + "learning_rate": 7.360260595446585e-05, + "loss": 0.9199, + "num_input_tokens_seen": 60680912, + "step": 3772 + }, + { + "epoch": 0.2642920311364452, + "grad_norm": 3.965175151824951, + "learning_rate": 7.359560770577934e-05, + "loss": 0.9469, + "num_input_tokens_seen": 60697296, + "step": 3773 + }, + { + "epoch": 0.2643620793821745, + "grad_norm": 5.112542152404785, + "learning_rate": 7.358860945709283e-05, + "loss": 0.946, + "num_input_tokens_seen": 60713328, + "step": 3774 + }, + { + "epoch": 0.26443212762790375, + "grad_norm": 3.8610634803771973, + "learning_rate": 7.35816112084063e-05, + "loss": 1.1243, + "num_input_tokens_seen": 60729712, + "step": 3775 + }, + { + "epoch": 0.26450217587363295, + "grad_norm": 3.794217348098755, + "learning_rate": 7.35746129597198e-05, + "loss": 1.1127, + "num_input_tokens_seen": 60745824, + "step": 3776 + }, + { + "epoch": 0.2645722241193622, + "grad_norm": 3.7547152042388916, + "learning_rate": 7.356761471103328e-05, + "loss": 1.0774, + "num_input_tokens_seen": 60762024, + "step": 3777 + }, + { + "epoch": 0.26464227236509147, + "grad_norm": 3.492917537689209, + "learning_rate": 7.356061646234675e-05, + "loss": 1.0505, + "num_input_tokens_seen": 60778096, + "step": 3778 + }, + { + "epoch": 0.26471232061082073, + "grad_norm": 3.856019973754883, + "learning_rate": 7.355361821366026e-05, + "loss": 0.9716, + "num_input_tokens_seen": 60794480, + "step": 3779 + }, + { + "epoch": 0.26478236885654993, + "grad_norm": 3.68072509765625, + "learning_rate": 7.354661996497374e-05, + "loss": 1.2316, + "num_input_tokens_seen": 60810584, + "step": 3780 + }, + { + "epoch": 0.2648524171022792, + "grad_norm": 4.4739909172058105, + "learning_rate": 7.353962171628722e-05, + "loss": 1.2492, + "num_input_tokens_seen": 60826240, + "step": 3781 + }, + { + "epoch": 0.26492246534800845, + "grad_norm": 5.2342610359191895, + "learning_rate": 7.353262346760071e-05, + "loss": 1.1018, + "num_input_tokens_seen": 60842216, + "step": 3782 + }, + { + "epoch": 0.2649925135937377, + "grad_norm": 4.408970355987549, + "learning_rate": 7.35256252189142e-05, + "loss": 1.0485, + "num_input_tokens_seen": 60857336, + "step": 3783 + }, + { + "epoch": 0.2650625618394669, + "grad_norm": 3.8172199726104736, + "learning_rate": 7.351862697022767e-05, + "loss": 1.1399, + "num_input_tokens_seen": 60873720, + "step": 3784 + }, + { + "epoch": 0.2651326100851962, + "grad_norm": 4.250039100646973, + "learning_rate": 7.351162872154116e-05, + "loss": 1.003, + "num_input_tokens_seen": 60890104, + "step": 3785 + }, + { + "epoch": 0.26520265833092543, + "grad_norm": 4.257120609283447, + "learning_rate": 7.350463047285465e-05, + "loss": 1.0466, + "num_input_tokens_seen": 60906488, + "step": 3786 + }, + { + "epoch": 0.2652727065766547, + "grad_norm": 4.205286026000977, + "learning_rate": 7.349763222416814e-05, + "loss": 1.2149, + "num_input_tokens_seen": 60922872, + "step": 3787 + }, + { + "epoch": 0.2653427548223839, + "grad_norm": 4.304909706115723, + "learning_rate": 7.349063397548161e-05, + "loss": 1.0023, + "num_input_tokens_seen": 60939256, + "step": 3788 + }, + { + "epoch": 0.26541280306811316, + "grad_norm": 4.793664455413818, + "learning_rate": 7.34836357267951e-05, + "loss": 1.0475, + "num_input_tokens_seen": 60955440, + "step": 3789 + }, + { + "epoch": 0.2654828513138424, + "grad_norm": 4.383579730987549, + "learning_rate": 7.347663747810859e-05, + "loss": 1.1924, + "num_input_tokens_seen": 60971824, + "step": 3790 + }, + { + "epoch": 0.2655528995595717, + "grad_norm": 3.9962210655212402, + "learning_rate": 7.346963922942206e-05, + "loss": 1.0429, + "num_input_tokens_seen": 60987168, + "step": 3791 + }, + { + "epoch": 0.2656229478053009, + "grad_norm": 4.356331825256348, + "learning_rate": 7.346264098073555e-05, + "loss": 0.9332, + "num_input_tokens_seen": 61002840, + "step": 3792 + }, + { + "epoch": 0.26569299605103014, + "grad_norm": 5.836807727813721, + "learning_rate": 7.345564273204904e-05, + "loss": 1.205, + "num_input_tokens_seen": 61019224, + "step": 3793 + }, + { + "epoch": 0.2657630442967594, + "grad_norm": 4.778296947479248, + "learning_rate": 7.344864448336253e-05, + "loss": 1.0227, + "num_input_tokens_seen": 61034712, + "step": 3794 + }, + { + "epoch": 0.26583309254248866, + "grad_norm": 6.723006248474121, + "learning_rate": 7.3441646234676e-05, + "loss": 0.955, + "num_input_tokens_seen": 61050328, + "step": 3795 + }, + { + "epoch": 0.26590314078821786, + "grad_norm": 3.773984670639038, + "learning_rate": 7.34346479859895e-05, + "loss": 1.1262, + "num_input_tokens_seen": 61066048, + "step": 3796 + }, + { + "epoch": 0.2659731890339471, + "grad_norm": 3.915708065032959, + "learning_rate": 7.342764973730298e-05, + "loss": 1.1027, + "num_input_tokens_seen": 61082136, + "step": 3797 + }, + { + "epoch": 0.2660432372796764, + "grad_norm": 6.568943977355957, + "learning_rate": 7.342065148861646e-05, + "loss": 1.0457, + "num_input_tokens_seen": 61097216, + "step": 3798 + }, + { + "epoch": 0.26611328552540564, + "grad_norm": 5.0017499923706055, + "learning_rate": 7.341365323992995e-05, + "loss": 1.0194, + "num_input_tokens_seen": 61112344, + "step": 3799 + }, + { + "epoch": 0.26618333377113484, + "grad_norm": 4.1988935470581055, + "learning_rate": 7.340665499124345e-05, + "loss": 1.0794, + "num_input_tokens_seen": 61128728, + "step": 3800 + }, + { + "epoch": 0.26618333377113484, + "eval_loss": 1.1352765560150146, + "eval_runtime": 0.2173, + "eval_samples_per_second": 4.603, + "eval_steps_per_second": 4.603, + "num_input_tokens_seen": 61128728, + "step": 3800 + }, + { + "epoch": 0.2662533820168641, + "grad_norm": 3.991041660308838, + "learning_rate": 7.339965674255692e-05, + "loss": 1.1468, + "num_input_tokens_seen": 61145112, + "step": 3801 + }, + { + "epoch": 0.26632343026259336, + "grad_norm": 4.921470642089844, + "learning_rate": 7.33926584938704e-05, + "loss": 1.1756, + "num_input_tokens_seen": 61160952, + "step": 3802 + }, + { + "epoch": 0.2663934785083226, + "grad_norm": 3.835486888885498, + "learning_rate": 7.33856602451839e-05, + "loss": 0.782, + "num_input_tokens_seen": 61177024, + "step": 3803 + }, + { + "epoch": 0.2664635267540518, + "grad_norm": 4.419501304626465, + "learning_rate": 7.337866199649738e-05, + "loss": 1.0029, + "num_input_tokens_seen": 61193408, + "step": 3804 + }, + { + "epoch": 0.2665335749997811, + "grad_norm": 4.003963947296143, + "learning_rate": 7.337166374781086e-05, + "loss": 1.0805, + "num_input_tokens_seen": 61209792, + "step": 3805 + }, + { + "epoch": 0.26660362324551035, + "grad_norm": 4.115198612213135, + "learning_rate": 7.336466549912435e-05, + "loss": 1.1718, + "num_input_tokens_seen": 61226176, + "step": 3806 + }, + { + "epoch": 0.2666736714912396, + "grad_norm": 3.663464307785034, + "learning_rate": 7.335766725043784e-05, + "loss": 1.1447, + "num_input_tokens_seen": 61242560, + "step": 3807 + }, + { + "epoch": 0.2667437197369688, + "grad_norm": 3.7513012886047363, + "learning_rate": 7.335066900175132e-05, + "loss": 1.1208, + "num_input_tokens_seen": 61258944, + "step": 3808 + }, + { + "epoch": 0.26681376798269807, + "grad_norm": 4.693987846374512, + "learning_rate": 7.33436707530648e-05, + "loss": 1.2823, + "num_input_tokens_seen": 61275048, + "step": 3809 + }, + { + "epoch": 0.26688381622842733, + "grad_norm": 6.161116600036621, + "learning_rate": 7.333667250437829e-05, + "loss": 1.1606, + "num_input_tokens_seen": 61291368, + "step": 3810 + }, + { + "epoch": 0.2669538644741566, + "grad_norm": 5.942180633544922, + "learning_rate": 7.332967425569177e-05, + "loss": 1.2382, + "num_input_tokens_seen": 61307680, + "step": 3811 + }, + { + "epoch": 0.26702391271988585, + "grad_norm": 4.940249443054199, + "learning_rate": 7.332267600700526e-05, + "loss": 1.0407, + "num_input_tokens_seen": 61324064, + "step": 3812 + }, + { + "epoch": 0.26709396096561505, + "grad_norm": 5.384439468383789, + "learning_rate": 7.331567775831875e-05, + "loss": 1.007, + "num_input_tokens_seen": 61340416, + "step": 3813 + }, + { + "epoch": 0.2671640092113443, + "grad_norm": 5.4137959480285645, + "learning_rate": 7.330867950963223e-05, + "loss": 1.0485, + "num_input_tokens_seen": 61356800, + "step": 3814 + }, + { + "epoch": 0.26723405745707357, + "grad_norm": 5.492247581481934, + "learning_rate": 7.330168126094571e-05, + "loss": 1.1623, + "num_input_tokens_seen": 61371736, + "step": 3815 + }, + { + "epoch": 0.26730410570280283, + "grad_norm": 5.316330909729004, + "learning_rate": 7.32946830122592e-05, + "loss": 1.0147, + "num_input_tokens_seen": 61388120, + "step": 3816 + }, + { + "epoch": 0.26737415394853203, + "grad_norm": 3.976797103881836, + "learning_rate": 7.328768476357269e-05, + "loss": 1.1049, + "num_input_tokens_seen": 61403672, + "step": 3817 + }, + { + "epoch": 0.2674442021942613, + "grad_norm": 7.333898544311523, + "learning_rate": 7.328068651488616e-05, + "loss": 1.0696, + "num_input_tokens_seen": 61420056, + "step": 3818 + }, + { + "epoch": 0.26751425043999055, + "grad_norm": 3.795746088027954, + "learning_rate": 7.327368826619965e-05, + "loss": 1.0545, + "num_input_tokens_seen": 61436440, + "step": 3819 + }, + { + "epoch": 0.2675842986857198, + "grad_norm": 6.624248027801514, + "learning_rate": 7.326669001751315e-05, + "loss": 1.0736, + "num_input_tokens_seen": 61452824, + "step": 3820 + }, + { + "epoch": 0.267654346931449, + "grad_norm": 4.991429805755615, + "learning_rate": 7.325969176882663e-05, + "loss": 1.0681, + "num_input_tokens_seen": 61469208, + "step": 3821 + }, + { + "epoch": 0.2677243951771783, + "grad_norm": 3.8505215644836426, + "learning_rate": 7.32526935201401e-05, + "loss": 1.0217, + "num_input_tokens_seen": 61485592, + "step": 3822 + }, + { + "epoch": 0.26779444342290754, + "grad_norm": 3.7079288959503174, + "learning_rate": 7.324569527145359e-05, + "loss": 1.049, + "num_input_tokens_seen": 61501976, + "step": 3823 + }, + { + "epoch": 0.2678644916686368, + "grad_norm": 3.8987131118774414, + "learning_rate": 7.323869702276708e-05, + "loss": 1.0152, + "num_input_tokens_seen": 61518360, + "step": 3824 + }, + { + "epoch": 0.267934539914366, + "grad_norm": 4.0447516441345215, + "learning_rate": 7.323169877408055e-05, + "loss": 1.0604, + "num_input_tokens_seen": 61534744, + "step": 3825 + }, + { + "epoch": 0.26800458816009526, + "grad_norm": 4.089504241943359, + "learning_rate": 7.322470052539406e-05, + "loss": 1.119, + "num_input_tokens_seen": 61551128, + "step": 3826 + }, + { + "epoch": 0.2680746364058245, + "grad_norm": 3.864943265914917, + "learning_rate": 7.321770227670754e-05, + "loss": 1.002, + "num_input_tokens_seen": 61566872, + "step": 3827 + }, + { + "epoch": 0.2681446846515538, + "grad_norm": 4.649239540100098, + "learning_rate": 7.321070402802102e-05, + "loss": 1.059, + "num_input_tokens_seen": 61582704, + "step": 3828 + }, + { + "epoch": 0.268214732897283, + "grad_norm": 7.537643909454346, + "learning_rate": 7.32037057793345e-05, + "loss": 1.289, + "num_input_tokens_seen": 61599088, + "step": 3829 + }, + { + "epoch": 0.26828478114301224, + "grad_norm": 3.312519073486328, + "learning_rate": 7.3196707530648e-05, + "loss": 0.872, + "num_input_tokens_seen": 61615472, + "step": 3830 + }, + { + "epoch": 0.2683548293887415, + "grad_norm": 7.833526134490967, + "learning_rate": 7.318970928196147e-05, + "loss": 1.0896, + "num_input_tokens_seen": 61631288, + "step": 3831 + }, + { + "epoch": 0.26842487763447076, + "grad_norm": 3.9574341773986816, + "learning_rate": 7.318271103327496e-05, + "loss": 1.1105, + "num_input_tokens_seen": 61646400, + "step": 3832 + }, + { + "epoch": 0.26849492588019996, + "grad_norm": 3.8763623237609863, + "learning_rate": 7.317571278458845e-05, + "loss": 1.0339, + "num_input_tokens_seen": 61662784, + "step": 3833 + }, + { + "epoch": 0.2685649741259292, + "grad_norm": 4.006046295166016, + "learning_rate": 7.316871453590194e-05, + "loss": 1.1266, + "num_input_tokens_seen": 61678296, + "step": 3834 + }, + { + "epoch": 0.2686350223716585, + "grad_norm": 4.0256500244140625, + "learning_rate": 7.316171628721541e-05, + "loss": 0.9773, + "num_input_tokens_seen": 61694680, + "step": 3835 + }, + { + "epoch": 0.26870507061738774, + "grad_norm": 4.045619964599609, + "learning_rate": 7.31547180385289e-05, + "loss": 1.0445, + "num_input_tokens_seen": 61711064, + "step": 3836 + }, + { + "epoch": 0.26877511886311695, + "grad_norm": 4.189207553863525, + "learning_rate": 7.314771978984239e-05, + "loss": 1.1357, + "num_input_tokens_seen": 61727448, + "step": 3837 + }, + { + "epoch": 0.2688451671088462, + "grad_norm": 6.098819255828857, + "learning_rate": 7.314072154115587e-05, + "loss": 1.0298, + "num_input_tokens_seen": 61743600, + "step": 3838 + }, + { + "epoch": 0.26891521535457547, + "grad_norm": 3.832962989807129, + "learning_rate": 7.313372329246935e-05, + "loss": 1.0985, + "num_input_tokens_seen": 61759984, + "step": 3839 + }, + { + "epoch": 0.2689852636003047, + "grad_norm": 4.448224067687988, + "learning_rate": 7.312672504378284e-05, + "loss": 0.9682, + "num_input_tokens_seen": 61776368, + "step": 3840 + }, + { + "epoch": 0.26905531184603393, + "grad_norm": 4.621326446533203, + "learning_rate": 7.311972679509633e-05, + "loss": 0.9866, + "num_input_tokens_seen": 61791992, + "step": 3841 + }, + { + "epoch": 0.2691253600917632, + "grad_norm": 4.979477882385254, + "learning_rate": 7.31127285464098e-05, + "loss": 1.1592, + "num_input_tokens_seen": 61807912, + "step": 3842 + }, + { + "epoch": 0.26919540833749245, + "grad_norm": 4.678060054779053, + "learning_rate": 7.31057302977233e-05, + "loss": 1.218, + "num_input_tokens_seen": 61824296, + "step": 3843 + }, + { + "epoch": 0.2692654565832217, + "grad_norm": 5.379042625427246, + "learning_rate": 7.309873204903678e-05, + "loss": 1.0687, + "num_input_tokens_seen": 61840680, + "step": 3844 + }, + { + "epoch": 0.26933550482895097, + "grad_norm": 5.836205005645752, + "learning_rate": 7.309173380035026e-05, + "loss": 1.0435, + "num_input_tokens_seen": 61856296, + "step": 3845 + }, + { + "epoch": 0.26940555307468017, + "grad_norm": 4.040728569030762, + "learning_rate": 7.308473555166376e-05, + "loss": 1.0494, + "num_input_tokens_seen": 61872680, + "step": 3846 + }, + { + "epoch": 0.26947560132040943, + "grad_norm": 5.207007884979248, + "learning_rate": 7.307773730297725e-05, + "loss": 0.9293, + "num_input_tokens_seen": 61889064, + "step": 3847 + }, + { + "epoch": 0.2695456495661387, + "grad_norm": 4.996053695678711, + "learning_rate": 7.307073905429072e-05, + "loss": 1.0765, + "num_input_tokens_seen": 61905448, + "step": 3848 + }, + { + "epoch": 0.26961569781186795, + "grad_norm": 3.9249801635742188, + "learning_rate": 7.30637408056042e-05, + "loss": 1.0971, + "num_input_tokens_seen": 61921832, + "step": 3849 + }, + { + "epoch": 0.26968574605759715, + "grad_norm": 4.512659072875977, + "learning_rate": 7.305674255691769e-05, + "loss": 1.0811, + "num_input_tokens_seen": 61937928, + "step": 3850 + }, + { + "epoch": 0.2697557943033264, + "grad_norm": 3.8067586421966553, + "learning_rate": 7.304974430823118e-05, + "loss": 1.0381, + "num_input_tokens_seen": 61953992, + "step": 3851 + }, + { + "epoch": 0.26982584254905567, + "grad_norm": 3.5481879711151123, + "learning_rate": 7.304274605954466e-05, + "loss": 0.9524, + "num_input_tokens_seen": 61969856, + "step": 3852 + }, + { + "epoch": 0.26989589079478493, + "grad_norm": 5.14021635055542, + "learning_rate": 7.303574781085815e-05, + "loss": 1.0893, + "num_input_tokens_seen": 61985448, + "step": 3853 + }, + { + "epoch": 0.26996593904051414, + "grad_norm": 4.729730606079102, + "learning_rate": 7.302874956217164e-05, + "loss": 0.955, + "num_input_tokens_seen": 62001832, + "step": 3854 + }, + { + "epoch": 0.2700359872862434, + "grad_norm": 4.081509113311768, + "learning_rate": 7.302175131348512e-05, + "loss": 1.3099, + "num_input_tokens_seen": 62018216, + "step": 3855 + }, + { + "epoch": 0.27010603553197265, + "grad_norm": 3.9220404624938965, + "learning_rate": 7.301475306479859e-05, + "loss": 1.256, + "num_input_tokens_seen": 62034600, + "step": 3856 + }, + { + "epoch": 0.2701760837777019, + "grad_norm": 3.9707326889038086, + "learning_rate": 7.30077548161121e-05, + "loss": 0.9347, + "num_input_tokens_seen": 62050984, + "step": 3857 + }, + { + "epoch": 0.2702461320234311, + "grad_norm": 3.985651731491089, + "learning_rate": 7.300075656742557e-05, + "loss": 1.0869, + "num_input_tokens_seen": 62066496, + "step": 3858 + }, + { + "epoch": 0.2703161802691604, + "grad_norm": 4.900750160217285, + "learning_rate": 7.299375831873906e-05, + "loss": 1.2112, + "num_input_tokens_seen": 62082880, + "step": 3859 + }, + { + "epoch": 0.27038622851488964, + "grad_norm": 3.7562901973724365, + "learning_rate": 7.298676007005255e-05, + "loss": 1.0372, + "num_input_tokens_seen": 62099264, + "step": 3860 + }, + { + "epoch": 0.2704562767606189, + "grad_norm": 4.3399271965026855, + "learning_rate": 7.297976182136604e-05, + "loss": 1.2113, + "num_input_tokens_seen": 62115648, + "step": 3861 + }, + { + "epoch": 0.2705263250063481, + "grad_norm": 3.792924642562866, + "learning_rate": 7.297276357267951e-05, + "loss": 1.0027, + "num_input_tokens_seen": 62132032, + "step": 3862 + }, + { + "epoch": 0.27059637325207736, + "grad_norm": 4.10078763961792, + "learning_rate": 7.2965765323993e-05, + "loss": 1.0485, + "num_input_tokens_seen": 62148416, + "step": 3863 + }, + { + "epoch": 0.2706664214978066, + "grad_norm": 3.6712818145751953, + "learning_rate": 7.295876707530649e-05, + "loss": 0.982, + "num_input_tokens_seen": 62164080, + "step": 3864 + }, + { + "epoch": 0.2707364697435359, + "grad_norm": 4.216330051422119, + "learning_rate": 7.295176882661996e-05, + "loss": 0.9988, + "num_input_tokens_seen": 62179952, + "step": 3865 + }, + { + "epoch": 0.2708065179892651, + "grad_norm": 3.803950548171997, + "learning_rate": 7.294477057793345e-05, + "loss": 1.1107, + "num_input_tokens_seen": 62196336, + "step": 3866 + }, + { + "epoch": 0.27087656623499434, + "grad_norm": 4.4687676429748535, + "learning_rate": 7.293777232924694e-05, + "loss": 1.1374, + "num_input_tokens_seen": 62212072, + "step": 3867 + }, + { + "epoch": 0.2709466144807236, + "grad_norm": 3.8923938274383545, + "learning_rate": 7.293077408056043e-05, + "loss": 1.0037, + "num_input_tokens_seen": 62227384, + "step": 3868 + }, + { + "epoch": 0.27101666272645286, + "grad_norm": 3.7378618717193604, + "learning_rate": 7.29237758318739e-05, + "loss": 0.9185, + "num_input_tokens_seen": 62243768, + "step": 3869 + }, + { + "epoch": 0.27108671097218207, + "grad_norm": 4.39946985244751, + "learning_rate": 7.291677758318739e-05, + "loss": 1.2908, + "num_input_tokens_seen": 62259760, + "step": 3870 + }, + { + "epoch": 0.2711567592179113, + "grad_norm": 4.526809215545654, + "learning_rate": 7.290977933450088e-05, + "loss": 1.1677, + "num_input_tokens_seen": 62275880, + "step": 3871 + }, + { + "epoch": 0.2712268074636406, + "grad_norm": 5.780641078948975, + "learning_rate": 7.290278108581437e-05, + "loss": 1.3366, + "num_input_tokens_seen": 62291992, + "step": 3872 + }, + { + "epoch": 0.27129685570936984, + "grad_norm": 3.932300329208374, + "learning_rate": 7.289578283712786e-05, + "loss": 0.9404, + "num_input_tokens_seen": 62308168, + "step": 3873 + }, + { + "epoch": 0.27136690395509905, + "grad_norm": 6.381493091583252, + "learning_rate": 7.288878458844135e-05, + "loss": 0.9909, + "num_input_tokens_seen": 62324552, + "step": 3874 + }, + { + "epoch": 0.2714369522008283, + "grad_norm": 6.920464515686035, + "learning_rate": 7.288178633975482e-05, + "loss": 1.0534, + "num_input_tokens_seen": 62340712, + "step": 3875 + }, + { + "epoch": 0.27150700044655757, + "grad_norm": 4.327527046203613, + "learning_rate": 7.28747880910683e-05, + "loss": 1.2133, + "num_input_tokens_seen": 62355904, + "step": 3876 + }, + { + "epoch": 0.2715770486922868, + "grad_norm": 6.8873610496521, + "learning_rate": 7.286778984238178e-05, + "loss": 1.1857, + "num_input_tokens_seen": 62372288, + "step": 3877 + }, + { + "epoch": 0.2716470969380161, + "grad_norm": 4.397764205932617, + "learning_rate": 7.286079159369527e-05, + "loss": 1.1458, + "num_input_tokens_seen": 62388672, + "step": 3878 + }, + { + "epoch": 0.2717171451837453, + "grad_norm": 4.200334072113037, + "learning_rate": 7.285379334500876e-05, + "loss": 1.1534, + "num_input_tokens_seen": 62403728, + "step": 3879 + }, + { + "epoch": 0.27178719342947455, + "grad_norm": 3.8102898597717285, + "learning_rate": 7.284679509632225e-05, + "loss": 1.2455, + "num_input_tokens_seen": 62419712, + "step": 3880 + }, + { + "epoch": 0.2718572416752038, + "grad_norm": 5.665886878967285, + "learning_rate": 7.283979684763574e-05, + "loss": 1.0506, + "num_input_tokens_seen": 62435648, + "step": 3881 + }, + { + "epoch": 0.27192728992093307, + "grad_norm": 5.59833288192749, + "learning_rate": 7.283279859894921e-05, + "loss": 1.1289, + "num_input_tokens_seen": 62451760, + "step": 3882 + }, + { + "epoch": 0.2719973381666623, + "grad_norm": 4.3096699714660645, + "learning_rate": 7.282580035026269e-05, + "loss": 1.1069, + "num_input_tokens_seen": 62468144, + "step": 3883 + }, + { + "epoch": 0.27206738641239153, + "grad_norm": 3.584202766418457, + "learning_rate": 7.281880210157619e-05, + "loss": 0.981, + "num_input_tokens_seen": 62484528, + "step": 3884 + }, + { + "epoch": 0.2721374346581208, + "grad_norm": 5.078696250915527, + "learning_rate": 7.281180385288967e-05, + "loss": 1.0727, + "num_input_tokens_seen": 62500912, + "step": 3885 + }, + { + "epoch": 0.27220748290385005, + "grad_norm": 3.4883761405944824, + "learning_rate": 7.280480560420315e-05, + "loss": 0.888, + "num_input_tokens_seen": 62517296, + "step": 3886 + }, + { + "epoch": 0.27227753114957925, + "grad_norm": 3.938286066055298, + "learning_rate": 7.279780735551664e-05, + "loss": 0.9736, + "num_input_tokens_seen": 62532896, + "step": 3887 + }, + { + "epoch": 0.2723475793953085, + "grad_norm": 3.7150652408599854, + "learning_rate": 7.279080910683013e-05, + "loss": 1.1163, + "num_input_tokens_seen": 62549072, + "step": 3888 + }, + { + "epoch": 0.2724176276410378, + "grad_norm": 5.31076717376709, + "learning_rate": 7.278381085814361e-05, + "loss": 0.9943, + "num_input_tokens_seen": 62564384, + "step": 3889 + }, + { + "epoch": 0.27248767588676703, + "grad_norm": 4.8600053787231445, + "learning_rate": 7.27768126094571e-05, + "loss": 1.1767, + "num_input_tokens_seen": 62580768, + "step": 3890 + }, + { + "epoch": 0.27255772413249624, + "grad_norm": 3.5890231132507324, + "learning_rate": 7.276981436077058e-05, + "loss": 1.0949, + "num_input_tokens_seen": 62596928, + "step": 3891 + }, + { + "epoch": 0.2726277723782255, + "grad_norm": 4.171263217926025, + "learning_rate": 7.276281611208407e-05, + "loss": 1.0013, + "num_input_tokens_seen": 62613312, + "step": 3892 + }, + { + "epoch": 0.27269782062395476, + "grad_norm": 5.907830715179443, + "learning_rate": 7.275581786339755e-05, + "loss": 1.0622, + "num_input_tokens_seen": 62627840, + "step": 3893 + }, + { + "epoch": 0.272767868869684, + "grad_norm": 3.912140369415283, + "learning_rate": 7.274881961471104e-05, + "loss": 1.1128, + "num_input_tokens_seen": 62643760, + "step": 3894 + }, + { + "epoch": 0.2728379171154132, + "grad_norm": 3.9871180057525635, + "learning_rate": 7.274182136602453e-05, + "loss": 1.0879, + "num_input_tokens_seen": 62660144, + "step": 3895 + }, + { + "epoch": 0.2729079653611425, + "grad_norm": 3.8014907836914062, + "learning_rate": 7.2734823117338e-05, + "loss": 1.0135, + "num_input_tokens_seen": 62676200, + "step": 3896 + }, + { + "epoch": 0.27297801360687174, + "grad_norm": 3.7584786415100098, + "learning_rate": 7.272782486865149e-05, + "loss": 1.0366, + "num_input_tokens_seen": 62692584, + "step": 3897 + }, + { + "epoch": 0.273048061852601, + "grad_norm": 3.573341131210327, + "learning_rate": 7.272082661996498e-05, + "loss": 0.8726, + "num_input_tokens_seen": 62708968, + "step": 3898 + }, + { + "epoch": 0.2731181100983302, + "grad_norm": 4.013971328735352, + "learning_rate": 7.271382837127847e-05, + "loss": 0.991, + "num_input_tokens_seen": 62725352, + "step": 3899 + }, + { + "epoch": 0.27318815834405946, + "grad_norm": 4.3081488609313965, + "learning_rate": 7.270683012259195e-05, + "loss": 1.0632, + "num_input_tokens_seen": 62741736, + "step": 3900 + }, + { + "epoch": 0.2732582065897887, + "grad_norm": 3.857982635498047, + "learning_rate": 7.269983187390544e-05, + "loss": 1.1116, + "num_input_tokens_seen": 62757624, + "step": 3901 + }, + { + "epoch": 0.273328254835518, + "grad_norm": 3.5167486667633057, + "learning_rate": 7.269283362521892e-05, + "loss": 0.9951, + "num_input_tokens_seen": 62774008, + "step": 3902 + }, + { + "epoch": 0.2733983030812472, + "grad_norm": 4.025612831115723, + "learning_rate": 7.26858353765324e-05, + "loss": 1.1632, + "num_input_tokens_seen": 62789560, + "step": 3903 + }, + { + "epoch": 0.27346835132697644, + "grad_norm": 3.6391422748565674, + "learning_rate": 7.267883712784588e-05, + "loss": 0.9442, + "num_input_tokens_seen": 62805824, + "step": 3904 + }, + { + "epoch": 0.2735383995727057, + "grad_norm": 4.352347373962402, + "learning_rate": 7.267183887915937e-05, + "loss": 1.0882, + "num_input_tokens_seen": 62821368, + "step": 3905 + }, + { + "epoch": 0.27360844781843496, + "grad_norm": 3.782601833343506, + "learning_rate": 7.266484063047286e-05, + "loss": 0.9795, + "num_input_tokens_seen": 62837024, + "step": 3906 + }, + { + "epoch": 0.27367849606416417, + "grad_norm": 3.860903263092041, + "learning_rate": 7.265784238178635e-05, + "loss": 1.1751, + "num_input_tokens_seen": 62853408, + "step": 3907 + }, + { + "epoch": 0.2737485443098934, + "grad_norm": 6.185113430023193, + "learning_rate": 7.265084413309984e-05, + "loss": 1.1976, + "num_input_tokens_seen": 62869792, + "step": 3908 + }, + { + "epoch": 0.2738185925556227, + "grad_norm": 6.02334451675415, + "learning_rate": 7.264384588441331e-05, + "loss": 1.0472, + "num_input_tokens_seen": 62886088, + "step": 3909 + }, + { + "epoch": 0.27388864080135195, + "grad_norm": 4.019417762756348, + "learning_rate": 7.263684763572679e-05, + "loss": 0.9597, + "num_input_tokens_seen": 62902472, + "step": 3910 + }, + { + "epoch": 0.27395868904708115, + "grad_norm": 4.0645527839660645, + "learning_rate": 7.262984938704029e-05, + "loss": 1.0267, + "num_input_tokens_seen": 62918552, + "step": 3911 + }, + { + "epoch": 0.2740287372928104, + "grad_norm": 3.978803873062134, + "learning_rate": 7.262285113835378e-05, + "loss": 1.1366, + "num_input_tokens_seen": 62934272, + "step": 3912 + }, + { + "epoch": 0.27409878553853967, + "grad_norm": 4.659839630126953, + "learning_rate": 7.261585288966725e-05, + "loss": 1.0485, + "num_input_tokens_seen": 62950656, + "step": 3913 + }, + { + "epoch": 0.27416883378426893, + "grad_norm": 4.378306865692139, + "learning_rate": 7.260885464098074e-05, + "loss": 0.9949, + "num_input_tokens_seen": 62966120, + "step": 3914 + }, + { + "epoch": 0.2742388820299982, + "grad_norm": 3.723999261856079, + "learning_rate": 7.260185639229423e-05, + "loss": 1.0575, + "num_input_tokens_seen": 62982504, + "step": 3915 + }, + { + "epoch": 0.2743089302757274, + "grad_norm": 4.133684158325195, + "learning_rate": 7.25948581436077e-05, + "loss": 0.9707, + "num_input_tokens_seen": 62998888, + "step": 3916 + }, + { + "epoch": 0.27437897852145665, + "grad_norm": 3.8377842903137207, + "learning_rate": 7.258785989492119e-05, + "loss": 1.1018, + "num_input_tokens_seen": 63015272, + "step": 3917 + }, + { + "epoch": 0.2744490267671859, + "grad_norm": 3.546846389770508, + "learning_rate": 7.258086164623468e-05, + "loss": 0.9544, + "num_input_tokens_seen": 63031656, + "step": 3918 + }, + { + "epoch": 0.27451907501291517, + "grad_norm": 3.8629097938537598, + "learning_rate": 7.257386339754817e-05, + "loss": 1.0174, + "num_input_tokens_seen": 63047208, + "step": 3919 + }, + { + "epoch": 0.2745891232586444, + "grad_norm": 3.780395984649658, + "learning_rate": 7.256686514886165e-05, + "loss": 1.0927, + "num_input_tokens_seen": 63063592, + "step": 3920 + }, + { + "epoch": 0.27465917150437363, + "grad_norm": 3.5188148021698, + "learning_rate": 7.255986690017513e-05, + "loss": 0.9973, + "num_input_tokens_seen": 63079976, + "step": 3921 + }, + { + "epoch": 0.2747292197501029, + "grad_norm": 4.295319080352783, + "learning_rate": 7.255286865148862e-05, + "loss": 1.1545, + "num_input_tokens_seen": 63096360, + "step": 3922 + }, + { + "epoch": 0.27479926799583215, + "grad_norm": 6.307181358337402, + "learning_rate": 7.25458704028021e-05, + "loss": 1.0283, + "num_input_tokens_seen": 63112744, + "step": 3923 + }, + { + "epoch": 0.27486931624156136, + "grad_norm": 4.0670342445373535, + "learning_rate": 7.253887215411559e-05, + "loss": 1.0834, + "num_input_tokens_seen": 63129000, + "step": 3924 + }, + { + "epoch": 0.2749393644872906, + "grad_norm": 4.441539287567139, + "learning_rate": 7.253187390542907e-05, + "loss": 1.1264, + "num_input_tokens_seen": 63145304, + "step": 3925 + }, + { + "epoch": 0.2750094127330199, + "grad_norm": 6.151254653930664, + "learning_rate": 7.252487565674256e-05, + "loss": 0.911, + "num_input_tokens_seen": 63161688, + "step": 3926 + }, + { + "epoch": 0.27507946097874914, + "grad_norm": 5.355491638183594, + "learning_rate": 7.251787740805605e-05, + "loss": 1.0604, + "num_input_tokens_seen": 63176128, + "step": 3927 + }, + { + "epoch": 0.27514950922447834, + "grad_norm": 3.4603800773620605, + "learning_rate": 7.251087915936954e-05, + "loss": 0.7811, + "num_input_tokens_seen": 63192512, + "step": 3928 + }, + { + "epoch": 0.2752195574702076, + "grad_norm": 5.412753105163574, + "learning_rate": 7.250388091068302e-05, + "loss": 0.9675, + "num_input_tokens_seen": 63208896, + "step": 3929 + }, + { + "epoch": 0.27528960571593686, + "grad_norm": 3.928074598312378, + "learning_rate": 7.249688266199649e-05, + "loss": 1.0562, + "num_input_tokens_seen": 63224296, + "step": 3930 + }, + { + "epoch": 0.2753596539616661, + "grad_norm": 4.239214897155762, + "learning_rate": 7.248988441330998e-05, + "loss": 0.9697, + "num_input_tokens_seen": 63239312, + "step": 3931 + }, + { + "epoch": 0.2754297022073953, + "grad_norm": 3.8074252605438232, + "learning_rate": 7.248288616462348e-05, + "loss": 1.0834, + "num_input_tokens_seen": 63255664, + "step": 3932 + }, + { + "epoch": 0.2754997504531246, + "grad_norm": 3.721026659011841, + "learning_rate": 7.247588791593696e-05, + "loss": 1.1663, + "num_input_tokens_seen": 63272048, + "step": 3933 + }, + { + "epoch": 0.27556979869885384, + "grad_norm": 4.076726913452148, + "learning_rate": 7.246888966725044e-05, + "loss": 1.1179, + "num_input_tokens_seen": 63288432, + "step": 3934 + }, + { + "epoch": 0.2756398469445831, + "grad_norm": 4.238835334777832, + "learning_rate": 7.246189141856393e-05, + "loss": 1.0894, + "num_input_tokens_seen": 63304168, + "step": 3935 + }, + { + "epoch": 0.2757098951903123, + "grad_norm": 4.4860148429870605, + "learning_rate": 7.245489316987741e-05, + "loss": 1.1763, + "num_input_tokens_seen": 63320552, + "step": 3936 + }, + { + "epoch": 0.27577994343604156, + "grad_norm": 6.002726078033447, + "learning_rate": 7.244789492119088e-05, + "loss": 1.158, + "num_input_tokens_seen": 63336792, + "step": 3937 + }, + { + "epoch": 0.2758499916817708, + "grad_norm": 3.799751043319702, + "learning_rate": 7.244089667250439e-05, + "loss": 1.0316, + "num_input_tokens_seen": 63353176, + "step": 3938 + }, + { + "epoch": 0.2759200399275001, + "grad_norm": 4.905911445617676, + "learning_rate": 7.243389842381787e-05, + "loss": 0.8847, + "num_input_tokens_seen": 63369560, + "step": 3939 + }, + { + "epoch": 0.2759900881732293, + "grad_norm": 5.141537666320801, + "learning_rate": 7.242690017513135e-05, + "loss": 1.109, + "num_input_tokens_seen": 63385944, + "step": 3940 + }, + { + "epoch": 0.27606013641895855, + "grad_norm": 5.276777267456055, + "learning_rate": 7.241990192644484e-05, + "loss": 0.9881, + "num_input_tokens_seen": 63401672, + "step": 3941 + }, + { + "epoch": 0.2761301846646878, + "grad_norm": 5.267075538635254, + "learning_rate": 7.241290367775833e-05, + "loss": 1.0048, + "num_input_tokens_seen": 63417792, + "step": 3942 + }, + { + "epoch": 0.27620023291041707, + "grad_norm": 4.065691947937012, + "learning_rate": 7.24059054290718e-05, + "loss": 1.0088, + "num_input_tokens_seen": 63434176, + "step": 3943 + }, + { + "epoch": 0.27627028115614627, + "grad_norm": 7.921762466430664, + "learning_rate": 7.239890718038529e-05, + "loss": 1.3552, + "num_input_tokens_seen": 63450032, + "step": 3944 + }, + { + "epoch": 0.27634032940187553, + "grad_norm": 3.55094313621521, + "learning_rate": 7.239190893169878e-05, + "loss": 0.9957, + "num_input_tokens_seen": 63466416, + "step": 3945 + }, + { + "epoch": 0.2764103776476048, + "grad_norm": 5.732813358306885, + "learning_rate": 7.238491068301227e-05, + "loss": 1.0968, + "num_input_tokens_seen": 63482296, + "step": 3946 + }, + { + "epoch": 0.27648042589333405, + "grad_norm": 3.9143989086151123, + "learning_rate": 7.237791243432574e-05, + "loss": 0.9218, + "num_input_tokens_seen": 63498472, + "step": 3947 + }, + { + "epoch": 0.2765504741390633, + "grad_norm": 4.123042106628418, + "learning_rate": 7.237091418563923e-05, + "loss": 1.0081, + "num_input_tokens_seen": 63513856, + "step": 3948 + }, + { + "epoch": 0.2766205223847925, + "grad_norm": 3.7550277709960938, + "learning_rate": 7.236391593695272e-05, + "loss": 1.0612, + "num_input_tokens_seen": 63529432, + "step": 3949 + }, + { + "epoch": 0.27669057063052177, + "grad_norm": 3.841831922531128, + "learning_rate": 7.23569176882662e-05, + "loss": 1.1208, + "num_input_tokens_seen": 63545816, + "step": 3950 + }, + { + "epoch": 0.27676061887625103, + "grad_norm": 4.626603126525879, + "learning_rate": 7.234991943957968e-05, + "loss": 1.3412, + "num_input_tokens_seen": 63561960, + "step": 3951 + }, + { + "epoch": 0.2768306671219803, + "grad_norm": 3.874140977859497, + "learning_rate": 7.234292119089319e-05, + "loss": 1.0549, + "num_input_tokens_seen": 63578344, + "step": 3952 + }, + { + "epoch": 0.2769007153677095, + "grad_norm": 3.6525163650512695, + "learning_rate": 7.233592294220666e-05, + "loss": 1.0905, + "num_input_tokens_seen": 63594520, + "step": 3953 + }, + { + "epoch": 0.27697076361343875, + "grad_norm": 5.065535068511963, + "learning_rate": 7.232892469352015e-05, + "loss": 1.1913, + "num_input_tokens_seen": 63610904, + "step": 3954 + }, + { + "epoch": 0.277040811859168, + "grad_norm": 7.97597599029541, + "learning_rate": 7.232192644483364e-05, + "loss": 0.9109, + "num_input_tokens_seen": 63625896, + "step": 3955 + }, + { + "epoch": 0.27711086010489727, + "grad_norm": 5.0254645347595215, + "learning_rate": 7.231492819614711e-05, + "loss": 1.0177, + "num_input_tokens_seen": 63642280, + "step": 3956 + }, + { + "epoch": 0.2771809083506265, + "grad_norm": 4.171605587005615, + "learning_rate": 7.230792994746059e-05, + "loss": 1.3166, + "num_input_tokens_seen": 63658400, + "step": 3957 + }, + { + "epoch": 0.27725095659635574, + "grad_norm": 4.036003589630127, + "learning_rate": 7.230093169877409e-05, + "loss": 1.0489, + "num_input_tokens_seen": 63674784, + "step": 3958 + }, + { + "epoch": 0.277321004842085, + "grad_norm": 4.664374828338623, + "learning_rate": 7.229393345008758e-05, + "loss": 1.3189, + "num_input_tokens_seen": 63691168, + "step": 3959 + }, + { + "epoch": 0.27739105308781425, + "grad_norm": 3.7217307090759277, + "learning_rate": 7.228693520140105e-05, + "loss": 1.2532, + "num_input_tokens_seen": 63707552, + "step": 3960 + }, + { + "epoch": 0.27746110133354346, + "grad_norm": 3.622593879699707, + "learning_rate": 7.227993695271454e-05, + "loss": 0.8604, + "num_input_tokens_seen": 63723936, + "step": 3961 + }, + { + "epoch": 0.2775311495792727, + "grad_norm": 4.154850006103516, + "learning_rate": 7.227293870402803e-05, + "loss": 1.1366, + "num_input_tokens_seen": 63740320, + "step": 3962 + }, + { + "epoch": 0.277601197825002, + "grad_norm": 4.157016754150391, + "learning_rate": 7.22659404553415e-05, + "loss": 0.8815, + "num_input_tokens_seen": 63756456, + "step": 3963 + }, + { + "epoch": 0.27767124607073124, + "grad_norm": 4.652394771575928, + "learning_rate": 7.2258942206655e-05, + "loss": 0.8966, + "num_input_tokens_seen": 63772840, + "step": 3964 + }, + { + "epoch": 0.27774129431646044, + "grad_norm": 7.87667989730835, + "learning_rate": 7.225194395796848e-05, + "loss": 1.1371, + "num_input_tokens_seen": 63788800, + "step": 3965 + }, + { + "epoch": 0.2778113425621897, + "grad_norm": 4.333608627319336, + "learning_rate": 7.224494570928197e-05, + "loss": 1.3465, + "num_input_tokens_seen": 63805088, + "step": 3966 + }, + { + "epoch": 0.27788139080791896, + "grad_norm": 7.2095184326171875, + "learning_rate": 7.223794746059545e-05, + "loss": 1.0276, + "num_input_tokens_seen": 63821472, + "step": 3967 + }, + { + "epoch": 0.2779514390536482, + "grad_norm": 3.9144251346588135, + "learning_rate": 7.223094921190893e-05, + "loss": 0.9954, + "num_input_tokens_seen": 63837048, + "step": 3968 + }, + { + "epoch": 0.2780214872993774, + "grad_norm": 4.380809783935547, + "learning_rate": 7.222395096322242e-05, + "loss": 0.9757, + "num_input_tokens_seen": 63852872, + "step": 3969 + }, + { + "epoch": 0.2780915355451067, + "grad_norm": 3.637685537338257, + "learning_rate": 7.22169527145359e-05, + "loss": 1.0264, + "num_input_tokens_seen": 63868864, + "step": 3970 + }, + { + "epoch": 0.27816158379083594, + "grad_norm": 4.742129802703857, + "learning_rate": 7.220995446584939e-05, + "loss": 1.2344, + "num_input_tokens_seen": 63885248, + "step": 3971 + }, + { + "epoch": 0.2782316320365652, + "grad_norm": 4.7221269607543945, + "learning_rate": 7.220295621716289e-05, + "loss": 1.1001, + "num_input_tokens_seen": 63901632, + "step": 3972 + }, + { + "epoch": 0.2783016802822944, + "grad_norm": 3.6607449054718018, + "learning_rate": 7.219595796847636e-05, + "loss": 1.1179, + "num_input_tokens_seen": 63917688, + "step": 3973 + }, + { + "epoch": 0.27837172852802367, + "grad_norm": 4.264851093292236, + "learning_rate": 7.218895971978984e-05, + "loss": 1.0158, + "num_input_tokens_seen": 63934072, + "step": 3974 + }, + { + "epoch": 0.2784417767737529, + "grad_norm": 5.0043511390686035, + "learning_rate": 7.218196147110333e-05, + "loss": 1.0359, + "num_input_tokens_seen": 63950200, + "step": 3975 + }, + { + "epoch": 0.2785118250194822, + "grad_norm": 4.323488235473633, + "learning_rate": 7.217496322241682e-05, + "loss": 1.1791, + "num_input_tokens_seen": 63966584, + "step": 3976 + }, + { + "epoch": 0.2785818732652114, + "grad_norm": 6.721888065338135, + "learning_rate": 7.216796497373029e-05, + "loss": 0.9446, + "num_input_tokens_seen": 63982440, + "step": 3977 + }, + { + "epoch": 0.27865192151094065, + "grad_norm": 6.3528289794921875, + "learning_rate": 7.21609667250438e-05, + "loss": 1.1506, + "num_input_tokens_seen": 63998824, + "step": 3978 + }, + { + "epoch": 0.2787219697566699, + "grad_norm": 5.293467044830322, + "learning_rate": 7.215396847635728e-05, + "loss": 1.2791, + "num_input_tokens_seen": 64014984, + "step": 3979 + }, + { + "epoch": 0.27879201800239917, + "grad_norm": 3.8228442668914795, + "learning_rate": 7.214697022767076e-05, + "loss": 1.1086, + "num_input_tokens_seen": 64031080, + "step": 3980 + }, + { + "epoch": 0.27886206624812837, + "grad_norm": 3.8407061100006104, + "learning_rate": 7.213997197898425e-05, + "loss": 1.1772, + "num_input_tokens_seen": 64046416, + "step": 3981 + }, + { + "epoch": 0.27893211449385763, + "grad_norm": 3.9471728801727295, + "learning_rate": 7.213297373029773e-05, + "loss": 0.9394, + "num_input_tokens_seen": 64062784, + "step": 3982 + }, + { + "epoch": 0.2790021627395869, + "grad_norm": 4.1796722412109375, + "learning_rate": 7.212597548161121e-05, + "loss": 0.9966, + "num_input_tokens_seen": 64077504, + "step": 3983 + }, + { + "epoch": 0.27907221098531615, + "grad_norm": 3.78998064994812, + "learning_rate": 7.21189772329247e-05, + "loss": 1.1219, + "num_input_tokens_seen": 64093888, + "step": 3984 + }, + { + "epoch": 0.2791422592310454, + "grad_norm": 3.383371591567993, + "learning_rate": 7.211197898423819e-05, + "loss": 0.8832, + "num_input_tokens_seen": 64110272, + "step": 3985 + }, + { + "epoch": 0.2792123074767746, + "grad_norm": 3.6502346992492676, + "learning_rate": 7.210498073555168e-05, + "loss": 1.0114, + "num_input_tokens_seen": 64125464, + "step": 3986 + }, + { + "epoch": 0.27928235572250387, + "grad_norm": 3.9421629905700684, + "learning_rate": 7.209798248686515e-05, + "loss": 1.1305, + "num_input_tokens_seen": 64141848, + "step": 3987 + }, + { + "epoch": 0.27935240396823313, + "grad_norm": 4.40875244140625, + "learning_rate": 7.209098423817864e-05, + "loss": 0.9603, + "num_input_tokens_seen": 64158232, + "step": 3988 + }, + { + "epoch": 0.2794224522139624, + "grad_norm": 5.909340858459473, + "learning_rate": 7.208398598949213e-05, + "loss": 1.121, + "num_input_tokens_seen": 64174616, + "step": 3989 + }, + { + "epoch": 0.2794925004596916, + "grad_norm": 4.548187732696533, + "learning_rate": 7.20769877408056e-05, + "loss": 0.9575, + "num_input_tokens_seen": 64191000, + "step": 3990 + }, + { + "epoch": 0.27956254870542085, + "grad_norm": 4.1479926109313965, + "learning_rate": 7.206998949211909e-05, + "loss": 1.1205, + "num_input_tokens_seen": 64207384, + "step": 3991 + }, + { + "epoch": 0.2796325969511501, + "grad_norm": 4.229100227355957, + "learning_rate": 7.206299124343259e-05, + "loss": 1.0597, + "num_input_tokens_seen": 64223304, + "step": 3992 + }, + { + "epoch": 0.2797026451968794, + "grad_norm": 7.431615352630615, + "learning_rate": 7.205599299474607e-05, + "loss": 1.2793, + "num_input_tokens_seen": 64239688, + "step": 3993 + }, + { + "epoch": 0.2797726934426086, + "grad_norm": 4.346622943878174, + "learning_rate": 7.204899474605954e-05, + "loss": 1.2385, + "num_input_tokens_seen": 64255752, + "step": 3994 + }, + { + "epoch": 0.27984274168833784, + "grad_norm": 3.771306276321411, + "learning_rate": 7.204199649737303e-05, + "loss": 1.0211, + "num_input_tokens_seen": 64271760, + "step": 3995 + }, + { + "epoch": 0.2799127899340671, + "grad_norm": 4.411479473114014, + "learning_rate": 7.203499824868652e-05, + "loss": 1.3369, + "num_input_tokens_seen": 64288144, + "step": 3996 + }, + { + "epoch": 0.27998283817979636, + "grad_norm": 4.591271877288818, + "learning_rate": 7.2028e-05, + "loss": 1.1021, + "num_input_tokens_seen": 64304528, + "step": 3997 + }, + { + "epoch": 0.28005288642552556, + "grad_norm": 3.88271427154541, + "learning_rate": 7.20210017513135e-05, + "loss": 0.9181, + "num_input_tokens_seen": 64320912, + "step": 3998 + }, + { + "epoch": 0.2801229346712548, + "grad_norm": 10.80846118927002, + "learning_rate": 7.201400350262699e-05, + "loss": 1.0922, + "num_input_tokens_seen": 64337296, + "step": 3999 + }, + { + "epoch": 0.2801929829169841, + "grad_norm": 3.7112953662872314, + "learning_rate": 7.200700525394046e-05, + "loss": 1.0157, + "num_input_tokens_seen": 64353680, + "step": 4000 + }, + { + "epoch": 0.2801929829169841, + "eval_loss": 1.1334750652313232, + "eval_runtime": 0.1958, + "eval_samples_per_second": 5.106, + "eval_steps_per_second": 5.106, + "num_input_tokens_seen": 64353680, + "step": 4000 + }, + { + "epoch": 0.28026303116271334, + "grad_norm": 7.529544830322266, + "learning_rate": 7.200000700525394e-05, + "loss": 1.1264, + "num_input_tokens_seen": 64368408, + "step": 4001 + }, + { + "epoch": 0.28033307940844254, + "grad_norm": 3.761939764022827, + "learning_rate": 7.199300875656742e-05, + "loss": 1.1027, + "num_input_tokens_seen": 64384792, + "step": 4002 + }, + { + "epoch": 0.2804031276541718, + "grad_norm": 4.091811656951904, + "learning_rate": 7.198601050788091e-05, + "loss": 1.0368, + "num_input_tokens_seen": 64400520, + "step": 4003 + }, + { + "epoch": 0.28047317589990106, + "grad_norm": 5.5972795486450195, + "learning_rate": 7.19790122591944e-05, + "loss": 1.0957, + "num_input_tokens_seen": 64416904, + "step": 4004 + }, + { + "epoch": 0.2805432241456303, + "grad_norm": 3.4631423950195312, + "learning_rate": 7.197201401050789e-05, + "loss": 0.9517, + "num_input_tokens_seen": 64432168, + "step": 4005 + }, + { + "epoch": 0.2806132723913595, + "grad_norm": 6.3156938552856445, + "learning_rate": 7.196501576182138e-05, + "loss": 1.0554, + "num_input_tokens_seen": 64447752, + "step": 4006 + }, + { + "epoch": 0.2806833206370888, + "grad_norm": 10.07819652557373, + "learning_rate": 7.195801751313485e-05, + "loss": 0.99, + "num_input_tokens_seen": 64464136, + "step": 4007 + }, + { + "epoch": 0.28075336888281804, + "grad_norm": 4.695057392120361, + "learning_rate": 7.195101926444834e-05, + "loss": 0.9745, + "num_input_tokens_seen": 64480520, + "step": 4008 + }, + { + "epoch": 0.2808234171285473, + "grad_norm": 4.74672269821167, + "learning_rate": 7.194402101576183e-05, + "loss": 1.0648, + "num_input_tokens_seen": 64496904, + "step": 4009 + }, + { + "epoch": 0.2808934653742765, + "grad_norm": 3.834928512573242, + "learning_rate": 7.19370227670753e-05, + "loss": 1.0163, + "num_input_tokens_seen": 64513288, + "step": 4010 + }, + { + "epoch": 0.28096351362000577, + "grad_norm": 4.1937103271484375, + "learning_rate": 7.19300245183888e-05, + "loss": 1.1351, + "num_input_tokens_seen": 64528992, + "step": 4011 + }, + { + "epoch": 0.281033561865735, + "grad_norm": 4.1531243324279785, + "learning_rate": 7.192302626970228e-05, + "loss": 1.0835, + "num_input_tokens_seen": 64544776, + "step": 4012 + }, + { + "epoch": 0.2811036101114643, + "grad_norm": 5.006285190582275, + "learning_rate": 7.191602802101577e-05, + "loss": 1.1282, + "num_input_tokens_seen": 64560944, + "step": 4013 + }, + { + "epoch": 0.2811736583571935, + "grad_norm": 3.433964252471924, + "learning_rate": 7.190902977232925e-05, + "loss": 1.1164, + "num_input_tokens_seen": 64577328, + "step": 4014 + }, + { + "epoch": 0.28124370660292275, + "grad_norm": 6.165640354156494, + "learning_rate": 7.190203152364274e-05, + "loss": 0.9713, + "num_input_tokens_seen": 64593672, + "step": 4015 + }, + { + "epoch": 0.281313754848652, + "grad_norm": 6.037381649017334, + "learning_rate": 7.189503327495622e-05, + "loss": 1.032, + "num_input_tokens_seen": 64610056, + "step": 4016 + }, + { + "epoch": 0.28138380309438127, + "grad_norm": 4.2639923095703125, + "learning_rate": 7.18880350262697e-05, + "loss": 1.1842, + "num_input_tokens_seen": 64625864, + "step": 4017 + }, + { + "epoch": 0.28145385134011053, + "grad_norm": 3.8862967491149902, + "learning_rate": 7.188103677758319e-05, + "loss": 0.9448, + "num_input_tokens_seen": 64642248, + "step": 4018 + }, + { + "epoch": 0.28152389958583973, + "grad_norm": 3.9584991931915283, + "learning_rate": 7.187403852889669e-05, + "loss": 0.9602, + "num_input_tokens_seen": 64658632, + "step": 4019 + }, + { + "epoch": 0.281593947831569, + "grad_norm": 6.037077903747559, + "learning_rate": 7.186704028021017e-05, + "loss": 1.0913, + "num_input_tokens_seen": 64675016, + "step": 4020 + }, + { + "epoch": 0.28166399607729825, + "grad_norm": 3.750059127807617, + "learning_rate": 7.186004203152364e-05, + "loss": 1.1294, + "num_input_tokens_seen": 64691400, + "step": 4021 + }, + { + "epoch": 0.2817340443230275, + "grad_norm": 4.364743709564209, + "learning_rate": 7.185304378283713e-05, + "loss": 1.0983, + "num_input_tokens_seen": 64706512, + "step": 4022 + }, + { + "epoch": 0.2818040925687567, + "grad_norm": 3.463717460632324, + "learning_rate": 7.184604553415062e-05, + "loss": 1.0773, + "num_input_tokens_seen": 64722688, + "step": 4023 + }, + { + "epoch": 0.281874140814486, + "grad_norm": 3.939438819885254, + "learning_rate": 7.18390472854641e-05, + "loss": 1.3356, + "num_input_tokens_seen": 64738856, + "step": 4024 + }, + { + "epoch": 0.28194418906021523, + "grad_norm": 3.813849687576294, + "learning_rate": 7.18320490367776e-05, + "loss": 1.0521, + "num_input_tokens_seen": 64755240, + "step": 4025 + }, + { + "epoch": 0.2820142373059445, + "grad_norm": 3.5874619483947754, + "learning_rate": 7.182505078809108e-05, + "loss": 1.0328, + "num_input_tokens_seen": 64771184, + "step": 4026 + }, + { + "epoch": 0.2820842855516737, + "grad_norm": 4.544376850128174, + "learning_rate": 7.181805253940456e-05, + "loss": 1.1132, + "num_input_tokens_seen": 64787568, + "step": 4027 + }, + { + "epoch": 0.28215433379740296, + "grad_norm": 3.6816799640655518, + "learning_rate": 7.181105429071803e-05, + "loss": 1.1088, + "num_input_tokens_seen": 64803064, + "step": 4028 + }, + { + "epoch": 0.2822243820431322, + "grad_norm": 7.1433939933776855, + "learning_rate": 7.180405604203152e-05, + "loss": 1.0069, + "num_input_tokens_seen": 64818736, + "step": 4029 + }, + { + "epoch": 0.2822944302888615, + "grad_norm": 4.308315753936768, + "learning_rate": 7.179705779334501e-05, + "loss": 1.1992, + "num_input_tokens_seen": 64834848, + "step": 4030 + }, + { + "epoch": 0.2823644785345907, + "grad_norm": 4.985830783843994, + "learning_rate": 7.17900595446585e-05, + "loss": 1.1996, + "num_input_tokens_seen": 64851224, + "step": 4031 + }, + { + "epoch": 0.28243452678031994, + "grad_norm": 4.884370803833008, + "learning_rate": 7.178306129597199e-05, + "loss": 1.0541, + "num_input_tokens_seen": 64867608, + "step": 4032 + }, + { + "epoch": 0.2825045750260492, + "grad_norm": 4.335781097412109, + "learning_rate": 7.177606304728548e-05, + "loss": 1.0596, + "num_input_tokens_seen": 64883840, + "step": 4033 + }, + { + "epoch": 0.28257462327177846, + "grad_norm": 3.729811191558838, + "learning_rate": 7.176906479859895e-05, + "loss": 1.0167, + "num_input_tokens_seen": 64899872, + "step": 4034 + }, + { + "epoch": 0.28264467151750766, + "grad_norm": 3.7386136054992676, + "learning_rate": 7.176206654991244e-05, + "loss": 0.7835, + "num_input_tokens_seen": 64916256, + "step": 4035 + }, + { + "epoch": 0.2827147197632369, + "grad_norm": 3.8022067546844482, + "learning_rate": 7.175506830122593e-05, + "loss": 1.0571, + "num_input_tokens_seen": 64932640, + "step": 4036 + }, + { + "epoch": 0.2827847680089662, + "grad_norm": 4.713296890258789, + "learning_rate": 7.17480700525394e-05, + "loss": 1.2877, + "num_input_tokens_seen": 64948520, + "step": 4037 + }, + { + "epoch": 0.28285481625469544, + "grad_norm": 3.682568073272705, + "learning_rate": 7.174107180385289e-05, + "loss": 1.0193, + "num_input_tokens_seen": 64964904, + "step": 4038 + }, + { + "epoch": 0.28292486450042464, + "grad_norm": 4.533677101135254, + "learning_rate": 7.173407355516638e-05, + "loss": 1.133, + "num_input_tokens_seen": 64981288, + "step": 4039 + }, + { + "epoch": 0.2829949127461539, + "grad_norm": 4.343021392822266, + "learning_rate": 7.172707530647987e-05, + "loss": 1.2843, + "num_input_tokens_seen": 64997640, + "step": 4040 + }, + { + "epoch": 0.28306496099188316, + "grad_norm": 4.942739009857178, + "learning_rate": 7.172007705779334e-05, + "loss": 1.1391, + "num_input_tokens_seen": 65012456, + "step": 4041 + }, + { + "epoch": 0.2831350092376124, + "grad_norm": 6.1112213134765625, + "learning_rate": 7.171307880910683e-05, + "loss": 1.0135, + "num_input_tokens_seen": 65028840, + "step": 4042 + }, + { + "epoch": 0.2832050574833416, + "grad_norm": 4.650609016418457, + "learning_rate": 7.170608056042032e-05, + "loss": 0.8585, + "num_input_tokens_seen": 65044608, + "step": 4043 + }, + { + "epoch": 0.2832751057290709, + "grad_norm": 5.383882522583008, + "learning_rate": 7.169908231173381e-05, + "loss": 1.3442, + "num_input_tokens_seen": 65060992, + "step": 4044 + }, + { + "epoch": 0.28334515397480015, + "grad_norm": 3.569399118423462, + "learning_rate": 7.169208406304729e-05, + "loss": 1.0645, + "num_input_tokens_seen": 65077096, + "step": 4045 + }, + { + "epoch": 0.2834152022205294, + "grad_norm": 5.199350833892822, + "learning_rate": 7.168508581436079e-05, + "loss": 1.0178, + "num_input_tokens_seen": 65093480, + "step": 4046 + }, + { + "epoch": 0.2834852504662586, + "grad_norm": 4.172554969787598, + "learning_rate": 7.167808756567426e-05, + "loss": 1.1721, + "num_input_tokens_seen": 65109864, + "step": 4047 + }, + { + "epoch": 0.28355529871198787, + "grad_norm": 3.822197437286377, + "learning_rate": 7.167108931698774e-05, + "loss": 0.9076, + "num_input_tokens_seen": 65126248, + "step": 4048 + }, + { + "epoch": 0.28362534695771713, + "grad_norm": 3.8899435997009277, + "learning_rate": 7.166409106830123e-05, + "loss": 1.1228, + "num_input_tokens_seen": 65141984, + "step": 4049 + }, + { + "epoch": 0.2836953952034464, + "grad_norm": 4.559451580047607, + "learning_rate": 7.165709281961471e-05, + "loss": 1.0732, + "num_input_tokens_seen": 65157984, + "step": 4050 + }, + { + "epoch": 0.28376544344917565, + "grad_norm": 5.253831386566162, + "learning_rate": 7.16500945709282e-05, + "loss": 1.1104, + "num_input_tokens_seen": 65174040, + "step": 4051 + }, + { + "epoch": 0.28383549169490485, + "grad_norm": 3.827268123626709, + "learning_rate": 7.164309632224169e-05, + "loss": 1.0689, + "num_input_tokens_seen": 65190424, + "step": 4052 + }, + { + "epoch": 0.2839055399406341, + "grad_norm": 4.432236194610596, + "learning_rate": 7.163609807355518e-05, + "loss": 0.9357, + "num_input_tokens_seen": 65206808, + "step": 4053 + }, + { + "epoch": 0.28397558818636337, + "grad_norm": 5.008002281188965, + "learning_rate": 7.162909982486866e-05, + "loss": 1.1584, + "num_input_tokens_seen": 65222744, + "step": 4054 + }, + { + "epoch": 0.28404563643209263, + "grad_norm": 3.748089551925659, + "learning_rate": 7.162210157618213e-05, + "loss": 1.0242, + "num_input_tokens_seen": 65238592, + "step": 4055 + }, + { + "epoch": 0.28411568467782183, + "grad_norm": 4.073843002319336, + "learning_rate": 7.161510332749562e-05, + "loss": 1.0629, + "num_input_tokens_seen": 65254464, + "step": 4056 + }, + { + "epoch": 0.2841857329235511, + "grad_norm": 4.0271100997924805, + "learning_rate": 7.160810507880911e-05, + "loss": 1.0191, + "num_input_tokens_seen": 65269744, + "step": 4057 + }, + { + "epoch": 0.28425578116928035, + "grad_norm": 4.266842365264893, + "learning_rate": 7.16011068301226e-05, + "loss": 1.0061, + "num_input_tokens_seen": 65286128, + "step": 4058 + }, + { + "epoch": 0.2843258294150096, + "grad_norm": 3.4473531246185303, + "learning_rate": 7.159410858143608e-05, + "loss": 0.8837, + "num_input_tokens_seen": 65301864, + "step": 4059 + }, + { + "epoch": 0.2843958776607388, + "grad_norm": 3.717029333114624, + "learning_rate": 7.158711033274957e-05, + "loss": 1.0704, + "num_input_tokens_seen": 65317880, + "step": 4060 + }, + { + "epoch": 0.2844659259064681, + "grad_norm": 4.008082866668701, + "learning_rate": 7.158011208406305e-05, + "loss": 1.0322, + "num_input_tokens_seen": 65334096, + "step": 4061 + }, + { + "epoch": 0.28453597415219734, + "grad_norm": 5.350658893585205, + "learning_rate": 7.157311383537654e-05, + "loss": 1.1277, + "num_input_tokens_seen": 65348288, + "step": 4062 + }, + { + "epoch": 0.2846060223979266, + "grad_norm": 8.911882400512695, + "learning_rate": 7.156611558669003e-05, + "loss": 1.0978, + "num_input_tokens_seen": 65364672, + "step": 4063 + }, + { + "epoch": 0.2846760706436558, + "grad_norm": 4.207833766937256, + "learning_rate": 7.155911733800351e-05, + "loss": 1.1248, + "num_input_tokens_seen": 65380600, + "step": 4064 + }, + { + "epoch": 0.28474611888938506, + "grad_norm": 3.492713689804077, + "learning_rate": 7.155211908931699e-05, + "loss": 0.9513, + "num_input_tokens_seen": 65396920, + "step": 4065 + }, + { + "epoch": 0.2848161671351143, + "grad_norm": 3.866763114929199, + "learning_rate": 7.154512084063048e-05, + "loss": 0.9899, + "num_input_tokens_seen": 65413136, + "step": 4066 + }, + { + "epoch": 0.2848862153808436, + "grad_norm": 4.352143287658691, + "learning_rate": 7.153812259194397e-05, + "loss": 1.097, + "num_input_tokens_seen": 65428368, + "step": 4067 + }, + { + "epoch": 0.2849562636265728, + "grad_norm": 5.335500717163086, + "learning_rate": 7.153112434325744e-05, + "loss": 1.1697, + "num_input_tokens_seen": 65444752, + "step": 4068 + }, + { + "epoch": 0.28502631187230204, + "grad_norm": 3.7467970848083496, + "learning_rate": 7.152412609457093e-05, + "loss": 0.9655, + "num_input_tokens_seen": 65461136, + "step": 4069 + }, + { + "epoch": 0.2850963601180313, + "grad_norm": 3.410472869873047, + "learning_rate": 7.151712784588442e-05, + "loss": 0.8464, + "num_input_tokens_seen": 65477520, + "step": 4070 + }, + { + "epoch": 0.28516640836376056, + "grad_norm": 6.551929950714111, + "learning_rate": 7.151012959719791e-05, + "loss": 1.0369, + "num_input_tokens_seen": 65493904, + "step": 4071 + }, + { + "epoch": 0.28523645660948976, + "grad_norm": 3.4140212535858154, + "learning_rate": 7.150313134851138e-05, + "loss": 1.0508, + "num_input_tokens_seen": 65510288, + "step": 4072 + }, + { + "epoch": 0.285306504855219, + "grad_norm": 4.227553367614746, + "learning_rate": 7.149613309982488e-05, + "loss": 1.0793, + "num_input_tokens_seen": 65526672, + "step": 4073 + }, + { + "epoch": 0.2853765531009483, + "grad_norm": 4.202794551849365, + "learning_rate": 7.148913485113836e-05, + "loss": 1.1393, + "num_input_tokens_seen": 65542456, + "step": 4074 + }, + { + "epoch": 0.28544660134667754, + "grad_norm": 5.172013759613037, + "learning_rate": 7.148213660245183e-05, + "loss": 1.2451, + "num_input_tokens_seen": 65558384, + "step": 4075 + }, + { + "epoch": 0.28551664959240675, + "grad_norm": 3.716113567352295, + "learning_rate": 7.147513835376532e-05, + "loss": 0.8515, + "num_input_tokens_seen": 65574768, + "step": 4076 + }, + { + "epoch": 0.285586697838136, + "grad_norm": 8.10258674621582, + "learning_rate": 7.146814010507881e-05, + "loss": 1.0737, + "num_input_tokens_seen": 65590632, + "step": 4077 + }, + { + "epoch": 0.28565674608386527, + "grad_norm": 3.649273157119751, + "learning_rate": 7.14611418563923e-05, + "loss": 1.0376, + "num_input_tokens_seen": 65607016, + "step": 4078 + }, + { + "epoch": 0.2857267943295945, + "grad_norm": 4.202502250671387, + "learning_rate": 7.145414360770579e-05, + "loss": 1.1102, + "num_input_tokens_seen": 65622856, + "step": 4079 + }, + { + "epoch": 0.28579684257532373, + "grad_norm": 4.027415752410889, + "learning_rate": 7.144714535901928e-05, + "loss": 1.26, + "num_input_tokens_seen": 65639240, + "step": 4080 + }, + { + "epoch": 0.285866890821053, + "grad_norm": 4.549161434173584, + "learning_rate": 7.144014711033275e-05, + "loss": 1.1598, + "num_input_tokens_seen": 65655624, + "step": 4081 + }, + { + "epoch": 0.28593693906678225, + "grad_norm": 4.43501615524292, + "learning_rate": 7.143314886164623e-05, + "loss": 1.0735, + "num_input_tokens_seen": 65671016, + "step": 4082 + }, + { + "epoch": 0.2860069873125115, + "grad_norm": 3.739610433578491, + "learning_rate": 7.142615061295972e-05, + "loss": 1.0321, + "num_input_tokens_seen": 65687072, + "step": 4083 + }, + { + "epoch": 0.2860770355582407, + "grad_norm": 3.725759506225586, + "learning_rate": 7.14191523642732e-05, + "loss": 1.0712, + "num_input_tokens_seen": 65703456, + "step": 4084 + }, + { + "epoch": 0.28614708380396997, + "grad_norm": 3.706056594848633, + "learning_rate": 7.14121541155867e-05, + "loss": 1.0643, + "num_input_tokens_seen": 65719552, + "step": 4085 + }, + { + "epoch": 0.28621713204969923, + "grad_norm": 4.971164703369141, + "learning_rate": 7.140515586690018e-05, + "loss": 1.2084, + "num_input_tokens_seen": 65735936, + "step": 4086 + }, + { + "epoch": 0.2862871802954285, + "grad_norm": 7.377131938934326, + "learning_rate": 7.139815761821367e-05, + "loss": 0.8867, + "num_input_tokens_seen": 65752320, + "step": 4087 + }, + { + "epoch": 0.28635722854115775, + "grad_norm": 4.293169975280762, + "learning_rate": 7.139115936952715e-05, + "loss": 1.0805, + "num_input_tokens_seen": 65768704, + "step": 4088 + }, + { + "epoch": 0.28642727678688695, + "grad_norm": 3.4757955074310303, + "learning_rate": 7.138416112084063e-05, + "loss": 0.9749, + "num_input_tokens_seen": 65785088, + "step": 4089 + }, + { + "epoch": 0.2864973250326162, + "grad_norm": 4.5705695152282715, + "learning_rate": 7.137716287215412e-05, + "loss": 1.209, + "num_input_tokens_seen": 65801472, + "step": 4090 + }, + { + "epoch": 0.28656737327834547, + "grad_norm": 5.240487575531006, + "learning_rate": 7.137016462346761e-05, + "loss": 0.9684, + "num_input_tokens_seen": 65817856, + "step": 4091 + }, + { + "epoch": 0.28663742152407473, + "grad_norm": 3.7815425395965576, + "learning_rate": 7.136316637478109e-05, + "loss": 0.9431, + "num_input_tokens_seen": 65833872, + "step": 4092 + }, + { + "epoch": 0.28670746976980394, + "grad_norm": 5.411090850830078, + "learning_rate": 7.135616812609457e-05, + "loss": 1.1237, + "num_input_tokens_seen": 65849064, + "step": 4093 + }, + { + "epoch": 0.2867775180155332, + "grad_norm": 4.07004451751709, + "learning_rate": 7.134916987740806e-05, + "loss": 1.0168, + "num_input_tokens_seen": 65865448, + "step": 4094 + }, + { + "epoch": 0.28684756626126245, + "grad_norm": 3.636051893234253, + "learning_rate": 7.134217162872154e-05, + "loss": 0.9363, + "num_input_tokens_seen": 65881320, + "step": 4095 + }, + { + "epoch": 0.2869176145069917, + "grad_norm": 4.265620708465576, + "learning_rate": 7.133517338003503e-05, + "loss": 1.2098, + "num_input_tokens_seen": 65896832, + "step": 4096 + }, + { + "epoch": 0.2869876627527209, + "grad_norm": 4.145105838775635, + "learning_rate": 7.132817513134852e-05, + "loss": 0.9785, + "num_input_tokens_seen": 65912960, + "step": 4097 + }, + { + "epoch": 0.2870577109984502, + "grad_norm": 3.6198408603668213, + "learning_rate": 7.1321176882662e-05, + "loss": 1.0276, + "num_input_tokens_seen": 65929344, + "step": 4098 + }, + { + "epoch": 0.28712775924417944, + "grad_norm": 4.000823497772217, + "learning_rate": 7.131417863397548e-05, + "loss": 1.2109, + "num_input_tokens_seen": 65945480, + "step": 4099 + }, + { + "epoch": 0.2871978074899087, + "grad_norm": 4.2647271156311035, + "learning_rate": 7.130718038528898e-05, + "loss": 1.1588, + "num_input_tokens_seen": 65961672, + "step": 4100 + }, + { + "epoch": 0.2872678557356379, + "grad_norm": 4.704364776611328, + "learning_rate": 7.130018213660246e-05, + "loss": 1.0707, + "num_input_tokens_seen": 65976848, + "step": 4101 + }, + { + "epoch": 0.28733790398136716, + "grad_norm": 3.8795642852783203, + "learning_rate": 7.129318388791593e-05, + "loss": 1.0087, + "num_input_tokens_seen": 65993120, + "step": 4102 + }, + { + "epoch": 0.2874079522270964, + "grad_norm": 4.356956958770752, + "learning_rate": 7.128618563922942e-05, + "loss": 1.4218, + "num_input_tokens_seen": 66008448, + "step": 4103 + }, + { + "epoch": 0.2874780004728257, + "grad_norm": 3.5145177841186523, + "learning_rate": 7.127918739054291e-05, + "loss": 0.9055, + "num_input_tokens_seen": 66024712, + "step": 4104 + }, + { + "epoch": 0.2875480487185549, + "grad_norm": 3.7384872436523438, + "learning_rate": 7.12721891418564e-05, + "loss": 1.0574, + "num_input_tokens_seen": 66041096, + "step": 4105 + }, + { + "epoch": 0.28761809696428414, + "grad_norm": 3.9706084728240967, + "learning_rate": 7.126519089316989e-05, + "loss": 1.1538, + "num_input_tokens_seen": 66056880, + "step": 4106 + }, + { + "epoch": 0.2876881452100134, + "grad_norm": 3.692093849182129, + "learning_rate": 7.125819264448337e-05, + "loss": 0.9421, + "num_input_tokens_seen": 66073264, + "step": 4107 + }, + { + "epoch": 0.28775819345574266, + "grad_norm": 4.967808246612549, + "learning_rate": 7.125119439579685e-05, + "loss": 0.8829, + "num_input_tokens_seen": 66089648, + "step": 4108 + }, + { + "epoch": 0.28782824170147187, + "grad_norm": 3.8627805709838867, + "learning_rate": 7.124419614711032e-05, + "loss": 1.1056, + "num_input_tokens_seen": 66105992, + "step": 4109 + }, + { + "epoch": 0.2878982899472011, + "grad_norm": 3.7407474517822266, + "learning_rate": 7.123719789842381e-05, + "loss": 1.0241, + "num_input_tokens_seen": 66122040, + "step": 4110 + }, + { + "epoch": 0.2879683381929304, + "grad_norm": 4.028223514556885, + "learning_rate": 7.123019964973732e-05, + "loss": 1.161, + "num_input_tokens_seen": 66138056, + "step": 4111 + }, + { + "epoch": 0.28803838643865964, + "grad_norm": 4.248149394989014, + "learning_rate": 7.122320140105079e-05, + "loss": 1.083, + "num_input_tokens_seen": 66154384, + "step": 4112 + }, + { + "epoch": 0.28810843468438885, + "grad_norm": 3.49904465675354, + "learning_rate": 7.121620315236428e-05, + "loss": 1.0217, + "num_input_tokens_seen": 66170016, + "step": 4113 + }, + { + "epoch": 0.2881784829301181, + "grad_norm": 5.039339542388916, + "learning_rate": 7.120920490367777e-05, + "loss": 0.8658, + "num_input_tokens_seen": 66185744, + "step": 4114 + }, + { + "epoch": 0.28824853117584737, + "grad_norm": 3.800870656967163, + "learning_rate": 7.120220665499124e-05, + "loss": 1.1031, + "num_input_tokens_seen": 66202128, + "step": 4115 + }, + { + "epoch": 0.2883185794215766, + "grad_norm": 4.8073530197143555, + "learning_rate": 7.119520840630473e-05, + "loss": 1.1191, + "num_input_tokens_seen": 66217840, + "step": 4116 + }, + { + "epoch": 0.28838862766730583, + "grad_norm": 3.495415210723877, + "learning_rate": 7.118821015761822e-05, + "loss": 0.8693, + "num_input_tokens_seen": 66234224, + "step": 4117 + }, + { + "epoch": 0.2884586759130351, + "grad_norm": 4.46912956237793, + "learning_rate": 7.118121190893171e-05, + "loss": 1.2077, + "num_input_tokens_seen": 66249968, + "step": 4118 + }, + { + "epoch": 0.28852872415876435, + "grad_norm": 4.553129196166992, + "learning_rate": 7.117421366024518e-05, + "loss": 1.1039, + "num_input_tokens_seen": 66265304, + "step": 4119 + }, + { + "epoch": 0.2885987724044936, + "grad_norm": 3.713836193084717, + "learning_rate": 7.116721541155867e-05, + "loss": 1.0833, + "num_input_tokens_seen": 66281680, + "step": 4120 + }, + { + "epoch": 0.28866882065022287, + "grad_norm": 3.9745819568634033, + "learning_rate": 7.116021716287216e-05, + "loss": 1.1524, + "num_input_tokens_seen": 66298064, + "step": 4121 + }, + { + "epoch": 0.28873886889595207, + "grad_norm": 6.237453937530518, + "learning_rate": 7.115321891418564e-05, + "loss": 1.3598, + "num_input_tokens_seen": 66314448, + "step": 4122 + }, + { + "epoch": 0.28880891714168133, + "grad_norm": 3.7947497367858887, + "learning_rate": 7.114622066549912e-05, + "loss": 0.9342, + "num_input_tokens_seen": 66330832, + "step": 4123 + }, + { + "epoch": 0.2888789653874106, + "grad_norm": 5.574815273284912, + "learning_rate": 7.113922241681261e-05, + "loss": 1.1212, + "num_input_tokens_seen": 66347216, + "step": 4124 + }, + { + "epoch": 0.28894901363313985, + "grad_norm": 3.538344144821167, + "learning_rate": 7.11322241681261e-05, + "loss": 1.0205, + "num_input_tokens_seen": 66363352, + "step": 4125 + }, + { + "epoch": 0.28901906187886905, + "grad_norm": 3.792769193649292, + "learning_rate": 7.112522591943958e-05, + "loss": 1.1266, + "num_input_tokens_seen": 66379736, + "step": 4126 + }, + { + "epoch": 0.2890891101245983, + "grad_norm": 4.527935981750488, + "learning_rate": 7.111822767075308e-05, + "loss": 1.0124, + "num_input_tokens_seen": 66396120, + "step": 4127 + }, + { + "epoch": 0.2891591583703276, + "grad_norm": 3.753326416015625, + "learning_rate": 7.111122942206655e-05, + "loss": 0.9993, + "num_input_tokens_seen": 66412424, + "step": 4128 + }, + { + "epoch": 0.28922920661605683, + "grad_norm": 4.310519218444824, + "learning_rate": 7.110423117338003e-05, + "loss": 1.0481, + "num_input_tokens_seen": 66428176, + "step": 4129 + }, + { + "epoch": 0.28929925486178604, + "grad_norm": 3.9848945140838623, + "learning_rate": 7.109723292469352e-05, + "loss": 1.2687, + "num_input_tokens_seen": 66444560, + "step": 4130 + }, + { + "epoch": 0.2893693031075153, + "grad_norm": 4.654316425323486, + "learning_rate": 7.109023467600702e-05, + "loss": 1.0025, + "num_input_tokens_seen": 66460944, + "step": 4131 + }, + { + "epoch": 0.28943935135324456, + "grad_norm": 4.566670894622803, + "learning_rate": 7.10832364273205e-05, + "loss": 0.9224, + "num_input_tokens_seen": 66475928, + "step": 4132 + }, + { + "epoch": 0.2895093995989738, + "grad_norm": 4.4292988777160645, + "learning_rate": 7.107623817863398e-05, + "loss": 1.0922, + "num_input_tokens_seen": 66491904, + "step": 4133 + }, + { + "epoch": 0.289579447844703, + "grad_norm": 6.520173072814941, + "learning_rate": 7.106923992994747e-05, + "loss": 0.9938, + "num_input_tokens_seen": 66507256, + "step": 4134 + }, + { + "epoch": 0.2896494960904323, + "grad_norm": 3.8424220085144043, + "learning_rate": 7.106224168126095e-05, + "loss": 1.0857, + "num_input_tokens_seen": 66522736, + "step": 4135 + }, + { + "epoch": 0.28971954433616154, + "grad_norm": 4.742796897888184, + "learning_rate": 7.105524343257442e-05, + "loss": 1.0296, + "num_input_tokens_seen": 66538480, + "step": 4136 + }, + { + "epoch": 0.2897895925818908, + "grad_norm": 3.552365779876709, + "learning_rate": 7.104824518388792e-05, + "loss": 1.0597, + "num_input_tokens_seen": 66554576, + "step": 4137 + }, + { + "epoch": 0.28985964082762, + "grad_norm": 6.649835109710693, + "learning_rate": 7.104124693520141e-05, + "loss": 0.9729, + "num_input_tokens_seen": 66570000, + "step": 4138 + }, + { + "epoch": 0.28992968907334926, + "grad_norm": 3.9890356063842773, + "learning_rate": 7.103424868651489e-05, + "loss": 0.9774, + "num_input_tokens_seen": 66585640, + "step": 4139 + }, + { + "epoch": 0.2899997373190785, + "grad_norm": 3.80637526512146, + "learning_rate": 7.102725043782838e-05, + "loss": 1.0373, + "num_input_tokens_seen": 66601696, + "step": 4140 + }, + { + "epoch": 0.2900697855648078, + "grad_norm": 4.089916706085205, + "learning_rate": 7.102025218914186e-05, + "loss": 1.0919, + "num_input_tokens_seen": 66618080, + "step": 4141 + }, + { + "epoch": 0.290139833810537, + "grad_norm": 3.2609710693359375, + "learning_rate": 7.101325394045534e-05, + "loss": 0.9409, + "num_input_tokens_seen": 66634216, + "step": 4142 + }, + { + "epoch": 0.29020988205626624, + "grad_norm": 4.3664093017578125, + "learning_rate": 7.100625569176883e-05, + "loss": 0.9031, + "num_input_tokens_seen": 66650600, + "step": 4143 + }, + { + "epoch": 0.2902799303019955, + "grad_norm": 4.460801124572754, + "learning_rate": 7.099925744308232e-05, + "loss": 1.0582, + "num_input_tokens_seen": 66666592, + "step": 4144 + }, + { + "epoch": 0.29034997854772476, + "grad_norm": 4.474677562713623, + "learning_rate": 7.09922591943958e-05, + "loss": 1.0016, + "num_input_tokens_seen": 66681544, + "step": 4145 + }, + { + "epoch": 0.29042002679345397, + "grad_norm": 3.6482129096984863, + "learning_rate": 7.098526094570928e-05, + "loss": 1.0823, + "num_input_tokens_seen": 66697928, + "step": 4146 + }, + { + "epoch": 0.2904900750391832, + "grad_norm": 3.483290195465088, + "learning_rate": 7.097826269702277e-05, + "loss": 0.8853, + "num_input_tokens_seen": 66714312, + "step": 4147 + }, + { + "epoch": 0.2905601232849125, + "grad_norm": 4.703539848327637, + "learning_rate": 7.097126444833626e-05, + "loss": 0.9718, + "num_input_tokens_seen": 66729632, + "step": 4148 + }, + { + "epoch": 0.29063017153064175, + "grad_norm": 3.8614907264709473, + "learning_rate": 7.096426619964973e-05, + "loss": 1.0047, + "num_input_tokens_seen": 66746016, + "step": 4149 + }, + { + "epoch": 0.29070021977637095, + "grad_norm": 3.612683057785034, + "learning_rate": 7.095726795096322e-05, + "loss": 1.1783, + "num_input_tokens_seen": 66762400, + "step": 4150 + }, + { + "epoch": 0.2907702680221002, + "grad_norm": 3.980149984359741, + "learning_rate": 7.095026970227672e-05, + "loss": 0.9993, + "num_input_tokens_seen": 66778392, + "step": 4151 + }, + { + "epoch": 0.29084031626782947, + "grad_norm": 3.857588052749634, + "learning_rate": 7.09432714535902e-05, + "loss": 1.0506, + "num_input_tokens_seen": 66794200, + "step": 4152 + }, + { + "epoch": 0.29091036451355873, + "grad_norm": 5.106949806213379, + "learning_rate": 7.093627320490367e-05, + "loss": 1.2222, + "num_input_tokens_seen": 66810584, + "step": 4153 + }, + { + "epoch": 0.29098041275928793, + "grad_norm": 4.338438987731934, + "learning_rate": 7.092927495621718e-05, + "loss": 1.1203, + "num_input_tokens_seen": 66826208, + "step": 4154 + }, + { + "epoch": 0.2910504610050172, + "grad_norm": 3.962877035140991, + "learning_rate": 7.092227670753065e-05, + "loss": 1.1026, + "num_input_tokens_seen": 66842592, + "step": 4155 + }, + { + "epoch": 0.29112050925074645, + "grad_norm": 3.8490965366363525, + "learning_rate": 7.091527845884413e-05, + "loss": 0.9551, + "num_input_tokens_seen": 66858832, + "step": 4156 + }, + { + "epoch": 0.2911905574964757, + "grad_norm": 4.559625148773193, + "learning_rate": 7.090828021015763e-05, + "loss": 1.3951, + "num_input_tokens_seen": 66875216, + "step": 4157 + }, + { + "epoch": 0.29126060574220497, + "grad_norm": 8.37543773651123, + "learning_rate": 7.090128196147112e-05, + "loss": 1.2365, + "num_input_tokens_seen": 66891600, + "step": 4158 + }, + { + "epoch": 0.2913306539879342, + "grad_norm": 4.128559112548828, + "learning_rate": 7.089428371278459e-05, + "loss": 0.8789, + "num_input_tokens_seen": 66907984, + "step": 4159 + }, + { + "epoch": 0.29140070223366343, + "grad_norm": 4.81403112411499, + "learning_rate": 7.088728546409808e-05, + "loss": 1.1149, + "num_input_tokens_seen": 66923240, + "step": 4160 + }, + { + "epoch": 0.2914707504793927, + "grad_norm": 4.534300804138184, + "learning_rate": 7.088028721541157e-05, + "loss": 0.8906, + "num_input_tokens_seen": 66939624, + "step": 4161 + }, + { + "epoch": 0.29154079872512195, + "grad_norm": 4.46708869934082, + "learning_rate": 7.087328896672504e-05, + "loss": 0.873, + "num_input_tokens_seen": 66955968, + "step": 4162 + }, + { + "epoch": 0.29161084697085116, + "grad_norm": 4.142822265625, + "learning_rate": 7.086629071803853e-05, + "loss": 0.8286, + "num_input_tokens_seen": 66971680, + "step": 4163 + }, + { + "epoch": 0.2916808952165804, + "grad_norm": 3.686167001724243, + "learning_rate": 7.085929246935202e-05, + "loss": 0.897, + "num_input_tokens_seen": 66987952, + "step": 4164 + }, + { + "epoch": 0.2917509434623097, + "grad_norm": 8.076430320739746, + "learning_rate": 7.085229422066551e-05, + "loss": 1.1215, + "num_input_tokens_seen": 67004336, + "step": 4165 + }, + { + "epoch": 0.29182099170803893, + "grad_norm": 8.69857120513916, + "learning_rate": 7.084529597197898e-05, + "loss": 1.2295, + "num_input_tokens_seen": 67020216, + "step": 4166 + }, + { + "epoch": 0.29189103995376814, + "grad_norm": 3.7867684364318848, + "learning_rate": 7.083829772329247e-05, + "loss": 1.058, + "num_input_tokens_seen": 67035600, + "step": 4167 + }, + { + "epoch": 0.2919610881994974, + "grad_norm": 5.560591697692871, + "learning_rate": 7.083129947460596e-05, + "loss": 1.0864, + "num_input_tokens_seen": 67051680, + "step": 4168 + }, + { + "epoch": 0.29203113644522666, + "grad_norm": 3.857120990753174, + "learning_rate": 7.082430122591944e-05, + "loss": 1.1991, + "num_input_tokens_seen": 67068064, + "step": 4169 + }, + { + "epoch": 0.2921011846909559, + "grad_norm": 4.343360900878906, + "learning_rate": 7.081730297723293e-05, + "loss": 1.0973, + "num_input_tokens_seen": 67084448, + "step": 4170 + }, + { + "epoch": 0.2921712329366851, + "grad_norm": 4.198531150817871, + "learning_rate": 7.081030472854643e-05, + "loss": 1.1271, + "num_input_tokens_seen": 67100832, + "step": 4171 + }, + { + "epoch": 0.2922412811824144, + "grad_norm": 3.539684772491455, + "learning_rate": 7.08033064798599e-05, + "loss": 0.9532, + "num_input_tokens_seen": 67117216, + "step": 4172 + }, + { + "epoch": 0.29231132942814364, + "grad_norm": 4.2374444007873535, + "learning_rate": 7.079630823117338e-05, + "loss": 0.9965, + "num_input_tokens_seen": 67133600, + "step": 4173 + }, + { + "epoch": 0.2923813776738729, + "grad_norm": 4.106996059417725, + "learning_rate": 7.078930998248687e-05, + "loss": 1.1141, + "num_input_tokens_seen": 67149984, + "step": 4174 + }, + { + "epoch": 0.2924514259196021, + "grad_norm": 3.7100484371185303, + "learning_rate": 7.078231173380035e-05, + "loss": 1.0702, + "num_input_tokens_seen": 67166168, + "step": 4175 + }, + { + "epoch": 0.29252147416533136, + "grad_norm": 5.189118385314941, + "learning_rate": 7.077531348511383e-05, + "loss": 0.9642, + "num_input_tokens_seen": 67181472, + "step": 4176 + }, + { + "epoch": 0.2925915224110606, + "grad_norm": 4.540155410766602, + "learning_rate": 7.076831523642733e-05, + "loss": 1.0558, + "num_input_tokens_seen": 67197856, + "step": 4177 + }, + { + "epoch": 0.2926615706567899, + "grad_norm": 4.748345375061035, + "learning_rate": 7.076131698774082e-05, + "loss": 0.8845, + "num_input_tokens_seen": 67214240, + "step": 4178 + }, + { + "epoch": 0.2927316189025191, + "grad_norm": 4.252089023590088, + "learning_rate": 7.07543187390543e-05, + "loss": 1.1002, + "num_input_tokens_seen": 67230312, + "step": 4179 + }, + { + "epoch": 0.29280166714824835, + "grad_norm": 4.273370742797852, + "learning_rate": 7.074732049036777e-05, + "loss": 1.1759, + "num_input_tokens_seen": 67246152, + "step": 4180 + }, + { + "epoch": 0.2928717153939776, + "grad_norm": 3.9271481037139893, + "learning_rate": 7.074032224168127e-05, + "loss": 1.0159, + "num_input_tokens_seen": 67261688, + "step": 4181 + }, + { + "epoch": 0.29294176363970686, + "grad_norm": 3.875622034072876, + "learning_rate": 7.073332399299475e-05, + "loss": 1.2345, + "num_input_tokens_seen": 67278072, + "step": 4182 + }, + { + "epoch": 0.29301181188543607, + "grad_norm": 3.8089005947113037, + "learning_rate": 7.072632574430824e-05, + "loss": 1.1025, + "num_input_tokens_seen": 67293760, + "step": 4183 + }, + { + "epoch": 0.29308186013116533, + "grad_norm": 4.402803421020508, + "learning_rate": 7.071932749562172e-05, + "loss": 1.0397, + "num_input_tokens_seen": 67310144, + "step": 4184 + }, + { + "epoch": 0.2931519083768946, + "grad_norm": 4.4534783363342285, + "learning_rate": 7.071232924693521e-05, + "loss": 1.0222, + "num_input_tokens_seen": 67326528, + "step": 4185 + }, + { + "epoch": 0.29322195662262385, + "grad_norm": 4.247747898101807, + "learning_rate": 7.070533099824869e-05, + "loss": 1.0667, + "num_input_tokens_seen": 67342080, + "step": 4186 + }, + { + "epoch": 0.29329200486835305, + "grad_norm": 5.280468463897705, + "learning_rate": 7.069833274956218e-05, + "loss": 1.0492, + "num_input_tokens_seen": 67357168, + "step": 4187 + }, + { + "epoch": 0.2933620531140823, + "grad_norm": 5.14320707321167, + "learning_rate": 7.069133450087567e-05, + "loss": 1.1073, + "num_input_tokens_seen": 67373552, + "step": 4188 + }, + { + "epoch": 0.29343210135981157, + "grad_norm": 4.131645679473877, + "learning_rate": 7.068433625218914e-05, + "loss": 1.3795, + "num_input_tokens_seen": 67389936, + "step": 4189 + }, + { + "epoch": 0.29350214960554083, + "grad_norm": 4.727990627288818, + "learning_rate": 7.067733800350263e-05, + "loss": 1.2066, + "num_input_tokens_seen": 67406320, + "step": 4190 + }, + { + "epoch": 0.2935721978512701, + "grad_norm": 5.857666969299316, + "learning_rate": 7.067033975481612e-05, + "loss": 1.028, + "num_input_tokens_seen": 67422680, + "step": 4191 + }, + { + "epoch": 0.2936422460969993, + "grad_norm": 4.185948371887207, + "learning_rate": 7.06633415061296e-05, + "loss": 1.2738, + "num_input_tokens_seen": 67439064, + "step": 4192 + }, + { + "epoch": 0.29371229434272855, + "grad_norm": 3.749274969100952, + "learning_rate": 7.065634325744308e-05, + "loss": 1.0327, + "num_input_tokens_seen": 67454680, + "step": 4193 + }, + { + "epoch": 0.2937823425884578, + "grad_norm": 4.332368850708008, + "learning_rate": 7.064934500875657e-05, + "loss": 1.0986, + "num_input_tokens_seen": 67470800, + "step": 4194 + }, + { + "epoch": 0.29385239083418707, + "grad_norm": 5.514054775238037, + "learning_rate": 7.064234676007006e-05, + "loss": 1.2602, + "num_input_tokens_seen": 67487184, + "step": 4195 + }, + { + "epoch": 0.2939224390799163, + "grad_norm": 4.534146785736084, + "learning_rate": 7.063534851138353e-05, + "loss": 1.2929, + "num_input_tokens_seen": 67503504, + "step": 4196 + }, + { + "epoch": 0.29399248732564554, + "grad_norm": 4.86776876449585, + "learning_rate": 7.062835026269702e-05, + "loss": 1.111, + "num_input_tokens_seen": 67519056, + "step": 4197 + }, + { + "epoch": 0.2940625355713748, + "grad_norm": 3.8528504371643066, + "learning_rate": 7.062135201401052e-05, + "loss": 0.9151, + "num_input_tokens_seen": 67535440, + "step": 4198 + }, + { + "epoch": 0.29413258381710405, + "grad_norm": 4.244069576263428, + "learning_rate": 7.0614353765324e-05, + "loss": 1.1733, + "num_input_tokens_seen": 67551264, + "step": 4199 + }, + { + "epoch": 0.29420263206283326, + "grad_norm": 3.5963211059570312, + "learning_rate": 7.060735551663747e-05, + "loss": 1.008, + "num_input_tokens_seen": 67567648, + "step": 4200 + }, + { + "epoch": 0.29420263206283326, + "eval_loss": 1.1331984996795654, + "eval_runtime": 0.203, + "eval_samples_per_second": 4.927, + "eval_steps_per_second": 4.927, + "num_input_tokens_seen": 67567648, + "step": 4200 + }, + { + "epoch": 0.2942726803085625, + "grad_norm": 4.51765775680542, + "learning_rate": 7.060035726795096e-05, + "loss": 1.1284, + "num_input_tokens_seen": 67583792, + "step": 4201 + }, + { + "epoch": 0.2943427285542918, + "grad_norm": 4.541067123413086, + "learning_rate": 7.059335901926445e-05, + "loss": 1.1246, + "num_input_tokens_seen": 67599856, + "step": 4202 + }, + { + "epoch": 0.29441277680002104, + "grad_norm": 4.095570087432861, + "learning_rate": 7.058636077057794e-05, + "loss": 1.0087, + "num_input_tokens_seen": 67616240, + "step": 4203 + }, + { + "epoch": 0.29448282504575024, + "grad_norm": 4.616795539855957, + "learning_rate": 7.057936252189143e-05, + "loss": 1.2549, + "num_input_tokens_seen": 67632496, + "step": 4204 + }, + { + "epoch": 0.2945528732914795, + "grad_norm": 3.8619420528411865, + "learning_rate": 7.057236427320492e-05, + "loss": 0.9626, + "num_input_tokens_seen": 67648880, + "step": 4205 + }, + { + "epoch": 0.29462292153720876, + "grad_norm": 4.194519996643066, + "learning_rate": 7.056536602451839e-05, + "loss": 0.958, + "num_input_tokens_seen": 67665264, + "step": 4206 + }, + { + "epoch": 0.294692969782938, + "grad_norm": 4.835122585296631, + "learning_rate": 7.055836777583187e-05, + "loss": 1.0201, + "num_input_tokens_seen": 67681648, + "step": 4207 + }, + { + "epoch": 0.2947630180286672, + "grad_norm": 4.2085280418396, + "learning_rate": 7.055136952714537e-05, + "loss": 0.9584, + "num_input_tokens_seen": 67697960, + "step": 4208 + }, + { + "epoch": 0.2948330662743965, + "grad_norm": 4.439855575561523, + "learning_rate": 7.054437127845884e-05, + "loss": 1.0693, + "num_input_tokens_seen": 67714344, + "step": 4209 + }, + { + "epoch": 0.29490311452012574, + "grad_norm": 5.427484035491943, + "learning_rate": 7.053737302977233e-05, + "loss": 1.1782, + "num_input_tokens_seen": 67730728, + "step": 4210 + }, + { + "epoch": 0.294973162765855, + "grad_norm": 3.6627275943756104, + "learning_rate": 7.053037478108582e-05, + "loss": 0.8507, + "num_input_tokens_seen": 67746704, + "step": 4211 + }, + { + "epoch": 0.2950432110115842, + "grad_norm": 4.450380325317383, + "learning_rate": 7.052337653239931e-05, + "loss": 1.0651, + "num_input_tokens_seen": 67762320, + "step": 4212 + }, + { + "epoch": 0.29511325925731346, + "grad_norm": 3.6644749641418457, + "learning_rate": 7.051637828371279e-05, + "loss": 1.1532, + "num_input_tokens_seen": 67778704, + "step": 4213 + }, + { + "epoch": 0.2951833075030427, + "grad_norm": 4.331392288208008, + "learning_rate": 7.050938003502627e-05, + "loss": 1.0854, + "num_input_tokens_seen": 67795088, + "step": 4214 + }, + { + "epoch": 0.295253355748772, + "grad_norm": 4.157777786254883, + "learning_rate": 7.050238178633976e-05, + "loss": 1.2039, + "num_input_tokens_seen": 67811472, + "step": 4215 + }, + { + "epoch": 0.2953234039945012, + "grad_norm": 3.858069896697998, + "learning_rate": 7.049538353765324e-05, + "loss": 1.1751, + "num_input_tokens_seen": 67827488, + "step": 4216 + }, + { + "epoch": 0.29539345224023045, + "grad_norm": 4.279262542724609, + "learning_rate": 7.048838528896673e-05, + "loss": 1.0344, + "num_input_tokens_seen": 67843872, + "step": 4217 + }, + { + "epoch": 0.2954635004859597, + "grad_norm": 4.539918422698975, + "learning_rate": 7.048138704028021e-05, + "loss": 1.0244, + "num_input_tokens_seen": 67860256, + "step": 4218 + }, + { + "epoch": 0.29553354873168897, + "grad_norm": 3.738811492919922, + "learning_rate": 7.04743887915937e-05, + "loss": 1.067, + "num_input_tokens_seen": 67876224, + "step": 4219 + }, + { + "epoch": 0.29560359697741817, + "grad_norm": 4.634495258331299, + "learning_rate": 7.046739054290718e-05, + "loss": 0.9273, + "num_input_tokens_seen": 67892040, + "step": 4220 + }, + { + "epoch": 0.29567364522314743, + "grad_norm": 5.988262176513672, + "learning_rate": 7.046039229422067e-05, + "loss": 0.956, + "num_input_tokens_seen": 67908424, + "step": 4221 + }, + { + "epoch": 0.2957436934688767, + "grad_norm": 7.2220258712768555, + "learning_rate": 7.045339404553416e-05, + "loss": 1.246, + "num_input_tokens_seen": 67924808, + "step": 4222 + }, + { + "epoch": 0.29581374171460595, + "grad_norm": 8.866394996643066, + "learning_rate": 7.044639579684764e-05, + "loss": 0.9932, + "num_input_tokens_seen": 67941192, + "step": 4223 + }, + { + "epoch": 0.2958837899603352, + "grad_norm": 4.791526794433594, + "learning_rate": 7.043939754816112e-05, + "loss": 1.1966, + "num_input_tokens_seen": 67957576, + "step": 4224 + }, + { + "epoch": 0.2959538382060644, + "grad_norm": 3.8345704078674316, + "learning_rate": 7.043239929947462e-05, + "loss": 0.9754, + "num_input_tokens_seen": 67973112, + "step": 4225 + }, + { + "epoch": 0.29602388645179367, + "grad_norm": 5.0572099685668945, + "learning_rate": 7.04254010507881e-05, + "loss": 1.3761, + "num_input_tokens_seen": 67989360, + "step": 4226 + }, + { + "epoch": 0.29609393469752293, + "grad_norm": 4.467088222503662, + "learning_rate": 7.041840280210157e-05, + "loss": 0.981, + "num_input_tokens_seen": 68005744, + "step": 4227 + }, + { + "epoch": 0.2961639829432522, + "grad_norm": 6.415910243988037, + "learning_rate": 7.041140455341506e-05, + "loss": 1.1376, + "num_input_tokens_seen": 68021592, + "step": 4228 + }, + { + "epoch": 0.2962340311889814, + "grad_norm": 4.432079315185547, + "learning_rate": 7.040440630472855e-05, + "loss": 1.1264, + "num_input_tokens_seen": 68037976, + "step": 4229 + }, + { + "epoch": 0.29630407943471065, + "grad_norm": 4.207062721252441, + "learning_rate": 7.039740805604204e-05, + "loss": 1.2702, + "num_input_tokens_seen": 68054328, + "step": 4230 + }, + { + "epoch": 0.2963741276804399, + "grad_norm": 4.825972557067871, + "learning_rate": 7.039040980735553e-05, + "loss": 1.3091, + "num_input_tokens_seen": 68070416, + "step": 4231 + }, + { + "epoch": 0.2964441759261692, + "grad_norm": 3.917593002319336, + "learning_rate": 7.038341155866901e-05, + "loss": 1.1863, + "num_input_tokens_seen": 68086800, + "step": 4232 + }, + { + "epoch": 0.2965142241718984, + "grad_norm": 3.8865675926208496, + "learning_rate": 7.037641330998249e-05, + "loss": 1.2023, + "num_input_tokens_seen": 68103184, + "step": 4233 + }, + { + "epoch": 0.29658427241762764, + "grad_norm": 3.8321971893310547, + "learning_rate": 7.036941506129596e-05, + "loss": 0.9507, + "num_input_tokens_seen": 68119568, + "step": 4234 + }, + { + "epoch": 0.2966543206633569, + "grad_norm": 5.020960807800293, + "learning_rate": 7.036241681260947e-05, + "loss": 1.0438, + "num_input_tokens_seen": 68135416, + "step": 4235 + }, + { + "epoch": 0.29672436890908616, + "grad_norm": 3.653468608856201, + "learning_rate": 7.035541856392294e-05, + "loss": 1.0664, + "num_input_tokens_seen": 68151800, + "step": 4236 + }, + { + "epoch": 0.29679441715481536, + "grad_norm": 3.8133575916290283, + "learning_rate": 7.034842031523643e-05, + "loss": 1.0867, + "num_input_tokens_seen": 68168184, + "step": 4237 + }, + { + "epoch": 0.2968644654005446, + "grad_norm": 3.6642141342163086, + "learning_rate": 7.034142206654992e-05, + "loss": 0.9505, + "num_input_tokens_seen": 68184080, + "step": 4238 + }, + { + "epoch": 0.2969345136462739, + "grad_norm": 4.362963676452637, + "learning_rate": 7.033442381786341e-05, + "loss": 1.0335, + "num_input_tokens_seen": 68199928, + "step": 4239 + }, + { + "epoch": 0.29700456189200314, + "grad_norm": 3.6831562519073486, + "learning_rate": 7.032742556917688e-05, + "loss": 0.9608, + "num_input_tokens_seen": 68215952, + "step": 4240 + }, + { + "epoch": 0.29707461013773234, + "grad_norm": 4.906534194946289, + "learning_rate": 7.032042732049037e-05, + "loss": 0.9434, + "num_input_tokens_seen": 68232336, + "step": 4241 + }, + { + "epoch": 0.2971446583834616, + "grad_norm": 3.446749687194824, + "learning_rate": 7.031342907180386e-05, + "loss": 0.8306, + "num_input_tokens_seen": 68247832, + "step": 4242 + }, + { + "epoch": 0.29721470662919086, + "grad_norm": 4.729014873504639, + "learning_rate": 7.030643082311735e-05, + "loss": 1.0787, + "num_input_tokens_seen": 68264216, + "step": 4243 + }, + { + "epoch": 0.2972847548749201, + "grad_norm": 4.196920871734619, + "learning_rate": 7.029943257443082e-05, + "loss": 1.1496, + "num_input_tokens_seen": 68280600, + "step": 4244 + }, + { + "epoch": 0.2973548031206493, + "grad_norm": 7.193357467651367, + "learning_rate": 7.029243432574431e-05, + "loss": 1.0509, + "num_input_tokens_seen": 68296984, + "step": 4245 + }, + { + "epoch": 0.2974248513663786, + "grad_norm": 4.00344181060791, + "learning_rate": 7.02854360770578e-05, + "loss": 0.9025, + "num_input_tokens_seen": 68312720, + "step": 4246 + }, + { + "epoch": 0.29749489961210784, + "grad_norm": 4.04103422164917, + "learning_rate": 7.027843782837128e-05, + "loss": 1.1307, + "num_input_tokens_seen": 68328608, + "step": 4247 + }, + { + "epoch": 0.2975649478578371, + "grad_norm": 4.010391712188721, + "learning_rate": 7.027143957968476e-05, + "loss": 1.153, + "num_input_tokens_seen": 68343288, + "step": 4248 + }, + { + "epoch": 0.2976349961035663, + "grad_norm": 6.364760398864746, + "learning_rate": 7.026444133099825e-05, + "loss": 1.1629, + "num_input_tokens_seen": 68359672, + "step": 4249 + }, + { + "epoch": 0.29770504434929557, + "grad_norm": 5.682034969329834, + "learning_rate": 7.025744308231174e-05, + "loss": 1.1388, + "num_input_tokens_seen": 68376056, + "step": 4250 + }, + { + "epoch": 0.2977750925950248, + "grad_norm": 3.6160550117492676, + "learning_rate": 7.025044483362522e-05, + "loss": 1.0105, + "num_input_tokens_seen": 68392440, + "step": 4251 + }, + { + "epoch": 0.2978451408407541, + "grad_norm": 4.839343070983887, + "learning_rate": 7.024344658493872e-05, + "loss": 0.9924, + "num_input_tokens_seen": 68408608, + "step": 4252 + }, + { + "epoch": 0.2979151890864833, + "grad_norm": 5.255819320678711, + "learning_rate": 7.02364483362522e-05, + "loss": 1.1425, + "num_input_tokens_seen": 68424944, + "step": 4253 + }, + { + "epoch": 0.29798523733221255, + "grad_norm": 3.7549142837524414, + "learning_rate": 7.022945008756567e-05, + "loss": 0.8801, + "num_input_tokens_seen": 68441328, + "step": 4254 + }, + { + "epoch": 0.2980552855779418, + "grad_norm": 5.159091472625732, + "learning_rate": 7.022245183887916e-05, + "loss": 1.0075, + "num_input_tokens_seen": 68457712, + "step": 4255 + }, + { + "epoch": 0.29812533382367107, + "grad_norm": 3.8031342029571533, + "learning_rate": 7.021545359019265e-05, + "loss": 0.9975, + "num_input_tokens_seen": 68474072, + "step": 4256 + }, + { + "epoch": 0.29819538206940027, + "grad_norm": 6.039318084716797, + "learning_rate": 7.020845534150613e-05, + "loss": 1.0791, + "num_input_tokens_seen": 68490456, + "step": 4257 + }, + { + "epoch": 0.29826543031512953, + "grad_norm": 3.9376237392425537, + "learning_rate": 7.020145709281962e-05, + "loss": 1.0753, + "num_input_tokens_seen": 68506760, + "step": 4258 + }, + { + "epoch": 0.2983354785608588, + "grad_norm": 4.599661827087402, + "learning_rate": 7.019445884413311e-05, + "loss": 0.9722, + "num_input_tokens_seen": 68523144, + "step": 4259 + }, + { + "epoch": 0.29840552680658805, + "grad_norm": 3.743640661239624, + "learning_rate": 7.018746059544659e-05, + "loss": 1.157, + "num_input_tokens_seen": 68539448, + "step": 4260 + }, + { + "epoch": 0.2984755750523173, + "grad_norm": 6.111955642700195, + "learning_rate": 7.018046234676006e-05, + "loss": 1.2148, + "num_input_tokens_seen": 68555832, + "step": 4261 + }, + { + "epoch": 0.2985456232980465, + "grad_norm": 4.297199249267578, + "learning_rate": 7.017346409807356e-05, + "loss": 0.9796, + "num_input_tokens_seen": 68572216, + "step": 4262 + }, + { + "epoch": 0.2986156715437758, + "grad_norm": 4.126640319824219, + "learning_rate": 7.016646584938705e-05, + "loss": 1.0781, + "num_input_tokens_seen": 68588600, + "step": 4263 + }, + { + "epoch": 0.29868571978950503, + "grad_norm": 3.8142640590667725, + "learning_rate": 7.015946760070053e-05, + "loss": 1.1943, + "num_input_tokens_seen": 68604336, + "step": 4264 + }, + { + "epoch": 0.2987557680352343, + "grad_norm": 3.9500539302825928, + "learning_rate": 7.015246935201402e-05, + "loss": 1.1179, + "num_input_tokens_seen": 68620056, + "step": 4265 + }, + { + "epoch": 0.2988258162809635, + "grad_norm": 4.431976318359375, + "learning_rate": 7.01454711033275e-05, + "loss": 1.3419, + "num_input_tokens_seen": 68636328, + "step": 4266 + }, + { + "epoch": 0.29889586452669276, + "grad_norm": 5.619480609893799, + "learning_rate": 7.013847285464098e-05, + "loss": 1.099, + "num_input_tokens_seen": 68651984, + "step": 4267 + }, + { + "epoch": 0.298965912772422, + "grad_norm": 3.8473827838897705, + "learning_rate": 7.013147460595447e-05, + "loss": 1.1273, + "num_input_tokens_seen": 68668176, + "step": 4268 + }, + { + "epoch": 0.2990359610181513, + "grad_norm": 5.942142486572266, + "learning_rate": 7.012447635726796e-05, + "loss": 1.1058, + "num_input_tokens_seen": 68684560, + "step": 4269 + }, + { + "epoch": 0.2991060092638805, + "grad_norm": 6.194666862487793, + "learning_rate": 7.011747810858145e-05, + "loss": 0.9782, + "num_input_tokens_seen": 68699816, + "step": 4270 + }, + { + "epoch": 0.29917605750960974, + "grad_norm": 4.336294651031494, + "learning_rate": 7.011047985989492e-05, + "loss": 1.0038, + "num_input_tokens_seen": 68716200, + "step": 4271 + }, + { + "epoch": 0.299246105755339, + "grad_norm": 4.277907371520996, + "learning_rate": 7.010348161120841e-05, + "loss": 1.0151, + "num_input_tokens_seen": 68732584, + "step": 4272 + }, + { + "epoch": 0.29931615400106826, + "grad_norm": 5.045118808746338, + "learning_rate": 7.00964833625219e-05, + "loss": 1.0992, + "num_input_tokens_seen": 68748840, + "step": 4273 + }, + { + "epoch": 0.29938620224679746, + "grad_norm": 4.3978400230407715, + "learning_rate": 7.008948511383537e-05, + "loss": 1.1796, + "num_input_tokens_seen": 68765224, + "step": 4274 + }, + { + "epoch": 0.2994562504925267, + "grad_norm": 5.052615165710449, + "learning_rate": 7.008248686514886e-05, + "loss": 1.0557, + "num_input_tokens_seen": 68780808, + "step": 4275 + }, + { + "epoch": 0.299526298738256, + "grad_norm": 6.902999401092529, + "learning_rate": 7.007548861646235e-05, + "loss": 1.0952, + "num_input_tokens_seen": 68797064, + "step": 4276 + }, + { + "epoch": 0.29959634698398524, + "grad_norm": 5.947190761566162, + "learning_rate": 7.006849036777584e-05, + "loss": 1.1163, + "num_input_tokens_seen": 68812904, + "step": 4277 + }, + { + "epoch": 0.29966639522971444, + "grad_norm": 5.443974018096924, + "learning_rate": 7.006149211908931e-05, + "loss": 1.146, + "num_input_tokens_seen": 68828736, + "step": 4278 + }, + { + "epoch": 0.2997364434754437, + "grad_norm": 3.9849112033843994, + "learning_rate": 7.005449387040282e-05, + "loss": 0.9636, + "num_input_tokens_seen": 68843928, + "step": 4279 + }, + { + "epoch": 0.29980649172117296, + "grad_norm": 5.787483215332031, + "learning_rate": 7.004749562171629e-05, + "loss": 1.194, + "num_input_tokens_seen": 68860312, + "step": 4280 + }, + { + "epoch": 0.2998765399669022, + "grad_norm": 3.8437387943267822, + "learning_rate": 7.004049737302977e-05, + "loss": 0.9214, + "num_input_tokens_seen": 68876696, + "step": 4281 + }, + { + "epoch": 0.2999465882126314, + "grad_norm": 3.94879150390625, + "learning_rate": 7.003349912434325e-05, + "loss": 1.1403, + "num_input_tokens_seen": 68893080, + "step": 4282 + }, + { + "epoch": 0.3000166364583607, + "grad_norm": 4.746649265289307, + "learning_rate": 7.002650087565676e-05, + "loss": 1.077, + "num_input_tokens_seen": 68909464, + "step": 4283 + }, + { + "epoch": 0.30008668470408995, + "grad_norm": 4.1024861335754395, + "learning_rate": 7.001950262697023e-05, + "loss": 1.0592, + "num_input_tokens_seen": 68925352, + "step": 4284 + }, + { + "epoch": 0.3001567329498192, + "grad_norm": 4.5073699951171875, + "learning_rate": 7.001250437828372e-05, + "loss": 1.2092, + "num_input_tokens_seen": 68941736, + "step": 4285 + }, + { + "epoch": 0.3002267811955484, + "grad_norm": 4.947534561157227, + "learning_rate": 7.000550612959721e-05, + "loss": 1.1389, + "num_input_tokens_seen": 68958120, + "step": 4286 + }, + { + "epoch": 0.30029682944127767, + "grad_norm": 3.8399429321289062, + "learning_rate": 6.999850788091068e-05, + "loss": 1.1268, + "num_input_tokens_seen": 68974232, + "step": 4287 + }, + { + "epoch": 0.30036687768700693, + "grad_norm": 3.9180405139923096, + "learning_rate": 6.999150963222416e-05, + "loss": 1.1666, + "num_input_tokens_seen": 68990616, + "step": 4288 + }, + { + "epoch": 0.3004369259327362, + "grad_norm": 3.9542794227600098, + "learning_rate": 6.998451138353766e-05, + "loss": 1.1474, + "num_input_tokens_seen": 69006952, + "step": 4289 + }, + { + "epoch": 0.3005069741784654, + "grad_norm": 3.5275325775146484, + "learning_rate": 6.997751313485115e-05, + "loss": 1.1239, + "num_input_tokens_seen": 69023336, + "step": 4290 + }, + { + "epoch": 0.30057702242419465, + "grad_norm": 3.9485349655151367, + "learning_rate": 6.997051488616462e-05, + "loss": 0.9736, + "num_input_tokens_seen": 69038392, + "step": 4291 + }, + { + "epoch": 0.3006470706699239, + "grad_norm": 3.4944114685058594, + "learning_rate": 6.996351663747811e-05, + "loss": 0.7473, + "num_input_tokens_seen": 69054160, + "step": 4292 + }, + { + "epoch": 0.30071711891565317, + "grad_norm": 3.387148380279541, + "learning_rate": 6.99565183887916e-05, + "loss": 0.9142, + "num_input_tokens_seen": 69070056, + "step": 4293 + }, + { + "epoch": 0.30078716716138243, + "grad_norm": 3.9591586589813232, + "learning_rate": 6.994952014010508e-05, + "loss": 1.133, + "num_input_tokens_seen": 69086240, + "step": 4294 + }, + { + "epoch": 0.30085721540711163, + "grad_norm": 8.32682991027832, + "learning_rate": 6.994252189141857e-05, + "loss": 1.1697, + "num_input_tokens_seen": 69102408, + "step": 4295 + }, + { + "epoch": 0.3009272636528409, + "grad_norm": 3.5885214805603027, + "learning_rate": 6.993552364273205e-05, + "loss": 1.0626, + "num_input_tokens_seen": 69118376, + "step": 4296 + }, + { + "epoch": 0.30099731189857015, + "grad_norm": 4.784765243530273, + "learning_rate": 6.992852539404554e-05, + "loss": 0.9771, + "num_input_tokens_seen": 69133664, + "step": 4297 + }, + { + "epoch": 0.3010673601442994, + "grad_norm": 6.456319808959961, + "learning_rate": 6.992152714535902e-05, + "loss": 1.3836, + "num_input_tokens_seen": 69148224, + "step": 4298 + }, + { + "epoch": 0.3011374083900286, + "grad_norm": 5.820954322814941, + "learning_rate": 6.99145288966725e-05, + "loss": 0.9987, + "num_input_tokens_seen": 69164440, + "step": 4299 + }, + { + "epoch": 0.3012074566357579, + "grad_norm": 6.690483570098877, + "learning_rate": 6.9907530647986e-05, + "loss": 1.1583, + "num_input_tokens_seen": 69180824, + "step": 4300 + }, + { + "epoch": 0.30127750488148713, + "grad_norm": 3.8018131256103516, + "learning_rate": 6.990053239929947e-05, + "loss": 1.1643, + "num_input_tokens_seen": 69197016, + "step": 4301 + }, + { + "epoch": 0.3013475531272164, + "grad_norm": 4.574918746948242, + "learning_rate": 6.989353415061296e-05, + "loss": 1.168, + "num_input_tokens_seen": 69213400, + "step": 4302 + }, + { + "epoch": 0.3014176013729456, + "grad_norm": 3.3843026161193848, + "learning_rate": 6.988653590192646e-05, + "loss": 0.9762, + "num_input_tokens_seen": 69229784, + "step": 4303 + }, + { + "epoch": 0.30148764961867486, + "grad_norm": 6.179981708526611, + "learning_rate": 6.987953765323994e-05, + "loss": 1.173, + "num_input_tokens_seen": 69246168, + "step": 4304 + }, + { + "epoch": 0.3015576978644041, + "grad_norm": 4.759994029998779, + "learning_rate": 6.987253940455341e-05, + "loss": 0.947, + "num_input_tokens_seen": 69262512, + "step": 4305 + }, + { + "epoch": 0.3016277461101334, + "grad_norm": 3.719902992248535, + "learning_rate": 6.986554115586691e-05, + "loss": 1.1882, + "num_input_tokens_seen": 69278496, + "step": 4306 + }, + { + "epoch": 0.3016977943558626, + "grad_norm": 3.6757240295410156, + "learning_rate": 6.985854290718039e-05, + "loss": 1.1506, + "num_input_tokens_seen": 69294880, + "step": 4307 + }, + { + "epoch": 0.30176784260159184, + "grad_norm": 4.316056251525879, + "learning_rate": 6.985154465849386e-05, + "loss": 0.8921, + "num_input_tokens_seen": 69311264, + "step": 4308 + }, + { + "epoch": 0.3018378908473211, + "grad_norm": 5.248560428619385, + "learning_rate": 6.984454640980736e-05, + "loss": 1.1127, + "num_input_tokens_seen": 69327648, + "step": 4309 + }, + { + "epoch": 0.30190793909305036, + "grad_norm": 3.601381540298462, + "learning_rate": 6.983754816112085e-05, + "loss": 1.0002, + "num_input_tokens_seen": 69344032, + "step": 4310 + }, + { + "epoch": 0.30197798733877956, + "grad_norm": 4.555902004241943, + "learning_rate": 6.983054991243433e-05, + "loss": 1.0674, + "num_input_tokens_seen": 69360416, + "step": 4311 + }, + { + "epoch": 0.3020480355845088, + "grad_norm": 4.615258693695068, + "learning_rate": 6.982355166374782e-05, + "loss": 1.1759, + "num_input_tokens_seen": 69375728, + "step": 4312 + }, + { + "epoch": 0.3021180838302381, + "grad_norm": 5.953250408172607, + "learning_rate": 6.98165534150613e-05, + "loss": 1.1161, + "num_input_tokens_seen": 69391768, + "step": 4313 + }, + { + "epoch": 0.30218813207596734, + "grad_norm": 4.049426555633545, + "learning_rate": 6.980955516637478e-05, + "loss": 1.0466, + "num_input_tokens_seen": 69407328, + "step": 4314 + }, + { + "epoch": 0.30225818032169655, + "grad_norm": 4.012260437011719, + "learning_rate": 6.980255691768827e-05, + "loss": 1.338, + "num_input_tokens_seen": 69423712, + "step": 4315 + }, + { + "epoch": 0.3023282285674258, + "grad_norm": 3.8932242393493652, + "learning_rate": 6.979555866900176e-05, + "loss": 1.0836, + "num_input_tokens_seen": 69440096, + "step": 4316 + }, + { + "epoch": 0.30239827681315506, + "grad_norm": 7.58411169052124, + "learning_rate": 6.978856042031525e-05, + "loss": 1.0849, + "num_input_tokens_seen": 69456088, + "step": 4317 + }, + { + "epoch": 0.3024683250588843, + "grad_norm": 5.275664806365967, + "learning_rate": 6.978156217162872e-05, + "loss": 0.9773, + "num_input_tokens_seen": 69471768, + "step": 4318 + }, + { + "epoch": 0.30253837330461353, + "grad_norm": 3.6384737491607666, + "learning_rate": 6.977456392294221e-05, + "loss": 1.1168, + "num_input_tokens_seen": 69488152, + "step": 4319 + }, + { + "epoch": 0.3026084215503428, + "grad_norm": 5.059805870056152, + "learning_rate": 6.97675656742557e-05, + "loss": 1.1221, + "num_input_tokens_seen": 69504536, + "step": 4320 + }, + { + "epoch": 0.30267846979607205, + "grad_norm": 5.672605037689209, + "learning_rate": 6.976056742556917e-05, + "loss": 0.8506, + "num_input_tokens_seen": 69520920, + "step": 4321 + }, + { + "epoch": 0.3027485180418013, + "grad_norm": 3.5066421031951904, + "learning_rate": 6.975356917688266e-05, + "loss": 1.1437, + "num_input_tokens_seen": 69537304, + "step": 4322 + }, + { + "epoch": 0.3028185662875305, + "grad_norm": 4.403011798858643, + "learning_rate": 6.974657092819616e-05, + "loss": 1.0946, + "num_input_tokens_seen": 69553688, + "step": 4323 + }, + { + "epoch": 0.30288861453325977, + "grad_norm": 3.87226939201355, + "learning_rate": 6.973957267950964e-05, + "loss": 0.9997, + "num_input_tokens_seen": 69570072, + "step": 4324 + }, + { + "epoch": 0.30295866277898903, + "grad_norm": 4.516434192657471, + "learning_rate": 6.973257443082311e-05, + "loss": 1.0816, + "num_input_tokens_seen": 69585056, + "step": 4325 + }, + { + "epoch": 0.3030287110247183, + "grad_norm": 4.07093620300293, + "learning_rate": 6.97255761821366e-05, + "loss": 1.1811, + "num_input_tokens_seen": 69601440, + "step": 4326 + }, + { + "epoch": 0.3030987592704475, + "grad_norm": 3.663632392883301, + "learning_rate": 6.971857793345009e-05, + "loss": 1.0103, + "num_input_tokens_seen": 69617824, + "step": 4327 + }, + { + "epoch": 0.30316880751617675, + "grad_norm": 3.791191577911377, + "learning_rate": 6.971157968476357e-05, + "loss": 1.1402, + "num_input_tokens_seen": 69634208, + "step": 4328 + }, + { + "epoch": 0.303238855761906, + "grad_norm": 4.766335964202881, + "learning_rate": 6.970458143607707e-05, + "loss": 1.057, + "num_input_tokens_seen": 69650592, + "step": 4329 + }, + { + "epoch": 0.30330890400763527, + "grad_norm": 3.6603240966796875, + "learning_rate": 6.969758318739056e-05, + "loss": 1.0052, + "num_input_tokens_seen": 69666976, + "step": 4330 + }, + { + "epoch": 0.30337895225336453, + "grad_norm": 4.231273174285889, + "learning_rate": 6.969058493870403e-05, + "loss": 1.1661, + "num_input_tokens_seen": 69683360, + "step": 4331 + }, + { + "epoch": 0.30344900049909374, + "grad_norm": 3.7526698112487793, + "learning_rate": 6.968358669001751e-05, + "loss": 1.0783, + "num_input_tokens_seen": 69699744, + "step": 4332 + }, + { + "epoch": 0.303519048744823, + "grad_norm": 3.8541617393493652, + "learning_rate": 6.967658844133101e-05, + "loss": 1.0485, + "num_input_tokens_seen": 69715960, + "step": 4333 + }, + { + "epoch": 0.30358909699055225, + "grad_norm": 3.914926767349243, + "learning_rate": 6.966959019264448e-05, + "loss": 0.9525, + "num_input_tokens_seen": 69732344, + "step": 4334 + }, + { + "epoch": 0.3036591452362815, + "grad_norm": 4.39329719543457, + "learning_rate": 6.966259194395797e-05, + "loss": 1.0234, + "num_input_tokens_seen": 69748728, + "step": 4335 + }, + { + "epoch": 0.3037291934820107, + "grad_norm": 3.914006233215332, + "learning_rate": 6.965559369527146e-05, + "loss": 1.1397, + "num_input_tokens_seen": 69765112, + "step": 4336 + }, + { + "epoch": 0.30379924172774, + "grad_norm": 4.536770343780518, + "learning_rate": 6.964859544658495e-05, + "loss": 1.1825, + "num_input_tokens_seen": 69781496, + "step": 4337 + }, + { + "epoch": 0.30386928997346924, + "grad_norm": 4.147655010223389, + "learning_rate": 6.964159719789843e-05, + "loss": 0.7535, + "num_input_tokens_seen": 69797880, + "step": 4338 + }, + { + "epoch": 0.3039393382191985, + "grad_norm": 4.224967956542969, + "learning_rate": 6.963459894921191e-05, + "loss": 1.1842, + "num_input_tokens_seen": 69814264, + "step": 4339 + }, + { + "epoch": 0.3040093864649277, + "grad_norm": 4.415369033813477, + "learning_rate": 6.96276007005254e-05, + "loss": 1.1396, + "num_input_tokens_seen": 69830648, + "step": 4340 + }, + { + "epoch": 0.30407943471065696, + "grad_norm": 3.5865182876586914, + "learning_rate": 6.962060245183888e-05, + "loss": 1.1649, + "num_input_tokens_seen": 69846608, + "step": 4341 + }, + { + "epoch": 0.3041494829563862, + "grad_norm": 6.16670560836792, + "learning_rate": 6.961360420315237e-05, + "loss": 1.0684, + "num_input_tokens_seen": 69862232, + "step": 4342 + }, + { + "epoch": 0.3042195312021155, + "grad_norm": 7.907288074493408, + "learning_rate": 6.960660595446586e-05, + "loss": 0.9661, + "num_input_tokens_seen": 69878616, + "step": 4343 + }, + { + "epoch": 0.3042895794478447, + "grad_norm": 3.7910618782043457, + "learning_rate": 6.959960770577934e-05, + "loss": 1.0852, + "num_input_tokens_seen": 69895000, + "step": 4344 + }, + { + "epoch": 0.30435962769357394, + "grad_norm": 3.4832661151885986, + "learning_rate": 6.959260945709282e-05, + "loss": 1.0312, + "num_input_tokens_seen": 69911384, + "step": 4345 + }, + { + "epoch": 0.3044296759393032, + "grad_norm": 3.563248872756958, + "learning_rate": 6.958561120840631e-05, + "loss": 1.1249, + "num_input_tokens_seen": 69927768, + "step": 4346 + }, + { + "epoch": 0.30449972418503246, + "grad_norm": 4.838014602661133, + "learning_rate": 6.95786129597198e-05, + "loss": 1.2449, + "num_input_tokens_seen": 69944152, + "step": 4347 + }, + { + "epoch": 0.30456977243076166, + "grad_norm": 3.6796975135803223, + "learning_rate": 6.957161471103327e-05, + "loss": 0.8156, + "num_input_tokens_seen": 69959968, + "step": 4348 + }, + { + "epoch": 0.3046398206764909, + "grad_norm": 4.028040885925293, + "learning_rate": 6.956461646234677e-05, + "loss": 1.0653, + "num_input_tokens_seen": 69975960, + "step": 4349 + }, + { + "epoch": 0.3047098689222202, + "grad_norm": 4.073189735412598, + "learning_rate": 6.955761821366026e-05, + "loss": 1.0004, + "num_input_tokens_seen": 69991656, + "step": 4350 + }, + { + "epoch": 0.30477991716794944, + "grad_norm": 5.757152080535889, + "learning_rate": 6.955061996497374e-05, + "loss": 1.2074, + "num_input_tokens_seen": 70008040, + "step": 4351 + }, + { + "epoch": 0.30484996541367865, + "grad_norm": 5.49181604385376, + "learning_rate": 6.954362171628721e-05, + "loss": 1.0235, + "num_input_tokens_seen": 70024424, + "step": 4352 + }, + { + "epoch": 0.3049200136594079, + "grad_norm": 5.573401927947998, + "learning_rate": 6.95366234676007e-05, + "loss": 0.9787, + "num_input_tokens_seen": 70040808, + "step": 4353 + }, + { + "epoch": 0.30499006190513717, + "grad_norm": 3.491823673248291, + "learning_rate": 6.952962521891419e-05, + "loss": 1.0254, + "num_input_tokens_seen": 70057192, + "step": 4354 + }, + { + "epoch": 0.3050601101508664, + "grad_norm": 6.05043888092041, + "learning_rate": 6.952262697022768e-05, + "loss": 1.0709, + "num_input_tokens_seen": 70073576, + "step": 4355 + }, + { + "epoch": 0.30513015839659563, + "grad_norm": 3.848910331726074, + "learning_rate": 6.951562872154117e-05, + "loss": 1.0267, + "num_input_tokens_seen": 70089960, + "step": 4356 + }, + { + "epoch": 0.3052002066423249, + "grad_norm": 4.134339332580566, + "learning_rate": 6.950863047285465e-05, + "loss": 1.2447, + "num_input_tokens_seen": 70106344, + "step": 4357 + }, + { + "epoch": 0.30527025488805415, + "grad_norm": 3.6560862064361572, + "learning_rate": 6.950163222416813e-05, + "loss": 1.1018, + "num_input_tokens_seen": 70122056, + "step": 4358 + }, + { + "epoch": 0.3053403031337834, + "grad_norm": 3.813434600830078, + "learning_rate": 6.94946339754816e-05, + "loss": 1.0149, + "num_input_tokens_seen": 70138408, + "step": 4359 + }, + { + "epoch": 0.3054103513795126, + "grad_norm": 5.002225875854492, + "learning_rate": 6.948763572679511e-05, + "loss": 1.1563, + "num_input_tokens_seen": 70154792, + "step": 4360 + }, + { + "epoch": 0.30548039962524187, + "grad_norm": 3.8483340740203857, + "learning_rate": 6.948063747810858e-05, + "loss": 0.9643, + "num_input_tokens_seen": 70171176, + "step": 4361 + }, + { + "epoch": 0.30555044787097113, + "grad_norm": 5.18534517288208, + "learning_rate": 6.947363922942207e-05, + "loss": 1.1841, + "num_input_tokens_seen": 70187336, + "step": 4362 + }, + { + "epoch": 0.3056204961167004, + "grad_norm": 3.92976713180542, + "learning_rate": 6.946664098073556e-05, + "loss": 1.0051, + "num_input_tokens_seen": 70203720, + "step": 4363 + }, + { + "epoch": 0.30569054436242965, + "grad_norm": 3.4534151554107666, + "learning_rate": 6.945964273204905e-05, + "loss": 0.9356, + "num_input_tokens_seen": 70220104, + "step": 4364 + }, + { + "epoch": 0.30576059260815885, + "grad_norm": 3.7937867641448975, + "learning_rate": 6.945264448336252e-05, + "loss": 1.1694, + "num_input_tokens_seen": 70236488, + "step": 4365 + }, + { + "epoch": 0.3058306408538881, + "grad_norm": 3.9063713550567627, + "learning_rate": 6.944564623467601e-05, + "loss": 1.0969, + "num_input_tokens_seen": 70252872, + "step": 4366 + }, + { + "epoch": 0.3059006890996174, + "grad_norm": 3.9363296031951904, + "learning_rate": 6.94386479859895e-05, + "loss": 0.9776, + "num_input_tokens_seen": 70269256, + "step": 4367 + }, + { + "epoch": 0.30597073734534663, + "grad_norm": 4.722838401794434, + "learning_rate": 6.943164973730297e-05, + "loss": 0.9503, + "num_input_tokens_seen": 70285640, + "step": 4368 + }, + { + "epoch": 0.30604078559107584, + "grad_norm": 4.053229808807373, + "learning_rate": 6.942465148861646e-05, + "loss": 1.2669, + "num_input_tokens_seen": 70301688, + "step": 4369 + }, + { + "epoch": 0.3061108338368051, + "grad_norm": 3.71604323387146, + "learning_rate": 6.941765323992995e-05, + "loss": 1.0619, + "num_input_tokens_seen": 70318072, + "step": 4370 + }, + { + "epoch": 0.30618088208253436, + "grad_norm": 3.8376901149749756, + "learning_rate": 6.941065499124344e-05, + "loss": 1.0007, + "num_input_tokens_seen": 70334456, + "step": 4371 + }, + { + "epoch": 0.3062509303282636, + "grad_norm": 4.157979488372803, + "learning_rate": 6.940365674255692e-05, + "loss": 1.2379, + "num_input_tokens_seen": 70350424, + "step": 4372 + }, + { + "epoch": 0.3063209785739928, + "grad_norm": 4.173924922943115, + "learning_rate": 6.93966584938704e-05, + "loss": 1.1111, + "num_input_tokens_seen": 70366808, + "step": 4373 + }, + { + "epoch": 0.3063910268197221, + "grad_norm": 4.114030838012695, + "learning_rate": 6.938966024518389e-05, + "loss": 1.0932, + "num_input_tokens_seen": 70383000, + "step": 4374 + }, + { + "epoch": 0.30646107506545134, + "grad_norm": 4.31168794631958, + "learning_rate": 6.938266199649738e-05, + "loss": 1.243, + "num_input_tokens_seen": 70399016, + "step": 4375 + }, + { + "epoch": 0.3065311233111806, + "grad_norm": 6.187852382659912, + "learning_rate": 6.937566374781087e-05, + "loss": 0.9274, + "num_input_tokens_seen": 70413016, + "step": 4376 + }, + { + "epoch": 0.3066011715569098, + "grad_norm": 4.700244903564453, + "learning_rate": 6.936866549912436e-05, + "loss": 1.0809, + "num_input_tokens_seen": 70427552, + "step": 4377 + }, + { + "epoch": 0.30667121980263906, + "grad_norm": 4.941024303436279, + "learning_rate": 6.936166725043783e-05, + "loss": 1.1653, + "num_input_tokens_seen": 70443936, + "step": 4378 + }, + { + "epoch": 0.3067412680483683, + "grad_norm": 3.8171792030334473, + "learning_rate": 6.935466900175131e-05, + "loss": 1.1128, + "num_input_tokens_seen": 70460320, + "step": 4379 + }, + { + "epoch": 0.3068113162940976, + "grad_norm": 5.006760597229004, + "learning_rate": 6.93476707530648e-05, + "loss": 1.1674, + "num_input_tokens_seen": 70476704, + "step": 4380 + }, + { + "epoch": 0.3068813645398268, + "grad_norm": 3.8567628860473633, + "learning_rate": 6.934067250437829e-05, + "loss": 1.1478, + "num_input_tokens_seen": 70493016, + "step": 4381 + }, + { + "epoch": 0.30695141278555604, + "grad_norm": 3.7168126106262207, + "learning_rate": 6.933367425569177e-05, + "loss": 0.9496, + "num_input_tokens_seen": 70509400, + "step": 4382 + }, + { + "epoch": 0.3070214610312853, + "grad_norm": 4.72265625, + "learning_rate": 6.932667600700526e-05, + "loss": 1.0319, + "num_input_tokens_seen": 70525592, + "step": 4383 + }, + { + "epoch": 0.30709150927701456, + "grad_norm": 4.502997875213623, + "learning_rate": 6.931967775831875e-05, + "loss": 1.0556, + "num_input_tokens_seen": 70541976, + "step": 4384 + }, + { + "epoch": 0.30716155752274377, + "grad_norm": 4.090621471405029, + "learning_rate": 6.931267950963223e-05, + "loss": 1.1441, + "num_input_tokens_seen": 70558360, + "step": 4385 + }, + { + "epoch": 0.307231605768473, + "grad_norm": 3.501185655593872, + "learning_rate": 6.93056812609457e-05, + "loss": 0.9005, + "num_input_tokens_seen": 70574640, + "step": 4386 + }, + { + "epoch": 0.3073016540142023, + "grad_norm": 3.937352180480957, + "learning_rate": 6.92986830122592e-05, + "loss": 1.1003, + "num_input_tokens_seen": 70591024, + "step": 4387 + }, + { + "epoch": 0.30737170225993155, + "grad_norm": 8.832700729370117, + "learning_rate": 6.929168476357268e-05, + "loss": 0.9773, + "num_input_tokens_seen": 70606720, + "step": 4388 + }, + { + "epoch": 0.30744175050566075, + "grad_norm": 3.8081719875335693, + "learning_rate": 6.928468651488617e-05, + "loss": 0.8348, + "num_input_tokens_seen": 70622864, + "step": 4389 + }, + { + "epoch": 0.30751179875139, + "grad_norm": 3.836366653442383, + "learning_rate": 6.927768826619966e-05, + "loss": 1.0858, + "num_input_tokens_seen": 70639248, + "step": 4390 + }, + { + "epoch": 0.30758184699711927, + "grad_norm": 5.150767803192139, + "learning_rate": 6.927069001751314e-05, + "loss": 1.0395, + "num_input_tokens_seen": 70655128, + "step": 4391 + }, + { + "epoch": 0.3076518952428485, + "grad_norm": 5.0762434005737305, + "learning_rate": 6.926369176882662e-05, + "loss": 0.9503, + "num_input_tokens_seen": 70671512, + "step": 4392 + }, + { + "epoch": 0.30772194348857773, + "grad_norm": 3.7713098526000977, + "learning_rate": 6.925669352014011e-05, + "loss": 0.9273, + "num_input_tokens_seen": 70687896, + "step": 4393 + }, + { + "epoch": 0.307791991734307, + "grad_norm": 5.246247291564941, + "learning_rate": 6.92496952714536e-05, + "loss": 1.1295, + "num_input_tokens_seen": 70704280, + "step": 4394 + }, + { + "epoch": 0.30786203998003625, + "grad_norm": 3.5723984241485596, + "learning_rate": 6.924269702276707e-05, + "loss": 0.9063, + "num_input_tokens_seen": 70720664, + "step": 4395 + }, + { + "epoch": 0.3079320882257655, + "grad_norm": 3.5165982246398926, + "learning_rate": 6.923569877408056e-05, + "loss": 0.963, + "num_input_tokens_seen": 70736968, + "step": 4396 + }, + { + "epoch": 0.30800213647149477, + "grad_norm": 4.140204429626465, + "learning_rate": 6.922870052539405e-05, + "loss": 0.9557, + "num_input_tokens_seen": 70753352, + "step": 4397 + }, + { + "epoch": 0.308072184717224, + "grad_norm": 7.949122428894043, + "learning_rate": 6.922170227670754e-05, + "loss": 0.9429, + "num_input_tokens_seen": 70769720, + "step": 4398 + }, + { + "epoch": 0.30814223296295323, + "grad_norm": 6.45367431640625, + "learning_rate": 6.921470402802101e-05, + "loss": 1.2214, + "num_input_tokens_seen": 70784984, + "step": 4399 + }, + { + "epoch": 0.3082122812086825, + "grad_norm": 4.139477252960205, + "learning_rate": 6.92077057793345e-05, + "loss": 1.2376, + "num_input_tokens_seen": 70800376, + "step": 4400 + }, + { + "epoch": 0.3082122812086825, + "eval_loss": 1.1308872699737549, + "eval_runtime": 0.2076, + "eval_samples_per_second": 4.818, + "eval_steps_per_second": 4.818, + "num_input_tokens_seen": 70800376, + "step": 4400 + }, + { + "epoch": 0.30828232945441175, + "grad_norm": 4.095129013061523, + "learning_rate": 6.920070753064799e-05, + "loss": 1.1774, + "num_input_tokens_seen": 70816560, + "step": 4401 + }, + { + "epoch": 0.30835237770014096, + "grad_norm": 3.6730854511260986, + "learning_rate": 6.919370928196148e-05, + "loss": 1.0242, + "num_input_tokens_seen": 70831848, + "step": 4402 + }, + { + "epoch": 0.3084224259458702, + "grad_norm": 4.013517379760742, + "learning_rate": 6.918671103327497e-05, + "loss": 0.9785, + "num_input_tokens_seen": 70847408, + "step": 4403 + }, + { + "epoch": 0.3084924741915995, + "grad_norm": 5.617120742797852, + "learning_rate": 6.917971278458846e-05, + "loss": 0.9883, + "num_input_tokens_seen": 70862080, + "step": 4404 + }, + { + "epoch": 0.30856252243732873, + "grad_norm": 3.5201385021209717, + "learning_rate": 6.917271453590193e-05, + "loss": 0.9537, + "num_input_tokens_seen": 70878464, + "step": 4405 + }, + { + "epoch": 0.30863257068305794, + "grad_norm": 5.116230010986328, + "learning_rate": 6.91657162872154e-05, + "loss": 1.0934, + "num_input_tokens_seen": 70894848, + "step": 4406 + }, + { + "epoch": 0.3087026189287872, + "grad_norm": 3.4510743618011475, + "learning_rate": 6.91587180385289e-05, + "loss": 1.0857, + "num_input_tokens_seen": 70911232, + "step": 4407 + }, + { + "epoch": 0.30877266717451646, + "grad_norm": 4.719654083251953, + "learning_rate": 6.915171978984238e-05, + "loss": 1.1565, + "num_input_tokens_seen": 70927616, + "step": 4408 + }, + { + "epoch": 0.3088427154202457, + "grad_norm": 4.52898645401001, + "learning_rate": 6.914472154115587e-05, + "loss": 0.9418, + "num_input_tokens_seen": 70944000, + "step": 4409 + }, + { + "epoch": 0.3089127636659749, + "grad_norm": 4.237354755401611, + "learning_rate": 6.913772329246936e-05, + "loss": 1.1614, + "num_input_tokens_seen": 70960384, + "step": 4410 + }, + { + "epoch": 0.3089828119117042, + "grad_norm": 5.489138126373291, + "learning_rate": 6.913072504378285e-05, + "loss": 0.9871, + "num_input_tokens_seen": 70976768, + "step": 4411 + }, + { + "epoch": 0.30905286015743344, + "grad_norm": 5.482370853424072, + "learning_rate": 6.912372679509632e-05, + "loss": 0.9962, + "num_input_tokens_seen": 70992496, + "step": 4412 + }, + { + "epoch": 0.3091229084031627, + "grad_norm": 3.8174126148223877, + "learning_rate": 6.91167285464098e-05, + "loss": 1.0605, + "num_input_tokens_seen": 71008880, + "step": 4413 + }, + { + "epoch": 0.3091929566488919, + "grad_norm": 4.064924716949463, + "learning_rate": 6.91097302977233e-05, + "loss": 0.8307, + "num_input_tokens_seen": 71023912, + "step": 4414 + }, + { + "epoch": 0.30926300489462116, + "grad_norm": 3.955643653869629, + "learning_rate": 6.910273204903678e-05, + "loss": 1.2599, + "num_input_tokens_seen": 71040296, + "step": 4415 + }, + { + "epoch": 0.3093330531403504, + "grad_norm": 3.771191358566284, + "learning_rate": 6.909573380035026e-05, + "loss": 1.0682, + "num_input_tokens_seen": 71056680, + "step": 4416 + }, + { + "epoch": 0.3094031013860797, + "grad_norm": 5.4105963706970215, + "learning_rate": 6.908873555166375e-05, + "loss": 1.1571, + "num_input_tokens_seen": 71072640, + "step": 4417 + }, + { + "epoch": 0.3094731496318089, + "grad_norm": 4.549078464508057, + "learning_rate": 6.908173730297724e-05, + "loss": 1.0837, + "num_input_tokens_seen": 71087336, + "step": 4418 + }, + { + "epoch": 0.30954319787753815, + "grad_norm": 3.998065233230591, + "learning_rate": 6.907473905429072e-05, + "loss": 1.2753, + "num_input_tokens_seen": 71102952, + "step": 4419 + }, + { + "epoch": 0.3096132461232674, + "grad_norm": 3.834508180618286, + "learning_rate": 6.90677408056042e-05, + "loss": 0.8886, + "num_input_tokens_seen": 71119328, + "step": 4420 + }, + { + "epoch": 0.30968329436899666, + "grad_norm": 3.932875156402588, + "learning_rate": 6.90607425569177e-05, + "loss": 1.1568, + "num_input_tokens_seen": 71134968, + "step": 4421 + }, + { + "epoch": 0.30975334261472587, + "grad_norm": 3.712484359741211, + "learning_rate": 6.905374430823118e-05, + "loss": 1.0686, + "num_input_tokens_seen": 71150976, + "step": 4422 + }, + { + "epoch": 0.30982339086045513, + "grad_norm": 3.6733663082122803, + "learning_rate": 6.904674605954466e-05, + "loss": 0.8884, + "num_input_tokens_seen": 71167232, + "step": 4423 + }, + { + "epoch": 0.3098934391061844, + "grad_norm": 3.9877066612243652, + "learning_rate": 6.903974781085815e-05, + "loss": 1.0473, + "num_input_tokens_seen": 71182704, + "step": 4424 + }, + { + "epoch": 0.30996348735191365, + "grad_norm": 3.908582925796509, + "learning_rate": 6.903274956217163e-05, + "loss": 0.9944, + "num_input_tokens_seen": 71198920, + "step": 4425 + }, + { + "epoch": 0.31003353559764285, + "grad_norm": 4.310460090637207, + "learning_rate": 6.902575131348511e-05, + "loss": 0.9651, + "num_input_tokens_seen": 71215256, + "step": 4426 + }, + { + "epoch": 0.3101035838433721, + "grad_norm": 3.8914272785186768, + "learning_rate": 6.90187530647986e-05, + "loss": 0.9858, + "num_input_tokens_seen": 71231432, + "step": 4427 + }, + { + "epoch": 0.31017363208910137, + "grad_norm": 5.774794578552246, + "learning_rate": 6.901175481611209e-05, + "loss": 0.9792, + "num_input_tokens_seen": 71246944, + "step": 4428 + }, + { + "epoch": 0.31024368033483063, + "grad_norm": 6.370543956756592, + "learning_rate": 6.900475656742558e-05, + "loss": 1.0283, + "num_input_tokens_seen": 71263120, + "step": 4429 + }, + { + "epoch": 0.31031372858055983, + "grad_norm": 3.8334455490112305, + "learning_rate": 6.899775831873906e-05, + "loss": 1.0173, + "num_input_tokens_seen": 71279040, + "step": 4430 + }, + { + "epoch": 0.3103837768262891, + "grad_norm": 3.624006509780884, + "learning_rate": 6.899076007005255e-05, + "loss": 0.9908, + "num_input_tokens_seen": 71295424, + "step": 4431 + }, + { + "epoch": 0.31045382507201835, + "grad_norm": 3.8340702056884766, + "learning_rate": 6.898376182136603e-05, + "loss": 1.1257, + "num_input_tokens_seen": 71311808, + "step": 4432 + }, + { + "epoch": 0.3105238733177476, + "grad_norm": 4.4179277420043945, + "learning_rate": 6.89767635726795e-05, + "loss": 1.0439, + "num_input_tokens_seen": 71327560, + "step": 4433 + }, + { + "epoch": 0.31059392156347687, + "grad_norm": 5.758373260498047, + "learning_rate": 6.896976532399299e-05, + "loss": 0.9307, + "num_input_tokens_seen": 71342848, + "step": 4434 + }, + { + "epoch": 0.3106639698092061, + "grad_norm": 3.7063519954681396, + "learning_rate": 6.896276707530648e-05, + "loss": 1.1769, + "num_input_tokens_seen": 71359232, + "step": 4435 + }, + { + "epoch": 0.31073401805493533, + "grad_norm": 4.19386625289917, + "learning_rate": 6.895576882661997e-05, + "loss": 1.1185, + "num_input_tokens_seen": 71375616, + "step": 4436 + }, + { + "epoch": 0.3108040663006646, + "grad_norm": 4.116868019104004, + "learning_rate": 6.894877057793346e-05, + "loss": 0.9571, + "num_input_tokens_seen": 71392000, + "step": 4437 + }, + { + "epoch": 0.31087411454639385, + "grad_norm": 4.810275077819824, + "learning_rate": 6.894177232924695e-05, + "loss": 1.113, + "num_input_tokens_seen": 71407448, + "step": 4438 + }, + { + "epoch": 0.31094416279212306, + "grad_norm": 4.026486873626709, + "learning_rate": 6.893477408056042e-05, + "loss": 0.9949, + "num_input_tokens_seen": 71423832, + "step": 4439 + }, + { + "epoch": 0.3110142110378523, + "grad_norm": 4.268560886383057, + "learning_rate": 6.89277758318739e-05, + "loss": 1.0746, + "num_input_tokens_seen": 71440216, + "step": 4440 + }, + { + "epoch": 0.3110842592835816, + "grad_norm": 3.3299612998962402, + "learning_rate": 6.89207775831874e-05, + "loss": 0.7408, + "num_input_tokens_seen": 71456160, + "step": 4441 + }, + { + "epoch": 0.31115430752931084, + "grad_norm": 3.678912401199341, + "learning_rate": 6.891377933450089e-05, + "loss": 0.9508, + "num_input_tokens_seen": 71472344, + "step": 4442 + }, + { + "epoch": 0.31122435577504004, + "grad_norm": 3.3206088542938232, + "learning_rate": 6.890678108581436e-05, + "loss": 0.9531, + "num_input_tokens_seen": 71488728, + "step": 4443 + }, + { + "epoch": 0.3112944040207693, + "grad_norm": 3.6073081493377686, + "learning_rate": 6.889978283712785e-05, + "loss": 1.18, + "num_input_tokens_seen": 71505112, + "step": 4444 + }, + { + "epoch": 0.31136445226649856, + "grad_norm": 4.998234748840332, + "learning_rate": 6.889278458844134e-05, + "loss": 1.0307, + "num_input_tokens_seen": 71521496, + "step": 4445 + }, + { + "epoch": 0.3114345005122278, + "grad_norm": 3.7966136932373047, + "learning_rate": 6.888578633975481e-05, + "loss": 1.05, + "num_input_tokens_seen": 71537880, + "step": 4446 + }, + { + "epoch": 0.311504548757957, + "grad_norm": 3.7041022777557373, + "learning_rate": 6.88787880910683e-05, + "loss": 1.0701, + "num_input_tokens_seen": 71554136, + "step": 4447 + }, + { + "epoch": 0.3115745970036863, + "grad_norm": 4.155350208282471, + "learning_rate": 6.887178984238179e-05, + "loss": 1.0144, + "num_input_tokens_seen": 71570520, + "step": 4448 + }, + { + "epoch": 0.31164464524941554, + "grad_norm": 3.608290195465088, + "learning_rate": 6.886479159369528e-05, + "loss": 1.0224, + "num_input_tokens_seen": 71586904, + "step": 4449 + }, + { + "epoch": 0.3117146934951448, + "grad_norm": 5.258309841156006, + "learning_rate": 6.885779334500875e-05, + "loss": 1.0929, + "num_input_tokens_seen": 71602280, + "step": 4450 + }, + { + "epoch": 0.311784741740874, + "grad_norm": 4.176782608032227, + "learning_rate": 6.885079509632224e-05, + "loss": 1.0301, + "num_input_tokens_seen": 71618248, + "step": 4451 + }, + { + "epoch": 0.31185478998660326, + "grad_norm": 3.219015121459961, + "learning_rate": 6.884379684763573e-05, + "loss": 0.8699, + "num_input_tokens_seen": 71634632, + "step": 4452 + }, + { + "epoch": 0.3119248382323325, + "grad_norm": 3.485370397567749, + "learning_rate": 6.883679859894921e-05, + "loss": 0.947, + "num_input_tokens_seen": 71651016, + "step": 4453 + }, + { + "epoch": 0.3119948864780618, + "grad_norm": 4.25452184677124, + "learning_rate": 6.88298003502627e-05, + "loss": 1.1649, + "num_input_tokens_seen": 71667400, + "step": 4454 + }, + { + "epoch": 0.312064934723791, + "grad_norm": 4.2082133293151855, + "learning_rate": 6.882280210157618e-05, + "loss": 1.0457, + "num_input_tokens_seen": 71682656, + "step": 4455 + }, + { + "epoch": 0.31213498296952025, + "grad_norm": 3.366642475128174, + "learning_rate": 6.881580385288967e-05, + "loss": 0.8594, + "num_input_tokens_seen": 71699040, + "step": 4456 + }, + { + "epoch": 0.3122050312152495, + "grad_norm": 3.795114278793335, + "learning_rate": 6.880880560420316e-05, + "loss": 1.0749, + "num_input_tokens_seen": 71715424, + "step": 4457 + }, + { + "epoch": 0.31227507946097877, + "grad_norm": 7.3179121017456055, + "learning_rate": 6.880180735551665e-05, + "loss": 1.2978, + "num_input_tokens_seen": 71731808, + "step": 4458 + }, + { + "epoch": 0.31234512770670797, + "grad_norm": 5.0151848793029785, + "learning_rate": 6.879480910683012e-05, + "loss": 1.1359, + "num_input_tokens_seen": 71746688, + "step": 4459 + }, + { + "epoch": 0.31241517595243723, + "grad_norm": 4.136596202850342, + "learning_rate": 6.87878108581436e-05, + "loss": 1.141, + "num_input_tokens_seen": 71763016, + "step": 4460 + }, + { + "epoch": 0.3124852241981665, + "grad_norm": 3.6476573944091797, + "learning_rate": 6.878081260945709e-05, + "loss": 0.9967, + "num_input_tokens_seen": 71779400, + "step": 4461 + }, + { + "epoch": 0.31255527244389575, + "grad_norm": 4.907565593719482, + "learning_rate": 6.877381436077059e-05, + "loss": 1.0391, + "num_input_tokens_seen": 71795784, + "step": 4462 + }, + { + "epoch": 0.31262532068962495, + "grad_norm": 3.82183575630188, + "learning_rate": 6.876681611208407e-05, + "loss": 1.2237, + "num_input_tokens_seen": 71812168, + "step": 4463 + }, + { + "epoch": 0.3126953689353542, + "grad_norm": 4.63422966003418, + "learning_rate": 6.875981786339755e-05, + "loss": 1.1271, + "num_input_tokens_seen": 71828296, + "step": 4464 + }, + { + "epoch": 0.31276541718108347, + "grad_norm": 4.02967643737793, + "learning_rate": 6.875281961471104e-05, + "loss": 1.105, + "num_input_tokens_seen": 71844096, + "step": 4465 + }, + { + "epoch": 0.31283546542681273, + "grad_norm": 3.477452516555786, + "learning_rate": 6.874582136602452e-05, + "loss": 1.0503, + "num_input_tokens_seen": 71860480, + "step": 4466 + }, + { + "epoch": 0.312905513672542, + "grad_norm": 4.4327168464660645, + "learning_rate": 6.873882311733799e-05, + "loss": 1.2305, + "num_input_tokens_seen": 71876208, + "step": 4467 + }, + { + "epoch": 0.3129755619182712, + "grad_norm": 3.8214218616485596, + "learning_rate": 6.87318248686515e-05, + "loss": 1.0271, + "num_input_tokens_seen": 71892592, + "step": 4468 + }, + { + "epoch": 0.31304561016400045, + "grad_norm": 3.4210402965545654, + "learning_rate": 6.872482661996498e-05, + "loss": 0.8505, + "num_input_tokens_seen": 71908976, + "step": 4469 + }, + { + "epoch": 0.3131156584097297, + "grad_norm": 3.567034959793091, + "learning_rate": 6.871782837127846e-05, + "loss": 0.7866, + "num_input_tokens_seen": 71925200, + "step": 4470 + }, + { + "epoch": 0.313185706655459, + "grad_norm": 4.694231986999512, + "learning_rate": 6.871083012259195e-05, + "loss": 0.9634, + "num_input_tokens_seen": 71941584, + "step": 4471 + }, + { + "epoch": 0.3132557549011882, + "grad_norm": 5.802227973937988, + "learning_rate": 6.870383187390544e-05, + "loss": 1.1923, + "num_input_tokens_seen": 71957968, + "step": 4472 + }, + { + "epoch": 0.31332580314691744, + "grad_norm": 4.238499641418457, + "learning_rate": 6.869683362521891e-05, + "loss": 1.3381, + "num_input_tokens_seen": 71973376, + "step": 4473 + }, + { + "epoch": 0.3133958513926467, + "grad_norm": 4.2250213623046875, + "learning_rate": 6.86898353765324e-05, + "loss": 1.0959, + "num_input_tokens_seen": 71988560, + "step": 4474 + }, + { + "epoch": 0.31346589963837596, + "grad_norm": 4.052889823913574, + "learning_rate": 6.868283712784589e-05, + "loss": 1.1711, + "num_input_tokens_seen": 72004232, + "step": 4475 + }, + { + "epoch": 0.31353594788410516, + "grad_norm": 3.694481134414673, + "learning_rate": 6.867583887915938e-05, + "loss": 1.187, + "num_input_tokens_seen": 72020616, + "step": 4476 + }, + { + "epoch": 0.3136059961298344, + "grad_norm": 4.2295074462890625, + "learning_rate": 6.866884063047285e-05, + "loss": 1.2454, + "num_input_tokens_seen": 72036912, + "step": 4477 + }, + { + "epoch": 0.3136760443755637, + "grad_norm": 3.9813766479492188, + "learning_rate": 6.866184238178634e-05, + "loss": 1.1627, + "num_input_tokens_seen": 72053296, + "step": 4478 + }, + { + "epoch": 0.31374609262129294, + "grad_norm": 4.473883152008057, + "learning_rate": 6.865484413309983e-05, + "loss": 1.0522, + "num_input_tokens_seen": 72069680, + "step": 4479 + }, + { + "epoch": 0.31381614086702214, + "grad_norm": 3.7663521766662598, + "learning_rate": 6.86478458844133e-05, + "loss": 0.937, + "num_input_tokens_seen": 72085840, + "step": 4480 + }, + { + "epoch": 0.3138861891127514, + "grad_norm": 3.9587883949279785, + "learning_rate": 6.864084763572679e-05, + "loss": 1.1194, + "num_input_tokens_seen": 72102224, + "step": 4481 + }, + { + "epoch": 0.31395623735848066, + "grad_norm": 3.953232526779175, + "learning_rate": 6.86338493870403e-05, + "loss": 0.9581, + "num_input_tokens_seen": 72118608, + "step": 4482 + }, + { + "epoch": 0.3140262856042099, + "grad_norm": 3.917574882507324, + "learning_rate": 6.862685113835377e-05, + "loss": 1.1883, + "num_input_tokens_seen": 72134504, + "step": 4483 + }, + { + "epoch": 0.3140963338499391, + "grad_norm": 3.756253242492676, + "learning_rate": 6.861985288966726e-05, + "loss": 1.1057, + "num_input_tokens_seen": 72150368, + "step": 4484 + }, + { + "epoch": 0.3141663820956684, + "grad_norm": 4.146200656890869, + "learning_rate": 6.861285464098075e-05, + "loss": 0.98, + "num_input_tokens_seen": 72166752, + "step": 4485 + }, + { + "epoch": 0.31423643034139764, + "grad_norm": 3.98949933052063, + "learning_rate": 6.860585639229422e-05, + "loss": 1.2088, + "num_input_tokens_seen": 72182696, + "step": 4486 + }, + { + "epoch": 0.3143064785871269, + "grad_norm": 3.99951434135437, + "learning_rate": 6.85988581436077e-05, + "loss": 1.218, + "num_input_tokens_seen": 72199080, + "step": 4487 + }, + { + "epoch": 0.3143765268328561, + "grad_norm": 4.351415157318115, + "learning_rate": 6.85918598949212e-05, + "loss": 1.0178, + "num_input_tokens_seen": 72215176, + "step": 4488 + }, + { + "epoch": 0.31444657507858537, + "grad_norm": 4.563141822814941, + "learning_rate": 6.858486164623469e-05, + "loss": 1.0002, + "num_input_tokens_seen": 72231560, + "step": 4489 + }, + { + "epoch": 0.3145166233243146, + "grad_norm": 4.523083686828613, + "learning_rate": 6.857786339754816e-05, + "loss": 1.1464, + "num_input_tokens_seen": 72246920, + "step": 4490 + }, + { + "epoch": 0.3145866715700439, + "grad_norm": 4.032657623291016, + "learning_rate": 6.857086514886165e-05, + "loss": 1.1774, + "num_input_tokens_seen": 72263304, + "step": 4491 + }, + { + "epoch": 0.3146567198157731, + "grad_norm": 4.755338191986084, + "learning_rate": 6.856386690017514e-05, + "loss": 1.0756, + "num_input_tokens_seen": 72279688, + "step": 4492 + }, + { + "epoch": 0.31472676806150235, + "grad_norm": 4.037180423736572, + "learning_rate": 6.855686865148862e-05, + "loss": 1.2973, + "num_input_tokens_seen": 72296072, + "step": 4493 + }, + { + "epoch": 0.3147968163072316, + "grad_norm": 3.308746099472046, + "learning_rate": 6.85498704028021e-05, + "loss": 0.9127, + "num_input_tokens_seen": 72312360, + "step": 4494 + }, + { + "epoch": 0.31486686455296087, + "grad_norm": 4.204549789428711, + "learning_rate": 6.854287215411559e-05, + "loss": 1.1138, + "num_input_tokens_seen": 72328744, + "step": 4495 + }, + { + "epoch": 0.31493691279869007, + "grad_norm": 4.142894744873047, + "learning_rate": 6.853587390542908e-05, + "loss": 1.0273, + "num_input_tokens_seen": 72344944, + "step": 4496 + }, + { + "epoch": 0.31500696104441933, + "grad_norm": 5.43609094619751, + "learning_rate": 6.852887565674256e-05, + "loss": 0.9369, + "num_input_tokens_seen": 72360672, + "step": 4497 + }, + { + "epoch": 0.3150770092901486, + "grad_norm": 4.20035982131958, + "learning_rate": 6.852187740805604e-05, + "loss": 1.0857, + "num_input_tokens_seen": 72376744, + "step": 4498 + }, + { + "epoch": 0.31514705753587785, + "grad_norm": 3.6777737140655518, + "learning_rate": 6.851487915936953e-05, + "loss": 1.0489, + "num_input_tokens_seen": 72393128, + "step": 4499 + }, + { + "epoch": 0.31521710578160705, + "grad_norm": 5.047235488891602, + "learning_rate": 6.850788091068301e-05, + "loss": 1.0644, + "num_input_tokens_seen": 72408016, + "step": 4500 + }, + { + "epoch": 0.3152871540273363, + "grad_norm": 4.095731258392334, + "learning_rate": 6.85008826619965e-05, + "loss": 1.0881, + "num_input_tokens_seen": 72424400, + "step": 4501 + }, + { + "epoch": 0.3153572022730656, + "grad_norm": 3.6437504291534424, + "learning_rate": 6.849388441331e-05, + "loss": 1.1428, + "num_input_tokens_seen": 72440368, + "step": 4502 + }, + { + "epoch": 0.31542725051879483, + "grad_norm": 5.345888614654541, + "learning_rate": 6.848688616462347e-05, + "loss": 1.0143, + "num_input_tokens_seen": 72456752, + "step": 4503 + }, + { + "epoch": 0.3154972987645241, + "grad_norm": 4.471817970275879, + "learning_rate": 6.847988791593695e-05, + "loss": 1.11, + "num_input_tokens_seen": 72472952, + "step": 4504 + }, + { + "epoch": 0.3155673470102533, + "grad_norm": 3.8012888431549072, + "learning_rate": 6.847288966725044e-05, + "loss": 1.1961, + "num_input_tokens_seen": 72489256, + "step": 4505 + }, + { + "epoch": 0.31563739525598256, + "grad_norm": 7.531235218048096, + "learning_rate": 6.846589141856393e-05, + "loss": 1.0254, + "num_input_tokens_seen": 72503752, + "step": 4506 + }, + { + "epoch": 0.3157074435017118, + "grad_norm": 4.075259208679199, + "learning_rate": 6.84588931698774e-05, + "loss": 1.1834, + "num_input_tokens_seen": 72520136, + "step": 4507 + }, + { + "epoch": 0.3157774917474411, + "grad_norm": 5.203637599945068, + "learning_rate": 6.84518949211909e-05, + "loss": 1.1198, + "num_input_tokens_seen": 72536520, + "step": 4508 + }, + { + "epoch": 0.3158475399931703, + "grad_norm": 5.733241081237793, + "learning_rate": 6.844489667250439e-05, + "loss": 1.1981, + "num_input_tokens_seen": 72551976, + "step": 4509 + }, + { + "epoch": 0.31591758823889954, + "grad_norm": 4.182814121246338, + "learning_rate": 6.843789842381787e-05, + "loss": 1.0879, + "num_input_tokens_seen": 72568360, + "step": 4510 + }, + { + "epoch": 0.3159876364846288, + "grad_norm": 5.769293785095215, + "learning_rate": 6.843090017513136e-05, + "loss": 0.988, + "num_input_tokens_seen": 72584744, + "step": 4511 + }, + { + "epoch": 0.31605768473035806, + "grad_norm": 5.052547454833984, + "learning_rate": 6.842390192644484e-05, + "loss": 0.952, + "num_input_tokens_seen": 72600608, + "step": 4512 + }, + { + "epoch": 0.31612773297608726, + "grad_norm": 3.7260072231292725, + "learning_rate": 6.841690367775832e-05, + "loss": 0.948, + "num_input_tokens_seen": 72616720, + "step": 4513 + }, + { + "epoch": 0.3161977812218165, + "grad_norm": 4.230448246002197, + "learning_rate": 6.840990542907181e-05, + "loss": 1.3362, + "num_input_tokens_seen": 72632896, + "step": 4514 + }, + { + "epoch": 0.3162678294675458, + "grad_norm": 3.7840049266815186, + "learning_rate": 6.84029071803853e-05, + "loss": 1.1432, + "num_input_tokens_seen": 72649280, + "step": 4515 + }, + { + "epoch": 0.31633787771327504, + "grad_norm": 3.6891443729400635, + "learning_rate": 6.839590893169878e-05, + "loss": 0.9276, + "num_input_tokens_seen": 72665664, + "step": 4516 + }, + { + "epoch": 0.31640792595900424, + "grad_norm": 5.132042407989502, + "learning_rate": 6.838891068301226e-05, + "loss": 0.9418, + "num_input_tokens_seen": 72682048, + "step": 4517 + }, + { + "epoch": 0.3164779742047335, + "grad_norm": 4.329607009887695, + "learning_rate": 6.838191243432575e-05, + "loss": 1.0247, + "num_input_tokens_seen": 72698136, + "step": 4518 + }, + { + "epoch": 0.31654802245046276, + "grad_norm": 4.269455432891846, + "learning_rate": 6.837491418563924e-05, + "loss": 1.1186, + "num_input_tokens_seen": 72714296, + "step": 4519 + }, + { + "epoch": 0.316618070696192, + "grad_norm": 3.5963287353515625, + "learning_rate": 6.836791593695271e-05, + "loss": 0.8834, + "num_input_tokens_seen": 72730680, + "step": 4520 + }, + { + "epoch": 0.3166881189419212, + "grad_norm": 3.9145658016204834, + "learning_rate": 6.83609176882662e-05, + "loss": 1.1385, + "num_input_tokens_seen": 72746296, + "step": 4521 + }, + { + "epoch": 0.3167581671876505, + "grad_norm": 4.266791820526123, + "learning_rate": 6.83539194395797e-05, + "loss": 1.0825, + "num_input_tokens_seen": 72762680, + "step": 4522 + }, + { + "epoch": 0.31682821543337975, + "grad_norm": 3.850743532180786, + "learning_rate": 6.834692119089318e-05, + "loss": 1.0558, + "num_input_tokens_seen": 72778816, + "step": 4523 + }, + { + "epoch": 0.316898263679109, + "grad_norm": 3.8117008209228516, + "learning_rate": 6.833992294220665e-05, + "loss": 0.9562, + "num_input_tokens_seen": 72794576, + "step": 4524 + }, + { + "epoch": 0.3169683119248382, + "grad_norm": 4.469017028808594, + "learning_rate": 6.833292469352014e-05, + "loss": 1.2533, + "num_input_tokens_seen": 72810960, + "step": 4525 + }, + { + "epoch": 0.31703836017056747, + "grad_norm": 3.538980007171631, + "learning_rate": 6.832592644483363e-05, + "loss": 0.9393, + "num_input_tokens_seen": 72826480, + "step": 4526 + }, + { + "epoch": 0.3171084084162967, + "grad_norm": 3.6429643630981445, + "learning_rate": 6.83189281961471e-05, + "loss": 1.0492, + "num_input_tokens_seen": 72842440, + "step": 4527 + }, + { + "epoch": 0.317178456662026, + "grad_norm": 3.876481056213379, + "learning_rate": 6.831192994746061e-05, + "loss": 1.0699, + "num_input_tokens_seen": 72858424, + "step": 4528 + }, + { + "epoch": 0.3172485049077552, + "grad_norm": 5.119854927062988, + "learning_rate": 6.83049316987741e-05, + "loss": 1.1704, + "num_input_tokens_seen": 72874808, + "step": 4529 + }, + { + "epoch": 0.31731855315348445, + "grad_norm": 3.908071994781494, + "learning_rate": 6.829793345008757e-05, + "loss": 1.0156, + "num_input_tokens_seen": 72891192, + "step": 4530 + }, + { + "epoch": 0.3173886013992137, + "grad_norm": 4.499825954437256, + "learning_rate": 6.829093520140105e-05, + "loss": 0.9863, + "num_input_tokens_seen": 72907576, + "step": 4531 + }, + { + "epoch": 0.31745864964494297, + "grad_norm": 4.060844421386719, + "learning_rate": 6.828393695271453e-05, + "loss": 1.0173, + "num_input_tokens_seen": 72923960, + "step": 4532 + }, + { + "epoch": 0.3175286978906722, + "grad_norm": 4.47066068649292, + "learning_rate": 6.827693870402802e-05, + "loss": 0.859, + "num_input_tokens_seen": 72939576, + "step": 4533 + }, + { + "epoch": 0.31759874613640143, + "grad_norm": 3.6252682209014893, + "learning_rate": 6.826994045534151e-05, + "loss": 0.996, + "num_input_tokens_seen": 72955136, + "step": 4534 + }, + { + "epoch": 0.3176687943821307, + "grad_norm": 4.25836181640625, + "learning_rate": 6.8262942206655e-05, + "loss": 1.0267, + "num_input_tokens_seen": 72971520, + "step": 4535 + }, + { + "epoch": 0.31773884262785995, + "grad_norm": 3.6240739822387695, + "learning_rate": 6.825594395796849e-05, + "loss": 1.0116, + "num_input_tokens_seen": 72987440, + "step": 4536 + }, + { + "epoch": 0.3178088908735892, + "grad_norm": 4.470614910125732, + "learning_rate": 6.824894570928196e-05, + "loss": 1.1302, + "num_input_tokens_seen": 73003824, + "step": 4537 + }, + { + "epoch": 0.3178789391193184, + "grad_norm": 3.5759263038635254, + "learning_rate": 6.824194746059545e-05, + "loss": 0.9902, + "num_input_tokens_seen": 73020208, + "step": 4538 + }, + { + "epoch": 0.3179489873650477, + "grad_norm": 4.424665451049805, + "learning_rate": 6.823494921190894e-05, + "loss": 1.0239, + "num_input_tokens_seen": 73036592, + "step": 4539 + }, + { + "epoch": 0.31801903561077693, + "grad_norm": 3.803205966949463, + "learning_rate": 6.822795096322242e-05, + "loss": 0.9315, + "num_input_tokens_seen": 73052976, + "step": 4540 + }, + { + "epoch": 0.3180890838565062, + "grad_norm": 4.25760555267334, + "learning_rate": 6.82209527145359e-05, + "loss": 1.0985, + "num_input_tokens_seen": 73069360, + "step": 4541 + }, + { + "epoch": 0.3181591321022354, + "grad_norm": 4.006928443908691, + "learning_rate": 6.82139544658494e-05, + "loss": 0.9056, + "num_input_tokens_seen": 73084624, + "step": 4542 + }, + { + "epoch": 0.31822918034796466, + "grad_norm": 3.56350040435791, + "learning_rate": 6.820695621716288e-05, + "loss": 0.8721, + "num_input_tokens_seen": 73100008, + "step": 4543 + }, + { + "epoch": 0.3182992285936939, + "grad_norm": 3.7276062965393066, + "learning_rate": 6.819995796847636e-05, + "loss": 1.1001, + "num_input_tokens_seen": 73116392, + "step": 4544 + }, + { + "epoch": 0.3183692768394232, + "grad_norm": 4.955738544464111, + "learning_rate": 6.819295971978985e-05, + "loss": 1.0459, + "num_input_tokens_seen": 73131920, + "step": 4545 + }, + { + "epoch": 0.3184393250851524, + "grad_norm": 3.5275161266326904, + "learning_rate": 6.818596147110333e-05, + "loss": 1.1006, + "num_input_tokens_seen": 73148304, + "step": 4546 + }, + { + "epoch": 0.31850937333088164, + "grad_norm": 6.4245924949646, + "learning_rate": 6.817896322241681e-05, + "loss": 1.2968, + "num_input_tokens_seen": 73164688, + "step": 4547 + }, + { + "epoch": 0.3185794215766109, + "grad_norm": 4.1172966957092285, + "learning_rate": 6.81719649737303e-05, + "loss": 1.0743, + "num_input_tokens_seen": 73181072, + "step": 4548 + }, + { + "epoch": 0.31864946982234016, + "grad_norm": 3.849090337753296, + "learning_rate": 6.81649667250438e-05, + "loss": 1.1064, + "num_input_tokens_seen": 73197456, + "step": 4549 + }, + { + "epoch": 0.31871951806806936, + "grad_norm": 6.241509437561035, + "learning_rate": 6.815796847635728e-05, + "loss": 1.0592, + "num_input_tokens_seen": 73213568, + "step": 4550 + }, + { + "epoch": 0.3187895663137986, + "grad_norm": 4.039997577667236, + "learning_rate": 6.815097022767075e-05, + "loss": 0.9789, + "num_input_tokens_seen": 73229648, + "step": 4551 + }, + { + "epoch": 0.3188596145595279, + "grad_norm": 3.757549285888672, + "learning_rate": 6.814397197898424e-05, + "loss": 1.1547, + "num_input_tokens_seen": 73245952, + "step": 4552 + }, + { + "epoch": 0.31892966280525714, + "grad_norm": 4.177220821380615, + "learning_rate": 6.813697373029773e-05, + "loss": 1.3134, + "num_input_tokens_seen": 73262336, + "step": 4553 + }, + { + "epoch": 0.31899971105098635, + "grad_norm": 3.659167766571045, + "learning_rate": 6.812997548161122e-05, + "loss": 0.9954, + "num_input_tokens_seen": 73278304, + "step": 4554 + }, + { + "epoch": 0.3190697592967156, + "grad_norm": 4.289649486541748, + "learning_rate": 6.81229772329247e-05, + "loss": 0.8452, + "num_input_tokens_seen": 73294320, + "step": 4555 + }, + { + "epoch": 0.31913980754244486, + "grad_norm": 4.452631950378418, + "learning_rate": 6.811597898423819e-05, + "loss": 1.0265, + "num_input_tokens_seen": 73310256, + "step": 4556 + }, + { + "epoch": 0.3192098557881741, + "grad_norm": 3.572444438934326, + "learning_rate": 6.810898073555167e-05, + "loss": 1.0247, + "num_input_tokens_seen": 73326640, + "step": 4557 + }, + { + "epoch": 0.31927990403390333, + "grad_norm": 4.059347629547119, + "learning_rate": 6.810198248686514e-05, + "loss": 1.0103, + "num_input_tokens_seen": 73342096, + "step": 4558 + }, + { + "epoch": 0.3193499522796326, + "grad_norm": 5.144520282745361, + "learning_rate": 6.809498423817863e-05, + "loss": 1.1181, + "num_input_tokens_seen": 73358480, + "step": 4559 + }, + { + "epoch": 0.31942000052536185, + "grad_norm": 4.210456848144531, + "learning_rate": 6.808798598949212e-05, + "loss": 1.1197, + "num_input_tokens_seen": 73374864, + "step": 4560 + }, + { + "epoch": 0.3194900487710911, + "grad_norm": 5.06007194519043, + "learning_rate": 6.808098774080561e-05, + "loss": 0.8933, + "num_input_tokens_seen": 73391248, + "step": 4561 + }, + { + "epoch": 0.3195600970168203, + "grad_norm": 4.032425403594971, + "learning_rate": 6.80739894921191e-05, + "loss": 0.9132, + "num_input_tokens_seen": 73406728, + "step": 4562 + }, + { + "epoch": 0.31963014526254957, + "grad_norm": 4.344507694244385, + "learning_rate": 6.806699124343259e-05, + "loss": 1.1248, + "num_input_tokens_seen": 73423112, + "step": 4563 + }, + { + "epoch": 0.31970019350827883, + "grad_norm": 3.7113993167877197, + "learning_rate": 6.805999299474606e-05, + "loss": 0.9122, + "num_input_tokens_seen": 73439496, + "step": 4564 + }, + { + "epoch": 0.3197702417540081, + "grad_norm": 4.160495281219482, + "learning_rate": 6.805299474605955e-05, + "loss": 1.0425, + "num_input_tokens_seen": 73455456, + "step": 4565 + }, + { + "epoch": 0.3198402899997373, + "grad_norm": 5.51431131362915, + "learning_rate": 6.804599649737304e-05, + "loss": 0.9416, + "num_input_tokens_seen": 73471840, + "step": 4566 + }, + { + "epoch": 0.31991033824546655, + "grad_norm": 4.145261287689209, + "learning_rate": 6.803899824868651e-05, + "loss": 0.947, + "num_input_tokens_seen": 73487688, + "step": 4567 + }, + { + "epoch": 0.3199803864911958, + "grad_norm": 3.917922019958496, + "learning_rate": 6.8032e-05, + "loss": 1.1859, + "num_input_tokens_seen": 73504072, + "step": 4568 + }, + { + "epoch": 0.32005043473692507, + "grad_norm": 3.8644864559173584, + "learning_rate": 6.802500175131349e-05, + "loss": 0.9176, + "num_input_tokens_seen": 73520344, + "step": 4569 + }, + { + "epoch": 0.32012048298265433, + "grad_norm": 4.043839931488037, + "learning_rate": 6.801800350262698e-05, + "loss": 1.0045, + "num_input_tokens_seen": 73536248, + "step": 4570 + }, + { + "epoch": 0.32019053122838353, + "grad_norm": 4.793722629547119, + "learning_rate": 6.801100525394045e-05, + "loss": 1.2245, + "num_input_tokens_seen": 73552512, + "step": 4571 + }, + { + "epoch": 0.3202605794741128, + "grad_norm": 3.632899761199951, + "learning_rate": 6.800400700525394e-05, + "loss": 0.9899, + "num_input_tokens_seen": 73568896, + "step": 4572 + }, + { + "epoch": 0.32033062771984205, + "grad_norm": 6.236395359039307, + "learning_rate": 6.799700875656743e-05, + "loss": 1.113, + "num_input_tokens_seen": 73585280, + "step": 4573 + }, + { + "epoch": 0.3204006759655713, + "grad_norm": 4.591775417327881, + "learning_rate": 6.799001050788092e-05, + "loss": 1.0019, + "num_input_tokens_seen": 73600328, + "step": 4574 + }, + { + "epoch": 0.3204707242113005, + "grad_norm": 3.9546539783477783, + "learning_rate": 6.79830122591944e-05, + "loss": 1.0444, + "num_input_tokens_seen": 73616568, + "step": 4575 + }, + { + "epoch": 0.3205407724570298, + "grad_norm": 4.425241470336914, + "learning_rate": 6.79760140105079e-05, + "loss": 1.0112, + "num_input_tokens_seen": 73632552, + "step": 4576 + }, + { + "epoch": 0.32061082070275904, + "grad_norm": 3.999953508377075, + "learning_rate": 6.796901576182137e-05, + "loss": 1.1854, + "num_input_tokens_seen": 73648672, + "step": 4577 + }, + { + "epoch": 0.3206808689484883, + "grad_norm": 3.6718766689300537, + "learning_rate": 6.796201751313485e-05, + "loss": 1.0379, + "num_input_tokens_seen": 73665056, + "step": 4578 + }, + { + "epoch": 0.3207509171942175, + "grad_norm": 4.37136173248291, + "learning_rate": 6.795501926444834e-05, + "loss": 0.9921, + "num_input_tokens_seen": 73679680, + "step": 4579 + }, + { + "epoch": 0.32082096543994676, + "grad_norm": 5.109454154968262, + "learning_rate": 6.794802101576182e-05, + "loss": 1.0901, + "num_input_tokens_seen": 73695768, + "step": 4580 + }, + { + "epoch": 0.320891013685676, + "grad_norm": 4.277298927307129, + "learning_rate": 6.794102276707531e-05, + "loss": 1.0651, + "num_input_tokens_seen": 73711840, + "step": 4581 + }, + { + "epoch": 0.3209610619314053, + "grad_norm": 4.598893165588379, + "learning_rate": 6.79340245183888e-05, + "loss": 1.1733, + "num_input_tokens_seen": 73728184, + "step": 4582 + }, + { + "epoch": 0.3210311101771345, + "grad_norm": 5.124484539031982, + "learning_rate": 6.792702626970229e-05, + "loss": 0.9399, + "num_input_tokens_seen": 73744568, + "step": 4583 + }, + { + "epoch": 0.32110115842286374, + "grad_norm": 4.426584243774414, + "learning_rate": 6.792002802101577e-05, + "loss": 0.8339, + "num_input_tokens_seen": 73760424, + "step": 4584 + }, + { + "epoch": 0.321171206668593, + "grad_norm": 3.5181384086608887, + "learning_rate": 6.791302977232924e-05, + "loss": 0.8025, + "num_input_tokens_seen": 73776808, + "step": 4585 + }, + { + "epoch": 0.32124125491432226, + "grad_norm": 6.614295482635498, + "learning_rate": 6.790603152364273e-05, + "loss": 1.1392, + "num_input_tokens_seen": 73793192, + "step": 4586 + }, + { + "epoch": 0.32131130316005146, + "grad_norm": 5.212308406829834, + "learning_rate": 6.789903327495622e-05, + "loss": 1.0909, + "num_input_tokens_seen": 73809576, + "step": 4587 + }, + { + "epoch": 0.3213813514057807, + "grad_norm": 4.7378106117248535, + "learning_rate": 6.78920350262697e-05, + "loss": 1.112, + "num_input_tokens_seen": 73825680, + "step": 4588 + }, + { + "epoch": 0.32145139965151, + "grad_norm": 5.0195136070251465, + "learning_rate": 6.78850367775832e-05, + "loss": 1.4437, + "num_input_tokens_seen": 73841200, + "step": 4589 + }, + { + "epoch": 0.32152144789723924, + "grad_norm": 6.186412811279297, + "learning_rate": 6.787803852889668e-05, + "loss": 1.0715, + "num_input_tokens_seen": 73857584, + "step": 4590 + }, + { + "epoch": 0.32159149614296845, + "grad_norm": 6.835412502288818, + "learning_rate": 6.787104028021016e-05, + "loss": 0.9454, + "num_input_tokens_seen": 73873624, + "step": 4591 + }, + { + "epoch": 0.3216615443886977, + "grad_norm": 4.3859333992004395, + "learning_rate": 6.786404203152365e-05, + "loss": 0.9344, + "num_input_tokens_seen": 73890008, + "step": 4592 + }, + { + "epoch": 0.32173159263442697, + "grad_norm": 3.8230555057525635, + "learning_rate": 6.785704378283714e-05, + "loss": 0.9475, + "num_input_tokens_seen": 73906392, + "step": 4593 + }, + { + "epoch": 0.3218016408801562, + "grad_norm": 4.458274841308594, + "learning_rate": 6.785004553415062e-05, + "loss": 1.1223, + "num_input_tokens_seen": 73922776, + "step": 4594 + }, + { + "epoch": 0.32187168912588543, + "grad_norm": 4.006426811218262, + "learning_rate": 6.78430472854641e-05, + "loss": 1.3019, + "num_input_tokens_seen": 73938896, + "step": 4595 + }, + { + "epoch": 0.3219417373716147, + "grad_norm": 4.637386322021484, + "learning_rate": 6.783604903677759e-05, + "loss": 1.0272, + "num_input_tokens_seen": 73955280, + "step": 4596 + }, + { + "epoch": 0.32201178561734395, + "grad_norm": 5.13168478012085, + "learning_rate": 6.782905078809108e-05, + "loss": 1.1046, + "num_input_tokens_seen": 73971480, + "step": 4597 + }, + { + "epoch": 0.3220818338630732, + "grad_norm": 3.8248770236968994, + "learning_rate": 6.782205253940455e-05, + "loss": 1.0467, + "num_input_tokens_seen": 73987712, + "step": 4598 + }, + { + "epoch": 0.3221518821088024, + "grad_norm": 5.167041778564453, + "learning_rate": 6.781505429071804e-05, + "loss": 1.005, + "num_input_tokens_seen": 74004096, + "step": 4599 + }, + { + "epoch": 0.32222193035453167, + "grad_norm": 3.779311180114746, + "learning_rate": 6.780805604203153e-05, + "loss": 0.9102, + "num_input_tokens_seen": 74020176, + "step": 4600 + }, + { + "epoch": 0.32222193035453167, + "eval_loss": 1.1318858861923218, + "eval_runtime": 0.2027, + "eval_samples_per_second": 4.933, + "eval_steps_per_second": 4.933, + "num_input_tokens_seen": 74020176, + "step": 4600 + }, + { + "epoch": 0.32229197860026093, + "grad_norm": 3.8468148708343506, + "learning_rate": 6.780105779334502e-05, + "loss": 0.9602, + "num_input_tokens_seen": 74035664, + "step": 4601 + }, + { + "epoch": 0.3223620268459902, + "grad_norm": 4.28491735458374, + "learning_rate": 6.779405954465849e-05, + "loss": 1.1125, + "num_input_tokens_seen": 74050408, + "step": 4602 + }, + { + "epoch": 0.3224320750917194, + "grad_norm": 4.872751712799072, + "learning_rate": 6.7787061295972e-05, + "loss": 0.9746, + "num_input_tokens_seen": 74066336, + "step": 4603 + }, + { + "epoch": 0.32250212333744865, + "grad_norm": 4.060647487640381, + "learning_rate": 6.778006304728547e-05, + "loss": 1.0575, + "num_input_tokens_seen": 74082720, + "step": 4604 + }, + { + "epoch": 0.3225721715831779, + "grad_norm": 3.607623815536499, + "learning_rate": 6.777306479859894e-05, + "loss": 0.9797, + "num_input_tokens_seen": 74099104, + "step": 4605 + }, + { + "epoch": 0.3226422198289072, + "grad_norm": 3.719801187515259, + "learning_rate": 6.776606654991243e-05, + "loss": 1.0249, + "num_input_tokens_seen": 74115488, + "step": 4606 + }, + { + "epoch": 0.32271226807463643, + "grad_norm": 5.072197914123535, + "learning_rate": 6.775906830122592e-05, + "loss": 1.1264, + "num_input_tokens_seen": 74131224, + "step": 4607 + }, + { + "epoch": 0.32278231632036564, + "grad_norm": 6.052949905395508, + "learning_rate": 6.775207005253941e-05, + "loss": 1.1319, + "num_input_tokens_seen": 74147608, + "step": 4608 + }, + { + "epoch": 0.3228523645660949, + "grad_norm": 6.214832782745361, + "learning_rate": 6.77450718038529e-05, + "loss": 0.9861, + "num_input_tokens_seen": 74162840, + "step": 4609 + }, + { + "epoch": 0.32292241281182416, + "grad_norm": 4.279264450073242, + "learning_rate": 6.773807355516639e-05, + "loss": 0.9855, + "num_input_tokens_seen": 74179224, + "step": 4610 + }, + { + "epoch": 0.3229924610575534, + "grad_norm": 3.8564460277557373, + "learning_rate": 6.773107530647986e-05, + "loss": 1.0555, + "num_input_tokens_seen": 74195608, + "step": 4611 + }, + { + "epoch": 0.3230625093032826, + "grad_norm": 4.747770309448242, + "learning_rate": 6.772407705779334e-05, + "loss": 1.0011, + "num_input_tokens_seen": 74211664, + "step": 4612 + }, + { + "epoch": 0.3231325575490119, + "grad_norm": 3.5425655841827393, + "learning_rate": 6.771707880910683e-05, + "loss": 1.135, + "num_input_tokens_seen": 74228048, + "step": 4613 + }, + { + "epoch": 0.32320260579474114, + "grad_norm": 3.919851303100586, + "learning_rate": 6.771008056042033e-05, + "loss": 0.9791, + "num_input_tokens_seen": 74243424, + "step": 4614 + }, + { + "epoch": 0.3232726540404704, + "grad_norm": 4.061427593231201, + "learning_rate": 6.77030823117338e-05, + "loss": 1.2477, + "num_input_tokens_seen": 74259696, + "step": 4615 + }, + { + "epoch": 0.3233427022861996, + "grad_norm": 5.14341926574707, + "learning_rate": 6.769608406304729e-05, + "loss": 0.9715, + "num_input_tokens_seen": 74274968, + "step": 4616 + }, + { + "epoch": 0.32341275053192886, + "grad_norm": 6.207670211791992, + "learning_rate": 6.768908581436078e-05, + "loss": 1.0955, + "num_input_tokens_seen": 74291352, + "step": 4617 + }, + { + "epoch": 0.3234827987776581, + "grad_norm": 5.948925971984863, + "learning_rate": 6.768208756567426e-05, + "loss": 1.1007, + "num_input_tokens_seen": 74307000, + "step": 4618 + }, + { + "epoch": 0.3235528470233874, + "grad_norm": 5.205277442932129, + "learning_rate": 6.767508931698774e-05, + "loss": 0.9458, + "num_input_tokens_seen": 74323384, + "step": 4619 + }, + { + "epoch": 0.3236228952691166, + "grad_norm": 3.8878557682037354, + "learning_rate": 6.766809106830123e-05, + "loss": 1.01, + "num_input_tokens_seen": 74339768, + "step": 4620 + }, + { + "epoch": 0.32369294351484584, + "grad_norm": 4.9194111824035645, + "learning_rate": 6.766109281961472e-05, + "loss": 1.1011, + "num_input_tokens_seen": 74355888, + "step": 4621 + }, + { + "epoch": 0.3237629917605751, + "grad_norm": 3.5212655067443848, + "learning_rate": 6.76540945709282e-05, + "loss": 1.0886, + "num_input_tokens_seen": 74372048, + "step": 4622 + }, + { + "epoch": 0.32383304000630436, + "grad_norm": 3.6212568283081055, + "learning_rate": 6.764709632224168e-05, + "loss": 1.0616, + "num_input_tokens_seen": 74388432, + "step": 4623 + }, + { + "epoch": 0.32390308825203357, + "grad_norm": 3.795515298843384, + "learning_rate": 6.764009807355517e-05, + "loss": 1.1594, + "num_input_tokens_seen": 74404584, + "step": 4624 + }, + { + "epoch": 0.3239731364977628, + "grad_norm": 4.537838935852051, + "learning_rate": 6.763309982486865e-05, + "loss": 1.1319, + "num_input_tokens_seen": 74420304, + "step": 4625 + }, + { + "epoch": 0.3240431847434921, + "grad_norm": 4.276764392852783, + "learning_rate": 6.762610157618214e-05, + "loss": 0.9162, + "num_input_tokens_seen": 74436688, + "step": 4626 + }, + { + "epoch": 0.32411323298922134, + "grad_norm": 3.9739227294921875, + "learning_rate": 6.761910332749563e-05, + "loss": 1.0002, + "num_input_tokens_seen": 74451824, + "step": 4627 + }, + { + "epoch": 0.32418328123495055, + "grad_norm": 4.176823616027832, + "learning_rate": 6.761210507880911e-05, + "loss": 1.2547, + "num_input_tokens_seen": 74467080, + "step": 4628 + }, + { + "epoch": 0.3242533294806798, + "grad_norm": 4.471405029296875, + "learning_rate": 6.760510683012259e-05, + "loss": 0.9694, + "num_input_tokens_seen": 74483464, + "step": 4629 + }, + { + "epoch": 0.32432337772640907, + "grad_norm": 3.95442271232605, + "learning_rate": 6.759810858143609e-05, + "loss": 1.1059, + "num_input_tokens_seen": 74499848, + "step": 4630 + }, + { + "epoch": 0.3243934259721383, + "grad_norm": 5.348501682281494, + "learning_rate": 6.759111033274957e-05, + "loss": 1.043, + "num_input_tokens_seen": 74516232, + "step": 4631 + }, + { + "epoch": 0.32446347421786753, + "grad_norm": 4.405150413513184, + "learning_rate": 6.758411208406304e-05, + "loss": 1.0732, + "num_input_tokens_seen": 74531120, + "step": 4632 + }, + { + "epoch": 0.3245335224635968, + "grad_norm": 3.633358955383301, + "learning_rate": 6.757711383537653e-05, + "loss": 0.9585, + "num_input_tokens_seen": 74547504, + "step": 4633 + }, + { + "epoch": 0.32460357070932605, + "grad_norm": 4.668785095214844, + "learning_rate": 6.757011558669003e-05, + "loss": 1.2355, + "num_input_tokens_seen": 74563888, + "step": 4634 + }, + { + "epoch": 0.3246736189550553, + "grad_norm": 5.222908020019531, + "learning_rate": 6.756311733800351e-05, + "loss": 1.094, + "num_input_tokens_seen": 74580224, + "step": 4635 + }, + { + "epoch": 0.3247436672007845, + "grad_norm": 3.812385082244873, + "learning_rate": 6.7556119089317e-05, + "loss": 1.1326, + "num_input_tokens_seen": 74596608, + "step": 4636 + }, + { + "epoch": 0.3248137154465138, + "grad_norm": 5.080833911895752, + "learning_rate": 6.754912084063048e-05, + "loss": 1.0665, + "num_input_tokens_seen": 74612456, + "step": 4637 + }, + { + "epoch": 0.32488376369224303, + "grad_norm": 5.309609413146973, + "learning_rate": 6.754212259194396e-05, + "loss": 1.0206, + "num_input_tokens_seen": 74627840, + "step": 4638 + }, + { + "epoch": 0.3249538119379723, + "grad_norm": 4.46236515045166, + "learning_rate": 6.753512434325743e-05, + "loss": 1.1093, + "num_input_tokens_seen": 74643800, + "step": 4639 + }, + { + "epoch": 0.32502386018370155, + "grad_norm": 9.981855392456055, + "learning_rate": 6.752812609457094e-05, + "loss": 1.2777, + "num_input_tokens_seen": 74660184, + "step": 4640 + }, + { + "epoch": 0.32509390842943076, + "grad_norm": 5.075852870941162, + "learning_rate": 6.752112784588443e-05, + "loss": 0.9977, + "num_input_tokens_seen": 74676568, + "step": 4641 + }, + { + "epoch": 0.32516395667516, + "grad_norm": 3.8985090255737305, + "learning_rate": 6.75141295971979e-05, + "loss": 1.1299, + "num_input_tokens_seen": 74692952, + "step": 4642 + }, + { + "epoch": 0.3252340049208893, + "grad_norm": 4.9769673347473145, + "learning_rate": 6.750713134851139e-05, + "loss": 1.3023, + "num_input_tokens_seen": 74709216, + "step": 4643 + }, + { + "epoch": 0.32530405316661853, + "grad_norm": 4.508238315582275, + "learning_rate": 6.750013309982488e-05, + "loss": 1.014, + "num_input_tokens_seen": 74724640, + "step": 4644 + }, + { + "epoch": 0.32537410141234774, + "grad_norm": 4.214225769042969, + "learning_rate": 6.749313485113835e-05, + "loss": 1.0864, + "num_input_tokens_seen": 74740696, + "step": 4645 + }, + { + "epoch": 0.325444149658077, + "grad_norm": 4.217604160308838, + "learning_rate": 6.748613660245184e-05, + "loss": 1.0521, + "num_input_tokens_seen": 74756520, + "step": 4646 + }, + { + "epoch": 0.32551419790380626, + "grad_norm": 3.5975253582000732, + "learning_rate": 6.747913835376533e-05, + "loss": 0.9642, + "num_input_tokens_seen": 74772904, + "step": 4647 + }, + { + "epoch": 0.3255842461495355, + "grad_norm": 3.5055267810821533, + "learning_rate": 6.747214010507882e-05, + "loss": 1.0975, + "num_input_tokens_seen": 74789288, + "step": 4648 + }, + { + "epoch": 0.3256542943952647, + "grad_norm": 3.8605833053588867, + "learning_rate": 6.746514185639229e-05, + "loss": 1.095, + "num_input_tokens_seen": 74804768, + "step": 4649 + }, + { + "epoch": 0.325724342640994, + "grad_norm": 9.446599006652832, + "learning_rate": 6.745814360770578e-05, + "loss": 1.1894, + "num_input_tokens_seen": 74821152, + "step": 4650 + }, + { + "epoch": 0.32579439088672324, + "grad_norm": 4.161158084869385, + "learning_rate": 6.745114535901927e-05, + "loss": 0.984, + "num_input_tokens_seen": 74836992, + "step": 4651 + }, + { + "epoch": 0.3258644391324525, + "grad_norm": 3.5690324306488037, + "learning_rate": 6.744414711033275e-05, + "loss": 1.0186, + "num_input_tokens_seen": 74852896, + "step": 4652 + }, + { + "epoch": 0.3259344873781817, + "grad_norm": 3.5873210430145264, + "learning_rate": 6.743714886164623e-05, + "loss": 1.0069, + "num_input_tokens_seen": 74868472, + "step": 4653 + }, + { + "epoch": 0.32600453562391096, + "grad_norm": 4.192559719085693, + "learning_rate": 6.743015061295972e-05, + "loss": 1.0646, + "num_input_tokens_seen": 74884856, + "step": 4654 + }, + { + "epoch": 0.3260745838696402, + "grad_norm": 4.633018493652344, + "learning_rate": 6.742315236427321e-05, + "loss": 1.1525, + "num_input_tokens_seen": 74900848, + "step": 4655 + }, + { + "epoch": 0.3261446321153695, + "grad_norm": 3.568934440612793, + "learning_rate": 6.741615411558669e-05, + "loss": 1.061, + "num_input_tokens_seen": 74917232, + "step": 4656 + }, + { + "epoch": 0.3262146803610987, + "grad_norm": 3.6099655628204346, + "learning_rate": 6.740915586690019e-05, + "loss": 1.0758, + "num_input_tokens_seen": 74933616, + "step": 4657 + }, + { + "epoch": 0.32628472860682795, + "grad_norm": 4.272975921630859, + "learning_rate": 6.740215761821366e-05, + "loss": 1.1901, + "num_input_tokens_seen": 74950000, + "step": 4658 + }, + { + "epoch": 0.3263547768525572, + "grad_norm": 4.2752251625061035, + "learning_rate": 6.739515936952714e-05, + "loss": 1.0835, + "num_input_tokens_seen": 74966032, + "step": 4659 + }, + { + "epoch": 0.32642482509828646, + "grad_norm": 5.06410551071167, + "learning_rate": 6.738816112084064e-05, + "loss": 1.2041, + "num_input_tokens_seen": 74981432, + "step": 4660 + }, + { + "epoch": 0.32649487334401567, + "grad_norm": 6.378856182098389, + "learning_rate": 6.738116287215413e-05, + "loss": 1.2996, + "num_input_tokens_seen": 74997440, + "step": 4661 + }, + { + "epoch": 0.3265649215897449, + "grad_norm": 5.427485466003418, + "learning_rate": 6.73741646234676e-05, + "loss": 1.2233, + "num_input_tokens_seen": 75013824, + "step": 4662 + }, + { + "epoch": 0.3266349698354742, + "grad_norm": 4.366839408874512, + "learning_rate": 6.736716637478109e-05, + "loss": 1.2077, + "num_input_tokens_seen": 75030208, + "step": 4663 + }, + { + "epoch": 0.32670501808120345, + "grad_norm": 5.765005588531494, + "learning_rate": 6.736016812609458e-05, + "loss": 1.0833, + "num_input_tokens_seen": 75046592, + "step": 4664 + }, + { + "epoch": 0.32677506632693265, + "grad_norm": 3.4886975288391113, + "learning_rate": 6.735316987740806e-05, + "loss": 0.7976, + "num_input_tokens_seen": 75062976, + "step": 4665 + }, + { + "epoch": 0.3268451145726619, + "grad_norm": 4.1105875968933105, + "learning_rate": 6.734617162872154e-05, + "loss": 1.011, + "num_input_tokens_seen": 75078024, + "step": 4666 + }, + { + "epoch": 0.32691516281839117, + "grad_norm": 3.8737053871154785, + "learning_rate": 6.733917338003503e-05, + "loss": 1.0544, + "num_input_tokens_seen": 75094408, + "step": 4667 + }, + { + "epoch": 0.32698521106412043, + "grad_norm": 4.077807426452637, + "learning_rate": 6.733217513134852e-05, + "loss": 1.1573, + "num_input_tokens_seen": 75110792, + "step": 4668 + }, + { + "epoch": 0.32705525930984963, + "grad_norm": 4.339305400848389, + "learning_rate": 6.7325176882662e-05, + "loss": 0.7132, + "num_input_tokens_seen": 75126240, + "step": 4669 + }, + { + "epoch": 0.3271253075555789, + "grad_norm": 4.241507053375244, + "learning_rate": 6.731817863397549e-05, + "loss": 1.1594, + "num_input_tokens_seen": 75142144, + "step": 4670 + }, + { + "epoch": 0.32719535580130815, + "grad_norm": 7.518558979034424, + "learning_rate": 6.731118038528897e-05, + "loss": 1.0168, + "num_input_tokens_seen": 75158528, + "step": 4671 + }, + { + "epoch": 0.3272654040470374, + "grad_norm": 4.342295169830322, + "learning_rate": 6.730418213660245e-05, + "loss": 1.2134, + "num_input_tokens_seen": 75174912, + "step": 4672 + }, + { + "epoch": 0.3273354522927666, + "grad_norm": 3.3599188327789307, + "learning_rate": 6.729718388791594e-05, + "loss": 0.9183, + "num_input_tokens_seen": 75190720, + "step": 4673 + }, + { + "epoch": 0.3274055005384959, + "grad_norm": 4.393617153167725, + "learning_rate": 6.729018563922943e-05, + "loss": 1.1215, + "num_input_tokens_seen": 75207104, + "step": 4674 + }, + { + "epoch": 0.32747554878422513, + "grad_norm": 3.948538064956665, + "learning_rate": 6.728318739054292e-05, + "loss": 0.9105, + "num_input_tokens_seen": 75222736, + "step": 4675 + }, + { + "epoch": 0.3275455970299544, + "grad_norm": 5.3323469161987305, + "learning_rate": 6.727618914185639e-05, + "loss": 0.9977, + "num_input_tokens_seen": 75238680, + "step": 4676 + }, + { + "epoch": 0.32761564527568365, + "grad_norm": 4.943187713623047, + "learning_rate": 6.726919089316988e-05, + "loss": 0.9327, + "num_input_tokens_seen": 75255064, + "step": 4677 + }, + { + "epoch": 0.32768569352141286, + "grad_norm": 4.083932399749756, + "learning_rate": 6.726219264448337e-05, + "loss": 1.2085, + "num_input_tokens_seen": 75271448, + "step": 4678 + }, + { + "epoch": 0.3277557417671421, + "grad_norm": 4.682622909545898, + "learning_rate": 6.725519439579684e-05, + "loss": 1.0105, + "num_input_tokens_seen": 75287752, + "step": 4679 + }, + { + "epoch": 0.3278257900128714, + "grad_norm": 4.544816493988037, + "learning_rate": 6.724819614711033e-05, + "loss": 1.0422, + "num_input_tokens_seen": 75304136, + "step": 4680 + }, + { + "epoch": 0.32789583825860064, + "grad_norm": 3.859891176223755, + "learning_rate": 6.724119789842383e-05, + "loss": 0.9317, + "num_input_tokens_seen": 75320520, + "step": 4681 + }, + { + "epoch": 0.32796588650432984, + "grad_norm": 5.739070415496826, + "learning_rate": 6.723419964973731e-05, + "loss": 1.1315, + "num_input_tokens_seen": 75336904, + "step": 4682 + }, + { + "epoch": 0.3280359347500591, + "grad_norm": 4.289483547210693, + "learning_rate": 6.722720140105078e-05, + "loss": 1.0576, + "num_input_tokens_seen": 75353288, + "step": 4683 + }, + { + "epoch": 0.32810598299578836, + "grad_norm": 4.03695011138916, + "learning_rate": 6.722020315236429e-05, + "loss": 1.0129, + "num_input_tokens_seen": 75369424, + "step": 4684 + }, + { + "epoch": 0.3281760312415176, + "grad_norm": 3.8941352367401123, + "learning_rate": 6.721320490367776e-05, + "loss": 1.015, + "num_input_tokens_seen": 75385760, + "step": 4685 + }, + { + "epoch": 0.3282460794872468, + "grad_norm": 4.345769882202148, + "learning_rate": 6.720620665499125e-05, + "loss": 0.9842, + "num_input_tokens_seen": 75401736, + "step": 4686 + }, + { + "epoch": 0.3283161277329761, + "grad_norm": 5.759182453155518, + "learning_rate": 6.719920840630474e-05, + "loss": 1.0937, + "num_input_tokens_seen": 75417928, + "step": 4687 + }, + { + "epoch": 0.32838617597870534, + "grad_norm": 4.947919845581055, + "learning_rate": 6.719221015761823e-05, + "loss": 1.0346, + "num_input_tokens_seen": 75433624, + "step": 4688 + }, + { + "epoch": 0.3284562242244346, + "grad_norm": 3.936934471130371, + "learning_rate": 6.71852119089317e-05, + "loss": 1.1684, + "num_input_tokens_seen": 75450008, + "step": 4689 + }, + { + "epoch": 0.3285262724701638, + "grad_norm": 3.7944555282592773, + "learning_rate": 6.717821366024519e-05, + "loss": 0.9825, + "num_input_tokens_seen": 75466392, + "step": 4690 + }, + { + "epoch": 0.32859632071589306, + "grad_norm": 3.8094451427459717, + "learning_rate": 6.717121541155868e-05, + "loss": 0.9309, + "num_input_tokens_seen": 75482776, + "step": 4691 + }, + { + "epoch": 0.3286663689616223, + "grad_norm": 4.426685333251953, + "learning_rate": 6.716421716287215e-05, + "loss": 0.9497, + "num_input_tokens_seen": 75497760, + "step": 4692 + }, + { + "epoch": 0.3287364172073516, + "grad_norm": 4.299224376678467, + "learning_rate": 6.715721891418564e-05, + "loss": 1.214, + "num_input_tokens_seen": 75513024, + "step": 4693 + }, + { + "epoch": 0.3288064654530808, + "grad_norm": 3.765477418899536, + "learning_rate": 6.715022066549913e-05, + "loss": 1.2114, + "num_input_tokens_seen": 75529304, + "step": 4694 + }, + { + "epoch": 0.32887651369881005, + "grad_norm": 3.991591453552246, + "learning_rate": 6.714322241681262e-05, + "loss": 0.8295, + "num_input_tokens_seen": 75545264, + "step": 4695 + }, + { + "epoch": 0.3289465619445393, + "grad_norm": 3.652726888656616, + "learning_rate": 6.71362241681261e-05, + "loss": 0.953, + "num_input_tokens_seen": 75561648, + "step": 4696 + }, + { + "epoch": 0.32901661019026857, + "grad_norm": 6.083689212799072, + "learning_rate": 6.712922591943958e-05, + "loss": 1.0838, + "num_input_tokens_seen": 75578032, + "step": 4697 + }, + { + "epoch": 0.32908665843599777, + "grad_norm": 4.732533931732178, + "learning_rate": 6.712222767075307e-05, + "loss": 0.9885, + "num_input_tokens_seen": 75593944, + "step": 4698 + }, + { + "epoch": 0.32915670668172703, + "grad_norm": 5.024901866912842, + "learning_rate": 6.711522942206655e-05, + "loss": 0.887, + "num_input_tokens_seen": 75610328, + "step": 4699 + }, + { + "epoch": 0.3292267549274563, + "grad_norm": 4.663429260253906, + "learning_rate": 6.710823117338004e-05, + "loss": 1.0955, + "num_input_tokens_seen": 75626712, + "step": 4700 + }, + { + "epoch": 0.32929680317318555, + "grad_norm": 4.396904945373535, + "learning_rate": 6.710123292469354e-05, + "loss": 1.2419, + "num_input_tokens_seen": 75643096, + "step": 4701 + }, + { + "epoch": 0.32936685141891475, + "grad_norm": 3.7963149547576904, + "learning_rate": 6.709423467600701e-05, + "loss": 1.1536, + "num_input_tokens_seen": 75658616, + "step": 4702 + }, + { + "epoch": 0.329436899664644, + "grad_norm": 4.154513835906982, + "learning_rate": 6.708723642732049e-05, + "loss": 1.0529, + "num_input_tokens_seen": 75675000, + "step": 4703 + }, + { + "epoch": 0.32950694791037327, + "grad_norm": 3.8939032554626465, + "learning_rate": 6.708023817863398e-05, + "loss": 1.115, + "num_input_tokens_seen": 75690728, + "step": 4704 + }, + { + "epoch": 0.32957699615610253, + "grad_norm": 4.7678375244140625, + "learning_rate": 6.707323992994746e-05, + "loss": 0.9747, + "num_input_tokens_seen": 75707080, + "step": 4705 + }, + { + "epoch": 0.32964704440183173, + "grad_norm": 6.56498384475708, + "learning_rate": 6.706624168126094e-05, + "loss": 1.058, + "num_input_tokens_seen": 75723464, + "step": 4706 + }, + { + "epoch": 0.329717092647561, + "grad_norm": 6.917506694793701, + "learning_rate": 6.705924343257444e-05, + "loss": 1.0576, + "num_input_tokens_seen": 75739848, + "step": 4707 + }, + { + "epoch": 0.32978714089329025, + "grad_norm": 3.9431846141815186, + "learning_rate": 6.705224518388793e-05, + "loss": 0.9693, + "num_input_tokens_seen": 75756232, + "step": 4708 + }, + { + "epoch": 0.3298571891390195, + "grad_norm": 4.838469505310059, + "learning_rate": 6.70452469352014e-05, + "loss": 1.2367, + "num_input_tokens_seen": 75772616, + "step": 4709 + }, + { + "epoch": 0.3299272373847488, + "grad_norm": 4.0371012687683105, + "learning_rate": 6.703824868651488e-05, + "loss": 1.0494, + "num_input_tokens_seen": 75789000, + "step": 4710 + }, + { + "epoch": 0.329997285630478, + "grad_norm": 3.491875410079956, + "learning_rate": 6.703125043782838e-05, + "loss": 0.8919, + "num_input_tokens_seen": 75805384, + "step": 4711 + }, + { + "epoch": 0.33006733387620724, + "grad_norm": 3.5304512977600098, + "learning_rate": 6.702425218914186e-05, + "loss": 0.8896, + "num_input_tokens_seen": 75821104, + "step": 4712 + }, + { + "epoch": 0.3301373821219365, + "grad_norm": 3.642528533935547, + "learning_rate": 6.701725394045535e-05, + "loss": 0.9843, + "num_input_tokens_seen": 75837424, + "step": 4713 + }, + { + "epoch": 0.33020743036766576, + "grad_norm": 6.536950588226318, + "learning_rate": 6.701025569176883e-05, + "loss": 0.9545, + "num_input_tokens_seen": 75853808, + "step": 4714 + }, + { + "epoch": 0.33027747861339496, + "grad_norm": 3.376460075378418, + "learning_rate": 6.700325744308232e-05, + "loss": 0.9607, + "num_input_tokens_seen": 75870192, + "step": 4715 + }, + { + "epoch": 0.3303475268591242, + "grad_norm": 4.988052845001221, + "learning_rate": 6.69962591943958e-05, + "loss": 1.1392, + "num_input_tokens_seen": 75886576, + "step": 4716 + }, + { + "epoch": 0.3304175751048535, + "grad_norm": 4.724236965179443, + "learning_rate": 6.698926094570929e-05, + "loss": 1.0015, + "num_input_tokens_seen": 75902960, + "step": 4717 + }, + { + "epoch": 0.33048762335058274, + "grad_norm": 4.877357006072998, + "learning_rate": 6.698226269702278e-05, + "loss": 0.9892, + "num_input_tokens_seen": 75919344, + "step": 4718 + }, + { + "epoch": 0.33055767159631194, + "grad_norm": 3.981224775314331, + "learning_rate": 6.697526444833625e-05, + "loss": 0.9356, + "num_input_tokens_seen": 75935728, + "step": 4719 + }, + { + "epoch": 0.3306277198420412, + "grad_norm": 5.456554889678955, + "learning_rate": 6.696826619964974e-05, + "loss": 0.8373, + "num_input_tokens_seen": 75951304, + "step": 4720 + }, + { + "epoch": 0.33069776808777046, + "grad_norm": 3.9885287284851074, + "learning_rate": 6.696126795096323e-05, + "loss": 1.2658, + "num_input_tokens_seen": 75967688, + "step": 4721 + }, + { + "epoch": 0.3307678163334997, + "grad_norm": 3.447371482849121, + "learning_rate": 6.695426970227672e-05, + "loss": 0.9301, + "num_input_tokens_seen": 75984072, + "step": 4722 + }, + { + "epoch": 0.3308378645792289, + "grad_norm": 4.405709743499756, + "learning_rate": 6.694727145359019e-05, + "loss": 1.2445, + "num_input_tokens_seen": 76000456, + "step": 4723 + }, + { + "epoch": 0.3309079128249582, + "grad_norm": 3.7595372200012207, + "learning_rate": 6.694027320490368e-05, + "loss": 1.1851, + "num_input_tokens_seen": 76016840, + "step": 4724 + }, + { + "epoch": 0.33097796107068744, + "grad_norm": 5.460091590881348, + "learning_rate": 6.693327495621717e-05, + "loss": 0.8514, + "num_input_tokens_seen": 76032344, + "step": 4725 + }, + { + "epoch": 0.3310480093164167, + "grad_norm": 7.111250400543213, + "learning_rate": 6.692627670753064e-05, + "loss": 1.0086, + "num_input_tokens_seen": 76048728, + "step": 4726 + }, + { + "epoch": 0.3311180575621459, + "grad_norm": 4.799232482910156, + "learning_rate": 6.691927845884415e-05, + "loss": 0.9995, + "num_input_tokens_seen": 76063832, + "step": 4727 + }, + { + "epoch": 0.33118810580787517, + "grad_norm": 4.045900344848633, + "learning_rate": 6.691228021015763e-05, + "loss": 1.162, + "num_input_tokens_seen": 76079792, + "step": 4728 + }, + { + "epoch": 0.3312581540536044, + "grad_norm": 3.9942305088043213, + "learning_rate": 6.690528196147111e-05, + "loss": 1.1444, + "num_input_tokens_seen": 76095992, + "step": 4729 + }, + { + "epoch": 0.3313282022993337, + "grad_norm": 4.173962116241455, + "learning_rate": 6.689828371278458e-05, + "loss": 0.928, + "num_input_tokens_seen": 76111760, + "step": 4730 + }, + { + "epoch": 0.3313982505450629, + "grad_norm": 8.357215881347656, + "learning_rate": 6.689128546409807e-05, + "loss": 1.1803, + "num_input_tokens_seen": 76127184, + "step": 4731 + }, + { + "epoch": 0.33146829879079215, + "grad_norm": 3.7359249591827393, + "learning_rate": 6.688428721541156e-05, + "loss": 1.0539, + "num_input_tokens_seen": 76143536, + "step": 4732 + }, + { + "epoch": 0.3315383470365214, + "grad_norm": 4.159603595733643, + "learning_rate": 6.687728896672505e-05, + "loss": 1.1565, + "num_input_tokens_seen": 76159640, + "step": 4733 + }, + { + "epoch": 0.33160839528225067, + "grad_norm": 4.893441200256348, + "learning_rate": 6.687029071803854e-05, + "loss": 1.191, + "num_input_tokens_seen": 76176024, + "step": 4734 + }, + { + "epoch": 0.33167844352797987, + "grad_norm": 4.4292426109313965, + "learning_rate": 6.686329246935203e-05, + "loss": 1.1852, + "num_input_tokens_seen": 76192408, + "step": 4735 + }, + { + "epoch": 0.33174849177370913, + "grad_norm": 3.612821102142334, + "learning_rate": 6.68562942206655e-05, + "loss": 1.0195, + "num_input_tokens_seen": 76208792, + "step": 4736 + }, + { + "epoch": 0.3318185400194384, + "grad_norm": 3.6046557426452637, + "learning_rate": 6.684929597197898e-05, + "loss": 1.1402, + "num_input_tokens_seen": 76225176, + "step": 4737 + }, + { + "epoch": 0.33188858826516765, + "grad_norm": 4.637216567993164, + "learning_rate": 6.684229772329248e-05, + "loss": 0.8202, + "num_input_tokens_seen": 76241560, + "step": 4738 + }, + { + "epoch": 0.33195863651089685, + "grad_norm": 4.83438777923584, + "learning_rate": 6.683529947460595e-05, + "loss": 0.9085, + "num_input_tokens_seen": 76257264, + "step": 4739 + }, + { + "epoch": 0.3320286847566261, + "grad_norm": 3.903982400894165, + "learning_rate": 6.682830122591944e-05, + "loss": 1.2306, + "num_input_tokens_seen": 76273608, + "step": 4740 + }, + { + "epoch": 0.3320987330023554, + "grad_norm": 6.24022102355957, + "learning_rate": 6.682130297723293e-05, + "loss": 0.9706, + "num_input_tokens_seen": 76289840, + "step": 4741 + }, + { + "epoch": 0.33216878124808463, + "grad_norm": 5.286207675933838, + "learning_rate": 6.681430472854642e-05, + "loss": 1.0803, + "num_input_tokens_seen": 76306088, + "step": 4742 + }, + { + "epoch": 0.3322388294938139, + "grad_norm": 5.145969867706299, + "learning_rate": 6.68073064798599e-05, + "loss": 1.2303, + "num_input_tokens_seen": 76322152, + "step": 4743 + }, + { + "epoch": 0.3323088777395431, + "grad_norm": 3.6806249618530273, + "learning_rate": 6.680030823117338e-05, + "loss": 1.0168, + "num_input_tokens_seen": 76338424, + "step": 4744 + }, + { + "epoch": 0.33237892598527236, + "grad_norm": 3.743912696838379, + "learning_rate": 6.679330998248687e-05, + "loss": 0.9507, + "num_input_tokens_seen": 76354808, + "step": 4745 + }, + { + "epoch": 0.3324489742310016, + "grad_norm": 5.072415828704834, + "learning_rate": 6.678631173380035e-05, + "loss": 1.014, + "num_input_tokens_seen": 76369696, + "step": 4746 + }, + { + "epoch": 0.3325190224767309, + "grad_norm": 3.366450548171997, + "learning_rate": 6.677931348511384e-05, + "loss": 0.9201, + "num_input_tokens_seen": 76385560, + "step": 4747 + }, + { + "epoch": 0.3325890707224601, + "grad_norm": 3.8318989276885986, + "learning_rate": 6.677231523642732e-05, + "loss": 1.0973, + "num_input_tokens_seen": 76401168, + "step": 4748 + }, + { + "epoch": 0.33265911896818934, + "grad_norm": 3.9670164585113525, + "learning_rate": 6.676531698774081e-05, + "loss": 0.9238, + "num_input_tokens_seen": 76417552, + "step": 4749 + }, + { + "epoch": 0.3327291672139186, + "grad_norm": 4.344585418701172, + "learning_rate": 6.675831873905429e-05, + "loss": 1.0099, + "num_input_tokens_seen": 76433936, + "step": 4750 + }, + { + "epoch": 0.33279921545964786, + "grad_norm": 7.547675132751465, + "learning_rate": 6.675132049036778e-05, + "loss": 1.1412, + "num_input_tokens_seen": 76450320, + "step": 4751 + }, + { + "epoch": 0.33286926370537706, + "grad_norm": 7.854677677154541, + "learning_rate": 6.674432224168127e-05, + "loss": 0.8778, + "num_input_tokens_seen": 76465696, + "step": 4752 + }, + { + "epoch": 0.3329393119511063, + "grad_norm": 4.030972480773926, + "learning_rate": 6.673732399299475e-05, + "loss": 1.1389, + "num_input_tokens_seen": 76482080, + "step": 4753 + }, + { + "epoch": 0.3330093601968356, + "grad_norm": 5.990024089813232, + "learning_rate": 6.673032574430824e-05, + "loss": 0.9469, + "num_input_tokens_seen": 76498464, + "step": 4754 + }, + { + "epoch": 0.33307940844256484, + "grad_norm": 3.8437137603759766, + "learning_rate": 6.672332749562173e-05, + "loss": 1.009, + "num_input_tokens_seen": 76514848, + "step": 4755 + }, + { + "epoch": 0.33314945668829404, + "grad_norm": 3.883882761001587, + "learning_rate": 6.67163292469352e-05, + "loss": 1.0267, + "num_input_tokens_seen": 76531232, + "step": 4756 + }, + { + "epoch": 0.3332195049340233, + "grad_norm": 4.205630779266357, + "learning_rate": 6.670933099824868e-05, + "loss": 1.0847, + "num_input_tokens_seen": 76547616, + "step": 4757 + }, + { + "epoch": 0.33328955317975256, + "grad_norm": 6.173430442810059, + "learning_rate": 6.670233274956217e-05, + "loss": 1.2014, + "num_input_tokens_seen": 76564000, + "step": 4758 + }, + { + "epoch": 0.3333596014254818, + "grad_norm": 3.464181661605835, + "learning_rate": 6.669533450087566e-05, + "loss": 0.8751, + "num_input_tokens_seen": 76579904, + "step": 4759 + }, + { + "epoch": 0.333429649671211, + "grad_norm": 3.3506994247436523, + "learning_rate": 6.668833625218915e-05, + "loss": 0.8281, + "num_input_tokens_seen": 76596288, + "step": 4760 + }, + { + "epoch": 0.3334996979169403, + "grad_norm": 7.188508987426758, + "learning_rate": 6.668133800350264e-05, + "loss": 1.1058, + "num_input_tokens_seen": 76611632, + "step": 4761 + }, + { + "epoch": 0.33356974616266954, + "grad_norm": 3.916689872741699, + "learning_rate": 6.667433975481612e-05, + "loss": 1.0815, + "num_input_tokens_seen": 76626840, + "step": 4762 + }, + { + "epoch": 0.3336397944083988, + "grad_norm": 3.4827966690063477, + "learning_rate": 6.66673415061296e-05, + "loss": 0.9103, + "num_input_tokens_seen": 76643024, + "step": 4763 + }, + { + "epoch": 0.333709842654128, + "grad_norm": 4.479428768157959, + "learning_rate": 6.666034325744307e-05, + "loss": 0.9238, + "num_input_tokens_seen": 76659408, + "step": 4764 + }, + { + "epoch": 0.33377989089985727, + "grad_norm": 6.008899211883545, + "learning_rate": 6.665334500875658e-05, + "loss": 1.2375, + "num_input_tokens_seen": 76675272, + "step": 4765 + }, + { + "epoch": 0.3338499391455865, + "grad_norm": 4.10992431640625, + "learning_rate": 6.664634676007005e-05, + "loss": 1.0539, + "num_input_tokens_seen": 76691000, + "step": 4766 + }, + { + "epoch": 0.3339199873913158, + "grad_norm": 3.953507423400879, + "learning_rate": 6.663934851138354e-05, + "loss": 1.1051, + "num_input_tokens_seen": 76707024, + "step": 4767 + }, + { + "epoch": 0.333990035637045, + "grad_norm": 4.237090587615967, + "learning_rate": 6.663235026269703e-05, + "loss": 1.1683, + "num_input_tokens_seen": 76723408, + "step": 4768 + }, + { + "epoch": 0.33406008388277425, + "grad_norm": 4.417295932769775, + "learning_rate": 6.662535201401052e-05, + "loss": 0.923, + "num_input_tokens_seen": 76739296, + "step": 4769 + }, + { + "epoch": 0.3341301321285035, + "grad_norm": 3.664970874786377, + "learning_rate": 6.661835376532399e-05, + "loss": 0.9556, + "num_input_tokens_seen": 76755432, + "step": 4770 + }, + { + "epoch": 0.33420018037423277, + "grad_norm": 3.702932834625244, + "learning_rate": 6.661135551663748e-05, + "loss": 0.9457, + "num_input_tokens_seen": 76770688, + "step": 4771 + }, + { + "epoch": 0.334270228619962, + "grad_norm": 3.741722822189331, + "learning_rate": 6.660435726795097e-05, + "loss": 1.0923, + "num_input_tokens_seen": 76787072, + "step": 4772 + }, + { + "epoch": 0.33434027686569123, + "grad_norm": 3.9605424404144287, + "learning_rate": 6.659735901926446e-05, + "loss": 1.0823, + "num_input_tokens_seen": 76803456, + "step": 4773 + }, + { + "epoch": 0.3344103251114205, + "grad_norm": 3.9401822090148926, + "learning_rate": 6.659036077057793e-05, + "loss": 0.996, + "num_input_tokens_seen": 76819840, + "step": 4774 + }, + { + "epoch": 0.33448037335714975, + "grad_norm": 3.8762905597686768, + "learning_rate": 6.658336252189142e-05, + "loss": 0.9796, + "num_input_tokens_seen": 76836224, + "step": 4775 + }, + { + "epoch": 0.33455042160287896, + "grad_norm": 4.117221832275391, + "learning_rate": 6.657636427320491e-05, + "loss": 1.2631, + "num_input_tokens_seen": 76852608, + "step": 4776 + }, + { + "epoch": 0.3346204698486082, + "grad_norm": 3.814997434616089, + "learning_rate": 6.656936602451839e-05, + "loss": 1.0891, + "num_input_tokens_seen": 76868400, + "step": 4777 + }, + { + "epoch": 0.3346905180943375, + "grad_norm": 3.6070499420166016, + "learning_rate": 6.656236777583187e-05, + "loss": 0.8537, + "num_input_tokens_seen": 76884784, + "step": 4778 + }, + { + "epoch": 0.33476056634006673, + "grad_norm": 6.291281700134277, + "learning_rate": 6.655536952714536e-05, + "loss": 1.2195, + "num_input_tokens_seen": 76901168, + "step": 4779 + }, + { + "epoch": 0.334830614585796, + "grad_norm": 7.043301105499268, + "learning_rate": 6.654837127845885e-05, + "loss": 1.1015, + "num_input_tokens_seen": 76917552, + "step": 4780 + }, + { + "epoch": 0.3349006628315252, + "grad_norm": 3.6702778339385986, + "learning_rate": 6.654137302977234e-05, + "loss": 1.0832, + "num_input_tokens_seen": 76933936, + "step": 4781 + }, + { + "epoch": 0.33497071107725446, + "grad_norm": 4.228512287139893, + "learning_rate": 6.653437478108583e-05, + "loss": 0.8781, + "num_input_tokens_seen": 76950320, + "step": 4782 + }, + { + "epoch": 0.3350407593229837, + "grad_norm": 3.9304075241088867, + "learning_rate": 6.65273765323993e-05, + "loss": 1.1066, + "num_input_tokens_seen": 76966704, + "step": 4783 + }, + { + "epoch": 0.335110807568713, + "grad_norm": 3.608708620071411, + "learning_rate": 6.652037828371278e-05, + "loss": 1.0409, + "num_input_tokens_seen": 76983016, + "step": 4784 + }, + { + "epoch": 0.3351808558144422, + "grad_norm": 4.402626037597656, + "learning_rate": 6.651338003502627e-05, + "loss": 1.1832, + "num_input_tokens_seen": 76999344, + "step": 4785 + }, + { + "epoch": 0.33525090406017144, + "grad_norm": 4.109679222106934, + "learning_rate": 6.650638178633976e-05, + "loss": 1.0003, + "num_input_tokens_seen": 77014960, + "step": 4786 + }, + { + "epoch": 0.3353209523059007, + "grad_norm": 3.893702507019043, + "learning_rate": 6.649938353765324e-05, + "loss": 1.1101, + "num_input_tokens_seen": 77031344, + "step": 4787 + }, + { + "epoch": 0.33539100055162996, + "grad_norm": 4.326907157897949, + "learning_rate": 6.649238528896673e-05, + "loss": 1.0554, + "num_input_tokens_seen": 77047264, + "step": 4788 + }, + { + "epoch": 0.33546104879735916, + "grad_norm": 4.946060657501221, + "learning_rate": 6.648538704028022e-05, + "loss": 1.0413, + "num_input_tokens_seen": 77063648, + "step": 4789 + }, + { + "epoch": 0.3355310970430884, + "grad_norm": 3.5379018783569336, + "learning_rate": 6.64783887915937e-05, + "loss": 1.0981, + "num_input_tokens_seen": 77080032, + "step": 4790 + }, + { + "epoch": 0.3356011452888177, + "grad_norm": 4.117929935455322, + "learning_rate": 6.647139054290717e-05, + "loss": 1.0624, + "num_input_tokens_seen": 77096416, + "step": 4791 + }, + { + "epoch": 0.33567119353454694, + "grad_norm": 4.293130397796631, + "learning_rate": 6.646439229422067e-05, + "loss": 1.1938, + "num_input_tokens_seen": 77112800, + "step": 4792 + }, + { + "epoch": 0.33574124178027615, + "grad_norm": 3.8246893882751465, + "learning_rate": 6.645739404553416e-05, + "loss": 1.0944, + "num_input_tokens_seen": 77128272, + "step": 4793 + }, + { + "epoch": 0.3358112900260054, + "grad_norm": 4.095324993133545, + "learning_rate": 6.645039579684764e-05, + "loss": 1.0024, + "num_input_tokens_seen": 77144008, + "step": 4794 + }, + { + "epoch": 0.33588133827173466, + "grad_norm": 3.7015397548675537, + "learning_rate": 6.644339754816113e-05, + "loss": 1.1318, + "num_input_tokens_seen": 77160392, + "step": 4795 + }, + { + "epoch": 0.3359513865174639, + "grad_norm": 3.7702248096466064, + "learning_rate": 6.643639929947461e-05, + "loss": 1.1815, + "num_input_tokens_seen": 77176776, + "step": 4796 + }, + { + "epoch": 0.3360214347631931, + "grad_norm": 6.468194961547852, + "learning_rate": 6.642940105078809e-05, + "loss": 1.1144, + "num_input_tokens_seen": 77192000, + "step": 4797 + }, + { + "epoch": 0.3360914830089224, + "grad_norm": 5.211976528167725, + "learning_rate": 6.642240280210158e-05, + "loss": 1.0252, + "num_input_tokens_seen": 77208040, + "step": 4798 + }, + { + "epoch": 0.33616153125465165, + "grad_norm": 4.3227763175964355, + "learning_rate": 6.641540455341507e-05, + "loss": 1.1343, + "num_input_tokens_seen": 77224016, + "step": 4799 + }, + { + "epoch": 0.3362315795003809, + "grad_norm": 3.6128039360046387, + "learning_rate": 6.640840630472856e-05, + "loss": 1.0222, + "num_input_tokens_seen": 77240400, + "step": 4800 + }, + { + "epoch": 0.3362315795003809, + "eval_loss": 1.1314613819122314, + "eval_runtime": 0.2053, + "eval_samples_per_second": 4.87, + "eval_steps_per_second": 4.87, + "num_input_tokens_seen": 77240400, + "step": 4800 + }, + { + "epoch": 0.3363016277461101, + "grad_norm": 3.851407766342163, + "learning_rate": 6.640140805604203e-05, + "loss": 1.0153, + "num_input_tokens_seen": 77255896, + "step": 4801 + }, + { + "epoch": 0.33637167599183937, + "grad_norm": 3.8287763595581055, + "learning_rate": 6.639440980735552e-05, + "loss": 1.02, + "num_input_tokens_seen": 77272280, + "step": 4802 + }, + { + "epoch": 0.33644172423756863, + "grad_norm": 4.373470306396484, + "learning_rate": 6.638741155866901e-05, + "loss": 1.0321, + "num_input_tokens_seen": 77288664, + "step": 4803 + }, + { + "epoch": 0.3365117724832979, + "grad_norm": 5.384084224700928, + "learning_rate": 6.638041330998248e-05, + "loss": 1.08, + "num_input_tokens_seen": 77304544, + "step": 4804 + }, + { + "epoch": 0.3365818207290271, + "grad_norm": 4.742502212524414, + "learning_rate": 6.637341506129597e-05, + "loss": 1.0856, + "num_input_tokens_seen": 77320928, + "step": 4805 + }, + { + "epoch": 0.33665186897475635, + "grad_norm": 3.5294950008392334, + "learning_rate": 6.636641681260946e-05, + "loss": 0.827, + "num_input_tokens_seen": 77337072, + "step": 4806 + }, + { + "epoch": 0.3367219172204856, + "grad_norm": 4.925806999206543, + "learning_rate": 6.635941856392295e-05, + "loss": 1.1351, + "num_input_tokens_seen": 77352312, + "step": 4807 + }, + { + "epoch": 0.33679196546621487, + "grad_norm": 4.373791694641113, + "learning_rate": 6.635242031523644e-05, + "loss": 1.055, + "num_input_tokens_seen": 77368696, + "step": 4808 + }, + { + "epoch": 0.3368620137119441, + "grad_norm": 4.921911239624023, + "learning_rate": 6.634542206654993e-05, + "loss": 1.0054, + "num_input_tokens_seen": 77384976, + "step": 4809 + }, + { + "epoch": 0.33693206195767333, + "grad_norm": 6.203757286071777, + "learning_rate": 6.63384238178634e-05, + "loss": 1.1801, + "num_input_tokens_seen": 77401360, + "step": 4810 + }, + { + "epoch": 0.3370021102034026, + "grad_norm": 3.675086498260498, + "learning_rate": 6.633142556917688e-05, + "loss": 1.1148, + "num_input_tokens_seen": 77417744, + "step": 4811 + }, + { + "epoch": 0.33707215844913185, + "grad_norm": 5.169121742248535, + "learning_rate": 6.632442732049036e-05, + "loss": 0.9541, + "num_input_tokens_seen": 77434128, + "step": 4812 + }, + { + "epoch": 0.3371422066948611, + "grad_norm": 4.036499977111816, + "learning_rate": 6.631742907180387e-05, + "loss": 0.9914, + "num_input_tokens_seen": 77449856, + "step": 4813 + }, + { + "epoch": 0.3372122549405903, + "grad_norm": 4.040637016296387, + "learning_rate": 6.631043082311734e-05, + "loss": 1.0704, + "num_input_tokens_seen": 77465536, + "step": 4814 + }, + { + "epoch": 0.3372823031863196, + "grad_norm": 5.76871395111084, + "learning_rate": 6.630343257443083e-05, + "loss": 1.0127, + "num_input_tokens_seen": 77481920, + "step": 4815 + }, + { + "epoch": 0.33735235143204884, + "grad_norm": 5.222348690032959, + "learning_rate": 6.629643432574432e-05, + "loss": 0.9411, + "num_input_tokens_seen": 77497464, + "step": 4816 + }, + { + "epoch": 0.3374223996777781, + "grad_norm": 4.099587440490723, + "learning_rate": 6.62894360770578e-05, + "loss": 1.011, + "num_input_tokens_seen": 77513848, + "step": 4817 + }, + { + "epoch": 0.3374924479235073, + "grad_norm": 4.034639835357666, + "learning_rate": 6.628243782837127e-05, + "loss": 1.0626, + "num_input_tokens_seen": 77530000, + "step": 4818 + }, + { + "epoch": 0.33756249616923656, + "grad_norm": 5.280242443084717, + "learning_rate": 6.627543957968477e-05, + "loss": 1.1305, + "num_input_tokens_seen": 77546384, + "step": 4819 + }, + { + "epoch": 0.3376325444149658, + "grad_norm": 4.851918697357178, + "learning_rate": 6.626844133099826e-05, + "loss": 1.0557, + "num_input_tokens_seen": 77561000, + "step": 4820 + }, + { + "epoch": 0.3377025926606951, + "grad_norm": 3.957601308822632, + "learning_rate": 6.626144308231173e-05, + "loss": 0.9625, + "num_input_tokens_seen": 77576512, + "step": 4821 + }, + { + "epoch": 0.3377726409064243, + "grad_norm": 3.9369540214538574, + "learning_rate": 6.625444483362522e-05, + "loss": 1.0611, + "num_input_tokens_seen": 77592896, + "step": 4822 + }, + { + "epoch": 0.33784268915215354, + "grad_norm": 4.397778511047363, + "learning_rate": 6.624744658493871e-05, + "loss": 1.0173, + "num_input_tokens_seen": 77609280, + "step": 4823 + }, + { + "epoch": 0.3379127373978828, + "grad_norm": 6.919220447540283, + "learning_rate": 6.624044833625219e-05, + "loss": 0.9992, + "num_input_tokens_seen": 77625664, + "step": 4824 + }, + { + "epoch": 0.33798278564361206, + "grad_norm": 5.501309871673584, + "learning_rate": 6.623345008756568e-05, + "loss": 1.0321, + "num_input_tokens_seen": 77642048, + "step": 4825 + }, + { + "epoch": 0.33805283388934126, + "grad_norm": 4.240433216094971, + "learning_rate": 6.622645183887916e-05, + "loss": 0.8771, + "num_input_tokens_seen": 77658432, + "step": 4826 + }, + { + "epoch": 0.3381228821350705, + "grad_norm": 3.9563584327697754, + "learning_rate": 6.621945359019265e-05, + "loss": 1.1921, + "num_input_tokens_seen": 77674816, + "step": 4827 + }, + { + "epoch": 0.3381929303807998, + "grad_norm": 6.861433982849121, + "learning_rate": 6.621245534150613e-05, + "loss": 0.9414, + "num_input_tokens_seen": 77689712, + "step": 4828 + }, + { + "epoch": 0.33826297862652904, + "grad_norm": 3.951972246170044, + "learning_rate": 6.620545709281962e-05, + "loss": 1.2363, + "num_input_tokens_seen": 77706096, + "step": 4829 + }, + { + "epoch": 0.33833302687225825, + "grad_norm": 4.419849395751953, + "learning_rate": 6.61984588441331e-05, + "loss": 1.1375, + "num_input_tokens_seen": 77721168, + "step": 4830 + }, + { + "epoch": 0.3384030751179875, + "grad_norm": 5.075031280517578, + "learning_rate": 6.619146059544658e-05, + "loss": 1.1363, + "num_input_tokens_seen": 77737552, + "step": 4831 + }, + { + "epoch": 0.33847312336371677, + "grad_norm": 4.216047763824463, + "learning_rate": 6.618446234676007e-05, + "loss": 1.0683, + "num_input_tokens_seen": 77753936, + "step": 4832 + }, + { + "epoch": 0.338543171609446, + "grad_norm": 4.175511360168457, + "learning_rate": 6.617746409807357e-05, + "loss": 1.1294, + "num_input_tokens_seen": 77769848, + "step": 4833 + }, + { + "epoch": 0.33861321985517523, + "grad_norm": 3.894831418991089, + "learning_rate": 6.617046584938705e-05, + "loss": 0.9527, + "num_input_tokens_seen": 77785792, + "step": 4834 + }, + { + "epoch": 0.3386832681009045, + "grad_norm": 4.06626033782959, + "learning_rate": 6.616346760070053e-05, + "loss": 1.039, + "num_input_tokens_seen": 77801728, + "step": 4835 + }, + { + "epoch": 0.33875331634663375, + "grad_norm": 6.5680341720581055, + "learning_rate": 6.615646935201402e-05, + "loss": 1.2627, + "num_input_tokens_seen": 77818112, + "step": 4836 + }, + { + "epoch": 0.338823364592363, + "grad_norm": 4.967332363128662, + "learning_rate": 6.61494711033275e-05, + "loss": 1.1455, + "num_input_tokens_seen": 77833464, + "step": 4837 + }, + { + "epoch": 0.3388934128380922, + "grad_norm": 4.244988918304443, + "learning_rate": 6.614247285464097e-05, + "loss": 1.0809, + "num_input_tokens_seen": 77849848, + "step": 4838 + }, + { + "epoch": 0.33896346108382147, + "grad_norm": 4.361011028289795, + "learning_rate": 6.613547460595447e-05, + "loss": 1.0217, + "num_input_tokens_seen": 77866232, + "step": 4839 + }, + { + "epoch": 0.33903350932955073, + "grad_norm": 6.348353385925293, + "learning_rate": 6.612847635726796e-05, + "loss": 1.1202, + "num_input_tokens_seen": 77882616, + "step": 4840 + }, + { + "epoch": 0.33910355757528, + "grad_norm": 3.8235714435577393, + "learning_rate": 6.612147810858144e-05, + "loss": 0.9018, + "num_input_tokens_seen": 77899000, + "step": 4841 + }, + { + "epoch": 0.3391736058210092, + "grad_norm": 4.069207191467285, + "learning_rate": 6.611447985989493e-05, + "loss": 1.1303, + "num_input_tokens_seen": 77915384, + "step": 4842 + }, + { + "epoch": 0.33924365406673845, + "grad_norm": 3.8036270141601562, + "learning_rate": 6.610748161120842e-05, + "loss": 1.1935, + "num_input_tokens_seen": 77931768, + "step": 4843 + }, + { + "epoch": 0.3393137023124677, + "grad_norm": 5.776700019836426, + "learning_rate": 6.610048336252189e-05, + "loss": 1.0031, + "num_input_tokens_seen": 77948152, + "step": 4844 + }, + { + "epoch": 0.339383750558197, + "grad_norm": 5.484714508056641, + "learning_rate": 6.609348511383538e-05, + "loss": 1.2233, + "num_input_tokens_seen": 77964536, + "step": 4845 + }, + { + "epoch": 0.3394537988039262, + "grad_norm": 4.595640659332275, + "learning_rate": 6.608648686514887e-05, + "loss": 1.0854, + "num_input_tokens_seen": 77980648, + "step": 4846 + }, + { + "epoch": 0.33952384704965544, + "grad_norm": 5.0377197265625, + "learning_rate": 6.607948861646236e-05, + "loss": 1.0513, + "num_input_tokens_seen": 77995624, + "step": 4847 + }, + { + "epoch": 0.3395938952953847, + "grad_norm": 3.796713352203369, + "learning_rate": 6.607249036777583e-05, + "loss": 0.8928, + "num_input_tokens_seen": 78012008, + "step": 4848 + }, + { + "epoch": 0.33966394354111396, + "grad_norm": 5.138030052185059, + "learning_rate": 6.606549211908932e-05, + "loss": 0.9565, + "num_input_tokens_seen": 78027960, + "step": 4849 + }, + { + "epoch": 0.3397339917868432, + "grad_norm": 5.852467060089111, + "learning_rate": 6.605849387040281e-05, + "loss": 1.1297, + "num_input_tokens_seen": 78044144, + "step": 4850 + }, + { + "epoch": 0.3398040400325724, + "grad_norm": 3.5677835941314697, + "learning_rate": 6.605149562171628e-05, + "loss": 1.0115, + "num_input_tokens_seen": 78059096, + "step": 4851 + }, + { + "epoch": 0.3398740882783017, + "grad_norm": 4.033452987670898, + "learning_rate": 6.604449737302977e-05, + "loss": 1.1311, + "num_input_tokens_seen": 78075480, + "step": 4852 + }, + { + "epoch": 0.33994413652403094, + "grad_norm": 5.06736421585083, + "learning_rate": 6.603749912434327e-05, + "loss": 1.0456, + "num_input_tokens_seen": 78091864, + "step": 4853 + }, + { + "epoch": 0.3400141847697602, + "grad_norm": 4.851357460021973, + "learning_rate": 6.603050087565675e-05, + "loss": 0.9985, + "num_input_tokens_seen": 78108248, + "step": 4854 + }, + { + "epoch": 0.3400842330154894, + "grad_norm": 4.659403324127197, + "learning_rate": 6.602350262697022e-05, + "loss": 1.0618, + "num_input_tokens_seen": 78123720, + "step": 4855 + }, + { + "epoch": 0.34015428126121866, + "grad_norm": 4.0248870849609375, + "learning_rate": 6.601650437828371e-05, + "loss": 1.0192, + "num_input_tokens_seen": 78140104, + "step": 4856 + }, + { + "epoch": 0.3402243295069479, + "grad_norm": 3.615807056427002, + "learning_rate": 6.60095061295972e-05, + "loss": 0.9875, + "num_input_tokens_seen": 78155768, + "step": 4857 + }, + { + "epoch": 0.3402943777526772, + "grad_norm": 4.032024383544922, + "learning_rate": 6.600250788091068e-05, + "loss": 0.8621, + "num_input_tokens_seen": 78171688, + "step": 4858 + }, + { + "epoch": 0.3403644259984064, + "grad_norm": 4.213406085968018, + "learning_rate": 6.599550963222418e-05, + "loss": 1.1186, + "num_input_tokens_seen": 78188072, + "step": 4859 + }, + { + "epoch": 0.34043447424413564, + "grad_norm": 4.343294620513916, + "learning_rate": 6.598851138353767e-05, + "loss": 0.9555, + "num_input_tokens_seen": 78203328, + "step": 4860 + }, + { + "epoch": 0.3405045224898649, + "grad_norm": 5.112723350524902, + "learning_rate": 6.598151313485114e-05, + "loss": 0.8201, + "num_input_tokens_seen": 78218480, + "step": 4861 + }, + { + "epoch": 0.34057457073559416, + "grad_norm": 5.0619215965271, + "learning_rate": 6.597451488616463e-05, + "loss": 1.1147, + "num_input_tokens_seen": 78234864, + "step": 4862 + }, + { + "epoch": 0.34064461898132337, + "grad_norm": 4.161584377288818, + "learning_rate": 6.596751663747812e-05, + "loss": 1.3292, + "num_input_tokens_seen": 78250664, + "step": 4863 + }, + { + "epoch": 0.3407146672270526, + "grad_norm": 4.402634143829346, + "learning_rate": 6.59605183887916e-05, + "loss": 1.2664, + "num_input_tokens_seen": 78266976, + "step": 4864 + }, + { + "epoch": 0.3407847154727819, + "grad_norm": 4.01839017868042, + "learning_rate": 6.595352014010508e-05, + "loss": 1.1515, + "num_input_tokens_seen": 78283360, + "step": 4865 + }, + { + "epoch": 0.34085476371851114, + "grad_norm": 3.6157965660095215, + "learning_rate": 6.594652189141857e-05, + "loss": 0.8962, + "num_input_tokens_seen": 78299744, + "step": 4866 + }, + { + "epoch": 0.34092481196424035, + "grad_norm": 4.221523761749268, + "learning_rate": 6.593952364273206e-05, + "loss": 1.3689, + "num_input_tokens_seen": 78314944, + "step": 4867 + }, + { + "epoch": 0.3409948602099696, + "grad_norm": 5.253129482269287, + "learning_rate": 6.593252539404554e-05, + "loss": 1.0223, + "num_input_tokens_seen": 78331168, + "step": 4868 + }, + { + "epoch": 0.34106490845569887, + "grad_norm": 4.839991569519043, + "learning_rate": 6.592552714535902e-05, + "loss": 1.1622, + "num_input_tokens_seen": 78347200, + "step": 4869 + }, + { + "epoch": 0.3411349567014281, + "grad_norm": 5.994297504425049, + "learning_rate": 6.591852889667251e-05, + "loss": 1.0511, + "num_input_tokens_seen": 78363584, + "step": 4870 + }, + { + "epoch": 0.34120500494715733, + "grad_norm": 4.886160850524902, + "learning_rate": 6.591153064798599e-05, + "loss": 1.0025, + "num_input_tokens_seen": 78379968, + "step": 4871 + }, + { + "epoch": 0.3412750531928866, + "grad_norm": 8.658349990844727, + "learning_rate": 6.590453239929948e-05, + "loss": 1.0145, + "num_input_tokens_seen": 78395368, + "step": 4872 + }, + { + "epoch": 0.34134510143861585, + "grad_norm": 5.1440935134887695, + "learning_rate": 6.589753415061298e-05, + "loss": 0.9584, + "num_input_tokens_seen": 78411752, + "step": 4873 + }, + { + "epoch": 0.3414151496843451, + "grad_norm": 4.83282995223999, + "learning_rate": 6.589053590192645e-05, + "loss": 1.1825, + "num_input_tokens_seen": 78428128, + "step": 4874 + }, + { + "epoch": 0.3414851979300743, + "grad_norm": 3.603290557861328, + "learning_rate": 6.588353765323993e-05, + "loss": 1.081, + "num_input_tokens_seen": 78444512, + "step": 4875 + }, + { + "epoch": 0.3415552461758036, + "grad_norm": 3.8035361766815186, + "learning_rate": 6.587653940455342e-05, + "loss": 1.1571, + "num_input_tokens_seen": 78460896, + "step": 4876 + }, + { + "epoch": 0.34162529442153283, + "grad_norm": 4.02992582321167, + "learning_rate": 6.58695411558669e-05, + "loss": 1.0974, + "num_input_tokens_seen": 78477280, + "step": 4877 + }, + { + "epoch": 0.3416953426672621, + "grad_norm": 4.898126125335693, + "learning_rate": 6.586254290718038e-05, + "loss": 1.0632, + "num_input_tokens_seen": 78493664, + "step": 4878 + }, + { + "epoch": 0.3417653909129913, + "grad_norm": 4.779463768005371, + "learning_rate": 6.585554465849388e-05, + "loss": 1.0473, + "num_input_tokens_seen": 78509280, + "step": 4879 + }, + { + "epoch": 0.34183543915872056, + "grad_norm": 3.7280569076538086, + "learning_rate": 6.584854640980737e-05, + "loss": 1.1184, + "num_input_tokens_seen": 78525664, + "step": 4880 + }, + { + "epoch": 0.3419054874044498, + "grad_norm": 4.691235542297363, + "learning_rate": 6.584154816112085e-05, + "loss": 0.9956, + "num_input_tokens_seen": 78542048, + "step": 4881 + }, + { + "epoch": 0.3419755356501791, + "grad_norm": 4.188792705535889, + "learning_rate": 6.583454991243432e-05, + "loss": 1.1065, + "num_input_tokens_seen": 78558432, + "step": 4882 + }, + { + "epoch": 0.34204558389590833, + "grad_norm": 3.7049522399902344, + "learning_rate": 6.582755166374781e-05, + "loss": 0.9286, + "num_input_tokens_seen": 78574816, + "step": 4883 + }, + { + "epoch": 0.34211563214163754, + "grad_norm": 5.808310508728027, + "learning_rate": 6.58205534150613e-05, + "loss": 1.0674, + "num_input_tokens_seen": 78590992, + "step": 4884 + }, + { + "epoch": 0.3421856803873668, + "grad_norm": 3.877638339996338, + "learning_rate": 6.581355516637479e-05, + "loss": 0.9908, + "num_input_tokens_seen": 78607368, + "step": 4885 + }, + { + "epoch": 0.34225572863309606, + "grad_norm": 3.7855000495910645, + "learning_rate": 6.580655691768828e-05, + "loss": 1.0697, + "num_input_tokens_seen": 78622712, + "step": 4886 + }, + { + "epoch": 0.3423257768788253, + "grad_norm": 3.9921584129333496, + "learning_rate": 6.579955866900176e-05, + "loss": 1.0196, + "num_input_tokens_seen": 78638840, + "step": 4887 + }, + { + "epoch": 0.3423958251245545, + "grad_norm": 4.037683486938477, + "learning_rate": 6.579256042031524e-05, + "loss": 0.9606, + "num_input_tokens_seen": 78655224, + "step": 4888 + }, + { + "epoch": 0.3424658733702838, + "grad_norm": 4.109930515289307, + "learning_rate": 6.578556217162873e-05, + "loss": 1.1189, + "num_input_tokens_seen": 78670984, + "step": 4889 + }, + { + "epoch": 0.34253592161601304, + "grad_norm": 5.201082229614258, + "learning_rate": 6.577856392294222e-05, + "loss": 1.0844, + "num_input_tokens_seen": 78686856, + "step": 4890 + }, + { + "epoch": 0.3426059698617423, + "grad_norm": 5.922754764556885, + "learning_rate": 6.577156567425569e-05, + "loss": 1.2428, + "num_input_tokens_seen": 78702688, + "step": 4891 + }, + { + "epoch": 0.3426760181074715, + "grad_norm": 4.052786350250244, + "learning_rate": 6.576456742556918e-05, + "loss": 1.0765, + "num_input_tokens_seen": 78719072, + "step": 4892 + }, + { + "epoch": 0.34274606635320076, + "grad_norm": 4.0263671875, + "learning_rate": 6.575756917688267e-05, + "loss": 1.2076, + "num_input_tokens_seen": 78735456, + "step": 4893 + }, + { + "epoch": 0.34281611459893, + "grad_norm": 3.773024082183838, + "learning_rate": 6.575057092819616e-05, + "loss": 1.1275, + "num_input_tokens_seen": 78751424, + "step": 4894 + }, + { + "epoch": 0.3428861628446593, + "grad_norm": 3.770413398742676, + "learning_rate": 6.574357267950963e-05, + "loss": 1.1331, + "num_input_tokens_seen": 78767808, + "step": 4895 + }, + { + "epoch": 0.3429562110903885, + "grad_norm": 6.26648473739624, + "learning_rate": 6.573657443082312e-05, + "loss": 1.1432, + "num_input_tokens_seen": 78783448, + "step": 4896 + }, + { + "epoch": 0.34302625933611774, + "grad_norm": 4.071943283081055, + "learning_rate": 6.572957618213661e-05, + "loss": 0.9008, + "num_input_tokens_seen": 78798976, + "step": 4897 + }, + { + "epoch": 0.343096307581847, + "grad_norm": 7.654726505279541, + "learning_rate": 6.572257793345008e-05, + "loss": 1.0902, + "num_input_tokens_seen": 78814664, + "step": 4898 + }, + { + "epoch": 0.34316635582757626, + "grad_norm": 5.928562641143799, + "learning_rate": 6.571557968476357e-05, + "loss": 1.0462, + "num_input_tokens_seen": 78830792, + "step": 4899 + }, + { + "epoch": 0.34323640407330547, + "grad_norm": 3.8699424266815186, + "learning_rate": 6.570858143607708e-05, + "loss": 1.0568, + "num_input_tokens_seen": 78847176, + "step": 4900 + }, + { + "epoch": 0.3433064523190347, + "grad_norm": 4.177735328674316, + "learning_rate": 6.570158318739055e-05, + "loss": 1.1381, + "num_input_tokens_seen": 78863496, + "step": 4901 + }, + { + "epoch": 0.343376500564764, + "grad_norm": 3.5755650997161865, + "learning_rate": 6.569458493870403e-05, + "loss": 1.0307, + "num_input_tokens_seen": 78879688, + "step": 4902 + }, + { + "epoch": 0.34344654881049325, + "grad_norm": 5.799609184265137, + "learning_rate": 6.568758669001751e-05, + "loss": 0.9005, + "num_input_tokens_seen": 78894744, + "step": 4903 + }, + { + "epoch": 0.34351659705622245, + "grad_norm": 3.7705209255218506, + "learning_rate": 6.5680588441331e-05, + "loss": 1.0557, + "num_input_tokens_seen": 78911112, + "step": 4904 + }, + { + "epoch": 0.3435866453019517, + "grad_norm": 4.713012218475342, + "learning_rate": 6.567359019264449e-05, + "loss": 1.1005, + "num_input_tokens_seen": 78927496, + "step": 4905 + }, + { + "epoch": 0.34365669354768097, + "grad_norm": 3.8360157012939453, + "learning_rate": 6.566659194395798e-05, + "loss": 1.1281, + "num_input_tokens_seen": 78942712, + "step": 4906 + }, + { + "epoch": 0.34372674179341023, + "grad_norm": 3.6071383953094482, + "learning_rate": 6.565959369527147e-05, + "loss": 0.974, + "num_input_tokens_seen": 78959016, + "step": 4907 + }, + { + "epoch": 0.34379679003913943, + "grad_norm": 4.876083850860596, + "learning_rate": 6.565259544658494e-05, + "loss": 1.1583, + "num_input_tokens_seen": 78975400, + "step": 4908 + }, + { + "epoch": 0.3438668382848687, + "grad_norm": 4.011876583099365, + "learning_rate": 6.564559719789842e-05, + "loss": 1.0749, + "num_input_tokens_seen": 78991784, + "step": 4909 + }, + { + "epoch": 0.34393688653059795, + "grad_norm": 3.74336576461792, + "learning_rate": 6.563859894921191e-05, + "loss": 1.0358, + "num_input_tokens_seen": 79008168, + "step": 4910 + }, + { + "epoch": 0.3440069347763272, + "grad_norm": 4.092207908630371, + "learning_rate": 6.56316007005254e-05, + "loss": 0.9901, + "num_input_tokens_seen": 79024200, + "step": 4911 + }, + { + "epoch": 0.3440769830220564, + "grad_norm": 3.771979331970215, + "learning_rate": 6.562460245183888e-05, + "loss": 0.9599, + "num_input_tokens_seen": 79040584, + "step": 4912 + }, + { + "epoch": 0.3441470312677857, + "grad_norm": 4.791725158691406, + "learning_rate": 6.561760420315237e-05, + "loss": 1.0563, + "num_input_tokens_seen": 79056552, + "step": 4913 + }, + { + "epoch": 0.34421707951351493, + "grad_norm": 4.9150519371032715, + "learning_rate": 6.561060595446586e-05, + "loss": 0.9623, + "num_input_tokens_seen": 79072536, + "step": 4914 + }, + { + "epoch": 0.3442871277592442, + "grad_norm": 4.550070285797119, + "learning_rate": 6.560360770577934e-05, + "loss": 1.07, + "num_input_tokens_seen": 79088376, + "step": 4915 + }, + { + "epoch": 0.34435717600497345, + "grad_norm": 4.497488975524902, + "learning_rate": 6.559660945709283e-05, + "loss": 1.0779, + "num_input_tokens_seen": 79104760, + "step": 4916 + }, + { + "epoch": 0.34442722425070266, + "grad_norm": 4.417470455169678, + "learning_rate": 6.558961120840631e-05, + "loss": 1.0471, + "num_input_tokens_seen": 79120296, + "step": 4917 + }, + { + "epoch": 0.3444972724964319, + "grad_norm": 4.967655658721924, + "learning_rate": 6.558261295971979e-05, + "loss": 0.9294, + "num_input_tokens_seen": 79135936, + "step": 4918 + }, + { + "epoch": 0.3445673207421612, + "grad_norm": 4.973440647125244, + "learning_rate": 6.557561471103328e-05, + "loss": 1.1045, + "num_input_tokens_seen": 79151632, + "step": 4919 + }, + { + "epoch": 0.34463736898789044, + "grad_norm": 5.2282609939575195, + "learning_rate": 6.556861646234677e-05, + "loss": 1.2508, + "num_input_tokens_seen": 79167112, + "step": 4920 + }, + { + "epoch": 0.34470741723361964, + "grad_norm": 4.118466854095459, + "learning_rate": 6.556161821366025e-05, + "loss": 1.0162, + "num_input_tokens_seen": 79183496, + "step": 4921 + }, + { + "epoch": 0.3447774654793489, + "grad_norm": 4.74249267578125, + "learning_rate": 6.555461996497373e-05, + "loss": 1.1383, + "num_input_tokens_seen": 79199240, + "step": 4922 + }, + { + "epoch": 0.34484751372507816, + "grad_norm": 4.21056604385376, + "learning_rate": 6.554762171628722e-05, + "loss": 1.2693, + "num_input_tokens_seen": 79215560, + "step": 4923 + }, + { + "epoch": 0.3449175619708074, + "grad_norm": 3.584332227706909, + "learning_rate": 6.55406234676007e-05, + "loss": 1.0466, + "num_input_tokens_seen": 79231944, + "step": 4924 + }, + { + "epoch": 0.3449876102165366, + "grad_norm": 7.287233829498291, + "learning_rate": 6.55336252189142e-05, + "loss": 1.0568, + "num_input_tokens_seen": 79248328, + "step": 4925 + }, + { + "epoch": 0.3450576584622659, + "grad_norm": 6.5669379234313965, + "learning_rate": 6.552662697022767e-05, + "loss": 0.8538, + "num_input_tokens_seen": 79264712, + "step": 4926 + }, + { + "epoch": 0.34512770670799514, + "grad_norm": 4.086475849151611, + "learning_rate": 6.551962872154117e-05, + "loss": 1.082, + "num_input_tokens_seen": 79281096, + "step": 4927 + }, + { + "epoch": 0.3451977549537244, + "grad_norm": 5.543658256530762, + "learning_rate": 6.551263047285465e-05, + "loss": 0.9835, + "num_input_tokens_seen": 79297120, + "step": 4928 + }, + { + "epoch": 0.3452678031994536, + "grad_norm": 6.474762439727783, + "learning_rate": 6.550563222416812e-05, + "loss": 1.2022, + "num_input_tokens_seen": 79313504, + "step": 4929 + }, + { + "epoch": 0.34533785144518286, + "grad_norm": 3.8226888179779053, + "learning_rate": 6.549863397548161e-05, + "loss": 0.9796, + "num_input_tokens_seen": 79329888, + "step": 4930 + }, + { + "epoch": 0.3454078996909121, + "grad_norm": 3.8926212787628174, + "learning_rate": 6.54916357267951e-05, + "loss": 1.0837, + "num_input_tokens_seen": 79346272, + "step": 4931 + }, + { + "epoch": 0.3454779479366414, + "grad_norm": 4.127487659454346, + "learning_rate": 6.548463747810859e-05, + "loss": 1.1942, + "num_input_tokens_seen": 79362656, + "step": 4932 + }, + { + "epoch": 0.3455479961823706, + "grad_norm": 6.770711421966553, + "learning_rate": 6.547763922942208e-05, + "loss": 0.9898, + "num_input_tokens_seen": 79378544, + "step": 4933 + }, + { + "epoch": 0.34561804442809985, + "grad_norm": 5.547317028045654, + "learning_rate": 6.547064098073557e-05, + "loss": 1.0748, + "num_input_tokens_seen": 79394896, + "step": 4934 + }, + { + "epoch": 0.3456880926738291, + "grad_norm": 4.469418048858643, + "learning_rate": 6.546364273204904e-05, + "loss": 1.1633, + "num_input_tokens_seen": 79410480, + "step": 4935 + }, + { + "epoch": 0.34575814091955837, + "grad_norm": 4.901472091674805, + "learning_rate": 6.545664448336252e-05, + "loss": 1.0252, + "num_input_tokens_seen": 79426864, + "step": 4936 + }, + { + "epoch": 0.34582818916528757, + "grad_norm": 3.60495662689209, + "learning_rate": 6.5449646234676e-05, + "loss": 1.007, + "num_input_tokens_seen": 79443248, + "step": 4937 + }, + { + "epoch": 0.34589823741101683, + "grad_norm": 4.513663291931152, + "learning_rate": 6.544264798598949e-05, + "loss": 1.2239, + "num_input_tokens_seen": 79459632, + "step": 4938 + }, + { + "epoch": 0.3459682856567461, + "grad_norm": 3.6959240436553955, + "learning_rate": 6.543564973730298e-05, + "loss": 1.0561, + "num_input_tokens_seen": 79475320, + "step": 4939 + }, + { + "epoch": 0.34603833390247535, + "grad_norm": 4.071475505828857, + "learning_rate": 6.542865148861647e-05, + "loss": 1.1963, + "num_input_tokens_seen": 79491704, + "step": 4940 + }, + { + "epoch": 0.34610838214820455, + "grad_norm": 3.665421962738037, + "learning_rate": 6.542165323992996e-05, + "loss": 0.9609, + "num_input_tokens_seen": 79508088, + "step": 4941 + }, + { + "epoch": 0.3461784303939338, + "grad_norm": 4.1782941818237305, + "learning_rate": 6.541465499124343e-05, + "loss": 0.85, + "num_input_tokens_seen": 79523936, + "step": 4942 + }, + { + "epoch": 0.34624847863966307, + "grad_norm": 4.728964328765869, + "learning_rate": 6.540765674255691e-05, + "loss": 1.0283, + "num_input_tokens_seen": 79539848, + "step": 4943 + }, + { + "epoch": 0.34631852688539233, + "grad_norm": 5.39119815826416, + "learning_rate": 6.540065849387041e-05, + "loss": 1.185, + "num_input_tokens_seen": 79555040, + "step": 4944 + }, + { + "epoch": 0.34638857513112153, + "grad_norm": 3.8394956588745117, + "learning_rate": 6.53936602451839e-05, + "loss": 0.7774, + "num_input_tokens_seen": 79570504, + "step": 4945 + }, + { + "epoch": 0.3464586233768508, + "grad_norm": 5.03010368347168, + "learning_rate": 6.538666199649737e-05, + "loss": 0.8746, + "num_input_tokens_seen": 79586888, + "step": 4946 + }, + { + "epoch": 0.34652867162258005, + "grad_norm": 3.984548807144165, + "learning_rate": 6.537966374781086e-05, + "loss": 1.0893, + "num_input_tokens_seen": 79603128, + "step": 4947 + }, + { + "epoch": 0.3465987198683093, + "grad_norm": 5.096433162689209, + "learning_rate": 6.537266549912435e-05, + "loss": 1.0547, + "num_input_tokens_seen": 79618624, + "step": 4948 + }, + { + "epoch": 0.3466687681140385, + "grad_norm": 3.6773791313171387, + "learning_rate": 6.536566725043783e-05, + "loss": 0.985, + "num_input_tokens_seen": 79635008, + "step": 4949 + }, + { + "epoch": 0.3467388163597678, + "grad_norm": 4.050341606140137, + "learning_rate": 6.535866900175132e-05, + "loss": 0.9229, + "num_input_tokens_seen": 79651392, + "step": 4950 + }, + { + "epoch": 0.34680886460549704, + "grad_norm": 3.8354263305664062, + "learning_rate": 6.53516707530648e-05, + "loss": 1.0264, + "num_input_tokens_seen": 79667040, + "step": 4951 + }, + { + "epoch": 0.3468789128512263, + "grad_norm": 4.2188873291015625, + "learning_rate": 6.534467250437829e-05, + "loss": 1.0297, + "num_input_tokens_seen": 79683152, + "step": 4952 + }, + { + "epoch": 0.34694896109695555, + "grad_norm": 4.75797700881958, + "learning_rate": 6.533767425569177e-05, + "loss": 1.2475, + "num_input_tokens_seen": 79699536, + "step": 4953 + }, + { + "epoch": 0.34701900934268476, + "grad_norm": 3.494459867477417, + "learning_rate": 6.533067600700527e-05, + "loss": 0.9534, + "num_input_tokens_seen": 79715920, + "step": 4954 + }, + { + "epoch": 0.347089057588414, + "grad_norm": 3.860872268676758, + "learning_rate": 6.532367775831874e-05, + "loss": 1.081, + "num_input_tokens_seen": 79731832, + "step": 4955 + }, + { + "epoch": 0.3471591058341433, + "grad_norm": 4.188973426818848, + "learning_rate": 6.531667950963222e-05, + "loss": 1.0814, + "num_input_tokens_seen": 79747592, + "step": 4956 + }, + { + "epoch": 0.34722915407987254, + "grad_norm": 5.598564624786377, + "learning_rate": 6.530968126094571e-05, + "loss": 1.0699, + "num_input_tokens_seen": 79763048, + "step": 4957 + }, + { + "epoch": 0.34729920232560174, + "grad_norm": 4.153980255126953, + "learning_rate": 6.53026830122592e-05, + "loss": 1.1726, + "num_input_tokens_seen": 79777928, + "step": 4958 + }, + { + "epoch": 0.347369250571331, + "grad_norm": 3.875469446182251, + "learning_rate": 6.529568476357269e-05, + "loss": 1.1449, + "num_input_tokens_seen": 79794312, + "step": 4959 + }, + { + "epoch": 0.34743929881706026, + "grad_norm": 5.391599655151367, + "learning_rate": 6.528868651488617e-05, + "loss": 1.1748, + "num_input_tokens_seen": 79810696, + "step": 4960 + }, + { + "epoch": 0.3475093470627895, + "grad_norm": 3.3462777137756348, + "learning_rate": 6.528168826619966e-05, + "loss": 0.8645, + "num_input_tokens_seen": 79826208, + "step": 4961 + }, + { + "epoch": 0.3475793953085187, + "grad_norm": 3.5444939136505127, + "learning_rate": 6.527469001751314e-05, + "loss": 1.0989, + "num_input_tokens_seen": 79842592, + "step": 4962 + }, + { + "epoch": 0.347649443554248, + "grad_norm": 4.541754722595215, + "learning_rate": 6.526769176882661e-05, + "loss": 0.9, + "num_input_tokens_seen": 79858976, + "step": 4963 + }, + { + "epoch": 0.34771949179997724, + "grad_norm": 3.728207588195801, + "learning_rate": 6.52606935201401e-05, + "loss": 1.0493, + "num_input_tokens_seen": 79874944, + "step": 4964 + }, + { + "epoch": 0.3477895400457065, + "grad_norm": 5.615260601043701, + "learning_rate": 6.525369527145359e-05, + "loss": 1.0588, + "num_input_tokens_seen": 79890968, + "step": 4965 + }, + { + "epoch": 0.3478595882914357, + "grad_norm": 4.863505840301514, + "learning_rate": 6.524669702276708e-05, + "loss": 0.9896, + "num_input_tokens_seen": 79907352, + "step": 4966 + }, + { + "epoch": 0.34792963653716497, + "grad_norm": 3.6932058334350586, + "learning_rate": 6.523969877408057e-05, + "loss": 0.9675, + "num_input_tokens_seen": 79923736, + "step": 4967 + }, + { + "epoch": 0.3479996847828942, + "grad_norm": 4.483904838562012, + "learning_rate": 6.523270052539406e-05, + "loss": 1.222, + "num_input_tokens_seen": 79939360, + "step": 4968 + }, + { + "epoch": 0.3480697330286235, + "grad_norm": 3.540771007537842, + "learning_rate": 6.522570227670753e-05, + "loss": 0.9759, + "num_input_tokens_seen": 79955744, + "step": 4969 + }, + { + "epoch": 0.3481397812743527, + "grad_norm": 3.980483293533325, + "learning_rate": 6.5218704028021e-05, + "loss": 1.1637, + "num_input_tokens_seen": 79971368, + "step": 4970 + }, + { + "epoch": 0.34820982952008195, + "grad_norm": 5.302091598510742, + "learning_rate": 6.521170577933451e-05, + "loss": 1.0568, + "num_input_tokens_seen": 79986688, + "step": 4971 + }, + { + "epoch": 0.3482798777658112, + "grad_norm": 4.176638603210449, + "learning_rate": 6.5204707530648e-05, + "loss": 1.1928, + "num_input_tokens_seen": 80003072, + "step": 4972 + }, + { + "epoch": 0.34834992601154047, + "grad_norm": 5.939540386199951, + "learning_rate": 6.519770928196147e-05, + "loss": 1.0465, + "num_input_tokens_seen": 80019344, + "step": 4973 + }, + { + "epoch": 0.34841997425726967, + "grad_norm": 4.681301593780518, + "learning_rate": 6.519071103327496e-05, + "loss": 1.1121, + "num_input_tokens_seen": 80034504, + "step": 4974 + }, + { + "epoch": 0.34849002250299893, + "grad_norm": 4.993075847625732, + "learning_rate": 6.518371278458845e-05, + "loss": 0.8792, + "num_input_tokens_seen": 80050488, + "step": 4975 + }, + { + "epoch": 0.3485600707487282, + "grad_norm": 3.87778377532959, + "learning_rate": 6.517671453590192e-05, + "loss": 0.9458, + "num_input_tokens_seen": 80066872, + "step": 4976 + }, + { + "epoch": 0.34863011899445745, + "grad_norm": 3.652738332748413, + "learning_rate": 6.516971628721541e-05, + "loss": 0.9912, + "num_input_tokens_seen": 80083232, + "step": 4977 + }, + { + "epoch": 0.34870016724018665, + "grad_norm": 3.9958438873291016, + "learning_rate": 6.51627180385289e-05, + "loss": 0.8653, + "num_input_tokens_seen": 80099616, + "step": 4978 + }, + { + "epoch": 0.3487702154859159, + "grad_norm": 4.190839767456055, + "learning_rate": 6.515571978984239e-05, + "loss": 1.2081, + "num_input_tokens_seen": 80116000, + "step": 4979 + }, + { + "epoch": 0.3488402637316452, + "grad_norm": 4.848324298858643, + "learning_rate": 6.514872154115586e-05, + "loss": 1.197, + "num_input_tokens_seen": 80132384, + "step": 4980 + }, + { + "epoch": 0.34891031197737443, + "grad_norm": 4.863750457763672, + "learning_rate": 6.514172329246937e-05, + "loss": 1.1181, + "num_input_tokens_seen": 80148768, + "step": 4981 + }, + { + "epoch": 0.34898036022310364, + "grad_norm": 4.555769443511963, + "learning_rate": 6.513472504378284e-05, + "loss": 0.9769, + "num_input_tokens_seen": 80164984, + "step": 4982 + }, + { + "epoch": 0.3490504084688329, + "grad_norm": 5.041413307189941, + "learning_rate": 6.512772679509632e-05, + "loss": 1.0183, + "num_input_tokens_seen": 80181336, + "step": 4983 + }, + { + "epoch": 0.34912045671456216, + "grad_norm": 4.58367395401001, + "learning_rate": 6.51207285464098e-05, + "loss": 1.232, + "num_input_tokens_seen": 80197720, + "step": 4984 + }, + { + "epoch": 0.3491905049602914, + "grad_norm": 3.9667036533355713, + "learning_rate": 6.51137302977233e-05, + "loss": 1.1363, + "num_input_tokens_seen": 80212776, + "step": 4985 + }, + { + "epoch": 0.3492605532060207, + "grad_norm": 3.474071979522705, + "learning_rate": 6.510673204903678e-05, + "loss": 0.8978, + "num_input_tokens_seen": 80229160, + "step": 4986 + }, + { + "epoch": 0.3493306014517499, + "grad_norm": 3.912496328353882, + "learning_rate": 6.509973380035027e-05, + "loss": 0.9695, + "num_input_tokens_seen": 80245544, + "step": 4987 + }, + { + "epoch": 0.34940064969747914, + "grad_norm": 3.760340690612793, + "learning_rate": 6.509273555166376e-05, + "loss": 0.97, + "num_input_tokens_seen": 80261400, + "step": 4988 + }, + { + "epoch": 0.3494706979432084, + "grad_norm": 4.982266426086426, + "learning_rate": 6.508573730297723e-05, + "loss": 1.008, + "num_input_tokens_seen": 80277784, + "step": 4989 + }, + { + "epoch": 0.34954074618893766, + "grad_norm": 4.6823530197143555, + "learning_rate": 6.507873905429071e-05, + "loss": 1.3118, + "num_input_tokens_seen": 80294168, + "step": 4990 + }, + { + "epoch": 0.34961079443466686, + "grad_norm": 3.768439769744873, + "learning_rate": 6.50717408056042e-05, + "loss": 0.91, + "num_input_tokens_seen": 80310552, + "step": 4991 + }, + { + "epoch": 0.3496808426803961, + "grad_norm": 3.5285451412200928, + "learning_rate": 6.50647425569177e-05, + "loss": 0.8937, + "num_input_tokens_seen": 80326464, + "step": 4992 + }, + { + "epoch": 0.3497508909261254, + "grad_norm": 3.875992774963379, + "learning_rate": 6.505774430823118e-05, + "loss": 0.9514, + "num_input_tokens_seen": 80342848, + "step": 4993 + }, + { + "epoch": 0.34982093917185464, + "grad_norm": 4.061910152435303, + "learning_rate": 6.505074605954466e-05, + "loss": 0.9607, + "num_input_tokens_seen": 80359232, + "step": 4994 + }, + { + "epoch": 0.34989098741758384, + "grad_norm": 4.456427097320557, + "learning_rate": 6.504374781085815e-05, + "loss": 1.1927, + "num_input_tokens_seen": 80375616, + "step": 4995 + }, + { + "epoch": 0.3499610356633131, + "grad_norm": 4.381276607513428, + "learning_rate": 6.503674956217163e-05, + "loss": 1.0291, + "num_input_tokens_seen": 80392000, + "step": 4996 + }, + { + "epoch": 0.35003108390904236, + "grad_norm": 6.789033889770508, + "learning_rate": 6.50297513134851e-05, + "loss": 0.9971, + "num_input_tokens_seen": 80407360, + "step": 4997 + }, + { + "epoch": 0.3501011321547716, + "grad_norm": 3.953124761581421, + "learning_rate": 6.50227530647986e-05, + "loss": 1.1585, + "num_input_tokens_seen": 80423744, + "step": 4998 + }, + { + "epoch": 0.3501711804005008, + "grad_norm": 3.498389482498169, + "learning_rate": 6.50157548161121e-05, + "loss": 0.9259, + "num_input_tokens_seen": 80440128, + "step": 4999 + }, + { + "epoch": 0.3502412286462301, + "grad_norm": 5.498814582824707, + "learning_rate": 6.500875656742557e-05, + "loss": 0.9867, + "num_input_tokens_seen": 80456512, + "step": 5000 + }, + { + "epoch": 0.3502412286462301, + "eval_loss": 1.1277527809143066, + "eval_runtime": 0.1909, + "eval_samples_per_second": 5.238, + "eval_steps_per_second": 5.238, + "num_input_tokens_seen": 80456512, + "step": 5000 + }, + { + "epoch": 0.35031127689195934, + "grad_norm": 3.440230131149292, + "learning_rate": 6.500175831873906e-05, + "loss": 0.8354, + "num_input_tokens_seen": 80472456, + "step": 5001 + }, + { + "epoch": 0.3503813251376886, + "grad_norm": 5.069565296173096, + "learning_rate": 6.499476007005255e-05, + "loss": 1.1994, + "num_input_tokens_seen": 80488840, + "step": 5002 + }, + { + "epoch": 0.3504513733834178, + "grad_norm": 4.53994607925415, + "learning_rate": 6.498776182136602e-05, + "loss": 1.0962, + "num_input_tokens_seen": 80504984, + "step": 5003 + }, + { + "epoch": 0.35052142162914707, + "grad_norm": 4.136146068572998, + "learning_rate": 6.498076357267951e-05, + "loss": 0.9885, + "num_input_tokens_seen": 80520448, + "step": 5004 + }, + { + "epoch": 0.3505914698748763, + "grad_norm": 5.609417915344238, + "learning_rate": 6.4973765323993e-05, + "loss": 1.0242, + "num_input_tokens_seen": 80536496, + "step": 5005 + }, + { + "epoch": 0.3506615181206056, + "grad_norm": 4.375439643859863, + "learning_rate": 6.496676707530649e-05, + "loss": 0.9937, + "num_input_tokens_seen": 80551592, + "step": 5006 + }, + { + "epoch": 0.3507315663663348, + "grad_norm": 3.5269775390625, + "learning_rate": 6.495976882661996e-05, + "loss": 0.9995, + "num_input_tokens_seen": 80567976, + "step": 5007 + }, + { + "epoch": 0.35080161461206405, + "grad_norm": 3.9541778564453125, + "learning_rate": 6.495277057793346e-05, + "loss": 1.1451, + "num_input_tokens_seen": 80584360, + "step": 5008 + }, + { + "epoch": 0.3508716628577933, + "grad_norm": 5.544612407684326, + "learning_rate": 6.494577232924694e-05, + "loss": 1.3493, + "num_input_tokens_seen": 80599856, + "step": 5009 + }, + { + "epoch": 0.35094171110352257, + "grad_norm": 4.189836502075195, + "learning_rate": 6.493877408056041e-05, + "loss": 1.2096, + "num_input_tokens_seen": 80615392, + "step": 5010 + }, + { + "epoch": 0.3510117593492518, + "grad_norm": 4.8789825439453125, + "learning_rate": 6.49317758318739e-05, + "loss": 1.0665, + "num_input_tokens_seen": 80631776, + "step": 5011 + }, + { + "epoch": 0.35108180759498103, + "grad_norm": 4.271617412567139, + "learning_rate": 6.49247775831874e-05, + "loss": 0.9655, + "num_input_tokens_seen": 80648160, + "step": 5012 + }, + { + "epoch": 0.3511518558407103, + "grad_norm": 4.656182765960693, + "learning_rate": 6.491777933450088e-05, + "loss": 0.9566, + "num_input_tokens_seen": 80664424, + "step": 5013 + }, + { + "epoch": 0.35122190408643955, + "grad_norm": 6.627303600311279, + "learning_rate": 6.491078108581437e-05, + "loss": 1.2156, + "num_input_tokens_seen": 80680128, + "step": 5014 + }, + { + "epoch": 0.35129195233216876, + "grad_norm": 3.6189517974853516, + "learning_rate": 6.490378283712786e-05, + "loss": 1.0828, + "num_input_tokens_seen": 80695848, + "step": 5015 + }, + { + "epoch": 0.351362000577898, + "grad_norm": 3.58449387550354, + "learning_rate": 6.489678458844133e-05, + "loss": 1.0578, + "num_input_tokens_seen": 80712232, + "step": 5016 + }, + { + "epoch": 0.3514320488236273, + "grad_norm": 4.014143466949463, + "learning_rate": 6.488978633975481e-05, + "loss": 1.1271, + "num_input_tokens_seen": 80726480, + "step": 5017 + }, + { + "epoch": 0.35150209706935653, + "grad_norm": 4.461588382720947, + "learning_rate": 6.488278809106831e-05, + "loss": 1.1175, + "num_input_tokens_seen": 80742776, + "step": 5018 + }, + { + "epoch": 0.35157214531508574, + "grad_norm": 4.534054279327393, + "learning_rate": 6.48757898423818e-05, + "loss": 1.1009, + "num_input_tokens_seen": 80758024, + "step": 5019 + }, + { + "epoch": 0.351642193560815, + "grad_norm": 3.502699613571167, + "learning_rate": 6.486879159369527e-05, + "loss": 1.0564, + "num_input_tokens_seen": 80774152, + "step": 5020 + }, + { + "epoch": 0.35171224180654426, + "grad_norm": 4.463150978088379, + "learning_rate": 6.486179334500876e-05, + "loss": 0.9945, + "num_input_tokens_seen": 80790528, + "step": 5021 + }, + { + "epoch": 0.3517822900522735, + "grad_norm": 4.1127543449401855, + "learning_rate": 6.485479509632225e-05, + "loss": 0.9813, + "num_input_tokens_seen": 80805400, + "step": 5022 + }, + { + "epoch": 0.3518523382980028, + "grad_norm": 3.6113109588623047, + "learning_rate": 6.484779684763572e-05, + "loss": 1.1071, + "num_input_tokens_seen": 80821584, + "step": 5023 + }, + { + "epoch": 0.351922386543732, + "grad_norm": 4.167325019836426, + "learning_rate": 6.484079859894921e-05, + "loss": 1.0636, + "num_input_tokens_seen": 80837968, + "step": 5024 + }, + { + "epoch": 0.35199243478946124, + "grad_norm": 3.9422924518585205, + "learning_rate": 6.48338003502627e-05, + "loss": 1.0665, + "num_input_tokens_seen": 80854352, + "step": 5025 + }, + { + "epoch": 0.3520624830351905, + "grad_norm": 4.867110729217529, + "learning_rate": 6.482680210157619e-05, + "loss": 0.9098, + "num_input_tokens_seen": 80870648, + "step": 5026 + }, + { + "epoch": 0.35213253128091976, + "grad_norm": 4.714593887329102, + "learning_rate": 6.481980385288967e-05, + "loss": 1.0256, + "num_input_tokens_seen": 80886704, + "step": 5027 + }, + { + "epoch": 0.35220257952664896, + "grad_norm": 3.8926947116851807, + "learning_rate": 6.481280560420315e-05, + "loss": 0.9577, + "num_input_tokens_seen": 80902184, + "step": 5028 + }, + { + "epoch": 0.3522726277723782, + "grad_norm": 4.510727405548096, + "learning_rate": 6.480580735551664e-05, + "loss": 1.1543, + "num_input_tokens_seen": 80917960, + "step": 5029 + }, + { + "epoch": 0.3523426760181075, + "grad_norm": 3.6175239086151123, + "learning_rate": 6.479880910683012e-05, + "loss": 1.0692, + "num_input_tokens_seen": 80934344, + "step": 5030 + }, + { + "epoch": 0.35241272426383674, + "grad_norm": 4.112790584564209, + "learning_rate": 6.47918108581436e-05, + "loss": 1.1518, + "num_input_tokens_seen": 80950336, + "step": 5031 + }, + { + "epoch": 0.35248277250956594, + "grad_norm": 4.372056007385254, + "learning_rate": 6.478481260945711e-05, + "loss": 1.0732, + "num_input_tokens_seen": 80966272, + "step": 5032 + }, + { + "epoch": 0.3525528207552952, + "grad_norm": 5.2401204109191895, + "learning_rate": 6.477781436077058e-05, + "loss": 1.0378, + "num_input_tokens_seen": 80981568, + "step": 5033 + }, + { + "epoch": 0.35262286900102446, + "grad_norm": 4.032891273498535, + "learning_rate": 6.477081611208406e-05, + "loss": 1.0788, + "num_input_tokens_seen": 80997384, + "step": 5034 + }, + { + "epoch": 0.3526929172467537, + "grad_norm": 5.448423385620117, + "learning_rate": 6.476381786339756e-05, + "loss": 1.2136, + "num_input_tokens_seen": 81013768, + "step": 5035 + }, + { + "epoch": 0.3527629654924829, + "grad_norm": 3.5669469833374023, + "learning_rate": 6.475681961471104e-05, + "loss": 1.0039, + "num_input_tokens_seen": 81030152, + "step": 5036 + }, + { + "epoch": 0.3528330137382122, + "grad_norm": 3.4767303466796875, + "learning_rate": 6.474982136602451e-05, + "loss": 0.9563, + "num_input_tokens_seen": 81046536, + "step": 5037 + }, + { + "epoch": 0.35290306198394145, + "grad_norm": 4.859378814697266, + "learning_rate": 6.474282311733801e-05, + "loss": 1.2855, + "num_input_tokens_seen": 81062528, + "step": 5038 + }, + { + "epoch": 0.3529731102296707, + "grad_norm": 5.003366470336914, + "learning_rate": 6.47358248686515e-05, + "loss": 1.1317, + "num_input_tokens_seen": 81078912, + "step": 5039 + }, + { + "epoch": 0.3530431584753999, + "grad_norm": 3.9362549781799316, + "learning_rate": 6.472882661996498e-05, + "loss": 1.2051, + "num_input_tokens_seen": 81095296, + "step": 5040 + }, + { + "epoch": 0.35311320672112917, + "grad_norm": 3.319826364517212, + "learning_rate": 6.472182837127847e-05, + "loss": 0.9632, + "num_input_tokens_seen": 81111640, + "step": 5041 + }, + { + "epoch": 0.35318325496685843, + "grad_norm": 3.5816714763641357, + "learning_rate": 6.471483012259195e-05, + "loss": 0.9576, + "num_input_tokens_seen": 81128024, + "step": 5042 + }, + { + "epoch": 0.3532533032125877, + "grad_norm": 4.352350234985352, + "learning_rate": 6.470783187390543e-05, + "loss": 1.1754, + "num_input_tokens_seen": 81143992, + "step": 5043 + }, + { + "epoch": 0.3533233514583169, + "grad_norm": 3.4122314453125, + "learning_rate": 6.470083362521892e-05, + "loss": 1.104, + "num_input_tokens_seen": 81160376, + "step": 5044 + }, + { + "epoch": 0.35339339970404615, + "grad_norm": 4.0952324867248535, + "learning_rate": 6.46938353765324e-05, + "loss": 0.9727, + "num_input_tokens_seen": 81175968, + "step": 5045 + }, + { + "epoch": 0.3534634479497754, + "grad_norm": 3.9099533557891846, + "learning_rate": 6.46868371278459e-05, + "loss": 1.0624, + "num_input_tokens_seen": 81192352, + "step": 5046 + }, + { + "epoch": 0.35353349619550467, + "grad_norm": 6.379274845123291, + "learning_rate": 6.467983887915937e-05, + "loss": 1.0069, + "num_input_tokens_seen": 81208648, + "step": 5047 + }, + { + "epoch": 0.3536035444412339, + "grad_norm": 3.9650473594665527, + "learning_rate": 6.467284063047286e-05, + "loss": 1.0727, + "num_input_tokens_seen": 81224472, + "step": 5048 + }, + { + "epoch": 0.35367359268696313, + "grad_norm": 3.7729573249816895, + "learning_rate": 6.466584238178635e-05, + "loss": 1.0097, + "num_input_tokens_seen": 81240232, + "step": 5049 + }, + { + "epoch": 0.3537436409326924, + "grad_norm": 4.012545585632324, + "learning_rate": 6.465884413309982e-05, + "loss": 1.0527, + "num_input_tokens_seen": 81256616, + "step": 5050 + }, + { + "epoch": 0.35381368917842165, + "grad_norm": 3.679382801055908, + "learning_rate": 6.465184588441331e-05, + "loss": 1.0033, + "num_input_tokens_seen": 81272888, + "step": 5051 + }, + { + "epoch": 0.35388373742415086, + "grad_norm": 3.897606134414673, + "learning_rate": 6.464484763572681e-05, + "loss": 0.9513, + "num_input_tokens_seen": 81289272, + "step": 5052 + }, + { + "epoch": 0.3539537856698801, + "grad_norm": 4.988255023956299, + "learning_rate": 6.463784938704029e-05, + "loss": 0.8484, + "num_input_tokens_seen": 81305656, + "step": 5053 + }, + { + "epoch": 0.3540238339156094, + "grad_norm": 4.226601600646973, + "learning_rate": 6.463085113835376e-05, + "loss": 1.0048, + "num_input_tokens_seen": 81320912, + "step": 5054 + }, + { + "epoch": 0.35409388216133864, + "grad_norm": 4.0905070304870605, + "learning_rate": 6.462385288966725e-05, + "loss": 1.2044, + "num_input_tokens_seen": 81337296, + "step": 5055 + }, + { + "epoch": 0.3541639304070679, + "grad_norm": 4.470916748046875, + "learning_rate": 6.461685464098074e-05, + "loss": 1.1198, + "num_input_tokens_seen": 81353680, + "step": 5056 + }, + { + "epoch": 0.3542339786527971, + "grad_norm": 3.8264098167419434, + "learning_rate": 6.460985639229421e-05, + "loss": 0.8444, + "num_input_tokens_seen": 81370064, + "step": 5057 + }, + { + "epoch": 0.35430402689852636, + "grad_norm": 5.07196569442749, + "learning_rate": 6.460285814360772e-05, + "loss": 0.9035, + "num_input_tokens_seen": 81386368, + "step": 5058 + }, + { + "epoch": 0.3543740751442556, + "grad_norm": 4.830010414123535, + "learning_rate": 6.45958598949212e-05, + "loss": 1.0685, + "num_input_tokens_seen": 81402752, + "step": 5059 + }, + { + "epoch": 0.3544441233899849, + "grad_norm": 3.5972540378570557, + "learning_rate": 6.458886164623468e-05, + "loss": 0.9466, + "num_input_tokens_seen": 81418856, + "step": 5060 + }, + { + "epoch": 0.3545141716357141, + "grad_norm": 4.840418815612793, + "learning_rate": 6.458186339754816e-05, + "loss": 1.0174, + "num_input_tokens_seen": 81434344, + "step": 5061 + }, + { + "epoch": 0.35458421988144334, + "grad_norm": 4.891697883605957, + "learning_rate": 6.457486514886166e-05, + "loss": 1.0537, + "num_input_tokens_seen": 81450280, + "step": 5062 + }, + { + "epoch": 0.3546542681271726, + "grad_norm": 3.7236123085021973, + "learning_rate": 6.456786690017513e-05, + "loss": 1.0524, + "num_input_tokens_seen": 81466664, + "step": 5063 + }, + { + "epoch": 0.35472431637290186, + "grad_norm": 3.6597838401794434, + "learning_rate": 6.456086865148862e-05, + "loss": 0.9648, + "num_input_tokens_seen": 81483048, + "step": 5064 + }, + { + "epoch": 0.35479436461863106, + "grad_norm": 4.048685073852539, + "learning_rate": 6.455387040280211e-05, + "loss": 1.0033, + "num_input_tokens_seen": 81499080, + "step": 5065 + }, + { + "epoch": 0.3548644128643603, + "grad_norm": 3.683549165725708, + "learning_rate": 6.45468721541156e-05, + "loss": 1.054, + "num_input_tokens_seen": 81515464, + "step": 5066 + }, + { + "epoch": 0.3549344611100896, + "grad_norm": 4.80827522277832, + "learning_rate": 6.453987390542907e-05, + "loss": 1.0664, + "num_input_tokens_seen": 81530672, + "step": 5067 + }, + { + "epoch": 0.35500450935581884, + "grad_norm": 3.6255602836608887, + "learning_rate": 6.453287565674256e-05, + "loss": 1.0027, + "num_input_tokens_seen": 81546976, + "step": 5068 + }, + { + "epoch": 0.35507455760154805, + "grad_norm": 3.430290460586548, + "learning_rate": 6.452587740805605e-05, + "loss": 1.1253, + "num_input_tokens_seen": 81562936, + "step": 5069 + }, + { + "epoch": 0.3551446058472773, + "grad_norm": 5.140942573547363, + "learning_rate": 6.451887915936953e-05, + "loss": 0.9522, + "num_input_tokens_seen": 81579120, + "step": 5070 + }, + { + "epoch": 0.35521465409300657, + "grad_norm": 4.5443115234375, + "learning_rate": 6.451188091068301e-05, + "loss": 1.2141, + "num_input_tokens_seen": 81595504, + "step": 5071 + }, + { + "epoch": 0.3552847023387358, + "grad_norm": 4.33146333694458, + "learning_rate": 6.45048826619965e-05, + "loss": 1.0189, + "num_input_tokens_seen": 81611024, + "step": 5072 + }, + { + "epoch": 0.35535475058446503, + "grad_norm": 4.212037563323975, + "learning_rate": 6.449788441330999e-05, + "loss": 1.2356, + "num_input_tokens_seen": 81627208, + "step": 5073 + }, + { + "epoch": 0.3554247988301943, + "grad_norm": 3.714611053466797, + "learning_rate": 6.449088616462347e-05, + "loss": 0.9699, + "num_input_tokens_seen": 81642744, + "step": 5074 + }, + { + "epoch": 0.35549484707592355, + "grad_norm": 3.985471487045288, + "learning_rate": 6.448388791593696e-05, + "loss": 1.1381, + "num_input_tokens_seen": 81659128, + "step": 5075 + }, + { + "epoch": 0.3555648953216528, + "grad_norm": 4.519073963165283, + "learning_rate": 6.447688966725044e-05, + "loss": 1.1515, + "num_input_tokens_seen": 81675512, + "step": 5076 + }, + { + "epoch": 0.355634943567382, + "grad_norm": 4.546297550201416, + "learning_rate": 6.446989141856392e-05, + "loss": 1.324, + "num_input_tokens_seen": 81691528, + "step": 5077 + }, + { + "epoch": 0.35570499181311127, + "grad_norm": 4.023989200592041, + "learning_rate": 6.446289316987741e-05, + "loss": 1.068, + "num_input_tokens_seen": 81707912, + "step": 5078 + }, + { + "epoch": 0.35577504005884053, + "grad_norm": 4.442357540130615, + "learning_rate": 6.445589492119091e-05, + "loss": 0.9021, + "num_input_tokens_seen": 81724296, + "step": 5079 + }, + { + "epoch": 0.3558450883045698, + "grad_norm": 3.63273286819458, + "learning_rate": 6.444889667250438e-05, + "loss": 0.919, + "num_input_tokens_seen": 81740112, + "step": 5080 + }, + { + "epoch": 0.355915136550299, + "grad_norm": 3.8844716548919678, + "learning_rate": 6.444189842381786e-05, + "loss": 1.1389, + "num_input_tokens_seen": 81756024, + "step": 5081 + }, + { + "epoch": 0.35598518479602825, + "grad_norm": 3.8603484630584717, + "learning_rate": 6.443490017513135e-05, + "loss": 0.8949, + "num_input_tokens_seen": 81772408, + "step": 5082 + }, + { + "epoch": 0.3560552330417575, + "grad_norm": 4.305675029754639, + "learning_rate": 6.442790192644484e-05, + "loss": 1.0133, + "num_input_tokens_seen": 81787992, + "step": 5083 + }, + { + "epoch": 0.3561252812874868, + "grad_norm": 5.944203853607178, + "learning_rate": 6.442090367775833e-05, + "loss": 1.0635, + "num_input_tokens_seen": 81804032, + "step": 5084 + }, + { + "epoch": 0.356195329533216, + "grad_norm": 5.269783020019531, + "learning_rate": 6.441390542907181e-05, + "loss": 1.0697, + "num_input_tokens_seen": 81820416, + "step": 5085 + }, + { + "epoch": 0.35626537777894524, + "grad_norm": 3.775933027267456, + "learning_rate": 6.44069071803853e-05, + "loss": 1.0638, + "num_input_tokens_seen": 81836712, + "step": 5086 + }, + { + "epoch": 0.3563354260246745, + "grad_norm": 4.133227825164795, + "learning_rate": 6.439990893169878e-05, + "loss": 0.9842, + "num_input_tokens_seen": 81853096, + "step": 5087 + }, + { + "epoch": 0.35640547427040375, + "grad_norm": 4.418367862701416, + "learning_rate": 6.439291068301225e-05, + "loss": 1.1836, + "num_input_tokens_seen": 81869480, + "step": 5088 + }, + { + "epoch": 0.356475522516133, + "grad_norm": 3.584392786026001, + "learning_rate": 6.438591243432575e-05, + "loss": 1.0805, + "num_input_tokens_seen": 81885864, + "step": 5089 + }, + { + "epoch": 0.3565455707618622, + "grad_norm": 4.216940402984619, + "learning_rate": 6.437891418563923e-05, + "loss": 0.8602, + "num_input_tokens_seen": 81902248, + "step": 5090 + }, + { + "epoch": 0.3566156190075915, + "grad_norm": 4.383372783660889, + "learning_rate": 6.437191593695272e-05, + "loss": 0.9763, + "num_input_tokens_seen": 81918464, + "step": 5091 + }, + { + "epoch": 0.35668566725332074, + "grad_norm": 4.06666374206543, + "learning_rate": 6.436491768826621e-05, + "loss": 0.9784, + "num_input_tokens_seen": 81934848, + "step": 5092 + }, + { + "epoch": 0.35675571549905, + "grad_norm": 5.485066890716553, + "learning_rate": 6.43579194395797e-05, + "loss": 0.9188, + "num_input_tokens_seen": 81950696, + "step": 5093 + }, + { + "epoch": 0.3568257637447792, + "grad_norm": 6.794841766357422, + "learning_rate": 6.435092119089317e-05, + "loss": 1.1765, + "num_input_tokens_seen": 81967080, + "step": 5094 + }, + { + "epoch": 0.35689581199050846, + "grad_norm": 3.531291961669922, + "learning_rate": 6.434392294220666e-05, + "loss": 0.9904, + "num_input_tokens_seen": 81983464, + "step": 5095 + }, + { + "epoch": 0.3569658602362377, + "grad_norm": 3.694018840789795, + "learning_rate": 6.433692469352015e-05, + "loss": 1.0384, + "num_input_tokens_seen": 81999848, + "step": 5096 + }, + { + "epoch": 0.357035908481967, + "grad_norm": 6.933582305908203, + "learning_rate": 6.432992644483362e-05, + "loss": 1.0262, + "num_input_tokens_seen": 82015304, + "step": 5097 + }, + { + "epoch": 0.3571059567276962, + "grad_norm": 5.904866695404053, + "learning_rate": 6.432292819614711e-05, + "loss": 1.0849, + "num_input_tokens_seen": 82031688, + "step": 5098 + }, + { + "epoch": 0.35717600497342544, + "grad_norm": 4.199756145477295, + "learning_rate": 6.43159299474606e-05, + "loss": 1.1007, + "num_input_tokens_seen": 82047336, + "step": 5099 + }, + { + "epoch": 0.3572460532191547, + "grad_norm": 3.703000783920288, + "learning_rate": 6.430893169877409e-05, + "loss": 0.8503, + "num_input_tokens_seen": 82063720, + "step": 5100 + }, + { + "epoch": 0.35731610146488396, + "grad_norm": 4.844930171966553, + "learning_rate": 6.430193345008756e-05, + "loss": 1.0255, + "num_input_tokens_seen": 82079632, + "step": 5101 + }, + { + "epoch": 0.35738614971061317, + "grad_norm": 3.870488166809082, + "learning_rate": 6.429493520140105e-05, + "loss": 1.0116, + "num_input_tokens_seen": 82094864, + "step": 5102 + }, + { + "epoch": 0.3574561979563424, + "grad_norm": 3.9125707149505615, + "learning_rate": 6.428793695271454e-05, + "loss": 0.9626, + "num_input_tokens_seen": 82111136, + "step": 5103 + }, + { + "epoch": 0.3575262462020717, + "grad_norm": 4.347132205963135, + "learning_rate": 6.428093870402803e-05, + "loss": 0.9538, + "num_input_tokens_seen": 82127064, + "step": 5104 + }, + { + "epoch": 0.35759629444780094, + "grad_norm": 3.739053964614868, + "learning_rate": 6.42739404553415e-05, + "loss": 1.201, + "num_input_tokens_seen": 82143448, + "step": 5105 + }, + { + "epoch": 0.35766634269353015, + "grad_norm": 4.781857967376709, + "learning_rate": 6.4266942206655e-05, + "loss": 1.0323, + "num_input_tokens_seen": 82159832, + "step": 5106 + }, + { + "epoch": 0.3577363909392594, + "grad_norm": 4.3711700439453125, + "learning_rate": 6.425994395796848e-05, + "loss": 1.2023, + "num_input_tokens_seen": 82175944, + "step": 5107 + }, + { + "epoch": 0.35780643918498867, + "grad_norm": 3.6916282176971436, + "learning_rate": 6.425294570928196e-05, + "loss": 0.787, + "num_input_tokens_seen": 82192304, + "step": 5108 + }, + { + "epoch": 0.3578764874307179, + "grad_norm": 4.418915271759033, + "learning_rate": 6.424594746059545e-05, + "loss": 1.0842, + "num_input_tokens_seen": 82208080, + "step": 5109 + }, + { + "epoch": 0.35794653567644713, + "grad_norm": 3.9138340950012207, + "learning_rate": 6.423894921190893e-05, + "loss": 1.0261, + "num_input_tokens_seen": 82224464, + "step": 5110 + }, + { + "epoch": 0.3580165839221764, + "grad_norm": 3.99479079246521, + "learning_rate": 6.423195096322242e-05, + "loss": 1.0562, + "num_input_tokens_seen": 82240664, + "step": 5111 + }, + { + "epoch": 0.35808663216790565, + "grad_norm": 4.260537147521973, + "learning_rate": 6.422495271453591e-05, + "loss": 1.1133, + "num_input_tokens_seen": 82257048, + "step": 5112 + }, + { + "epoch": 0.3581566804136349, + "grad_norm": 3.5181097984313965, + "learning_rate": 6.42179544658494e-05, + "loss": 0.98, + "num_input_tokens_seen": 82273432, + "step": 5113 + }, + { + "epoch": 0.3582267286593641, + "grad_norm": 5.96913480758667, + "learning_rate": 6.421095621716287e-05, + "loss": 0.8867, + "num_input_tokens_seen": 82289816, + "step": 5114 + }, + { + "epoch": 0.3582967769050934, + "grad_norm": 4.628411769866943, + "learning_rate": 6.420395796847635e-05, + "loss": 1.1363, + "num_input_tokens_seen": 82305784, + "step": 5115 + }, + { + "epoch": 0.35836682515082263, + "grad_norm": 3.5981955528259277, + "learning_rate": 6.419695971978985e-05, + "loss": 0.9182, + "num_input_tokens_seen": 82321384, + "step": 5116 + }, + { + "epoch": 0.3584368733965519, + "grad_norm": 4.410891056060791, + "learning_rate": 6.418996147110333e-05, + "loss": 1.1118, + "num_input_tokens_seen": 82336184, + "step": 5117 + }, + { + "epoch": 0.3585069216422811, + "grad_norm": 4.316674709320068, + "learning_rate": 6.418296322241682e-05, + "loss": 1.1604, + "num_input_tokens_seen": 82351520, + "step": 5118 + }, + { + "epoch": 0.35857696988801036, + "grad_norm": 5.662688255310059, + "learning_rate": 6.41759649737303e-05, + "loss": 1.1212, + "num_input_tokens_seen": 82367904, + "step": 5119 + }, + { + "epoch": 0.3586470181337396, + "grad_norm": 4.5336151123046875, + "learning_rate": 6.416896672504379e-05, + "loss": 1.0093, + "num_input_tokens_seen": 82384288, + "step": 5120 + }, + { + "epoch": 0.3587170663794689, + "grad_norm": 6.43854284286499, + "learning_rate": 6.416196847635727e-05, + "loss": 0.9434, + "num_input_tokens_seen": 82400120, + "step": 5121 + }, + { + "epoch": 0.3587871146251981, + "grad_norm": 3.519869089126587, + "learning_rate": 6.415497022767076e-05, + "loss": 0.9704, + "num_input_tokens_seen": 82416504, + "step": 5122 + }, + { + "epoch": 0.35885716287092734, + "grad_norm": 4.426568508148193, + "learning_rate": 6.414797197898425e-05, + "loss": 0.9778, + "num_input_tokens_seen": 82431936, + "step": 5123 + }, + { + "epoch": 0.3589272111166566, + "grad_norm": 10.392409324645996, + "learning_rate": 6.414097373029773e-05, + "loss": 1.0289, + "num_input_tokens_seen": 82447232, + "step": 5124 + }, + { + "epoch": 0.35899725936238586, + "grad_norm": 4.133431434631348, + "learning_rate": 6.413397548161121e-05, + "loss": 1.1998, + "num_input_tokens_seen": 82462648, + "step": 5125 + }, + { + "epoch": 0.3590673076081151, + "grad_norm": 5.43566370010376, + "learning_rate": 6.41269772329247e-05, + "loss": 0.9587, + "num_input_tokens_seen": 82478536, + "step": 5126 + }, + { + "epoch": 0.3591373558538443, + "grad_norm": 4.205079555511475, + "learning_rate": 6.411997898423819e-05, + "loss": 1.1152, + "num_input_tokens_seen": 82494224, + "step": 5127 + }, + { + "epoch": 0.3592074040995736, + "grad_norm": 4.165416240692139, + "learning_rate": 6.411298073555166e-05, + "loss": 1.3017, + "num_input_tokens_seen": 82510608, + "step": 5128 + }, + { + "epoch": 0.35927745234530284, + "grad_norm": 3.7855117321014404, + "learning_rate": 6.410598248686515e-05, + "loss": 0.8362, + "num_input_tokens_seen": 82526992, + "step": 5129 + }, + { + "epoch": 0.3593475005910321, + "grad_norm": 4.406207084655762, + "learning_rate": 6.409898423817864e-05, + "loss": 1.0353, + "num_input_tokens_seen": 82543376, + "step": 5130 + }, + { + "epoch": 0.3594175488367613, + "grad_norm": 4.228625774383545, + "learning_rate": 6.409198598949213e-05, + "loss": 0.9788, + "num_input_tokens_seen": 82559760, + "step": 5131 + }, + { + "epoch": 0.35948759708249056, + "grad_norm": 3.6679983139038086, + "learning_rate": 6.40849877408056e-05, + "loss": 1.072, + "num_input_tokens_seen": 82575552, + "step": 5132 + }, + { + "epoch": 0.3595576453282198, + "grad_norm": 4.011179447174072, + "learning_rate": 6.40779894921191e-05, + "loss": 1.0443, + "num_input_tokens_seen": 82591936, + "step": 5133 + }, + { + "epoch": 0.3596276935739491, + "grad_norm": 4.861363410949707, + "learning_rate": 6.407099124343258e-05, + "loss": 1.1077, + "num_input_tokens_seen": 82608320, + "step": 5134 + }, + { + "epoch": 0.3596977418196783, + "grad_norm": 4.128578186035156, + "learning_rate": 6.406399299474605e-05, + "loss": 1.1903, + "num_input_tokens_seen": 82624704, + "step": 5135 + }, + { + "epoch": 0.35976779006540754, + "grad_norm": 4.036421775817871, + "learning_rate": 6.405699474605954e-05, + "loss": 1.1624, + "num_input_tokens_seen": 82641088, + "step": 5136 + }, + { + "epoch": 0.3598378383111368, + "grad_norm": 4.536168098449707, + "learning_rate": 6.404999649737303e-05, + "loss": 0.9512, + "num_input_tokens_seen": 82657472, + "step": 5137 + }, + { + "epoch": 0.35990788655686606, + "grad_norm": 3.665916681289673, + "learning_rate": 6.404299824868652e-05, + "loss": 1.1718, + "num_input_tokens_seen": 82673856, + "step": 5138 + }, + { + "epoch": 0.35997793480259527, + "grad_norm": 3.798205852508545, + "learning_rate": 6.403600000000001e-05, + "loss": 1.0625, + "num_input_tokens_seen": 82690240, + "step": 5139 + }, + { + "epoch": 0.3600479830483245, + "grad_norm": 3.9616305828094482, + "learning_rate": 6.40290017513135e-05, + "loss": 1.1314, + "num_input_tokens_seen": 82706624, + "step": 5140 + }, + { + "epoch": 0.3601180312940538, + "grad_norm": 4.6059489250183105, + "learning_rate": 6.402200350262697e-05, + "loss": 0.9534, + "num_input_tokens_seen": 82723008, + "step": 5141 + }, + { + "epoch": 0.36018807953978305, + "grad_norm": 4.2935943603515625, + "learning_rate": 6.401500525394045e-05, + "loss": 0.9653, + "num_input_tokens_seen": 82739392, + "step": 5142 + }, + { + "epoch": 0.36025812778551225, + "grad_norm": 4.02174711227417, + "learning_rate": 6.400800700525395e-05, + "loss": 1.2037, + "num_input_tokens_seen": 82755600, + "step": 5143 + }, + { + "epoch": 0.3603281760312415, + "grad_norm": 4.0431599617004395, + "learning_rate": 6.400100875656744e-05, + "loss": 1.0548, + "num_input_tokens_seen": 82771592, + "step": 5144 + }, + { + "epoch": 0.36039822427697077, + "grad_norm": 3.6921310424804688, + "learning_rate": 6.399401050788091e-05, + "loss": 0.8992, + "num_input_tokens_seen": 82787728, + "step": 5145 + }, + { + "epoch": 0.36046827252270003, + "grad_norm": 4.27170991897583, + "learning_rate": 6.39870122591944e-05, + "loss": 1.0908, + "num_input_tokens_seen": 82803152, + "step": 5146 + }, + { + "epoch": 0.36053832076842923, + "grad_norm": 4.670827865600586, + "learning_rate": 6.398001401050789e-05, + "loss": 1.1134, + "num_input_tokens_seen": 82819536, + "step": 5147 + }, + { + "epoch": 0.3606083690141585, + "grad_norm": 3.6219654083251953, + "learning_rate": 6.397301576182136e-05, + "loss": 0.9576, + "num_input_tokens_seen": 82835920, + "step": 5148 + }, + { + "epoch": 0.36067841725988775, + "grad_norm": 3.53466796875, + "learning_rate": 6.396601751313485e-05, + "loss": 0.905, + "num_input_tokens_seen": 82852304, + "step": 5149 + }, + { + "epoch": 0.360748465505617, + "grad_norm": 4.027638912200928, + "learning_rate": 6.395901926444834e-05, + "loss": 1.0661, + "num_input_tokens_seen": 82867816, + "step": 5150 + }, + { + "epoch": 0.3608185137513462, + "grad_norm": 5.701491832733154, + "learning_rate": 6.395202101576183e-05, + "loss": 1.2476, + "num_input_tokens_seen": 82883480, + "step": 5151 + }, + { + "epoch": 0.3608885619970755, + "grad_norm": 4.156428337097168, + "learning_rate": 6.39450227670753e-05, + "loss": 1.1507, + "num_input_tokens_seen": 82899608, + "step": 5152 + }, + { + "epoch": 0.36095861024280473, + "grad_norm": 5.278023719787598, + "learning_rate": 6.39380245183888e-05, + "loss": 1.0583, + "num_input_tokens_seen": 82915656, + "step": 5153 + }, + { + "epoch": 0.361028658488534, + "grad_norm": 3.6892948150634766, + "learning_rate": 6.393102626970228e-05, + "loss": 1.0063, + "num_input_tokens_seen": 82931632, + "step": 5154 + }, + { + "epoch": 0.3610987067342632, + "grad_norm": 5.179676055908203, + "learning_rate": 6.392402802101576e-05, + "loss": 1.1701, + "num_input_tokens_seen": 82947344, + "step": 5155 + }, + { + "epoch": 0.36116875497999246, + "grad_norm": 4.948189735412598, + "learning_rate": 6.391702977232925e-05, + "loss": 1.056, + "num_input_tokens_seen": 82963720, + "step": 5156 + }, + { + "epoch": 0.3612388032257217, + "grad_norm": 4.465184688568115, + "learning_rate": 6.391003152364274e-05, + "loss": 1.225, + "num_input_tokens_seen": 82980048, + "step": 5157 + }, + { + "epoch": 0.361308851471451, + "grad_norm": 4.053642749786377, + "learning_rate": 6.390303327495622e-05, + "loss": 1.1481, + "num_input_tokens_seen": 82996432, + "step": 5158 + }, + { + "epoch": 0.36137889971718024, + "grad_norm": 8.422308921813965, + "learning_rate": 6.38960350262697e-05, + "loss": 1.241, + "num_input_tokens_seen": 83012560, + "step": 5159 + }, + { + "epoch": 0.36144894796290944, + "grad_norm": 3.4304730892181396, + "learning_rate": 6.38890367775832e-05, + "loss": 1.1008, + "num_input_tokens_seen": 83028680, + "step": 5160 + }, + { + "epoch": 0.3615189962086387, + "grad_norm": 9.87295913696289, + "learning_rate": 6.388203852889668e-05, + "loss": 1.0512, + "num_input_tokens_seen": 83045064, + "step": 5161 + }, + { + "epoch": 0.36158904445436796, + "grad_norm": 3.7000608444213867, + "learning_rate": 6.387504028021015e-05, + "loss": 1.0758, + "num_input_tokens_seen": 83061448, + "step": 5162 + }, + { + "epoch": 0.3616590927000972, + "grad_norm": 3.5490283966064453, + "learning_rate": 6.386804203152364e-05, + "loss": 0.9705, + "num_input_tokens_seen": 83077176, + "step": 5163 + }, + { + "epoch": 0.3617291409458264, + "grad_norm": 3.850770950317383, + "learning_rate": 6.386104378283714e-05, + "loss": 1.0371, + "num_input_tokens_seen": 83093560, + "step": 5164 + }, + { + "epoch": 0.3617991891915557, + "grad_norm": 5.09017276763916, + "learning_rate": 6.385404553415062e-05, + "loss": 1.0084, + "num_input_tokens_seen": 83109752, + "step": 5165 + }, + { + "epoch": 0.36186923743728494, + "grad_norm": 4.801665782928467, + "learning_rate": 6.38470472854641e-05, + "loss": 1.0909, + "num_input_tokens_seen": 83125048, + "step": 5166 + }, + { + "epoch": 0.3619392856830142, + "grad_norm": 3.954345941543579, + "learning_rate": 6.38400490367776e-05, + "loss": 0.9775, + "num_input_tokens_seen": 83140808, + "step": 5167 + }, + { + "epoch": 0.3620093339287434, + "grad_norm": 4.874080657958984, + "learning_rate": 6.383305078809107e-05, + "loss": 1.1408, + "num_input_tokens_seen": 83157176, + "step": 5168 + }, + { + "epoch": 0.36207938217447266, + "grad_norm": 4.3997111320495605, + "learning_rate": 6.382605253940454e-05, + "loss": 1.1489, + "num_input_tokens_seen": 83173560, + "step": 5169 + }, + { + "epoch": 0.3621494304202019, + "grad_norm": 4.431540489196777, + "learning_rate": 6.381905429071805e-05, + "loss": 1.1138, + "num_input_tokens_seen": 83189864, + "step": 5170 + }, + { + "epoch": 0.3622194786659312, + "grad_norm": 4.48107385635376, + "learning_rate": 6.381205604203153e-05, + "loss": 1.2451, + "num_input_tokens_seen": 83205560, + "step": 5171 + }, + { + "epoch": 0.3622895269116604, + "grad_norm": 4.369350910186768, + "learning_rate": 6.380505779334501e-05, + "loss": 1.0877, + "num_input_tokens_seen": 83221544, + "step": 5172 + }, + { + "epoch": 0.36235957515738965, + "grad_norm": 3.8510024547576904, + "learning_rate": 6.37980595446585e-05, + "loss": 0.8895, + "num_input_tokens_seen": 83237928, + "step": 5173 + }, + { + "epoch": 0.3624296234031189, + "grad_norm": 3.7452402114868164, + "learning_rate": 6.379106129597199e-05, + "loss": 1.1425, + "num_input_tokens_seen": 83254168, + "step": 5174 + }, + { + "epoch": 0.36249967164884817, + "grad_norm": 4.53076171875, + "learning_rate": 6.378406304728546e-05, + "loss": 1.1516, + "num_input_tokens_seen": 83269568, + "step": 5175 + }, + { + "epoch": 0.36256971989457737, + "grad_norm": 3.729602813720703, + "learning_rate": 6.377706479859895e-05, + "loss": 1.2105, + "num_input_tokens_seen": 83285952, + "step": 5176 + }, + { + "epoch": 0.36263976814030663, + "grad_norm": 4.085333824157715, + "learning_rate": 6.377006654991244e-05, + "loss": 1.0517, + "num_input_tokens_seen": 83302200, + "step": 5177 + }, + { + "epoch": 0.3627098163860359, + "grad_norm": 3.9202303886413574, + "learning_rate": 6.376306830122593e-05, + "loss": 1.0358, + "num_input_tokens_seen": 83318584, + "step": 5178 + }, + { + "epoch": 0.36277986463176515, + "grad_norm": 4.10648775100708, + "learning_rate": 6.37560700525394e-05, + "loss": 1.3052, + "num_input_tokens_seen": 83334288, + "step": 5179 + }, + { + "epoch": 0.36284991287749435, + "grad_norm": 3.975217580795288, + "learning_rate": 6.374907180385289e-05, + "loss": 1.1725, + "num_input_tokens_seen": 83350096, + "step": 5180 + }, + { + "epoch": 0.3629199611232236, + "grad_norm": 4.207096099853516, + "learning_rate": 6.374207355516638e-05, + "loss": 1.1396, + "num_input_tokens_seen": 83366480, + "step": 5181 + }, + { + "epoch": 0.36299000936895287, + "grad_norm": 3.9960830211639404, + "learning_rate": 6.373507530647986e-05, + "loss": 1.1971, + "num_input_tokens_seen": 83381832, + "step": 5182 + }, + { + "epoch": 0.36306005761468213, + "grad_norm": 4.142012596130371, + "learning_rate": 6.372807705779334e-05, + "loss": 1.0829, + "num_input_tokens_seen": 83398216, + "step": 5183 + }, + { + "epoch": 0.36313010586041133, + "grad_norm": 3.8692433834075928, + "learning_rate": 6.372107880910685e-05, + "loss": 1.0649, + "num_input_tokens_seen": 83414600, + "step": 5184 + }, + { + "epoch": 0.3632001541061406, + "grad_norm": 3.663544178009033, + "learning_rate": 6.371408056042032e-05, + "loss": 0.8924, + "num_input_tokens_seen": 83430984, + "step": 5185 + }, + { + "epoch": 0.36327020235186985, + "grad_norm": 4.056418418884277, + "learning_rate": 6.37070823117338e-05, + "loss": 0.9463, + "num_input_tokens_seen": 83447368, + "step": 5186 + }, + { + "epoch": 0.3633402505975991, + "grad_norm": 4.209747314453125, + "learning_rate": 6.37000840630473e-05, + "loss": 1.0641, + "num_input_tokens_seen": 83463752, + "step": 5187 + }, + { + "epoch": 0.3634102988433283, + "grad_norm": 4.93091344833374, + "learning_rate": 6.369308581436077e-05, + "loss": 1.2046, + "num_input_tokens_seen": 83479424, + "step": 5188 + }, + { + "epoch": 0.3634803470890576, + "grad_norm": 3.6523993015289307, + "learning_rate": 6.368608756567425e-05, + "loss": 0.8965, + "num_input_tokens_seen": 83495808, + "step": 5189 + }, + { + "epoch": 0.36355039533478684, + "grad_norm": 4.8949294090271, + "learning_rate": 6.367908931698775e-05, + "loss": 0.8928, + "num_input_tokens_seen": 83511448, + "step": 5190 + }, + { + "epoch": 0.3636204435805161, + "grad_norm": 5.856332778930664, + "learning_rate": 6.367209106830124e-05, + "loss": 0.9844, + "num_input_tokens_seen": 83526664, + "step": 5191 + }, + { + "epoch": 0.3636904918262453, + "grad_norm": 3.762014865875244, + "learning_rate": 6.366509281961471e-05, + "loss": 1.0865, + "num_input_tokens_seen": 83542792, + "step": 5192 + }, + { + "epoch": 0.36376054007197456, + "grad_norm": 4.075290203094482, + "learning_rate": 6.36580945709282e-05, + "loss": 1.0261, + "num_input_tokens_seen": 83558992, + "step": 5193 + }, + { + "epoch": 0.3638305883177038, + "grad_norm": 4.124780178070068, + "learning_rate": 6.365109632224169e-05, + "loss": 1.1021, + "num_input_tokens_seen": 83575376, + "step": 5194 + }, + { + "epoch": 0.3639006365634331, + "grad_norm": 6.1159210205078125, + "learning_rate": 6.364409807355517e-05, + "loss": 0.9209, + "num_input_tokens_seen": 83591400, + "step": 5195 + }, + { + "epoch": 0.36397068480916234, + "grad_norm": 3.8839027881622314, + "learning_rate": 6.363709982486865e-05, + "loss": 1.0866, + "num_input_tokens_seen": 83607784, + "step": 5196 + }, + { + "epoch": 0.36404073305489154, + "grad_norm": 4.260892391204834, + "learning_rate": 6.363010157618214e-05, + "loss": 1.0747, + "num_input_tokens_seen": 83623944, + "step": 5197 + }, + { + "epoch": 0.3641107813006208, + "grad_norm": 4.111022472381592, + "learning_rate": 6.362310332749563e-05, + "loss": 1.2594, + "num_input_tokens_seen": 83639408, + "step": 5198 + }, + { + "epoch": 0.36418082954635006, + "grad_norm": 3.567676305770874, + "learning_rate": 6.361610507880911e-05, + "loss": 1.0115, + "num_input_tokens_seen": 83655496, + "step": 5199 + }, + { + "epoch": 0.3642508777920793, + "grad_norm": 4.935754299163818, + "learning_rate": 6.36091068301226e-05, + "loss": 1.2028, + "num_input_tokens_seen": 83671016, + "step": 5200 + }, + { + "epoch": 0.3642508777920793, + "eval_loss": 1.129547119140625, + "eval_runtime": 0.1857, + "eval_samples_per_second": 5.386, + "eval_steps_per_second": 5.386, + "num_input_tokens_seen": 83671016, + "step": 5200 + }, + { + "epoch": 0.3643209260378085, + "grad_norm": 3.8546817302703857, + "learning_rate": 6.360210858143608e-05, + "loss": 0.9873, + "num_input_tokens_seen": 83685736, + "step": 5201 + }, + { + "epoch": 0.3643909742835378, + "grad_norm": 3.900425910949707, + "learning_rate": 6.359511033274956e-05, + "loss": 1.0005, + "num_input_tokens_seen": 83702120, + "step": 5202 + }, + { + "epoch": 0.36446102252926704, + "grad_norm": 4.270096302032471, + "learning_rate": 6.358811208406305e-05, + "loss": 0.9098, + "num_input_tokens_seen": 83718504, + "step": 5203 + }, + { + "epoch": 0.3645310707749963, + "grad_norm": 5.027628421783447, + "learning_rate": 6.358111383537655e-05, + "loss": 1.1363, + "num_input_tokens_seen": 83734888, + "step": 5204 + }, + { + "epoch": 0.3646011190207255, + "grad_norm": 4.843371868133545, + "learning_rate": 6.357411558669002e-05, + "loss": 0.9629, + "num_input_tokens_seen": 83749488, + "step": 5205 + }, + { + "epoch": 0.36467116726645477, + "grad_norm": 7.530435562133789, + "learning_rate": 6.35671173380035e-05, + "loss": 1.0575, + "num_input_tokens_seen": 83765872, + "step": 5206 + }, + { + "epoch": 0.364741215512184, + "grad_norm": 4.028171062469482, + "learning_rate": 6.356011908931699e-05, + "loss": 1.2011, + "num_input_tokens_seen": 83781936, + "step": 5207 + }, + { + "epoch": 0.3648112637579133, + "grad_norm": 6.744492053985596, + "learning_rate": 6.355312084063048e-05, + "loss": 1.0464, + "num_input_tokens_seen": 83797520, + "step": 5208 + }, + { + "epoch": 0.3648813120036425, + "grad_norm": 3.9689910411834717, + "learning_rate": 6.354612259194395e-05, + "loss": 1.0156, + "num_input_tokens_seen": 83813872, + "step": 5209 + }, + { + "epoch": 0.36495136024937175, + "grad_norm": 4.990142345428467, + "learning_rate": 6.353912434325745e-05, + "loss": 1.2019, + "num_input_tokens_seen": 83830256, + "step": 5210 + }, + { + "epoch": 0.365021408495101, + "grad_norm": 4.547253131866455, + "learning_rate": 6.353212609457094e-05, + "loss": 1.1825, + "num_input_tokens_seen": 83846640, + "step": 5211 + }, + { + "epoch": 0.36509145674083027, + "grad_norm": 4.108243465423584, + "learning_rate": 6.352512784588442e-05, + "loss": 1.1827, + "num_input_tokens_seen": 83863024, + "step": 5212 + }, + { + "epoch": 0.36516150498655947, + "grad_norm": 4.540827751159668, + "learning_rate": 6.351812959719789e-05, + "loss": 1.0034, + "num_input_tokens_seen": 83878976, + "step": 5213 + }, + { + "epoch": 0.36523155323228873, + "grad_norm": 5.3233842849731445, + "learning_rate": 6.35111313485114e-05, + "loss": 1.2247, + "num_input_tokens_seen": 83895360, + "step": 5214 + }, + { + "epoch": 0.365301601478018, + "grad_norm": 5.161661624908447, + "learning_rate": 6.350413309982487e-05, + "loss": 1.2067, + "num_input_tokens_seen": 83910064, + "step": 5215 + }, + { + "epoch": 0.36537164972374725, + "grad_norm": 4.908864498138428, + "learning_rate": 6.349713485113836e-05, + "loss": 1.1748, + "num_input_tokens_seen": 83926448, + "step": 5216 + }, + { + "epoch": 0.36544169796947645, + "grad_norm": 5.954193592071533, + "learning_rate": 6.349013660245185e-05, + "loss": 0.99, + "num_input_tokens_seen": 83942248, + "step": 5217 + }, + { + "epoch": 0.3655117462152057, + "grad_norm": 3.5276272296905518, + "learning_rate": 6.348313835376534e-05, + "loss": 0.9637, + "num_input_tokens_seen": 83958632, + "step": 5218 + }, + { + "epoch": 0.365581794460935, + "grad_norm": 3.736661195755005, + "learning_rate": 6.347614010507881e-05, + "loss": 1.059, + "num_input_tokens_seen": 83975016, + "step": 5219 + }, + { + "epoch": 0.36565184270666423, + "grad_norm": 5.434671401977539, + "learning_rate": 6.34691418563923e-05, + "loss": 1.0891, + "num_input_tokens_seen": 83990424, + "step": 5220 + }, + { + "epoch": 0.36572189095239344, + "grad_norm": 3.9301772117614746, + "learning_rate": 6.346214360770579e-05, + "loss": 1.0278, + "num_input_tokens_seen": 84006808, + "step": 5221 + }, + { + "epoch": 0.3657919391981227, + "grad_norm": 5.101827621459961, + "learning_rate": 6.345514535901926e-05, + "loss": 1.2129, + "num_input_tokens_seen": 84022624, + "step": 5222 + }, + { + "epoch": 0.36586198744385195, + "grad_norm": 4.042179584503174, + "learning_rate": 6.344814711033275e-05, + "loss": 1.2996, + "num_input_tokens_seen": 84038688, + "step": 5223 + }, + { + "epoch": 0.3659320356895812, + "grad_norm": 4.2309441566467285, + "learning_rate": 6.344114886164624e-05, + "loss": 1.1113, + "num_input_tokens_seen": 84055072, + "step": 5224 + }, + { + "epoch": 0.3660020839353104, + "grad_norm": 6.73452615737915, + "learning_rate": 6.343415061295973e-05, + "loss": 1.1523, + "num_input_tokens_seen": 84071456, + "step": 5225 + }, + { + "epoch": 0.3660721321810397, + "grad_norm": 3.684497833251953, + "learning_rate": 6.34271523642732e-05, + "loss": 1.0967, + "num_input_tokens_seen": 84087840, + "step": 5226 + }, + { + "epoch": 0.36614218042676894, + "grad_norm": 3.7974796295166016, + "learning_rate": 6.342015411558669e-05, + "loss": 1.0675, + "num_input_tokens_seen": 84103456, + "step": 5227 + }, + { + "epoch": 0.3662122286724982, + "grad_norm": 4.681473255157471, + "learning_rate": 6.341315586690018e-05, + "loss": 0.9202, + "num_input_tokens_seen": 84119840, + "step": 5228 + }, + { + "epoch": 0.36628227691822746, + "grad_norm": 4.197212219238281, + "learning_rate": 6.340615761821366e-05, + "loss": 0.9594, + "num_input_tokens_seen": 84136224, + "step": 5229 + }, + { + "epoch": 0.36635232516395666, + "grad_norm": 4.1414794921875, + "learning_rate": 6.339915936952716e-05, + "loss": 1.1421, + "num_input_tokens_seen": 84152552, + "step": 5230 + }, + { + "epoch": 0.3664223734096859, + "grad_norm": 4.138907432556152, + "learning_rate": 6.339216112084065e-05, + "loss": 1.0841, + "num_input_tokens_seen": 84168936, + "step": 5231 + }, + { + "epoch": 0.3664924216554152, + "grad_norm": 4.723425388336182, + "learning_rate": 6.338516287215412e-05, + "loss": 1.1302, + "num_input_tokens_seen": 84185320, + "step": 5232 + }, + { + "epoch": 0.36656246990114444, + "grad_norm": 4.167308330535889, + "learning_rate": 6.33781646234676e-05, + "loss": 1.2636, + "num_input_tokens_seen": 84201704, + "step": 5233 + }, + { + "epoch": 0.36663251814687364, + "grad_norm": 3.832829236984253, + "learning_rate": 6.337116637478109e-05, + "loss": 0.9078, + "num_input_tokens_seen": 84217216, + "step": 5234 + }, + { + "epoch": 0.3667025663926029, + "grad_norm": 6.1642842292785645, + "learning_rate": 6.336416812609457e-05, + "loss": 0.8102, + "num_input_tokens_seen": 84232896, + "step": 5235 + }, + { + "epoch": 0.36677261463833216, + "grad_norm": 3.948350429534912, + "learning_rate": 6.335716987740806e-05, + "loss": 1.1285, + "num_input_tokens_seen": 84248448, + "step": 5236 + }, + { + "epoch": 0.3668426628840614, + "grad_norm": 3.6216750144958496, + "learning_rate": 6.335017162872155e-05, + "loss": 1.022, + "num_input_tokens_seen": 84264832, + "step": 5237 + }, + { + "epoch": 0.3669127111297906, + "grad_norm": 5.787931442260742, + "learning_rate": 6.334317338003504e-05, + "loss": 1.1968, + "num_input_tokens_seen": 84281216, + "step": 5238 + }, + { + "epoch": 0.3669827593755199, + "grad_norm": 4.830391883850098, + "learning_rate": 6.333617513134851e-05, + "loss": 1.3014, + "num_input_tokens_seen": 84297352, + "step": 5239 + }, + { + "epoch": 0.36705280762124914, + "grad_norm": 3.839425563812256, + "learning_rate": 6.332917688266199e-05, + "loss": 1.031, + "num_input_tokens_seen": 84313608, + "step": 5240 + }, + { + "epoch": 0.3671228558669784, + "grad_norm": 3.963012456893921, + "learning_rate": 6.332217863397549e-05, + "loss": 1.0232, + "num_input_tokens_seen": 84329680, + "step": 5241 + }, + { + "epoch": 0.3671929041127076, + "grad_norm": 3.4596047401428223, + "learning_rate": 6.331518038528897e-05, + "loss": 1.0028, + "num_input_tokens_seen": 84346064, + "step": 5242 + }, + { + "epoch": 0.36726295235843687, + "grad_norm": 5.7928290367126465, + "learning_rate": 6.330818213660246e-05, + "loss": 1.2292, + "num_input_tokens_seen": 84361800, + "step": 5243 + }, + { + "epoch": 0.3673330006041661, + "grad_norm": 3.5012640953063965, + "learning_rate": 6.330118388791594e-05, + "loss": 1.0095, + "num_input_tokens_seen": 84378184, + "step": 5244 + }, + { + "epoch": 0.3674030488498954, + "grad_norm": 4.464978218078613, + "learning_rate": 6.329418563922943e-05, + "loss": 1.2258, + "num_input_tokens_seen": 84394568, + "step": 5245 + }, + { + "epoch": 0.3674730970956246, + "grad_norm": 3.4716012477874756, + "learning_rate": 6.328718739054291e-05, + "loss": 1.087, + "num_input_tokens_seen": 84410584, + "step": 5246 + }, + { + "epoch": 0.36754314534135385, + "grad_norm": 4.010568618774414, + "learning_rate": 6.32801891418564e-05, + "loss": 1.0823, + "num_input_tokens_seen": 84426968, + "step": 5247 + }, + { + "epoch": 0.3676131935870831, + "grad_norm": 3.763718605041504, + "learning_rate": 6.327319089316989e-05, + "loss": 1.079, + "num_input_tokens_seen": 84443352, + "step": 5248 + }, + { + "epoch": 0.36768324183281237, + "grad_norm": 5.381477355957031, + "learning_rate": 6.326619264448336e-05, + "loss": 1.0387, + "num_input_tokens_seen": 84459736, + "step": 5249 + }, + { + "epoch": 0.3677532900785416, + "grad_norm": 3.6646018028259277, + "learning_rate": 6.325919439579685e-05, + "loss": 1.026, + "num_input_tokens_seen": 84476120, + "step": 5250 + }, + { + "epoch": 0.36782333832427083, + "grad_norm": 4.005465507507324, + "learning_rate": 6.325219614711034e-05, + "loss": 1.0341, + "num_input_tokens_seen": 84492400, + "step": 5251 + }, + { + "epoch": 0.3678933865700001, + "grad_norm": 3.4287807941436768, + "learning_rate": 6.324519789842383e-05, + "loss": 0.9892, + "num_input_tokens_seen": 84508720, + "step": 5252 + }, + { + "epoch": 0.36796343481572935, + "grad_norm": 3.8715076446533203, + "learning_rate": 6.32381996497373e-05, + "loss": 1.3025, + "num_input_tokens_seen": 84524592, + "step": 5253 + }, + { + "epoch": 0.36803348306145856, + "grad_norm": 3.4789586067199707, + "learning_rate": 6.323120140105079e-05, + "loss": 0.9109, + "num_input_tokens_seen": 84540176, + "step": 5254 + }, + { + "epoch": 0.3681035313071878, + "grad_norm": 3.992988348007202, + "learning_rate": 6.322420315236428e-05, + "loss": 1.138, + "num_input_tokens_seen": 84556560, + "step": 5255 + }, + { + "epoch": 0.3681735795529171, + "grad_norm": 4.3957743644714355, + "learning_rate": 6.321720490367775e-05, + "loss": 1.2542, + "num_input_tokens_seen": 84572240, + "step": 5256 + }, + { + "epoch": 0.36824362779864633, + "grad_norm": 3.7909469604492188, + "learning_rate": 6.321020665499126e-05, + "loss": 0.9282, + "num_input_tokens_seen": 84587400, + "step": 5257 + }, + { + "epoch": 0.36831367604437554, + "grad_norm": 3.747345209121704, + "learning_rate": 6.320320840630474e-05, + "loss": 0.9673, + "num_input_tokens_seen": 84603240, + "step": 5258 + }, + { + "epoch": 0.3683837242901048, + "grad_norm": 3.6753249168395996, + "learning_rate": 6.319621015761822e-05, + "loss": 1.0435, + "num_input_tokens_seen": 84619624, + "step": 5259 + }, + { + "epoch": 0.36845377253583406, + "grad_norm": 3.6952924728393555, + "learning_rate": 6.31892119089317e-05, + "loss": 1.0577, + "num_input_tokens_seen": 84636008, + "step": 5260 + }, + { + "epoch": 0.3685238207815633, + "grad_norm": 4.606325149536133, + "learning_rate": 6.318221366024518e-05, + "loss": 1.0212, + "num_input_tokens_seen": 84652392, + "step": 5261 + }, + { + "epoch": 0.3685938690272925, + "grad_norm": 3.749755382537842, + "learning_rate": 6.317521541155867e-05, + "loss": 1.0378, + "num_input_tokens_seen": 84667832, + "step": 5262 + }, + { + "epoch": 0.3686639172730218, + "grad_norm": 3.7973029613494873, + "learning_rate": 6.316821716287216e-05, + "loss": 1.1695, + "num_input_tokens_seen": 84683904, + "step": 5263 + }, + { + "epoch": 0.36873396551875104, + "grad_norm": 4.264857769012451, + "learning_rate": 6.316121891418565e-05, + "loss": 1.0638, + "num_input_tokens_seen": 84700288, + "step": 5264 + }, + { + "epoch": 0.3688040137644803, + "grad_norm": 3.4577653408050537, + "learning_rate": 6.315422066549914e-05, + "loss": 1.0037, + "num_input_tokens_seen": 84716672, + "step": 5265 + }, + { + "epoch": 0.36887406201020956, + "grad_norm": 4.049471378326416, + "learning_rate": 6.314722241681261e-05, + "loss": 1.0595, + "num_input_tokens_seen": 84732976, + "step": 5266 + }, + { + "epoch": 0.36894411025593876, + "grad_norm": 4.293907165527344, + "learning_rate": 6.314022416812609e-05, + "loss": 1.1094, + "num_input_tokens_seen": 84747480, + "step": 5267 + }, + { + "epoch": 0.369014158501668, + "grad_norm": 7.115272045135498, + "learning_rate": 6.313322591943959e-05, + "loss": 0.9904, + "num_input_tokens_seen": 84763864, + "step": 5268 + }, + { + "epoch": 0.3690842067473973, + "grad_norm": 6.85962438583374, + "learning_rate": 6.312622767075306e-05, + "loss": 0.9934, + "num_input_tokens_seen": 84778648, + "step": 5269 + }, + { + "epoch": 0.36915425499312654, + "grad_norm": 4.24301290512085, + "learning_rate": 6.311922942206655e-05, + "loss": 1.0426, + "num_input_tokens_seen": 84794440, + "step": 5270 + }, + { + "epoch": 0.36922430323885574, + "grad_norm": 3.533189535140991, + "learning_rate": 6.311223117338004e-05, + "loss": 0.9863, + "num_input_tokens_seen": 84810824, + "step": 5271 + }, + { + "epoch": 0.369294351484585, + "grad_norm": 4.706559658050537, + "learning_rate": 6.310523292469353e-05, + "loss": 1.2352, + "num_input_tokens_seen": 84827208, + "step": 5272 + }, + { + "epoch": 0.36936439973031426, + "grad_norm": 3.492366075515747, + "learning_rate": 6.3098234676007e-05, + "loss": 0.9802, + "num_input_tokens_seen": 84842744, + "step": 5273 + }, + { + "epoch": 0.3694344479760435, + "grad_norm": 4.733495712280273, + "learning_rate": 6.30912364273205e-05, + "loss": 1.1914, + "num_input_tokens_seen": 84858432, + "step": 5274 + }, + { + "epoch": 0.3695044962217727, + "grad_norm": 3.6145412921905518, + "learning_rate": 6.308423817863398e-05, + "loss": 1.0411, + "num_input_tokens_seen": 84874496, + "step": 5275 + }, + { + "epoch": 0.369574544467502, + "grad_norm": 3.764568328857422, + "learning_rate": 6.307723992994746e-05, + "loss": 1.1604, + "num_input_tokens_seen": 84890880, + "step": 5276 + }, + { + "epoch": 0.36964459271323125, + "grad_norm": 5.0368428230285645, + "learning_rate": 6.307024168126095e-05, + "loss": 1.0006, + "num_input_tokens_seen": 84907264, + "step": 5277 + }, + { + "epoch": 0.3697146409589605, + "grad_norm": 3.9158520698547363, + "learning_rate": 6.306324343257443e-05, + "loss": 1.0409, + "num_input_tokens_seen": 84923648, + "step": 5278 + }, + { + "epoch": 0.3697846892046897, + "grad_norm": 4.663973808288574, + "learning_rate": 6.305624518388792e-05, + "loss": 0.9818, + "num_input_tokens_seen": 84939976, + "step": 5279 + }, + { + "epoch": 0.36985473745041897, + "grad_norm": 4.3741455078125, + "learning_rate": 6.30492469352014e-05, + "loss": 1.2902, + "num_input_tokens_seen": 84956184, + "step": 5280 + }, + { + "epoch": 0.36992478569614823, + "grad_norm": 5.071192264556885, + "learning_rate": 6.304224868651489e-05, + "loss": 1.0856, + "num_input_tokens_seen": 84972024, + "step": 5281 + }, + { + "epoch": 0.3699948339418775, + "grad_norm": 3.5479323863983154, + "learning_rate": 6.303525043782838e-05, + "loss": 1.0809, + "num_input_tokens_seen": 84988408, + "step": 5282 + }, + { + "epoch": 0.3700648821876067, + "grad_norm": 4.6933465003967285, + "learning_rate": 6.302825218914186e-05, + "loss": 1.1826, + "num_input_tokens_seen": 85004720, + "step": 5283 + }, + { + "epoch": 0.37013493043333595, + "grad_norm": 3.594067096710205, + "learning_rate": 6.302125394045535e-05, + "loss": 0.8992, + "num_input_tokens_seen": 85020456, + "step": 5284 + }, + { + "epoch": 0.3702049786790652, + "grad_norm": 3.972480535507202, + "learning_rate": 6.301425569176884e-05, + "loss": 1.09, + "num_input_tokens_seen": 85036840, + "step": 5285 + }, + { + "epoch": 0.37027502692479447, + "grad_norm": 4.674763202667236, + "learning_rate": 6.300725744308232e-05, + "loss": 1.029, + "num_input_tokens_seen": 85053224, + "step": 5286 + }, + { + "epoch": 0.3703450751705237, + "grad_norm": 4.716235160827637, + "learning_rate": 6.300025919439579e-05, + "loss": 0.9872, + "num_input_tokens_seen": 85068624, + "step": 5287 + }, + { + "epoch": 0.37041512341625293, + "grad_norm": 5.01246452331543, + "learning_rate": 6.299326094570928e-05, + "loss": 0.9608, + "num_input_tokens_seen": 85085008, + "step": 5288 + }, + { + "epoch": 0.3704851716619822, + "grad_norm": 5.020605087280273, + "learning_rate": 6.298626269702277e-05, + "loss": 0.9759, + "num_input_tokens_seen": 85101392, + "step": 5289 + }, + { + "epoch": 0.37055521990771145, + "grad_norm": 5.841190814971924, + "learning_rate": 6.297926444833626e-05, + "loss": 1.3302, + "num_input_tokens_seen": 85117776, + "step": 5290 + }, + { + "epoch": 0.37062526815344066, + "grad_norm": 4.592007637023926, + "learning_rate": 6.297226619964975e-05, + "loss": 0.9129, + "num_input_tokens_seen": 85134160, + "step": 5291 + }, + { + "epoch": 0.3706953163991699, + "grad_norm": 3.678398609161377, + "learning_rate": 6.296526795096323e-05, + "loss": 0.9809, + "num_input_tokens_seen": 85150544, + "step": 5292 + }, + { + "epoch": 0.3707653646448992, + "grad_norm": 3.9148921966552734, + "learning_rate": 6.295826970227671e-05, + "loss": 1.1459, + "num_input_tokens_seen": 85166208, + "step": 5293 + }, + { + "epoch": 0.37083541289062844, + "grad_norm": 3.83375883102417, + "learning_rate": 6.295127145359018e-05, + "loss": 1.1273, + "num_input_tokens_seen": 85182592, + "step": 5294 + }, + { + "epoch": 0.37090546113635764, + "grad_norm": 6.339621067047119, + "learning_rate": 6.294427320490369e-05, + "loss": 1.0995, + "num_input_tokens_seen": 85197512, + "step": 5295 + }, + { + "epoch": 0.3709755093820869, + "grad_norm": 3.931565046310425, + "learning_rate": 6.293727495621716e-05, + "loss": 0.9326, + "num_input_tokens_seen": 85213800, + "step": 5296 + }, + { + "epoch": 0.37104555762781616, + "grad_norm": 4.46995210647583, + "learning_rate": 6.293027670753065e-05, + "loss": 1.0782, + "num_input_tokens_seen": 85229528, + "step": 5297 + }, + { + "epoch": 0.3711156058735454, + "grad_norm": 4.4390363693237305, + "learning_rate": 6.292327845884414e-05, + "loss": 1.1976, + "num_input_tokens_seen": 85245912, + "step": 5298 + }, + { + "epoch": 0.3711856541192747, + "grad_norm": 4.089926719665527, + "learning_rate": 6.291628021015763e-05, + "loss": 1.037, + "num_input_tokens_seen": 85262296, + "step": 5299 + }, + { + "epoch": 0.3712557023650039, + "grad_norm": 4.190539360046387, + "learning_rate": 6.29092819614711e-05, + "loss": 1.1928, + "num_input_tokens_seen": 85278560, + "step": 5300 + }, + { + "epoch": 0.37132575061073314, + "grad_norm": 5.1102166175842285, + "learning_rate": 6.290228371278459e-05, + "loss": 0.8734, + "num_input_tokens_seen": 85294944, + "step": 5301 + }, + { + "epoch": 0.3713957988564624, + "grad_norm": 4.174960136413574, + "learning_rate": 6.289528546409808e-05, + "loss": 1.0425, + "num_input_tokens_seen": 85311328, + "step": 5302 + }, + { + "epoch": 0.37146584710219166, + "grad_norm": 3.8785698413848877, + "learning_rate": 6.288828721541157e-05, + "loss": 1.0008, + "num_input_tokens_seen": 85326784, + "step": 5303 + }, + { + "epoch": 0.37153589534792086, + "grad_norm": 3.728626251220703, + "learning_rate": 6.288128896672504e-05, + "loss": 1.1116, + "num_input_tokens_seen": 85343168, + "step": 5304 + }, + { + "epoch": 0.3716059435936501, + "grad_norm": 5.1877312660217285, + "learning_rate": 6.287429071803853e-05, + "loss": 1.0917, + "num_input_tokens_seen": 85359552, + "step": 5305 + }, + { + "epoch": 0.3716759918393794, + "grad_norm": 5.751648902893066, + "learning_rate": 6.286729246935202e-05, + "loss": 1.2662, + "num_input_tokens_seen": 85375136, + "step": 5306 + }, + { + "epoch": 0.37174604008510864, + "grad_norm": 3.7917258739471436, + "learning_rate": 6.28602942206655e-05, + "loss": 0.8499, + "num_input_tokens_seen": 85391520, + "step": 5307 + }, + { + "epoch": 0.37181608833083785, + "grad_norm": 4.268946647644043, + "learning_rate": 6.285329597197898e-05, + "loss": 1.0928, + "num_input_tokens_seen": 85406848, + "step": 5308 + }, + { + "epoch": 0.3718861365765671, + "grad_norm": 4.350981712341309, + "learning_rate": 6.284629772329247e-05, + "loss": 1.1725, + "num_input_tokens_seen": 85423232, + "step": 5309 + }, + { + "epoch": 0.37195618482229637, + "grad_norm": 3.8072032928466797, + "learning_rate": 6.283929947460596e-05, + "loss": 0.9999, + "num_input_tokens_seen": 85439616, + "step": 5310 + }, + { + "epoch": 0.3720262330680256, + "grad_norm": 4.0531697273254395, + "learning_rate": 6.283230122591945e-05, + "loss": 0.9389, + "num_input_tokens_seen": 85456000, + "step": 5311 + }, + { + "epoch": 0.37209628131375483, + "grad_norm": 5.18675422668457, + "learning_rate": 6.282530297723294e-05, + "loss": 1.0504, + "num_input_tokens_seen": 85472384, + "step": 5312 + }, + { + "epoch": 0.3721663295594841, + "grad_norm": 4.675386428833008, + "learning_rate": 6.281830472854641e-05, + "loss": 0.8373, + "num_input_tokens_seen": 85488544, + "step": 5313 + }, + { + "epoch": 0.37223637780521335, + "grad_norm": 6.522333145141602, + "learning_rate": 6.281130647985989e-05, + "loss": 0.9685, + "num_input_tokens_seen": 85504352, + "step": 5314 + }, + { + "epoch": 0.3723064260509426, + "grad_norm": 3.9266233444213867, + "learning_rate": 6.280430823117338e-05, + "loss": 1.0443, + "num_input_tokens_seen": 85520688, + "step": 5315 + }, + { + "epoch": 0.3723764742966718, + "grad_norm": 4.6428093910217285, + "learning_rate": 6.279730998248687e-05, + "loss": 0.9396, + "num_input_tokens_seen": 85537072, + "step": 5316 + }, + { + "epoch": 0.37244652254240107, + "grad_norm": 3.6043691635131836, + "learning_rate": 6.279031173380035e-05, + "loss": 0.903, + "num_input_tokens_seen": 85553456, + "step": 5317 + }, + { + "epoch": 0.37251657078813033, + "grad_norm": 3.4878151416778564, + "learning_rate": 6.278331348511384e-05, + "loss": 1.101, + "num_input_tokens_seen": 85569824, + "step": 5318 + }, + { + "epoch": 0.3725866190338596, + "grad_norm": 4.275106906890869, + "learning_rate": 6.277631523642733e-05, + "loss": 0.8912, + "num_input_tokens_seen": 85586208, + "step": 5319 + }, + { + "epoch": 0.3726566672795888, + "grad_norm": 7.615388870239258, + "learning_rate": 6.27693169877408e-05, + "loss": 1.0786, + "num_input_tokens_seen": 85600984, + "step": 5320 + }, + { + "epoch": 0.37272671552531805, + "grad_norm": 4.4750752449035645, + "learning_rate": 6.276231873905428e-05, + "loss": 1.1369, + "num_input_tokens_seen": 85617368, + "step": 5321 + }, + { + "epoch": 0.3727967637710473, + "grad_norm": 3.7900373935699463, + "learning_rate": 6.275532049036778e-05, + "loss": 1.0727, + "num_input_tokens_seen": 85633304, + "step": 5322 + }, + { + "epoch": 0.37286681201677657, + "grad_norm": 8.58016300201416, + "learning_rate": 6.274832224168127e-05, + "loss": 1.0942, + "num_input_tokens_seen": 85648592, + "step": 5323 + }, + { + "epoch": 0.3729368602625058, + "grad_norm": 3.847476005554199, + "learning_rate": 6.274132399299475e-05, + "loss": 1.2543, + "num_input_tokens_seen": 85664976, + "step": 5324 + }, + { + "epoch": 0.37300690850823504, + "grad_norm": 3.68683123588562, + "learning_rate": 6.273432574430824e-05, + "loss": 1.1331, + "num_input_tokens_seen": 85681360, + "step": 5325 + }, + { + "epoch": 0.3730769567539643, + "grad_norm": 4.07316255569458, + "learning_rate": 6.272732749562172e-05, + "loss": 1.1859, + "num_input_tokens_seen": 85697744, + "step": 5326 + }, + { + "epoch": 0.37314700499969355, + "grad_norm": 3.7817749977111816, + "learning_rate": 6.27203292469352e-05, + "loss": 1.128, + "num_input_tokens_seen": 85713680, + "step": 5327 + }, + { + "epoch": 0.37321705324542276, + "grad_norm": 3.8322465419769287, + "learning_rate": 6.271333099824869e-05, + "loss": 1.1804, + "num_input_tokens_seen": 85730064, + "step": 5328 + }, + { + "epoch": 0.373287101491152, + "grad_norm": 5.689653396606445, + "learning_rate": 6.270633274956218e-05, + "loss": 1.0848, + "num_input_tokens_seen": 85745904, + "step": 5329 + }, + { + "epoch": 0.3733571497368813, + "grad_norm": 5.568809509277344, + "learning_rate": 6.269933450087566e-05, + "loss": 0.9887, + "num_input_tokens_seen": 85762288, + "step": 5330 + }, + { + "epoch": 0.37342719798261054, + "grad_norm": 3.982375383377075, + "learning_rate": 6.269233625218914e-05, + "loss": 0.9975, + "num_input_tokens_seen": 85778672, + "step": 5331 + }, + { + "epoch": 0.3734972462283398, + "grad_norm": 3.430204391479492, + "learning_rate": 6.268533800350263e-05, + "loss": 1.0241, + "num_input_tokens_seen": 85795056, + "step": 5332 + }, + { + "epoch": 0.373567294474069, + "grad_norm": 3.465724229812622, + "learning_rate": 6.267833975481612e-05, + "loss": 0.9229, + "num_input_tokens_seen": 85811392, + "step": 5333 + }, + { + "epoch": 0.37363734271979826, + "grad_norm": 3.837188482284546, + "learning_rate": 6.267134150612959e-05, + "loss": 1.1354, + "num_input_tokens_seen": 85827016, + "step": 5334 + }, + { + "epoch": 0.3737073909655275, + "grad_norm": 7.360764980316162, + "learning_rate": 6.266434325744308e-05, + "loss": 1.0209, + "num_input_tokens_seen": 85842040, + "step": 5335 + }, + { + "epoch": 0.3737774392112568, + "grad_norm": 3.567553997039795, + "learning_rate": 6.265734500875657e-05, + "loss": 1.0502, + "num_input_tokens_seen": 85858424, + "step": 5336 + }, + { + "epoch": 0.373847487456986, + "grad_norm": 4.564986705780029, + "learning_rate": 6.265034676007006e-05, + "loss": 1.0178, + "num_input_tokens_seen": 85874808, + "step": 5337 + }, + { + "epoch": 0.37391753570271524, + "grad_norm": 3.4568405151367188, + "learning_rate": 6.264334851138355e-05, + "loss": 0.9245, + "num_input_tokens_seen": 85890672, + "step": 5338 + }, + { + "epoch": 0.3739875839484445, + "grad_norm": 3.723557233810425, + "learning_rate": 6.263635026269704e-05, + "loss": 1.0175, + "num_input_tokens_seen": 85906920, + "step": 5339 + }, + { + "epoch": 0.37405763219417376, + "grad_norm": 3.5800676345825195, + "learning_rate": 6.262935201401051e-05, + "loss": 0.9726, + "num_input_tokens_seen": 85923304, + "step": 5340 + }, + { + "epoch": 0.37412768043990297, + "grad_norm": 3.8996667861938477, + "learning_rate": 6.262235376532399e-05, + "loss": 1.2368, + "num_input_tokens_seen": 85938984, + "step": 5341 + }, + { + "epoch": 0.3741977286856322, + "grad_norm": 3.417182207107544, + "learning_rate": 6.261535551663747e-05, + "loss": 1.0959, + "num_input_tokens_seen": 85955368, + "step": 5342 + }, + { + "epoch": 0.3742677769313615, + "grad_norm": 4.214803695678711, + "learning_rate": 6.260835726795098e-05, + "loss": 1.1107, + "num_input_tokens_seen": 85971320, + "step": 5343 + }, + { + "epoch": 0.37433782517709074, + "grad_norm": 3.7782840728759766, + "learning_rate": 6.260135901926445e-05, + "loss": 0.9455, + "num_input_tokens_seen": 85987704, + "step": 5344 + }, + { + "epoch": 0.37440787342281995, + "grad_norm": 3.6186842918395996, + "learning_rate": 6.259436077057794e-05, + "loss": 1.0682, + "num_input_tokens_seen": 86004088, + "step": 5345 + }, + { + "epoch": 0.3744779216685492, + "grad_norm": 4.2028913497924805, + "learning_rate": 6.258736252189143e-05, + "loss": 1.2203, + "num_input_tokens_seen": 86020472, + "step": 5346 + }, + { + "epoch": 0.37454796991427847, + "grad_norm": 4.17422342300415, + "learning_rate": 6.25803642732049e-05, + "loss": 1.2483, + "num_input_tokens_seen": 86036856, + "step": 5347 + }, + { + "epoch": 0.3746180181600077, + "grad_norm": 3.3578243255615234, + "learning_rate": 6.257336602451838e-05, + "loss": 1.0315, + "num_input_tokens_seen": 86053224, + "step": 5348 + }, + { + "epoch": 0.37468806640573693, + "grad_norm": 4.105921268463135, + "learning_rate": 6.256636777583188e-05, + "loss": 1.0552, + "num_input_tokens_seen": 86069272, + "step": 5349 + }, + { + "epoch": 0.3747581146514662, + "grad_norm": 3.7420692443847656, + "learning_rate": 6.255936952714537e-05, + "loss": 1.0672, + "num_input_tokens_seen": 86085656, + "step": 5350 + }, + { + "epoch": 0.37482816289719545, + "grad_norm": 5.1573872566223145, + "learning_rate": 6.255237127845884e-05, + "loss": 1.376, + "num_input_tokens_seen": 86102040, + "step": 5351 + }, + { + "epoch": 0.3748982111429247, + "grad_norm": 3.9844436645507812, + "learning_rate": 6.254537302977233e-05, + "loss": 1.0042, + "num_input_tokens_seen": 86117976, + "step": 5352 + }, + { + "epoch": 0.3749682593886539, + "grad_norm": 3.6582653522491455, + "learning_rate": 6.253837478108582e-05, + "loss": 0.9786, + "num_input_tokens_seen": 86134360, + "step": 5353 + }, + { + "epoch": 0.3750383076343832, + "grad_norm": 4.814766883850098, + "learning_rate": 6.25313765323993e-05, + "loss": 1.2574, + "num_input_tokens_seen": 86150208, + "step": 5354 + }, + { + "epoch": 0.37510835588011243, + "grad_norm": 4.7514262199401855, + "learning_rate": 6.252437828371278e-05, + "loss": 1.071, + "num_input_tokens_seen": 86165672, + "step": 5355 + }, + { + "epoch": 0.3751784041258417, + "grad_norm": 3.9450578689575195, + "learning_rate": 6.251738003502627e-05, + "loss": 1.1295, + "num_input_tokens_seen": 86182056, + "step": 5356 + }, + { + "epoch": 0.3752484523715709, + "grad_norm": 3.5215647220611572, + "learning_rate": 6.251038178633976e-05, + "loss": 1.04, + "num_input_tokens_seen": 86198440, + "step": 5357 + }, + { + "epoch": 0.37531850061730015, + "grad_norm": 3.805070161819458, + "learning_rate": 6.250338353765324e-05, + "loss": 1.036, + "num_input_tokens_seen": 86214824, + "step": 5358 + }, + { + "epoch": 0.3753885488630294, + "grad_norm": 4.033730983734131, + "learning_rate": 6.249638528896673e-05, + "loss": 1.092, + "num_input_tokens_seen": 86231208, + "step": 5359 + }, + { + "epoch": 0.3754585971087587, + "grad_norm": 3.8157355785369873, + "learning_rate": 6.248938704028021e-05, + "loss": 1.0032, + "num_input_tokens_seen": 86247392, + "step": 5360 + }, + { + "epoch": 0.3755286453544879, + "grad_norm": 4.832013130187988, + "learning_rate": 6.248238879159369e-05, + "loss": 1.0711, + "num_input_tokens_seen": 86263776, + "step": 5361 + }, + { + "epoch": 0.37559869360021714, + "grad_norm": 3.753471612930298, + "learning_rate": 6.247539054290718e-05, + "loss": 1.0532, + "num_input_tokens_seen": 86279912, + "step": 5362 + }, + { + "epoch": 0.3756687418459464, + "grad_norm": 8.569518089294434, + "learning_rate": 6.246839229422068e-05, + "loss": 1.1073, + "num_input_tokens_seen": 86296296, + "step": 5363 + }, + { + "epoch": 0.37573879009167566, + "grad_norm": 4.399802207946777, + "learning_rate": 6.246139404553416e-05, + "loss": 1.1484, + "num_input_tokens_seen": 86312680, + "step": 5364 + }, + { + "epoch": 0.37580883833740486, + "grad_norm": 4.230834484100342, + "learning_rate": 6.245439579684764e-05, + "loss": 1.0905, + "num_input_tokens_seen": 86329064, + "step": 5365 + }, + { + "epoch": 0.3758788865831341, + "grad_norm": 4.750765800476074, + "learning_rate": 6.244739754816113e-05, + "loss": 1.2126, + "num_input_tokens_seen": 86345448, + "step": 5366 + }, + { + "epoch": 0.3759489348288634, + "grad_norm": 6.567142963409424, + "learning_rate": 6.244039929947461e-05, + "loss": 1.314, + "num_input_tokens_seen": 86361272, + "step": 5367 + }, + { + "epoch": 0.37601898307459264, + "grad_norm": 3.9668781757354736, + "learning_rate": 6.243340105078808e-05, + "loss": 1.0427, + "num_input_tokens_seen": 86377448, + "step": 5368 + }, + { + "epoch": 0.3760890313203219, + "grad_norm": 4.619864463806152, + "learning_rate": 6.242640280210158e-05, + "loss": 1.0687, + "num_input_tokens_seen": 86393600, + "step": 5369 + }, + { + "epoch": 0.3761590795660511, + "grad_norm": 6.837228298187256, + "learning_rate": 6.241940455341507e-05, + "loss": 0.9225, + "num_input_tokens_seen": 86409896, + "step": 5370 + }, + { + "epoch": 0.37622912781178036, + "grad_norm": 4.634070873260498, + "learning_rate": 6.241240630472855e-05, + "loss": 1.0147, + "num_input_tokens_seen": 86426280, + "step": 5371 + }, + { + "epoch": 0.3762991760575096, + "grad_norm": 3.944580554962158, + "learning_rate": 6.240540805604204e-05, + "loss": 1.144, + "num_input_tokens_seen": 86442640, + "step": 5372 + }, + { + "epoch": 0.3763692243032389, + "grad_norm": 7.016427516937256, + "learning_rate": 6.239840980735553e-05, + "loss": 1.0016, + "num_input_tokens_seen": 86459024, + "step": 5373 + }, + { + "epoch": 0.3764392725489681, + "grad_norm": 3.9997384548187256, + "learning_rate": 6.2391411558669e-05, + "loss": 0.9382, + "num_input_tokens_seen": 86475408, + "step": 5374 + }, + { + "epoch": 0.37650932079469734, + "grad_norm": 4.016181945800781, + "learning_rate": 6.238441330998249e-05, + "loss": 1.1728, + "num_input_tokens_seen": 86491680, + "step": 5375 + }, + { + "epoch": 0.3765793690404266, + "grad_norm": 4.19748592376709, + "learning_rate": 6.237741506129598e-05, + "loss": 1.161, + "num_input_tokens_seen": 86507768, + "step": 5376 + }, + { + "epoch": 0.37664941728615586, + "grad_norm": 4.579540252685547, + "learning_rate": 6.237041681260947e-05, + "loss": 1.0014, + "num_input_tokens_seen": 86524040, + "step": 5377 + }, + { + "epoch": 0.37671946553188507, + "grad_norm": 3.784952402114868, + "learning_rate": 6.236341856392294e-05, + "loss": 1.0435, + "num_input_tokens_seen": 86540424, + "step": 5378 + }, + { + "epoch": 0.3767895137776143, + "grad_norm": 5.813356876373291, + "learning_rate": 6.235642031523643e-05, + "loss": 0.9772, + "num_input_tokens_seen": 86556360, + "step": 5379 + }, + { + "epoch": 0.3768595620233436, + "grad_norm": 4.314088344573975, + "learning_rate": 6.234942206654992e-05, + "loss": 1.2318, + "num_input_tokens_seen": 86572744, + "step": 5380 + }, + { + "epoch": 0.37692961026907285, + "grad_norm": 3.898298740386963, + "learning_rate": 6.23424238178634e-05, + "loss": 1.1217, + "num_input_tokens_seen": 86588888, + "step": 5381 + }, + { + "epoch": 0.37699965851480205, + "grad_norm": 3.514692544937134, + "learning_rate": 6.233542556917688e-05, + "loss": 0.9526, + "num_input_tokens_seen": 86605272, + "step": 5382 + }, + { + "epoch": 0.3770697067605313, + "grad_norm": 3.7073886394500732, + "learning_rate": 6.232842732049038e-05, + "loss": 1.1199, + "num_input_tokens_seen": 86621656, + "step": 5383 + }, + { + "epoch": 0.37713975500626057, + "grad_norm": 3.9826815128326416, + "learning_rate": 6.232142907180386e-05, + "loss": 1.1417, + "num_input_tokens_seen": 86638040, + "step": 5384 + }, + { + "epoch": 0.37720980325198983, + "grad_norm": 3.6563196182250977, + "learning_rate": 6.231443082311733e-05, + "loss": 0.888, + "num_input_tokens_seen": 86654424, + "step": 5385 + }, + { + "epoch": 0.37727985149771903, + "grad_norm": 3.5995571613311768, + "learning_rate": 6.230743257443082e-05, + "loss": 1.0457, + "num_input_tokens_seen": 86670328, + "step": 5386 + }, + { + "epoch": 0.3773498997434483, + "grad_norm": 4.254338264465332, + "learning_rate": 6.230043432574431e-05, + "loss": 0.94, + "num_input_tokens_seen": 86685960, + "step": 5387 + }, + { + "epoch": 0.37741994798917755, + "grad_norm": 3.689716100692749, + "learning_rate": 6.229343607705779e-05, + "loss": 0.81, + "num_input_tokens_seen": 86702008, + "step": 5388 + }, + { + "epoch": 0.3774899962349068, + "grad_norm": 3.4042210578918457, + "learning_rate": 6.228643782837129e-05, + "loss": 1.0077, + "num_input_tokens_seen": 86718392, + "step": 5389 + }, + { + "epoch": 0.377560044480636, + "grad_norm": 4.607806205749512, + "learning_rate": 6.227943957968478e-05, + "loss": 1.2891, + "num_input_tokens_seen": 86734624, + "step": 5390 + }, + { + "epoch": 0.3776300927263653, + "grad_norm": 3.951362133026123, + "learning_rate": 6.227244133099825e-05, + "loss": 1.0501, + "num_input_tokens_seen": 86749816, + "step": 5391 + }, + { + "epoch": 0.37770014097209453, + "grad_norm": 3.535480260848999, + "learning_rate": 6.226544308231174e-05, + "loss": 0.8942, + "num_input_tokens_seen": 86765800, + "step": 5392 + }, + { + "epoch": 0.3777701892178238, + "grad_norm": 5.398930549621582, + "learning_rate": 6.225844483362523e-05, + "loss": 1.1322, + "num_input_tokens_seen": 86782184, + "step": 5393 + }, + { + "epoch": 0.377840237463553, + "grad_norm": 4.456240177154541, + "learning_rate": 6.22514465849387e-05, + "loss": 1.1725, + "num_input_tokens_seen": 86798568, + "step": 5394 + }, + { + "epoch": 0.37791028570928226, + "grad_norm": 3.8764703273773193, + "learning_rate": 6.224444833625219e-05, + "loss": 1.0041, + "num_input_tokens_seen": 86814824, + "step": 5395 + }, + { + "epoch": 0.3779803339550115, + "grad_norm": 3.8746144771575928, + "learning_rate": 6.223745008756568e-05, + "loss": 1.066, + "num_input_tokens_seen": 86831208, + "step": 5396 + }, + { + "epoch": 0.3780503822007408, + "grad_norm": 4.3454742431640625, + "learning_rate": 6.223045183887917e-05, + "loss": 1.1164, + "num_input_tokens_seen": 86846872, + "step": 5397 + }, + { + "epoch": 0.37812043044647, + "grad_norm": 5.006749153137207, + "learning_rate": 6.222345359019265e-05, + "loss": 0.8317, + "num_input_tokens_seen": 86863256, + "step": 5398 + }, + { + "epoch": 0.37819047869219924, + "grad_norm": 3.7388808727264404, + "learning_rate": 6.221645534150613e-05, + "loss": 1.1562, + "num_input_tokens_seen": 86879640, + "step": 5399 + }, + { + "epoch": 0.3782605269379285, + "grad_norm": 4.515074253082275, + "learning_rate": 6.220945709281962e-05, + "loss": 1.0428, + "num_input_tokens_seen": 86896024, + "step": 5400 + }, + { + "epoch": 0.3782605269379285, + "eval_loss": 1.1279726028442383, + "eval_runtime": 0.2024, + "eval_samples_per_second": 4.94, + "eval_steps_per_second": 4.94, + "num_input_tokens_seen": 86896024, + "step": 5400 + }, + { + "epoch": 0.37833057518365776, + "grad_norm": 3.5468356609344482, + "learning_rate": 6.22024588441331e-05, + "loss": 0.9858, + "num_input_tokens_seen": 86912032, + "step": 5401 + }, + { + "epoch": 0.378400623429387, + "grad_norm": 4.281546115875244, + "learning_rate": 6.219546059544659e-05, + "loss": 1.2335, + "num_input_tokens_seen": 86928080, + "step": 5402 + }, + { + "epoch": 0.3784706716751162, + "grad_norm": 4.247570037841797, + "learning_rate": 6.218846234676009e-05, + "loss": 1.0889, + "num_input_tokens_seen": 86944424, + "step": 5403 + }, + { + "epoch": 0.3785407199208455, + "grad_norm": 3.78439998626709, + "learning_rate": 6.218146409807356e-05, + "loss": 1.0476, + "num_input_tokens_seen": 86960808, + "step": 5404 + }, + { + "epoch": 0.37861076816657474, + "grad_norm": 4.174613952636719, + "learning_rate": 6.217446584938704e-05, + "loss": 1.2858, + "num_input_tokens_seen": 86976472, + "step": 5405 + }, + { + "epoch": 0.378680816412304, + "grad_norm": 4.759533882141113, + "learning_rate": 6.216746760070053e-05, + "loss": 0.9813, + "num_input_tokens_seen": 86992856, + "step": 5406 + }, + { + "epoch": 0.3787508646580332, + "grad_norm": 5.2616801261901855, + "learning_rate": 6.216046935201402e-05, + "loss": 1.1752, + "num_input_tokens_seen": 87007936, + "step": 5407 + }, + { + "epoch": 0.37882091290376246, + "grad_norm": 4.626899719238281, + "learning_rate": 6.215347110332749e-05, + "loss": 1.0348, + "num_input_tokens_seen": 87023888, + "step": 5408 + }, + { + "epoch": 0.3788909611494917, + "grad_norm": 3.7142221927642822, + "learning_rate": 6.214647285464099e-05, + "loss": 1.0051, + "num_input_tokens_seen": 87040272, + "step": 5409 + }, + { + "epoch": 0.378961009395221, + "grad_norm": 6.228342056274414, + "learning_rate": 6.213947460595448e-05, + "loss": 1.0807, + "num_input_tokens_seen": 87056656, + "step": 5410 + }, + { + "epoch": 0.3790310576409502, + "grad_norm": 3.7979259490966797, + "learning_rate": 6.213247635726796e-05, + "loss": 1.0051, + "num_input_tokens_seen": 87073040, + "step": 5411 + }, + { + "epoch": 0.37910110588667945, + "grad_norm": 3.903106927871704, + "learning_rate": 6.212547810858143e-05, + "loss": 1.0546, + "num_input_tokens_seen": 87089344, + "step": 5412 + }, + { + "epoch": 0.3791711541324087, + "grad_norm": 3.966651201248169, + "learning_rate": 6.211847985989492e-05, + "loss": 1.0678, + "num_input_tokens_seen": 87105144, + "step": 5413 + }, + { + "epoch": 0.37924120237813796, + "grad_norm": 4.070274829864502, + "learning_rate": 6.211148161120841e-05, + "loss": 1.1021, + "num_input_tokens_seen": 87121528, + "step": 5414 + }, + { + "epoch": 0.37931125062386717, + "grad_norm": 3.516997814178467, + "learning_rate": 6.21044833625219e-05, + "loss": 1.0112, + "num_input_tokens_seen": 87137752, + "step": 5415 + }, + { + "epoch": 0.37938129886959643, + "grad_norm": 4.28290319442749, + "learning_rate": 6.209748511383539e-05, + "loss": 1.2252, + "num_input_tokens_seen": 87154136, + "step": 5416 + }, + { + "epoch": 0.3794513471153257, + "grad_norm": 4.765808582305908, + "learning_rate": 6.209048686514887e-05, + "loss": 1.0135, + "num_input_tokens_seen": 87170520, + "step": 5417 + }, + { + "epoch": 0.37952139536105495, + "grad_norm": 3.8507494926452637, + "learning_rate": 6.208348861646235e-05, + "loss": 1.0304, + "num_input_tokens_seen": 87186904, + "step": 5418 + }, + { + "epoch": 0.37959144360678415, + "grad_norm": 7.46950626373291, + "learning_rate": 6.207649036777584e-05, + "loss": 1.1376, + "num_input_tokens_seen": 87203288, + "step": 5419 + }, + { + "epoch": 0.3796614918525134, + "grad_norm": 5.770944595336914, + "learning_rate": 6.206949211908933e-05, + "loss": 1.23, + "num_input_tokens_seen": 87219552, + "step": 5420 + }, + { + "epoch": 0.37973154009824267, + "grad_norm": 3.752936363220215, + "learning_rate": 6.20624938704028e-05, + "loss": 0.8285, + "num_input_tokens_seen": 87235736, + "step": 5421 + }, + { + "epoch": 0.37980158834397193, + "grad_norm": 3.8336403369903564, + "learning_rate": 6.205549562171629e-05, + "loss": 0.8416, + "num_input_tokens_seen": 87252120, + "step": 5422 + }, + { + "epoch": 0.37987163658970113, + "grad_norm": 7.380855083465576, + "learning_rate": 6.204849737302978e-05, + "loss": 1.1149, + "num_input_tokens_seen": 87268504, + "step": 5423 + }, + { + "epoch": 0.3799416848354304, + "grad_norm": 4.780874729156494, + "learning_rate": 6.204149912434327e-05, + "loss": 0.9103, + "num_input_tokens_seen": 87284888, + "step": 5424 + }, + { + "epoch": 0.38001173308115965, + "grad_norm": 4.691160202026367, + "learning_rate": 6.203450087565674e-05, + "loss": 1.1994, + "num_input_tokens_seen": 87301272, + "step": 5425 + }, + { + "epoch": 0.3800817813268889, + "grad_norm": 3.592348098754883, + "learning_rate": 6.202750262697023e-05, + "loss": 1.017, + "num_input_tokens_seen": 87317288, + "step": 5426 + }, + { + "epoch": 0.3801518295726181, + "grad_norm": 4.750811576843262, + "learning_rate": 6.202050437828372e-05, + "loss": 1.2781, + "num_input_tokens_seen": 87332488, + "step": 5427 + }, + { + "epoch": 0.3802218778183474, + "grad_norm": 4.564239501953125, + "learning_rate": 6.20135061295972e-05, + "loss": 1.133, + "num_input_tokens_seen": 87348264, + "step": 5428 + }, + { + "epoch": 0.38029192606407664, + "grad_norm": 4.697380065917969, + "learning_rate": 6.200650788091068e-05, + "loss": 1.0091, + "num_input_tokens_seen": 87363920, + "step": 5429 + }, + { + "epoch": 0.3803619743098059, + "grad_norm": 4.026552677154541, + "learning_rate": 6.199950963222419e-05, + "loss": 1.2177, + "num_input_tokens_seen": 87379920, + "step": 5430 + }, + { + "epoch": 0.3804320225555351, + "grad_norm": 5.023289203643799, + "learning_rate": 6.199251138353766e-05, + "loss": 0.9822, + "num_input_tokens_seen": 87395848, + "step": 5431 + }, + { + "epoch": 0.38050207080126436, + "grad_norm": 3.6005523204803467, + "learning_rate": 6.198551313485114e-05, + "loss": 1.0709, + "num_input_tokens_seen": 87411632, + "step": 5432 + }, + { + "epoch": 0.3805721190469936, + "grad_norm": 4.094357967376709, + "learning_rate": 6.197851488616462e-05, + "loss": 1.1254, + "num_input_tokens_seen": 87426912, + "step": 5433 + }, + { + "epoch": 0.3806421672927229, + "grad_norm": 4.452909469604492, + "learning_rate": 6.197151663747811e-05, + "loss": 1.248, + "num_input_tokens_seen": 87443296, + "step": 5434 + }, + { + "epoch": 0.3807122155384521, + "grad_norm": 3.975532054901123, + "learning_rate": 6.19645183887916e-05, + "loss": 1.0786, + "num_input_tokens_seen": 87459680, + "step": 5435 + }, + { + "epoch": 0.38078226378418134, + "grad_norm": 4.745920181274414, + "learning_rate": 6.195752014010509e-05, + "loss": 1.2534, + "num_input_tokens_seen": 87476064, + "step": 5436 + }, + { + "epoch": 0.3808523120299106, + "grad_norm": 3.8793790340423584, + "learning_rate": 6.195052189141858e-05, + "loss": 1.1197, + "num_input_tokens_seen": 87492448, + "step": 5437 + }, + { + "epoch": 0.38092236027563986, + "grad_norm": 4.695518493652344, + "learning_rate": 6.194352364273205e-05, + "loss": 0.93, + "num_input_tokens_seen": 87508832, + "step": 5438 + }, + { + "epoch": 0.3809924085213691, + "grad_norm": 3.5820047855377197, + "learning_rate": 6.193652539404553e-05, + "loss": 1.0007, + "num_input_tokens_seen": 87524728, + "step": 5439 + }, + { + "epoch": 0.3810624567670983, + "grad_norm": 5.76292610168457, + "learning_rate": 6.192952714535902e-05, + "loss": 1.1919, + "num_input_tokens_seen": 87540752, + "step": 5440 + }, + { + "epoch": 0.3811325050128276, + "grad_norm": 4.334653377532959, + "learning_rate": 6.19225288966725e-05, + "loss": 0.9847, + "num_input_tokens_seen": 87556384, + "step": 5441 + }, + { + "epoch": 0.38120255325855684, + "grad_norm": 3.7438180446624756, + "learning_rate": 6.1915530647986e-05, + "loss": 1.0083, + "num_input_tokens_seen": 87572320, + "step": 5442 + }, + { + "epoch": 0.3812726015042861, + "grad_norm": 4.082560062408447, + "learning_rate": 6.190853239929948e-05, + "loss": 0.8908, + "num_input_tokens_seen": 87588704, + "step": 5443 + }, + { + "epoch": 0.3813426497500153, + "grad_norm": 3.9324755668640137, + "learning_rate": 6.190153415061297e-05, + "loss": 1.0991, + "num_input_tokens_seen": 87605088, + "step": 5444 + }, + { + "epoch": 0.38141269799574457, + "grad_norm": 5.329967498779297, + "learning_rate": 6.189453590192645e-05, + "loss": 1.1817, + "num_input_tokens_seen": 87621472, + "step": 5445 + }, + { + "epoch": 0.3814827462414738, + "grad_norm": 3.627267837524414, + "learning_rate": 6.188753765323993e-05, + "loss": 0.9407, + "num_input_tokens_seen": 87637856, + "step": 5446 + }, + { + "epoch": 0.3815527944872031, + "grad_norm": 3.6728835105895996, + "learning_rate": 6.188053940455342e-05, + "loss": 0.8623, + "num_input_tokens_seen": 87653720, + "step": 5447 + }, + { + "epoch": 0.3816228427329323, + "grad_norm": 3.556185245513916, + "learning_rate": 6.18735411558669e-05, + "loss": 0.9531, + "num_input_tokens_seen": 87670104, + "step": 5448 + }, + { + "epoch": 0.38169289097866155, + "grad_norm": 4.075231552124023, + "learning_rate": 6.186654290718039e-05, + "loss": 1.0284, + "num_input_tokens_seen": 87686488, + "step": 5449 + }, + { + "epoch": 0.3817629392243908, + "grad_norm": 3.981752395629883, + "learning_rate": 6.185954465849388e-05, + "loss": 1.0822, + "num_input_tokens_seen": 87702872, + "step": 5450 + }, + { + "epoch": 0.38183298747012007, + "grad_norm": 4.75683069229126, + "learning_rate": 6.185254640980736e-05, + "loss": 0.9611, + "num_input_tokens_seen": 87718912, + "step": 5451 + }, + { + "epoch": 0.38190303571584927, + "grad_norm": 6.081716060638428, + "learning_rate": 6.184554816112084e-05, + "loss": 1.092, + "num_input_tokens_seen": 87735160, + "step": 5452 + }, + { + "epoch": 0.38197308396157853, + "grad_norm": 6.651247978210449, + "learning_rate": 6.183854991243433e-05, + "loss": 1.0397, + "num_input_tokens_seen": 87749232, + "step": 5453 + }, + { + "epoch": 0.3820431322073078, + "grad_norm": 4.12028694152832, + "learning_rate": 6.183155166374782e-05, + "loss": 1.094, + "num_input_tokens_seen": 87765328, + "step": 5454 + }, + { + "epoch": 0.38211318045303705, + "grad_norm": 6.3344645500183105, + "learning_rate": 6.18245534150613e-05, + "loss": 1.0275, + "num_input_tokens_seen": 87781712, + "step": 5455 + }, + { + "epoch": 0.38218322869876625, + "grad_norm": 3.745476007461548, + "learning_rate": 6.181755516637478e-05, + "loss": 0.9485, + "num_input_tokens_seen": 87798032, + "step": 5456 + }, + { + "epoch": 0.3822532769444955, + "grad_norm": 3.515174388885498, + "learning_rate": 6.181055691768828e-05, + "loss": 1.1138, + "num_input_tokens_seen": 87814416, + "step": 5457 + }, + { + "epoch": 0.38232332519022477, + "grad_norm": 4.101998329162598, + "learning_rate": 6.180355866900176e-05, + "loss": 0.9787, + "num_input_tokens_seen": 87830504, + "step": 5458 + }, + { + "epoch": 0.38239337343595403, + "grad_norm": 4.045940399169922, + "learning_rate": 6.179656042031523e-05, + "loss": 1.1278, + "num_input_tokens_seen": 87846264, + "step": 5459 + }, + { + "epoch": 0.38246342168168324, + "grad_norm": 8.09753131866455, + "learning_rate": 6.178956217162872e-05, + "loss": 1.131, + "num_input_tokens_seen": 87861856, + "step": 5460 + }, + { + "epoch": 0.3825334699274125, + "grad_norm": 5.395979404449463, + "learning_rate": 6.178256392294221e-05, + "loss": 1.0364, + "num_input_tokens_seen": 87878024, + "step": 5461 + }, + { + "epoch": 0.38260351817314175, + "grad_norm": 3.452855110168457, + "learning_rate": 6.17755656742557e-05, + "loss": 0.8875, + "num_input_tokens_seen": 87894408, + "step": 5462 + }, + { + "epoch": 0.382673566418871, + "grad_norm": 3.9877512454986572, + "learning_rate": 6.176856742556919e-05, + "loss": 1.0349, + "num_input_tokens_seen": 87910400, + "step": 5463 + }, + { + "epoch": 0.3827436146646002, + "grad_norm": 3.9095492362976074, + "learning_rate": 6.176156917688268e-05, + "loss": 1.063, + "num_input_tokens_seen": 87926040, + "step": 5464 + }, + { + "epoch": 0.3828136629103295, + "grad_norm": 4.558162212371826, + "learning_rate": 6.175457092819615e-05, + "loss": 1.1126, + "num_input_tokens_seen": 87942424, + "step": 5465 + }, + { + "epoch": 0.38288371115605874, + "grad_norm": 3.786123275756836, + "learning_rate": 6.174757267950963e-05, + "loss": 1.0414, + "num_input_tokens_seen": 87958808, + "step": 5466 + }, + { + "epoch": 0.382953759401788, + "grad_norm": 4.0291056632995605, + "learning_rate": 6.174057443082311e-05, + "loss": 1.0603, + "num_input_tokens_seen": 87975192, + "step": 5467 + }, + { + "epoch": 0.3830238076475172, + "grad_norm": 3.698666572570801, + "learning_rate": 6.17335761821366e-05, + "loss": 0.9187, + "num_input_tokens_seen": 87991504, + "step": 5468 + }, + { + "epoch": 0.38309385589324646, + "grad_norm": 3.7802882194519043, + "learning_rate": 6.172657793345009e-05, + "loss": 0.9568, + "num_input_tokens_seen": 88007888, + "step": 5469 + }, + { + "epoch": 0.3831639041389757, + "grad_norm": 4.754447937011719, + "learning_rate": 6.171957968476358e-05, + "loss": 1.1788, + "num_input_tokens_seen": 88023832, + "step": 5470 + }, + { + "epoch": 0.383233952384705, + "grad_norm": 3.502560615539551, + "learning_rate": 6.171258143607707e-05, + "loss": 0.9993, + "num_input_tokens_seen": 88040216, + "step": 5471 + }, + { + "epoch": 0.38330400063043424, + "grad_norm": 4.379989147186279, + "learning_rate": 6.170558318739054e-05, + "loss": 1.0609, + "num_input_tokens_seen": 88055768, + "step": 5472 + }, + { + "epoch": 0.38337404887616344, + "grad_norm": 3.3798177242279053, + "learning_rate": 6.169858493870403e-05, + "loss": 0.6884, + "num_input_tokens_seen": 88072152, + "step": 5473 + }, + { + "epoch": 0.3834440971218927, + "grad_norm": 4.265483856201172, + "learning_rate": 6.169158669001752e-05, + "loss": 1.0405, + "num_input_tokens_seen": 88087816, + "step": 5474 + }, + { + "epoch": 0.38351414536762196, + "grad_norm": 4.468397617340088, + "learning_rate": 6.168458844133101e-05, + "loss": 0.885, + "num_input_tokens_seen": 88103160, + "step": 5475 + }, + { + "epoch": 0.3835841936133512, + "grad_norm": 3.888359546661377, + "learning_rate": 6.167759019264448e-05, + "loss": 1.1768, + "num_input_tokens_seen": 88119544, + "step": 5476 + }, + { + "epoch": 0.3836542418590804, + "grad_norm": 3.7953927516937256, + "learning_rate": 6.167059194395797e-05, + "loss": 1.1585, + "num_input_tokens_seen": 88135928, + "step": 5477 + }, + { + "epoch": 0.3837242901048097, + "grad_norm": 3.7742021083831787, + "learning_rate": 6.166359369527146e-05, + "loss": 0.9201, + "num_input_tokens_seen": 88151928, + "step": 5478 + }, + { + "epoch": 0.38379433835053894, + "grad_norm": 3.811535120010376, + "learning_rate": 6.165659544658494e-05, + "loss": 0.9211, + "num_input_tokens_seen": 88168312, + "step": 5479 + }, + { + "epoch": 0.3838643865962682, + "grad_norm": 5.1758646965026855, + "learning_rate": 6.164959719789842e-05, + "loss": 1.2087, + "num_input_tokens_seen": 88184064, + "step": 5480 + }, + { + "epoch": 0.3839344348419974, + "grad_norm": 4.529813289642334, + "learning_rate": 6.164259894921191e-05, + "loss": 0.9142, + "num_input_tokens_seen": 88200216, + "step": 5481 + }, + { + "epoch": 0.38400448308772667, + "grad_norm": 4.426999568939209, + "learning_rate": 6.16356007005254e-05, + "loss": 1.0046, + "num_input_tokens_seen": 88215568, + "step": 5482 + }, + { + "epoch": 0.3840745313334559, + "grad_norm": 4.73276948928833, + "learning_rate": 6.162860245183888e-05, + "loss": 1.0082, + "num_input_tokens_seen": 88231952, + "step": 5483 + }, + { + "epoch": 0.3841445795791852, + "grad_norm": 3.6280384063720703, + "learning_rate": 6.162160420315238e-05, + "loss": 0.9015, + "num_input_tokens_seen": 88247728, + "step": 5484 + }, + { + "epoch": 0.3842146278249144, + "grad_norm": 3.6947717666625977, + "learning_rate": 6.161460595446585e-05, + "loss": 0.9671, + "num_input_tokens_seen": 88263472, + "step": 5485 + }, + { + "epoch": 0.38428467607064365, + "grad_norm": 3.683591842651367, + "learning_rate": 6.160760770577933e-05, + "loss": 1.1844, + "num_input_tokens_seen": 88279856, + "step": 5486 + }, + { + "epoch": 0.3843547243163729, + "grad_norm": 6.020013332366943, + "learning_rate": 6.160060945709282e-05, + "loss": 1.0372, + "num_input_tokens_seen": 88295864, + "step": 5487 + }, + { + "epoch": 0.38442477256210217, + "grad_norm": 8.429437637329102, + "learning_rate": 6.15936112084063e-05, + "loss": 1.143, + "num_input_tokens_seen": 88311752, + "step": 5488 + }, + { + "epoch": 0.3844948208078314, + "grad_norm": 3.679159164428711, + "learning_rate": 6.15866129597198e-05, + "loss": 0.9212, + "num_input_tokens_seen": 88327784, + "step": 5489 + }, + { + "epoch": 0.38456486905356063, + "grad_norm": 4.131216526031494, + "learning_rate": 6.157961471103328e-05, + "loss": 1.0983, + "num_input_tokens_seen": 88343480, + "step": 5490 + }, + { + "epoch": 0.3846349172992899, + "grad_norm": 4.294956684112549, + "learning_rate": 6.157261646234677e-05, + "loss": 1.224, + "num_input_tokens_seen": 88359864, + "step": 5491 + }, + { + "epoch": 0.38470496554501915, + "grad_norm": 4.683321952819824, + "learning_rate": 6.156561821366025e-05, + "loss": 1.0377, + "num_input_tokens_seen": 88375184, + "step": 5492 + }, + { + "epoch": 0.38477501379074835, + "grad_norm": 3.932366371154785, + "learning_rate": 6.155861996497372e-05, + "loss": 1.1341, + "num_input_tokens_seen": 88391568, + "step": 5493 + }, + { + "epoch": 0.3848450620364776, + "grad_norm": 4.191849231719971, + "learning_rate": 6.155162171628721e-05, + "loss": 1.2247, + "num_input_tokens_seen": 88407680, + "step": 5494 + }, + { + "epoch": 0.3849151102822069, + "grad_norm": 3.983915090560913, + "learning_rate": 6.154462346760071e-05, + "loss": 1.0115, + "num_input_tokens_seen": 88422888, + "step": 5495 + }, + { + "epoch": 0.38498515852793613, + "grad_norm": 4.163250923156738, + "learning_rate": 6.153762521891419e-05, + "loss": 1.057, + "num_input_tokens_seen": 88439272, + "step": 5496 + }, + { + "epoch": 0.38505520677366534, + "grad_norm": 6.113068580627441, + "learning_rate": 6.153062697022768e-05, + "loss": 1.0971, + "num_input_tokens_seen": 88455656, + "step": 5497 + }, + { + "epoch": 0.3851252550193946, + "grad_norm": 5.32371187210083, + "learning_rate": 6.152362872154117e-05, + "loss": 1.1886, + "num_input_tokens_seen": 88472040, + "step": 5498 + }, + { + "epoch": 0.38519530326512386, + "grad_norm": 6.110095500946045, + "learning_rate": 6.151663047285464e-05, + "loss": 0.9587, + "num_input_tokens_seen": 88487720, + "step": 5499 + }, + { + "epoch": 0.3852653515108531, + "grad_norm": 3.9656851291656494, + "learning_rate": 6.150963222416813e-05, + "loss": 0.9635, + "num_input_tokens_seen": 88504104, + "step": 5500 + }, + { + "epoch": 0.3853353997565823, + "grad_norm": 4.601620197296143, + "learning_rate": 6.150263397548162e-05, + "loss": 1.2542, + "num_input_tokens_seen": 88520160, + "step": 5501 + }, + { + "epoch": 0.3854054480023116, + "grad_norm": 4.273797988891602, + "learning_rate": 6.14956357267951e-05, + "loss": 1.3405, + "num_input_tokens_seen": 88535832, + "step": 5502 + }, + { + "epoch": 0.38547549624804084, + "grad_norm": 4.023514747619629, + "learning_rate": 6.148863747810858e-05, + "loss": 1.1248, + "num_input_tokens_seen": 88552000, + "step": 5503 + }, + { + "epoch": 0.3855455444937701, + "grad_norm": 3.7229719161987305, + "learning_rate": 6.148163922942207e-05, + "loss": 1.1117, + "num_input_tokens_seen": 88567600, + "step": 5504 + }, + { + "epoch": 0.38561559273949936, + "grad_norm": 4.696394920349121, + "learning_rate": 6.147464098073556e-05, + "loss": 0.8945, + "num_input_tokens_seen": 88583224, + "step": 5505 + }, + { + "epoch": 0.38568564098522856, + "grad_norm": 5.354174613952637, + "learning_rate": 6.146764273204903e-05, + "loss": 1.079, + "num_input_tokens_seen": 88599608, + "step": 5506 + }, + { + "epoch": 0.3857556892309578, + "grad_norm": 4.717334747314453, + "learning_rate": 6.146064448336252e-05, + "loss": 1.1293, + "num_input_tokens_seen": 88615048, + "step": 5507 + }, + { + "epoch": 0.3858257374766871, + "grad_norm": 5.373983383178711, + "learning_rate": 6.145364623467601e-05, + "loss": 0.9625, + "num_input_tokens_seen": 88630888, + "step": 5508 + }, + { + "epoch": 0.38589578572241634, + "grad_norm": 4.338916301727295, + "learning_rate": 6.14466479859895e-05, + "loss": 1.0884, + "num_input_tokens_seen": 88647072, + "step": 5509 + }, + { + "epoch": 0.38596583396814554, + "grad_norm": 3.898721694946289, + "learning_rate": 6.143964973730297e-05, + "loss": 1.095, + "num_input_tokens_seen": 88663128, + "step": 5510 + }, + { + "epoch": 0.3860358822138748, + "grad_norm": 4.614948749542236, + "learning_rate": 6.143265148861648e-05, + "loss": 1.0729, + "num_input_tokens_seen": 88679512, + "step": 5511 + }, + { + "epoch": 0.38610593045960406, + "grad_norm": 5.1157732009887695, + "learning_rate": 6.142565323992995e-05, + "loss": 1.0776, + "num_input_tokens_seen": 88695664, + "step": 5512 + }, + { + "epoch": 0.3861759787053333, + "grad_norm": 4.29611873626709, + "learning_rate": 6.141865499124343e-05, + "loss": 1.0838, + "num_input_tokens_seen": 88711560, + "step": 5513 + }, + { + "epoch": 0.3862460269510625, + "grad_norm": 3.9464735984802246, + "learning_rate": 6.141165674255692e-05, + "loss": 1.1907, + "num_input_tokens_seen": 88727464, + "step": 5514 + }, + { + "epoch": 0.3863160751967918, + "grad_norm": 3.8381590843200684, + "learning_rate": 6.140465849387042e-05, + "loss": 1.1416, + "num_input_tokens_seen": 88743848, + "step": 5515 + }, + { + "epoch": 0.38638612344252105, + "grad_norm": 3.573434829711914, + "learning_rate": 6.139766024518389e-05, + "loss": 0.9316, + "num_input_tokens_seen": 88759312, + "step": 5516 + }, + { + "epoch": 0.3864561716882503, + "grad_norm": 4.257131576538086, + "learning_rate": 6.139066199649738e-05, + "loss": 0.9534, + "num_input_tokens_seen": 88775112, + "step": 5517 + }, + { + "epoch": 0.3865262199339795, + "grad_norm": 4.2985310554504395, + "learning_rate": 6.138366374781087e-05, + "loss": 0.9387, + "num_input_tokens_seen": 88791496, + "step": 5518 + }, + { + "epoch": 0.38659626817970877, + "grad_norm": 3.7012977600097656, + "learning_rate": 6.137666549912434e-05, + "loss": 1.0104, + "num_input_tokens_seen": 88807880, + "step": 5519 + }, + { + "epoch": 0.38666631642543803, + "grad_norm": 5.4860453605651855, + "learning_rate": 6.136966725043782e-05, + "loss": 1.1978, + "num_input_tokens_seen": 88823392, + "step": 5520 + }, + { + "epoch": 0.3867363646711673, + "grad_norm": 4.165813446044922, + "learning_rate": 6.136266900175132e-05, + "loss": 1.0184, + "num_input_tokens_seen": 88839352, + "step": 5521 + }, + { + "epoch": 0.3868064129168965, + "grad_norm": 3.6253862380981445, + "learning_rate": 6.135567075306481e-05, + "loss": 0.9544, + "num_input_tokens_seen": 88855736, + "step": 5522 + }, + { + "epoch": 0.38687646116262575, + "grad_norm": 3.834057331085205, + "learning_rate": 6.134867250437829e-05, + "loss": 1.1863, + "num_input_tokens_seen": 88871952, + "step": 5523 + }, + { + "epoch": 0.386946509408355, + "grad_norm": 4.534783363342285, + "learning_rate": 6.134167425569177e-05, + "loss": 0.85, + "num_input_tokens_seen": 88888336, + "step": 5524 + }, + { + "epoch": 0.38701655765408427, + "grad_norm": 5.4073381423950195, + "learning_rate": 6.133467600700526e-05, + "loss": 0.9257, + "num_input_tokens_seen": 88904256, + "step": 5525 + }, + { + "epoch": 0.3870866058998135, + "grad_norm": 3.819841146469116, + "learning_rate": 6.132767775831874e-05, + "loss": 1.0911, + "num_input_tokens_seen": 88920640, + "step": 5526 + }, + { + "epoch": 0.38715665414554273, + "grad_norm": 3.814857244491577, + "learning_rate": 6.132067950963223e-05, + "loss": 1.2414, + "num_input_tokens_seen": 88937024, + "step": 5527 + }, + { + "epoch": 0.387226702391272, + "grad_norm": 3.682535171508789, + "learning_rate": 6.131368126094571e-05, + "loss": 0.9158, + "num_input_tokens_seen": 88952712, + "step": 5528 + }, + { + "epoch": 0.38729675063700125, + "grad_norm": 3.5657262802124023, + "learning_rate": 6.13066830122592e-05, + "loss": 0.9336, + "num_input_tokens_seen": 88969096, + "step": 5529 + }, + { + "epoch": 0.38736679888273046, + "grad_norm": 3.851977825164795, + "learning_rate": 6.129968476357268e-05, + "loss": 1.0546, + "num_input_tokens_seen": 88985480, + "step": 5530 + }, + { + "epoch": 0.3874368471284597, + "grad_norm": 4.079189777374268, + "learning_rate": 6.129268651488617e-05, + "loss": 0.856, + "num_input_tokens_seen": 89001104, + "step": 5531 + }, + { + "epoch": 0.387506895374189, + "grad_norm": 4.388980865478516, + "learning_rate": 6.128568826619966e-05, + "loss": 1.0785, + "num_input_tokens_seen": 89017232, + "step": 5532 + }, + { + "epoch": 0.38757694361991823, + "grad_norm": 3.6747231483459473, + "learning_rate": 6.127869001751313e-05, + "loss": 1.1171, + "num_input_tokens_seen": 89033576, + "step": 5533 + }, + { + "epoch": 0.38764699186564744, + "grad_norm": 4.62367057800293, + "learning_rate": 6.127169176882662e-05, + "loss": 1.138, + "num_input_tokens_seen": 89049224, + "step": 5534 + }, + { + "epoch": 0.3877170401113767, + "grad_norm": 3.8601040840148926, + "learning_rate": 6.126469352014011e-05, + "loss": 1.0254, + "num_input_tokens_seen": 89064968, + "step": 5535 + }, + { + "epoch": 0.38778708835710596, + "grad_norm": 5.132208347320557, + "learning_rate": 6.12576952714536e-05, + "loss": 1.0121, + "num_input_tokens_seen": 89081352, + "step": 5536 + }, + { + "epoch": 0.3878571366028352, + "grad_norm": 3.9259984493255615, + "learning_rate": 6.125069702276707e-05, + "loss": 0.9146, + "num_input_tokens_seen": 89097696, + "step": 5537 + }, + { + "epoch": 0.3879271848485644, + "grad_norm": 3.9004077911376953, + "learning_rate": 6.124369877408057e-05, + "loss": 1.0059, + "num_input_tokens_seen": 89114080, + "step": 5538 + }, + { + "epoch": 0.3879972330942937, + "grad_norm": 4.657776355743408, + "learning_rate": 6.123670052539405e-05, + "loss": 1.0612, + "num_input_tokens_seen": 89129584, + "step": 5539 + }, + { + "epoch": 0.38806728134002294, + "grad_norm": 3.4758501052856445, + "learning_rate": 6.122970227670752e-05, + "loss": 1.0179, + "num_input_tokens_seen": 89145968, + "step": 5540 + }, + { + "epoch": 0.3881373295857522, + "grad_norm": 3.949275255203247, + "learning_rate": 6.122270402802101e-05, + "loss": 1.0336, + "num_input_tokens_seen": 89161904, + "step": 5541 + }, + { + "epoch": 0.38820737783148146, + "grad_norm": 5.620425224304199, + "learning_rate": 6.121570577933451e-05, + "loss": 0.9776, + "num_input_tokens_seen": 89178032, + "step": 5542 + }, + { + "epoch": 0.38827742607721066, + "grad_norm": 5.1215643882751465, + "learning_rate": 6.120870753064799e-05, + "loss": 1.0577, + "num_input_tokens_seen": 89193568, + "step": 5543 + }, + { + "epoch": 0.3883474743229399, + "grad_norm": 3.994556427001953, + "learning_rate": 6.120170928196148e-05, + "loss": 1.0631, + "num_input_tokens_seen": 89209952, + "step": 5544 + }, + { + "epoch": 0.3884175225686692, + "grad_norm": 6.86944055557251, + "learning_rate": 6.119471103327497e-05, + "loss": 1.208, + "num_input_tokens_seen": 89226336, + "step": 5545 + }, + { + "epoch": 0.38848757081439844, + "grad_norm": 3.72501540184021, + "learning_rate": 6.118771278458844e-05, + "loss": 1.0198, + "num_input_tokens_seen": 89242720, + "step": 5546 + }, + { + "epoch": 0.38855761906012765, + "grad_norm": 3.6887834072113037, + "learning_rate": 6.118071453590193e-05, + "loss": 1.0964, + "num_input_tokens_seen": 89258536, + "step": 5547 + }, + { + "epoch": 0.3886276673058569, + "grad_norm": 5.15130615234375, + "learning_rate": 6.117371628721542e-05, + "loss": 1.0193, + "num_input_tokens_seen": 89274920, + "step": 5548 + }, + { + "epoch": 0.38869771555158616, + "grad_norm": 3.7503981590270996, + "learning_rate": 6.116671803852891e-05, + "loss": 0.9457, + "num_input_tokens_seen": 89291304, + "step": 5549 + }, + { + "epoch": 0.3887677637973154, + "grad_norm": 4.851298809051514, + "learning_rate": 6.115971978984238e-05, + "loss": 1.132, + "num_input_tokens_seen": 89307080, + "step": 5550 + }, + { + "epoch": 0.38883781204304463, + "grad_norm": 3.72981858253479, + "learning_rate": 6.115272154115587e-05, + "loss": 1.0371, + "num_input_tokens_seen": 89323464, + "step": 5551 + }, + { + "epoch": 0.3889078602887739, + "grad_norm": 4.1301140785217285, + "learning_rate": 6.114572329246936e-05, + "loss": 0.9746, + "num_input_tokens_seen": 89339696, + "step": 5552 + }, + { + "epoch": 0.38897790853450315, + "grad_norm": 4.225720405578613, + "learning_rate": 6.113872504378283e-05, + "loss": 1.124, + "num_input_tokens_seen": 89356080, + "step": 5553 + }, + { + "epoch": 0.3890479567802324, + "grad_norm": 3.7197327613830566, + "learning_rate": 6.113172679509632e-05, + "loss": 1.0739, + "num_input_tokens_seen": 89372464, + "step": 5554 + }, + { + "epoch": 0.3891180050259616, + "grad_norm": 4.626903057098389, + "learning_rate": 6.112472854640981e-05, + "loss": 0.9896, + "num_input_tokens_seen": 89388848, + "step": 5555 + }, + { + "epoch": 0.38918805327169087, + "grad_norm": 4.229621410369873, + "learning_rate": 6.111773029772329e-05, + "loss": 0.9925, + "num_input_tokens_seen": 89405112, + "step": 5556 + }, + { + "epoch": 0.38925810151742013, + "grad_norm": 3.5502984523773193, + "learning_rate": 6.111073204903678e-05, + "loss": 0.9966, + "num_input_tokens_seen": 89421496, + "step": 5557 + }, + { + "epoch": 0.3893281497631494, + "grad_norm": 4.251241207122803, + "learning_rate": 6.110373380035026e-05, + "loss": 1.0249, + "num_input_tokens_seen": 89437880, + "step": 5558 + }, + { + "epoch": 0.3893981980088786, + "grad_norm": 5.076200485229492, + "learning_rate": 6.109673555166375e-05, + "loss": 1.0592, + "num_input_tokens_seen": 89454264, + "step": 5559 + }, + { + "epoch": 0.38946824625460785, + "grad_norm": 4.018000602722168, + "learning_rate": 6.108973730297723e-05, + "loss": 1.197, + "num_input_tokens_seen": 89470648, + "step": 5560 + }, + { + "epoch": 0.3895382945003371, + "grad_norm": 4.3367180824279785, + "learning_rate": 6.108273905429072e-05, + "loss": 1.0778, + "num_input_tokens_seen": 89487032, + "step": 5561 + }, + { + "epoch": 0.38960834274606637, + "grad_norm": 6.027153015136719, + "learning_rate": 6.10757408056042e-05, + "loss": 0.7955, + "num_input_tokens_seen": 89502064, + "step": 5562 + }, + { + "epoch": 0.3896783909917956, + "grad_norm": 3.499268054962158, + "learning_rate": 6.10687425569177e-05, + "loss": 0.935, + "num_input_tokens_seen": 89518448, + "step": 5563 + }, + { + "epoch": 0.38974843923752484, + "grad_norm": 3.3691868782043457, + "learning_rate": 6.106174430823117e-05, + "loss": 0.9249, + "num_input_tokens_seen": 89534640, + "step": 5564 + }, + { + "epoch": 0.3898184874832541, + "grad_norm": 3.4140114784240723, + "learning_rate": 6.105474605954467e-05, + "loss": 0.9594, + "num_input_tokens_seen": 89551024, + "step": 5565 + }, + { + "epoch": 0.38988853572898335, + "grad_norm": 4.049834728240967, + "learning_rate": 6.104774781085815e-05, + "loss": 1.1637, + "num_input_tokens_seen": 89567408, + "step": 5566 + }, + { + "epoch": 0.38995858397471256, + "grad_norm": 4.825027942657471, + "learning_rate": 6.104074956217162e-05, + "loss": 1.0514, + "num_input_tokens_seen": 89583792, + "step": 5567 + }, + { + "epoch": 0.3900286322204418, + "grad_norm": 5.281174659729004, + "learning_rate": 6.103375131348512e-05, + "loss": 1.0534, + "num_input_tokens_seen": 89600176, + "step": 5568 + }, + { + "epoch": 0.3900986804661711, + "grad_norm": 3.567270278930664, + "learning_rate": 6.1026753064798605e-05, + "loss": 0.9217, + "num_input_tokens_seen": 89616560, + "step": 5569 + }, + { + "epoch": 0.39016872871190034, + "grad_norm": 5.449852466583252, + "learning_rate": 6.1019754816112086e-05, + "loss": 1.0362, + "num_input_tokens_seen": 89631968, + "step": 5570 + }, + { + "epoch": 0.39023877695762954, + "grad_norm": 4.016347885131836, + "learning_rate": 6.1012756567425575e-05, + "loss": 0.9634, + "num_input_tokens_seen": 89646712, + "step": 5571 + }, + { + "epoch": 0.3903088252033588, + "grad_norm": 3.8826510906219482, + "learning_rate": 6.100575831873906e-05, + "loss": 1.1645, + "num_input_tokens_seen": 89662776, + "step": 5572 + }, + { + "epoch": 0.39037887344908806, + "grad_norm": 3.80755615234375, + "learning_rate": 6.099876007005254e-05, + "loss": 1.0404, + "num_input_tokens_seen": 89679096, + "step": 5573 + }, + { + "epoch": 0.3904489216948173, + "grad_norm": 3.7274065017700195, + "learning_rate": 6.099176182136602e-05, + "loss": 1.1665, + "num_input_tokens_seen": 89695048, + "step": 5574 + }, + { + "epoch": 0.3905189699405466, + "grad_norm": 4.335930824279785, + "learning_rate": 6.09847635726795e-05, + "loss": 1.0662, + "num_input_tokens_seen": 89711432, + "step": 5575 + }, + { + "epoch": 0.3905890181862758, + "grad_norm": 3.8839964866638184, + "learning_rate": 6.0977765323993e-05, + "loss": 1.0635, + "num_input_tokens_seen": 89727712, + "step": 5576 + }, + { + "epoch": 0.39065906643200504, + "grad_norm": 4.8028035163879395, + "learning_rate": 6.097076707530648e-05, + "loss": 1.0906, + "num_input_tokens_seen": 89744096, + "step": 5577 + }, + { + "epoch": 0.3907291146777343, + "grad_norm": 4.042201519012451, + "learning_rate": 6.096376882661997e-05, + "loss": 0.8609, + "num_input_tokens_seen": 89758688, + "step": 5578 + }, + { + "epoch": 0.39079916292346356, + "grad_norm": 4.1316986083984375, + "learning_rate": 6.095677057793345e-05, + "loss": 0.9509, + "num_input_tokens_seen": 89774720, + "step": 5579 + }, + { + "epoch": 0.39086921116919277, + "grad_norm": 5.164004802703857, + "learning_rate": 6.094977232924693e-05, + "loss": 1.1927, + "num_input_tokens_seen": 89788480, + "step": 5580 + }, + { + "epoch": 0.390939259414922, + "grad_norm": 4.125234127044678, + "learning_rate": 6.094277408056043e-05, + "loss": 0.9237, + "num_input_tokens_seen": 89803000, + "step": 5581 + }, + { + "epoch": 0.3910093076606513, + "grad_norm": 4.798699855804443, + "learning_rate": 6.093577583187392e-05, + "loss": 1.1069, + "num_input_tokens_seen": 89818288, + "step": 5582 + }, + { + "epoch": 0.39107935590638054, + "grad_norm": 4.383975028991699, + "learning_rate": 6.0928777583187404e-05, + "loss": 0.9997, + "num_input_tokens_seen": 89833616, + "step": 5583 + }, + { + "epoch": 0.39114940415210975, + "grad_norm": 4.20830774307251, + "learning_rate": 6.092177933450087e-05, + "loss": 0.8794, + "num_input_tokens_seen": 89849200, + "step": 5584 + }, + { + "epoch": 0.391219452397839, + "grad_norm": 4.470288276672363, + "learning_rate": 6.091478108581437e-05, + "loss": 1.0974, + "num_input_tokens_seen": 89865184, + "step": 5585 + }, + { + "epoch": 0.39128950064356827, + "grad_norm": 4.8457112312316895, + "learning_rate": 6.090778283712785e-05, + "loss": 1.1091, + "num_input_tokens_seen": 89881184, + "step": 5586 + }, + { + "epoch": 0.3913595488892975, + "grad_norm": 4.112722873687744, + "learning_rate": 6.090078458844133e-05, + "loss": 1.0189, + "num_input_tokens_seen": 89897568, + "step": 5587 + }, + { + "epoch": 0.39142959713502673, + "grad_norm": 4.070732116699219, + "learning_rate": 6.089378633975482e-05, + "loss": 1.0694, + "num_input_tokens_seen": 89913952, + "step": 5588 + }, + { + "epoch": 0.391499645380756, + "grad_norm": 4.092299461364746, + "learning_rate": 6.08867880910683e-05, + "loss": 1.0922, + "num_input_tokens_seen": 89929216, + "step": 5589 + }, + { + "epoch": 0.39156969362648525, + "grad_norm": 3.8092305660247803, + "learning_rate": 6.08797898423818e-05, + "loss": 1.0041, + "num_input_tokens_seen": 89945376, + "step": 5590 + }, + { + "epoch": 0.3916397418722145, + "grad_norm": 5.461154937744141, + "learning_rate": 6.0872791593695265e-05, + "loss": 1.1393, + "num_input_tokens_seen": 89960296, + "step": 5591 + }, + { + "epoch": 0.3917097901179437, + "grad_norm": 5.2103190422058105, + "learning_rate": 6.086579334500877e-05, + "loss": 1.079, + "num_input_tokens_seen": 89975864, + "step": 5592 + }, + { + "epoch": 0.39177983836367297, + "grad_norm": 3.6308488845825195, + "learning_rate": 6.085879509632225e-05, + "loss": 1.0344, + "num_input_tokens_seen": 89991880, + "step": 5593 + }, + { + "epoch": 0.39184988660940223, + "grad_norm": 4.632900714874268, + "learning_rate": 6.0851796847635724e-05, + "loss": 0.8255, + "num_input_tokens_seen": 90006560, + "step": 5594 + }, + { + "epoch": 0.3919199348551315, + "grad_norm": 3.8614165782928467, + "learning_rate": 6.084479859894921e-05, + "loss": 1.1599, + "num_input_tokens_seen": 90022136, + "step": 5595 + }, + { + "epoch": 0.3919899831008607, + "grad_norm": 3.768287420272827, + "learning_rate": 6.0837800350262695e-05, + "loss": 1.0694, + "num_input_tokens_seen": 90038520, + "step": 5596 + }, + { + "epoch": 0.39206003134658995, + "grad_norm": 3.355902671813965, + "learning_rate": 6.083080210157618e-05, + "loss": 0.944, + "num_input_tokens_seen": 90054592, + "step": 5597 + }, + { + "epoch": 0.3921300795923192, + "grad_norm": 3.2001609802246094, + "learning_rate": 6.082380385288967e-05, + "loss": 0.8642, + "num_input_tokens_seen": 90070976, + "step": 5598 + }, + { + "epoch": 0.3922001278380485, + "grad_norm": 3.74692440032959, + "learning_rate": 6.081680560420317e-05, + "loss": 0.9807, + "num_input_tokens_seen": 90087360, + "step": 5599 + }, + { + "epoch": 0.3922701760837777, + "grad_norm": 5.602208614349365, + "learning_rate": 6.080980735551665e-05, + "loss": 1.2451, + "num_input_tokens_seen": 90103744, + "step": 5600 + }, + { + "epoch": 0.3922701760837777, + "eval_loss": 1.127113699913025, + "eval_runtime": 0.2033, + "eval_samples_per_second": 4.919, + "eval_steps_per_second": 4.919, + "num_input_tokens_seen": 90103744, + "step": 5600 + }, + { + "epoch": 0.39234022432950694, + "grad_norm": 3.727559804916382, + "learning_rate": 6.080280910683012e-05, + "loss": 1.0777, + "num_input_tokens_seen": 90119400, + "step": 5601 + }, + { + "epoch": 0.3924102725752362, + "grad_norm": 4.577515125274658, + "learning_rate": 6.0795810858143606e-05, + "loss": 1.0032, + "num_input_tokens_seen": 90135168, + "step": 5602 + }, + { + "epoch": 0.39248032082096546, + "grad_norm": 5.225588798522949, + "learning_rate": 6.07888126094571e-05, + "loss": 1.0964, + "num_input_tokens_seen": 90151480, + "step": 5603 + }, + { + "epoch": 0.39255036906669466, + "grad_norm": 3.6131844520568848, + "learning_rate": 6.078181436077057e-05, + "loss": 0.9255, + "num_input_tokens_seen": 90167864, + "step": 5604 + }, + { + "epoch": 0.3926204173124239, + "grad_norm": 4.127248287200928, + "learning_rate": 6.0774816112084065e-05, + "loss": 1.1939, + "num_input_tokens_seen": 90184248, + "step": 5605 + }, + { + "epoch": 0.3926904655581532, + "grad_norm": 4.599911689758301, + "learning_rate": 6.076781786339756e-05, + "loss": 1.2819, + "num_input_tokens_seen": 90199816, + "step": 5606 + }, + { + "epoch": 0.39276051380388244, + "grad_norm": 3.7179722785949707, + "learning_rate": 6.076081961471104e-05, + "loss": 0.8559, + "num_input_tokens_seen": 90215640, + "step": 5607 + }, + { + "epoch": 0.39283056204961164, + "grad_norm": 3.872941493988037, + "learning_rate": 6.0753821366024524e-05, + "loss": 0.9986, + "num_input_tokens_seen": 90232024, + "step": 5608 + }, + { + "epoch": 0.3929006102953409, + "grad_norm": 3.7326548099517822, + "learning_rate": 6.074682311733801e-05, + "loss": 1.0295, + "num_input_tokens_seen": 90247536, + "step": 5609 + }, + { + "epoch": 0.39297065854107016, + "grad_norm": 4.05418062210083, + "learning_rate": 6.0739824868651494e-05, + "loss": 1.2199, + "num_input_tokens_seen": 90263920, + "step": 5610 + }, + { + "epoch": 0.3930407067867994, + "grad_norm": 5.326319694519043, + "learning_rate": 6.073282661996497e-05, + "loss": 1.0705, + "num_input_tokens_seen": 90280240, + "step": 5611 + }, + { + "epoch": 0.3931107550325287, + "grad_norm": 4.132864952087402, + "learning_rate": 6.072582837127846e-05, + "loss": 1.2377, + "num_input_tokens_seen": 90296128, + "step": 5612 + }, + { + "epoch": 0.3931808032782579, + "grad_norm": 3.7307562828063965, + "learning_rate": 6.0718830122591953e-05, + "loss": 0.9765, + "num_input_tokens_seen": 90312512, + "step": 5613 + }, + { + "epoch": 0.39325085152398714, + "grad_norm": 6.35123872756958, + "learning_rate": 6.0711831873905435e-05, + "loss": 0.9049, + "num_input_tokens_seen": 90328896, + "step": 5614 + }, + { + "epoch": 0.3933208997697164, + "grad_norm": 5.536827564239502, + "learning_rate": 6.070483362521892e-05, + "loss": 0.952, + "num_input_tokens_seen": 90344648, + "step": 5615 + }, + { + "epoch": 0.39339094801544566, + "grad_norm": 3.6394944190979004, + "learning_rate": 6.0697835376532406e-05, + "loss": 0.9446, + "num_input_tokens_seen": 90361032, + "step": 5616 + }, + { + "epoch": 0.39346099626117487, + "grad_norm": 3.4719443321228027, + "learning_rate": 6.069083712784589e-05, + "loss": 0.9859, + "num_input_tokens_seen": 90377192, + "step": 5617 + }, + { + "epoch": 0.3935310445069041, + "grad_norm": 6.195781230926514, + "learning_rate": 6.068383887915936e-05, + "loss": 0.9482, + "num_input_tokens_seen": 90393576, + "step": 5618 + }, + { + "epoch": 0.3936010927526334, + "grad_norm": 3.5569331645965576, + "learning_rate": 6.0676840630472865e-05, + "loss": 1.0244, + "num_input_tokens_seen": 90409960, + "step": 5619 + }, + { + "epoch": 0.39367114099836265, + "grad_norm": 5.535704135894775, + "learning_rate": 6.0669842381786346e-05, + "loss": 1.3058, + "num_input_tokens_seen": 90426216, + "step": 5620 + }, + { + "epoch": 0.39374118924409185, + "grad_norm": 3.81278395652771, + "learning_rate": 6.0662844133099815e-05, + "loss": 0.9343, + "num_input_tokens_seen": 90442320, + "step": 5621 + }, + { + "epoch": 0.3938112374898211, + "grad_norm": 4.093146324157715, + "learning_rate": 6.065584588441331e-05, + "loss": 1.0698, + "num_input_tokens_seen": 90458704, + "step": 5622 + }, + { + "epoch": 0.39388128573555037, + "grad_norm": 6.061161518096924, + "learning_rate": 6.06488476357268e-05, + "loss": 0.9046, + "num_input_tokens_seen": 90474408, + "step": 5623 + }, + { + "epoch": 0.39395133398127963, + "grad_norm": 3.763059616088867, + "learning_rate": 6.064184938704028e-05, + "loss": 1.002, + "num_input_tokens_seen": 90490592, + "step": 5624 + }, + { + "epoch": 0.39402138222700883, + "grad_norm": 5.287941932678223, + "learning_rate": 6.063485113835376e-05, + "loss": 1.0667, + "num_input_tokens_seen": 90506568, + "step": 5625 + }, + { + "epoch": 0.3940914304727381, + "grad_norm": 3.5837693214416504, + "learning_rate": 6.062785288966726e-05, + "loss": 1.1136, + "num_input_tokens_seen": 90522952, + "step": 5626 + }, + { + "epoch": 0.39416147871846735, + "grad_norm": 3.5523111820220947, + "learning_rate": 6.062085464098074e-05, + "loss": 1.0703, + "num_input_tokens_seen": 90539336, + "step": 5627 + }, + { + "epoch": 0.3942315269641966, + "grad_norm": 4.3880934715271, + "learning_rate": 6.061385639229421e-05, + "loss": 0.993, + "num_input_tokens_seen": 90555720, + "step": 5628 + }, + { + "epoch": 0.3943015752099258, + "grad_norm": 4.26425313949585, + "learning_rate": 6.060685814360772e-05, + "loss": 1.08, + "num_input_tokens_seen": 90572104, + "step": 5629 + }, + { + "epoch": 0.3943716234556551, + "grad_norm": 3.8837990760803223, + "learning_rate": 6.05998598949212e-05, + "loss": 0.9414, + "num_input_tokens_seen": 90588488, + "step": 5630 + }, + { + "epoch": 0.39444167170138433, + "grad_norm": 3.704282522201538, + "learning_rate": 6.059286164623468e-05, + "loss": 1.0037, + "num_input_tokens_seen": 90604872, + "step": 5631 + }, + { + "epoch": 0.3945117199471136, + "grad_norm": 5.933957099914551, + "learning_rate": 6.058586339754816e-05, + "loss": 1.0753, + "num_input_tokens_seen": 90621256, + "step": 5632 + }, + { + "epoch": 0.3945817681928428, + "grad_norm": 4.185206413269043, + "learning_rate": 6.057886514886165e-05, + "loss": 1.0044, + "num_input_tokens_seen": 90637640, + "step": 5633 + }, + { + "epoch": 0.39465181643857206, + "grad_norm": 3.97603440284729, + "learning_rate": 6.057186690017513e-05, + "loss": 1.2243, + "num_input_tokens_seen": 90654024, + "step": 5634 + }, + { + "epoch": 0.3947218646843013, + "grad_norm": 3.394630193710327, + "learning_rate": 6.056486865148863e-05, + "loss": 0.9702, + "num_input_tokens_seen": 90670008, + "step": 5635 + }, + { + "epoch": 0.3947919129300306, + "grad_norm": 3.810899019241333, + "learning_rate": 6.055787040280211e-05, + "loss": 0.9998, + "num_input_tokens_seen": 90686392, + "step": 5636 + }, + { + "epoch": 0.3948619611757598, + "grad_norm": 4.237402439117432, + "learning_rate": 6.055087215411559e-05, + "loss": 1.1013, + "num_input_tokens_seen": 90702776, + "step": 5637 + }, + { + "epoch": 0.39493200942148904, + "grad_norm": 5.481308937072754, + "learning_rate": 6.054387390542907e-05, + "loss": 1.0064, + "num_input_tokens_seen": 90718312, + "step": 5638 + }, + { + "epoch": 0.3950020576672183, + "grad_norm": 3.582808017730713, + "learning_rate": 6.053687565674256e-05, + "loss": 0.9305, + "num_input_tokens_seen": 90733856, + "step": 5639 + }, + { + "epoch": 0.39507210591294756, + "grad_norm": 3.9277966022491455, + "learning_rate": 6.0529877408056044e-05, + "loss": 1.0986, + "num_input_tokens_seen": 90750240, + "step": 5640 + }, + { + "epoch": 0.39514215415867676, + "grad_norm": 4.61000394821167, + "learning_rate": 6.0522879159369526e-05, + "loss": 1.1705, + "num_input_tokens_seen": 90766352, + "step": 5641 + }, + { + "epoch": 0.395212202404406, + "grad_norm": 4.445149898529053, + "learning_rate": 6.051588091068301e-05, + "loss": 1.0809, + "num_input_tokens_seen": 90782736, + "step": 5642 + }, + { + "epoch": 0.3952822506501353, + "grad_norm": 4.652968406677246, + "learning_rate": 6.0508882661996516e-05, + "loss": 0.9761, + "num_input_tokens_seen": 90799120, + "step": 5643 + }, + { + "epoch": 0.39535229889586454, + "grad_norm": 4.172330856323242, + "learning_rate": 6.0501884413309985e-05, + "loss": 1.0637, + "num_input_tokens_seen": 90815504, + "step": 5644 + }, + { + "epoch": 0.3954223471415938, + "grad_norm": 3.647385358810425, + "learning_rate": 6.0494886164623466e-05, + "loss": 0.9284, + "num_input_tokens_seen": 90831888, + "step": 5645 + }, + { + "epoch": 0.395492395387323, + "grad_norm": 3.9353525638580322, + "learning_rate": 6.0487887915936955e-05, + "loss": 1.1498, + "num_input_tokens_seen": 90848048, + "step": 5646 + }, + { + "epoch": 0.39556244363305226, + "grad_norm": 4.216567039489746, + "learning_rate": 6.048088966725044e-05, + "loss": 1.1247, + "num_input_tokens_seen": 90863576, + "step": 5647 + }, + { + "epoch": 0.3956324918787815, + "grad_norm": 5.031260013580322, + "learning_rate": 6.047389141856392e-05, + "loss": 1.1314, + "num_input_tokens_seen": 90879960, + "step": 5648 + }, + { + "epoch": 0.3957025401245108, + "grad_norm": 4.927192211151123, + "learning_rate": 6.0466893169877414e-05, + "loss": 1.0977, + "num_input_tokens_seen": 90896344, + "step": 5649 + }, + { + "epoch": 0.39577258837024, + "grad_norm": 4.589445114135742, + "learning_rate": 6.045989492119091e-05, + "loss": 1.0233, + "num_input_tokens_seen": 90912728, + "step": 5650 + }, + { + "epoch": 0.39584263661596925, + "grad_norm": 3.5707035064697266, + "learning_rate": 6.045289667250438e-05, + "loss": 0.9732, + "num_input_tokens_seen": 90929112, + "step": 5651 + }, + { + "epoch": 0.3959126848616985, + "grad_norm": 3.637237787246704, + "learning_rate": 6.044589842381787e-05, + "loss": 1.2063, + "num_input_tokens_seen": 90945376, + "step": 5652 + }, + { + "epoch": 0.39598273310742776, + "grad_norm": 4.068975448608398, + "learning_rate": 6.043890017513136e-05, + "loss": 1.0301, + "num_input_tokens_seen": 90961448, + "step": 5653 + }, + { + "epoch": 0.39605278135315697, + "grad_norm": 3.8378570079803467, + "learning_rate": 6.0431901926444837e-05, + "loss": 0.9195, + "num_input_tokens_seen": 90977832, + "step": 5654 + }, + { + "epoch": 0.39612282959888623, + "grad_norm": 4.5788092613220215, + "learning_rate": 6.042490367775832e-05, + "loss": 1.1601, + "num_input_tokens_seen": 90993296, + "step": 5655 + }, + { + "epoch": 0.3961928778446155, + "grad_norm": 3.7392847537994385, + "learning_rate": 6.041790542907181e-05, + "loss": 1.0081, + "num_input_tokens_seen": 91009680, + "step": 5656 + }, + { + "epoch": 0.39626292609034475, + "grad_norm": 5.830812931060791, + "learning_rate": 6.04109071803853e-05, + "loss": 1.0544, + "num_input_tokens_seen": 91025400, + "step": 5657 + }, + { + "epoch": 0.39633297433607395, + "grad_norm": 3.7372663021087646, + "learning_rate": 6.040390893169877e-05, + "loss": 1.1403, + "num_input_tokens_seen": 91041784, + "step": 5658 + }, + { + "epoch": 0.3964030225818032, + "grad_norm": 3.756762981414795, + "learning_rate": 6.0396910683012266e-05, + "loss": 1.0175, + "num_input_tokens_seen": 91058168, + "step": 5659 + }, + { + "epoch": 0.39647307082753247, + "grad_norm": 3.659280776977539, + "learning_rate": 6.0389912434325755e-05, + "loss": 1.0396, + "num_input_tokens_seen": 91074288, + "step": 5660 + }, + { + "epoch": 0.39654311907326173, + "grad_norm": 4.339829921722412, + "learning_rate": 6.038291418563923e-05, + "loss": 1.2101, + "num_input_tokens_seen": 91090024, + "step": 5661 + }, + { + "epoch": 0.39661316731899093, + "grad_norm": 4.062867641448975, + "learning_rate": 6.037591593695272e-05, + "loss": 0.9874, + "num_input_tokens_seen": 91106408, + "step": 5662 + }, + { + "epoch": 0.3966832155647202, + "grad_norm": 4.45166015625, + "learning_rate": 6.0368917688266214e-05, + "loss": 0.9504, + "num_input_tokens_seen": 91122648, + "step": 5663 + }, + { + "epoch": 0.39675326381044945, + "grad_norm": 3.4350759983062744, + "learning_rate": 6.036191943957968e-05, + "loss": 0.8876, + "num_input_tokens_seen": 91138200, + "step": 5664 + }, + { + "epoch": 0.3968233120561787, + "grad_norm": 3.5637154579162598, + "learning_rate": 6.0354921190893164e-05, + "loss": 1.0616, + "num_input_tokens_seen": 91154584, + "step": 5665 + }, + { + "epoch": 0.3968933603019079, + "grad_norm": 3.8793985843658447, + "learning_rate": 6.034792294220666e-05, + "loss": 1.0933, + "num_input_tokens_seen": 91170968, + "step": 5666 + }, + { + "epoch": 0.3969634085476372, + "grad_norm": 4.1613545417785645, + "learning_rate": 6.0340924693520154e-05, + "loss": 1.0895, + "num_input_tokens_seen": 91185856, + "step": 5667 + }, + { + "epoch": 0.39703345679336643, + "grad_norm": 8.646449089050293, + "learning_rate": 6.033392644483362e-05, + "loss": 1.1391, + "num_input_tokens_seen": 91201088, + "step": 5668 + }, + { + "epoch": 0.3971035050390957, + "grad_norm": 4.862243175506592, + "learning_rate": 6.032692819614711e-05, + "loss": 1.1871, + "num_input_tokens_seen": 91217472, + "step": 5669 + }, + { + "epoch": 0.3971735532848249, + "grad_norm": 4.013809680938721, + "learning_rate": 6.0319929947460607e-05, + "loss": 1.0307, + "num_input_tokens_seen": 91233760, + "step": 5670 + }, + { + "epoch": 0.39724360153055416, + "grad_norm": 4.664083480834961, + "learning_rate": 6.0312931698774075e-05, + "loss": 1.1507, + "num_input_tokens_seen": 91250144, + "step": 5671 + }, + { + "epoch": 0.3973136497762834, + "grad_norm": 4.330606937408447, + "learning_rate": 6.030593345008756e-05, + "loss": 1.1741, + "num_input_tokens_seen": 91264592, + "step": 5672 + }, + { + "epoch": 0.3973836980220127, + "grad_norm": 4.158743381500244, + "learning_rate": 6.0298935201401066e-05, + "loss": 0.8608, + "num_input_tokens_seen": 91280520, + "step": 5673 + }, + { + "epoch": 0.3974537462677419, + "grad_norm": 3.800955057144165, + "learning_rate": 6.029193695271455e-05, + "loss": 0.9451, + "num_input_tokens_seen": 91296472, + "step": 5674 + }, + { + "epoch": 0.39752379451347114, + "grad_norm": 4.307434558868408, + "learning_rate": 6.028493870402803e-05, + "loss": 1.0936, + "num_input_tokens_seen": 91312856, + "step": 5675 + }, + { + "epoch": 0.3975938427592004, + "grad_norm": 4.052398204803467, + "learning_rate": 6.027794045534151e-05, + "loss": 1.006, + "num_input_tokens_seen": 91329216, + "step": 5676 + }, + { + "epoch": 0.39766389100492966, + "grad_norm": 4.665764331817627, + "learning_rate": 6.0270942206655e-05, + "loss": 1.1573, + "num_input_tokens_seen": 91345184, + "step": 5677 + }, + { + "epoch": 0.3977339392506589, + "grad_norm": 4.070000648498535, + "learning_rate": 6.0263943957968475e-05, + "loss": 1.1296, + "num_input_tokens_seen": 91361568, + "step": 5678 + }, + { + "epoch": 0.3978039874963881, + "grad_norm": 4.304214954376221, + "learning_rate": 6.025694570928198e-05, + "loss": 0.8688, + "num_input_tokens_seen": 91376656, + "step": 5679 + }, + { + "epoch": 0.3978740357421174, + "grad_norm": 4.051540374755859, + "learning_rate": 6.024994746059546e-05, + "loss": 1.0862, + "num_input_tokens_seen": 91393040, + "step": 5680 + }, + { + "epoch": 0.39794408398784664, + "grad_norm": 3.986542224884033, + "learning_rate": 6.024294921190894e-05, + "loss": 1.1477, + "num_input_tokens_seen": 91408208, + "step": 5681 + }, + { + "epoch": 0.3980141322335759, + "grad_norm": 4.302114963531494, + "learning_rate": 6.023595096322242e-05, + "loss": 0.8569, + "num_input_tokens_seen": 91424592, + "step": 5682 + }, + { + "epoch": 0.3980841804793051, + "grad_norm": 6.019785404205322, + "learning_rate": 6.0228952714535904e-05, + "loss": 1.2166, + "num_input_tokens_seen": 91440976, + "step": 5683 + }, + { + "epoch": 0.39815422872503436, + "grad_norm": 3.667469024658203, + "learning_rate": 6.022195446584939e-05, + "loss": 1.0587, + "num_input_tokens_seen": 91457360, + "step": 5684 + }, + { + "epoch": 0.3982242769707636, + "grad_norm": 4.30043363571167, + "learning_rate": 6.0214956217162874e-05, + "loss": 1.0923, + "num_input_tokens_seen": 91473744, + "step": 5685 + }, + { + "epoch": 0.3982943252164929, + "grad_norm": 3.7446558475494385, + "learning_rate": 6.0207957968476356e-05, + "loss": 1.1302, + "num_input_tokens_seen": 91489544, + "step": 5686 + }, + { + "epoch": 0.3983643734622221, + "grad_norm": 3.722567081451416, + "learning_rate": 6.020095971978985e-05, + "loss": 0.9913, + "num_input_tokens_seen": 91505584, + "step": 5687 + }, + { + "epoch": 0.39843442170795135, + "grad_norm": 4.311237812042236, + "learning_rate": 6.019396147110332e-05, + "loss": 1.0352, + "num_input_tokens_seen": 91520656, + "step": 5688 + }, + { + "epoch": 0.3985044699536806, + "grad_norm": 5.650984764099121, + "learning_rate": 6.018696322241683e-05, + "loss": 1.2826, + "num_input_tokens_seen": 91537040, + "step": 5689 + }, + { + "epoch": 0.39857451819940987, + "grad_norm": 4.106716632843018, + "learning_rate": 6.017996497373031e-05, + "loss": 1.0234, + "num_input_tokens_seen": 91553424, + "step": 5690 + }, + { + "epoch": 0.39864456644513907, + "grad_norm": 3.893007516860962, + "learning_rate": 6.0172966725043786e-05, + "loss": 1.1175, + "num_input_tokens_seen": 91569184, + "step": 5691 + }, + { + "epoch": 0.39871461469086833, + "grad_norm": 3.6435177326202393, + "learning_rate": 6.016596847635727e-05, + "loss": 0.9446, + "num_input_tokens_seen": 91584832, + "step": 5692 + }, + { + "epoch": 0.3987846629365976, + "grad_norm": 3.639324188232422, + "learning_rate": 6.015897022767075e-05, + "loss": 1.0866, + "num_input_tokens_seen": 91601216, + "step": 5693 + }, + { + "epoch": 0.39885471118232685, + "grad_norm": 3.680997848510742, + "learning_rate": 6.0151971978984245e-05, + "loss": 1.1352, + "num_input_tokens_seen": 91617600, + "step": 5694 + }, + { + "epoch": 0.39892475942805605, + "grad_norm": 5.37217903137207, + "learning_rate": 6.0144973730297726e-05, + "loss": 1.1507, + "num_input_tokens_seen": 91633760, + "step": 5695 + }, + { + "epoch": 0.3989948076737853, + "grad_norm": 3.6297101974487305, + "learning_rate": 6.013797548161122e-05, + "loss": 0.8412, + "num_input_tokens_seen": 91649536, + "step": 5696 + }, + { + "epoch": 0.39906485591951457, + "grad_norm": 8.193422317504883, + "learning_rate": 6.0130977232924704e-05, + "loss": 1.1533, + "num_input_tokens_seen": 91665920, + "step": 5697 + }, + { + "epoch": 0.39913490416524383, + "grad_norm": 3.6126644611358643, + "learning_rate": 6.0123978984238185e-05, + "loss": 0.8261, + "num_input_tokens_seen": 91682272, + "step": 5698 + }, + { + "epoch": 0.39920495241097304, + "grad_norm": 4.277047634124756, + "learning_rate": 6.011698073555167e-05, + "loss": 1.0904, + "num_input_tokens_seen": 91698656, + "step": 5699 + }, + { + "epoch": 0.3992750006567023, + "grad_norm": 4.661556720733643, + "learning_rate": 6.0109982486865156e-05, + "loss": 1.1948, + "num_input_tokens_seen": 91715040, + "step": 5700 + }, + { + "epoch": 0.39934504890243155, + "grad_norm": 4.143563270568848, + "learning_rate": 6.010298423817864e-05, + "loss": 1.0421, + "num_input_tokens_seen": 91731424, + "step": 5701 + }, + { + "epoch": 0.3994150971481608, + "grad_norm": 5.750835418701172, + "learning_rate": 6.009598598949212e-05, + "loss": 1.0594, + "num_input_tokens_seen": 91747808, + "step": 5702 + }, + { + "epoch": 0.39948514539389, + "grad_norm": 4.048924446105957, + "learning_rate": 6.0088987740805615e-05, + "loss": 1.0488, + "num_input_tokens_seen": 91764192, + "step": 5703 + }, + { + "epoch": 0.3995551936396193, + "grad_norm": 3.7284796237945557, + "learning_rate": 6.00819894921191e-05, + "loss": 0.9723, + "num_input_tokens_seen": 91780576, + "step": 5704 + }, + { + "epoch": 0.39962524188534854, + "grad_norm": 4.041873931884766, + "learning_rate": 6.007499124343258e-05, + "loss": 0.9354, + "num_input_tokens_seen": 91796960, + "step": 5705 + }, + { + "epoch": 0.3996952901310778, + "grad_norm": 4.37992000579834, + "learning_rate": 6.006799299474607e-05, + "loss": 1.3032, + "num_input_tokens_seen": 91813344, + "step": 5706 + }, + { + "epoch": 0.399765338376807, + "grad_norm": 4.49924373626709, + "learning_rate": 6.006099474605955e-05, + "loss": 0.9709, + "num_input_tokens_seen": 91829728, + "step": 5707 + }, + { + "epoch": 0.39983538662253626, + "grad_norm": 4.609983444213867, + "learning_rate": 6.005399649737303e-05, + "loss": 1.0591, + "num_input_tokens_seen": 91844968, + "step": 5708 + }, + { + "epoch": 0.3999054348682655, + "grad_norm": 4.1750006675720215, + "learning_rate": 6.004699824868651e-05, + "loss": 1.1413, + "num_input_tokens_seen": 91860464, + "step": 5709 + }, + { + "epoch": 0.3999754831139948, + "grad_norm": 4.009062767028809, + "learning_rate": 6.0039999999999994e-05, + "loss": 1.0244, + "num_input_tokens_seen": 91876848, + "step": 5710 + }, + { + "epoch": 0.400045531359724, + "grad_norm": 4.0535078048706055, + "learning_rate": 6.003300175131349e-05, + "loss": 1.0851, + "num_input_tokens_seen": 91892192, + "step": 5711 + }, + { + "epoch": 0.40011557960545324, + "grad_norm": 4.592657089233398, + "learning_rate": 6.002600350262697e-05, + "loss": 0.8823, + "num_input_tokens_seen": 91907040, + "step": 5712 + }, + { + "epoch": 0.4001856278511825, + "grad_norm": 3.87369966506958, + "learning_rate": 6.001900525394046e-05, + "loss": 1.153, + "num_input_tokens_seen": 91922712, + "step": 5713 + }, + { + "epoch": 0.40025567609691176, + "grad_norm": 3.93766713142395, + "learning_rate": 6.001200700525394e-05, + "loss": 1.0225, + "num_input_tokens_seen": 91939096, + "step": 5714 + }, + { + "epoch": 0.400325724342641, + "grad_norm": 3.823153018951416, + "learning_rate": 6.0005008756567424e-05, + "loss": 0.9229, + "num_input_tokens_seen": 91955016, + "step": 5715 + }, + { + "epoch": 0.4003957725883702, + "grad_norm": 3.5592081546783447, + "learning_rate": 5.999801050788092e-05, + "loss": 0.9163, + "num_input_tokens_seen": 91971400, + "step": 5716 + }, + { + "epoch": 0.4004658208340995, + "grad_norm": 3.8749887943267822, + "learning_rate": 5.9991012259194414e-05, + "loss": 1.0194, + "num_input_tokens_seen": 91987784, + "step": 5717 + }, + { + "epoch": 0.40053586907982874, + "grad_norm": 4.749402046203613, + "learning_rate": 5.998401401050788e-05, + "loss": 0.9442, + "num_input_tokens_seen": 92003592, + "step": 5718 + }, + { + "epoch": 0.400605917325558, + "grad_norm": 4.114437580108643, + "learning_rate": 5.9977015761821365e-05, + "loss": 1.0576, + "num_input_tokens_seen": 92019640, + "step": 5719 + }, + { + "epoch": 0.4006759655712872, + "grad_norm": 3.3297617435455322, + "learning_rate": 5.997001751313486e-05, + "loss": 0.8776, + "num_input_tokens_seen": 92035544, + "step": 5720 + }, + { + "epoch": 0.40074601381701647, + "grad_norm": 4.204908847808838, + "learning_rate": 5.996301926444834e-05, + "loss": 1.141, + "num_input_tokens_seen": 92051840, + "step": 5721 + }, + { + "epoch": 0.4008160620627457, + "grad_norm": 4.198369979858398, + "learning_rate": 5.9956021015761824e-05, + "loss": 1.2653, + "num_input_tokens_seen": 92068224, + "step": 5722 + }, + { + "epoch": 0.400886110308475, + "grad_norm": 4.46641206741333, + "learning_rate": 5.994902276707531e-05, + "loss": 1.0866, + "num_input_tokens_seen": 92083656, + "step": 5723 + }, + { + "epoch": 0.4009561585542042, + "grad_norm": 4.2217535972595215, + "learning_rate": 5.994202451838881e-05, + "loss": 1.066, + "num_input_tokens_seen": 92100040, + "step": 5724 + }, + { + "epoch": 0.40102620679993345, + "grad_norm": 4.484360218048096, + "learning_rate": 5.993502626970229e-05, + "loss": 1.1031, + "num_input_tokens_seen": 92115592, + "step": 5725 + }, + { + "epoch": 0.4010962550456627, + "grad_norm": 4.69040060043335, + "learning_rate": 5.992802802101576e-05, + "loss": 1.1487, + "num_input_tokens_seen": 92131280, + "step": 5726 + }, + { + "epoch": 0.40116630329139197, + "grad_norm": 3.8119077682495117, + "learning_rate": 5.992102977232926e-05, + "loss": 1.1336, + "num_input_tokens_seen": 92147664, + "step": 5727 + }, + { + "epoch": 0.40123635153712117, + "grad_norm": 4.186896800994873, + "learning_rate": 5.9914031523642735e-05, + "loss": 0.9449, + "num_input_tokens_seen": 92164048, + "step": 5728 + }, + { + "epoch": 0.40130639978285043, + "grad_norm": 4.658702850341797, + "learning_rate": 5.9907033274956217e-05, + "loss": 1.1733, + "num_input_tokens_seen": 92180432, + "step": 5729 + }, + { + "epoch": 0.4013764480285797, + "grad_norm": 3.8305857181549072, + "learning_rate": 5.9900035026269705e-05, + "loss": 1.0041, + "num_input_tokens_seen": 92196816, + "step": 5730 + }, + { + "epoch": 0.40144649627430895, + "grad_norm": 4.063295364379883, + "learning_rate": 5.989303677758319e-05, + "loss": 1.1743, + "num_input_tokens_seen": 92212928, + "step": 5731 + }, + { + "epoch": 0.40151654452003815, + "grad_norm": 6.850064277648926, + "learning_rate": 5.988603852889667e-05, + "loss": 1.0967, + "num_input_tokens_seen": 92229312, + "step": 5732 + }, + { + "epoch": 0.4015865927657674, + "grad_norm": 5.065973281860352, + "learning_rate": 5.9879040280210164e-05, + "loss": 1.0513, + "num_input_tokens_seen": 92245696, + "step": 5733 + }, + { + "epoch": 0.4016566410114967, + "grad_norm": 4.02882719039917, + "learning_rate": 5.987204203152366e-05, + "loss": 0.934, + "num_input_tokens_seen": 92261936, + "step": 5734 + }, + { + "epoch": 0.40172668925722593, + "grad_norm": 3.9505250453948975, + "learning_rate": 5.986504378283714e-05, + "loss": 0.9871, + "num_input_tokens_seen": 92278320, + "step": 5735 + }, + { + "epoch": 0.40179673750295514, + "grad_norm": 4.949488162994385, + "learning_rate": 5.985804553415061e-05, + "loss": 0.9759, + "num_input_tokens_seen": 92294664, + "step": 5736 + }, + { + "epoch": 0.4018667857486844, + "grad_norm": 3.614008903503418, + "learning_rate": 5.98510472854641e-05, + "loss": 1.006, + "num_input_tokens_seen": 92311048, + "step": 5737 + }, + { + "epoch": 0.40193683399441366, + "grad_norm": 3.739224433898926, + "learning_rate": 5.984404903677758e-05, + "loss": 0.8997, + "num_input_tokens_seen": 92327432, + "step": 5738 + }, + { + "epoch": 0.4020068822401429, + "grad_norm": 3.6126298904418945, + "learning_rate": 5.983705078809106e-05, + "loss": 1.0062, + "num_input_tokens_seen": 92343816, + "step": 5739 + }, + { + "epoch": 0.4020769304858721, + "grad_norm": 4.304609298706055, + "learning_rate": 5.983005253940456e-05, + "loss": 1.0691, + "num_input_tokens_seen": 92358872, + "step": 5740 + }, + { + "epoch": 0.4021469787316014, + "grad_norm": 4.121729850769043, + "learning_rate": 5.982305429071805e-05, + "loss": 1.1047, + "num_input_tokens_seen": 92374960, + "step": 5741 + }, + { + "epoch": 0.40221702697733064, + "grad_norm": 4.137178421020508, + "learning_rate": 5.9816056042031534e-05, + "loss": 1.0809, + "num_input_tokens_seen": 92391344, + "step": 5742 + }, + { + "epoch": 0.4022870752230599, + "grad_norm": 8.537243843078613, + "learning_rate": 5.9809057793345016e-05, + "loss": 1.1571, + "num_input_tokens_seen": 92406096, + "step": 5743 + }, + { + "epoch": 0.4023571234687891, + "grad_norm": 4.613489627838135, + "learning_rate": 5.9802059544658505e-05, + "loss": 1.126, + "num_input_tokens_seen": 92422480, + "step": 5744 + }, + { + "epoch": 0.40242717171451836, + "grad_norm": 4.812812805175781, + "learning_rate": 5.9795061295971987e-05, + "loss": 1.2987, + "num_input_tokens_seen": 92437472, + "step": 5745 + }, + { + "epoch": 0.4024972199602476, + "grad_norm": 4.785153865814209, + "learning_rate": 5.978806304728546e-05, + "loss": 0.9517, + "num_input_tokens_seen": 92452560, + "step": 5746 + }, + { + "epoch": 0.4025672682059769, + "grad_norm": 4.450865268707275, + "learning_rate": 5.978106479859895e-05, + "loss": 0.9144, + "num_input_tokens_seen": 92468312, + "step": 5747 + }, + { + "epoch": 0.40263731645170614, + "grad_norm": 4.854867935180664, + "learning_rate": 5.9774066549912446e-05, + "loss": 1.1128, + "num_input_tokens_seen": 92484616, + "step": 5748 + }, + { + "epoch": 0.40270736469743534, + "grad_norm": 4.00141716003418, + "learning_rate": 5.976706830122593e-05, + "loss": 1.1298, + "num_input_tokens_seen": 92500568, + "step": 5749 + }, + { + "epoch": 0.4027774129431646, + "grad_norm": 3.819101333618164, + "learning_rate": 5.976007005253941e-05, + "loss": 0.9113, + "num_input_tokens_seen": 92515952, + "step": 5750 + }, + { + "epoch": 0.40284746118889386, + "grad_norm": 3.7954423427581787, + "learning_rate": 5.97530718038529e-05, + "loss": 1.1873, + "num_input_tokens_seen": 92532336, + "step": 5751 + }, + { + "epoch": 0.4029175094346231, + "grad_norm": 4.081971645355225, + "learning_rate": 5.974607355516638e-05, + "loss": 1.1075, + "num_input_tokens_seen": 92548720, + "step": 5752 + }, + { + "epoch": 0.4029875576803523, + "grad_norm": 3.834063768386841, + "learning_rate": 5.9739075306479855e-05, + "loss": 0.9963, + "num_input_tokens_seen": 92564648, + "step": 5753 + }, + { + "epoch": 0.4030576059260816, + "grad_norm": 3.6766366958618164, + "learning_rate": 5.973207705779336e-05, + "loss": 1.1064, + "num_input_tokens_seen": 92581032, + "step": 5754 + }, + { + "epoch": 0.40312765417181085, + "grad_norm": 4.423589706420898, + "learning_rate": 5.972507880910684e-05, + "loss": 1.0285, + "num_input_tokens_seen": 92596520, + "step": 5755 + }, + { + "epoch": 0.4031977024175401, + "grad_norm": 4.0820207595825195, + "learning_rate": 5.971808056042031e-05, + "loss": 1.0288, + "num_input_tokens_seen": 92612448, + "step": 5756 + }, + { + "epoch": 0.4032677506632693, + "grad_norm": 6.634023189544678, + "learning_rate": 5.97110823117338e-05, + "loss": 1.1607, + "num_input_tokens_seen": 92627736, + "step": 5757 + }, + { + "epoch": 0.40333779890899857, + "grad_norm": 3.517611026763916, + "learning_rate": 5.970408406304729e-05, + "loss": 0.9292, + "num_input_tokens_seen": 92643216, + "step": 5758 + }, + { + "epoch": 0.4034078471547278, + "grad_norm": 3.7658562660217285, + "learning_rate": 5.969708581436077e-05, + "loss": 1.052, + "num_input_tokens_seen": 92659248, + "step": 5759 + }, + { + "epoch": 0.4034778954004571, + "grad_norm": 4.128793716430664, + "learning_rate": 5.9690087565674254e-05, + "loss": 1.1067, + "num_input_tokens_seen": 92675632, + "step": 5760 + }, + { + "epoch": 0.4035479436461863, + "grad_norm": 4.200130939483643, + "learning_rate": 5.968308931698775e-05, + "loss": 0.9264, + "num_input_tokens_seen": 92691408, + "step": 5761 + }, + { + "epoch": 0.40361799189191555, + "grad_norm": 4.131740093231201, + "learning_rate": 5.967609106830123e-05, + "loss": 1.0687, + "num_input_tokens_seen": 92706568, + "step": 5762 + }, + { + "epoch": 0.4036880401376448, + "grad_norm": 4.074241638183594, + "learning_rate": 5.96690928196147e-05, + "loss": 1.0122, + "num_input_tokens_seen": 92722952, + "step": 5763 + }, + { + "epoch": 0.40375808838337407, + "grad_norm": 4.351722717285156, + "learning_rate": 5.9662094570928195e-05, + "loss": 0.8518, + "num_input_tokens_seen": 92739248, + "step": 5764 + }, + { + "epoch": 0.4038281366291033, + "grad_norm": 4.0495734214782715, + "learning_rate": 5.965509632224169e-05, + "loss": 1.1392, + "num_input_tokens_seen": 92755040, + "step": 5765 + }, + { + "epoch": 0.40389818487483253, + "grad_norm": 3.881098747253418, + "learning_rate": 5.964809807355517e-05, + "loss": 0.9899, + "num_input_tokens_seen": 92771424, + "step": 5766 + }, + { + "epoch": 0.4039682331205618, + "grad_norm": 3.964268922805786, + "learning_rate": 5.9641099824868654e-05, + "loss": 1.1636, + "num_input_tokens_seen": 92787808, + "step": 5767 + }, + { + "epoch": 0.40403828136629105, + "grad_norm": 4.126365661621094, + "learning_rate": 5.963410157618214e-05, + "loss": 1.273, + "num_input_tokens_seen": 92804192, + "step": 5768 + }, + { + "epoch": 0.40410832961202026, + "grad_norm": 4.142693519592285, + "learning_rate": 5.9627103327495625e-05, + "loss": 1.2161, + "num_input_tokens_seen": 92819920, + "step": 5769 + }, + { + "epoch": 0.4041783778577495, + "grad_norm": 4.906876087188721, + "learning_rate": 5.962010507880912e-05, + "loss": 0.9985, + "num_input_tokens_seen": 92836304, + "step": 5770 + }, + { + "epoch": 0.4042484261034788, + "grad_norm": 4.597287654876709, + "learning_rate": 5.96131068301226e-05, + "loss": 1.13, + "num_input_tokens_seen": 92852144, + "step": 5771 + }, + { + "epoch": 0.40431847434920803, + "grad_norm": 3.525669813156128, + "learning_rate": 5.9606108581436084e-05, + "loss": 1.0209, + "num_input_tokens_seen": 92867944, + "step": 5772 + }, + { + "epoch": 0.40438852259493724, + "grad_norm": 4.908353328704834, + "learning_rate": 5.9599110332749565e-05, + "loss": 1.2495, + "num_input_tokens_seen": 92884328, + "step": 5773 + }, + { + "epoch": 0.4044585708406665, + "grad_norm": 4.31436824798584, + "learning_rate": 5.959211208406305e-05, + "loss": 1.0821, + "num_input_tokens_seen": 92900152, + "step": 5774 + }, + { + "epoch": 0.40452861908639576, + "grad_norm": 3.652494430541992, + "learning_rate": 5.9585113835376536e-05, + "loss": 1.0086, + "num_input_tokens_seen": 92916416, + "step": 5775 + }, + { + "epoch": 0.404598667332125, + "grad_norm": 3.9569268226623535, + "learning_rate": 5.957811558669002e-05, + "loss": 1.0284, + "num_input_tokens_seen": 92932088, + "step": 5776 + }, + { + "epoch": 0.4046687155778542, + "grad_norm": 4.301011562347412, + "learning_rate": 5.95711173380035e-05, + "loss": 1.0994, + "num_input_tokens_seen": 92948472, + "step": 5777 + }, + { + "epoch": 0.4047387638235835, + "grad_norm": 4.0318474769592285, + "learning_rate": 5.9564119089316995e-05, + "loss": 1.0636, + "num_input_tokens_seen": 92964856, + "step": 5778 + }, + { + "epoch": 0.40480881206931274, + "grad_norm": 4.05795955657959, + "learning_rate": 5.955712084063048e-05, + "loss": 0.9947, + "num_input_tokens_seen": 92980040, + "step": 5779 + }, + { + "epoch": 0.404878860315042, + "grad_norm": 5.828601837158203, + "learning_rate": 5.9550122591943945e-05, + "loss": 1.1361, + "num_input_tokens_seen": 92996424, + "step": 5780 + }, + { + "epoch": 0.4049489085607712, + "grad_norm": 3.6617836952209473, + "learning_rate": 5.954312434325745e-05, + "loss": 0.9282, + "num_input_tokens_seen": 93012400, + "step": 5781 + }, + { + "epoch": 0.40501895680650046, + "grad_norm": 3.8668923377990723, + "learning_rate": 5.953612609457093e-05, + "loss": 1.2368, + "num_input_tokens_seen": 93028176, + "step": 5782 + }, + { + "epoch": 0.4050890050522297, + "grad_norm": 6.206475257873535, + "learning_rate": 5.952912784588441e-05, + "loss": 1.133, + "num_input_tokens_seen": 93044560, + "step": 5783 + }, + { + "epoch": 0.405159053297959, + "grad_norm": 3.6768481731414795, + "learning_rate": 5.952212959719789e-05, + "loss": 1.0141, + "num_input_tokens_seen": 93060944, + "step": 5784 + }, + { + "epoch": 0.40522910154368824, + "grad_norm": 3.8317768573760986, + "learning_rate": 5.95151313485114e-05, + "loss": 1.1, + "num_input_tokens_seen": 93077328, + "step": 5785 + }, + { + "epoch": 0.40529914978941745, + "grad_norm": 5.8504252433776855, + "learning_rate": 5.950813309982487e-05, + "loss": 1.1581, + "num_input_tokens_seen": 93092016, + "step": 5786 + }, + { + "epoch": 0.4053691980351467, + "grad_norm": 3.6718640327453613, + "learning_rate": 5.9501134851138365e-05, + "loss": 0.9614, + "num_input_tokens_seen": 93108400, + "step": 5787 + }, + { + "epoch": 0.40543924628087596, + "grad_norm": 4.16236686706543, + "learning_rate": 5.949413660245185e-05, + "loss": 1.1426, + "num_input_tokens_seen": 93124784, + "step": 5788 + }, + { + "epoch": 0.4055092945266052, + "grad_norm": 4.1938958168029785, + "learning_rate": 5.948713835376533e-05, + "loss": 1.241, + "num_input_tokens_seen": 93141168, + "step": 5789 + }, + { + "epoch": 0.40557934277233443, + "grad_norm": 3.7515947818756104, + "learning_rate": 5.948014010507881e-05, + "loss": 1.0706, + "num_input_tokens_seen": 93157552, + "step": 5790 + }, + { + "epoch": 0.4056493910180637, + "grad_norm": 3.710805654525757, + "learning_rate": 5.94731418563923e-05, + "loss": 0.9877, + "num_input_tokens_seen": 93173936, + "step": 5791 + }, + { + "epoch": 0.40571943926379295, + "grad_norm": 3.4491820335388184, + "learning_rate": 5.9466143607705794e-05, + "loss": 1.0282, + "num_input_tokens_seen": 93190320, + "step": 5792 + }, + { + "epoch": 0.4057894875095222, + "grad_norm": 3.8709781169891357, + "learning_rate": 5.945914535901926e-05, + "loss": 0.8852, + "num_input_tokens_seen": 93206648, + "step": 5793 + }, + { + "epoch": 0.4058595357552514, + "grad_norm": 5.104569435119629, + "learning_rate": 5.945214711033276e-05, + "loss": 0.9954, + "num_input_tokens_seen": 93223032, + "step": 5794 + }, + { + "epoch": 0.40592958400098067, + "grad_norm": 4.294493675231934, + "learning_rate": 5.944514886164625e-05, + "loss": 1.0673, + "num_input_tokens_seen": 93239344, + "step": 5795 + }, + { + "epoch": 0.40599963224670993, + "grad_norm": 4.654513835906982, + "learning_rate": 5.943815061295972e-05, + "loss": 1.0239, + "num_input_tokens_seen": 93255104, + "step": 5796 + }, + { + "epoch": 0.4060696804924392, + "grad_norm": 4.339935779571533, + "learning_rate": 5.9431152364273204e-05, + "loss": 0.982, + "num_input_tokens_seen": 93270448, + "step": 5797 + }, + { + "epoch": 0.4061397287381684, + "grad_norm": 3.6498191356658936, + "learning_rate": 5.942415411558669e-05, + "loss": 0.8815, + "num_input_tokens_seen": 93286672, + "step": 5798 + }, + { + "epoch": 0.40620977698389765, + "grad_norm": 3.9488580226898193, + "learning_rate": 5.9417155866900174e-05, + "loss": 1.2938, + "num_input_tokens_seen": 93302256, + "step": 5799 + }, + { + "epoch": 0.4062798252296269, + "grad_norm": 3.9446182250976562, + "learning_rate": 5.9410157618213656e-05, + "loss": 0.8862, + "num_input_tokens_seen": 93318640, + "step": 5800 + }, + { + "epoch": 0.4062798252296269, + "eval_loss": 1.1256848573684692, + "eval_runtime": 0.2106, + "eval_samples_per_second": 4.748, + "eval_steps_per_second": 4.748, + "num_input_tokens_seen": 93318640, + "step": 5800 + }, + { + "epoch": 0.40634987347535617, + "grad_norm": 3.986703872680664, + "learning_rate": 5.940315936952715e-05, + "loss": 1.1783, + "num_input_tokens_seen": 93335024, + "step": 5801 + }, + { + "epoch": 0.4064199217210854, + "grad_norm": 3.56948184967041, + "learning_rate": 5.9396161120840647e-05, + "loss": 1.035, + "num_input_tokens_seen": 93351408, + "step": 5802 + }, + { + "epoch": 0.40648996996681463, + "grad_norm": 3.9857194423675537, + "learning_rate": 5.9389162872154115e-05, + "loss": 1.0596, + "num_input_tokens_seen": 93367728, + "step": 5803 + }, + { + "epoch": 0.4065600182125439, + "grad_norm": 5.165848731994629, + "learning_rate": 5.93821646234676e-05, + "loss": 0.9764, + "num_input_tokens_seen": 93384112, + "step": 5804 + }, + { + "epoch": 0.40663006645827315, + "grad_norm": 3.742520809173584, + "learning_rate": 5.9375166374781085e-05, + "loss": 1.0802, + "num_input_tokens_seen": 93400072, + "step": 5805 + }, + { + "epoch": 0.40670011470400236, + "grad_norm": 4.13803768157959, + "learning_rate": 5.936816812609457e-05, + "loss": 1.1845, + "num_input_tokens_seen": 93416016, + "step": 5806 + }, + { + "epoch": 0.4067701629497316, + "grad_norm": 4.530385494232178, + "learning_rate": 5.936116987740805e-05, + "loss": 1.1034, + "num_input_tokens_seen": 93432400, + "step": 5807 + }, + { + "epoch": 0.4068402111954609, + "grad_norm": 4.162608623504639, + "learning_rate": 5.935417162872156e-05, + "loss": 1.0239, + "num_input_tokens_seen": 93448360, + "step": 5808 + }, + { + "epoch": 0.40691025944119014, + "grad_norm": 3.5075366497039795, + "learning_rate": 5.934717338003504e-05, + "loss": 0.8568, + "num_input_tokens_seen": 93464744, + "step": 5809 + }, + { + "epoch": 0.40698030768691934, + "grad_norm": 4.963081359863281, + "learning_rate": 5.934017513134851e-05, + "loss": 1.028, + "num_input_tokens_seen": 93480448, + "step": 5810 + }, + { + "epoch": 0.4070503559326486, + "grad_norm": 3.83306622505188, + "learning_rate": 5.9333176882662e-05, + "loss": 1.1558, + "num_input_tokens_seen": 93496832, + "step": 5811 + }, + { + "epoch": 0.40712040417837786, + "grad_norm": 3.878345489501953, + "learning_rate": 5.932617863397549e-05, + "loss": 0.9784, + "num_input_tokens_seen": 93513216, + "step": 5812 + }, + { + "epoch": 0.4071904524241071, + "grad_norm": 4.1416192054748535, + "learning_rate": 5.931918038528897e-05, + "loss": 1.0167, + "num_input_tokens_seen": 93529504, + "step": 5813 + }, + { + "epoch": 0.4072605006698363, + "grad_norm": 5.459712028503418, + "learning_rate": 5.9312182136602455e-05, + "loss": 1.0816, + "num_input_tokens_seen": 93545888, + "step": 5814 + }, + { + "epoch": 0.4073305489155656, + "grad_norm": 5.8356852531433105, + "learning_rate": 5.930518388791595e-05, + "loss": 1.1543, + "num_input_tokens_seen": 93562272, + "step": 5815 + }, + { + "epoch": 0.40740059716129484, + "grad_norm": 6.23671817779541, + "learning_rate": 5.929818563922943e-05, + "loss": 1.1929, + "num_input_tokens_seen": 93578656, + "step": 5816 + }, + { + "epoch": 0.4074706454070241, + "grad_norm": 3.6252057552337646, + "learning_rate": 5.9291187390542914e-05, + "loss": 1.1491, + "num_input_tokens_seen": 93594816, + "step": 5817 + }, + { + "epoch": 0.40754069365275336, + "grad_norm": 4.245891571044922, + "learning_rate": 5.9284189141856396e-05, + "loss": 1.1624, + "num_input_tokens_seen": 93611200, + "step": 5818 + }, + { + "epoch": 0.40761074189848256, + "grad_norm": 4.052443504333496, + "learning_rate": 5.9277190893169885e-05, + "loss": 0.9608, + "num_input_tokens_seen": 93627080, + "step": 5819 + }, + { + "epoch": 0.4076807901442118, + "grad_norm": 4.427778720855713, + "learning_rate": 5.927019264448336e-05, + "loss": 0.9268, + "num_input_tokens_seen": 93643464, + "step": 5820 + }, + { + "epoch": 0.4077508383899411, + "grad_norm": 4.1961541175842285, + "learning_rate": 5.926319439579685e-05, + "loss": 1.0374, + "num_input_tokens_seen": 93659088, + "step": 5821 + }, + { + "epoch": 0.40782088663567034, + "grad_norm": 4.001824378967285, + "learning_rate": 5.9256196147110344e-05, + "loss": 1.1674, + "num_input_tokens_seen": 93674928, + "step": 5822 + }, + { + "epoch": 0.40789093488139955, + "grad_norm": 4.010315895080566, + "learning_rate": 5.924919789842381e-05, + "loss": 0.9092, + "num_input_tokens_seen": 93690840, + "step": 5823 + }, + { + "epoch": 0.4079609831271288, + "grad_norm": 3.771390438079834, + "learning_rate": 5.9242199649737294e-05, + "loss": 1.1374, + "num_input_tokens_seen": 93707224, + "step": 5824 + }, + { + "epoch": 0.40803103137285807, + "grad_norm": 3.920438051223755, + "learning_rate": 5.92352014010508e-05, + "loss": 0.9206, + "num_input_tokens_seen": 93722536, + "step": 5825 + }, + { + "epoch": 0.4081010796185873, + "grad_norm": 4.679770469665527, + "learning_rate": 5.922820315236428e-05, + "loss": 1.0143, + "num_input_tokens_seen": 93738736, + "step": 5826 + }, + { + "epoch": 0.40817112786431653, + "grad_norm": 4.082173824310303, + "learning_rate": 5.922120490367776e-05, + "loss": 0.9896, + "num_input_tokens_seen": 93755120, + "step": 5827 + }, + { + "epoch": 0.4082411761100458, + "grad_norm": 4.076204776763916, + "learning_rate": 5.921420665499124e-05, + "loss": 0.9632, + "num_input_tokens_seen": 93771504, + "step": 5828 + }, + { + "epoch": 0.40831122435577505, + "grad_norm": 4.721165180206299, + "learning_rate": 5.920720840630474e-05, + "loss": 1.237, + "num_input_tokens_seen": 93787888, + "step": 5829 + }, + { + "epoch": 0.4083812726015043, + "grad_norm": 4.313892841339111, + "learning_rate": 5.9200210157618205e-05, + "loss": 1.1815, + "num_input_tokens_seen": 93804272, + "step": 5830 + }, + { + "epoch": 0.4084513208472335, + "grad_norm": 4.506958961486816, + "learning_rate": 5.91932119089317e-05, + "loss": 0.8849, + "num_input_tokens_seen": 93820416, + "step": 5831 + }, + { + "epoch": 0.40852136909296277, + "grad_norm": 4.6436991691589355, + "learning_rate": 5.9186213660245196e-05, + "loss": 0.8856, + "num_input_tokens_seen": 93836648, + "step": 5832 + }, + { + "epoch": 0.40859141733869203, + "grad_norm": 3.9535446166992188, + "learning_rate": 5.917921541155868e-05, + "loss": 1.1285, + "num_input_tokens_seen": 93853032, + "step": 5833 + }, + { + "epoch": 0.4086614655844213, + "grad_norm": 6.974640846252441, + "learning_rate": 5.917221716287216e-05, + "loss": 1.0669, + "num_input_tokens_seen": 93868640, + "step": 5834 + }, + { + "epoch": 0.4087315138301505, + "grad_norm": 4.3199262619018555, + "learning_rate": 5.916521891418565e-05, + "loss": 1.1921, + "num_input_tokens_seen": 93885024, + "step": 5835 + }, + { + "epoch": 0.40880156207587975, + "grad_norm": 3.892812967300415, + "learning_rate": 5.915822066549913e-05, + "loss": 1.0978, + "num_input_tokens_seen": 93901408, + "step": 5836 + }, + { + "epoch": 0.408871610321609, + "grad_norm": 4.434093952178955, + "learning_rate": 5.915122241681261e-05, + "loss": 1.1484, + "num_input_tokens_seen": 93917632, + "step": 5837 + }, + { + "epoch": 0.4089416585673383, + "grad_norm": 3.960766553878784, + "learning_rate": 5.914422416812611e-05, + "loss": 0.994, + "num_input_tokens_seen": 93934016, + "step": 5838 + }, + { + "epoch": 0.4090117068130675, + "grad_norm": 6.450897693634033, + "learning_rate": 5.913722591943959e-05, + "loss": 1.1364, + "num_input_tokens_seen": 93950328, + "step": 5839 + }, + { + "epoch": 0.40908175505879674, + "grad_norm": 4.3362956047058105, + "learning_rate": 5.913022767075307e-05, + "loss": 1.0023, + "num_input_tokens_seen": 93966712, + "step": 5840 + }, + { + "epoch": 0.409151803304526, + "grad_norm": 4.250185966491699, + "learning_rate": 5.912322942206655e-05, + "loss": 0.9535, + "num_input_tokens_seen": 93982840, + "step": 5841 + }, + { + "epoch": 0.40922185155025526, + "grad_norm": 5.399633407592773, + "learning_rate": 5.911623117338004e-05, + "loss": 1.0504, + "num_input_tokens_seen": 93998192, + "step": 5842 + }, + { + "epoch": 0.40929189979598446, + "grad_norm": 3.926515579223633, + "learning_rate": 5.910923292469352e-05, + "loss": 1.1711, + "num_input_tokens_seen": 94014296, + "step": 5843 + }, + { + "epoch": 0.4093619480417137, + "grad_norm": 3.9847402572631836, + "learning_rate": 5.9102234676007005e-05, + "loss": 1.0845, + "num_input_tokens_seen": 94030520, + "step": 5844 + }, + { + "epoch": 0.409431996287443, + "grad_norm": 4.106837272644043, + "learning_rate": 5.9095236427320486e-05, + "loss": 1.1529, + "num_input_tokens_seen": 94046512, + "step": 5845 + }, + { + "epoch": 0.40950204453317224, + "grad_norm": 5.044075965881348, + "learning_rate": 5.908823817863398e-05, + "loss": 0.8541, + "num_input_tokens_seen": 94061536, + "step": 5846 + }, + { + "epoch": 0.40957209277890144, + "grad_norm": 7.444840908050537, + "learning_rate": 5.908123992994745e-05, + "loss": 0.9811, + "num_input_tokens_seen": 94077920, + "step": 5847 + }, + { + "epoch": 0.4096421410246307, + "grad_norm": 4.676487445831299, + "learning_rate": 5.907424168126095e-05, + "loss": 0.9323, + "num_input_tokens_seen": 94094040, + "step": 5848 + }, + { + "epoch": 0.40971218927035996, + "grad_norm": 3.945162057876587, + "learning_rate": 5.9067243432574434e-05, + "loss": 1.0101, + "num_input_tokens_seen": 94110424, + "step": 5849 + }, + { + "epoch": 0.4097822375160892, + "grad_norm": 3.9881343841552734, + "learning_rate": 5.9060245183887916e-05, + "loss": 1.2385, + "num_input_tokens_seen": 94126808, + "step": 5850 + }, + { + "epoch": 0.4098522857618185, + "grad_norm": 3.5962657928466797, + "learning_rate": 5.90532469352014e-05, + "loss": 0.988, + "num_input_tokens_seen": 94143192, + "step": 5851 + }, + { + "epoch": 0.4099223340075477, + "grad_norm": 5.755387783050537, + "learning_rate": 5.904624868651491e-05, + "loss": 1.1936, + "num_input_tokens_seen": 94158728, + "step": 5852 + }, + { + "epoch": 0.40999238225327694, + "grad_norm": 4.999855995178223, + "learning_rate": 5.9039250437828375e-05, + "loss": 1.1235, + "num_input_tokens_seen": 94174736, + "step": 5853 + }, + { + "epoch": 0.4100624304990062, + "grad_norm": 5.452749729156494, + "learning_rate": 5.903225218914186e-05, + "loss": 1.1477, + "num_input_tokens_seen": 94191120, + "step": 5854 + }, + { + "epoch": 0.41013247874473546, + "grad_norm": 4.211399078369141, + "learning_rate": 5.902525394045535e-05, + "loss": 0.9808, + "num_input_tokens_seen": 94207504, + "step": 5855 + }, + { + "epoch": 0.41020252699046467, + "grad_norm": 3.3774921894073486, + "learning_rate": 5.9018255691768834e-05, + "loss": 1.0297, + "num_input_tokens_seen": 94223888, + "step": 5856 + }, + { + "epoch": 0.4102725752361939, + "grad_norm": 6.850539207458496, + "learning_rate": 5.9011257443082316e-05, + "loss": 1.0313, + "num_input_tokens_seen": 94240120, + "step": 5857 + }, + { + "epoch": 0.4103426234819232, + "grad_norm": 3.2469308376312256, + "learning_rate": 5.9004259194395804e-05, + "loss": 0.9631, + "num_input_tokens_seen": 94256488, + "step": 5858 + }, + { + "epoch": 0.41041267172765244, + "grad_norm": 5.0675201416015625, + "learning_rate": 5.89972609457093e-05, + "loss": 0.8961, + "num_input_tokens_seen": 94272048, + "step": 5859 + }, + { + "epoch": 0.41048271997338165, + "grad_norm": 3.8971400260925293, + "learning_rate": 5.899026269702277e-05, + "loss": 1.2323, + "num_input_tokens_seen": 94287984, + "step": 5860 + }, + { + "epoch": 0.4105527682191109, + "grad_norm": 3.8453164100646973, + "learning_rate": 5.898326444833625e-05, + "loss": 1.1039, + "num_input_tokens_seen": 94303976, + "step": 5861 + }, + { + "epoch": 0.41062281646484017, + "grad_norm": 3.7564680576324463, + "learning_rate": 5.897626619964975e-05, + "loss": 1.0977, + "num_input_tokens_seen": 94319552, + "step": 5862 + }, + { + "epoch": 0.4106928647105694, + "grad_norm": 4.541357517242432, + "learning_rate": 5.896926795096323e-05, + "loss": 1.2374, + "num_input_tokens_seen": 94335936, + "step": 5863 + }, + { + "epoch": 0.41076291295629863, + "grad_norm": 4.841330051422119, + "learning_rate": 5.896226970227671e-05, + "loss": 0.8555, + "num_input_tokens_seen": 94351784, + "step": 5864 + }, + { + "epoch": 0.4108329612020279, + "grad_norm": 3.9247653484344482, + "learning_rate": 5.89552714535902e-05, + "loss": 1.0169, + "num_input_tokens_seen": 94368080, + "step": 5865 + }, + { + "epoch": 0.41090300944775715, + "grad_norm": 4.763803958892822, + "learning_rate": 5.894827320490368e-05, + "loss": 0.9812, + "num_input_tokens_seen": 94384056, + "step": 5866 + }, + { + "epoch": 0.4109730576934864, + "grad_norm": 5.141749858856201, + "learning_rate": 5.894127495621716e-05, + "loss": 1.0001, + "num_input_tokens_seen": 94400440, + "step": 5867 + }, + { + "epoch": 0.4110431059392156, + "grad_norm": 3.360656261444092, + "learning_rate": 5.8934276707530656e-05, + "loss": 0.988, + "num_input_tokens_seen": 94416824, + "step": 5868 + }, + { + "epoch": 0.4111131541849449, + "grad_norm": 4.226006507873535, + "learning_rate": 5.892727845884415e-05, + "loss": 1.0762, + "num_input_tokens_seen": 94433208, + "step": 5869 + }, + { + "epoch": 0.41118320243067413, + "grad_norm": 6.924234390258789, + "learning_rate": 5.892028021015762e-05, + "loss": 1.1498, + "num_input_tokens_seen": 94448848, + "step": 5870 + }, + { + "epoch": 0.4112532506764034, + "grad_norm": 3.643950939178467, + "learning_rate": 5.89132819614711e-05, + "loss": 0.9977, + "num_input_tokens_seen": 94464720, + "step": 5871 + }, + { + "epoch": 0.4113232989221326, + "grad_norm": 3.3077268600463867, + "learning_rate": 5.890628371278459e-05, + "loss": 0.9421, + "num_input_tokens_seen": 94481104, + "step": 5872 + }, + { + "epoch": 0.41139334716786186, + "grad_norm": 3.9556264877319336, + "learning_rate": 5.889928546409807e-05, + "loss": 0.9313, + "num_input_tokens_seen": 94496944, + "step": 5873 + }, + { + "epoch": 0.4114633954135911, + "grad_norm": 7.0952606201171875, + "learning_rate": 5.8892287215411554e-05, + "loss": 1.3029, + "num_input_tokens_seen": 94511336, + "step": 5874 + }, + { + "epoch": 0.4115334436593204, + "grad_norm": 3.946803569793701, + "learning_rate": 5.888528896672505e-05, + "loss": 0.9865, + "num_input_tokens_seen": 94526904, + "step": 5875 + }, + { + "epoch": 0.4116034919050496, + "grad_norm": 4.165796756744385, + "learning_rate": 5.8878290718038545e-05, + "loss": 1.2249, + "num_input_tokens_seen": 94543024, + "step": 5876 + }, + { + "epoch": 0.41167354015077884, + "grad_norm": 3.9492764472961426, + "learning_rate": 5.887129246935201e-05, + "loss": 1.0304, + "num_input_tokens_seen": 94558744, + "step": 5877 + }, + { + "epoch": 0.4117435883965081, + "grad_norm": 4.144934177398682, + "learning_rate": 5.8864294220665495e-05, + "loss": 1.0468, + "num_input_tokens_seen": 94574288, + "step": 5878 + }, + { + "epoch": 0.41181363664223736, + "grad_norm": 3.7620224952697754, + "learning_rate": 5.8857295971979e-05, + "loss": 0.929, + "num_input_tokens_seen": 94590672, + "step": 5879 + }, + { + "epoch": 0.41188368488796656, + "grad_norm": 4.788266181945801, + "learning_rate": 5.885029772329247e-05, + "loss": 1.0106, + "num_input_tokens_seen": 94606792, + "step": 5880 + }, + { + "epoch": 0.4119537331336958, + "grad_norm": 4.383455276489258, + "learning_rate": 5.8843299474605954e-05, + "loss": 0.9611, + "num_input_tokens_seen": 94623176, + "step": 5881 + }, + { + "epoch": 0.4120237813794251, + "grad_norm": 7.097373962402344, + "learning_rate": 5.883630122591944e-05, + "loss": 1.1559, + "num_input_tokens_seen": 94638952, + "step": 5882 + }, + { + "epoch": 0.41209382962515434, + "grad_norm": 5.4228901863098145, + "learning_rate": 5.882930297723294e-05, + "loss": 1.2569, + "num_input_tokens_seen": 94655224, + "step": 5883 + }, + { + "epoch": 0.41216387787088354, + "grad_norm": 3.792999267578125, + "learning_rate": 5.882230472854642e-05, + "loss": 0.8853, + "num_input_tokens_seen": 94671608, + "step": 5884 + }, + { + "epoch": 0.4122339261166128, + "grad_norm": 5.401544094085693, + "learning_rate": 5.88153064798599e-05, + "loss": 1.0763, + "num_input_tokens_seen": 94687048, + "step": 5885 + }, + { + "epoch": 0.41230397436234206, + "grad_norm": 3.2229812145233154, + "learning_rate": 5.880830823117339e-05, + "loss": 0.9375, + "num_input_tokens_seen": 94703432, + "step": 5886 + }, + { + "epoch": 0.4123740226080713, + "grad_norm": 4.5977277755737305, + "learning_rate": 5.8801309982486865e-05, + "loss": 1.1245, + "num_input_tokens_seen": 94719152, + "step": 5887 + }, + { + "epoch": 0.4124440708538006, + "grad_norm": 3.436765670776367, + "learning_rate": 5.879431173380035e-05, + "loss": 0.8353, + "num_input_tokens_seen": 94735536, + "step": 5888 + }, + { + "epoch": 0.4125141190995298, + "grad_norm": 3.4720351696014404, + "learning_rate": 5.878731348511385e-05, + "loss": 1.0329, + "num_input_tokens_seen": 94751840, + "step": 5889 + }, + { + "epoch": 0.41258416734525905, + "grad_norm": 3.621783971786499, + "learning_rate": 5.878031523642732e-05, + "loss": 1.0355, + "num_input_tokens_seen": 94768224, + "step": 5890 + }, + { + "epoch": 0.4126542155909883, + "grad_norm": 4.453585624694824, + "learning_rate": 5.87733169877408e-05, + "loss": 1.1171, + "num_input_tokens_seen": 94783672, + "step": 5891 + }, + { + "epoch": 0.41272426383671756, + "grad_norm": 4.191892147064209, + "learning_rate": 5.8766318739054294e-05, + "loss": 0.9505, + "num_input_tokens_seen": 94800056, + "step": 5892 + }, + { + "epoch": 0.41279431208244677, + "grad_norm": 3.5963308811187744, + "learning_rate": 5.875932049036778e-05, + "loss": 1.0223, + "num_input_tokens_seen": 94816440, + "step": 5893 + }, + { + "epoch": 0.412864360328176, + "grad_norm": 3.904618978500366, + "learning_rate": 5.8752322241681265e-05, + "loss": 1.0205, + "num_input_tokens_seen": 94832824, + "step": 5894 + }, + { + "epoch": 0.4129344085739053, + "grad_norm": 3.650961399078369, + "learning_rate": 5.8745323992994747e-05, + "loss": 0.9613, + "num_input_tokens_seen": 94849208, + "step": 5895 + }, + { + "epoch": 0.41300445681963455, + "grad_norm": 4.438238143920898, + "learning_rate": 5.873832574430824e-05, + "loss": 0.9865, + "num_input_tokens_seen": 94865592, + "step": 5896 + }, + { + "epoch": 0.41307450506536375, + "grad_norm": 3.6906070709228516, + "learning_rate": 5.873132749562171e-05, + "loss": 1.1228, + "num_input_tokens_seen": 94881976, + "step": 5897 + }, + { + "epoch": 0.413144553311093, + "grad_norm": 5.111722469329834, + "learning_rate": 5.872432924693519e-05, + "loss": 1.2678, + "num_input_tokens_seen": 94898360, + "step": 5898 + }, + { + "epoch": 0.41321460155682227, + "grad_norm": 3.7190804481506348, + "learning_rate": 5.871733099824869e-05, + "loss": 1.152, + "num_input_tokens_seen": 94914512, + "step": 5899 + }, + { + "epoch": 0.41328464980255153, + "grad_norm": 3.779918670654297, + "learning_rate": 5.871033274956218e-05, + "loss": 0.905, + "num_input_tokens_seen": 94930816, + "step": 5900 + }, + { + "epoch": 0.41335469804828073, + "grad_norm": 3.5921852588653564, + "learning_rate": 5.8703334500875665e-05, + "loss": 0.9119, + "num_input_tokens_seen": 94947200, + "step": 5901 + }, + { + "epoch": 0.41342474629401, + "grad_norm": 4.024271011352539, + "learning_rate": 5.8696336252189146e-05, + "loss": 1.048, + "num_input_tokens_seen": 94963584, + "step": 5902 + }, + { + "epoch": 0.41349479453973925, + "grad_norm": 4.798417091369629, + "learning_rate": 5.8689338003502635e-05, + "loss": 1.1399, + "num_input_tokens_seen": 94979968, + "step": 5903 + }, + { + "epoch": 0.4135648427854685, + "grad_norm": 3.5821495056152344, + "learning_rate": 5.868233975481612e-05, + "loss": 1.015, + "num_input_tokens_seen": 94996320, + "step": 5904 + }, + { + "epoch": 0.4136348910311977, + "grad_norm": 3.998082399368286, + "learning_rate": 5.867534150612959e-05, + "loss": 0.9569, + "num_input_tokens_seen": 95012704, + "step": 5905 + }, + { + "epoch": 0.413704939276927, + "grad_norm": 3.6389498710632324, + "learning_rate": 5.8668343257443094e-05, + "loss": 1.1562, + "num_input_tokens_seen": 95029088, + "step": 5906 + }, + { + "epoch": 0.41377498752265623, + "grad_norm": 4.137228012084961, + "learning_rate": 5.8661345008756576e-05, + "loss": 1.1683, + "num_input_tokens_seen": 95045472, + "step": 5907 + }, + { + "epoch": 0.4138450357683855, + "grad_norm": 4.181145668029785, + "learning_rate": 5.865434676007006e-05, + "loss": 0.993, + "num_input_tokens_seen": 95061856, + "step": 5908 + }, + { + "epoch": 0.4139150840141147, + "grad_norm": 3.759474754333496, + "learning_rate": 5.864734851138354e-05, + "loss": 1.0144, + "num_input_tokens_seen": 95077904, + "step": 5909 + }, + { + "epoch": 0.41398513225984396, + "grad_norm": 4.474549293518066, + "learning_rate": 5.864035026269703e-05, + "loss": 1.1008, + "num_input_tokens_seen": 95094288, + "step": 5910 + }, + { + "epoch": 0.4140551805055732, + "grad_norm": 7.399059295654297, + "learning_rate": 5.863335201401051e-05, + "loss": 1.2234, + "num_input_tokens_seen": 95107872, + "step": 5911 + }, + { + "epoch": 0.4141252287513025, + "grad_norm": 4.018132209777832, + "learning_rate": 5.862635376532399e-05, + "loss": 0.9736, + "num_input_tokens_seen": 95124256, + "step": 5912 + }, + { + "epoch": 0.4141952769970317, + "grad_norm": 3.825305223464966, + "learning_rate": 5.861935551663749e-05, + "loss": 0.9833, + "num_input_tokens_seen": 95139784, + "step": 5913 + }, + { + "epoch": 0.41426532524276094, + "grad_norm": 3.7942214012145996, + "learning_rate": 5.861235726795097e-05, + "loss": 1.0999, + "num_input_tokens_seen": 95155136, + "step": 5914 + }, + { + "epoch": 0.4143353734884902, + "grad_norm": 3.9006733894348145, + "learning_rate": 5.860535901926444e-05, + "loss": 1.0608, + "num_input_tokens_seen": 95171520, + "step": 5915 + }, + { + "epoch": 0.41440542173421946, + "grad_norm": 3.688754081726074, + "learning_rate": 5.859836077057794e-05, + "loss": 1.1338, + "num_input_tokens_seen": 95187752, + "step": 5916 + }, + { + "epoch": 0.41447546997994866, + "grad_norm": 3.954989433288574, + "learning_rate": 5.859136252189142e-05, + "loss": 0.8965, + "num_input_tokens_seen": 95203864, + "step": 5917 + }, + { + "epoch": 0.4145455182256779, + "grad_norm": 3.733185052871704, + "learning_rate": 5.85843642732049e-05, + "loss": 0.9872, + "num_input_tokens_seen": 95220248, + "step": 5918 + }, + { + "epoch": 0.4146155664714072, + "grad_norm": 4.39019775390625, + "learning_rate": 5.8577366024518385e-05, + "loss": 1.0452, + "num_input_tokens_seen": 95236632, + "step": 5919 + }, + { + "epoch": 0.41468561471713644, + "grad_norm": 3.716066837310791, + "learning_rate": 5.857036777583188e-05, + "loss": 0.9231, + "num_input_tokens_seen": 95251888, + "step": 5920 + }, + { + "epoch": 0.4147556629628657, + "grad_norm": 3.7525405883789062, + "learning_rate": 5.856336952714536e-05, + "loss": 1.1147, + "num_input_tokens_seen": 95268272, + "step": 5921 + }, + { + "epoch": 0.4148257112085949, + "grad_norm": 3.605818033218384, + "learning_rate": 5.855637127845886e-05, + "loss": 0.9874, + "num_input_tokens_seen": 95283880, + "step": 5922 + }, + { + "epoch": 0.41489575945432416, + "grad_norm": 3.878814458847046, + "learning_rate": 5.854937302977234e-05, + "loss": 1.1556, + "num_input_tokens_seen": 95300264, + "step": 5923 + }, + { + "epoch": 0.4149658077000534, + "grad_norm": 4.94001579284668, + "learning_rate": 5.854237478108582e-05, + "loss": 1.0014, + "num_input_tokens_seen": 95316648, + "step": 5924 + }, + { + "epoch": 0.4150358559457827, + "grad_norm": 4.213568210601807, + "learning_rate": 5.85353765323993e-05, + "loss": 1.2315, + "num_input_tokens_seen": 95332880, + "step": 5925 + }, + { + "epoch": 0.4151059041915119, + "grad_norm": 4.2475996017456055, + "learning_rate": 5.8528378283712784e-05, + "loss": 1.0717, + "num_input_tokens_seen": 95349176, + "step": 5926 + }, + { + "epoch": 0.41517595243724115, + "grad_norm": 4.237911224365234, + "learning_rate": 5.852138003502627e-05, + "loss": 0.93, + "num_input_tokens_seen": 95365560, + "step": 5927 + }, + { + "epoch": 0.4152460006829704, + "grad_norm": 3.695140838623047, + "learning_rate": 5.8514381786339755e-05, + "loss": 0.9561, + "num_input_tokens_seen": 95381280, + "step": 5928 + }, + { + "epoch": 0.41531604892869967, + "grad_norm": 4.457770347595215, + "learning_rate": 5.850738353765325e-05, + "loss": 1.0541, + "num_input_tokens_seen": 95397664, + "step": 5929 + }, + { + "epoch": 0.41538609717442887, + "grad_norm": 4.237982273101807, + "learning_rate": 5.850038528896673e-05, + "loss": 1.0123, + "num_input_tokens_seen": 95414048, + "step": 5930 + }, + { + "epoch": 0.41545614542015813, + "grad_norm": 3.5690579414367676, + "learning_rate": 5.8493387040280214e-05, + "loss": 0.9788, + "num_input_tokens_seen": 95430432, + "step": 5931 + }, + { + "epoch": 0.4155261936658874, + "grad_norm": 4.665618419647217, + "learning_rate": 5.8486388791593696e-05, + "loss": 1.0138, + "num_input_tokens_seen": 95446816, + "step": 5932 + }, + { + "epoch": 0.41559624191161665, + "grad_norm": 3.76755952835083, + "learning_rate": 5.8479390542907184e-05, + "loss": 0.9853, + "num_input_tokens_seen": 95463200, + "step": 5933 + }, + { + "epoch": 0.41566629015734585, + "grad_norm": 4.855432033538818, + "learning_rate": 5.8472392294220666e-05, + "loss": 1.1747, + "num_input_tokens_seen": 95478704, + "step": 5934 + }, + { + "epoch": 0.4157363384030751, + "grad_norm": 4.860055446624756, + "learning_rate": 5.846539404553415e-05, + "loss": 0.9747, + "num_input_tokens_seen": 95495088, + "step": 5935 + }, + { + "epoch": 0.41580638664880437, + "grad_norm": 4.268356800079346, + "learning_rate": 5.845839579684763e-05, + "loss": 1.1564, + "num_input_tokens_seen": 95511472, + "step": 5936 + }, + { + "epoch": 0.41587643489453363, + "grad_norm": 7.423181533813477, + "learning_rate": 5.8451397548161125e-05, + "loss": 1.0369, + "num_input_tokens_seen": 95527008, + "step": 5937 + }, + { + "epoch": 0.41594648314026283, + "grad_norm": 5.740126609802246, + "learning_rate": 5.844439929947461e-05, + "loss": 1.1188, + "num_input_tokens_seen": 95542536, + "step": 5938 + }, + { + "epoch": 0.4160165313859921, + "grad_norm": 5.135944366455078, + "learning_rate": 5.8437401050788096e-05, + "loss": 1.1815, + "num_input_tokens_seen": 95558688, + "step": 5939 + }, + { + "epoch": 0.41608657963172135, + "grad_norm": 3.879530191421509, + "learning_rate": 5.843040280210158e-05, + "loss": 1.162, + "num_input_tokens_seen": 95575064, + "step": 5940 + }, + { + "epoch": 0.4161566278774506, + "grad_norm": 4.096410274505615, + "learning_rate": 5.842340455341506e-05, + "loss": 1.0883, + "num_input_tokens_seen": 95591272, + "step": 5941 + }, + { + "epoch": 0.4162266761231798, + "grad_norm": 4.095829486846924, + "learning_rate": 5.841640630472854e-05, + "loss": 1.0516, + "num_input_tokens_seen": 95607656, + "step": 5942 + }, + { + "epoch": 0.4162967243689091, + "grad_norm": 4.076023101806641, + "learning_rate": 5.840940805604205e-05, + "loss": 1.0361, + "num_input_tokens_seen": 95623384, + "step": 5943 + }, + { + "epoch": 0.41636677261463834, + "grad_norm": 4.08365535736084, + "learning_rate": 5.840240980735553e-05, + "loss": 1.1102, + "num_input_tokens_seen": 95639768, + "step": 5944 + }, + { + "epoch": 0.4164368208603676, + "grad_norm": 4.182791233062744, + "learning_rate": 5.8395411558669e-05, + "loss": 1.0212, + "num_input_tokens_seen": 95655928, + "step": 5945 + }, + { + "epoch": 0.4165068691060968, + "grad_norm": 4.3107428550720215, + "learning_rate": 5.8388413309982495e-05, + "loss": 0.8607, + "num_input_tokens_seen": 95672312, + "step": 5946 + }, + { + "epoch": 0.41657691735182606, + "grad_norm": 3.7357101440429688, + "learning_rate": 5.838141506129598e-05, + "loss": 1.0563, + "num_input_tokens_seen": 95688696, + "step": 5947 + }, + { + "epoch": 0.4166469655975553, + "grad_norm": 3.9959046840667725, + "learning_rate": 5.837441681260946e-05, + "loss": 1.1043, + "num_input_tokens_seen": 95704304, + "step": 5948 + }, + { + "epoch": 0.4167170138432846, + "grad_norm": 4.395400524139404, + "learning_rate": 5.836741856392295e-05, + "loss": 1.1087, + "num_input_tokens_seen": 95720688, + "step": 5949 + }, + { + "epoch": 0.4167870620890138, + "grad_norm": 4.998651027679443, + "learning_rate": 5.836042031523644e-05, + "loss": 1.1903, + "num_input_tokens_seen": 95737072, + "step": 5950 + }, + { + "epoch": 0.41685711033474304, + "grad_norm": 4.060539245605469, + "learning_rate": 5.8353422066549925e-05, + "loss": 1.033, + "num_input_tokens_seen": 95753456, + "step": 5951 + }, + { + "epoch": 0.4169271585804723, + "grad_norm": 5.286706924438477, + "learning_rate": 5.834642381786339e-05, + "loss": 1.0968, + "num_input_tokens_seen": 95769840, + "step": 5952 + }, + { + "epoch": 0.41699720682620156, + "grad_norm": 4.501932144165039, + "learning_rate": 5.833942556917689e-05, + "loss": 1.2626, + "num_input_tokens_seen": 95786224, + "step": 5953 + }, + { + "epoch": 0.41706725507193076, + "grad_norm": 5.144174575805664, + "learning_rate": 5.833242732049038e-05, + "loss": 0.8148, + "num_input_tokens_seen": 95802608, + "step": 5954 + }, + { + "epoch": 0.41713730331766, + "grad_norm": 3.6604678630828857, + "learning_rate": 5.832542907180385e-05, + "loss": 0.9467, + "num_input_tokens_seen": 95818992, + "step": 5955 + }, + { + "epoch": 0.4172073515633893, + "grad_norm": 5.387998104095459, + "learning_rate": 5.831843082311734e-05, + "loss": 1.181, + "num_input_tokens_seen": 95835376, + "step": 5956 + }, + { + "epoch": 0.41727739980911854, + "grad_norm": 4.576782703399658, + "learning_rate": 5.831143257443082e-05, + "loss": 1.0577, + "num_input_tokens_seen": 95851760, + "step": 5957 + }, + { + "epoch": 0.4173474480548478, + "grad_norm": 5.737542629241943, + "learning_rate": 5.8304434325744304e-05, + "loss": 1.039, + "num_input_tokens_seen": 95868144, + "step": 5958 + }, + { + "epoch": 0.417417496300577, + "grad_norm": 3.515028238296509, + "learning_rate": 5.8297436077057786e-05, + "loss": 0.9503, + "num_input_tokens_seen": 95884528, + "step": 5959 + }, + { + "epoch": 0.41748754454630627, + "grad_norm": 3.9339003562927246, + "learning_rate": 5.8290437828371295e-05, + "loss": 0.928, + "num_input_tokens_seen": 95900688, + "step": 5960 + }, + { + "epoch": 0.4175575927920355, + "grad_norm": 3.896474838256836, + "learning_rate": 5.828343957968477e-05, + "loss": 1.1249, + "num_input_tokens_seen": 95917072, + "step": 5961 + }, + { + "epoch": 0.4176276410377648, + "grad_norm": 5.101248264312744, + "learning_rate": 5.8276441330998245e-05, + "loss": 1.0097, + "num_input_tokens_seen": 95931976, + "step": 5962 + }, + { + "epoch": 0.417697689283494, + "grad_norm": 7.063873291015625, + "learning_rate": 5.8269443082311734e-05, + "loss": 1.1686, + "num_input_tokens_seen": 95948360, + "step": 5963 + }, + { + "epoch": 0.41776773752922325, + "grad_norm": 3.564887762069702, + "learning_rate": 5.826244483362523e-05, + "loss": 1.1375, + "num_input_tokens_seen": 95964744, + "step": 5964 + }, + { + "epoch": 0.4178377857749525, + "grad_norm": 3.821101427078247, + "learning_rate": 5.82554465849387e-05, + "loss": 1.252, + "num_input_tokens_seen": 95980760, + "step": 5965 + }, + { + "epoch": 0.41790783402068177, + "grad_norm": 3.609252691268921, + "learning_rate": 5.824844833625219e-05, + "loss": 1.0633, + "num_input_tokens_seen": 95997144, + "step": 5966 + }, + { + "epoch": 0.41797788226641097, + "grad_norm": 4.1750874519348145, + "learning_rate": 5.824145008756569e-05, + "loss": 1.119, + "num_input_tokens_seen": 96012872, + "step": 5967 + }, + { + "epoch": 0.41804793051214023, + "grad_norm": 3.776747465133667, + "learning_rate": 5.823445183887917e-05, + "loss": 0.9792, + "num_input_tokens_seen": 96029168, + "step": 5968 + }, + { + "epoch": 0.4181179787578695, + "grad_norm": 8.143741607666016, + "learning_rate": 5.822745359019264e-05, + "loss": 1.0686, + "num_input_tokens_seen": 96045376, + "step": 5969 + }, + { + "epoch": 0.41818802700359875, + "grad_norm": 4.336330890655518, + "learning_rate": 5.822045534150614e-05, + "loss": 1.0597, + "num_input_tokens_seen": 96060904, + "step": 5970 + }, + { + "epoch": 0.41825807524932795, + "grad_norm": 3.731605052947998, + "learning_rate": 5.821345709281962e-05, + "loss": 1.1302, + "num_input_tokens_seen": 96076824, + "step": 5971 + }, + { + "epoch": 0.4183281234950572, + "grad_norm": 3.8380699157714844, + "learning_rate": 5.82064588441331e-05, + "loss": 1.1711, + "num_input_tokens_seen": 96092616, + "step": 5972 + }, + { + "epoch": 0.4183981717407865, + "grad_norm": 3.9088358879089355, + "learning_rate": 5.8199460595446586e-05, + "loss": 1.1622, + "num_input_tokens_seen": 96109000, + "step": 5973 + }, + { + "epoch": 0.41846821998651573, + "grad_norm": 4.0047783851623535, + "learning_rate": 5.819246234676008e-05, + "loss": 1.0949, + "num_input_tokens_seen": 96125344, + "step": 5974 + }, + { + "epoch": 0.41853826823224494, + "grad_norm": 3.936495542526245, + "learning_rate": 5.818546409807356e-05, + "loss": 1.1566, + "num_input_tokens_seen": 96141536, + "step": 5975 + }, + { + "epoch": 0.4186083164779742, + "grad_norm": 3.8510451316833496, + "learning_rate": 5.8178465849387045e-05, + "loss": 1.0454, + "num_input_tokens_seen": 96157920, + "step": 5976 + }, + { + "epoch": 0.41867836472370346, + "grad_norm": 3.5825259685516357, + "learning_rate": 5.817146760070053e-05, + "loss": 1.0491, + "num_input_tokens_seen": 96173848, + "step": 5977 + }, + { + "epoch": 0.4187484129694327, + "grad_norm": 4.0553717613220215, + "learning_rate": 5.8164469352014015e-05, + "loss": 1.0669, + "num_input_tokens_seen": 96190232, + "step": 5978 + }, + { + "epoch": 0.4188184612151619, + "grad_norm": 4.085362434387207, + "learning_rate": 5.815747110332749e-05, + "loss": 1.2832, + "num_input_tokens_seen": 96205896, + "step": 5979 + }, + { + "epoch": 0.4188885094608912, + "grad_norm": 6.552733421325684, + "learning_rate": 5.815047285464098e-05, + "loss": 1.1183, + "num_input_tokens_seen": 96221784, + "step": 5980 + }, + { + "epoch": 0.41895855770662044, + "grad_norm": 4.052005290985107, + "learning_rate": 5.8143474605954474e-05, + "loss": 1.0309, + "num_input_tokens_seen": 96237472, + "step": 5981 + }, + { + "epoch": 0.4190286059523497, + "grad_norm": 3.9679994583129883, + "learning_rate": 5.813647635726794e-05, + "loss": 1.0298, + "num_input_tokens_seen": 96253616, + "step": 5982 + }, + { + "epoch": 0.4190986541980789, + "grad_norm": 4.879584312438965, + "learning_rate": 5.812947810858144e-05, + "loss": 0.9334, + "num_input_tokens_seen": 96270000, + "step": 5983 + }, + { + "epoch": 0.41916870244380816, + "grad_norm": 4.894060134887695, + "learning_rate": 5.8122479859894926e-05, + "loss": 0.9829, + "num_input_tokens_seen": 96286384, + "step": 5984 + }, + { + "epoch": 0.4192387506895374, + "grad_norm": 3.9925336837768555, + "learning_rate": 5.811548161120841e-05, + "loss": 1.0285, + "num_input_tokens_seen": 96302200, + "step": 5985 + }, + { + "epoch": 0.4193087989352667, + "grad_norm": 4.043905258178711, + "learning_rate": 5.810848336252189e-05, + "loss": 1.0217, + "num_input_tokens_seen": 96318584, + "step": 5986 + }, + { + "epoch": 0.4193788471809959, + "grad_norm": 4.216322422027588, + "learning_rate": 5.8101485113835385e-05, + "loss": 0.9483, + "num_input_tokens_seen": 96334720, + "step": 5987 + }, + { + "epoch": 0.41944889542672514, + "grad_norm": 3.772749900817871, + "learning_rate": 5.809448686514887e-05, + "loss": 1.272, + "num_input_tokens_seen": 96351104, + "step": 5988 + }, + { + "epoch": 0.4195189436724544, + "grad_norm": 3.6716036796569824, + "learning_rate": 5.8087488616462335e-05, + "loss": 1.1796, + "num_input_tokens_seen": 96367488, + "step": 5989 + }, + { + "epoch": 0.41958899191818366, + "grad_norm": 3.9748408794403076, + "learning_rate": 5.8080490367775844e-05, + "loss": 1.0994, + "num_input_tokens_seen": 96383872, + "step": 5990 + }, + { + "epoch": 0.4196590401639129, + "grad_norm": 5.4619269371032715, + "learning_rate": 5.8073492119089326e-05, + "loss": 1.0553, + "num_input_tokens_seen": 96400256, + "step": 5991 + }, + { + "epoch": 0.4197290884096421, + "grad_norm": 3.4772391319274902, + "learning_rate": 5.806649387040281e-05, + "loss": 1.0543, + "num_input_tokens_seen": 96416640, + "step": 5992 + }, + { + "epoch": 0.4197991366553714, + "grad_norm": 4.003359794616699, + "learning_rate": 5.805949562171629e-05, + "loss": 0.9306, + "num_input_tokens_seen": 96432872, + "step": 5993 + }, + { + "epoch": 0.41986918490110064, + "grad_norm": 3.433760166168213, + "learning_rate": 5.805249737302979e-05, + "loss": 1.0197, + "num_input_tokens_seen": 96449256, + "step": 5994 + }, + { + "epoch": 0.4199392331468299, + "grad_norm": 4.519425868988037, + "learning_rate": 5.804549912434326e-05, + "loss": 1.0303, + "num_input_tokens_seen": 96465456, + "step": 5995 + }, + { + "epoch": 0.4200092813925591, + "grad_norm": 3.8798038959503174, + "learning_rate": 5.803850087565674e-05, + "loss": 1.1426, + "num_input_tokens_seen": 96481840, + "step": 5996 + }, + { + "epoch": 0.42007932963828837, + "grad_norm": 7.4741058349609375, + "learning_rate": 5.803150262697024e-05, + "loss": 1.0759, + "num_input_tokens_seen": 96497160, + "step": 5997 + }, + { + "epoch": 0.4201493778840176, + "grad_norm": 3.6269989013671875, + "learning_rate": 5.802450437828372e-05, + "loss": 1.0676, + "num_input_tokens_seen": 96512672, + "step": 5998 + }, + { + "epoch": 0.4202194261297469, + "grad_norm": 3.6369056701660156, + "learning_rate": 5.80175061295972e-05, + "loss": 0.9351, + "num_input_tokens_seen": 96529056, + "step": 5999 + }, + { + "epoch": 0.4202894743754761, + "grad_norm": 6.0609564781188965, + "learning_rate": 5.801050788091069e-05, + "loss": 1.3902, + "num_input_tokens_seen": 96545320, + "step": 6000 + }, + { + "epoch": 0.4202894743754761, + "eval_loss": 1.1260257959365845, + "eval_runtime": 0.1972, + "eval_samples_per_second": 5.072, + "eval_steps_per_second": 5.072, + "num_input_tokens_seen": 96545320, + "step": 6000 + }, + { + "epoch": 0.42035952262120535, + "grad_norm": 3.939091682434082, + "learning_rate": 5.800350963222417e-05, + "loss": 1.2935, + "num_input_tokens_seen": 96561704, + "step": 6001 + }, + { + "epoch": 0.4204295708669346, + "grad_norm": 4.907895565032959, + "learning_rate": 5.799651138353765e-05, + "loss": 0.9923, + "num_input_tokens_seen": 96578088, + "step": 6002 + }, + { + "epoch": 0.42049961911266387, + "grad_norm": 4.598423480987549, + "learning_rate": 5.7989513134851135e-05, + "loss": 0.9852, + "num_input_tokens_seen": 96594392, + "step": 6003 + }, + { + "epoch": 0.4205696673583931, + "grad_norm": 4.8221540451049805, + "learning_rate": 5.7982514886164644e-05, + "loss": 1.2558, + "num_input_tokens_seen": 96609688, + "step": 6004 + }, + { + "epoch": 0.42063971560412233, + "grad_norm": 6.331230163574219, + "learning_rate": 5.797551663747811e-05, + "loss": 0.943, + "num_input_tokens_seen": 96625480, + "step": 6005 + }, + { + "epoch": 0.4207097638498516, + "grad_norm": 4.262217044830322, + "learning_rate": 5.7968518388791594e-05, + "loss": 1.1607, + "num_input_tokens_seen": 96641040, + "step": 6006 + }, + { + "epoch": 0.42077981209558085, + "grad_norm": 4.552499294281006, + "learning_rate": 5.796152014010508e-05, + "loss": 0.9532, + "num_input_tokens_seen": 96657424, + "step": 6007 + }, + { + "epoch": 0.42084986034131006, + "grad_norm": 3.414970874786377, + "learning_rate": 5.7954521891418564e-05, + "loss": 0.8991, + "num_input_tokens_seen": 96673792, + "step": 6008 + }, + { + "epoch": 0.4209199085870393, + "grad_norm": 3.70623517036438, + "learning_rate": 5.7947523642732046e-05, + "loss": 1.167, + "num_input_tokens_seen": 96690176, + "step": 6009 + }, + { + "epoch": 0.4209899568327686, + "grad_norm": 4.370288848876953, + "learning_rate": 5.794052539404554e-05, + "loss": 1.0194, + "num_input_tokens_seen": 96706272, + "step": 6010 + }, + { + "epoch": 0.42106000507849783, + "grad_norm": 3.4775140285491943, + "learning_rate": 5.793352714535904e-05, + "loss": 0.9383, + "num_input_tokens_seen": 96722312, + "step": 6011 + }, + { + "epoch": 0.42113005332422704, + "grad_norm": 3.9860763549804688, + "learning_rate": 5.7926528896672505e-05, + "loss": 1.0999, + "num_input_tokens_seen": 96737040, + "step": 6012 + }, + { + "epoch": 0.4212001015699563, + "grad_norm": 5.0287933349609375, + "learning_rate": 5.791953064798599e-05, + "loss": 0.9892, + "num_input_tokens_seen": 96753424, + "step": 6013 + }, + { + "epoch": 0.42127014981568556, + "grad_norm": 3.821143627166748, + "learning_rate": 5.791253239929949e-05, + "loss": 1.0388, + "num_input_tokens_seen": 96769552, + "step": 6014 + }, + { + "epoch": 0.4213401980614148, + "grad_norm": 4.180905818939209, + "learning_rate": 5.7905534150612964e-05, + "loss": 1.1118, + "num_input_tokens_seen": 96785936, + "step": 6015 + }, + { + "epoch": 0.421410246307144, + "grad_norm": 4.334224224090576, + "learning_rate": 5.7898535901926446e-05, + "loss": 1.3425, + "num_input_tokens_seen": 96802320, + "step": 6016 + }, + { + "epoch": 0.4214802945528733, + "grad_norm": 4.317337989807129, + "learning_rate": 5.7891537653239934e-05, + "loss": 1.0218, + "num_input_tokens_seen": 96818360, + "step": 6017 + }, + { + "epoch": 0.42155034279860254, + "grad_norm": 3.789919376373291, + "learning_rate": 5.788453940455343e-05, + "loss": 1.0815, + "num_input_tokens_seen": 96833928, + "step": 6018 + }, + { + "epoch": 0.4216203910443318, + "grad_norm": 4.240170001983643, + "learning_rate": 5.78775411558669e-05, + "loss": 1.0818, + "num_input_tokens_seen": 96850312, + "step": 6019 + }, + { + "epoch": 0.421690439290061, + "grad_norm": 5.163384914398193, + "learning_rate": 5.7870542907180393e-05, + "loss": 1.061, + "num_input_tokens_seen": 96866696, + "step": 6020 + }, + { + "epoch": 0.42176048753579026, + "grad_norm": 3.653265953063965, + "learning_rate": 5.786354465849388e-05, + "loss": 0.8955, + "num_input_tokens_seen": 96883080, + "step": 6021 + }, + { + "epoch": 0.4218305357815195, + "grad_norm": 3.4269649982452393, + "learning_rate": 5.785654640980736e-05, + "loss": 1.009, + "num_input_tokens_seen": 96899176, + "step": 6022 + }, + { + "epoch": 0.4219005840272488, + "grad_norm": 5.8838276863098145, + "learning_rate": 5.784954816112084e-05, + "loss": 1.0385, + "num_input_tokens_seen": 96914576, + "step": 6023 + }, + { + "epoch": 0.42197063227297804, + "grad_norm": 4.201550006866455, + "learning_rate": 5.784254991243433e-05, + "loss": 1.0398, + "num_input_tokens_seen": 96930808, + "step": 6024 + }, + { + "epoch": 0.42204068051870725, + "grad_norm": 3.961399793624878, + "learning_rate": 5.783555166374781e-05, + "loss": 1.0815, + "num_input_tokens_seen": 96947192, + "step": 6025 + }, + { + "epoch": 0.4221107287644365, + "grad_norm": 4.811456680297852, + "learning_rate": 5.782855341506129e-05, + "loss": 1.0576, + "num_input_tokens_seen": 96961896, + "step": 6026 + }, + { + "epoch": 0.42218077701016576, + "grad_norm": 3.6154356002807617, + "learning_rate": 5.7821555166374787e-05, + "loss": 0.9678, + "num_input_tokens_seen": 96977656, + "step": 6027 + }, + { + "epoch": 0.422250825255895, + "grad_norm": 3.787724256515503, + "learning_rate": 5.7814556917688275e-05, + "loss": 1.0813, + "num_input_tokens_seen": 96993936, + "step": 6028 + }, + { + "epoch": 0.4223208735016242, + "grad_norm": 4.215615272521973, + "learning_rate": 5.780755866900175e-05, + "loss": 1.2758, + "num_input_tokens_seen": 97010320, + "step": 6029 + }, + { + "epoch": 0.4223909217473535, + "grad_norm": 3.9257047176361084, + "learning_rate": 5.780056042031524e-05, + "loss": 0.9753, + "num_input_tokens_seen": 97026704, + "step": 6030 + }, + { + "epoch": 0.42246096999308275, + "grad_norm": 3.5415945053100586, + "learning_rate": 5.7793562171628734e-05, + "loss": 1.0718, + "num_input_tokens_seen": 97043088, + "step": 6031 + }, + { + "epoch": 0.422531018238812, + "grad_norm": 4.213465213775635, + "learning_rate": 5.77865639229422e-05, + "loss": 1.0011, + "num_input_tokens_seen": 97059472, + "step": 6032 + }, + { + "epoch": 0.4226010664845412, + "grad_norm": 3.8070178031921387, + "learning_rate": 5.7779565674255684e-05, + "loss": 0.881, + "num_input_tokens_seen": 97074712, + "step": 6033 + }, + { + "epoch": 0.42267111473027047, + "grad_norm": 3.8083109855651855, + "learning_rate": 5.777256742556918e-05, + "loss": 1.0003, + "num_input_tokens_seen": 97091096, + "step": 6034 + }, + { + "epoch": 0.42274116297599973, + "grad_norm": 3.491002082824707, + "learning_rate": 5.7765569176882675e-05, + "loss": 1.0276, + "num_input_tokens_seen": 97107304, + "step": 6035 + }, + { + "epoch": 0.422811211221729, + "grad_norm": 4.1060919761657715, + "learning_rate": 5.775857092819616e-05, + "loss": 0.979, + "num_input_tokens_seen": 97123688, + "step": 6036 + }, + { + "epoch": 0.4228812594674582, + "grad_norm": 3.8975484371185303, + "learning_rate": 5.775157267950964e-05, + "loss": 1.0906, + "num_input_tokens_seen": 97140008, + "step": 6037 + }, + { + "epoch": 0.42295130771318745, + "grad_norm": 4.4457197189331055, + "learning_rate": 5.774457443082313e-05, + "loss": 1.0763, + "num_input_tokens_seen": 97156392, + "step": 6038 + }, + { + "epoch": 0.4230213559589167, + "grad_norm": 3.5186471939086914, + "learning_rate": 5.77375761821366e-05, + "loss": 1.0242, + "num_input_tokens_seen": 97172776, + "step": 6039 + }, + { + "epoch": 0.42309140420464597, + "grad_norm": 3.729041814804077, + "learning_rate": 5.7730577933450084e-05, + "loss": 1.0272, + "num_input_tokens_seen": 97189160, + "step": 6040 + }, + { + "epoch": 0.4231614524503752, + "grad_norm": 4.501081466674805, + "learning_rate": 5.7723579684763586e-05, + "loss": 0.9879, + "num_input_tokens_seen": 97205544, + "step": 6041 + }, + { + "epoch": 0.42323150069610443, + "grad_norm": 5.922353744506836, + "learning_rate": 5.771658143607707e-05, + "loss": 1.1519, + "num_input_tokens_seen": 97221928, + "step": 6042 + }, + { + "epoch": 0.4233015489418337, + "grad_norm": 3.649948835372925, + "learning_rate": 5.770958318739055e-05, + "loss": 0.9467, + "num_input_tokens_seen": 97238048, + "step": 6043 + }, + { + "epoch": 0.42337159718756295, + "grad_norm": 4.660130977630615, + "learning_rate": 5.770258493870403e-05, + "loss": 1.0903, + "num_input_tokens_seen": 97254272, + "step": 6044 + }, + { + "epoch": 0.42344164543329216, + "grad_norm": 4.064535140991211, + "learning_rate": 5.769558669001752e-05, + "loss": 1.2646, + "num_input_tokens_seen": 97270656, + "step": 6045 + }, + { + "epoch": 0.4235116936790214, + "grad_norm": 3.931034803390503, + "learning_rate": 5.7688588441331e-05, + "loss": 0.9511, + "num_input_tokens_seen": 97287040, + "step": 6046 + }, + { + "epoch": 0.4235817419247507, + "grad_norm": 3.920013427734375, + "learning_rate": 5.7681590192644484e-05, + "loss": 1.1886, + "num_input_tokens_seen": 97302784, + "step": 6047 + }, + { + "epoch": 0.42365179017047994, + "grad_norm": 3.356661319732666, + "learning_rate": 5.767459194395798e-05, + "loss": 0.9283, + "num_input_tokens_seen": 97319168, + "step": 6048 + }, + { + "epoch": 0.42372183841620914, + "grad_norm": 4.33698034286499, + "learning_rate": 5.766759369527145e-05, + "loss": 1.0689, + "num_input_tokens_seen": 97335552, + "step": 6049 + }, + { + "epoch": 0.4237918866619384, + "grad_norm": 6.201281547546387, + "learning_rate": 5.766059544658493e-05, + "loss": 0.9756, + "num_input_tokens_seen": 97350720, + "step": 6050 + }, + { + "epoch": 0.42386193490766766, + "grad_norm": 4.005791664123535, + "learning_rate": 5.7653597197898425e-05, + "loss": 0.9559, + "num_input_tokens_seen": 97367104, + "step": 6051 + }, + { + "epoch": 0.4239319831533969, + "grad_norm": 4.238742828369141, + "learning_rate": 5.764659894921191e-05, + "loss": 1.0348, + "num_input_tokens_seen": 97383488, + "step": 6052 + }, + { + "epoch": 0.4240020313991261, + "grad_norm": 4.139926433563232, + "learning_rate": 5.7639600700525395e-05, + "loss": 1.0571, + "num_input_tokens_seen": 97398864, + "step": 6053 + }, + { + "epoch": 0.4240720796448554, + "grad_norm": 3.538890838623047, + "learning_rate": 5.763260245183888e-05, + "loss": 0.9162, + "num_input_tokens_seen": 97414416, + "step": 6054 + }, + { + "epoch": 0.42414212789058464, + "grad_norm": 3.888108253479004, + "learning_rate": 5.762560420315237e-05, + "loss": 1.0937, + "num_input_tokens_seen": 97429616, + "step": 6055 + }, + { + "epoch": 0.4242121761363139, + "grad_norm": 4.287962436676025, + "learning_rate": 5.7618605954465854e-05, + "loss": 0.9786, + "num_input_tokens_seen": 97444784, + "step": 6056 + }, + { + "epoch": 0.4242822243820431, + "grad_norm": 3.5160460472106934, + "learning_rate": 5.761160770577935e-05, + "loss": 1.0405, + "num_input_tokens_seen": 97461104, + "step": 6057 + }, + { + "epoch": 0.42435227262777236, + "grad_norm": 4.076432704925537, + "learning_rate": 5.760460945709283e-05, + "loss": 1.1768, + "num_input_tokens_seen": 97477488, + "step": 6058 + }, + { + "epoch": 0.4244223208735016, + "grad_norm": 3.4506590366363525, + "learning_rate": 5.759761120840631e-05, + "loss": 0.9435, + "num_input_tokens_seen": 97493872, + "step": 6059 + }, + { + "epoch": 0.4244923691192309, + "grad_norm": 4.196661472320557, + "learning_rate": 5.7590612959719795e-05, + "loss": 1.0714, + "num_input_tokens_seen": 97509088, + "step": 6060 + }, + { + "epoch": 0.42456241736496014, + "grad_norm": 4.412662506103516, + "learning_rate": 5.758361471103328e-05, + "loss": 1.1809, + "num_input_tokens_seen": 97525472, + "step": 6061 + }, + { + "epoch": 0.42463246561068935, + "grad_norm": 3.4199881553649902, + "learning_rate": 5.7576616462346765e-05, + "loss": 1.0078, + "num_input_tokens_seen": 97541856, + "step": 6062 + }, + { + "epoch": 0.4247025138564186, + "grad_norm": 4.215256214141846, + "learning_rate": 5.756961821366025e-05, + "loss": 0.9772, + "num_input_tokens_seen": 97558240, + "step": 6063 + }, + { + "epoch": 0.42477256210214787, + "grad_norm": 4.764070510864258, + "learning_rate": 5.756261996497374e-05, + "loss": 1.1994, + "num_input_tokens_seen": 97574624, + "step": 6064 + }, + { + "epoch": 0.4248426103478771, + "grad_norm": 3.8896613121032715, + "learning_rate": 5.7555621716287224e-05, + "loss": 1.1135, + "num_input_tokens_seen": 97591008, + "step": 6065 + }, + { + "epoch": 0.42491265859360633, + "grad_norm": 5.101664066314697, + "learning_rate": 5.7548623467600706e-05, + "loss": 1.0647, + "num_input_tokens_seen": 97607392, + "step": 6066 + }, + { + "epoch": 0.4249827068393356, + "grad_norm": 4.464064121246338, + "learning_rate": 5.754162521891419e-05, + "loss": 1.0992, + "num_input_tokens_seen": 97623776, + "step": 6067 + }, + { + "epoch": 0.42505275508506485, + "grad_norm": 3.9882681369781494, + "learning_rate": 5.7534626970227676e-05, + "loss": 1.0344, + "num_input_tokens_seen": 97639296, + "step": 6068 + }, + { + "epoch": 0.4251228033307941, + "grad_norm": 5.5437331199646, + "learning_rate": 5.752762872154116e-05, + "loss": 1.0782, + "num_input_tokens_seen": 97655592, + "step": 6069 + }, + { + "epoch": 0.4251928515765233, + "grad_norm": 4.157887935638428, + "learning_rate": 5.752063047285464e-05, + "loss": 1.2531, + "num_input_tokens_seen": 97671976, + "step": 6070 + }, + { + "epoch": 0.42526289982225257, + "grad_norm": 4.455500602722168, + "learning_rate": 5.751363222416812e-05, + "loss": 1.0738, + "num_input_tokens_seen": 97688360, + "step": 6071 + }, + { + "epoch": 0.42533294806798183, + "grad_norm": 5.3056254386901855, + "learning_rate": 5.750663397548162e-05, + "loss": 1.2483, + "num_input_tokens_seen": 97704008, + "step": 6072 + }, + { + "epoch": 0.4254029963137111, + "grad_norm": 3.5183193683624268, + "learning_rate": 5.74996357267951e-05, + "loss": 0.9862, + "num_input_tokens_seen": 97720392, + "step": 6073 + }, + { + "epoch": 0.4254730445594403, + "grad_norm": 4.44768762588501, + "learning_rate": 5.749263747810859e-05, + "loss": 1.2951, + "num_input_tokens_seen": 97736584, + "step": 6074 + }, + { + "epoch": 0.42554309280516955, + "grad_norm": 3.6957905292510986, + "learning_rate": 5.748563922942207e-05, + "loss": 1.2134, + "num_input_tokens_seen": 97752968, + "step": 6075 + }, + { + "epoch": 0.4256131410508988, + "grad_norm": 3.6841094493865967, + "learning_rate": 5.747864098073555e-05, + "loss": 0.9744, + "num_input_tokens_seen": 97769352, + "step": 6076 + }, + { + "epoch": 0.4256831892966281, + "grad_norm": 6.541488170623779, + "learning_rate": 5.747164273204903e-05, + "loss": 1.1638, + "num_input_tokens_seen": 97785736, + "step": 6077 + }, + { + "epoch": 0.4257532375423573, + "grad_norm": 4.056735515594482, + "learning_rate": 5.7464644483362515e-05, + "loss": 0.9758, + "num_input_tokens_seen": 97801624, + "step": 6078 + }, + { + "epoch": 0.42582328578808654, + "grad_norm": 3.5294058322906494, + "learning_rate": 5.745764623467601e-05, + "loss": 0.9682, + "num_input_tokens_seen": 97817544, + "step": 6079 + }, + { + "epoch": 0.4258933340338158, + "grad_norm": 3.851330280303955, + "learning_rate": 5.745064798598949e-05, + "loss": 1.0858, + "num_input_tokens_seen": 97833600, + "step": 6080 + }, + { + "epoch": 0.42596338227954506, + "grad_norm": 3.6939046382904053, + "learning_rate": 5.744364973730299e-05, + "loss": 0.9469, + "num_input_tokens_seen": 97849984, + "step": 6081 + }, + { + "epoch": 0.42603343052527426, + "grad_norm": 3.7894139289855957, + "learning_rate": 5.743665148861647e-05, + "loss": 1.1953, + "num_input_tokens_seen": 97866368, + "step": 6082 + }, + { + "epoch": 0.4261034787710035, + "grad_norm": 3.377105712890625, + "learning_rate": 5.742965323992995e-05, + "loss": 1.0573, + "num_input_tokens_seen": 97882752, + "step": 6083 + }, + { + "epoch": 0.4261735270167328, + "grad_norm": 4.0349440574646, + "learning_rate": 5.742265499124344e-05, + "loss": 1.1328, + "num_input_tokens_seen": 97899136, + "step": 6084 + }, + { + "epoch": 0.42624357526246204, + "grad_norm": 3.9353208541870117, + "learning_rate": 5.7415656742556935e-05, + "loss": 0.9787, + "num_input_tokens_seen": 97915520, + "step": 6085 + }, + { + "epoch": 0.42631362350819124, + "grad_norm": 3.6593427658081055, + "learning_rate": 5.740865849387042e-05, + "loss": 1.0417, + "num_input_tokens_seen": 97931904, + "step": 6086 + }, + { + "epoch": 0.4263836717539205, + "grad_norm": 3.543994665145874, + "learning_rate": 5.7401660245183885e-05, + "loss": 0.9268, + "num_input_tokens_seen": 97948288, + "step": 6087 + }, + { + "epoch": 0.42645371999964976, + "grad_norm": 3.925420045852661, + "learning_rate": 5.739466199649738e-05, + "loss": 1.1635, + "num_input_tokens_seen": 97964672, + "step": 6088 + }, + { + "epoch": 0.426523768245379, + "grad_norm": 4.6036224365234375, + "learning_rate": 5.738766374781086e-05, + "loss": 1.1229, + "num_input_tokens_seen": 97981056, + "step": 6089 + }, + { + "epoch": 0.4265938164911082, + "grad_norm": 6.555153846740723, + "learning_rate": 5.7380665499124344e-05, + "loss": 1.1401, + "num_input_tokens_seen": 97997440, + "step": 6090 + }, + { + "epoch": 0.4266638647368375, + "grad_norm": 3.7414231300354004, + "learning_rate": 5.737366725043783e-05, + "loss": 1.0223, + "num_input_tokens_seen": 98013264, + "step": 6091 + }, + { + "epoch": 0.42673391298256674, + "grad_norm": 4.380615234375, + "learning_rate": 5.7366669001751314e-05, + "loss": 1.1524, + "num_input_tokens_seen": 98029176, + "step": 6092 + }, + { + "epoch": 0.426803961228296, + "grad_norm": 4.624136924743652, + "learning_rate": 5.7359670753064796e-05, + "loss": 1.1277, + "num_input_tokens_seen": 98044384, + "step": 6093 + }, + { + "epoch": 0.42687400947402526, + "grad_norm": 4.984564781188965, + "learning_rate": 5.735267250437828e-05, + "loss": 1.1115, + "num_input_tokens_seen": 98060768, + "step": 6094 + }, + { + "epoch": 0.42694405771975447, + "grad_norm": 5.481975078582764, + "learning_rate": 5.734567425569178e-05, + "loss": 1.167, + "num_input_tokens_seen": 98077152, + "step": 6095 + }, + { + "epoch": 0.4270141059654837, + "grad_norm": 3.3822808265686035, + "learning_rate": 5.733867600700526e-05, + "loss": 0.9442, + "num_input_tokens_seen": 98093224, + "step": 6096 + }, + { + "epoch": 0.427084154211213, + "grad_norm": 3.8090853691101074, + "learning_rate": 5.733167775831874e-05, + "loss": 1.0478, + "num_input_tokens_seen": 98109608, + "step": 6097 + }, + { + "epoch": 0.42715420245694224, + "grad_norm": 4.279370307922363, + "learning_rate": 5.7324679509632226e-05, + "loss": 0.918, + "num_input_tokens_seen": 98125992, + "step": 6098 + }, + { + "epoch": 0.42722425070267145, + "grad_norm": 5.998210430145264, + "learning_rate": 5.731768126094571e-05, + "loss": 0.9409, + "num_input_tokens_seen": 98142376, + "step": 6099 + }, + { + "epoch": 0.4272942989484007, + "grad_norm": 4.388184070587158, + "learning_rate": 5.731068301225919e-05, + "loss": 1.0364, + "num_input_tokens_seen": 98158760, + "step": 6100 + }, + { + "epoch": 0.42736434719412997, + "grad_norm": 4.937825679779053, + "learning_rate": 5.7303684763572685e-05, + "loss": 1.1185, + "num_input_tokens_seen": 98175144, + "step": 6101 + }, + { + "epoch": 0.4274343954398592, + "grad_norm": 3.800776720046997, + "learning_rate": 5.729668651488618e-05, + "loss": 1.1608, + "num_input_tokens_seen": 98191184, + "step": 6102 + }, + { + "epoch": 0.42750444368558843, + "grad_norm": 3.857093334197998, + "learning_rate": 5.728968826619966e-05, + "loss": 0.8588, + "num_input_tokens_seen": 98207568, + "step": 6103 + }, + { + "epoch": 0.4275744919313177, + "grad_norm": 3.562218189239502, + "learning_rate": 5.728269001751313e-05, + "loss": 1.1002, + "num_input_tokens_seen": 98223952, + "step": 6104 + }, + { + "epoch": 0.42764454017704695, + "grad_norm": 3.826802968978882, + "learning_rate": 5.7275691768826626e-05, + "loss": 0.7401, + "num_input_tokens_seen": 98239576, + "step": 6105 + }, + { + "epoch": 0.4277145884227762, + "grad_norm": 4.127960205078125, + "learning_rate": 5.7268693520140114e-05, + "loss": 1.0163, + "num_input_tokens_seen": 98255960, + "step": 6106 + }, + { + "epoch": 0.4277846366685054, + "grad_norm": 4.270632743835449, + "learning_rate": 5.726169527145359e-05, + "loss": 1.2359, + "num_input_tokens_seen": 98272080, + "step": 6107 + }, + { + "epoch": 0.4278546849142347, + "grad_norm": 4.543783187866211, + "learning_rate": 5.725469702276708e-05, + "loss": 1.1117, + "num_input_tokens_seen": 98288464, + "step": 6108 + }, + { + "epoch": 0.42792473315996393, + "grad_norm": 3.993234634399414, + "learning_rate": 5.724769877408057e-05, + "loss": 1.0059, + "num_input_tokens_seen": 98304424, + "step": 6109 + }, + { + "epoch": 0.4279947814056932, + "grad_norm": 4.11693000793457, + "learning_rate": 5.7240700525394055e-05, + "loss": 1.0718, + "num_input_tokens_seen": 98320808, + "step": 6110 + }, + { + "epoch": 0.4280648296514224, + "grad_norm": 4.000871658325195, + "learning_rate": 5.723370227670754e-05, + "loss": 0.9777, + "num_input_tokens_seen": 98337192, + "step": 6111 + }, + { + "epoch": 0.42813487789715166, + "grad_norm": 3.642763614654541, + "learning_rate": 5.7226704028021025e-05, + "loss": 0.9108, + "num_input_tokens_seen": 98353320, + "step": 6112 + }, + { + "epoch": 0.4282049261428809, + "grad_norm": 4.22330379486084, + "learning_rate": 5.721970577933451e-05, + "loss": 1.0968, + "num_input_tokens_seen": 98369704, + "step": 6113 + }, + { + "epoch": 0.4282749743886102, + "grad_norm": 3.7961175441741943, + "learning_rate": 5.721270753064798e-05, + "loss": 0.8756, + "num_input_tokens_seen": 98385544, + "step": 6114 + }, + { + "epoch": 0.4283450226343394, + "grad_norm": 3.771034002304077, + "learning_rate": 5.720570928196147e-05, + "loss": 1.1139, + "num_input_tokens_seen": 98401928, + "step": 6115 + }, + { + "epoch": 0.42841507088006864, + "grad_norm": 3.8084332942962646, + "learning_rate": 5.719871103327495e-05, + "loss": 1.1042, + "num_input_tokens_seen": 98418136, + "step": 6116 + }, + { + "epoch": 0.4284851191257979, + "grad_norm": 3.890608549118042, + "learning_rate": 5.7191712784588434e-05, + "loss": 0.9865, + "num_input_tokens_seen": 98433656, + "step": 6117 + }, + { + "epoch": 0.42855516737152716, + "grad_norm": 6.781351089477539, + "learning_rate": 5.718471453590193e-05, + "loss": 0.8032, + "num_input_tokens_seen": 98448776, + "step": 6118 + }, + { + "epoch": 0.42862521561725636, + "grad_norm": 3.941107749938965, + "learning_rate": 5.717771628721542e-05, + "loss": 1.104, + "num_input_tokens_seen": 98465160, + "step": 6119 + }, + { + "epoch": 0.4286952638629856, + "grad_norm": 4.457616329193115, + "learning_rate": 5.71707180385289e-05, + "loss": 1.1159, + "num_input_tokens_seen": 98481184, + "step": 6120 + }, + { + "epoch": 0.4287653121087149, + "grad_norm": 3.889111042022705, + "learning_rate": 5.7163719789842375e-05, + "loss": 1.0685, + "num_input_tokens_seen": 98497568, + "step": 6121 + }, + { + "epoch": 0.42883536035444414, + "grad_norm": 3.7574422359466553, + "learning_rate": 5.715672154115588e-05, + "loss": 0.9091, + "num_input_tokens_seen": 98513920, + "step": 6122 + }, + { + "epoch": 0.42890540860017334, + "grad_norm": 3.578437089920044, + "learning_rate": 5.714972329246936e-05, + "loss": 0.9449, + "num_input_tokens_seen": 98529664, + "step": 6123 + }, + { + "epoch": 0.4289754568459026, + "grad_norm": 5.0676398277282715, + "learning_rate": 5.714272504378283e-05, + "loss": 1.0768, + "num_input_tokens_seen": 98544936, + "step": 6124 + }, + { + "epoch": 0.42904550509163186, + "grad_norm": 4.475335121154785, + "learning_rate": 5.713572679509632e-05, + "loss": 0.9347, + "num_input_tokens_seen": 98560520, + "step": 6125 + }, + { + "epoch": 0.4291155533373611, + "grad_norm": 6.345788955688477, + "learning_rate": 5.712872854640982e-05, + "loss": 1.1897, + "num_input_tokens_seen": 98576320, + "step": 6126 + }, + { + "epoch": 0.4291856015830903, + "grad_norm": 3.775374174118042, + "learning_rate": 5.71217302977233e-05, + "loss": 0.9803, + "num_input_tokens_seen": 98592704, + "step": 6127 + }, + { + "epoch": 0.4292556498288196, + "grad_norm": 4.224292278289795, + "learning_rate": 5.711473204903678e-05, + "loss": 1.2253, + "num_input_tokens_seen": 98607664, + "step": 6128 + }, + { + "epoch": 0.42932569807454884, + "grad_norm": 4.470034122467041, + "learning_rate": 5.710773380035027e-05, + "loss": 0.9915, + "num_input_tokens_seen": 98624048, + "step": 6129 + }, + { + "epoch": 0.4293957463202781, + "grad_norm": 6.22687292098999, + "learning_rate": 5.710073555166375e-05, + "loss": 1.2048, + "num_input_tokens_seen": 98640432, + "step": 6130 + }, + { + "epoch": 0.42946579456600736, + "grad_norm": 3.9434430599212646, + "learning_rate": 5.709373730297722e-05, + "loss": 1.0306, + "num_input_tokens_seen": 98656672, + "step": 6131 + }, + { + "epoch": 0.42953584281173657, + "grad_norm": 3.7640228271484375, + "learning_rate": 5.7086739054290716e-05, + "loss": 1.061, + "num_input_tokens_seen": 98673056, + "step": 6132 + }, + { + "epoch": 0.4296058910574658, + "grad_norm": 5.742674827575684, + "learning_rate": 5.707974080560421e-05, + "loss": 1.0773, + "num_input_tokens_seen": 98688400, + "step": 6133 + }, + { + "epoch": 0.4296759393031951, + "grad_norm": 4.938521862030029, + "learning_rate": 5.707274255691769e-05, + "loss": 0.9877, + "num_input_tokens_seen": 98703304, + "step": 6134 + }, + { + "epoch": 0.42974598754892435, + "grad_norm": 3.7322773933410645, + "learning_rate": 5.7065744308231175e-05, + "loss": 0.9787, + "num_input_tokens_seen": 98717536, + "step": 6135 + }, + { + "epoch": 0.42981603579465355, + "grad_norm": 3.741265296936035, + "learning_rate": 5.7058746059544663e-05, + "loss": 1.1105, + "num_input_tokens_seen": 98733632, + "step": 6136 + }, + { + "epoch": 0.4298860840403828, + "grad_norm": 3.9021074771881104, + "learning_rate": 5.7051747810858145e-05, + "loss": 0.9721, + "num_input_tokens_seen": 98749088, + "step": 6137 + }, + { + "epoch": 0.42995613228611207, + "grad_norm": 4.327329635620117, + "learning_rate": 5.704474956217163e-05, + "loss": 1.0862, + "num_input_tokens_seen": 98765328, + "step": 6138 + }, + { + "epoch": 0.43002618053184133, + "grad_norm": 4.335643768310547, + "learning_rate": 5.703775131348512e-05, + "loss": 1.2386, + "num_input_tokens_seen": 98780744, + "step": 6139 + }, + { + "epoch": 0.43009622877757053, + "grad_norm": 4.66419792175293, + "learning_rate": 5.7030753064798604e-05, + "loss": 1.0696, + "num_input_tokens_seen": 98797128, + "step": 6140 + }, + { + "epoch": 0.4301662770232998, + "grad_norm": 4.208861351013184, + "learning_rate": 5.702375481611207e-05, + "loss": 0.9743, + "num_input_tokens_seen": 98812776, + "step": 6141 + }, + { + "epoch": 0.43023632526902905, + "grad_norm": 4.90700626373291, + "learning_rate": 5.701675656742557e-05, + "loss": 0.9744, + "num_input_tokens_seen": 98829160, + "step": 6142 + }, + { + "epoch": 0.4303063735147583, + "grad_norm": 3.942166805267334, + "learning_rate": 5.7009758318739056e-05, + "loss": 1.0032, + "num_input_tokens_seen": 98845544, + "step": 6143 + }, + { + "epoch": 0.4303764217604875, + "grad_norm": 4.919578552246094, + "learning_rate": 5.700276007005254e-05, + "loss": 1.0218, + "num_input_tokens_seen": 98861928, + "step": 6144 + }, + { + "epoch": 0.4304464700062168, + "grad_norm": 3.6429073810577393, + "learning_rate": 5.699576182136602e-05, + "loss": 1.1052, + "num_input_tokens_seen": 98878288, + "step": 6145 + }, + { + "epoch": 0.43051651825194603, + "grad_norm": 4.227152347564697, + "learning_rate": 5.6988763572679515e-05, + "loss": 1.059, + "num_input_tokens_seen": 98893816, + "step": 6146 + }, + { + "epoch": 0.4305865664976753, + "grad_norm": 4.016188144683838, + "learning_rate": 5.6981765323993e-05, + "loss": 0.9898, + "num_input_tokens_seen": 98909968, + "step": 6147 + }, + { + "epoch": 0.4306566147434045, + "grad_norm": 4.0402069091796875, + "learning_rate": 5.697476707530648e-05, + "loss": 1.1366, + "num_input_tokens_seen": 98926352, + "step": 6148 + }, + { + "epoch": 0.43072666298913376, + "grad_norm": 5.771969318389893, + "learning_rate": 5.6967768826619974e-05, + "loss": 0.9641, + "num_input_tokens_seen": 98941512, + "step": 6149 + }, + { + "epoch": 0.430796711234863, + "grad_norm": 4.444697856903076, + "learning_rate": 5.6960770577933456e-05, + "loss": 1.1114, + "num_input_tokens_seen": 98957624, + "step": 6150 + }, + { + "epoch": 0.4308667594805923, + "grad_norm": 3.386268377304077, + "learning_rate": 5.695377232924694e-05, + "loss": 0.9552, + "num_input_tokens_seen": 98974008, + "step": 6151 + }, + { + "epoch": 0.4309368077263215, + "grad_norm": 3.950138807296753, + "learning_rate": 5.694677408056042e-05, + "loss": 1.0048, + "num_input_tokens_seen": 98990392, + "step": 6152 + }, + { + "epoch": 0.43100685597205074, + "grad_norm": 3.7290585041046143, + "learning_rate": 5.693977583187392e-05, + "loss": 1.103, + "num_input_tokens_seen": 99006776, + "step": 6153 + }, + { + "epoch": 0.43107690421778, + "grad_norm": 3.3678364753723145, + "learning_rate": 5.693277758318739e-05, + "loss": 0.7396, + "num_input_tokens_seen": 99022912, + "step": 6154 + }, + { + "epoch": 0.43114695246350926, + "grad_norm": 5.882314682006836, + "learning_rate": 5.6925779334500886e-05, + "loss": 1.0949, + "num_input_tokens_seen": 99038208, + "step": 6155 + }, + { + "epoch": 0.43121700070923846, + "grad_norm": 4.231525421142578, + "learning_rate": 5.691878108581437e-05, + "loss": 1.0437, + "num_input_tokens_seen": 99053496, + "step": 6156 + }, + { + "epoch": 0.4312870489549677, + "grad_norm": 4.864506721496582, + "learning_rate": 5.691178283712785e-05, + "loss": 1.0978, + "num_input_tokens_seen": 99069600, + "step": 6157 + }, + { + "epoch": 0.431357097200697, + "grad_norm": 6.483276844024658, + "learning_rate": 5.690478458844133e-05, + "loss": 0.9262, + "num_input_tokens_seen": 99085456, + "step": 6158 + }, + { + "epoch": 0.43142714544642624, + "grad_norm": 3.830292224884033, + "learning_rate": 5.689778633975482e-05, + "loss": 1.1837, + "num_input_tokens_seen": 99101840, + "step": 6159 + }, + { + "epoch": 0.43149719369215545, + "grad_norm": 4.078514099121094, + "learning_rate": 5.68907880910683e-05, + "loss": 0.9916, + "num_input_tokens_seen": 99118224, + "step": 6160 + }, + { + "epoch": 0.4315672419378847, + "grad_norm": 4.1833648681640625, + "learning_rate": 5.688378984238178e-05, + "loss": 1.2243, + "num_input_tokens_seen": 99134608, + "step": 6161 + }, + { + "epoch": 0.43163729018361396, + "grad_norm": 4.761826515197754, + "learning_rate": 5.6876791593695265e-05, + "loss": 1.1017, + "num_input_tokens_seen": 99150992, + "step": 6162 + }, + { + "epoch": 0.4317073384293432, + "grad_norm": 4.992908954620361, + "learning_rate": 5.686979334500877e-05, + "loss": 0.9658, + "num_input_tokens_seen": 99167320, + "step": 6163 + }, + { + "epoch": 0.4317773866750725, + "grad_norm": 3.8283936977386475, + "learning_rate": 5.686279509632224e-05, + "loss": 1.0521, + "num_input_tokens_seen": 99183512, + "step": 6164 + }, + { + "epoch": 0.4318474349208017, + "grad_norm": 3.4508893489837646, + "learning_rate": 5.685579684763573e-05, + "loss": 0.9632, + "num_input_tokens_seen": 99199896, + "step": 6165 + }, + { + "epoch": 0.43191748316653095, + "grad_norm": 4.4798431396484375, + "learning_rate": 5.684879859894921e-05, + "loss": 0.8848, + "num_input_tokens_seen": 99216280, + "step": 6166 + }, + { + "epoch": 0.4319875314122602, + "grad_norm": 5.227555751800537, + "learning_rate": 5.6841800350262694e-05, + "loss": 0.9621, + "num_input_tokens_seen": 99230656, + "step": 6167 + }, + { + "epoch": 0.43205757965798947, + "grad_norm": 5.358756065368652, + "learning_rate": 5.6834802101576176e-05, + "loss": 1.0361, + "num_input_tokens_seen": 99246864, + "step": 6168 + }, + { + "epoch": 0.43212762790371867, + "grad_norm": 4.224287986755371, + "learning_rate": 5.682780385288967e-05, + "loss": 1.0804, + "num_input_tokens_seen": 99263248, + "step": 6169 + }, + { + "epoch": 0.43219767614944793, + "grad_norm": 5.748126983642578, + "learning_rate": 5.682080560420317e-05, + "loss": 1.0353, + "num_input_tokens_seen": 99279632, + "step": 6170 + }, + { + "epoch": 0.4322677243951772, + "grad_norm": 4.036735534667969, + "learning_rate": 5.6813807355516635e-05, + "loss": 1.0776, + "num_input_tokens_seen": 99296016, + "step": 6171 + }, + { + "epoch": 0.43233777264090645, + "grad_norm": 4.133121013641357, + "learning_rate": 5.680680910683013e-05, + "loss": 1.0796, + "num_input_tokens_seen": 99312400, + "step": 6172 + }, + { + "epoch": 0.43240782088663565, + "grad_norm": 4.70187520980835, + "learning_rate": 5.679981085814362e-05, + "loss": 1.1069, + "num_input_tokens_seen": 99328504, + "step": 6173 + }, + { + "epoch": 0.4324778691323649, + "grad_norm": 3.515967845916748, + "learning_rate": 5.6792812609457094e-05, + "loss": 1.0462, + "num_input_tokens_seen": 99344424, + "step": 6174 + }, + { + "epoch": 0.43254791737809417, + "grad_norm": 5.408679962158203, + "learning_rate": 5.6785814360770576e-05, + "loss": 1.0638, + "num_input_tokens_seen": 99360056, + "step": 6175 + }, + { + "epoch": 0.43261796562382343, + "grad_norm": 3.7438695430755615, + "learning_rate": 5.677881611208408e-05, + "loss": 1.0115, + "num_input_tokens_seen": 99375920, + "step": 6176 + }, + { + "epoch": 0.43268801386955263, + "grad_norm": 4.337923526763916, + "learning_rate": 5.677181786339756e-05, + "loss": 1.1368, + "num_input_tokens_seen": 99392040, + "step": 6177 + }, + { + "epoch": 0.4327580621152819, + "grad_norm": 6.04982852935791, + "learning_rate": 5.676481961471103e-05, + "loss": 1.1253, + "num_input_tokens_seen": 99407592, + "step": 6178 + }, + { + "epoch": 0.43282811036101115, + "grad_norm": 3.9268686771392822, + "learning_rate": 5.6757821366024524e-05, + "loss": 1.1957, + "num_input_tokens_seen": 99423976, + "step": 6179 + }, + { + "epoch": 0.4328981586067404, + "grad_norm": 4.466431140899658, + "learning_rate": 5.675082311733801e-05, + "loss": 0.8999, + "num_input_tokens_seen": 99440360, + "step": 6180 + }, + { + "epoch": 0.4329682068524696, + "grad_norm": 4.166913032531738, + "learning_rate": 5.674382486865149e-05, + "loss": 1.0157, + "num_input_tokens_seen": 99456744, + "step": 6181 + }, + { + "epoch": 0.4330382550981989, + "grad_norm": 3.525611400604248, + "learning_rate": 5.6736826619964976e-05, + "loss": 1.1182, + "num_input_tokens_seen": 99473080, + "step": 6182 + }, + { + "epoch": 0.43310830334392814, + "grad_norm": 6.099409103393555, + "learning_rate": 5.672982837127847e-05, + "loss": 0.9801, + "num_input_tokens_seen": 99489088, + "step": 6183 + }, + { + "epoch": 0.4331783515896574, + "grad_norm": 3.5886685848236084, + "learning_rate": 5.672283012259194e-05, + "loss": 1.0214, + "num_input_tokens_seen": 99505248, + "step": 6184 + }, + { + "epoch": 0.4332483998353866, + "grad_norm": 3.5279197692871094, + "learning_rate": 5.671583187390542e-05, + "loss": 0.9724, + "num_input_tokens_seen": 99521632, + "step": 6185 + }, + { + "epoch": 0.43331844808111586, + "grad_norm": 4.606603622436523, + "learning_rate": 5.670883362521892e-05, + "loss": 0.8623, + "num_input_tokens_seen": 99537336, + "step": 6186 + }, + { + "epoch": 0.4333884963268451, + "grad_norm": 3.5966908931732178, + "learning_rate": 5.6701835376532405e-05, + "loss": 0.9549, + "num_input_tokens_seen": 99553720, + "step": 6187 + }, + { + "epoch": 0.4334585445725744, + "grad_norm": 3.981893301010132, + "learning_rate": 5.669483712784588e-05, + "loss": 0.923, + "num_input_tokens_seen": 99569384, + "step": 6188 + }, + { + "epoch": 0.4335285928183036, + "grad_norm": 4.06168270111084, + "learning_rate": 5.668783887915937e-05, + "loss": 1.12, + "num_input_tokens_seen": 99585768, + "step": 6189 + }, + { + "epoch": 0.43359864106403284, + "grad_norm": 4.476738929748535, + "learning_rate": 5.6680840630472864e-05, + "loss": 1.0997, + "num_input_tokens_seen": 99602152, + "step": 6190 + }, + { + "epoch": 0.4336686893097621, + "grad_norm": 7.592894554138184, + "learning_rate": 5.667384238178633e-05, + "loss": 1.1001, + "num_input_tokens_seen": 99618536, + "step": 6191 + }, + { + "epoch": 0.43373873755549136, + "grad_norm": 3.4367337226867676, + "learning_rate": 5.666684413309983e-05, + "loss": 0.8883, + "num_input_tokens_seen": 99634920, + "step": 6192 + }, + { + "epoch": 0.43380878580122056, + "grad_norm": 3.8736166954040527, + "learning_rate": 5.665984588441332e-05, + "loss": 1.1062, + "num_input_tokens_seen": 99651304, + "step": 6193 + }, + { + "epoch": 0.4338788340469498, + "grad_norm": 4.0018463134765625, + "learning_rate": 5.6652847635726805e-05, + "loss": 1.15, + "num_input_tokens_seen": 99667688, + "step": 6194 + }, + { + "epoch": 0.4339488822926791, + "grad_norm": 4.243009090423584, + "learning_rate": 5.664584938704029e-05, + "loss": 1.0738, + "num_input_tokens_seen": 99684072, + "step": 6195 + }, + { + "epoch": 0.43401893053840834, + "grad_norm": 5.533624172210693, + "learning_rate": 5.663885113835377e-05, + "loss": 0.9901, + "num_input_tokens_seen": 99698440, + "step": 6196 + }, + { + "epoch": 0.4340889787841376, + "grad_norm": 3.9158618450164795, + "learning_rate": 5.663185288966726e-05, + "loss": 0.9884, + "num_input_tokens_seen": 99714824, + "step": 6197 + }, + { + "epoch": 0.4341590270298668, + "grad_norm": 6.666274070739746, + "learning_rate": 5.662485464098073e-05, + "loss": 1.3152, + "num_input_tokens_seen": 99731208, + "step": 6198 + }, + { + "epoch": 0.43422907527559607, + "grad_norm": 4.02492618560791, + "learning_rate": 5.6617856392294235e-05, + "loss": 1.014, + "num_input_tokens_seen": 99747592, + "step": 6199 + }, + { + "epoch": 0.4342991235213253, + "grad_norm": 3.4257941246032715, + "learning_rate": 5.6610858143607716e-05, + "loss": 0.9081, + "num_input_tokens_seen": 99763656, + "step": 6200 + }, + { + "epoch": 0.4342991235213253, + "eval_loss": 1.126607060432434, + "eval_runtime": 0.1953, + "eval_samples_per_second": 5.119, + "eval_steps_per_second": 5.119, + "num_input_tokens_seen": 99763656, + "step": 6200 + }, + { + "epoch": 0.4343691717670546, + "grad_norm": 3.7283267974853516, + "learning_rate": 5.66038598949212e-05, + "loss": 0.8829, + "num_input_tokens_seen": 99779416, + "step": 6201 + }, + { + "epoch": 0.4344392200127838, + "grad_norm": 5.657198429107666, + "learning_rate": 5.659686164623468e-05, + "loss": 1.1111, + "num_input_tokens_seen": 99795376, + "step": 6202 + }, + { + "epoch": 0.43450926825851305, + "grad_norm": 4.102888107299805, + "learning_rate": 5.658986339754817e-05, + "loss": 1.1068, + "num_input_tokens_seen": 99811760, + "step": 6203 + }, + { + "epoch": 0.4345793165042423, + "grad_norm": 5.449219226837158, + "learning_rate": 5.658286514886165e-05, + "loss": 0.8439, + "num_input_tokens_seen": 99827264, + "step": 6204 + }, + { + "epoch": 0.43464936474997157, + "grad_norm": 4.1982197761535645, + "learning_rate": 5.657586690017513e-05, + "loss": 1.077, + "num_input_tokens_seen": 99843648, + "step": 6205 + }, + { + "epoch": 0.43471941299570077, + "grad_norm": 4.246870517730713, + "learning_rate": 5.6568868651488614e-05, + "loss": 1.0653, + "num_input_tokens_seen": 99859800, + "step": 6206 + }, + { + "epoch": 0.43478946124143003, + "grad_norm": 7.575351238250732, + "learning_rate": 5.656187040280211e-05, + "loss": 1.231, + "num_input_tokens_seen": 99875240, + "step": 6207 + }, + { + "epoch": 0.4348595094871593, + "grad_norm": 4.253138065338135, + "learning_rate": 5.655487215411558e-05, + "loss": 1.1178, + "num_input_tokens_seen": 99891624, + "step": 6208 + }, + { + "epoch": 0.43492955773288855, + "grad_norm": 3.5073490142822266, + "learning_rate": 5.654787390542908e-05, + "loss": 1.0251, + "num_input_tokens_seen": 99908008, + "step": 6209 + }, + { + "epoch": 0.43499960597861775, + "grad_norm": 3.669361114501953, + "learning_rate": 5.654087565674256e-05, + "loss": 1.0233, + "num_input_tokens_seen": 99924392, + "step": 6210 + }, + { + "epoch": 0.435069654224347, + "grad_norm": 4.25203800201416, + "learning_rate": 5.6533877408056043e-05, + "loss": 0.972, + "num_input_tokens_seen": 99940776, + "step": 6211 + }, + { + "epoch": 0.4351397024700763, + "grad_norm": 3.7570602893829346, + "learning_rate": 5.6526879159369525e-05, + "loss": 1.086, + "num_input_tokens_seen": 99956488, + "step": 6212 + }, + { + "epoch": 0.43520975071580553, + "grad_norm": 3.47245454788208, + "learning_rate": 5.651988091068301e-05, + "loss": 0.8526, + "num_input_tokens_seen": 99972792, + "step": 6213 + }, + { + "epoch": 0.43527979896153474, + "grad_norm": 4.902298927307129, + "learning_rate": 5.65128826619965e-05, + "loss": 1.1583, + "num_input_tokens_seen": 99988144, + "step": 6214 + }, + { + "epoch": 0.435349847207264, + "grad_norm": 3.796644926071167, + "learning_rate": 5.6505884413309984e-05, + "loss": 1.0739, + "num_input_tokens_seen": 100004528, + "step": 6215 + }, + { + "epoch": 0.43541989545299326, + "grad_norm": 4.4678425788879395, + "learning_rate": 5.649888616462348e-05, + "loss": 1.0843, + "num_input_tokens_seen": 100020448, + "step": 6216 + }, + { + "epoch": 0.4354899436987225, + "grad_norm": 5.181003570556641, + "learning_rate": 5.649188791593696e-05, + "loss": 0.9784, + "num_input_tokens_seen": 100036832, + "step": 6217 + }, + { + "epoch": 0.4355599919444517, + "grad_norm": 3.4864094257354736, + "learning_rate": 5.648488966725044e-05, + "loss": 1.1016, + "num_input_tokens_seen": 100052656, + "step": 6218 + }, + { + "epoch": 0.435630040190181, + "grad_norm": 3.6500463485717773, + "learning_rate": 5.647789141856393e-05, + "loss": 0.9406, + "num_input_tokens_seen": 100069040, + "step": 6219 + }, + { + "epoch": 0.43570008843591024, + "grad_norm": 4.463146686553955, + "learning_rate": 5.647089316987743e-05, + "loss": 1.0597, + "num_input_tokens_seen": 100085248, + "step": 6220 + }, + { + "epoch": 0.4357701366816395, + "grad_norm": 4.013953685760498, + "learning_rate": 5.6463894921190895e-05, + "loss": 0.9562, + "num_input_tokens_seen": 100100600, + "step": 6221 + }, + { + "epoch": 0.4358401849273687, + "grad_norm": 4.549919128417969, + "learning_rate": 5.645689667250438e-05, + "loss": 0.9911, + "num_input_tokens_seen": 100116368, + "step": 6222 + }, + { + "epoch": 0.43591023317309796, + "grad_norm": 4.227685451507568, + "learning_rate": 5.644989842381787e-05, + "loss": 1.0951, + "num_input_tokens_seen": 100132752, + "step": 6223 + }, + { + "epoch": 0.4359802814188272, + "grad_norm": 3.699406147003174, + "learning_rate": 5.6442900175131354e-05, + "loss": 0.9597, + "num_input_tokens_seen": 100149048, + "step": 6224 + }, + { + "epoch": 0.4360503296645565, + "grad_norm": 4.6700944900512695, + "learning_rate": 5.6435901926444836e-05, + "loss": 1.1303, + "num_input_tokens_seen": 100165432, + "step": 6225 + }, + { + "epoch": 0.4361203779102857, + "grad_norm": 3.524517774581909, + "learning_rate": 5.6428903677758325e-05, + "loss": 1.002, + "num_input_tokens_seen": 100181672, + "step": 6226 + }, + { + "epoch": 0.43619042615601494, + "grad_norm": 3.6368181705474854, + "learning_rate": 5.642190542907181e-05, + "loss": 0.9204, + "num_input_tokens_seen": 100198056, + "step": 6227 + }, + { + "epoch": 0.4362604744017442, + "grad_norm": 4.629672050476074, + "learning_rate": 5.641490718038529e-05, + "loss": 1.0703, + "num_input_tokens_seen": 100214440, + "step": 6228 + }, + { + "epoch": 0.43633052264747346, + "grad_norm": 4.120620250701904, + "learning_rate": 5.640790893169877e-05, + "loss": 0.9761, + "num_input_tokens_seen": 100230824, + "step": 6229 + }, + { + "epoch": 0.43640057089320267, + "grad_norm": 3.8496460914611816, + "learning_rate": 5.640091068301227e-05, + "loss": 1.0973, + "num_input_tokens_seen": 100245920, + "step": 6230 + }, + { + "epoch": 0.4364706191389319, + "grad_norm": 3.8419101238250732, + "learning_rate": 5.639391243432575e-05, + "loss": 1.1615, + "num_input_tokens_seen": 100262184, + "step": 6231 + }, + { + "epoch": 0.4365406673846612, + "grad_norm": 4.283138751983643, + "learning_rate": 5.638691418563923e-05, + "loss": 0.999, + "num_input_tokens_seen": 100278568, + "step": 6232 + }, + { + "epoch": 0.43661071563039044, + "grad_norm": 3.7390506267547607, + "learning_rate": 5.637991593695272e-05, + "loss": 0.9675, + "num_input_tokens_seen": 100294952, + "step": 6233 + }, + { + "epoch": 0.4366807638761197, + "grad_norm": 3.894780158996582, + "learning_rate": 5.63729176882662e-05, + "loss": 0.996, + "num_input_tokens_seen": 100310536, + "step": 6234 + }, + { + "epoch": 0.4367508121218489, + "grad_norm": 5.446288108825684, + "learning_rate": 5.636591943957968e-05, + "loss": 0.973, + "num_input_tokens_seen": 100326336, + "step": 6235 + }, + { + "epoch": 0.43682086036757817, + "grad_norm": 5.487906455993652, + "learning_rate": 5.635892119089318e-05, + "loss": 0.9008, + "num_input_tokens_seen": 100342720, + "step": 6236 + }, + { + "epoch": 0.4368909086133074, + "grad_norm": 4.296425819396973, + "learning_rate": 5.635192294220667e-05, + "loss": 1.0326, + "num_input_tokens_seen": 100358912, + "step": 6237 + }, + { + "epoch": 0.4369609568590367, + "grad_norm": 4.339141845703125, + "learning_rate": 5.634492469352014e-05, + "loss": 1.1232, + "num_input_tokens_seen": 100374952, + "step": 6238 + }, + { + "epoch": 0.4370310051047659, + "grad_norm": 4.520789623260498, + "learning_rate": 5.633792644483362e-05, + "loss": 1.1701, + "num_input_tokens_seen": 100390712, + "step": 6239 + }, + { + "epoch": 0.43710105335049515, + "grad_norm": 3.7790653705596924, + "learning_rate": 5.633092819614712e-05, + "loss": 0.9953, + "num_input_tokens_seen": 100406936, + "step": 6240 + }, + { + "epoch": 0.4371711015962244, + "grad_norm": 3.7649457454681396, + "learning_rate": 5.63239299474606e-05, + "loss": 1.0837, + "num_input_tokens_seen": 100423320, + "step": 6241 + }, + { + "epoch": 0.43724114984195367, + "grad_norm": 5.27927827835083, + "learning_rate": 5.631693169877408e-05, + "loss": 1.1157, + "num_input_tokens_seen": 100439704, + "step": 6242 + }, + { + "epoch": 0.4373111980876829, + "grad_norm": 4.266254901885986, + "learning_rate": 5.630993345008757e-05, + "loss": 1.1607, + "num_input_tokens_seen": 100455008, + "step": 6243 + }, + { + "epoch": 0.43738124633341213, + "grad_norm": 4.195004940032959, + "learning_rate": 5.6302935201401065e-05, + "loss": 1.0163, + "num_input_tokens_seen": 100471392, + "step": 6244 + }, + { + "epoch": 0.4374512945791414, + "grad_norm": 4.85727596282959, + "learning_rate": 5.629593695271455e-05, + "loss": 1.0097, + "num_input_tokens_seen": 100486832, + "step": 6245 + }, + { + "epoch": 0.43752134282487065, + "grad_norm": 3.865466594696045, + "learning_rate": 5.628893870402803e-05, + "loss": 1.1528, + "num_input_tokens_seen": 100502496, + "step": 6246 + }, + { + "epoch": 0.43759139107059986, + "grad_norm": 4.107895851135254, + "learning_rate": 5.628194045534152e-05, + "loss": 1.0452, + "num_input_tokens_seen": 100518816, + "step": 6247 + }, + { + "epoch": 0.4376614393163291, + "grad_norm": 5.402096271514893, + "learning_rate": 5.627494220665499e-05, + "loss": 1.0368, + "num_input_tokens_seen": 100535200, + "step": 6248 + }, + { + "epoch": 0.4377314875620584, + "grad_norm": 4.255467414855957, + "learning_rate": 5.6267943957968474e-05, + "loss": 1.268, + "num_input_tokens_seen": 100551456, + "step": 6249 + }, + { + "epoch": 0.43780153580778763, + "grad_norm": 3.4338836669921875, + "learning_rate": 5.626094570928196e-05, + "loss": 1.0149, + "num_input_tokens_seen": 100567840, + "step": 6250 + }, + { + "epoch": 0.43787158405351684, + "grad_norm": 4.445374488830566, + "learning_rate": 5.6253947460595445e-05, + "loss": 1.2527, + "num_input_tokens_seen": 100583848, + "step": 6251 + }, + { + "epoch": 0.4379416322992461, + "grad_norm": 4.0756072998046875, + "learning_rate": 5.6246949211908927e-05, + "loss": 0.8848, + "num_input_tokens_seen": 100600000, + "step": 6252 + }, + { + "epoch": 0.43801168054497536, + "grad_norm": 3.910945177078247, + "learning_rate": 5.623995096322242e-05, + "loss": 0.9661, + "num_input_tokens_seen": 100615424, + "step": 6253 + }, + { + "epoch": 0.4380817287907046, + "grad_norm": 3.878586769104004, + "learning_rate": 5.623295271453591e-05, + "loss": 1.0631, + "num_input_tokens_seen": 100631808, + "step": 6254 + }, + { + "epoch": 0.4381517770364338, + "grad_norm": 4.295658111572266, + "learning_rate": 5.622595446584939e-05, + "loss": 1.2368, + "num_input_tokens_seen": 100647344, + "step": 6255 + }, + { + "epoch": 0.4382218252821631, + "grad_norm": 3.88688063621521, + "learning_rate": 5.621895621716287e-05, + "loss": 1.0371, + "num_input_tokens_seen": 100663464, + "step": 6256 + }, + { + "epoch": 0.43829187352789234, + "grad_norm": 3.6060731410980225, + "learning_rate": 5.621195796847637e-05, + "loss": 1.1189, + "num_input_tokens_seen": 100679848, + "step": 6257 + }, + { + "epoch": 0.4383619217736216, + "grad_norm": 4.274289608001709, + "learning_rate": 5.620495971978984e-05, + "loss": 0.9809, + "num_input_tokens_seen": 100695760, + "step": 6258 + }, + { + "epoch": 0.4384319700193508, + "grad_norm": 4.854022979736328, + "learning_rate": 5.619796147110332e-05, + "loss": 0.8043, + "num_input_tokens_seen": 100711144, + "step": 6259 + }, + { + "epoch": 0.43850201826508006, + "grad_norm": 3.9589812755584717, + "learning_rate": 5.6190963222416815e-05, + "loss": 1.202, + "num_input_tokens_seen": 100727088, + "step": 6260 + }, + { + "epoch": 0.4385720665108093, + "grad_norm": 5.07575798034668, + "learning_rate": 5.618396497373031e-05, + "loss": 1.0708, + "num_input_tokens_seen": 100743232, + "step": 6261 + }, + { + "epoch": 0.4386421147565386, + "grad_norm": 3.557736396789551, + "learning_rate": 5.617696672504379e-05, + "loss": 1.085, + "num_input_tokens_seen": 100759616, + "step": 6262 + }, + { + "epoch": 0.4387121630022678, + "grad_norm": 4.200889587402344, + "learning_rate": 5.6169968476357274e-05, + "loss": 0.9893, + "num_input_tokens_seen": 100775176, + "step": 6263 + }, + { + "epoch": 0.43878221124799704, + "grad_norm": 4.214064121246338, + "learning_rate": 5.616297022767076e-05, + "loss": 1.2024, + "num_input_tokens_seen": 100791232, + "step": 6264 + }, + { + "epoch": 0.4388522594937263, + "grad_norm": 5.372243881225586, + "learning_rate": 5.6155971978984244e-05, + "loss": 1.0551, + "num_input_tokens_seen": 100807496, + "step": 6265 + }, + { + "epoch": 0.43892230773945556, + "grad_norm": 4.754215717315674, + "learning_rate": 5.614897373029771e-05, + "loss": 1.1895, + "num_input_tokens_seen": 100823880, + "step": 6266 + }, + { + "epoch": 0.4389923559851848, + "grad_norm": 3.3892760276794434, + "learning_rate": 5.614197548161121e-05, + "loss": 1.0086, + "num_input_tokens_seen": 100839928, + "step": 6267 + }, + { + "epoch": 0.439062404230914, + "grad_norm": 4.554326057434082, + "learning_rate": 5.61349772329247e-05, + "loss": 0.993, + "num_input_tokens_seen": 100856312, + "step": 6268 + }, + { + "epoch": 0.4391324524766433, + "grad_norm": 4.118383407592773, + "learning_rate": 5.6127978984238185e-05, + "loss": 0.7906, + "num_input_tokens_seen": 100872696, + "step": 6269 + }, + { + "epoch": 0.43920250072237255, + "grad_norm": 4.403461456298828, + "learning_rate": 5.612098073555167e-05, + "loss": 1.1391, + "num_input_tokens_seen": 100888808, + "step": 6270 + }, + { + "epoch": 0.4392725489681018, + "grad_norm": 3.841547966003418, + "learning_rate": 5.6113982486865156e-05, + "loss": 1.0572, + "num_input_tokens_seen": 100905192, + "step": 6271 + }, + { + "epoch": 0.439342597213831, + "grad_norm": 4.147423267364502, + "learning_rate": 5.610698423817864e-05, + "loss": 1.1632, + "num_input_tokens_seen": 100920552, + "step": 6272 + }, + { + "epoch": 0.43941264545956027, + "grad_norm": 4.717578887939453, + "learning_rate": 5.609998598949212e-05, + "loss": 1.1097, + "num_input_tokens_seen": 100936656, + "step": 6273 + }, + { + "epoch": 0.43948269370528953, + "grad_norm": 5.503146171569824, + "learning_rate": 5.6092987740805615e-05, + "loss": 1.0768, + "num_input_tokens_seen": 100952952, + "step": 6274 + }, + { + "epoch": 0.4395527419510188, + "grad_norm": 3.6871585845947266, + "learning_rate": 5.6085989492119096e-05, + "loss": 0.9386, + "num_input_tokens_seen": 100969152, + "step": 6275 + }, + { + "epoch": 0.439622790196748, + "grad_norm": 3.751429796218872, + "learning_rate": 5.6078991243432565e-05, + "loss": 1.0478, + "num_input_tokens_seen": 100985520, + "step": 6276 + }, + { + "epoch": 0.43969283844247725, + "grad_norm": 4.053867340087891, + "learning_rate": 5.607199299474606e-05, + "loss": 1.1341, + "num_input_tokens_seen": 101001336, + "step": 6277 + }, + { + "epoch": 0.4397628866882065, + "grad_norm": 3.786154270172119, + "learning_rate": 5.606499474605955e-05, + "loss": 1.077, + "num_input_tokens_seen": 101017240, + "step": 6278 + }, + { + "epoch": 0.43983293493393577, + "grad_norm": 3.516772747039795, + "learning_rate": 5.605799649737303e-05, + "loss": 0.9854, + "num_input_tokens_seen": 101033264, + "step": 6279 + }, + { + "epoch": 0.439902983179665, + "grad_norm": 4.568872928619385, + "learning_rate": 5.605099824868651e-05, + "loss": 1.1444, + "num_input_tokens_seen": 101049648, + "step": 6280 + }, + { + "epoch": 0.43997303142539423, + "grad_norm": 4.430622577667236, + "learning_rate": 5.604400000000001e-05, + "loss": 1.0675, + "num_input_tokens_seen": 101066032, + "step": 6281 + }, + { + "epoch": 0.4400430796711235, + "grad_norm": 5.061071872711182, + "learning_rate": 5.603700175131349e-05, + "loss": 1.0891, + "num_input_tokens_seen": 101082416, + "step": 6282 + }, + { + "epoch": 0.44011312791685275, + "grad_norm": 3.696657180786133, + "learning_rate": 5.603000350262696e-05, + "loss": 0.9781, + "num_input_tokens_seen": 101098800, + "step": 6283 + }, + { + "epoch": 0.44018317616258196, + "grad_norm": 4.1430840492248535, + "learning_rate": 5.6023005253940467e-05, + "loss": 1.1299, + "num_input_tokens_seen": 101114672, + "step": 6284 + }, + { + "epoch": 0.4402532244083112, + "grad_norm": 4.862906455993652, + "learning_rate": 5.601600700525395e-05, + "loss": 0.8972, + "num_input_tokens_seen": 101130632, + "step": 6285 + }, + { + "epoch": 0.4403232726540405, + "grad_norm": 4.017249584197998, + "learning_rate": 5.600900875656743e-05, + "loss": 0.9911, + "num_input_tokens_seen": 101145920, + "step": 6286 + }, + { + "epoch": 0.44039332089976974, + "grad_norm": 4.797904014587402, + "learning_rate": 5.600201050788091e-05, + "loss": 1.106, + "num_input_tokens_seen": 101162296, + "step": 6287 + }, + { + "epoch": 0.44046336914549894, + "grad_norm": 3.685084342956543, + "learning_rate": 5.59950122591944e-05, + "loss": 1.0434, + "num_input_tokens_seen": 101178048, + "step": 6288 + }, + { + "epoch": 0.4405334173912282, + "grad_norm": 4.259701728820801, + "learning_rate": 5.598801401050788e-05, + "loss": 1.0203, + "num_input_tokens_seen": 101194080, + "step": 6289 + }, + { + "epoch": 0.44060346563695746, + "grad_norm": 3.961292266845703, + "learning_rate": 5.598101576182138e-05, + "loss": 0.9682, + "num_input_tokens_seen": 101209000, + "step": 6290 + }, + { + "epoch": 0.4406735138826867, + "grad_norm": 3.863640308380127, + "learning_rate": 5.597401751313486e-05, + "loss": 1.0203, + "num_input_tokens_seen": 101224184, + "step": 6291 + }, + { + "epoch": 0.4407435621284159, + "grad_norm": 6.002960681915283, + "learning_rate": 5.596701926444834e-05, + "loss": 1.3855, + "num_input_tokens_seen": 101240568, + "step": 6292 + }, + { + "epoch": 0.4408136103741452, + "grad_norm": 3.870892286300659, + "learning_rate": 5.596002101576182e-05, + "loss": 1.0754, + "num_input_tokens_seen": 101256952, + "step": 6293 + }, + { + "epoch": 0.44088365861987444, + "grad_norm": 3.654907703399658, + "learning_rate": 5.5953022767075305e-05, + "loss": 1.1244, + "num_input_tokens_seen": 101273192, + "step": 6294 + }, + { + "epoch": 0.4409537068656037, + "grad_norm": 3.2243661880493164, + "learning_rate": 5.5946024518388794e-05, + "loss": 0.9084, + "num_input_tokens_seen": 101289488, + "step": 6295 + }, + { + "epoch": 0.4410237551113329, + "grad_norm": 3.947880983352661, + "learning_rate": 5.5939026269702275e-05, + "loss": 1.1628, + "num_input_tokens_seen": 101305064, + "step": 6296 + }, + { + "epoch": 0.44109380335706216, + "grad_norm": 3.546065092086792, + "learning_rate": 5.593202802101576e-05, + "loss": 0.9669, + "num_input_tokens_seen": 101321448, + "step": 6297 + }, + { + "epoch": 0.4411638516027914, + "grad_norm": 4.489794731140137, + "learning_rate": 5.592502977232925e-05, + "loss": 1.1958, + "num_input_tokens_seen": 101337832, + "step": 6298 + }, + { + "epoch": 0.4412338998485207, + "grad_norm": 3.9517438411712646, + "learning_rate": 5.5918031523642734e-05, + "loss": 1.1256, + "num_input_tokens_seen": 101354216, + "step": 6299 + }, + { + "epoch": 0.4413039480942499, + "grad_norm": 4.599244594573975, + "learning_rate": 5.591103327495622e-05, + "loss": 0.9712, + "num_input_tokens_seen": 101370088, + "step": 6300 + }, + { + "epoch": 0.44137399633997915, + "grad_norm": 3.753528356552124, + "learning_rate": 5.5904035026269705e-05, + "loss": 0.9998, + "num_input_tokens_seen": 101385432, + "step": 6301 + }, + { + "epoch": 0.4414440445857084, + "grad_norm": 4.569333553314209, + "learning_rate": 5.589703677758319e-05, + "loss": 1.1676, + "num_input_tokens_seen": 101401816, + "step": 6302 + }, + { + "epoch": 0.44151409283143767, + "grad_norm": 4.010447978973389, + "learning_rate": 5.589003852889667e-05, + "loss": 1.0655, + "num_input_tokens_seen": 101417272, + "step": 6303 + }, + { + "epoch": 0.4415841410771669, + "grad_norm": 5.169422626495361, + "learning_rate": 5.588304028021015e-05, + "loss": 1.3555, + "num_input_tokens_seen": 101433656, + "step": 6304 + }, + { + "epoch": 0.44165418932289613, + "grad_norm": 4.6301069259643555, + "learning_rate": 5.587604203152366e-05, + "loss": 0.929, + "num_input_tokens_seen": 101450040, + "step": 6305 + }, + { + "epoch": 0.4417242375686254, + "grad_norm": 4.814012050628662, + "learning_rate": 5.586904378283713e-05, + "loss": 1.102, + "num_input_tokens_seen": 101466320, + "step": 6306 + }, + { + "epoch": 0.44179428581435465, + "grad_norm": 4.340104579925537, + "learning_rate": 5.586204553415062e-05, + "loss": 1.1179, + "num_input_tokens_seen": 101482592, + "step": 6307 + }, + { + "epoch": 0.4418643340600839, + "grad_norm": 3.807495355606079, + "learning_rate": 5.5855047285464105e-05, + "loss": 1.1407, + "num_input_tokens_seen": 101498920, + "step": 6308 + }, + { + "epoch": 0.4419343823058131, + "grad_norm": 3.97273325920105, + "learning_rate": 5.5848049036777586e-05, + "loss": 1.1377, + "num_input_tokens_seen": 101515304, + "step": 6309 + }, + { + "epoch": 0.44200443055154237, + "grad_norm": 6.926362037658691, + "learning_rate": 5.584105078809107e-05, + "loss": 0.9045, + "num_input_tokens_seen": 101531688, + "step": 6310 + }, + { + "epoch": 0.44207447879727163, + "grad_norm": 4.482272624969482, + "learning_rate": 5.583405253940457e-05, + "loss": 1.1431, + "num_input_tokens_seen": 101547912, + "step": 6311 + }, + { + "epoch": 0.4421445270430009, + "grad_norm": 3.726999044418335, + "learning_rate": 5.582705429071805e-05, + "loss": 1.0609, + "num_input_tokens_seen": 101563640, + "step": 6312 + }, + { + "epoch": 0.4422145752887301, + "grad_norm": 4.305807113647461, + "learning_rate": 5.582005604203152e-05, + "loss": 1.0612, + "num_input_tokens_seen": 101580024, + "step": 6313 + }, + { + "epoch": 0.44228462353445935, + "grad_norm": 5.402091979980469, + "learning_rate": 5.5813057793345016e-05, + "loss": 0.9018, + "num_input_tokens_seen": 101596408, + "step": 6314 + }, + { + "epoch": 0.4423546717801886, + "grad_norm": 3.658170700073242, + "learning_rate": 5.5806059544658504e-05, + "loss": 1.1726, + "num_input_tokens_seen": 101612792, + "step": 6315 + }, + { + "epoch": 0.4424247200259179, + "grad_norm": 3.91109561920166, + "learning_rate": 5.579906129597198e-05, + "loss": 0.9991, + "num_input_tokens_seen": 101628408, + "step": 6316 + }, + { + "epoch": 0.4424947682716471, + "grad_norm": 3.9523725509643555, + "learning_rate": 5.579206304728547e-05, + "loss": 1.1404, + "num_input_tokens_seen": 101644616, + "step": 6317 + }, + { + "epoch": 0.44256481651737634, + "grad_norm": 4.591569423675537, + "learning_rate": 5.578506479859895e-05, + "loss": 0.9778, + "num_input_tokens_seen": 101660536, + "step": 6318 + }, + { + "epoch": 0.4426348647631056, + "grad_norm": 3.7487003803253174, + "learning_rate": 5.577806654991243e-05, + "loss": 1.1305, + "num_input_tokens_seen": 101676920, + "step": 6319 + }, + { + "epoch": 0.44270491300883485, + "grad_norm": 4.111825942993164, + "learning_rate": 5.5771068301225913e-05, + "loss": 1.177, + "num_input_tokens_seen": 101692904, + "step": 6320 + }, + { + "epoch": 0.44277496125456406, + "grad_norm": 3.7022197246551514, + "learning_rate": 5.576407005253941e-05, + "loss": 1.0351, + "num_input_tokens_seen": 101709288, + "step": 6321 + }, + { + "epoch": 0.4428450095002933, + "grad_norm": 5.004938125610352, + "learning_rate": 5.57570718038529e-05, + "loss": 1.1042, + "num_input_tokens_seen": 101725176, + "step": 6322 + }, + { + "epoch": 0.4429150577460226, + "grad_norm": 3.728410005569458, + "learning_rate": 5.575007355516637e-05, + "loss": 0.9879, + "num_input_tokens_seen": 101741160, + "step": 6323 + }, + { + "epoch": 0.44298510599175184, + "grad_norm": 4.526604175567627, + "learning_rate": 5.574307530647986e-05, + "loss": 1.1465, + "num_input_tokens_seen": 101756848, + "step": 6324 + }, + { + "epoch": 0.44305515423748104, + "grad_norm": 3.4281585216522217, + "learning_rate": 5.5736077057793356e-05, + "loss": 1.0865, + "num_input_tokens_seen": 101773232, + "step": 6325 + }, + { + "epoch": 0.4431252024832103, + "grad_norm": 5.678319931030273, + "learning_rate": 5.5729078809106825e-05, + "loss": 1.0443, + "num_input_tokens_seen": 101789064, + "step": 6326 + }, + { + "epoch": 0.44319525072893956, + "grad_norm": 4.231290817260742, + "learning_rate": 5.572208056042032e-05, + "loss": 1.0336, + "num_input_tokens_seen": 101805360, + "step": 6327 + }, + { + "epoch": 0.4432652989746688, + "grad_norm": 3.9336435794830322, + "learning_rate": 5.5715082311733815e-05, + "loss": 1.0592, + "num_input_tokens_seen": 101821744, + "step": 6328 + }, + { + "epoch": 0.443335347220398, + "grad_norm": 3.6775193214416504, + "learning_rate": 5.57080840630473e-05, + "loss": 1.0551, + "num_input_tokens_seen": 101838128, + "step": 6329 + }, + { + "epoch": 0.4434053954661273, + "grad_norm": 3.7788445949554443, + "learning_rate": 5.5701085814360766e-05, + "loss": 0.9174, + "num_input_tokens_seen": 101854216, + "step": 6330 + }, + { + "epoch": 0.44347544371185654, + "grad_norm": 5.49542236328125, + "learning_rate": 5.569408756567426e-05, + "loss": 1.0043, + "num_input_tokens_seen": 101870600, + "step": 6331 + }, + { + "epoch": 0.4435454919575858, + "grad_norm": 4.486842155456543, + "learning_rate": 5.568708931698775e-05, + "loss": 1.2571, + "num_input_tokens_seen": 101886984, + "step": 6332 + }, + { + "epoch": 0.443615540203315, + "grad_norm": 4.949841499328613, + "learning_rate": 5.5680091068301225e-05, + "loss": 0.9723, + "num_input_tokens_seen": 101903368, + "step": 6333 + }, + { + "epoch": 0.44368558844904427, + "grad_norm": 3.6375255584716797, + "learning_rate": 5.567309281961471e-05, + "loss": 1.0938, + "num_input_tokens_seen": 101919568, + "step": 6334 + }, + { + "epoch": 0.4437556366947735, + "grad_norm": 4.649466037750244, + "learning_rate": 5.566609457092821e-05, + "loss": 1.0182, + "num_input_tokens_seen": 101935952, + "step": 6335 + }, + { + "epoch": 0.4438256849405028, + "grad_norm": 3.971482276916504, + "learning_rate": 5.565909632224169e-05, + "loss": 1.2222, + "num_input_tokens_seen": 101952336, + "step": 6336 + }, + { + "epoch": 0.44389573318623204, + "grad_norm": 4.605628967285156, + "learning_rate": 5.565209807355517e-05, + "loss": 1.1634, + "num_input_tokens_seen": 101967752, + "step": 6337 + }, + { + "epoch": 0.44396578143196125, + "grad_norm": 6.878963947296143, + "learning_rate": 5.564509982486866e-05, + "loss": 0.831, + "num_input_tokens_seen": 101982648, + "step": 6338 + }, + { + "epoch": 0.4440358296776905, + "grad_norm": 4.339694976806641, + "learning_rate": 5.563810157618214e-05, + "loss": 1.206, + "num_input_tokens_seen": 101998912, + "step": 6339 + }, + { + "epoch": 0.44410587792341977, + "grad_norm": 3.5509302616119385, + "learning_rate": 5.563110332749562e-05, + "loss": 0.969, + "num_input_tokens_seen": 102015296, + "step": 6340 + }, + { + "epoch": 0.444175926169149, + "grad_norm": 6.927268981933594, + "learning_rate": 5.5624105078809106e-05, + "loss": 1.0158, + "num_input_tokens_seen": 102031136, + "step": 6341 + }, + { + "epoch": 0.44424597441487823, + "grad_norm": 4.596194267272949, + "learning_rate": 5.56171068301226e-05, + "loss": 1.0602, + "num_input_tokens_seen": 102047016, + "step": 6342 + }, + { + "epoch": 0.4443160226606075, + "grad_norm": 3.8641550540924072, + "learning_rate": 5.561010858143607e-05, + "loss": 1.007, + "num_input_tokens_seen": 102062344, + "step": 6343 + }, + { + "epoch": 0.44438607090633675, + "grad_norm": 5.471240997314453, + "learning_rate": 5.5603110332749565e-05, + "loss": 0.9372, + "num_input_tokens_seen": 102078392, + "step": 6344 + }, + { + "epoch": 0.444456119152066, + "grad_norm": 3.971010208129883, + "learning_rate": 5.5596112084063054e-05, + "loss": 1.1335, + "num_input_tokens_seen": 102092872, + "step": 6345 + }, + { + "epoch": 0.4445261673977952, + "grad_norm": 4.037472724914551, + "learning_rate": 5.5589113835376536e-05, + "loss": 0.959, + "num_input_tokens_seen": 102109256, + "step": 6346 + }, + { + "epoch": 0.4445962156435245, + "grad_norm": 3.834984302520752, + "learning_rate": 5.558211558669002e-05, + "loss": 1.129, + "num_input_tokens_seen": 102125424, + "step": 6347 + }, + { + "epoch": 0.44466626388925373, + "grad_norm": 5.131717205047607, + "learning_rate": 5.55751173380035e-05, + "loss": 1.0966, + "num_input_tokens_seen": 102141808, + "step": 6348 + }, + { + "epoch": 0.444736312134983, + "grad_norm": 4.5308837890625, + "learning_rate": 5.5568119089316995e-05, + "loss": 1.1752, + "num_input_tokens_seen": 102158192, + "step": 6349 + }, + { + "epoch": 0.4448063603807122, + "grad_norm": 5.088570594787598, + "learning_rate": 5.556112084063046e-05, + "loss": 1.0215, + "num_input_tokens_seen": 102174576, + "step": 6350 + }, + { + "epoch": 0.44487640862644146, + "grad_norm": 3.588543176651001, + "learning_rate": 5.555412259194397e-05, + "loss": 1.0452, + "num_input_tokens_seen": 102190928, + "step": 6351 + }, + { + "epoch": 0.4449464568721707, + "grad_norm": 3.5698747634887695, + "learning_rate": 5.5547124343257454e-05, + "loss": 1.0472, + "num_input_tokens_seen": 102207072, + "step": 6352 + }, + { + "epoch": 0.4450165051179, + "grad_norm": 3.4842190742492676, + "learning_rate": 5.5540126094570935e-05, + "loss": 0.9213, + "num_input_tokens_seen": 102222664, + "step": 6353 + }, + { + "epoch": 0.4450865533636292, + "grad_norm": 3.742471218109131, + "learning_rate": 5.553312784588442e-05, + "loss": 1.1803, + "num_input_tokens_seen": 102239048, + "step": 6354 + }, + { + "epoch": 0.44515660160935844, + "grad_norm": 4.108808994293213, + "learning_rate": 5.552612959719792e-05, + "loss": 1.2055, + "num_input_tokens_seen": 102255432, + "step": 6355 + }, + { + "epoch": 0.4452266498550877, + "grad_norm": 5.498636722564697, + "learning_rate": 5.551913134851139e-05, + "loss": 1.0159, + "num_input_tokens_seen": 102270760, + "step": 6356 + }, + { + "epoch": 0.44529669810081696, + "grad_norm": 3.639392137527466, + "learning_rate": 5.551213309982487e-05, + "loss": 0.9927, + "num_input_tokens_seen": 102285560, + "step": 6357 + }, + { + "epoch": 0.44536674634654616, + "grad_norm": 4.534916400909424, + "learning_rate": 5.5505134851138365e-05, + "loss": 1.0912, + "num_input_tokens_seen": 102301456, + "step": 6358 + }, + { + "epoch": 0.4454367945922754, + "grad_norm": 3.961845636367798, + "learning_rate": 5.5498136602451847e-05, + "loss": 0.8966, + "num_input_tokens_seen": 102317840, + "step": 6359 + }, + { + "epoch": 0.4455068428380047, + "grad_norm": 3.712111473083496, + "learning_rate": 5.549113835376533e-05, + "loss": 1.0297, + "num_input_tokens_seen": 102334104, + "step": 6360 + }, + { + "epoch": 0.44557689108373394, + "grad_norm": 4.441688537597656, + "learning_rate": 5.548414010507882e-05, + "loss": 1.0116, + "num_input_tokens_seen": 102350488, + "step": 6361 + }, + { + "epoch": 0.44564693932946314, + "grad_norm": 5.881339073181152, + "learning_rate": 5.54771418563923e-05, + "loss": 0.8988, + "num_input_tokens_seen": 102366872, + "step": 6362 + }, + { + "epoch": 0.4457169875751924, + "grad_norm": 3.9736666679382324, + "learning_rate": 5.547014360770578e-05, + "loss": 1.1402, + "num_input_tokens_seen": 102383256, + "step": 6363 + }, + { + "epoch": 0.44578703582092166, + "grad_norm": 4.064074516296387, + "learning_rate": 5.546314535901926e-05, + "loss": 1.0196, + "num_input_tokens_seen": 102399024, + "step": 6364 + }, + { + "epoch": 0.4458570840666509, + "grad_norm": 4.238128662109375, + "learning_rate": 5.5456147110332765e-05, + "loss": 0.8919, + "num_input_tokens_seen": 102415408, + "step": 6365 + }, + { + "epoch": 0.4459271323123801, + "grad_norm": 3.8058905601501465, + "learning_rate": 5.544914886164624e-05, + "loss": 1.0117, + "num_input_tokens_seen": 102430616, + "step": 6366 + }, + { + "epoch": 0.4459971805581094, + "grad_norm": 5.049830436706543, + "learning_rate": 5.544215061295972e-05, + "loss": 1.0986, + "num_input_tokens_seen": 102446448, + "step": 6367 + }, + { + "epoch": 0.44606722880383864, + "grad_norm": 4.0549116134643555, + "learning_rate": 5.543515236427321e-05, + "loss": 1.1462, + "num_input_tokens_seen": 102462832, + "step": 6368 + }, + { + "epoch": 0.4461372770495679, + "grad_norm": 4.005105495452881, + "learning_rate": 5.542815411558669e-05, + "loss": 0.8846, + "num_input_tokens_seen": 102478688, + "step": 6369 + }, + { + "epoch": 0.44620732529529716, + "grad_norm": 4.298024654388428, + "learning_rate": 5.5421155866900174e-05, + "loss": 1.0249, + "num_input_tokens_seen": 102495072, + "step": 6370 + }, + { + "epoch": 0.44627737354102637, + "grad_norm": 4.816470623016357, + "learning_rate": 5.541415761821367e-05, + "loss": 1.0492, + "num_input_tokens_seen": 102511456, + "step": 6371 + }, + { + "epoch": 0.4463474217867556, + "grad_norm": 3.89819598197937, + "learning_rate": 5.5407159369527164e-05, + "loss": 0.9172, + "num_input_tokens_seen": 102527840, + "step": 6372 + }, + { + "epoch": 0.4464174700324849, + "grad_norm": 3.883650541305542, + "learning_rate": 5.540016112084063e-05, + "loss": 0.9475, + "num_input_tokens_seen": 102543008, + "step": 6373 + }, + { + "epoch": 0.44648751827821415, + "grad_norm": 3.8635551929473877, + "learning_rate": 5.5393162872154114e-05, + "loss": 1.2026, + "num_input_tokens_seen": 102558600, + "step": 6374 + }, + { + "epoch": 0.44655756652394335, + "grad_norm": 4.465150356292725, + "learning_rate": 5.538616462346761e-05, + "loss": 1.0031, + "num_input_tokens_seen": 102574984, + "step": 6375 + }, + { + "epoch": 0.4466276147696726, + "grad_norm": 3.8807246685028076, + "learning_rate": 5.537916637478109e-05, + "loss": 1.2355, + "num_input_tokens_seen": 102591368, + "step": 6376 + }, + { + "epoch": 0.44669766301540187, + "grad_norm": 4.526896953582764, + "learning_rate": 5.5372168126094573e-05, + "loss": 1.4136, + "num_input_tokens_seen": 102607520, + "step": 6377 + }, + { + "epoch": 0.44676771126113113, + "grad_norm": 4.905179023742676, + "learning_rate": 5.536516987740806e-05, + "loss": 1.1221, + "num_input_tokens_seen": 102622928, + "step": 6378 + }, + { + "epoch": 0.44683775950686033, + "grad_norm": 3.658268690109253, + "learning_rate": 5.535817162872156e-05, + "loss": 0.9257, + "num_input_tokens_seen": 102639312, + "step": 6379 + }, + { + "epoch": 0.4469078077525896, + "grad_norm": 4.125054359436035, + "learning_rate": 5.5351173380035026e-05, + "loss": 1.0569, + "num_input_tokens_seen": 102655104, + "step": 6380 + }, + { + "epoch": 0.44697785599831885, + "grad_norm": 3.6429343223571777, + "learning_rate": 5.534417513134852e-05, + "loss": 0.9446, + "num_input_tokens_seen": 102671488, + "step": 6381 + }, + { + "epoch": 0.4470479042440481, + "grad_norm": 4.249630928039551, + "learning_rate": 5.533717688266201e-05, + "loss": 1.248, + "num_input_tokens_seen": 102687872, + "step": 6382 + }, + { + "epoch": 0.4471179524897773, + "grad_norm": 4.6079277992248535, + "learning_rate": 5.5330178633975485e-05, + "loss": 1.1774, + "num_input_tokens_seen": 102704256, + "step": 6383 + }, + { + "epoch": 0.4471880007355066, + "grad_norm": 4.406512260437012, + "learning_rate": 5.5323180385288966e-05, + "loss": 1.0111, + "num_input_tokens_seen": 102720640, + "step": 6384 + }, + { + "epoch": 0.44725804898123583, + "grad_norm": 5.140946865081787, + "learning_rate": 5.5316182136602455e-05, + "loss": 1.0323, + "num_input_tokens_seen": 102735608, + "step": 6385 + }, + { + "epoch": 0.4473280972269651, + "grad_norm": 5.1392903327941895, + "learning_rate": 5.530918388791594e-05, + "loss": 1.1535, + "num_input_tokens_seen": 102751992, + "step": 6386 + }, + { + "epoch": 0.4473981454726943, + "grad_norm": 5.466961860656738, + "learning_rate": 5.530218563922942e-05, + "loss": 1.2679, + "num_input_tokens_seen": 102768376, + "step": 6387 + }, + { + "epoch": 0.44746819371842356, + "grad_norm": 3.7714486122131348, + "learning_rate": 5.5295187390542914e-05, + "loss": 0.9847, + "num_input_tokens_seen": 102784328, + "step": 6388 + }, + { + "epoch": 0.4475382419641528, + "grad_norm": 3.665609836578369, + "learning_rate": 5.52881891418564e-05, + "loss": 1.0716, + "num_input_tokens_seen": 102800032, + "step": 6389 + }, + { + "epoch": 0.4476082902098821, + "grad_norm": 6.100143909454346, + "learning_rate": 5.528119089316988e-05, + "loss": 0.8844, + "num_input_tokens_seen": 102816416, + "step": 6390 + }, + { + "epoch": 0.4476783384556113, + "grad_norm": 3.8393003940582275, + "learning_rate": 5.527419264448336e-05, + "loss": 1.1134, + "num_input_tokens_seen": 102832800, + "step": 6391 + }, + { + "epoch": 0.44774838670134054, + "grad_norm": 3.862710952758789, + "learning_rate": 5.526719439579686e-05, + "loss": 1.0571, + "num_input_tokens_seen": 102849128, + "step": 6392 + }, + { + "epoch": 0.4478184349470698, + "grad_norm": 4.032309055328369, + "learning_rate": 5.526019614711033e-05, + "loss": 1.0123, + "num_input_tokens_seen": 102865512, + "step": 6393 + }, + { + "epoch": 0.44788848319279906, + "grad_norm": 4.250918388366699, + "learning_rate": 5.525319789842381e-05, + "loss": 0.8542, + "num_input_tokens_seen": 102881896, + "step": 6394 + }, + { + "epoch": 0.44795853143852826, + "grad_norm": 3.8701565265655518, + "learning_rate": 5.524619964973731e-05, + "loss": 1.0307, + "num_input_tokens_seen": 102898192, + "step": 6395 + }, + { + "epoch": 0.4480285796842575, + "grad_norm": 4.2415571212768555, + "learning_rate": 5.52392014010508e-05, + "loss": 1.0586, + "num_input_tokens_seen": 102914192, + "step": 6396 + }, + { + "epoch": 0.4480986279299868, + "grad_norm": 3.937345027923584, + "learning_rate": 5.5232203152364284e-05, + "loss": 1.175, + "num_input_tokens_seen": 102929976, + "step": 6397 + }, + { + "epoch": 0.44816867617571604, + "grad_norm": 4.339337348937988, + "learning_rate": 5.5225204903677766e-05, + "loss": 1.2494, + "num_input_tokens_seen": 102946360, + "step": 6398 + }, + { + "epoch": 0.44823872442144524, + "grad_norm": 4.744752883911133, + "learning_rate": 5.5218206654991255e-05, + "loss": 1.1717, + "num_input_tokens_seen": 102962744, + "step": 6399 + }, + { + "epoch": 0.4483087726671745, + "grad_norm": 3.6590077877044678, + "learning_rate": 5.521120840630473e-05, + "loss": 1.014, + "num_input_tokens_seen": 102978456, + "step": 6400 + }, + { + "epoch": 0.4483087726671745, + "eval_loss": 1.1259907484054565, + "eval_runtime": 0.159, + "eval_samples_per_second": 6.289, + "eval_steps_per_second": 6.289, + "num_input_tokens_seen": 102978456, + "step": 6400 + }, + { + "epoch": 0.44837882091290376, + "grad_norm": 4.073358535766602, + "learning_rate": 5.5204210157618205e-05, + "loss": 1.1584, + "num_input_tokens_seen": 102994280, + "step": 6401 + }, + { + "epoch": 0.448448869158633, + "grad_norm": 6.3949480056762695, + "learning_rate": 5.51972119089317e-05, + "loss": 1.0338, + "num_input_tokens_seen": 103010664, + "step": 6402 + }, + { + "epoch": 0.4485189174043622, + "grad_norm": 4.09867525100708, + "learning_rate": 5.5190213660245195e-05, + "loss": 1.175, + "num_input_tokens_seen": 103027048, + "step": 6403 + }, + { + "epoch": 0.4485889656500915, + "grad_norm": 3.672407865524292, + "learning_rate": 5.518321541155868e-05, + "loss": 1.1726, + "num_input_tokens_seen": 103043432, + "step": 6404 + }, + { + "epoch": 0.44865901389582075, + "grad_norm": 3.5733370780944824, + "learning_rate": 5.517621716287216e-05, + "loss": 0.9414, + "num_input_tokens_seen": 103059816, + "step": 6405 + }, + { + "epoch": 0.44872906214155, + "grad_norm": 5.21142578125, + "learning_rate": 5.516921891418565e-05, + "loss": 1.1563, + "num_input_tokens_seen": 103076200, + "step": 6406 + }, + { + "epoch": 0.44879911038727927, + "grad_norm": 3.4936230182647705, + "learning_rate": 5.516222066549913e-05, + "loss": 0.9876, + "num_input_tokens_seen": 103092040, + "step": 6407 + }, + { + "epoch": 0.44886915863300847, + "grad_norm": 4.558346271514893, + "learning_rate": 5.515522241681261e-05, + "loss": 0.9841, + "num_input_tokens_seen": 103108424, + "step": 6408 + }, + { + "epoch": 0.44893920687873773, + "grad_norm": 5.485194206237793, + "learning_rate": 5.514822416812611e-05, + "loss": 1.0012, + "num_input_tokens_seen": 103123544, + "step": 6409 + }, + { + "epoch": 0.449009255124467, + "grad_norm": 4.365593433380127, + "learning_rate": 5.5141225919439575e-05, + "loss": 0.9441, + "num_input_tokens_seen": 103139928, + "step": 6410 + }, + { + "epoch": 0.44907930337019625, + "grad_norm": 6.034286022186279, + "learning_rate": 5.513422767075306e-05, + "loss": 1.1408, + "num_input_tokens_seen": 103154960, + "step": 6411 + }, + { + "epoch": 0.44914935161592545, + "grad_norm": 3.88476300239563, + "learning_rate": 5.512722942206655e-05, + "loss": 0.8513, + "num_input_tokens_seen": 103169984, + "step": 6412 + }, + { + "epoch": 0.4492193998616547, + "grad_norm": 3.760528087615967, + "learning_rate": 5.512023117338004e-05, + "loss": 1.07, + "num_input_tokens_seen": 103186296, + "step": 6413 + }, + { + "epoch": 0.44928944810738397, + "grad_norm": 3.779690980911255, + "learning_rate": 5.511323292469352e-05, + "loss": 0.9531, + "num_input_tokens_seen": 103202680, + "step": 6414 + }, + { + "epoch": 0.44935949635311323, + "grad_norm": 3.6536929607391357, + "learning_rate": 5.5106234676007004e-05, + "loss": 1.0908, + "num_input_tokens_seen": 103218656, + "step": 6415 + }, + { + "epoch": 0.44942954459884243, + "grad_norm": 3.9258713722229004, + "learning_rate": 5.50992364273205e-05, + "loss": 1.0175, + "num_input_tokens_seen": 103234872, + "step": 6416 + }, + { + "epoch": 0.4494995928445717, + "grad_norm": 4.860123634338379, + "learning_rate": 5.509223817863397e-05, + "loss": 1.1094, + "num_input_tokens_seen": 103251000, + "step": 6417 + }, + { + "epoch": 0.44956964109030095, + "grad_norm": 4.924446105957031, + "learning_rate": 5.508523992994745e-05, + "loss": 1.1265, + "num_input_tokens_seen": 103267296, + "step": 6418 + }, + { + "epoch": 0.4496396893360302, + "grad_norm": 4.334608554840088, + "learning_rate": 5.507824168126096e-05, + "loss": 0.9163, + "num_input_tokens_seen": 103283440, + "step": 6419 + }, + { + "epoch": 0.4497097375817594, + "grad_norm": 4.686522483825684, + "learning_rate": 5.507124343257444e-05, + "loss": 1.0374, + "num_input_tokens_seen": 103299760, + "step": 6420 + }, + { + "epoch": 0.4497797858274887, + "grad_norm": 4.797657012939453, + "learning_rate": 5.506424518388792e-05, + "loss": 1.1277, + "num_input_tokens_seen": 103316144, + "step": 6421 + }, + { + "epoch": 0.44984983407321794, + "grad_norm": 3.443018674850464, + "learning_rate": 5.5057246935201404e-05, + "loss": 0.943, + "num_input_tokens_seen": 103331712, + "step": 6422 + }, + { + "epoch": 0.4499198823189472, + "grad_norm": 4.118574619293213, + "learning_rate": 5.505024868651489e-05, + "loss": 1.0539, + "num_input_tokens_seen": 103348096, + "step": 6423 + }, + { + "epoch": 0.4499899305646764, + "grad_norm": 4.0539937019348145, + "learning_rate": 5.5043250437828375e-05, + "loss": 0.8437, + "num_input_tokens_seen": 103364480, + "step": 6424 + }, + { + "epoch": 0.45005997881040566, + "grad_norm": 4.269721031188965, + "learning_rate": 5.503625218914187e-05, + "loss": 1.0896, + "num_input_tokens_seen": 103380120, + "step": 6425 + }, + { + "epoch": 0.4501300270561349, + "grad_norm": 4.6834516525268555, + "learning_rate": 5.502925394045535e-05, + "loss": 1.1162, + "num_input_tokens_seen": 103396504, + "step": 6426 + }, + { + "epoch": 0.4502000753018642, + "grad_norm": 4.42267370223999, + "learning_rate": 5.5022255691768834e-05, + "loss": 1.0416, + "num_input_tokens_seen": 103412632, + "step": 6427 + }, + { + "epoch": 0.4502701235475934, + "grad_norm": 4.8119797706604, + "learning_rate": 5.5015257443082315e-05, + "loss": 1.2585, + "num_input_tokens_seen": 103428128, + "step": 6428 + }, + { + "epoch": 0.45034017179332264, + "grad_norm": 4.170595169067383, + "learning_rate": 5.50082591943958e-05, + "loss": 0.9985, + "num_input_tokens_seen": 103444088, + "step": 6429 + }, + { + "epoch": 0.4504102200390519, + "grad_norm": 3.7060906887054443, + "learning_rate": 5.5001260945709286e-05, + "loss": 1.0852, + "num_input_tokens_seen": 103460456, + "step": 6430 + }, + { + "epoch": 0.45048026828478116, + "grad_norm": 4.4231977462768555, + "learning_rate": 5.499426269702277e-05, + "loss": 1.043, + "num_input_tokens_seen": 103476840, + "step": 6431 + }, + { + "epoch": 0.45055031653051036, + "grad_norm": 4.086833477020264, + "learning_rate": 5.498726444833625e-05, + "loss": 1.2797, + "num_input_tokens_seen": 103492808, + "step": 6432 + }, + { + "epoch": 0.4506203647762396, + "grad_norm": 3.912932872772217, + "learning_rate": 5.4980266199649745e-05, + "loss": 0.9846, + "num_input_tokens_seen": 103508672, + "step": 6433 + }, + { + "epoch": 0.4506904130219689, + "grad_norm": 3.6088106632232666, + "learning_rate": 5.4973267950963227e-05, + "loss": 1.0097, + "num_input_tokens_seen": 103525056, + "step": 6434 + }, + { + "epoch": 0.45076046126769814, + "grad_norm": 4.725728511810303, + "learning_rate": 5.4966269702276715e-05, + "loss": 1.1345, + "num_input_tokens_seen": 103541440, + "step": 6435 + }, + { + "epoch": 0.45083050951342735, + "grad_norm": 6.745354175567627, + "learning_rate": 5.49592714535902e-05, + "loss": 0.9549, + "num_input_tokens_seen": 103556264, + "step": 6436 + }, + { + "epoch": 0.4509005577591566, + "grad_norm": 4.462937355041504, + "learning_rate": 5.495227320490368e-05, + "loss": 1.0289, + "num_input_tokens_seen": 103571576, + "step": 6437 + }, + { + "epoch": 0.45097060600488587, + "grad_norm": 4.77189826965332, + "learning_rate": 5.494527495621716e-05, + "loss": 1.2534, + "num_input_tokens_seen": 103587360, + "step": 6438 + }, + { + "epoch": 0.4510406542506151, + "grad_norm": 5.734838962554932, + "learning_rate": 5.493827670753064e-05, + "loss": 1.0698, + "num_input_tokens_seen": 103603744, + "step": 6439 + }, + { + "epoch": 0.4511107024963444, + "grad_norm": 4.221588134765625, + "learning_rate": 5.493127845884414e-05, + "loss": 1.1712, + "num_input_tokens_seen": 103620128, + "step": 6440 + }, + { + "epoch": 0.4511807507420736, + "grad_norm": 3.894184112548828, + "learning_rate": 5.492428021015762e-05, + "loss": 1.1204, + "num_input_tokens_seen": 103636512, + "step": 6441 + }, + { + "epoch": 0.45125079898780285, + "grad_norm": 4.208652496337891, + "learning_rate": 5.4917281961471115e-05, + "loss": 1.1809, + "num_input_tokens_seen": 103652824, + "step": 6442 + }, + { + "epoch": 0.4513208472335321, + "grad_norm": 3.4426159858703613, + "learning_rate": 5.49102837127846e-05, + "loss": 1.0433, + "num_input_tokens_seen": 103669000, + "step": 6443 + }, + { + "epoch": 0.45139089547926137, + "grad_norm": 3.628229856491089, + "learning_rate": 5.490328546409808e-05, + "loss": 0.918, + "num_input_tokens_seen": 103684792, + "step": 6444 + }, + { + "epoch": 0.45146094372499057, + "grad_norm": 6.008549690246582, + "learning_rate": 5.489628721541156e-05, + "loss": 1.2477, + "num_input_tokens_seen": 103701176, + "step": 6445 + }, + { + "epoch": 0.45153099197071983, + "grad_norm": 4.023336887359619, + "learning_rate": 5.488928896672506e-05, + "loss": 1.2522, + "num_input_tokens_seen": 103716488, + "step": 6446 + }, + { + "epoch": 0.4516010402164491, + "grad_norm": 3.6931705474853516, + "learning_rate": 5.488229071803853e-05, + "loss": 0.9123, + "num_input_tokens_seen": 103732560, + "step": 6447 + }, + { + "epoch": 0.45167108846217835, + "grad_norm": 3.7356324195861816, + "learning_rate": 5.487529246935201e-05, + "loss": 1.0913, + "num_input_tokens_seen": 103748512, + "step": 6448 + }, + { + "epoch": 0.45174113670790755, + "grad_norm": 5.176403045654297, + "learning_rate": 5.486829422066551e-05, + "loss": 1.0758, + "num_input_tokens_seen": 103764896, + "step": 6449 + }, + { + "epoch": 0.4518111849536368, + "grad_norm": 4.492616176605225, + "learning_rate": 5.486129597197899e-05, + "loss": 1.2558, + "num_input_tokens_seen": 103781280, + "step": 6450 + }, + { + "epoch": 0.4518812331993661, + "grad_norm": 4.058090686798096, + "learning_rate": 5.485429772329247e-05, + "loss": 1.1313, + "num_input_tokens_seen": 103797664, + "step": 6451 + }, + { + "epoch": 0.45195128144509533, + "grad_norm": 3.6828136444091797, + "learning_rate": 5.484729947460596e-05, + "loss": 1.0972, + "num_input_tokens_seen": 103813912, + "step": 6452 + }, + { + "epoch": 0.45202132969082454, + "grad_norm": 3.4555649757385254, + "learning_rate": 5.484030122591944e-05, + "loss": 1.0052, + "num_input_tokens_seen": 103830296, + "step": 6453 + }, + { + "epoch": 0.4520913779365538, + "grad_norm": 4.12479305267334, + "learning_rate": 5.4833302977232924e-05, + "loss": 0.919, + "num_input_tokens_seen": 103846272, + "step": 6454 + }, + { + "epoch": 0.45216142618228305, + "grad_norm": 4.1249237060546875, + "learning_rate": 5.4826304728546406e-05, + "loss": 0.9013, + "num_input_tokens_seen": 103862408, + "step": 6455 + }, + { + "epoch": 0.4522314744280123, + "grad_norm": 4.026651859283447, + "learning_rate": 5.481930647985989e-05, + "loss": 1.0143, + "num_input_tokens_seen": 103878592, + "step": 6456 + }, + { + "epoch": 0.4523015226737415, + "grad_norm": 4.157918453216553, + "learning_rate": 5.481230823117338e-05, + "loss": 1.0132, + "num_input_tokens_seen": 103894512, + "step": 6457 + }, + { + "epoch": 0.4523715709194708, + "grad_norm": 3.4476771354675293, + "learning_rate": 5.4805309982486865e-05, + "loss": 0.941, + "num_input_tokens_seen": 103910728, + "step": 6458 + }, + { + "epoch": 0.45244161916520004, + "grad_norm": 5.755035877227783, + "learning_rate": 5.479831173380035e-05, + "loss": 1.1735, + "num_input_tokens_seen": 103927112, + "step": 6459 + }, + { + "epoch": 0.4525116674109293, + "grad_norm": 3.774343967437744, + "learning_rate": 5.4791313485113835e-05, + "loss": 1.1775, + "num_input_tokens_seen": 103943496, + "step": 6460 + }, + { + "epoch": 0.4525817156566585, + "grad_norm": 3.8584420680999756, + "learning_rate": 5.478431523642732e-05, + "loss": 1.0433, + "num_input_tokens_seen": 103959880, + "step": 6461 + }, + { + "epoch": 0.45265176390238776, + "grad_norm": 3.545832872390747, + "learning_rate": 5.477731698774081e-05, + "loss": 1.0117, + "num_input_tokens_seen": 103976264, + "step": 6462 + }, + { + "epoch": 0.452721812148117, + "grad_norm": 4.018779277801514, + "learning_rate": 5.477031873905431e-05, + "loss": 1.0711, + "num_input_tokens_seen": 103991720, + "step": 6463 + }, + { + "epoch": 0.4527918603938463, + "grad_norm": 3.966514825820923, + "learning_rate": 5.476332049036779e-05, + "loss": 1.1632, + "num_input_tokens_seen": 104007488, + "step": 6464 + }, + { + "epoch": 0.4528619086395755, + "grad_norm": 3.8280792236328125, + "learning_rate": 5.475632224168126e-05, + "loss": 0.9702, + "num_input_tokens_seen": 104023096, + "step": 6465 + }, + { + "epoch": 0.45293195688530474, + "grad_norm": 6.540561676025391, + "learning_rate": 5.474932399299475e-05, + "loss": 1.2517, + "num_input_tokens_seen": 104038808, + "step": 6466 + }, + { + "epoch": 0.453002005131034, + "grad_norm": 4.703604221343994, + "learning_rate": 5.4742325744308235e-05, + "loss": 1.1562, + "num_input_tokens_seen": 104053944, + "step": 6467 + }, + { + "epoch": 0.45307205337676326, + "grad_norm": 3.950582504272461, + "learning_rate": 5.473532749562171e-05, + "loss": 0.9822, + "num_input_tokens_seen": 104070304, + "step": 6468 + }, + { + "epoch": 0.45314210162249247, + "grad_norm": 5.277374744415283, + "learning_rate": 5.4728329246935205e-05, + "loss": 1.1024, + "num_input_tokens_seen": 104086088, + "step": 6469 + }, + { + "epoch": 0.4532121498682217, + "grad_norm": 4.449152946472168, + "learning_rate": 5.47213309982487e-05, + "loss": 1.2031, + "num_input_tokens_seen": 104102472, + "step": 6470 + }, + { + "epoch": 0.453282198113951, + "grad_norm": 3.780017852783203, + "learning_rate": 5.471433274956218e-05, + "loss": 1.0398, + "num_input_tokens_seen": 104117552, + "step": 6471 + }, + { + "epoch": 0.45335224635968024, + "grad_norm": 3.502319574356079, + "learning_rate": 5.470733450087565e-05, + "loss": 1.0531, + "num_input_tokens_seen": 104133936, + "step": 6472 + }, + { + "epoch": 0.45342229460540945, + "grad_norm": 4.8112311363220215, + "learning_rate": 5.470033625218915e-05, + "loss": 1.3107, + "num_input_tokens_seen": 104150320, + "step": 6473 + }, + { + "epoch": 0.4534923428511387, + "grad_norm": 3.640571355819702, + "learning_rate": 5.4693338003502635e-05, + "loss": 0.8591, + "num_input_tokens_seen": 104165544, + "step": 6474 + }, + { + "epoch": 0.45356239109686797, + "grad_norm": 3.796278953552246, + "learning_rate": 5.468633975481611e-05, + "loss": 0.9476, + "num_input_tokens_seen": 104181928, + "step": 6475 + }, + { + "epoch": 0.4536324393425972, + "grad_norm": 4.749582767486572, + "learning_rate": 5.46793415061296e-05, + "loss": 1.0821, + "num_input_tokens_seen": 104197168, + "step": 6476 + }, + { + "epoch": 0.4537024875883265, + "grad_norm": 4.0797271728515625, + "learning_rate": 5.467234325744308e-05, + "loss": 0.9838, + "num_input_tokens_seen": 104213000, + "step": 6477 + }, + { + "epoch": 0.4537725358340557, + "grad_norm": 4.250664710998535, + "learning_rate": 5.466534500875656e-05, + "loss": 1.1786, + "num_input_tokens_seen": 104229136, + "step": 6478 + }, + { + "epoch": 0.45384258407978495, + "grad_norm": 3.8380579948425293, + "learning_rate": 5.465834676007006e-05, + "loss": 1.0362, + "num_input_tokens_seen": 104245520, + "step": 6479 + }, + { + "epoch": 0.4539126323255142, + "grad_norm": 3.7329459190368652, + "learning_rate": 5.4651348511383546e-05, + "loss": 1.0818, + "num_input_tokens_seen": 104261904, + "step": 6480 + }, + { + "epoch": 0.45398268057124347, + "grad_norm": 4.495264530181885, + "learning_rate": 5.464435026269703e-05, + "loss": 1.1216, + "num_input_tokens_seen": 104278288, + "step": 6481 + }, + { + "epoch": 0.4540527288169727, + "grad_norm": 3.7195420265197754, + "learning_rate": 5.46373520140105e-05, + "loss": 1.1697, + "num_input_tokens_seen": 104294192, + "step": 6482 + }, + { + "epoch": 0.45412277706270193, + "grad_norm": 5.255592346191406, + "learning_rate": 5.463035376532399e-05, + "loss": 1.2848, + "num_input_tokens_seen": 104308704, + "step": 6483 + }, + { + "epoch": 0.4541928253084312, + "grad_norm": 4.61810302734375, + "learning_rate": 5.462335551663749e-05, + "loss": 1.0923, + "num_input_tokens_seen": 104325088, + "step": 6484 + }, + { + "epoch": 0.45426287355416045, + "grad_norm": 7.175589561462402, + "learning_rate": 5.4616357267950955e-05, + "loss": 1.1434, + "num_input_tokens_seen": 104341224, + "step": 6485 + }, + { + "epoch": 0.45433292179988966, + "grad_norm": 3.756762742996216, + "learning_rate": 5.460935901926445e-05, + "loss": 0.9934, + "num_input_tokens_seen": 104356392, + "step": 6486 + }, + { + "epoch": 0.4544029700456189, + "grad_norm": 3.979435920715332, + "learning_rate": 5.4602360770577946e-05, + "loss": 0.777, + "num_input_tokens_seen": 104372776, + "step": 6487 + }, + { + "epoch": 0.4544730182913482, + "grad_norm": 3.761296272277832, + "learning_rate": 5.459536252189143e-05, + "loss": 1.113, + "num_input_tokens_seen": 104389160, + "step": 6488 + }, + { + "epoch": 0.45454306653707743, + "grad_norm": 5.02775239944458, + "learning_rate": 5.458836427320491e-05, + "loss": 0.9833, + "num_input_tokens_seen": 104404768, + "step": 6489 + }, + { + "epoch": 0.45461311478280664, + "grad_norm": 4.161303997039795, + "learning_rate": 5.45813660245184e-05, + "loss": 0.9746, + "num_input_tokens_seen": 104420152, + "step": 6490 + }, + { + "epoch": 0.4546831630285359, + "grad_norm": 3.7053780555725098, + "learning_rate": 5.457436777583188e-05, + "loss": 0.7889, + "num_input_tokens_seen": 104435512, + "step": 6491 + }, + { + "epoch": 0.45475321127426516, + "grad_norm": 4.103651523590088, + "learning_rate": 5.456736952714535e-05, + "loss": 1.0379, + "num_input_tokens_seen": 104451896, + "step": 6492 + }, + { + "epoch": 0.4548232595199944, + "grad_norm": 4.212504863739014, + "learning_rate": 5.456037127845884e-05, + "loss": 1.0417, + "num_input_tokens_seen": 104468280, + "step": 6493 + }, + { + "epoch": 0.4548933077657236, + "grad_norm": 6.549145221710205, + "learning_rate": 5.455337302977234e-05, + "loss": 1.0627, + "num_input_tokens_seen": 104484520, + "step": 6494 + }, + { + "epoch": 0.4549633560114529, + "grad_norm": 3.777740478515625, + "learning_rate": 5.454637478108582e-05, + "loss": 1.2379, + "num_input_tokens_seen": 104500536, + "step": 6495 + }, + { + "epoch": 0.45503340425718214, + "grad_norm": 3.827119827270508, + "learning_rate": 5.45393765323993e-05, + "loss": 0.9852, + "num_input_tokens_seen": 104516712, + "step": 6496 + }, + { + "epoch": 0.4551034525029114, + "grad_norm": 4.231398105621338, + "learning_rate": 5.453237828371279e-05, + "loss": 1.0009, + "num_input_tokens_seen": 104532792, + "step": 6497 + }, + { + "epoch": 0.4551735007486406, + "grad_norm": 5.237041473388672, + "learning_rate": 5.452538003502627e-05, + "loss": 1.0285, + "num_input_tokens_seen": 104549136, + "step": 6498 + }, + { + "epoch": 0.45524354899436986, + "grad_norm": 4.457448482513428, + "learning_rate": 5.451838178633975e-05, + "loss": 1.2198, + "num_input_tokens_seen": 104565320, + "step": 6499 + }, + { + "epoch": 0.4553135972400991, + "grad_norm": 3.7427215576171875, + "learning_rate": 5.451138353765325e-05, + "loss": 1.1668, + "num_input_tokens_seen": 104580744, + "step": 6500 + }, + { + "epoch": 0.4553836454858284, + "grad_norm": 4.094877243041992, + "learning_rate": 5.450438528896673e-05, + "loss": 1.1735, + "num_input_tokens_seen": 104596576, + "step": 6501 + }, + { + "epoch": 0.4554536937315576, + "grad_norm": 4.290172576904297, + "learning_rate": 5.4497387040280214e-05, + "loss": 1.2692, + "num_input_tokens_seen": 104612960, + "step": 6502 + }, + { + "epoch": 0.45552374197728684, + "grad_norm": 3.5206210613250732, + "learning_rate": 5.4490388791593695e-05, + "loss": 0.9754, + "num_input_tokens_seen": 104627664, + "step": 6503 + }, + { + "epoch": 0.4555937902230161, + "grad_norm": 3.7847232818603516, + "learning_rate": 5.4483390542907184e-05, + "loss": 1.1653, + "num_input_tokens_seen": 104644048, + "step": 6504 + }, + { + "epoch": 0.45566383846874536, + "grad_norm": 4.193985939025879, + "learning_rate": 5.4476392294220666e-05, + "loss": 1.1462, + "num_input_tokens_seen": 104659664, + "step": 6505 + }, + { + "epoch": 0.45573388671447457, + "grad_norm": 3.819429874420166, + "learning_rate": 5.446939404553415e-05, + "loss": 1.0596, + "num_input_tokens_seen": 104676048, + "step": 6506 + }, + { + "epoch": 0.4558039349602038, + "grad_norm": 3.7949306964874268, + "learning_rate": 5.446239579684764e-05, + "loss": 1.016, + "num_input_tokens_seen": 104692432, + "step": 6507 + }, + { + "epoch": 0.4558739832059331, + "grad_norm": 3.880740165710449, + "learning_rate": 5.4455397548161125e-05, + "loss": 1.2764, + "num_input_tokens_seen": 104708616, + "step": 6508 + }, + { + "epoch": 0.45594403145166235, + "grad_norm": 5.389898777008057, + "learning_rate": 5.444839929947459e-05, + "loss": 1.1917, + "num_input_tokens_seen": 104724984, + "step": 6509 + }, + { + "epoch": 0.4560140796973916, + "grad_norm": 5.053036689758301, + "learning_rate": 5.44414010507881e-05, + "loss": 1.1091, + "num_input_tokens_seen": 104741368, + "step": 6510 + }, + { + "epoch": 0.4560841279431208, + "grad_norm": 4.7108330726623535, + "learning_rate": 5.4434402802101584e-05, + "loss": 1.1051, + "num_input_tokens_seen": 104757752, + "step": 6511 + }, + { + "epoch": 0.45615417618885007, + "grad_norm": 3.8108251094818115, + "learning_rate": 5.4427404553415066e-05, + "loss": 0.9524, + "num_input_tokens_seen": 104774136, + "step": 6512 + }, + { + "epoch": 0.45622422443457933, + "grad_norm": 3.8631815910339355, + "learning_rate": 5.442040630472854e-05, + "loss": 1.1041, + "num_input_tokens_seen": 104790288, + "step": 6513 + }, + { + "epoch": 0.4562942726803086, + "grad_norm": 3.745565176010132, + "learning_rate": 5.441340805604205e-05, + "loss": 1.086, + "num_input_tokens_seen": 104806672, + "step": 6514 + }, + { + "epoch": 0.4563643209260378, + "grad_norm": 3.6682205200195312, + "learning_rate": 5.440640980735552e-05, + "loss": 1.0626, + "num_input_tokens_seen": 104822824, + "step": 6515 + }, + { + "epoch": 0.45643436917176705, + "grad_norm": 4.422383785247803, + "learning_rate": 5.439941155866901e-05, + "loss": 1.059, + "num_input_tokens_seen": 104839208, + "step": 6516 + }, + { + "epoch": 0.4565044174174963, + "grad_norm": 5.5291242599487305, + "learning_rate": 5.4392413309982495e-05, + "loss": 0.9887, + "num_input_tokens_seen": 104855296, + "step": 6517 + }, + { + "epoch": 0.45657446566322557, + "grad_norm": 7.490913391113281, + "learning_rate": 5.438541506129598e-05, + "loss": 1.2157, + "num_input_tokens_seen": 104871440, + "step": 6518 + }, + { + "epoch": 0.4566445139089548, + "grad_norm": 5.1885528564453125, + "learning_rate": 5.437841681260946e-05, + "loss": 1.1799, + "num_input_tokens_seen": 104887824, + "step": 6519 + }, + { + "epoch": 0.45671456215468403, + "grad_norm": 4.4618096351623535, + "learning_rate": 5.437141856392295e-05, + "loss": 1.34, + "num_input_tokens_seen": 104904208, + "step": 6520 + }, + { + "epoch": 0.4567846104004133, + "grad_norm": 3.8809101581573486, + "learning_rate": 5.436442031523643e-05, + "loss": 1.0538, + "num_input_tokens_seen": 104920592, + "step": 6521 + }, + { + "epoch": 0.45685465864614255, + "grad_norm": 3.429588794708252, + "learning_rate": 5.435742206654991e-05, + "loss": 0.975, + "num_input_tokens_seen": 104936976, + "step": 6522 + }, + { + "epoch": 0.45692470689187176, + "grad_norm": 3.714005947113037, + "learning_rate": 5.435042381786339e-05, + "loss": 0.9339, + "num_input_tokens_seen": 104953136, + "step": 6523 + }, + { + "epoch": 0.456994755137601, + "grad_norm": 4.082497596740723, + "learning_rate": 5.4343425569176895e-05, + "loss": 1.1525, + "num_input_tokens_seen": 104969520, + "step": 6524 + }, + { + "epoch": 0.4570648033833303, + "grad_norm": 5.983520030975342, + "learning_rate": 5.433642732049037e-05, + "loss": 1.1925, + "num_input_tokens_seen": 104984096, + "step": 6525 + }, + { + "epoch": 0.45713485162905954, + "grad_norm": 4.282527446746826, + "learning_rate": 5.432942907180385e-05, + "loss": 1.0791, + "num_input_tokens_seen": 105000120, + "step": 6526 + }, + { + "epoch": 0.45720489987478874, + "grad_norm": 4.0138726234436035, + "learning_rate": 5.432243082311734e-05, + "loss": 1.212, + "num_input_tokens_seen": 105016504, + "step": 6527 + }, + { + "epoch": 0.457274948120518, + "grad_norm": 7.047135829925537, + "learning_rate": 5.431543257443082e-05, + "loss": 1.1071, + "num_input_tokens_seen": 105031360, + "step": 6528 + }, + { + "epoch": 0.45734499636624726, + "grad_norm": 4.966803550720215, + "learning_rate": 5.4308434325744304e-05, + "loss": 1.084, + "num_input_tokens_seen": 105047744, + "step": 6529 + }, + { + "epoch": 0.4574150446119765, + "grad_norm": 3.5288639068603516, + "learning_rate": 5.43014360770578e-05, + "loss": 1.0007, + "num_input_tokens_seen": 105064128, + "step": 6530 + }, + { + "epoch": 0.4574850928577057, + "grad_norm": 5.45017147064209, + "learning_rate": 5.4294437828371295e-05, + "loss": 1.0761, + "num_input_tokens_seen": 105080512, + "step": 6531 + }, + { + "epoch": 0.457555141103435, + "grad_norm": 4.392576694488525, + "learning_rate": 5.428743957968476e-05, + "loss": 1.1192, + "num_input_tokens_seen": 105096896, + "step": 6532 + }, + { + "epoch": 0.45762518934916424, + "grad_norm": 3.4714255332946777, + "learning_rate": 5.428044133099826e-05, + "loss": 0.94, + "num_input_tokens_seen": 105113280, + "step": 6533 + }, + { + "epoch": 0.4576952375948935, + "grad_norm": 4.999673366546631, + "learning_rate": 5.427344308231175e-05, + "loss": 0.8589, + "num_input_tokens_seen": 105128640, + "step": 6534 + }, + { + "epoch": 0.4577652858406227, + "grad_norm": 3.9281249046325684, + "learning_rate": 5.426644483362522e-05, + "loss": 1.2802, + "num_input_tokens_seen": 105144176, + "step": 6535 + }, + { + "epoch": 0.45783533408635196, + "grad_norm": 4.223507881164551, + "learning_rate": 5.42594465849387e-05, + "loss": 0.9767, + "num_input_tokens_seen": 105160560, + "step": 6536 + }, + { + "epoch": 0.4579053823320812, + "grad_norm": 3.7774858474731445, + "learning_rate": 5.425244833625219e-05, + "loss": 0.8401, + "num_input_tokens_seen": 105176696, + "step": 6537 + }, + { + "epoch": 0.4579754305778105, + "grad_norm": 3.666398048400879, + "learning_rate": 5.424545008756569e-05, + "loss": 1.0199, + "num_input_tokens_seen": 105192376, + "step": 6538 + }, + { + "epoch": 0.4580454788235397, + "grad_norm": 4.442626476287842, + "learning_rate": 5.4238451838879156e-05, + "loss": 1.2166, + "num_input_tokens_seen": 105208192, + "step": 6539 + }, + { + "epoch": 0.45811552706926895, + "grad_norm": 3.8626255989074707, + "learning_rate": 5.423145359019265e-05, + "loss": 1.2679, + "num_input_tokens_seen": 105224576, + "step": 6540 + }, + { + "epoch": 0.4581855753149982, + "grad_norm": 3.713498830795288, + "learning_rate": 5.422445534150614e-05, + "loss": 1.0785, + "num_input_tokens_seen": 105240960, + "step": 6541 + }, + { + "epoch": 0.45825562356072747, + "grad_norm": 5.06941032409668, + "learning_rate": 5.4217457092819615e-05, + "loss": 1.1006, + "num_input_tokens_seen": 105257344, + "step": 6542 + }, + { + "epoch": 0.4583256718064567, + "grad_norm": 3.6487746238708496, + "learning_rate": 5.4210458844133103e-05, + "loss": 0.9838, + "num_input_tokens_seen": 105273336, + "step": 6543 + }, + { + "epoch": 0.45839572005218593, + "grad_norm": 3.70211124420166, + "learning_rate": 5.42034605954466e-05, + "loss": 1.0334, + "num_input_tokens_seen": 105289720, + "step": 6544 + }, + { + "epoch": 0.4584657682979152, + "grad_norm": 5.169928073883057, + "learning_rate": 5.419646234676007e-05, + "loss": 0.9696, + "num_input_tokens_seen": 105306104, + "step": 6545 + }, + { + "epoch": 0.45853581654364445, + "grad_norm": 4.101007461547852, + "learning_rate": 5.418946409807355e-05, + "loss": 1.1545, + "num_input_tokens_seen": 105322488, + "step": 6546 + }, + { + "epoch": 0.4586058647893737, + "grad_norm": 4.077839374542236, + "learning_rate": 5.4182465849387044e-05, + "loss": 0.9885, + "num_input_tokens_seen": 105338872, + "step": 6547 + }, + { + "epoch": 0.4586759130351029, + "grad_norm": 4.46600341796875, + "learning_rate": 5.417546760070053e-05, + "loss": 1.0451, + "num_input_tokens_seen": 105355256, + "step": 6548 + }, + { + "epoch": 0.45874596128083217, + "grad_norm": 3.765453577041626, + "learning_rate": 5.416846935201401e-05, + "loss": 0.9365, + "num_input_tokens_seen": 105370928, + "step": 6549 + }, + { + "epoch": 0.45881600952656143, + "grad_norm": 3.913649559020996, + "learning_rate": 5.4161471103327496e-05, + "loss": 1.0764, + "num_input_tokens_seen": 105387312, + "step": 6550 + }, + { + "epoch": 0.4588860577722907, + "grad_norm": 5.323554992675781, + "learning_rate": 5.415447285464099e-05, + "loss": 1.0202, + "num_input_tokens_seen": 105403408, + "step": 6551 + }, + { + "epoch": 0.4589561060180199, + "grad_norm": 3.8482306003570557, + "learning_rate": 5.414747460595446e-05, + "loss": 1.011, + "num_input_tokens_seen": 105419792, + "step": 6552 + }, + { + "epoch": 0.45902615426374915, + "grad_norm": 4.369050025939941, + "learning_rate": 5.414047635726794e-05, + "loss": 0.9605, + "num_input_tokens_seen": 105435240, + "step": 6553 + }, + { + "epoch": 0.4590962025094784, + "grad_norm": 3.4255287647247314, + "learning_rate": 5.413347810858145e-05, + "loss": 0.9871, + "num_input_tokens_seen": 105451568, + "step": 6554 + }, + { + "epoch": 0.45916625075520767, + "grad_norm": 4.246303081512451, + "learning_rate": 5.412647985989493e-05, + "loss": 1.0404, + "num_input_tokens_seen": 105467952, + "step": 6555 + }, + { + "epoch": 0.4592362990009369, + "grad_norm": 3.785661220550537, + "learning_rate": 5.4119481611208414e-05, + "loss": 0.9251, + "num_input_tokens_seen": 105484336, + "step": 6556 + }, + { + "epoch": 0.45930634724666614, + "grad_norm": 3.661653757095337, + "learning_rate": 5.4112483362521896e-05, + "loss": 0.9834, + "num_input_tokens_seen": 105500720, + "step": 6557 + }, + { + "epoch": 0.4593763954923954, + "grad_norm": 4.362829685211182, + "learning_rate": 5.4105485113835385e-05, + "loss": 1.084, + "num_input_tokens_seen": 105516488, + "step": 6558 + }, + { + "epoch": 0.45944644373812465, + "grad_norm": 3.867062568664551, + "learning_rate": 5.409848686514885e-05, + "loss": 1.048, + "num_input_tokens_seen": 105532736, + "step": 6559 + }, + { + "epoch": 0.45951649198385386, + "grad_norm": 3.8351078033447266, + "learning_rate": 5.409148861646236e-05, + "loss": 1.0162, + "num_input_tokens_seen": 105549120, + "step": 6560 + }, + { + "epoch": 0.4595865402295831, + "grad_norm": 4.525234699249268, + "learning_rate": 5.4084490367775844e-05, + "loss": 1.0798, + "num_input_tokens_seen": 105564776, + "step": 6561 + }, + { + "epoch": 0.4596565884753124, + "grad_norm": 3.8182532787323, + "learning_rate": 5.4077492119089326e-05, + "loss": 1.021, + "num_input_tokens_seen": 105581080, + "step": 6562 + }, + { + "epoch": 0.45972663672104164, + "grad_norm": 3.82145619392395, + "learning_rate": 5.407049387040281e-05, + "loss": 1.0482, + "num_input_tokens_seen": 105596496, + "step": 6563 + }, + { + "epoch": 0.45979668496677084, + "grad_norm": 4.378223419189453, + "learning_rate": 5.406349562171629e-05, + "loss": 1.0795, + "num_input_tokens_seen": 105612200, + "step": 6564 + }, + { + "epoch": 0.4598667332125001, + "grad_norm": 4.628854274749756, + "learning_rate": 5.405649737302978e-05, + "loss": 1.1581, + "num_input_tokens_seen": 105628584, + "step": 6565 + }, + { + "epoch": 0.45993678145822936, + "grad_norm": 5.091843128204346, + "learning_rate": 5.404949912434326e-05, + "loss": 1.203, + "num_input_tokens_seen": 105644968, + "step": 6566 + }, + { + "epoch": 0.4600068297039586, + "grad_norm": 4.4174580574035645, + "learning_rate": 5.404250087565674e-05, + "loss": 1.1291, + "num_input_tokens_seen": 105661352, + "step": 6567 + }, + { + "epoch": 0.4600768779496878, + "grad_norm": 4.136083602905273, + "learning_rate": 5.403550262697024e-05, + "loss": 1.183, + "num_input_tokens_seen": 105677200, + "step": 6568 + }, + { + "epoch": 0.4601469261954171, + "grad_norm": 4.388592720031738, + "learning_rate": 5.4028504378283705e-05, + "loss": 1.2882, + "num_input_tokens_seen": 105693584, + "step": 6569 + }, + { + "epoch": 0.46021697444114634, + "grad_norm": 5.642048358917236, + "learning_rate": 5.402150612959721e-05, + "loss": 1.1693, + "num_input_tokens_seen": 105709688, + "step": 6570 + }, + { + "epoch": 0.4602870226868756, + "grad_norm": 4.107602596282959, + "learning_rate": 5.401450788091069e-05, + "loss": 1.2681, + "num_input_tokens_seen": 105725608, + "step": 6571 + }, + { + "epoch": 0.4603570709326048, + "grad_norm": 6.477549076080322, + "learning_rate": 5.400750963222417e-05, + "loss": 1.0574, + "num_input_tokens_seen": 105741992, + "step": 6572 + }, + { + "epoch": 0.46042711917833407, + "grad_norm": 4.7183380126953125, + "learning_rate": 5.400051138353765e-05, + "loss": 1.087, + "num_input_tokens_seen": 105758376, + "step": 6573 + }, + { + "epoch": 0.4604971674240633, + "grad_norm": 4.523158073425293, + "learning_rate": 5.3993513134851135e-05, + "loss": 1.0595, + "num_input_tokens_seen": 105774760, + "step": 6574 + }, + { + "epoch": 0.4605672156697926, + "grad_norm": 3.631554126739502, + "learning_rate": 5.398651488616463e-05, + "loss": 1.0407, + "num_input_tokens_seen": 105791144, + "step": 6575 + }, + { + "epoch": 0.4606372639155218, + "grad_norm": 8.034467697143555, + "learning_rate": 5.397951663747811e-05, + "loss": 1.0565, + "num_input_tokens_seen": 105807528, + "step": 6576 + }, + { + "epoch": 0.46070731216125105, + "grad_norm": 3.780055522918701, + "learning_rate": 5.397251838879161e-05, + "loss": 1.0622, + "num_input_tokens_seen": 105823592, + "step": 6577 + }, + { + "epoch": 0.4607773604069803, + "grad_norm": 3.975475549697876, + "learning_rate": 5.396552014010509e-05, + "loss": 1.0651, + "num_input_tokens_seen": 105839976, + "step": 6578 + }, + { + "epoch": 0.46084740865270957, + "grad_norm": 3.4668362140655518, + "learning_rate": 5.395852189141857e-05, + "loss": 0.9884, + "num_input_tokens_seen": 105856360, + "step": 6579 + }, + { + "epoch": 0.4609174568984388, + "grad_norm": 3.7928245067596436, + "learning_rate": 5.395152364273205e-05, + "loss": 1.0635, + "num_input_tokens_seen": 105872744, + "step": 6580 + }, + { + "epoch": 0.46098750514416803, + "grad_norm": 3.8289833068847656, + "learning_rate": 5.3944525394045555e-05, + "loss": 0.8981, + "num_input_tokens_seen": 105888528, + "step": 6581 + }, + { + "epoch": 0.4610575533898973, + "grad_norm": 6.435444355010986, + "learning_rate": 5.393752714535902e-05, + "loss": 1.0163, + "num_input_tokens_seen": 105903592, + "step": 6582 + }, + { + "epoch": 0.46112760163562655, + "grad_norm": 4.274429798126221, + "learning_rate": 5.3930528896672505e-05, + "loss": 1.3321, + "num_input_tokens_seen": 105919904, + "step": 6583 + }, + { + "epoch": 0.4611976498813558, + "grad_norm": 3.619840145111084, + "learning_rate": 5.3923530647986e-05, + "loss": 0.8481, + "num_input_tokens_seen": 105936288, + "step": 6584 + }, + { + "epoch": 0.461267698127085, + "grad_norm": 3.643489122390747, + "learning_rate": 5.391653239929948e-05, + "loss": 1.2037, + "num_input_tokens_seen": 105952624, + "step": 6585 + }, + { + "epoch": 0.4613377463728143, + "grad_norm": 3.5494256019592285, + "learning_rate": 5.3909534150612964e-05, + "loss": 1.0352, + "num_input_tokens_seen": 105968568, + "step": 6586 + }, + { + "epoch": 0.46140779461854353, + "grad_norm": 5.754514694213867, + "learning_rate": 5.390253590192645e-05, + "loss": 0.8431, + "num_input_tokens_seen": 105984952, + "step": 6587 + }, + { + "epoch": 0.4614778428642728, + "grad_norm": 3.9911015033721924, + "learning_rate": 5.3895537653239934e-05, + "loss": 0.9876, + "num_input_tokens_seen": 106001336, + "step": 6588 + }, + { + "epoch": 0.461547891110002, + "grad_norm": 4.1558756828308105, + "learning_rate": 5.3888539404553416e-05, + "loss": 1.0537, + "num_input_tokens_seen": 106016736, + "step": 6589 + }, + { + "epoch": 0.46161793935573125, + "grad_norm": 4.300850868225098, + "learning_rate": 5.38815411558669e-05, + "loss": 1.0303, + "num_input_tokens_seen": 106033120, + "step": 6590 + }, + { + "epoch": 0.4616879876014605, + "grad_norm": 6.03284215927124, + "learning_rate": 5.387454290718038e-05, + "loss": 1.0919, + "num_input_tokens_seen": 106049504, + "step": 6591 + }, + { + "epoch": 0.4617580358471898, + "grad_norm": 4.091002941131592, + "learning_rate": 5.3867544658493875e-05, + "loss": 1.205, + "num_input_tokens_seen": 106065632, + "step": 6592 + }, + { + "epoch": 0.461828084092919, + "grad_norm": 3.7395520210266113, + "learning_rate": 5.386054640980736e-05, + "loss": 0.9516, + "num_input_tokens_seen": 106081632, + "step": 6593 + }, + { + "epoch": 0.46189813233864824, + "grad_norm": 4.021444797515869, + "learning_rate": 5.3853548161120845e-05, + "loss": 1.0859, + "num_input_tokens_seen": 106097376, + "step": 6594 + }, + { + "epoch": 0.4619681805843775, + "grad_norm": 5.202040672302246, + "learning_rate": 5.384654991243433e-05, + "loss": 0.935, + "num_input_tokens_seen": 106113096, + "step": 6595 + }, + { + "epoch": 0.46203822883010676, + "grad_norm": 8.020401000976562, + "learning_rate": 5.383955166374781e-05, + "loss": 1.0943, + "num_input_tokens_seen": 106129480, + "step": 6596 + }, + { + "epoch": 0.46210827707583596, + "grad_norm": 4.892960548400879, + "learning_rate": 5.3832553415061304e-05, + "loss": 1.0347, + "num_input_tokens_seen": 106145864, + "step": 6597 + }, + { + "epoch": 0.4621783253215652, + "grad_norm": 3.963135004043579, + "learning_rate": 5.38255551663748e-05, + "loss": 1.0275, + "num_input_tokens_seen": 106162248, + "step": 6598 + }, + { + "epoch": 0.4622483735672945, + "grad_norm": 5.362968444824219, + "learning_rate": 5.381855691768827e-05, + "loss": 0.9192, + "num_input_tokens_seen": 106177616, + "step": 6599 + }, + { + "epoch": 0.46231842181302374, + "grad_norm": 5.272266864776611, + "learning_rate": 5.381155866900175e-05, + "loss": 1.3188, + "num_input_tokens_seen": 106194000, + "step": 6600 + }, + { + "epoch": 0.46231842181302374, + "eval_loss": 1.1238571405410767, + "eval_runtime": 0.1703, + "eval_samples_per_second": 5.872, + "eval_steps_per_second": 5.872, + "num_input_tokens_seen": 106194000, + "step": 6600 + }, + { + "epoch": 0.46238847005875294, + "grad_norm": 6.129757881164551, + "learning_rate": 5.3804560420315245e-05, + "loss": 0.9134, + "num_input_tokens_seen": 106210384, + "step": 6601 + }, + { + "epoch": 0.4624585183044822, + "grad_norm": 4.237639904022217, + "learning_rate": 5.379756217162873e-05, + "loss": 1.013, + "num_input_tokens_seen": 106226240, + "step": 6602 + }, + { + "epoch": 0.46252856655021146, + "grad_norm": 3.4758036136627197, + "learning_rate": 5.37905639229422e-05, + "loss": 0.9542, + "num_input_tokens_seen": 106242624, + "step": 6603 + }, + { + "epoch": 0.4625986147959407, + "grad_norm": 4.031625270843506, + "learning_rate": 5.37835656742557e-05, + "loss": 0.9543, + "num_input_tokens_seen": 106259008, + "step": 6604 + }, + { + "epoch": 0.4626686630416699, + "grad_norm": 3.9605302810668945, + "learning_rate": 5.377656742556919e-05, + "loss": 1.1185, + "num_input_tokens_seen": 106274960, + "step": 6605 + }, + { + "epoch": 0.4627387112873992, + "grad_norm": 3.5777320861816406, + "learning_rate": 5.3769569176882675e-05, + "loss": 1.1453, + "num_input_tokens_seen": 106291344, + "step": 6606 + }, + { + "epoch": 0.46280875953312844, + "grad_norm": 3.553462505340576, + "learning_rate": 5.376257092819614e-05, + "loss": 0.9831, + "num_input_tokens_seen": 106306184, + "step": 6607 + }, + { + "epoch": 0.4628788077788577, + "grad_norm": 3.745340347290039, + "learning_rate": 5.3755572679509645e-05, + "loss": 0.8471, + "num_input_tokens_seen": 106320360, + "step": 6608 + }, + { + "epoch": 0.4629488560245869, + "grad_norm": 3.7483649253845215, + "learning_rate": 5.374857443082312e-05, + "loss": 1.0482, + "num_input_tokens_seen": 106336664, + "step": 6609 + }, + { + "epoch": 0.46301890427031617, + "grad_norm": 3.675184726715088, + "learning_rate": 5.37415761821366e-05, + "loss": 1.0679, + "num_input_tokens_seen": 106353048, + "step": 6610 + }, + { + "epoch": 0.4630889525160454, + "grad_norm": 4.733851432800293, + "learning_rate": 5.373457793345009e-05, + "loss": 1.0269, + "num_input_tokens_seen": 106369432, + "step": 6611 + }, + { + "epoch": 0.4631590007617747, + "grad_norm": 3.9618589878082275, + "learning_rate": 5.372757968476357e-05, + "loss": 1.0667, + "num_input_tokens_seen": 106385592, + "step": 6612 + }, + { + "epoch": 0.46322904900750395, + "grad_norm": 3.95268177986145, + "learning_rate": 5.3720581436077054e-05, + "loss": 1.1044, + "num_input_tokens_seen": 106401976, + "step": 6613 + }, + { + "epoch": 0.46329909725323315, + "grad_norm": 4.600008010864258, + "learning_rate": 5.371358318739055e-05, + "loss": 1.0004, + "num_input_tokens_seen": 106418360, + "step": 6614 + }, + { + "epoch": 0.4633691454989624, + "grad_norm": 3.6651558876037598, + "learning_rate": 5.370658493870404e-05, + "loss": 0.9977, + "num_input_tokens_seen": 106434488, + "step": 6615 + }, + { + "epoch": 0.46343919374469167, + "grad_norm": 4.116913318634033, + "learning_rate": 5.369958669001752e-05, + "loss": 1.0409, + "num_input_tokens_seen": 106450872, + "step": 6616 + }, + { + "epoch": 0.46350924199042093, + "grad_norm": 4.44846773147583, + "learning_rate": 5.3692588441330995e-05, + "loss": 0.8553, + "num_input_tokens_seen": 106467256, + "step": 6617 + }, + { + "epoch": 0.46357929023615013, + "grad_norm": 5.590776443481445, + "learning_rate": 5.3685590192644483e-05, + "loss": 0.9049, + "num_input_tokens_seen": 106483640, + "step": 6618 + }, + { + "epoch": 0.4636493384818794, + "grad_norm": 5.505274772644043, + "learning_rate": 5.3678591943957965e-05, + "loss": 1.2003, + "num_input_tokens_seen": 106500024, + "step": 6619 + }, + { + "epoch": 0.46371938672760865, + "grad_norm": 3.8726046085357666, + "learning_rate": 5.367159369527145e-05, + "loss": 0.953, + "num_input_tokens_seen": 106516408, + "step": 6620 + }, + { + "epoch": 0.4637894349733379, + "grad_norm": 3.9251434803009033, + "learning_rate": 5.366459544658494e-05, + "loss": 1.1335, + "num_input_tokens_seen": 106531504, + "step": 6621 + }, + { + "epoch": 0.4638594832190671, + "grad_norm": 3.9294116497039795, + "learning_rate": 5.365759719789844e-05, + "loss": 0.9266, + "num_input_tokens_seen": 106547888, + "step": 6622 + }, + { + "epoch": 0.4639295314647964, + "grad_norm": 4.324211120605469, + "learning_rate": 5.365059894921192e-05, + "loss": 1.0962, + "num_input_tokens_seen": 106564272, + "step": 6623 + }, + { + "epoch": 0.46399957971052563, + "grad_norm": 3.5331010818481445, + "learning_rate": 5.364360070052539e-05, + "loss": 0.9043, + "num_input_tokens_seen": 106580608, + "step": 6624 + }, + { + "epoch": 0.4640696279562549, + "grad_norm": 3.642073392868042, + "learning_rate": 5.363660245183889e-05, + "loss": 1.0047, + "num_input_tokens_seen": 106596760, + "step": 6625 + }, + { + "epoch": 0.4641396762019841, + "grad_norm": 4.356872081756592, + "learning_rate": 5.362960420315237e-05, + "loss": 1.0147, + "num_input_tokens_seen": 106613144, + "step": 6626 + }, + { + "epoch": 0.46420972444771336, + "grad_norm": 3.66884446144104, + "learning_rate": 5.362260595446584e-05, + "loss": 0.9679, + "num_input_tokens_seen": 106629528, + "step": 6627 + }, + { + "epoch": 0.4642797726934426, + "grad_norm": 4.089823246002197, + "learning_rate": 5.3615607705779335e-05, + "loss": 0.9901, + "num_input_tokens_seen": 106645864, + "step": 6628 + }, + { + "epoch": 0.4643498209391719, + "grad_norm": 4.411832332611084, + "learning_rate": 5.360860945709283e-05, + "loss": 1.0351, + "num_input_tokens_seen": 106662248, + "step": 6629 + }, + { + "epoch": 0.4644198691849011, + "grad_norm": 3.7563977241516113, + "learning_rate": 5.360161120840631e-05, + "loss": 1.0096, + "num_input_tokens_seen": 106678632, + "step": 6630 + }, + { + "epoch": 0.46448991743063034, + "grad_norm": 3.6493430137634277, + "learning_rate": 5.3594612959719794e-05, + "loss": 0.966, + "num_input_tokens_seen": 106694552, + "step": 6631 + }, + { + "epoch": 0.4645599656763596, + "grad_norm": 3.6459546089172363, + "learning_rate": 5.358761471103328e-05, + "loss": 1.0288, + "num_input_tokens_seen": 106710544, + "step": 6632 + }, + { + "epoch": 0.46463001392208886, + "grad_norm": 4.07296085357666, + "learning_rate": 5.3580616462346765e-05, + "loss": 1.0409, + "num_input_tokens_seen": 106726928, + "step": 6633 + }, + { + "epoch": 0.46470006216781806, + "grad_norm": 3.623961925506592, + "learning_rate": 5.357361821366024e-05, + "loss": 0.8566, + "num_input_tokens_seen": 106742048, + "step": 6634 + }, + { + "epoch": 0.4647701104135473, + "grad_norm": 3.8658370971679688, + "learning_rate": 5.356661996497374e-05, + "loss": 1.1116, + "num_input_tokens_seen": 106758432, + "step": 6635 + }, + { + "epoch": 0.4648401586592766, + "grad_norm": 7.479616641998291, + "learning_rate": 5.3559621716287224e-05, + "loss": 1.0146, + "num_input_tokens_seen": 106774816, + "step": 6636 + }, + { + "epoch": 0.46491020690500584, + "grad_norm": 5.282004356384277, + "learning_rate": 5.3552623467600706e-05, + "loss": 1.0662, + "num_input_tokens_seen": 106791200, + "step": 6637 + }, + { + "epoch": 0.46498025515073504, + "grad_norm": 5.323639392852783, + "learning_rate": 5.354562521891419e-05, + "loss": 1.159, + "num_input_tokens_seen": 106806400, + "step": 6638 + }, + { + "epoch": 0.4650503033964643, + "grad_norm": 3.709852933883667, + "learning_rate": 5.3538626970227676e-05, + "loss": 1.0024, + "num_input_tokens_seen": 106822224, + "step": 6639 + }, + { + "epoch": 0.46512035164219356, + "grad_norm": 3.583138942718506, + "learning_rate": 5.353162872154116e-05, + "loss": 1.1467, + "num_input_tokens_seen": 106838608, + "step": 6640 + }, + { + "epoch": 0.4651903998879228, + "grad_norm": 4.027291297912598, + "learning_rate": 5.352463047285464e-05, + "loss": 1.1139, + "num_input_tokens_seen": 106854992, + "step": 6641 + }, + { + "epoch": 0.465260448133652, + "grad_norm": 3.9708850383758545, + "learning_rate": 5.3517632224168135e-05, + "loss": 1.0602, + "num_input_tokens_seen": 106871376, + "step": 6642 + }, + { + "epoch": 0.4653304963793813, + "grad_norm": 5.148803234100342, + "learning_rate": 5.351063397548162e-05, + "loss": 1.0852, + "num_input_tokens_seen": 106887760, + "step": 6643 + }, + { + "epoch": 0.46540054462511055, + "grad_norm": 4.076368808746338, + "learning_rate": 5.3503635726795085e-05, + "loss": 1.1273, + "num_input_tokens_seen": 106904112, + "step": 6644 + }, + { + "epoch": 0.4654705928708398, + "grad_norm": 4.920746803283691, + "learning_rate": 5.349663747810858e-05, + "loss": 1.091, + "num_input_tokens_seen": 106919960, + "step": 6645 + }, + { + "epoch": 0.465540641116569, + "grad_norm": 3.8127434253692627, + "learning_rate": 5.3489639229422076e-05, + "loss": 0.9896, + "num_input_tokens_seen": 106935928, + "step": 6646 + }, + { + "epoch": 0.46561068936229827, + "grad_norm": 3.9216270446777344, + "learning_rate": 5.348264098073556e-05, + "loss": 1.0585, + "num_input_tokens_seen": 106952168, + "step": 6647 + }, + { + "epoch": 0.46568073760802753, + "grad_norm": 3.5133566856384277, + "learning_rate": 5.347564273204903e-05, + "loss": 0.8579, + "num_input_tokens_seen": 106968080, + "step": 6648 + }, + { + "epoch": 0.4657507858537568, + "grad_norm": 3.634164333343506, + "learning_rate": 5.346864448336253e-05, + "loss": 1.0907, + "num_input_tokens_seen": 106984464, + "step": 6649 + }, + { + "epoch": 0.46582083409948605, + "grad_norm": 3.7191765308380127, + "learning_rate": 5.346164623467601e-05, + "loss": 1.0374, + "num_input_tokens_seen": 107000848, + "step": 6650 + }, + { + "epoch": 0.46589088234521525, + "grad_norm": 3.767498254776001, + "learning_rate": 5.345464798598948e-05, + "loss": 1.0061, + "num_input_tokens_seen": 107017232, + "step": 6651 + }, + { + "epoch": 0.4659609305909445, + "grad_norm": 3.8340818881988525, + "learning_rate": 5.344764973730299e-05, + "loss": 1.075, + "num_input_tokens_seen": 107033616, + "step": 6652 + }, + { + "epoch": 0.46603097883667377, + "grad_norm": 9.20552921295166, + "learning_rate": 5.344065148861647e-05, + "loss": 1.1089, + "num_input_tokens_seen": 107049120, + "step": 6653 + }, + { + "epoch": 0.46610102708240303, + "grad_norm": 4.367069721221924, + "learning_rate": 5.343365323992995e-05, + "loss": 1.2988, + "num_input_tokens_seen": 107063432, + "step": 6654 + }, + { + "epoch": 0.46617107532813223, + "grad_norm": 3.6735596656799316, + "learning_rate": 5.342665499124343e-05, + "loss": 0.9416, + "num_input_tokens_seen": 107079712, + "step": 6655 + }, + { + "epoch": 0.4662411235738615, + "grad_norm": 4.066924095153809, + "learning_rate": 5.341965674255692e-05, + "loss": 1.1776, + "num_input_tokens_seen": 107096096, + "step": 6656 + }, + { + "epoch": 0.46631117181959075, + "grad_norm": 3.7454941272735596, + "learning_rate": 5.34126584938704e-05, + "loss": 1.0325, + "num_input_tokens_seen": 107112480, + "step": 6657 + }, + { + "epoch": 0.46638122006532, + "grad_norm": 3.738274574279785, + "learning_rate": 5.3405660245183885e-05, + "loss": 1.0846, + "num_input_tokens_seen": 107128864, + "step": 6658 + }, + { + "epoch": 0.4664512683110492, + "grad_norm": 8.665736198425293, + "learning_rate": 5.339866199649738e-05, + "loss": 1.1532, + "num_input_tokens_seen": 107144536, + "step": 6659 + }, + { + "epoch": 0.4665213165567785, + "grad_norm": 3.8733510971069336, + "learning_rate": 5.339166374781086e-05, + "loss": 0.7974, + "num_input_tokens_seen": 107160664, + "step": 6660 + }, + { + "epoch": 0.46659136480250774, + "grad_norm": 4.000319957733154, + "learning_rate": 5.3384665499124344e-05, + "loss": 1.0197, + "num_input_tokens_seen": 107177048, + "step": 6661 + }, + { + "epoch": 0.466661413048237, + "grad_norm": 3.8049557209014893, + "learning_rate": 5.337766725043783e-05, + "loss": 1.1256, + "num_input_tokens_seen": 107192496, + "step": 6662 + }, + { + "epoch": 0.4667314612939662, + "grad_norm": 4.009215354919434, + "learning_rate": 5.3370669001751314e-05, + "loss": 1.1667, + "num_input_tokens_seen": 107207912, + "step": 6663 + }, + { + "epoch": 0.46680150953969546, + "grad_norm": 6.3007378578186035, + "learning_rate": 5.3363670753064796e-05, + "loss": 1.3413, + "num_input_tokens_seen": 107223504, + "step": 6664 + }, + { + "epoch": 0.4668715577854247, + "grad_norm": 3.5798394680023193, + "learning_rate": 5.335667250437828e-05, + "loss": 1.0625, + "num_input_tokens_seen": 107239824, + "step": 6665 + }, + { + "epoch": 0.466941606031154, + "grad_norm": 4.701604843139648, + "learning_rate": 5.334967425569179e-05, + "loss": 1.1597, + "num_input_tokens_seen": 107256208, + "step": 6666 + }, + { + "epoch": 0.4670116542768832, + "grad_norm": 3.462380886077881, + "learning_rate": 5.3342676007005255e-05, + "loss": 0.976, + "num_input_tokens_seen": 107272592, + "step": 6667 + }, + { + "epoch": 0.46708170252261244, + "grad_norm": 5.546586513519287, + "learning_rate": 5.333567775831875e-05, + "loss": 1.3386, + "num_input_tokens_seen": 107287520, + "step": 6668 + }, + { + "epoch": 0.4671517507683417, + "grad_norm": 4.677948474884033, + "learning_rate": 5.332867950963223e-05, + "loss": 1.1263, + "num_input_tokens_seen": 107303608, + "step": 6669 + }, + { + "epoch": 0.46722179901407096, + "grad_norm": 3.95694899559021, + "learning_rate": 5.3321681260945714e-05, + "loss": 1.1536, + "num_input_tokens_seen": 107319992, + "step": 6670 + }, + { + "epoch": 0.46729184725980016, + "grad_norm": 4.037060737609863, + "learning_rate": 5.331468301225919e-05, + "loss": 1.0444, + "num_input_tokens_seen": 107336376, + "step": 6671 + }, + { + "epoch": 0.4673618955055294, + "grad_norm": 3.5486528873443604, + "learning_rate": 5.3307684763572684e-05, + "loss": 0.9887, + "num_input_tokens_seen": 107352760, + "step": 6672 + }, + { + "epoch": 0.4674319437512587, + "grad_norm": 3.868568181991577, + "learning_rate": 5.330068651488618e-05, + "loss": 0.8593, + "num_input_tokens_seen": 107369144, + "step": 6673 + }, + { + "epoch": 0.46750199199698794, + "grad_norm": 7.702548980712891, + "learning_rate": 5.329368826619965e-05, + "loss": 1.188, + "num_input_tokens_seen": 107385528, + "step": 6674 + }, + { + "epoch": 0.46757204024271715, + "grad_norm": 4.390200614929199, + "learning_rate": 5.328669001751314e-05, + "loss": 1.1342, + "num_input_tokens_seen": 107401304, + "step": 6675 + }, + { + "epoch": 0.4676420884884464, + "grad_norm": 3.7440412044525146, + "learning_rate": 5.327969176882663e-05, + "loss": 0.8969, + "num_input_tokens_seen": 107417688, + "step": 6676 + }, + { + "epoch": 0.46771213673417567, + "grad_norm": 4.894672870635986, + "learning_rate": 5.327269352014011e-05, + "loss": 1.0542, + "num_input_tokens_seen": 107433816, + "step": 6677 + }, + { + "epoch": 0.4677821849799049, + "grad_norm": 4.762908458709717, + "learning_rate": 5.326569527145359e-05, + "loss": 1.1926, + "num_input_tokens_seen": 107450200, + "step": 6678 + }, + { + "epoch": 0.46785223322563413, + "grad_norm": 4.3587870597839355, + "learning_rate": 5.325869702276708e-05, + "loss": 1.2127, + "num_input_tokens_seen": 107466584, + "step": 6679 + }, + { + "epoch": 0.4679222814713634, + "grad_norm": 4.166892051696777, + "learning_rate": 5.325169877408056e-05, + "loss": 1.0259, + "num_input_tokens_seen": 107482968, + "step": 6680 + }, + { + "epoch": 0.46799232971709265, + "grad_norm": 4.266642093658447, + "learning_rate": 5.324470052539404e-05, + "loss": 0.9715, + "num_input_tokens_seen": 107498192, + "step": 6681 + }, + { + "epoch": 0.4680623779628219, + "grad_norm": 3.3419625759124756, + "learning_rate": 5.3237702276707536e-05, + "loss": 1.0262, + "num_input_tokens_seen": 107514576, + "step": 6682 + }, + { + "epoch": 0.46813242620855117, + "grad_norm": 3.903163433074951, + "learning_rate": 5.3230704028021025e-05, + "loss": 1.0785, + "num_input_tokens_seen": 107530536, + "step": 6683 + }, + { + "epoch": 0.46820247445428037, + "grad_norm": 5.467947959899902, + "learning_rate": 5.32237057793345e-05, + "loss": 1.246, + "num_input_tokens_seen": 107546248, + "step": 6684 + }, + { + "epoch": 0.46827252270000963, + "grad_norm": 3.9213547706604004, + "learning_rate": 5.321670753064799e-05, + "loss": 1.1432, + "num_input_tokens_seen": 107561992, + "step": 6685 + }, + { + "epoch": 0.4683425709457389, + "grad_norm": 5.265954971313477, + "learning_rate": 5.320970928196147e-05, + "loss": 0.9934, + "num_input_tokens_seen": 107578376, + "step": 6686 + }, + { + "epoch": 0.46841261919146815, + "grad_norm": 3.9765655994415283, + "learning_rate": 5.320271103327495e-05, + "loss": 1.0219, + "num_input_tokens_seen": 107594680, + "step": 6687 + }, + { + "epoch": 0.46848266743719735, + "grad_norm": 4.261830806732178, + "learning_rate": 5.3195712784588434e-05, + "loss": 1.0328, + "num_input_tokens_seen": 107611064, + "step": 6688 + }, + { + "epoch": 0.4685527156829266, + "grad_norm": 7.026014804840088, + "learning_rate": 5.318871453590194e-05, + "loss": 0.9985, + "num_input_tokens_seen": 107626112, + "step": 6689 + }, + { + "epoch": 0.46862276392865587, + "grad_norm": 4.726694107055664, + "learning_rate": 5.3181716287215425e-05, + "loss": 1.0744, + "num_input_tokens_seen": 107642496, + "step": 6690 + }, + { + "epoch": 0.46869281217438513, + "grad_norm": 3.6380646228790283, + "learning_rate": 5.317471803852889e-05, + "loss": 1.1708, + "num_input_tokens_seen": 107658880, + "step": 6691 + }, + { + "epoch": 0.46876286042011434, + "grad_norm": 3.5807487964630127, + "learning_rate": 5.316771978984239e-05, + "loss": 1.1403, + "num_input_tokens_seen": 107675256, + "step": 6692 + }, + { + "epoch": 0.4688329086658436, + "grad_norm": 3.9915847778320312, + "learning_rate": 5.316072154115588e-05, + "loss": 1.0826, + "num_input_tokens_seen": 107691016, + "step": 6693 + }, + { + "epoch": 0.46890295691157285, + "grad_norm": 4.012253284454346, + "learning_rate": 5.3153723292469345e-05, + "loss": 1.0194, + "num_input_tokens_seen": 107707064, + "step": 6694 + }, + { + "epoch": 0.4689730051573021, + "grad_norm": 3.9562582969665527, + "learning_rate": 5.314672504378284e-05, + "loss": 1.0017, + "num_input_tokens_seen": 107723152, + "step": 6695 + }, + { + "epoch": 0.4690430534030313, + "grad_norm": 4.575549125671387, + "learning_rate": 5.3139726795096336e-05, + "loss": 1.0722, + "num_input_tokens_seen": 107739536, + "step": 6696 + }, + { + "epoch": 0.4691131016487606, + "grad_norm": 3.8225462436676025, + "learning_rate": 5.313272854640982e-05, + "loss": 1.0149, + "num_input_tokens_seen": 107755920, + "step": 6697 + }, + { + "epoch": 0.46918314989448984, + "grad_norm": 3.951275587081909, + "learning_rate": 5.31257302977233e-05, + "loss": 0.9675, + "num_input_tokens_seen": 107772296, + "step": 6698 + }, + { + "epoch": 0.4692531981402191, + "grad_norm": 3.5939061641693115, + "learning_rate": 5.311873204903678e-05, + "loss": 0.9912, + "num_input_tokens_seen": 107788480, + "step": 6699 + }, + { + "epoch": 0.4693232463859483, + "grad_norm": 7.109866619110107, + "learning_rate": 5.311173380035027e-05, + "loss": 0.8939, + "num_input_tokens_seen": 107804768, + "step": 6700 + }, + { + "epoch": 0.46939329463167756, + "grad_norm": 3.6135330200195312, + "learning_rate": 5.3104735551663745e-05, + "loss": 1.1388, + "num_input_tokens_seen": 107820632, + "step": 6701 + }, + { + "epoch": 0.4694633428774068, + "grad_norm": 4.7758331298828125, + "learning_rate": 5.3097737302977234e-05, + "loss": 0.9504, + "num_input_tokens_seen": 107837016, + "step": 6702 + }, + { + "epoch": 0.4695333911231361, + "grad_norm": 3.7631545066833496, + "learning_rate": 5.309073905429073e-05, + "loss": 1.021, + "num_input_tokens_seen": 107853400, + "step": 6703 + }, + { + "epoch": 0.4696034393688653, + "grad_norm": 5.737015247344971, + "learning_rate": 5.30837408056042e-05, + "loss": 1.053, + "num_input_tokens_seen": 107869784, + "step": 6704 + }, + { + "epoch": 0.46967348761459454, + "grad_norm": 3.845569610595703, + "learning_rate": 5.307674255691769e-05, + "loss": 1.0225, + "num_input_tokens_seen": 107885760, + "step": 6705 + }, + { + "epoch": 0.4697435358603238, + "grad_norm": 7.402350902557373, + "learning_rate": 5.306974430823118e-05, + "loss": 1.0404, + "num_input_tokens_seen": 107902144, + "step": 6706 + }, + { + "epoch": 0.46981358410605306, + "grad_norm": 4.036012649536133, + "learning_rate": 5.306274605954466e-05, + "loss": 1.0646, + "num_input_tokens_seen": 107918528, + "step": 6707 + }, + { + "epoch": 0.46988363235178227, + "grad_norm": 5.720461845397949, + "learning_rate": 5.3055747810858145e-05, + "loss": 0.9158, + "num_input_tokens_seen": 107934912, + "step": 6708 + }, + { + "epoch": 0.4699536805975115, + "grad_norm": 4.842574119567871, + "learning_rate": 5.304874956217163e-05, + "loss": 1.0039, + "num_input_tokens_seen": 107950800, + "step": 6709 + }, + { + "epoch": 0.4700237288432408, + "grad_norm": 3.787020444869995, + "learning_rate": 5.304175131348512e-05, + "loss": 0.8436, + "num_input_tokens_seen": 107967184, + "step": 6710 + }, + { + "epoch": 0.47009377708897004, + "grad_norm": 4.2691192626953125, + "learning_rate": 5.303475306479859e-05, + "loss": 1.0335, + "num_input_tokens_seen": 107983568, + "step": 6711 + }, + { + "epoch": 0.47016382533469925, + "grad_norm": 5.233339786529541, + "learning_rate": 5.3027754816112086e-05, + "loss": 1.0575, + "num_input_tokens_seen": 107999952, + "step": 6712 + }, + { + "epoch": 0.4702338735804285, + "grad_norm": 3.421193838119507, + "learning_rate": 5.302075656742558e-05, + "loss": 1.022, + "num_input_tokens_seen": 108016336, + "step": 6713 + }, + { + "epoch": 0.47030392182615777, + "grad_norm": 4.561410427093506, + "learning_rate": 5.301375831873906e-05, + "loss": 1.173, + "num_input_tokens_seen": 108032720, + "step": 6714 + }, + { + "epoch": 0.470373970071887, + "grad_norm": 4.749919891357422, + "learning_rate": 5.3006760070052545e-05, + "loss": 1.05, + "num_input_tokens_seen": 108049104, + "step": 6715 + }, + { + "epoch": 0.47044401831761623, + "grad_norm": 4.774212837219238, + "learning_rate": 5.299976182136603e-05, + "loss": 1.0433, + "num_input_tokens_seen": 108065488, + "step": 6716 + }, + { + "epoch": 0.4705140665633455, + "grad_norm": 3.6954824924468994, + "learning_rate": 5.2992763572679515e-05, + "loss": 0.9831, + "num_input_tokens_seen": 108081224, + "step": 6717 + }, + { + "epoch": 0.47058411480907475, + "grad_norm": 5.202620983123779, + "learning_rate": 5.2985765323993e-05, + "loss": 1.0583, + "num_input_tokens_seen": 108097608, + "step": 6718 + }, + { + "epoch": 0.470654163054804, + "grad_norm": 3.7043261528015137, + "learning_rate": 5.297876707530649e-05, + "loss": 0.9753, + "num_input_tokens_seen": 108113992, + "step": 6719 + }, + { + "epoch": 0.47072421130053327, + "grad_norm": 4.06228494644165, + "learning_rate": 5.2971768826619974e-05, + "loss": 1.0117, + "num_input_tokens_seen": 108130376, + "step": 6720 + }, + { + "epoch": 0.4707942595462625, + "grad_norm": 3.4427239894866943, + "learning_rate": 5.2964770577933456e-05, + "loss": 0.9207, + "num_input_tokens_seen": 108146760, + "step": 6721 + }, + { + "epoch": 0.47086430779199173, + "grad_norm": 6.617749214172363, + "learning_rate": 5.295777232924694e-05, + "loss": 1.1472, + "num_input_tokens_seen": 108163144, + "step": 6722 + }, + { + "epoch": 0.470934356037721, + "grad_norm": 3.744797706604004, + "learning_rate": 5.2950774080560426e-05, + "loss": 1.0143, + "num_input_tokens_seen": 108179528, + "step": 6723 + }, + { + "epoch": 0.47100440428345025, + "grad_norm": 5.034976005554199, + "learning_rate": 5.294377583187391e-05, + "loss": 1.0061, + "num_input_tokens_seen": 108195248, + "step": 6724 + }, + { + "epoch": 0.47107445252917945, + "grad_norm": 3.9690632820129395, + "learning_rate": 5.293677758318739e-05, + "loss": 1.2634, + "num_input_tokens_seen": 108210920, + "step": 6725 + }, + { + "epoch": 0.4711445007749087, + "grad_norm": 3.351450204849243, + "learning_rate": 5.292977933450087e-05, + "loss": 0.876, + "num_input_tokens_seen": 108227304, + "step": 6726 + }, + { + "epoch": 0.471214549020638, + "grad_norm": 3.7437920570373535, + "learning_rate": 5.292278108581437e-05, + "loss": 0.9549, + "num_input_tokens_seen": 108243688, + "step": 6727 + }, + { + "epoch": 0.47128459726636723, + "grad_norm": 6.022392272949219, + "learning_rate": 5.291578283712785e-05, + "loss": 1.0568, + "num_input_tokens_seen": 108260072, + "step": 6728 + }, + { + "epoch": 0.47135464551209644, + "grad_norm": 4.407289505004883, + "learning_rate": 5.290878458844134e-05, + "loss": 1.0511, + "num_input_tokens_seen": 108276456, + "step": 6729 + }, + { + "epoch": 0.4714246937578257, + "grad_norm": 3.9509878158569336, + "learning_rate": 5.290178633975482e-05, + "loss": 1.1648, + "num_input_tokens_seen": 108291632, + "step": 6730 + }, + { + "epoch": 0.47149474200355496, + "grad_norm": 4.2412285804748535, + "learning_rate": 5.28947880910683e-05, + "loss": 1.1903, + "num_input_tokens_seen": 108308016, + "step": 6731 + }, + { + "epoch": 0.4715647902492842, + "grad_norm": 4.234686374664307, + "learning_rate": 5.288778984238178e-05, + "loss": 1.1111, + "num_input_tokens_seen": 108323984, + "step": 6732 + }, + { + "epoch": 0.4716348384950134, + "grad_norm": 4.565019130706787, + "learning_rate": 5.288079159369529e-05, + "loss": 1.096, + "num_input_tokens_seen": 108340368, + "step": 6733 + }, + { + "epoch": 0.4717048867407427, + "grad_norm": 4.805628299713135, + "learning_rate": 5.287379334500876e-05, + "loss": 1.1239, + "num_input_tokens_seen": 108356752, + "step": 6734 + }, + { + "epoch": 0.47177493498647194, + "grad_norm": 3.9647700786590576, + "learning_rate": 5.286679509632224e-05, + "loss": 1.1555, + "num_input_tokens_seen": 108372216, + "step": 6735 + }, + { + "epoch": 0.4718449832322012, + "grad_norm": 3.811239004135132, + "learning_rate": 5.285979684763574e-05, + "loss": 0.9169, + "num_input_tokens_seen": 108387696, + "step": 6736 + }, + { + "epoch": 0.4719150314779304, + "grad_norm": 4.559319496154785, + "learning_rate": 5.285279859894922e-05, + "loss": 0.918, + "num_input_tokens_seen": 108403944, + "step": 6737 + }, + { + "epoch": 0.47198507972365966, + "grad_norm": 4.727875232696533, + "learning_rate": 5.2845800350262694e-05, + "loss": 1.1424, + "num_input_tokens_seen": 108420328, + "step": 6738 + }, + { + "epoch": 0.4720551279693889, + "grad_norm": 3.8609120845794678, + "learning_rate": 5.283880210157619e-05, + "loss": 1.0053, + "num_input_tokens_seen": 108436712, + "step": 6739 + }, + { + "epoch": 0.4721251762151182, + "grad_norm": 3.804370164871216, + "learning_rate": 5.2831803852889685e-05, + "loss": 1.0733, + "num_input_tokens_seen": 108453040, + "step": 6740 + }, + { + "epoch": 0.4721952244608474, + "grad_norm": 3.939620018005371, + "learning_rate": 5.282480560420315e-05, + "loss": 1.0229, + "num_input_tokens_seen": 108468880, + "step": 6741 + }, + { + "epoch": 0.47226527270657664, + "grad_norm": 4.376893043518066, + "learning_rate": 5.2817807355516635e-05, + "loss": 1.131, + "num_input_tokens_seen": 108485248, + "step": 6742 + }, + { + "epoch": 0.4723353209523059, + "grad_norm": 5.025060653686523, + "learning_rate": 5.281080910683014e-05, + "loss": 1.1299, + "num_input_tokens_seen": 108501040, + "step": 6743 + }, + { + "epoch": 0.47240536919803516, + "grad_norm": 3.524656057357788, + "learning_rate": 5.280381085814361e-05, + "loss": 0.9653, + "num_input_tokens_seen": 108516624, + "step": 6744 + }, + { + "epoch": 0.47247541744376437, + "grad_norm": 3.8542211055755615, + "learning_rate": 5.2796812609457094e-05, + "loss": 1.0025, + "num_input_tokens_seen": 108533008, + "step": 6745 + }, + { + "epoch": 0.4725454656894936, + "grad_norm": 3.8751041889190674, + "learning_rate": 5.278981436077058e-05, + "loss": 1.1803, + "num_input_tokens_seen": 108549112, + "step": 6746 + }, + { + "epoch": 0.4726155139352229, + "grad_norm": 4.343238353729248, + "learning_rate": 5.2782816112084064e-05, + "loss": 0.9759, + "num_input_tokens_seen": 108564328, + "step": 6747 + }, + { + "epoch": 0.47268556218095215, + "grad_norm": 3.695493698120117, + "learning_rate": 5.2775817863397546e-05, + "loss": 0.8834, + "num_input_tokens_seen": 108580112, + "step": 6748 + }, + { + "epoch": 0.47275561042668135, + "grad_norm": 3.8947877883911133, + "learning_rate": 5.276881961471104e-05, + "loss": 1.0522, + "num_input_tokens_seen": 108596136, + "step": 6749 + }, + { + "epoch": 0.4728256586724106, + "grad_norm": 4.2317633628845215, + "learning_rate": 5.276182136602453e-05, + "loss": 0.9472, + "num_input_tokens_seen": 108612520, + "step": 6750 + }, + { + "epoch": 0.47289570691813987, + "grad_norm": 3.608283281326294, + "learning_rate": 5.2754823117338005e-05, + "loss": 1.0748, + "num_input_tokens_seen": 108628904, + "step": 6751 + }, + { + "epoch": 0.47296575516386913, + "grad_norm": 4.512143611907959, + "learning_rate": 5.274782486865149e-05, + "loss": 1.0156, + "num_input_tokens_seen": 108644248, + "step": 6752 + }, + { + "epoch": 0.4730358034095984, + "grad_norm": 3.81160044670105, + "learning_rate": 5.2740826619964976e-05, + "loss": 1.0496, + "num_input_tokens_seen": 108660488, + "step": 6753 + }, + { + "epoch": 0.4731058516553276, + "grad_norm": 3.760336399078369, + "learning_rate": 5.273382837127846e-05, + "loss": 1.0335, + "num_input_tokens_seen": 108676872, + "step": 6754 + }, + { + "epoch": 0.47317589990105685, + "grad_norm": 3.969651222229004, + "learning_rate": 5.272683012259194e-05, + "loss": 1.2213, + "num_input_tokens_seen": 108693256, + "step": 6755 + }, + { + "epoch": 0.4732459481467861, + "grad_norm": 4.55695915222168, + "learning_rate": 5.2719831873905435e-05, + "loss": 0.9125, + "num_input_tokens_seen": 108709576, + "step": 6756 + }, + { + "epoch": 0.47331599639251537, + "grad_norm": 4.36952018737793, + "learning_rate": 5.271283362521893e-05, + "loss": 1.0403, + "num_input_tokens_seen": 108725520, + "step": 6757 + }, + { + "epoch": 0.4733860446382446, + "grad_norm": 4.689207553863525, + "learning_rate": 5.27058353765324e-05, + "loss": 1.0875, + "num_input_tokens_seen": 108741744, + "step": 6758 + }, + { + "epoch": 0.47345609288397383, + "grad_norm": 3.5912058353424072, + "learning_rate": 5.269883712784588e-05, + "loss": 1.1125, + "num_input_tokens_seen": 108757952, + "step": 6759 + }, + { + "epoch": 0.4735261411297031, + "grad_norm": 4.725868225097656, + "learning_rate": 5.269183887915938e-05, + "loss": 0.9312, + "num_input_tokens_seen": 108774088, + "step": 6760 + }, + { + "epoch": 0.47359618937543235, + "grad_norm": 4.213376045227051, + "learning_rate": 5.268484063047285e-05, + "loss": 1.0752, + "num_input_tokens_seen": 108790472, + "step": 6761 + }, + { + "epoch": 0.47366623762116156, + "grad_norm": 4.116434574127197, + "learning_rate": 5.267784238178633e-05, + "loss": 1.0481, + "num_input_tokens_seen": 108806776, + "step": 6762 + }, + { + "epoch": 0.4737362858668908, + "grad_norm": 3.8367996215820312, + "learning_rate": 5.267084413309983e-05, + "loss": 1.0882, + "num_input_tokens_seen": 108822416, + "step": 6763 + }, + { + "epoch": 0.4738063341126201, + "grad_norm": 3.609545946121216, + "learning_rate": 5.266384588441332e-05, + "loss": 0.929, + "num_input_tokens_seen": 108838208, + "step": 6764 + }, + { + "epoch": 0.47387638235834934, + "grad_norm": 4.108180522918701, + "learning_rate": 5.2656847635726805e-05, + "loss": 0.9622, + "num_input_tokens_seen": 108854592, + "step": 6765 + }, + { + "epoch": 0.47394643060407854, + "grad_norm": 4.884720325469971, + "learning_rate": 5.2649849387040287e-05, + "loss": 1.1246, + "num_input_tokens_seen": 108870976, + "step": 6766 + }, + { + "epoch": 0.4740164788498078, + "grad_norm": 4.856875896453857, + "learning_rate": 5.2642851138353775e-05, + "loss": 1.2403, + "num_input_tokens_seen": 108885688, + "step": 6767 + }, + { + "epoch": 0.47408652709553706, + "grad_norm": 3.5622432231903076, + "learning_rate": 5.263585288966725e-05, + "loss": 0.9572, + "num_input_tokens_seen": 108902072, + "step": 6768 + }, + { + "epoch": 0.4741565753412663, + "grad_norm": 5.305510997772217, + "learning_rate": 5.262885464098073e-05, + "loss": 1.229, + "num_input_tokens_seen": 108917848, + "step": 6769 + }, + { + "epoch": 0.4742266235869955, + "grad_norm": 3.729074478149414, + "learning_rate": 5.2621856392294234e-05, + "loss": 0.9361, + "num_input_tokens_seen": 108934232, + "step": 6770 + }, + { + "epoch": 0.4742966718327248, + "grad_norm": 4.5915937423706055, + "learning_rate": 5.26148581436077e-05, + "loss": 1.0442, + "num_input_tokens_seen": 108949696, + "step": 6771 + }, + { + "epoch": 0.47436672007845404, + "grad_norm": 3.977216958999634, + "learning_rate": 5.26078598949212e-05, + "loss": 1.2395, + "num_input_tokens_seen": 108965848, + "step": 6772 + }, + { + "epoch": 0.4744367683241833, + "grad_norm": 4.012653827667236, + "learning_rate": 5.260086164623468e-05, + "loss": 0.9013, + "num_input_tokens_seen": 108982232, + "step": 6773 + }, + { + "epoch": 0.4745068165699125, + "grad_norm": 4.10910701751709, + "learning_rate": 5.259386339754817e-05, + "loss": 0.9896, + "num_input_tokens_seen": 108997800, + "step": 6774 + }, + { + "epoch": 0.47457686481564176, + "grad_norm": 5.1765336990356445, + "learning_rate": 5.258686514886165e-05, + "loss": 1.1068, + "num_input_tokens_seen": 109013664, + "step": 6775 + }, + { + "epoch": 0.474646913061371, + "grad_norm": 5.6664228439331055, + "learning_rate": 5.257986690017513e-05, + "loss": 1.345, + "num_input_tokens_seen": 109029208, + "step": 6776 + }, + { + "epoch": 0.4747169613071003, + "grad_norm": 6.2354817390441895, + "learning_rate": 5.257286865148863e-05, + "loss": 1.1819, + "num_input_tokens_seen": 109044528, + "step": 6777 + }, + { + "epoch": 0.4747870095528295, + "grad_norm": 3.8308510780334473, + "learning_rate": 5.2565870402802095e-05, + "loss": 0.9639, + "num_input_tokens_seen": 109060912, + "step": 6778 + }, + { + "epoch": 0.47485705779855875, + "grad_norm": 4.019093990325928, + "learning_rate": 5.255887215411558e-05, + "loss": 0.9385, + "num_input_tokens_seen": 109077296, + "step": 6779 + }, + { + "epoch": 0.474927106044288, + "grad_norm": 6.938348293304443, + "learning_rate": 5.255187390542907e-05, + "loss": 0.9974, + "num_input_tokens_seen": 109093680, + "step": 6780 + }, + { + "epoch": 0.47499715429001726, + "grad_norm": 4.200627326965332, + "learning_rate": 5.254487565674257e-05, + "loss": 1.0353, + "num_input_tokens_seen": 109110008, + "step": 6781 + }, + { + "epoch": 0.47506720253574647, + "grad_norm": 4.06279993057251, + "learning_rate": 5.253787740805605e-05, + "loss": 0.9581, + "num_input_tokens_seen": 109126392, + "step": 6782 + }, + { + "epoch": 0.47513725078147573, + "grad_norm": 4.124377250671387, + "learning_rate": 5.2530879159369525e-05, + "loss": 1.2065, + "num_input_tokens_seen": 109142680, + "step": 6783 + }, + { + "epoch": 0.475207299027205, + "grad_norm": 4.182784557342529, + "learning_rate": 5.252388091068302e-05, + "loss": 1.073, + "num_input_tokens_seen": 109158768, + "step": 6784 + }, + { + "epoch": 0.47527734727293425, + "grad_norm": 4.513407230377197, + "learning_rate": 5.25168826619965e-05, + "loss": 1.0392, + "num_input_tokens_seen": 109175152, + "step": 6785 + }, + { + "epoch": 0.4753473955186635, + "grad_norm": 3.251490354537964, + "learning_rate": 5.250988441330997e-05, + "loss": 0.8611, + "num_input_tokens_seen": 109191056, + "step": 6786 + }, + { + "epoch": 0.4754174437643927, + "grad_norm": 8.621055603027344, + "learning_rate": 5.250288616462348e-05, + "loss": 1.0699, + "num_input_tokens_seen": 109206888, + "step": 6787 + }, + { + "epoch": 0.47548749201012197, + "grad_norm": 4.264245986938477, + "learning_rate": 5.249588791593696e-05, + "loss": 0.9409, + "num_input_tokens_seen": 109223272, + "step": 6788 + }, + { + "epoch": 0.47555754025585123, + "grad_norm": 3.6648037433624268, + "learning_rate": 5.248888966725044e-05, + "loss": 1.0575, + "num_input_tokens_seen": 109239544, + "step": 6789 + }, + { + "epoch": 0.4756275885015805, + "grad_norm": 4.528952598571777, + "learning_rate": 5.2481891418563925e-05, + "loss": 1.1251, + "num_input_tokens_seen": 109255816, + "step": 6790 + }, + { + "epoch": 0.4756976367473097, + "grad_norm": 4.45644998550415, + "learning_rate": 5.247489316987741e-05, + "loss": 1.095, + "num_input_tokens_seen": 109272200, + "step": 6791 + }, + { + "epoch": 0.47576768499303895, + "grad_norm": 3.8969879150390625, + "learning_rate": 5.2467894921190895e-05, + "loss": 0.9836, + "num_input_tokens_seen": 109288520, + "step": 6792 + }, + { + "epoch": 0.4758377332387682, + "grad_norm": 3.627748727798462, + "learning_rate": 5.246089667250438e-05, + "loss": 1.1656, + "num_input_tokens_seen": 109304624, + "step": 6793 + }, + { + "epoch": 0.47590778148449747, + "grad_norm": 4.493330478668213, + "learning_rate": 5.245389842381787e-05, + "loss": 1.2352, + "num_input_tokens_seen": 109319976, + "step": 6794 + }, + { + "epoch": 0.4759778297302267, + "grad_norm": 3.5947048664093018, + "learning_rate": 5.2446900175131354e-05, + "loss": 1.035, + "num_input_tokens_seen": 109336360, + "step": 6795 + }, + { + "epoch": 0.47604787797595594, + "grad_norm": 4.194823741912842, + "learning_rate": 5.2439901926444836e-05, + "loss": 0.9851, + "num_input_tokens_seen": 109351624, + "step": 6796 + }, + { + "epoch": 0.4761179262216852, + "grad_norm": 3.9734160900115967, + "learning_rate": 5.2432903677758324e-05, + "loss": 1.2019, + "num_input_tokens_seen": 109367072, + "step": 6797 + }, + { + "epoch": 0.47618797446741445, + "grad_norm": 4.142136096954346, + "learning_rate": 5.2425905429071806e-05, + "loss": 1.1178, + "num_input_tokens_seen": 109383408, + "step": 6798 + }, + { + "epoch": 0.47625802271314366, + "grad_norm": 4.315369129180908, + "learning_rate": 5.241890718038529e-05, + "loss": 1.2254, + "num_input_tokens_seen": 109398616, + "step": 6799 + }, + { + "epoch": 0.4763280709588729, + "grad_norm": 4.77875280380249, + "learning_rate": 5.241190893169877e-05, + "loss": 1.1018, + "num_input_tokens_seen": 109414592, + "step": 6800 + }, + { + "epoch": 0.4763280709588729, + "eval_loss": 1.1252864599227905, + "eval_runtime": 0.1585, + "eval_samples_per_second": 6.311, + "eval_steps_per_second": 6.311, + "num_input_tokens_seen": 109414592, + "step": 6800 + }, + { + "epoch": 0.4763981192046022, + "grad_norm": 4.054019927978516, + "learning_rate": 5.2404910683012265e-05, + "loss": 1.1978, + "num_input_tokens_seen": 109430896, + "step": 6801 + }, + { + "epoch": 0.47646816745033144, + "grad_norm": 4.0688276290893555, + "learning_rate": 5.239791243432575e-05, + "loss": 1.015, + "num_input_tokens_seen": 109447008, + "step": 6802 + }, + { + "epoch": 0.47653821569606064, + "grad_norm": 4.081553936004639, + "learning_rate": 5.239091418563924e-05, + "loss": 1.2566, + "num_input_tokens_seen": 109463392, + "step": 6803 + }, + { + "epoch": 0.4766082639417899, + "grad_norm": 4.719587326049805, + "learning_rate": 5.2383915936952724e-05, + "loss": 1.0577, + "num_input_tokens_seen": 109478768, + "step": 6804 + }, + { + "epoch": 0.47667831218751916, + "grad_norm": 3.7197132110595703, + "learning_rate": 5.2376917688266206e-05, + "loss": 1.0442, + "num_input_tokens_seen": 109494808, + "step": 6805 + }, + { + "epoch": 0.4767483604332484, + "grad_norm": 5.000951290130615, + "learning_rate": 5.236991943957968e-05, + "loss": 1.0497, + "num_input_tokens_seen": 109511192, + "step": 6806 + }, + { + "epoch": 0.4768184086789776, + "grad_norm": 3.9910333156585693, + "learning_rate": 5.236292119089316e-05, + "loss": 1.2905, + "num_input_tokens_seen": 109527576, + "step": 6807 + }, + { + "epoch": 0.4768884569247069, + "grad_norm": 4.522314548492432, + "learning_rate": 5.235592294220666e-05, + "loss": 0.9959, + "num_input_tokens_seen": 109543960, + "step": 6808 + }, + { + "epoch": 0.47695850517043614, + "grad_norm": 3.7235898971557617, + "learning_rate": 5.234892469352014e-05, + "loss": 0.9931, + "num_input_tokens_seen": 109560344, + "step": 6809 + }, + { + "epoch": 0.4770285534161654, + "grad_norm": 3.643763303756714, + "learning_rate": 5.2341926444833635e-05, + "loss": 0.9588, + "num_input_tokens_seen": 109576728, + "step": 6810 + }, + { + "epoch": 0.4770986016618946, + "grad_norm": 5.52113151550293, + "learning_rate": 5.233492819614712e-05, + "loss": 1.2022, + "num_input_tokens_seen": 109592584, + "step": 6811 + }, + { + "epoch": 0.47716864990762387, + "grad_norm": 4.9974188804626465, + "learning_rate": 5.23279299474606e-05, + "loss": 1.1755, + "num_input_tokens_seen": 109608960, + "step": 6812 + }, + { + "epoch": 0.4772386981533531, + "grad_norm": 5.266491889953613, + "learning_rate": 5.232093169877408e-05, + "loss": 1.1099, + "num_input_tokens_seen": 109625104, + "step": 6813 + }, + { + "epoch": 0.4773087463990824, + "grad_norm": 3.9919018745422363, + "learning_rate": 5.231393345008757e-05, + "loss": 1.0423, + "num_input_tokens_seen": 109641488, + "step": 6814 + }, + { + "epoch": 0.4773787946448116, + "grad_norm": 5.361277103424072, + "learning_rate": 5.230693520140105e-05, + "loss": 1.215, + "num_input_tokens_seen": 109657872, + "step": 6815 + }, + { + "epoch": 0.47744884289054085, + "grad_norm": 4.024937629699707, + "learning_rate": 5.229993695271453e-05, + "loss": 1.2601, + "num_input_tokens_seen": 109674256, + "step": 6816 + }, + { + "epoch": 0.4775188911362701, + "grad_norm": 3.7742490768432617, + "learning_rate": 5.229293870402803e-05, + "loss": 1.0789, + "num_input_tokens_seen": 109690576, + "step": 6817 + }, + { + "epoch": 0.47758893938199937, + "grad_norm": 3.622018814086914, + "learning_rate": 5.228594045534151e-05, + "loss": 0.8893, + "num_input_tokens_seen": 109706592, + "step": 6818 + }, + { + "epoch": 0.47765898762772857, + "grad_norm": 4.550981044769287, + "learning_rate": 5.227894220665499e-05, + "loss": 1.31, + "num_input_tokens_seen": 109722976, + "step": 6819 + }, + { + "epoch": 0.47772903587345783, + "grad_norm": 3.8553786277770996, + "learning_rate": 5.227194395796848e-05, + "loss": 0.9512, + "num_input_tokens_seen": 109738920, + "step": 6820 + }, + { + "epoch": 0.4777990841191871, + "grad_norm": 3.7159841060638428, + "learning_rate": 5.226494570928196e-05, + "loss": 0.9445, + "num_input_tokens_seen": 109755304, + "step": 6821 + }, + { + "epoch": 0.47786913236491635, + "grad_norm": 5.884495258331299, + "learning_rate": 5.2257947460595444e-05, + "loss": 0.9789, + "num_input_tokens_seen": 109771576, + "step": 6822 + }, + { + "epoch": 0.4779391806106456, + "grad_norm": 3.7047083377838135, + "learning_rate": 5.2250949211908926e-05, + "loss": 1.0297, + "num_input_tokens_seen": 109787872, + "step": 6823 + }, + { + "epoch": 0.4780092288563748, + "grad_norm": 3.485847234725952, + "learning_rate": 5.224395096322241e-05, + "loss": 0.9269, + "num_input_tokens_seen": 109803384, + "step": 6824 + }, + { + "epoch": 0.47807927710210407, + "grad_norm": 3.9222450256347656, + "learning_rate": 5.223695271453592e-05, + "loss": 1.0749, + "num_input_tokens_seen": 109818704, + "step": 6825 + }, + { + "epoch": 0.47814932534783333, + "grad_norm": 4.232855796813965, + "learning_rate": 5.2229954465849385e-05, + "loss": 1.1773, + "num_input_tokens_seen": 109835088, + "step": 6826 + }, + { + "epoch": 0.4782193735935626, + "grad_norm": 3.5413403511047363, + "learning_rate": 5.222295621716288e-05, + "loss": 0.9407, + "num_input_tokens_seen": 109851472, + "step": 6827 + }, + { + "epoch": 0.4782894218392918, + "grad_norm": 4.55118989944458, + "learning_rate": 5.2215957968476356e-05, + "loss": 1.1199, + "num_input_tokens_seen": 109867856, + "step": 6828 + }, + { + "epoch": 0.47835947008502105, + "grad_norm": 3.691756010055542, + "learning_rate": 5.220895971978984e-05, + "loss": 0.9721, + "num_input_tokens_seen": 109884240, + "step": 6829 + }, + { + "epoch": 0.4784295183307503, + "grad_norm": 3.588829755783081, + "learning_rate": 5.220196147110333e-05, + "loss": 1.0665, + "num_input_tokens_seen": 109900624, + "step": 6830 + }, + { + "epoch": 0.4784995665764796, + "grad_norm": 4.766005516052246, + "learning_rate": 5.219496322241683e-05, + "loss": 1.0467, + "num_input_tokens_seen": 109917008, + "step": 6831 + }, + { + "epoch": 0.4785696148222088, + "grad_norm": 3.7234201431274414, + "learning_rate": 5.218796497373031e-05, + "loss": 1.0377, + "num_input_tokens_seen": 109933392, + "step": 6832 + }, + { + "epoch": 0.47863966306793804, + "grad_norm": 3.434387683868408, + "learning_rate": 5.218096672504378e-05, + "loss": 0.874, + "num_input_tokens_seen": 109949776, + "step": 6833 + }, + { + "epoch": 0.4787097113136673, + "grad_norm": 3.7484259605407715, + "learning_rate": 5.2173968476357274e-05, + "loss": 0.9365, + "num_input_tokens_seen": 109966016, + "step": 6834 + }, + { + "epoch": 0.47877975955939656, + "grad_norm": 5.821316719055176, + "learning_rate": 5.216697022767076e-05, + "loss": 0.9894, + "num_input_tokens_seen": 109981168, + "step": 6835 + }, + { + "epoch": 0.47884980780512576, + "grad_norm": 5.2646484375, + "learning_rate": 5.215997197898424e-05, + "loss": 1.0894, + "num_input_tokens_seen": 109996648, + "step": 6836 + }, + { + "epoch": 0.478919856050855, + "grad_norm": 5.125279426574707, + "learning_rate": 5.2152973730297726e-05, + "loss": 0.9451, + "num_input_tokens_seen": 110013032, + "step": 6837 + }, + { + "epoch": 0.4789899042965843, + "grad_norm": 4.917844295501709, + "learning_rate": 5.214597548161121e-05, + "loss": 1.1573, + "num_input_tokens_seen": 110029040, + "step": 6838 + }, + { + "epoch": 0.47905995254231354, + "grad_norm": 3.6937522888183594, + "learning_rate": 5.21389772329247e-05, + "loss": 1.0922, + "num_input_tokens_seen": 110045032, + "step": 6839 + }, + { + "epoch": 0.47913000078804274, + "grad_norm": 4.9768757820129395, + "learning_rate": 5.213197898423817e-05, + "loss": 1.3347, + "num_input_tokens_seen": 110061416, + "step": 6840 + }, + { + "epoch": 0.479200049033772, + "grad_norm": 5.775148391723633, + "learning_rate": 5.212498073555167e-05, + "loss": 1.1443, + "num_input_tokens_seen": 110077800, + "step": 6841 + }, + { + "epoch": 0.47927009727950126, + "grad_norm": 4.3342766761779785, + "learning_rate": 5.2117982486865155e-05, + "loss": 1.0604, + "num_input_tokens_seen": 110093848, + "step": 6842 + }, + { + "epoch": 0.4793401455252305, + "grad_norm": 3.6098031997680664, + "learning_rate": 5.211098423817863e-05, + "loss": 1.0893, + "num_input_tokens_seen": 110110232, + "step": 6843 + }, + { + "epoch": 0.4794101937709597, + "grad_norm": 3.7780818939208984, + "learning_rate": 5.210398598949212e-05, + "loss": 0.9852, + "num_input_tokens_seen": 110126584, + "step": 6844 + }, + { + "epoch": 0.479480242016689, + "grad_norm": 3.732302188873291, + "learning_rate": 5.2096987740805614e-05, + "loss": 0.9158, + "num_input_tokens_seen": 110142968, + "step": 6845 + }, + { + "epoch": 0.47955029026241824, + "grad_norm": 4.920741558074951, + "learning_rate": 5.208998949211908e-05, + "loss": 0.9931, + "num_input_tokens_seen": 110159352, + "step": 6846 + }, + { + "epoch": 0.4796203385081475, + "grad_norm": 3.847682476043701, + "learning_rate": 5.208299124343258e-05, + "loss": 1.1485, + "num_input_tokens_seen": 110175736, + "step": 6847 + }, + { + "epoch": 0.4796903867538767, + "grad_norm": 3.8941121101379395, + "learning_rate": 5.207599299474607e-05, + "loss": 1.0896, + "num_input_tokens_seen": 110192040, + "step": 6848 + }, + { + "epoch": 0.47976043499960597, + "grad_norm": 4.254310131072998, + "learning_rate": 5.2068994746059555e-05, + "loss": 1.1701, + "num_input_tokens_seen": 110208304, + "step": 6849 + }, + { + "epoch": 0.4798304832453352, + "grad_norm": 3.85739803314209, + "learning_rate": 5.206199649737302e-05, + "loss": 0.9785, + "num_input_tokens_seen": 110224688, + "step": 6850 + }, + { + "epoch": 0.4799005314910645, + "grad_norm": 4.137633323669434, + "learning_rate": 5.205499824868651e-05, + "loss": 1.2111, + "num_input_tokens_seen": 110240160, + "step": 6851 + }, + { + "epoch": 0.4799705797367937, + "grad_norm": 3.827974557876587, + "learning_rate": 5.204800000000001e-05, + "loss": 0.9639, + "num_input_tokens_seen": 110255952, + "step": 6852 + }, + { + "epoch": 0.48004062798252295, + "grad_norm": 4.506080627441406, + "learning_rate": 5.2041001751313475e-05, + "loss": 1.0435, + "num_input_tokens_seen": 110272336, + "step": 6853 + }, + { + "epoch": 0.4801106762282522, + "grad_norm": 3.4824750423431396, + "learning_rate": 5.203400350262697e-05, + "loss": 0.8792, + "num_input_tokens_seen": 110288720, + "step": 6854 + }, + { + "epoch": 0.48018072447398147, + "grad_norm": 3.319546937942505, + "learning_rate": 5.2027005253940466e-05, + "loss": 0.9861, + "num_input_tokens_seen": 110304984, + "step": 6855 + }, + { + "epoch": 0.48025077271971073, + "grad_norm": 5.543242454528809, + "learning_rate": 5.202000700525395e-05, + "loss": 1.0694, + "num_input_tokens_seen": 110320488, + "step": 6856 + }, + { + "epoch": 0.48032082096543993, + "grad_norm": 6.7765069007873535, + "learning_rate": 5.201300875656743e-05, + "loss": 1.0751, + "num_input_tokens_seen": 110336872, + "step": 6857 + }, + { + "epoch": 0.4803908692111692, + "grad_norm": 3.5764353275299072, + "learning_rate": 5.200601050788092e-05, + "loss": 1.0798, + "num_input_tokens_seen": 110353160, + "step": 6858 + }, + { + "epoch": 0.48046091745689845, + "grad_norm": 4.938530921936035, + "learning_rate": 5.19990122591944e-05, + "loss": 0.9155, + "num_input_tokens_seen": 110369544, + "step": 6859 + }, + { + "epoch": 0.4805309657026277, + "grad_norm": 3.5447168350219727, + "learning_rate": 5.1992014010507875e-05, + "loss": 0.9738, + "num_input_tokens_seen": 110385928, + "step": 6860 + }, + { + "epoch": 0.4806010139483569, + "grad_norm": 4.1170220375061035, + "learning_rate": 5.1985015761821364e-05, + "loss": 1.156, + "num_input_tokens_seen": 110402224, + "step": 6861 + }, + { + "epoch": 0.4806710621940862, + "grad_norm": 3.6147382259368896, + "learning_rate": 5.197801751313486e-05, + "loss": 1.0212, + "num_input_tokens_seen": 110418608, + "step": 6862 + }, + { + "epoch": 0.48074111043981543, + "grad_norm": 3.745072841644287, + "learning_rate": 5.197101926444834e-05, + "loss": 1.1518, + "num_input_tokens_seen": 110434792, + "step": 6863 + }, + { + "epoch": 0.4808111586855447, + "grad_norm": 4.3973517417907715, + "learning_rate": 5.196402101576182e-05, + "loss": 1.0627, + "num_input_tokens_seen": 110450376, + "step": 6864 + }, + { + "epoch": 0.4808812069312739, + "grad_norm": 4.029878616333008, + "learning_rate": 5.195702276707531e-05, + "loss": 1.051, + "num_input_tokens_seen": 110466760, + "step": 6865 + }, + { + "epoch": 0.48095125517700316, + "grad_norm": 3.5051989555358887, + "learning_rate": 5.195002451838879e-05, + "loss": 1.1163, + "num_input_tokens_seen": 110483144, + "step": 6866 + }, + { + "epoch": 0.4810213034227324, + "grad_norm": 3.8468475341796875, + "learning_rate": 5.1943026269702275e-05, + "loss": 1.0515, + "num_input_tokens_seen": 110499528, + "step": 6867 + }, + { + "epoch": 0.4810913516684617, + "grad_norm": 3.4679362773895264, + "learning_rate": 5.193602802101577e-05, + "loss": 1.0516, + "num_input_tokens_seen": 110515448, + "step": 6868 + }, + { + "epoch": 0.4811613999141909, + "grad_norm": 3.540043830871582, + "learning_rate": 5.192902977232925e-05, + "loss": 1.0163, + "num_input_tokens_seen": 110531832, + "step": 6869 + }, + { + "epoch": 0.48123144815992014, + "grad_norm": 4.2961835861206055, + "learning_rate": 5.192203152364272e-05, + "loss": 0.9839, + "num_input_tokens_seen": 110548216, + "step": 6870 + }, + { + "epoch": 0.4813014964056494, + "grad_norm": 4.718245029449463, + "learning_rate": 5.191503327495623e-05, + "loss": 1.0214, + "num_input_tokens_seen": 110564600, + "step": 6871 + }, + { + "epoch": 0.48137154465137866, + "grad_norm": 4.846748352050781, + "learning_rate": 5.190803502626971e-05, + "loss": 1.1448, + "num_input_tokens_seen": 110579952, + "step": 6872 + }, + { + "epoch": 0.48144159289710786, + "grad_norm": 3.5760273933410645, + "learning_rate": 5.1901036777583186e-05, + "loss": 1.0028, + "num_input_tokens_seen": 110595984, + "step": 6873 + }, + { + "epoch": 0.4815116411428371, + "grad_norm": 6.386372089385986, + "learning_rate": 5.189403852889667e-05, + "loss": 1.1695, + "num_input_tokens_seen": 110612368, + "step": 6874 + }, + { + "epoch": 0.4815816893885664, + "grad_norm": 5.007279872894287, + "learning_rate": 5.188704028021018e-05, + "loss": 1.0406, + "num_input_tokens_seen": 110628752, + "step": 6875 + }, + { + "epoch": 0.48165173763429564, + "grad_norm": 4.01614236831665, + "learning_rate": 5.1880042031523645e-05, + "loss": 1.075, + "num_input_tokens_seen": 110645136, + "step": 6876 + }, + { + "epoch": 0.48172178588002484, + "grad_norm": 4.7416486740112305, + "learning_rate": 5.187304378283713e-05, + "loss": 1.3402, + "num_input_tokens_seen": 110661400, + "step": 6877 + }, + { + "epoch": 0.4817918341257541, + "grad_norm": 4.886537551879883, + "learning_rate": 5.186604553415062e-05, + "loss": 0.8621, + "num_input_tokens_seen": 110677784, + "step": 6878 + }, + { + "epoch": 0.48186188237148336, + "grad_norm": 4.033387660980225, + "learning_rate": 5.1859047285464104e-05, + "loss": 1.3515, + "num_input_tokens_seen": 110694168, + "step": 6879 + }, + { + "epoch": 0.4819319306172126, + "grad_norm": 3.7201569080352783, + "learning_rate": 5.1852049036777586e-05, + "loss": 1.163, + "num_input_tokens_seen": 110710552, + "step": 6880 + }, + { + "epoch": 0.4820019788629418, + "grad_norm": 3.73651123046875, + "learning_rate": 5.1845050788091075e-05, + "loss": 1.0389, + "num_input_tokens_seen": 110726440, + "step": 6881 + }, + { + "epoch": 0.4820720271086711, + "grad_norm": 4.395266532897949, + "learning_rate": 5.1838052539404556e-05, + "loss": 0.9636, + "num_input_tokens_seen": 110742200, + "step": 6882 + }, + { + "epoch": 0.48214207535440035, + "grad_norm": 3.70263409614563, + "learning_rate": 5.183105429071804e-05, + "loss": 1.0229, + "num_input_tokens_seen": 110758584, + "step": 6883 + }, + { + "epoch": 0.4822121236001296, + "grad_norm": 4.863175868988037, + "learning_rate": 5.1824056042031534e-05, + "loss": 1.1663, + "num_input_tokens_seen": 110774040, + "step": 6884 + }, + { + "epoch": 0.4822821718458588, + "grad_norm": 3.668220043182373, + "learning_rate": 5.181705779334502e-05, + "loss": 1.2351, + "num_input_tokens_seen": 110790352, + "step": 6885 + }, + { + "epoch": 0.48235222009158807, + "grad_norm": 4.210755825042725, + "learning_rate": 5.18100595446585e-05, + "loss": 1.0517, + "num_input_tokens_seen": 110805912, + "step": 6886 + }, + { + "epoch": 0.48242226833731733, + "grad_norm": 3.62275767326355, + "learning_rate": 5.180306129597198e-05, + "loss": 1.0383, + "num_input_tokens_seen": 110822296, + "step": 6887 + }, + { + "epoch": 0.4824923165830466, + "grad_norm": 3.498563051223755, + "learning_rate": 5.179606304728547e-05, + "loss": 0.9063, + "num_input_tokens_seen": 110838680, + "step": 6888 + }, + { + "epoch": 0.4825623648287758, + "grad_norm": 6.4097161293029785, + "learning_rate": 5.178906479859895e-05, + "loss": 0.9482, + "num_input_tokens_seen": 110855064, + "step": 6889 + }, + { + "epoch": 0.48263241307450505, + "grad_norm": 4.8159565925598145, + "learning_rate": 5.178206654991243e-05, + "loss": 1.2248, + "num_input_tokens_seen": 110871328, + "step": 6890 + }, + { + "epoch": 0.4827024613202343, + "grad_norm": 3.976828098297119, + "learning_rate": 5.177506830122593e-05, + "loss": 1.1521, + "num_input_tokens_seen": 110886600, + "step": 6891 + }, + { + "epoch": 0.48277250956596357, + "grad_norm": 3.6857738494873047, + "learning_rate": 5.176807005253942e-05, + "loss": 1.1149, + "num_input_tokens_seen": 110902984, + "step": 6892 + }, + { + "epoch": 0.48284255781169283, + "grad_norm": 4.129028797149658, + "learning_rate": 5.176107180385289e-05, + "loss": 1.1404, + "num_input_tokens_seen": 110918808, + "step": 6893 + }, + { + "epoch": 0.48291260605742203, + "grad_norm": 4.203270435333252, + "learning_rate": 5.175407355516637e-05, + "loss": 1.1844, + "num_input_tokens_seen": 110935192, + "step": 6894 + }, + { + "epoch": 0.4829826543031513, + "grad_norm": 3.7045552730560303, + "learning_rate": 5.1747075306479874e-05, + "loss": 0.9193, + "num_input_tokens_seen": 110951168, + "step": 6895 + }, + { + "epoch": 0.48305270254888055, + "grad_norm": 4.2172112464904785, + "learning_rate": 5.174007705779334e-05, + "loss": 0.8905, + "num_input_tokens_seen": 110967552, + "step": 6896 + }, + { + "epoch": 0.4831227507946098, + "grad_norm": 3.395329236984253, + "learning_rate": 5.1733078809106824e-05, + "loss": 0.9696, + "num_input_tokens_seen": 110983736, + "step": 6897 + }, + { + "epoch": 0.483192799040339, + "grad_norm": 6.649857044219971, + "learning_rate": 5.172608056042032e-05, + "loss": 1.2299, + "num_input_tokens_seen": 111000120, + "step": 6898 + }, + { + "epoch": 0.4832628472860683, + "grad_norm": 5.114965438842773, + "learning_rate": 5.1719082311733815e-05, + "loss": 1.196, + "num_input_tokens_seen": 111016504, + "step": 6899 + }, + { + "epoch": 0.48333289553179754, + "grad_norm": 4.1728410720825195, + "learning_rate": 5.171208406304728e-05, + "loss": 1.1445, + "num_input_tokens_seen": 111032232, + "step": 6900 + }, + { + "epoch": 0.4834029437775268, + "grad_norm": 3.674546241760254, + "learning_rate": 5.170508581436078e-05, + "loss": 1.0889, + "num_input_tokens_seen": 111047576, + "step": 6901 + }, + { + "epoch": 0.483472992023256, + "grad_norm": 3.4895896911621094, + "learning_rate": 5.169808756567427e-05, + "loss": 0.9618, + "num_input_tokens_seen": 111063792, + "step": 6902 + }, + { + "epoch": 0.48354304026898526, + "grad_norm": 8.447297096252441, + "learning_rate": 5.169108931698774e-05, + "loss": 0.911, + "num_input_tokens_seen": 111079136, + "step": 6903 + }, + { + "epoch": 0.4836130885147145, + "grad_norm": 4.854581356048584, + "learning_rate": 5.1684091068301224e-05, + "loss": 0.9725, + "num_input_tokens_seen": 111093808, + "step": 6904 + }, + { + "epoch": 0.4836831367604438, + "grad_norm": 3.4015259742736816, + "learning_rate": 5.167709281961471e-05, + "loss": 0.9395, + "num_input_tokens_seen": 111110192, + "step": 6905 + }, + { + "epoch": 0.483753185006173, + "grad_norm": 3.979801654815674, + "learning_rate": 5.1670094570928195e-05, + "loss": 1.181, + "num_input_tokens_seen": 111126576, + "step": 6906 + }, + { + "epoch": 0.48382323325190224, + "grad_norm": 3.655245542526245, + "learning_rate": 5.166309632224169e-05, + "loss": 0.9631, + "num_input_tokens_seen": 111142960, + "step": 6907 + }, + { + "epoch": 0.4838932814976315, + "grad_norm": 3.820819616317749, + "learning_rate": 5.165609807355517e-05, + "loss": 1.0845, + "num_input_tokens_seen": 111159344, + "step": 6908 + }, + { + "epoch": 0.48396332974336076, + "grad_norm": 3.6869490146636963, + "learning_rate": 5.164909982486866e-05, + "loss": 0.7909, + "num_input_tokens_seen": 111175608, + "step": 6909 + }, + { + "epoch": 0.48403337798908996, + "grad_norm": 3.644277334213257, + "learning_rate": 5.1642101576182135e-05, + "loss": 1.0442, + "num_input_tokens_seen": 111191992, + "step": 6910 + }, + { + "epoch": 0.4841034262348192, + "grad_norm": 3.794215202331543, + "learning_rate": 5.1635103327495624e-05, + "loss": 1.1105, + "num_input_tokens_seen": 111207248, + "step": 6911 + }, + { + "epoch": 0.4841734744805485, + "grad_norm": 4.5081987380981445, + "learning_rate": 5.162810507880912e-05, + "loss": 1.3952, + "num_input_tokens_seen": 111223632, + "step": 6912 + }, + { + "epoch": 0.48424352272627774, + "grad_norm": 3.632924795150757, + "learning_rate": 5.162110683012259e-05, + "loss": 1.0862, + "num_input_tokens_seen": 111240016, + "step": 6913 + }, + { + "epoch": 0.48431357097200695, + "grad_norm": 3.522996425628662, + "learning_rate": 5.161410858143607e-05, + "loss": 0.9521, + "num_input_tokens_seen": 111255840, + "step": 6914 + }, + { + "epoch": 0.4843836192177362, + "grad_norm": 4.495186805725098, + "learning_rate": 5.1607110332749565e-05, + "loss": 1.0066, + "num_input_tokens_seen": 111272224, + "step": 6915 + }, + { + "epoch": 0.48445366746346546, + "grad_norm": 3.6315512657165527, + "learning_rate": 5.160011208406306e-05, + "loss": 1.0991, + "num_input_tokens_seen": 111287920, + "step": 6916 + }, + { + "epoch": 0.4845237157091947, + "grad_norm": 3.4649548530578613, + "learning_rate": 5.159311383537654e-05, + "loss": 1.024, + "num_input_tokens_seen": 111304304, + "step": 6917 + }, + { + "epoch": 0.48459376395492393, + "grad_norm": 4.057675838470459, + "learning_rate": 5.158611558669002e-05, + "loss": 1.0403, + "num_input_tokens_seen": 111320688, + "step": 6918 + }, + { + "epoch": 0.4846638122006532, + "grad_norm": 4.989962100982666, + "learning_rate": 5.157911733800351e-05, + "loss": 1.0446, + "num_input_tokens_seen": 111337072, + "step": 6919 + }, + { + "epoch": 0.48473386044638245, + "grad_norm": 4.090515613555908, + "learning_rate": 5.157211908931698e-05, + "loss": 0.9324, + "num_input_tokens_seen": 111353456, + "step": 6920 + }, + { + "epoch": 0.4848039086921117, + "grad_norm": 4.017073154449463, + "learning_rate": 5.156512084063046e-05, + "loss": 1.0938, + "num_input_tokens_seen": 111369840, + "step": 6921 + }, + { + "epoch": 0.4848739569378409, + "grad_norm": 4.227852821350098, + "learning_rate": 5.155812259194397e-05, + "loss": 1.0553, + "num_input_tokens_seen": 111386096, + "step": 6922 + }, + { + "epoch": 0.48494400518357017, + "grad_norm": 5.356720447540283, + "learning_rate": 5.155112434325745e-05, + "loss": 1.1807, + "num_input_tokens_seen": 111402192, + "step": 6923 + }, + { + "epoch": 0.48501405342929943, + "grad_norm": 3.714996814727783, + "learning_rate": 5.1544126094570935e-05, + "loss": 0.9851, + "num_input_tokens_seen": 111418120, + "step": 6924 + }, + { + "epoch": 0.4850841016750287, + "grad_norm": 3.814669609069824, + "learning_rate": 5.153712784588442e-05, + "loss": 1.1195, + "num_input_tokens_seen": 111434408, + "step": 6925 + }, + { + "epoch": 0.48515414992075795, + "grad_norm": 4.38773250579834, + "learning_rate": 5.1530129597197905e-05, + "loss": 0.9939, + "num_input_tokens_seen": 111450384, + "step": 6926 + }, + { + "epoch": 0.48522419816648715, + "grad_norm": 5.492570877075195, + "learning_rate": 5.152313134851139e-05, + "loss": 1.0629, + "num_input_tokens_seen": 111466768, + "step": 6927 + }, + { + "epoch": 0.4852942464122164, + "grad_norm": 4.867751598358154, + "learning_rate": 5.151613309982487e-05, + "loss": 1.0787, + "num_input_tokens_seen": 111481680, + "step": 6928 + }, + { + "epoch": 0.48536429465794567, + "grad_norm": 3.6009931564331055, + "learning_rate": 5.1509134851138364e-05, + "loss": 1.1068, + "num_input_tokens_seen": 111497784, + "step": 6929 + }, + { + "epoch": 0.48543434290367493, + "grad_norm": 3.451188564300537, + "learning_rate": 5.1502136602451846e-05, + "loss": 0.9131, + "num_input_tokens_seen": 111513856, + "step": 6930 + }, + { + "epoch": 0.48550439114940414, + "grad_norm": 4.886107444763184, + "learning_rate": 5.149513835376533e-05, + "loss": 0.9234, + "num_input_tokens_seen": 111530240, + "step": 6931 + }, + { + "epoch": 0.4855744393951334, + "grad_norm": 4.033775806427002, + "learning_rate": 5.148814010507881e-05, + "loss": 1.0094, + "num_input_tokens_seen": 111546160, + "step": 6932 + }, + { + "epoch": 0.48564448764086265, + "grad_norm": 4.718981742858887, + "learning_rate": 5.14811418563923e-05, + "loss": 0.9965, + "num_input_tokens_seen": 111562432, + "step": 6933 + }, + { + "epoch": 0.4857145358865919, + "grad_norm": 3.7174808979034424, + "learning_rate": 5.147414360770578e-05, + "loss": 1.1065, + "num_input_tokens_seen": 111578816, + "step": 6934 + }, + { + "epoch": 0.4857845841323211, + "grad_norm": 4.0880208015441895, + "learning_rate": 5.146714535901926e-05, + "loss": 1.0742, + "num_input_tokens_seen": 111593928, + "step": 6935 + }, + { + "epoch": 0.4858546323780504, + "grad_norm": 3.3873400688171387, + "learning_rate": 5.146014711033276e-05, + "loss": 0.9752, + "num_input_tokens_seen": 111610312, + "step": 6936 + }, + { + "epoch": 0.48592468062377964, + "grad_norm": 3.6071503162384033, + "learning_rate": 5.145314886164624e-05, + "loss": 0.9917, + "num_input_tokens_seen": 111626696, + "step": 6937 + }, + { + "epoch": 0.4859947288695089, + "grad_norm": 3.502610445022583, + "learning_rate": 5.1446150612959735e-05, + "loss": 0.8912, + "num_input_tokens_seen": 111643080, + "step": 6938 + }, + { + "epoch": 0.4860647771152381, + "grad_norm": 3.5743067264556885, + "learning_rate": 5.1439152364273216e-05, + "loss": 1.1493, + "num_input_tokens_seen": 111659048, + "step": 6939 + }, + { + "epoch": 0.48613482536096736, + "grad_norm": 3.9423654079437256, + "learning_rate": 5.14321541155867e-05, + "loss": 1.1328, + "num_input_tokens_seen": 111675432, + "step": 6940 + }, + { + "epoch": 0.4862048736066966, + "grad_norm": 4.670028209686279, + "learning_rate": 5.142515586690017e-05, + "loss": 0.9023, + "num_input_tokens_seen": 111691816, + "step": 6941 + }, + { + "epoch": 0.4862749218524259, + "grad_norm": 3.8914809226989746, + "learning_rate": 5.1418157618213655e-05, + "loss": 1.0373, + "num_input_tokens_seen": 111708200, + "step": 6942 + }, + { + "epoch": 0.4863449700981551, + "grad_norm": 3.864323139190674, + "learning_rate": 5.141115936952715e-05, + "loss": 0.9064, + "num_input_tokens_seen": 111724488, + "step": 6943 + }, + { + "epoch": 0.48641501834388434, + "grad_norm": 3.700681447982788, + "learning_rate": 5.140416112084063e-05, + "loss": 1.092, + "num_input_tokens_seen": 111740368, + "step": 6944 + }, + { + "epoch": 0.4864850665896136, + "grad_norm": 3.7225606441497803, + "learning_rate": 5.139716287215413e-05, + "loss": 0.971, + "num_input_tokens_seen": 111755936, + "step": 6945 + }, + { + "epoch": 0.48655511483534286, + "grad_norm": 4.638529300689697, + "learning_rate": 5.139016462346761e-05, + "loss": 0.9367, + "num_input_tokens_seen": 111772152, + "step": 6946 + }, + { + "epoch": 0.48662516308107207, + "grad_norm": 5.287013053894043, + "learning_rate": 5.138316637478109e-05, + "loss": 0.9463, + "num_input_tokens_seen": 111787144, + "step": 6947 + }, + { + "epoch": 0.4866952113268013, + "grad_norm": 3.991861343383789, + "learning_rate": 5.137616812609457e-05, + "loss": 0.8265, + "num_input_tokens_seen": 111803528, + "step": 6948 + }, + { + "epoch": 0.4867652595725306, + "grad_norm": 4.166889190673828, + "learning_rate": 5.136916987740806e-05, + "loss": 1.1667, + "num_input_tokens_seen": 111819376, + "step": 6949 + }, + { + "epoch": 0.48683530781825984, + "grad_norm": 4.159299373626709, + "learning_rate": 5.1362171628721543e-05, + "loss": 1.1422, + "num_input_tokens_seen": 111835760, + "step": 6950 + }, + { + "epoch": 0.48690535606398905, + "grad_norm": 5.612180709838867, + "learning_rate": 5.1355173380035025e-05, + "loss": 0.9889, + "num_input_tokens_seen": 111851744, + "step": 6951 + }, + { + "epoch": 0.4869754043097183, + "grad_norm": 5.82523775100708, + "learning_rate": 5.134817513134852e-05, + "loss": 0.9702, + "num_input_tokens_seen": 111868128, + "step": 6952 + }, + { + "epoch": 0.48704545255544757, + "grad_norm": 3.5110416412353516, + "learning_rate": 5.1341176882662e-05, + "loss": 0.9538, + "num_input_tokens_seen": 111884512, + "step": 6953 + }, + { + "epoch": 0.4871155008011768, + "grad_norm": 4.108850479125977, + "learning_rate": 5.1334178633975484e-05, + "loss": 1.0866, + "num_input_tokens_seen": 111899544, + "step": 6954 + }, + { + "epoch": 0.48718554904690603, + "grad_norm": 3.9000258445739746, + "learning_rate": 5.132718038528897e-05, + "loss": 0.9456, + "num_input_tokens_seen": 111915928, + "step": 6955 + }, + { + "epoch": 0.4872555972926353, + "grad_norm": 4.503340244293213, + "learning_rate": 5.1320182136602455e-05, + "loss": 1.004, + "num_input_tokens_seen": 111932288, + "step": 6956 + }, + { + "epoch": 0.48732564553836455, + "grad_norm": 4.052606582641602, + "learning_rate": 5.1313183887915936e-05, + "loss": 1.1662, + "num_input_tokens_seen": 111948672, + "step": 6957 + }, + { + "epoch": 0.4873956937840938, + "grad_norm": 3.4959487915039062, + "learning_rate": 5.130618563922942e-05, + "loss": 1.0242, + "num_input_tokens_seen": 111964272, + "step": 6958 + }, + { + "epoch": 0.48746574202982307, + "grad_norm": 4.654433250427246, + "learning_rate": 5.12991873905429e-05, + "loss": 1.1307, + "num_input_tokens_seen": 111980656, + "step": 6959 + }, + { + "epoch": 0.48753579027555227, + "grad_norm": 4.125091552734375, + "learning_rate": 5.1292189141856395e-05, + "loss": 1.1137, + "num_input_tokens_seen": 111996584, + "step": 6960 + }, + { + "epoch": 0.48760583852128153, + "grad_norm": 4.574272155761719, + "learning_rate": 5.128519089316988e-05, + "loss": 0.9312, + "num_input_tokens_seen": 112011528, + "step": 6961 + }, + { + "epoch": 0.4876758867670108, + "grad_norm": 4.110400676727295, + "learning_rate": 5.127819264448337e-05, + "loss": 0.9669, + "num_input_tokens_seen": 112026256, + "step": 6962 + }, + { + "epoch": 0.48774593501274005, + "grad_norm": 3.4572913646698, + "learning_rate": 5.127119439579685e-05, + "loss": 0.9956, + "num_input_tokens_seen": 112042288, + "step": 6963 + }, + { + "epoch": 0.48781598325846925, + "grad_norm": 4.498427391052246, + "learning_rate": 5.126419614711033e-05, + "loss": 1.1088, + "num_input_tokens_seen": 112058672, + "step": 6964 + }, + { + "epoch": 0.4878860315041985, + "grad_norm": 5.4692301750183105, + "learning_rate": 5.1257197898423825e-05, + "loss": 0.9582, + "num_input_tokens_seen": 112073536, + "step": 6965 + }, + { + "epoch": 0.4879560797499278, + "grad_norm": 3.8990654945373535, + "learning_rate": 5.125019964973732e-05, + "loss": 1.2729, + "num_input_tokens_seen": 112089344, + "step": 6966 + }, + { + "epoch": 0.48802612799565703, + "grad_norm": 3.5601627826690674, + "learning_rate": 5.12432014010508e-05, + "loss": 1.0519, + "num_input_tokens_seen": 112105296, + "step": 6967 + }, + { + "epoch": 0.48809617624138624, + "grad_norm": 3.91282057762146, + "learning_rate": 5.123620315236427e-05, + "loss": 1.2046, + "num_input_tokens_seen": 112120600, + "step": 6968 + }, + { + "epoch": 0.4881662244871155, + "grad_norm": 5.9246602058410645, + "learning_rate": 5.1229204903677766e-05, + "loss": 1.0982, + "num_input_tokens_seen": 112136472, + "step": 6969 + }, + { + "epoch": 0.48823627273284476, + "grad_norm": 8.849782943725586, + "learning_rate": 5.122220665499125e-05, + "loss": 1.0671, + "num_input_tokens_seen": 112152792, + "step": 6970 + }, + { + "epoch": 0.488306320978574, + "grad_norm": 4.184106349945068, + "learning_rate": 5.121520840630473e-05, + "loss": 1.2696, + "num_input_tokens_seen": 112169176, + "step": 6971 + }, + { + "epoch": 0.4883763692243032, + "grad_norm": 4.250857830047607, + "learning_rate": 5.120821015761822e-05, + "loss": 0.9724, + "num_input_tokens_seen": 112184784, + "step": 6972 + }, + { + "epoch": 0.4884464174700325, + "grad_norm": 4.522305011749268, + "learning_rate": 5.12012119089317e-05, + "loss": 0.8251, + "num_input_tokens_seen": 112200960, + "step": 6973 + }, + { + "epoch": 0.48851646571576174, + "grad_norm": 3.5135490894317627, + "learning_rate": 5.1194213660245195e-05, + "loss": 1.0713, + "num_input_tokens_seen": 112217080, + "step": 6974 + }, + { + "epoch": 0.488586513961491, + "grad_norm": 5.541810989379883, + "learning_rate": 5.118721541155866e-05, + "loss": 1.1885, + "num_input_tokens_seen": 112233464, + "step": 6975 + }, + { + "epoch": 0.4886565622072202, + "grad_norm": 3.7535064220428467, + "learning_rate": 5.1180217162872165e-05, + "loss": 1.0558, + "num_input_tokens_seen": 112249848, + "step": 6976 + }, + { + "epoch": 0.48872661045294946, + "grad_norm": 4.454082012176514, + "learning_rate": 5.117321891418565e-05, + "loss": 1.0892, + "num_input_tokens_seen": 112265560, + "step": 6977 + }, + { + "epoch": 0.4887966586986787, + "grad_norm": 3.770138740539551, + "learning_rate": 5.116622066549912e-05, + "loss": 1.082, + "num_input_tokens_seen": 112281944, + "step": 6978 + }, + { + "epoch": 0.488866706944408, + "grad_norm": 5.923669815063477, + "learning_rate": 5.115922241681261e-05, + "loss": 1.0662, + "num_input_tokens_seen": 112298264, + "step": 6979 + }, + { + "epoch": 0.4889367551901372, + "grad_norm": 3.9768123626708984, + "learning_rate": 5.115222416812609e-05, + "loss": 1.1816, + "num_input_tokens_seen": 112314608, + "step": 6980 + }, + { + "epoch": 0.48900680343586644, + "grad_norm": 5.525039196014404, + "learning_rate": 5.1145225919439575e-05, + "loss": 1.0287, + "num_input_tokens_seen": 112330400, + "step": 6981 + }, + { + "epoch": 0.4890768516815957, + "grad_norm": 3.8725640773773193, + "learning_rate": 5.113822767075307e-05, + "loss": 0.9666, + "num_input_tokens_seen": 112345384, + "step": 6982 + }, + { + "epoch": 0.48914689992732496, + "grad_norm": 4.746465682983398, + "learning_rate": 5.1131229422066565e-05, + "loss": 1.1162, + "num_input_tokens_seen": 112361768, + "step": 6983 + }, + { + "epoch": 0.48921694817305417, + "grad_norm": 3.774049997329712, + "learning_rate": 5.112423117338005e-05, + "loss": 1.0898, + "num_input_tokens_seen": 112377432, + "step": 6984 + }, + { + "epoch": 0.4892869964187834, + "grad_norm": 3.686307191848755, + "learning_rate": 5.1117232924693515e-05, + "loss": 1.0459, + "num_input_tokens_seen": 112393672, + "step": 6985 + }, + { + "epoch": 0.4893570446645127, + "grad_norm": 4.177459239959717, + "learning_rate": 5.1110234676007004e-05, + "loss": 1.0504, + "num_input_tokens_seen": 112409600, + "step": 6986 + }, + { + "epoch": 0.48942709291024195, + "grad_norm": 3.8517558574676514, + "learning_rate": 5.1103236427320486e-05, + "loss": 1.0947, + "num_input_tokens_seen": 112425880, + "step": 6987 + }, + { + "epoch": 0.48949714115597115, + "grad_norm": 3.3155159950256348, + "learning_rate": 5.109623817863397e-05, + "loss": 0.9201, + "num_input_tokens_seen": 112442264, + "step": 6988 + }, + { + "epoch": 0.4895671894017004, + "grad_norm": 4.027132987976074, + "learning_rate": 5.108923992994746e-05, + "loss": 1.0882, + "num_input_tokens_seen": 112458504, + "step": 6989 + }, + { + "epoch": 0.48963723764742967, + "grad_norm": 3.622421979904175, + "learning_rate": 5.108224168126096e-05, + "loss": 0.9098, + "num_input_tokens_seen": 112474888, + "step": 6990 + }, + { + "epoch": 0.48970728589315893, + "grad_norm": 4.16541051864624, + "learning_rate": 5.107524343257444e-05, + "loss": 0.967, + "num_input_tokens_seen": 112491272, + "step": 6991 + }, + { + "epoch": 0.48977733413888813, + "grad_norm": 4.473822593688965, + "learning_rate": 5.106824518388792e-05, + "loss": 1.026, + "num_input_tokens_seen": 112506632, + "step": 6992 + }, + { + "epoch": 0.4898473823846174, + "grad_norm": 5.10452127456665, + "learning_rate": 5.106124693520141e-05, + "loss": 1.003, + "num_input_tokens_seen": 112521696, + "step": 6993 + }, + { + "epoch": 0.48991743063034665, + "grad_norm": 4.185652732849121, + "learning_rate": 5.105424868651489e-05, + "loss": 0.9275, + "num_input_tokens_seen": 112537432, + "step": 6994 + }, + { + "epoch": 0.4899874788760759, + "grad_norm": 4.864262580871582, + "learning_rate": 5.104725043782837e-05, + "loss": 1.033, + "num_input_tokens_seen": 112553816, + "step": 6995 + }, + { + "epoch": 0.49005752712180517, + "grad_norm": 3.859199047088623, + "learning_rate": 5.1040252189141856e-05, + "loss": 1.115, + "num_input_tokens_seen": 112570200, + "step": 6996 + }, + { + "epoch": 0.4901275753675344, + "grad_norm": 3.49395751953125, + "learning_rate": 5.103325394045535e-05, + "loss": 0.9285, + "num_input_tokens_seen": 112586584, + "step": 6997 + }, + { + "epoch": 0.49019762361326363, + "grad_norm": 4.164735317230225, + "learning_rate": 5.102625569176883e-05, + "loss": 0.8565, + "num_input_tokens_seen": 112602568, + "step": 6998 + }, + { + "epoch": 0.4902676718589929, + "grad_norm": 6.273041725158691, + "learning_rate": 5.1019257443082315e-05, + "loss": 1.0252, + "num_input_tokens_seen": 112618952, + "step": 6999 + }, + { + "epoch": 0.49033772010472215, + "grad_norm": 3.8460848331451416, + "learning_rate": 5.1012259194395804e-05, + "loss": 0.9854, + "num_input_tokens_seen": 112635336, + "step": 7000 + }, + { + "epoch": 0.49033772010472215, + "eval_loss": 1.1226829290390015, + "eval_runtime": 0.157, + "eval_samples_per_second": 6.371, + "eval_steps_per_second": 6.371, + "num_input_tokens_seen": 112635336, + "step": 7000 + } + ], + "logging_steps": 1, + "max_steps": 14275, + "num_input_tokens_seen": 112635336, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.418718445933138e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}